Handle addressing modes for HW intrinsics (#22944)
author Carol Eidt <carol.eidt@microsoft.com>
Tue, 26 Mar 2019 23:13:40 +0000 (16:13 -0700)
committer GitHub <noreply@github.com>
Tue, 26 Mar 2019 23:13:40 +0000 (16:13 -0700)
* Handle addressing modes for HW intrinsics

Also, eliminate some places where the code size estimates were over-estimating.

Contribute to #19550
Fix #19521

17 files changed:
src/jit/codegen.h
src/jit/codegencommon.cpp
src/jit/codegenlinear.cpp
src/jit/emitxarch.cpp
src/jit/emitxarch.h
src/jit/gentree.cpp
src/jit/hwintrinsiccodegenxarch.cpp
src/jit/hwintrinsiclistxarch.h
src/jit/hwintrinsicxarch.cpp
src/jit/instr.cpp
src/jit/lower.h
src/jit/lowerxarch.cpp
src/jit/lsra.h
src/jit/lsrabuild.cpp
src/jit/lsraxarch.cpp
tests/src/JIT/Regression/JitBlue/GitHub_19550/GitHub_19550.cs [new file with mode: 0644]
tests/src/JIT/Regression/JitBlue/GitHub_19550/GitHub_19550.csproj [new file with mode: 0644]

index 046addf..1631480 100644 (file)
@@ -95,6 +95,7 @@ private:
     static bool genShouldRoundFP();
 
     GenTreeIndir indirForm(var_types type, GenTree* base);
+    GenTreeStoreInd storeIndirForm(var_types type, GenTree* base, GenTree* data);
 
     GenTreeIntCon intForm(var_types type, ssize_t value);
 
@@ -1040,6 +1041,9 @@ protected:
 
     void genConsumeRegs(GenTree* tree);
     void genConsumeOperands(GenTreeOp* tree);
+#ifdef FEATURE_HW_INTRINSICS
+    void genConsumeHWIntrinsicOperands(GenTreeHWIntrinsic* tree);
+#endif // FEATURE_HW_INTRINSICS
     void genEmitGSCookieCheck(bool pushReg);
     void genSetRegToIcon(regNumber reg, ssize_t val, var_types type = TYP_INT, insFlags flags = INS_FLAGS_DONT_CARE);
     void genCodeForShift(GenTree* tree);
@@ -1309,6 +1313,7 @@ public:
 
 #if defined(_TARGET_XARCH_)
     void inst_RV_RV_IV(instruction ins, emitAttr size, regNumber reg1, regNumber reg2, unsigned ival);
+    void inst_RV_TT_IV(instruction ins, emitAttr attr, regNumber reg1, GenTree* rmOp, int ival);
 #endif
 
     void inst_RV_RR(instruction ins, emitAttr size, regNumber reg1, regNumber reg2);
index 9efa517..8f31deb 100644 (file)
@@ -11317,6 +11317,17 @@ GenTreeIndir CodeGen::indirForm(var_types type, GenTree* base)
 }
 
 //------------------------------------------------------------------------
+// storeIndirForm: Make a temporary store indir (from 'base' and 'data') we can feed to pattern
+//    matching routines in cases where we don't want to instantiate all the indirs that happen.
+//
+GenTreeStoreInd CodeGen::storeIndirForm(var_types type, GenTree* base, GenTree* data)
+{
+    GenTreeStoreInd i(type, base, data);
+    i.gtRegNum = REG_NA; // the temporary node gets REG_NA as its register
+    return i; // returned by value to the caller
+}
+
+//------------------------------------------------------------------------
 // intForm: Make a temporary int we can feed to pattern matching routines
 //    in cases where we don't want to instantiate.
 //
index 7c5c018..72f9fa6 100644 (file)
@@ -1339,12 +1339,27 @@ void CodeGen::genConsumeRegs(GenTree* tree)
             // Update the life of the lcl var.
             genUpdateLife(tree);
         }
-#endif // _TARGET_XARCH_
-        else if (tree->OperIsInitVal())
+#ifdef FEATURE_HW_INTRINSICS
+        else if (tree->OperIs(GT_HWIntrinsic))
         {
-            genConsumeReg(tree->gtGetOp1());
+            // Only load/store HW intrinsics can be contained (and the address may also be contained).
+            HWIntrinsicCategory category = HWIntrinsicInfo::lookupCategory(tree->AsHWIntrinsic()->gtHWIntrinsicId);
+            assert((category == HW_Category_MemoryLoad) || (category == HW_Category_MemoryStore));
+            int numArgs = HWIntrinsicInfo::lookupNumArgs(tree->AsHWIntrinsic());
+            genConsumeAddress(tree->gtGetOp1());
+            if (category == HW_Category_MemoryStore)
+            {
+                assert((numArgs == 2) && !tree->gtGetOp2()->isContained());
+                genConsumeReg(tree->gtGetOp2());
+            }
+            else
+            {
+                assert(numArgs == 1);
+            }
         }
-        else if (tree->OperIsHWIntrinsic())
+#endif // FEATURE_HW_INTRINSICS
+#endif // _TARGET_XARCH_
+        else if (tree->OperIsInitVal())
         {
             genConsumeReg(tree->gtGetOp1());
         }
@@ -1374,11 +1389,6 @@ void CodeGen::genConsumeRegs(GenTree* tree)
 // Return Value:
 //    None.
 //
-// Notes:
-//    Note that this logic is localized here because we must do the liveness update in
-//    the correct execution order.  This is important because we may have two operands
-//    that involve the same lclVar, and if one is marked "lastUse" we must handle it
-//    after the first.
 
 void CodeGen::genConsumeOperands(GenTreeOp* tree)
 {
@@ -1395,6 +1405,55 @@ void CodeGen::genConsumeOperands(GenTreeOp* tree)
     }
 }
 
+#ifdef FEATURE_HW_INTRINSICS
+//------------------------------------------------------------------------
+// genConsumeHWIntrinsicOperands: Do liveness update for the operands of a GT_HWIntrinsic node
+//
+// Arguments:
+//    node - the GenTreeHWIntrinsic node whose operands will have their liveness updated.
+//
+// Return Value:
+//    None.
+//
+// Notes: each operand goes to genConsumeRegs, in list order or op1-then-op2; the count is asserted.
+
+void CodeGen::genConsumeHWIntrinsicOperands(GenTreeHWIntrinsic* node)
+{
+    int      numArgs = HWIntrinsicInfo::lookupNumArgs(node); // expected operand count; only checked in asserts
+    GenTree* op1     = node->gtGetOp1();
+    if (op1 == nullptr) // no operands: nothing to consume
+    {
+        assert((numArgs == 0) && (node->gtGetOp2() == nullptr));
+        return;
+    }
+    if (op1->OperIs(GT_LIST)) // operands are chained in an argument list hanging off op1
+    {
+        int foundArgs = 0;
+        assert(node->gtGetOp2() == nullptr);
+        for (GenTreeArgList* list = op1->AsArgList(); list != nullptr; list = list->Rest())
+        {
+            GenTree* operand = list->Current();
+            genConsumeRegs(operand);
+            foundArgs++;
+        }
+        assert(foundArgs == numArgs);
+    }
+    else // one or two operands stored directly as op1 / op2
+    {
+        genConsumeRegs(op1);
+        GenTree* op2 = node->gtGetOp2();
+        if (op2 != nullptr)
+        {
+            genConsumeRegs(op2);
+            assert(numArgs == 2);
+        }
+        else
+        {
+            assert(numArgs == 1);
+        }
+    }
+}
+#endif // FEATURE_HW_INTRINSICS
+
 #if FEATURE_PUT_STRUCT_ARG_STK
 //------------------------------------------------------------------------
 // genConsumePutStructArgStk: Do liveness update for the operands of a PutArgStk node.
index 8bba1f6..e136550 100644 (file)
@@ -2849,6 +2849,12 @@ void emitter::emitInsLoadInd(instruction ins, emitAttr attr, regNumber dstReg, G
     id->idReg1(dstReg);
     emitHandleMemOp(mem, id, IF_RWR_ARD, ins);
     UNATIVE_OFFSET sz = emitInsSizeAM(id, insCodeRM(ins));
+    if (Is4ByteSSEInstruction(ins))
+    {
+        // The 4-Byte SSE instructions require an additional byte.
+        sz += 1;
+    }
+
     id->idCodeSize(sz);
     dispIns(id);
     emitCurIGsize += sz;
@@ -4037,6 +4043,12 @@ void emitter::emitIns_R_A(instruction ins, emitAttr attr, regNumber reg1, GenTre
     emitHandleMemOp(indir, id, IF_RRW_ARD, ins);
 
     UNATIVE_OFFSET sz = emitInsSizeAM(id, insCodeRM(ins));
+    if (Is4ByteSSEInstruction(ins))
+    {
+        // The 4-Byte SSE instructions require an additional byte.
+        sz += 1;
+    }
+
     id->idCodeSize(sz);
 
     dispIns(id);
@@ -4088,8 +4100,8 @@ void emitter::emitIns_R_AR_I(instruction ins, emitAttr attr, regNumber reg1, reg
 
     if (Is4ByteSSEInstruction(ins))
     {
-        // The 4-Byte SSE instructions require two additional bytes
-        sz += 2;
+        // The 4-Byte SSE instructions require an additional byte.
+        sz += 1;
     }
 
     id->idCodeSize(sz);
@@ -5165,8 +5177,8 @@ void emitter::emitIns_R_AR(instruction ins, emitAttr attr, regNumber ireg, regNu
 
     if (Is4ByteSSEInstruction(ins))
     {
-        // The 4-Byte SSE instructions require two additional bytes
-        sz += 2;
+        // The 4-Byte SSE instructions require an additional byte.
+        sz += 1;
     }
 
     id->idCodeSize(sz);
@@ -5640,7 +5652,7 @@ void emitter::emitIns_AX_R(instruction ins, emitAttr attr, regNumber ireg, regNu
 
 #ifdef FEATURE_HW_INTRINSICS
 //------------------------------------------------------------------------
-// emitIns_SIMD_R_R_I: emits the code for a SIMD instruction that takes a register operand, an immediate operand
+// emitIns_SIMD_R_R_I: emits the code for an instruction that takes a register operand, an immediate operand
 //                     and that returns a value in register
 //
 // Arguments:
@@ -5650,6 +5662,13 @@ void emitter::emitIns_AX_R(instruction ins, emitAttr attr, regNumber ireg, regNu
 //    op1Reg    -- The register of the first operand
 //    ival      -- The immediate value
 //
+// Notes:
+//    This will handle the required register copy if 'op1Reg' and 'targetReg' are not the same, and
+//    the 3-operand format is not available.
+//    This is not really SIMD-specific, but is currently only used in that context, as that's
+//    where we frequently need to handle the case of generating 3-operand or 2-operand forms
+//    depending on what target ISA is supported.
+//
 void emitter::emitIns_SIMD_R_R_I(instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, int ival)
 {
     if (UseVEXEncoding() || IsDstSrcImmAvxInstruction(ins))
@@ -5704,12 +5723,14 @@ void emitter::emitIns_SIMD_R_R_A(
 //    targetReg -- The target register
 //    op1Reg    -- The register of the first operand
 //    base      -- The base register used for the memory address
+//    offset    -- The memory offset
 //
-void emitter::emitIns_SIMD_R_R_AR(instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber base)
+void emitter::emitIns_SIMD_R_R_AR(
+    instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber base, int offset)
 {
     if (UseVEXEncoding())
     {
-        emitIns_R_R_AR(ins, attr, targetReg, op1Reg, base, 0);
+        emitIns_R_R_AR(ins, attr, targetReg, op1Reg, base, offset);
     }
     else
     {
@@ -5717,7 +5738,7 @@ void emitter::emitIns_SIMD_R_R_AR(instruction ins, emitAttr attr, regNumber targ
         {
             emitIns_R_R(INS_movaps, attr, targetReg, op1Reg);
         }
-        emitIns_R_AR(ins, attr, targetReg, base, 0);
+        emitIns_R_AR(ins, attr, targetReg, base, offset);
     }
 }
 
index 5b06838..bad81b7 100644 (file)
@@ -449,7 +449,8 @@ void emitIns_AX_R(instruction ins, emitAttr attr, regNumber ireg, regNumber reg,
 void emitIns_SIMD_R_R_I(instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, int ival);
 
 void emitIns_SIMD_R_R_A(instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, GenTreeIndir* indir);
-void emitIns_SIMD_R_R_AR(instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber base);
+void emitIns_SIMD_R_R_AR(
+    instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber base, int offset);
 void emitIns_SIMD_R_R_C(
     instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, CORINFO_FIELD_HANDLE fldHnd, int offs);
 void emitIns_SIMD_R_R_R(instruction ins, emitAttr attr, regNumber targetReg, regNumber op1Reg, regNumber op2Reg);
index 623a98c..2b65206 100644 (file)
@@ -3510,6 +3510,26 @@ unsigned Compiler::gtSetEvalOrder(GenTree* tree)
                     costSz = 2 * 2;
                     break;
 
+#if defined(FEATURE_HW_INTRINSICS) && defined(_TARGET_XARCH_)
+                case GT_HWIntrinsic:
+                {
+                    if (tree->AsHWIntrinsic()->OperIsMemoryLoadOrStore())
+                    {
+                        costEx = IND_COST_EX;
+                        costSz = 2;
+                        // See if we can form a complex addressing mode.
+
+                        GenTree* addr = op1->gtEffectiveVal();
+
+                        if (addr->OperIs(GT_ADD) && gtMarkAddrMode(addr, &costEx, &costSz, tree->TypeGet()))
+                        {
+                            goto DONE;
+                        }
+                    }
+                }
+                break;
+#endif // FEATURE_HW_INTRINSICS && _TARGET_XARCH_
+
                 case GT_BLK:
                 case GT_IND:
 
index 9660ae3..9eada1e 100644 (file)
@@ -109,32 +109,50 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
         {
             case 1:
             {
-                genConsumeOperands(node);
-                op1Reg = op1->gtRegNum;
-
                 if (node->OperIsMemoryLoad())
                 {
-                    emit->emitIns_R_AR(ins, simdSize, targetReg, op1Reg, 0);
-                }
-                else if ((category == HW_Category_SIMDScalar) && HWIntrinsicInfo::CopiesUpperBits(intrinsicId))
-                {
-                    emit->emitIns_SIMD_R_R_R(ins, simdSize, targetReg, op1Reg, op1Reg);
-                }
-                else if ((ival != -1) && varTypeIsFloating(baseType))
-                {
-                    assert((ival >= 0) && (ival <= 127));
-                    genHWIntrinsic_R_RM_I(node, ins, (int8_t)ival);
+                    genConsumeAddress(op1);
+                    // Until we improve the handling of addressing modes in the emitter, we'll create a
+                    // temporary GT_IND to generate code with.
+                    GenTreeIndir load = indirForm(node->TypeGet(), op1);
+                    emit->emitInsLoadInd(ins, simdSize, node->gtRegNum, &load);
                 }
                 else
                 {
-                    genHWIntrinsic_R_RM(node, ins, simdSize);
+                    genConsumeRegs(op1);
+                    op1Reg = op1->gtRegNum;
+
+                    if ((category == HW_Category_SIMDScalar) && HWIntrinsicInfo::CopiesUpperBits(intrinsicId))
+                    {
+                        emit->emitIns_SIMD_R_R_R(ins, simdSize, targetReg, op1Reg, op1Reg);
+                    }
+                    else if ((ival != -1) && varTypeIsFloating(baseType))
+                    {
+                        assert((ival >= 0) && (ival <= 127));
+                        genHWIntrinsic_R_RM_I(node, ins, (int8_t)ival);
+                    }
+                    else
+                    {
+                        genHWIntrinsic_R_RM(node, ins, simdSize);
+                    }
                 }
                 break;
             }
 
             case 2:
             {
-                genConsumeOperands(node);
+                if (category == HW_Category_MemoryStore)
+                {
+                    genConsumeAddress(op1);
+                    genConsumeReg(op2);
+                    // Until we improve the handling of addressing modes in the emitter, we'll create a
+                    // temporary GT_STORE_IND to generate code with.
+                    GenTreeStoreInd store = storeIndirForm(node->TypeGet(), op1, op2);
+                    emit->emitInsStoreInd(ins, simdSize, &store);
+                    break;
+                }
+                genConsumeRegs(op1);
+                genConsumeRegs(op2);
 
                 op1Reg = op1->gtRegNum;
                 op2Reg = op2->gtRegNum;
@@ -153,25 +171,30 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
                     op1Reg = targetReg;
                 }
 
-                if (category == HW_Category_MemoryStore)
-                {
-                    emit->emitIns_AR_R(ins, simdSize, op2Reg, op1Reg, 0);
-                }
-                else if ((ival != -1) && varTypeIsFloating(baseType))
+                if ((ival != -1) && varTypeIsFloating(baseType))
                 {
                     assert((ival >= 0) && (ival <= 127));
                     genHWIntrinsic_R_R_RM_I(node, ins, ival);
                 }
                 else if (category == HW_Category_MemoryLoad)
                 {
+                    // Get the address and the 'other' register.
+                    GenTree*  addr;
+                    regNumber otherReg;
                     if (intrinsicId == NI_AVX_MaskLoad || intrinsicId == NI_AVX2_MaskLoad)
                     {
-                        emit->emitIns_SIMD_R_R_AR(ins, simdSize, targetReg, op2Reg, op1Reg);
+                        addr     = op1;
+                        otherReg = op2Reg;
                     }
                     else
                     {
-                        emit->emitIns_SIMD_R_R_AR(ins, simdSize, targetReg, op1Reg, op2Reg);
+                        addr     = op2;
+                        otherReg = op1Reg;
                     }
+                    // Until we improve the handling of addressing modes in the emitter, we'll create a
+                    // temporary GT_IND to generate code with.
+                    GenTreeIndir load = indirForm(node->TypeGet(), addr);
+                    genHWIntrinsic_R_R_RM(node, ins, simdSize, targetReg, otherReg, &load);
                 }
                 else if (HWIntrinsicInfo::isImmOp(intrinsicId, op2))
                 {
@@ -210,10 +233,6 @@ void CodeGen::genHWIntrinsic(GenTreeHWIntrinsic* node)
 
             case 3:
             {
-                assert(op1->OperIsList());
-                assert(op1->gtGetOp2()->OperIsList());
-                assert(op1->gtGetOp2()->gtGetOp2()->OperIsList());
-
                 GenTreeArgList* argList = op1->AsArgList();
                 op1                     = argList->Current();
                 genConsumeRegs(op1);
@@ -520,99 +539,8 @@ void CodeGen::genHWIntrinsic_R_RM_I(GenTreeHWIntrinsic* node, instruction ins, i
     {
         assert(HWIntrinsicInfo::SupportsContainment(node->gtHWIntrinsicId));
         assertIsContainableHWIntrinsicOp(compiler->m_pLowering, node, op1);
-
-        TempDsc* tmpDsc = nullptr;
-        unsigned varNum = BAD_VAR_NUM;
-        unsigned offset = (unsigned)-1;
-
-        if (op1->isUsedFromSpillTemp())
-        {
-            assert(op1->IsRegOptional());
-
-            tmpDsc = getSpillTempDsc(op1);
-            varNum = tmpDsc->tdTempNum();
-            offset = 0;
-
-            regSet.tmpRlsTemp(tmpDsc);
-        }
-        else if (op1->OperIsHWIntrinsic())
-        {
-            emit->emitIns_R_AR_I(ins, simdSize, targetReg, op1->gtGetOp1()->gtRegNum, 0, ival);
-            return;
-        }
-        else if (op1->isIndir())
-        {
-            GenTreeIndir* memIndir = op1->AsIndir();
-            GenTree*      memBase  = memIndir->gtOp1;
-
-            switch (memBase->OperGet())
-            {
-                case GT_LCL_VAR_ADDR:
-                {
-                    varNum = memBase->AsLclVarCommon()->GetLclNum();
-                    offset = 0;
-
-                    // Ensure that all the GenTreeIndir values are set to their defaults.
-                    assert(!memIndir->HasIndex());
-                    assert(memIndir->Scale() == 1);
-                    assert(memIndir->Offset() == 0);
-
-                    break;
-                }
-
-                case GT_CLS_VAR_ADDR:
-                {
-                    emit->emitIns_R_C_I(ins, simdSize, targetReg, memBase->gtClsVar.gtClsVarHnd, 0, ival);
-                    return;
-                }
-
-                default:
-                {
-                    emit->emitIns_R_A_I(ins, simdSize, targetReg, memIndir, ival);
-                    return;
-                }
-            }
-        }
-        else
-        {
-            switch (op1->OperGet())
-            {
-                case GT_LCL_FLD:
-                {
-                    GenTreeLclFld* lclField = op1->AsLclFld();
-
-                    varNum = lclField->GetLclNum();
-                    offset = lclField->gtLclFld.gtLclOffs;
-                    break;
-                }
-
-                case GT_LCL_VAR:
-                {
-                    assert(op1->IsRegOptional() || !compiler->lvaTable[op1->gtLclVar.gtLclNum].lvIsRegCandidate());
-                    varNum = op1->AsLclVar()->GetLclNum();
-                    offset = 0;
-                    break;
-                }
-
-                default:
-                    unreached();
-                    break;
-            }
-        }
-
-        // Ensure we got a good varNum and offset.
-        // We also need to check for `tmpDsc != nullptr` since spill temp numbers
-        // are negative and start with -1, which also happens to be BAD_VAR_NUM.
-        assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
-        assert(offset != (unsigned)-1);
-
-        emit->emitIns_R_S_I(ins, simdSize, targetReg, varNum, offset, ival);
-    }
-    else
-    {
-        regNumber op1Reg = op1->gtRegNum;
-        emit->emitIns_SIMD_R_R_I(ins, simdSize, targetReg, op1Reg, ival);
     }
+    inst_RV_TT_IV(ins, simdSize, targetReg, op1, ival);
 }
 
 //------------------------------------------------------------------------
@@ -681,7 +609,11 @@ void CodeGen::genHWIntrinsic_R_R_RM(
         }
         else if (op2->OperIsHWIntrinsic())
         {
-            emit->emitIns_SIMD_R_R_AR(ins, attr, targetReg, op1Reg, op2->gtGetOp1()->gtRegNum);
+            GenTree* addr = op2->gtGetOp1();
+            // Until we improve the handling of addressing modes in the emitter, we'll create a
+            // temporary GT_IND to generate code with.
+            GenTreeIndir load = indirForm(node->TypeGet(), addr);
+            emit->emitIns_SIMD_R_R_A(ins, attr, targetReg, op1Reg, &load);
             return;
         }
         else if (op2->isIndir())
@@ -1267,7 +1199,7 @@ void CodeGen::genHWIntrinsicJumpTableFallback(NamedIntrinsic            intrinsi
 //    node - The hardware intrinsic node
 //
 // Note:
-//    We currently assume that all base intrinsics only have a single operand.
+//    We currently assume that all base intrinsics have zero or one operand.
 //
 void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node)
 {
@@ -1279,15 +1211,10 @@ void CodeGen::genBaseIntrinsic(GenTreeHWIntrinsic* node)
     assert(compiler->compSupports(InstructionSet_SSE));
     assert((baseType >= TYP_BYTE) && (baseType <= TYP_DOUBLE));
 
-    GenTree*  op1    = node->gtGetOp1();
-    regNumber op1Reg = REG_NA;
+    GenTree* op1 = node->gtGetOp1();
 
-    if (op1 != nullptr)
-    {
-        assert(!op1->OperIsList());
-        op1Reg = op1->gtRegNum;
-        genConsumeOperands(node);
-    }
+    genConsumeHWIntrinsicOperands(node);
+    regNumber op1Reg = (op1 == nullptr) ? REG_NA : op1->gtRegNum;
 
     assert(node->gtGetOp2() == nullptr);
 
@@ -1418,11 +1345,7 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
     regNumber op4Reg = REG_NA;
     emitter*  emit   = getEmitter();
 
-    if ((op1 != nullptr) && !op1->OperIsList())
-    {
-        op1Reg = op1->gtRegNum;
-        genConsumeOperands(node);
-    }
+    genConsumeHWIntrinsicOperands(node);
 
     switch (intrinsicId)
     {
@@ -1529,6 +1452,7 @@ void CodeGen::genSSEIntrinsic(GenTreeHWIntrinsic* node)
             assert(op2 == nullptr);
 
             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType);
+            op1Reg          = op1->gtRegNum;
             emit->emitIns_AR(ins, emitTypeSize(baseType), op1Reg, 0);
             break;
         }
@@ -1568,11 +1492,7 @@ void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node)
     regNumber      op2Reg      = REG_NA;
     emitter*       emit        = getEmitter();
 
-    if ((op1 != nullptr) && !op1->OperIsList())
-    {
-        op1Reg = op1->gtRegNum;
-        genConsumeOperands(node);
-    }
+    genConsumeHWIntrinsicOperands(node);
 
     switch (intrinsicId)
     {
@@ -1588,6 +1508,7 @@ void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node)
             assert((ival >= 0) && (ival <= 127));
 
             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
+            op1Reg          = op1->gtRegNum;
             op2Reg          = op2->gtRegNum;
             emit->emitIns_SIMD_R_R_R_I(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg, op2Reg, ival);
 
@@ -1711,6 +1632,7 @@ void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node)
             if (varTypeIsIntegral(baseType))
             {
                 assert(baseType == TYP_INT || baseType == TYP_UINT || baseType == TYP_LONG || baseType == TYP_ULONG);
+                op1Reg = op1->gtRegNum;
                 emit->emitIns_R_R(ins, emitActualTypeSize(baseType), op1Reg, targetReg);
             }
             else
@@ -1748,6 +1670,7 @@ void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node)
 
             op2Reg          = op2->gtRegNum;
             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
+            op1Reg          = op1->gtRegNum;
             emit->emitIns_AR_R(ins, emitTypeSize(baseType), op2Reg, op1Reg, 0);
             break;
         }
@@ -1783,16 +1706,13 @@ void CodeGen::genSSE41Intrinsic(GenTreeHWIntrinsic* node)
     regNumber op4Reg = REG_NA;
     emitter*  emit   = getEmitter();
 
-    if ((op1 != nullptr) && !op1->OperIsList())
-    {
-        op1Reg = op1->gtRegNum;
-        genConsumeOperands(node);
-    }
+    genConsumeHWIntrinsicOperands(node);
 
     switch (intrinsicId)
     {
         case NI_SSE41_TestAllOnes:
         {
+            op1Reg           = op1->gtRegNum;
             regNumber tmpReg = node->GetSingleTempReg();
             assert(HWIntrinsicInfo::lookupIns(intrinsicId, node->gtSIMDBaseType) == INS_ptest);
             emit->emitIns_SIMD_R_R_R(INS_pcmpeqd, emitTypeSize(TYP_SIMD16), tmpReg, tmpReg, tmpReg);
@@ -1845,12 +1765,12 @@ void CodeGen::genSSE41Intrinsic(GenTreeHWIntrinsic* node)
                 if (baseType == TYP_FLOAT)
                 {
                     // extract instructions return to GP-registers, so it needs int size as the emitsize
-                    emit->emitIns_SIMD_R_R_I(ins, emitTypeSize(TYP_INT), tmpTargetReg, op1Reg, i);
+                    inst_RV_TT_IV(ins, emitTypeSize(TYP_INT), tmpTargetReg, op1, i);
                     emit->emitIns_R_R(INS_mov_i2xmm, EA_4BYTE, targetReg, tmpTargetReg);
                 }
                 else
                 {
-                    emit->emitIns_SIMD_R_R_I(ins, emitTypeSize(TYP_INT), targetReg, op1Reg, i);
+                    inst_RV_TT_IV(ins, emitTypeSize(TYP_INT), targetReg, op1, i);
                 }
             };
 
@@ -1896,8 +1816,8 @@ void CodeGen::genSSE42Intrinsic(GenTreeHWIntrinsic* node)
     var_types      targetType  = node->TypeGet();
     emitter*       emit        = getEmitter();
 
+    genConsumeHWIntrinsicOperands(node);
     regNumber op1Reg = op1->gtRegNum;
-    genConsumeOperands(node);
 
     assert(targetReg != REG_NA);
     assert(op1Reg != REG_NA);
@@ -1966,18 +1886,15 @@ void CodeGen::genAvxOrAvx2Intrinsic(GenTreeHWIntrinsic* node)
     regNumber      targetReg   = node->gtRegNum;
     emitter*       emit        = getEmitter();
 
-    if ((op1 != nullptr) && !op1->OperIsList())
-    {
-        op1Reg = op1->gtRegNum;
-        genConsumeOperands(node);
-    }
+    genConsumeHWIntrinsicOperands(node);
 
     switch (intrinsicId)
     {
         case NI_AVX2_ConvertToInt32:
         case NI_AVX2_ConvertToUInt32:
         {
-            assert(op2 == nullptr);
+            op1Reg = op1->gtRegNum;
+            assert(numArgs == 1);
             assert((baseType == TYP_INT) || (baseType == TYP_UINT));
             instruction ins = HWIntrinsicInfo::lookupIns(intrinsicId, baseType);
             emit->emitIns_R_R(ins, emitActualTypeSize(baseType), op1Reg, targetReg);
@@ -1992,16 +1909,13 @@ void CodeGen::genAvxOrAvx2Intrinsic(GenTreeHWIntrinsic* node)
             GenTreeArgList* list = op1->AsArgList();
             op1                  = list->Current();
             op1Reg               = op1->gtRegNum;
-            genConsumeRegs(op1);
 
             list   = list->Rest();
             op2    = list->Current();
             op2Reg = op2->gtRegNum;
-            genConsumeRegs(op2);
 
             list         = list->Rest();
             GenTree* op3 = list->Current();
-            genConsumeRegs(op3);
 
             list             = list->Rest();
             GenTree* op4     = nullptr;
@@ -2017,12 +1931,11 @@ void CodeGen::genAvxOrAvx2Intrinsic(GenTreeHWIntrinsic* node)
             if (numArgs == 5)
             {
                 assert(intrinsicId == NI_AVX2_GatherMaskVector128 || intrinsicId == NI_AVX2_GatherMaskVector256);
-                op4    = list->Current();
-                list   = list->Rest();
-                lastOp = list->Current();
-                op3Reg = op3->gtRegNum;
-                op4Reg = op4->gtRegNum;
-                genConsumeRegs(op4);
+                op4          = list->Current();
+                list         = list->Rest();
+                lastOp       = list->Current();
+                op3Reg       = op3->gtRegNum;
+                op4Reg       = op4->gtRegNum;
                 addrBaseReg  = op2Reg;
                 addrIndexReg = op3Reg;
                 indexOp      = op3;
@@ -2157,10 +2070,7 @@ void CodeGen::genBMI1OrBMI2Intrinsic(GenTreeHWIntrinsic* node)
     assert(targetReg != REG_NA);
     assert(op1 != nullptr);
 
-    if (!op1->OperIsList())
-    {
-        genConsumeOperands(node);
-    }
+    genConsumeHWIntrinsicOperands(node);
 
     switch (intrinsicId)
     {
@@ -2224,16 +2134,13 @@ void CodeGen::genBMI1OrBMI2Intrinsic(GenTreeHWIntrinsic* node)
             {
                 GenTreeArgList* argList = op1->AsArgList();
                 op1                     = argList->Current();
-                genConsumeRegs(op1);
-                op1Reg  = op1->gtRegNum;
-                argList = argList->Rest();
-                op2     = argList->Current();
-                genConsumeRegs(op2);
-                op2Reg       = op2->gtRegNum;
-                argList      = argList->Rest();
-                GenTree* op3 = argList->Current();
-                genConsumeRegs(op3);
-                op3Reg = op3->gtRegNum;
+                op1Reg                  = op1->gtRegNum;
+                argList                 = argList->Rest();
+                op2                     = argList->Current();
+                op2Reg                  = op2->gtRegNum;
+                argList                 = argList->Rest();
+                GenTree* op3            = argList->Current();
+                op3Reg                  = op3->gtRegNum;
                 assert(op3Reg != op1Reg);
                 assert(op3Reg != targetReg);
                 assert(op3Reg != REG_EDX);
@@ -2288,22 +2195,16 @@ void CodeGen::genFMAIntrinsic(GenTreeHWIntrinsic* node)
     regNumber      targetReg   = node->gtRegNum;
 
     assert(HWIntrinsicInfo::lookupNumArgs(node) == 3);
-    assert(op1 != nullptr);
-    assert(op1->OperIsList());
-    assert(op1->gtGetOp2()->OperIsList());
-    assert(op1->gtGetOp2()->gtGetOp2()->OperIsList());
 
+    genConsumeHWIntrinsicOperands(node);
     GenTreeArgList* argList = op1->AsArgList();
     op1                     = argList->Current();
-    genConsumeRegs(op1);
 
     argList      = argList->Rest();
     GenTree* op2 = argList->Current();
-    genConsumeRegs(op2);
 
     argList      = argList->Rest();
     GenTree* op3 = argList->Current();
-    genConsumeRegs(op3);
 
     regNumber op1Reg;
     regNumber op2Reg;
index 60e0f80..6434831 100644 (file)
@@ -114,11 +114,11 @@ HARDWARE_INTRINSIC(SSE_ConvertScalarToVector128Single,              "ConvertScal
 HARDWARE_INTRINSIC(SSE_ConvertToInt32WithTruncation,                "ConvertToInt32WithTruncation",                 SSE,          -1,              16,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_cvttss2si,      INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE_Divide,                                      "Divide",                                       SSE,          -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_divps,          INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSE_DivideScalar,                                "DivideScalar",                                 SSE,          -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_divss,          INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_CopyUpperBits)
-HARDWARE_INTRINSIC(SSE_LoadAlignedVector128,                        "LoadAlignedVector128",                         SSE,          -1,              16,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_movaps,         INS_invalid},           HW_Category_MemoryLoad,             HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE_LoadHigh,                                    "LoadHigh",                                     SSE,          -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_movhps,         INS_invalid},           HW_Category_MemoryLoad,             HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE_LoadLow,                                     "LoadLow",                                      SSE,          -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_movlps,         INS_invalid},           HW_Category_MemoryLoad,             HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE_LoadScalarVector128,                         "LoadScalarVector128",                          SSE,          -1,              16,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_movss,          INS_invalid},           HW_Category_MemoryLoad,             HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE_LoadVector128,                               "LoadVector128",                                SSE,          -1,              16,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_movups,         INS_invalid},           HW_Category_MemoryLoad,             HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE_LoadAlignedVector128,                        "LoadAlignedVector128",                         SSE,          -1,              16,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_movaps,         INS_invalid},           HW_Category_MemoryLoad,             HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE_LoadHigh,                                    "LoadHigh",                                     SSE,          -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_movhps,         INS_invalid},           HW_Category_MemoryLoad,             HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE_LoadLow,                                     "LoadLow",                                      SSE,          -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_movlps,         INS_invalid},           HW_Category_MemoryLoad,             HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE_LoadScalarVector128,                         "LoadScalarVector128",                          SSE,          -1,              16,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_movss,          INS_invalid},           HW_Category_MemoryLoad,             HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE_LoadVector128,                               "LoadVector128",                                SSE,          -1,              16,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_movups,         INS_invalid},           HW_Category_MemoryLoad,             HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE_Max,                                         "Max",                                          SSE,          -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_maxps,          INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_Commutative)
 HARDWARE_INTRINSIC(SSE_MaxScalar,                                   "MaxScalar",                                    SSE,          -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_maxss,          INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_CopyUpperBits)
 HARDWARE_INTRINSIC(SSE_Min,                                         "Min",                                          SSE,          -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_minps,          INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_Commutative)
@@ -141,13 +141,13 @@ HARDWARE_INTRINSIC(SSE_ReciprocalSqrtScalar,                        "ReciprocalS
 HARDWARE_INTRINSIC(SSE_Shuffle,                                     "Shuffle",                                      SSE,          -1,              16,           3,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_shufps,         INS_invalid},           HW_Category_IMM,                    HW_Flag_FullRangeIMM)
 HARDWARE_INTRINSIC(SSE_Sqrt,                                        "Sqrt",                                         SSE,          -1,              16,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_sqrtps,         INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE_SqrtScalar,                                  "SqrtScalar",                                   SSE,          -1,              16,          -1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_sqrtss,         INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_CopyUpperBits)
-HARDWARE_INTRINSIC(SSE_Store,                                       "Store",                                        SSE,          -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_movups,         INS_invalid},           HW_Category_MemoryStore,            HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE_StoreAligned,                                "StoreAligned",                                 SSE,          -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_movaps,         INS_invalid},           HW_Category_MemoryStore,            HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE_StoreAlignedNonTemporal,                     "StoreAlignedNonTemporal",                      SSE,          -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_movntps,        INS_invalid},           HW_Category_MemoryStore,            HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE_Store,                                       "Store",                                        SSE,          -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_movups,         INS_invalid},           HW_Category_MemoryStore,            HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE_StoreAligned,                                "StoreAligned",                                 SSE,          -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_movaps,         INS_invalid},           HW_Category_MemoryStore,            HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE_StoreAlignedNonTemporal,                     "StoreAlignedNonTemporal",                      SSE,          -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_movntps,        INS_invalid},           HW_Category_MemoryStore,            HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE_StoreFence,                                  "StoreFence",                                   SSE,          -1,               0,           0,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_Special,                HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE_StoreHigh,                                   "StoreHigh",                                    SSE,          -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_movhps,         INS_invalid},           HW_Category_MemoryStore,            HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE_StoreLow,                                    "StoreLow",                                     SSE,          -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_movlps,         INS_invalid},           HW_Category_MemoryStore,            HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE_StoreScalar,                                 "StoreScalar",                                  SSE,          -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_movss,          INS_invalid},           HW_Category_MemoryStore,            HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE_StoreHigh,                                   "StoreHigh",                                    SSE,          -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_movhps,         INS_invalid},           HW_Category_MemoryStore,            HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE_StoreLow,                                    "StoreLow",                                     SSE,          -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_movlps,         INS_invalid},           HW_Category_MemoryStore,            HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE_StoreScalar,                                 "StoreScalar",                                  SSE,          -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_movss,          INS_invalid},           HW_Category_MemoryStore,            HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE_Subtract,                                    "Subtract",                                     SSE,          -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_subps,          INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSE_SubtractScalar,                              "SubtractScalar",                               SSE,          -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_subss,          INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_CopyUpperBits)
 HARDWARE_INTRINSIC(SSE_UnpackHigh,                                  "UnpackHigh",                                   SSE,          -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_unpckhps,       INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
@@ -227,12 +227,12 @@ HARDWARE_INTRINSIC(SSE2_Divide,                                     "Divide",
 HARDWARE_INTRINSIC(SSE2_DivideScalar,                               "DivideScalar",                                 SSE2,         -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_divsd},             HW_Category_SIMDScalar,             HW_Flag_CopyUpperBits)
 HARDWARE_INTRINSIC(SSE2_Extract,                                    "Extract",                                      SSE2,         -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_pextrw,         INS_pextrw,         INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_IMM,                    HW_Flag_FullRangeIMM|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE2_Insert,                                     "Insert",                                       SSE2,         -1,              16,           3,     {INS_invalid,           INS_invalid,        INS_pinsrw,         INS_pinsrw,         INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_IMM,                    HW_Flag_FullRangeIMM)
-HARDWARE_INTRINSIC(SSE2_LoadAlignedVector128,                       "LoadAlignedVector128",                         SSE2,         -1,              16,           1,     {INS_movdqa,            INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_invalid,        INS_movapd},            HW_Category_MemoryLoad,             HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE2_LoadAlignedVector128,                       "LoadAlignedVector128",                         SSE2,         -1,              16,           1,     {INS_movdqa,            INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_invalid,        INS_movapd},            HW_Category_MemoryLoad,             HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE2_LoadFence,                                  "LoadFence",                                    SSE2,         -1,               0,           0,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_Special,                HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE2_LoadHigh,                                   "LoadHigh",                                     SSE2,         -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_movhpd},            HW_Category_MemoryLoad,             HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE2_LoadLow,                                    "LoadLow",                                      SSE2,         -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_movlpd},            HW_Category_MemoryLoad,             HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE2_LoadScalarVector128,                        "LoadScalarVector128",                          SSE2,         -1,              16,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_movd,           INS_movd,           INS_movq,           INS_movq,           INS_invalid,        INS_movsdsse2},         HW_Category_MemoryLoad,             HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE2_LoadVector128,                              "LoadVector128",                                SSE2,         -1,              16,           1,     {INS_movdqu,            INS_movdqu,         INS_movdqu,         INS_movdqu,         INS_movdqu,         INS_movdqu,         INS_movdqu,         INS_movdqu,         INS_invalid,        INS_movupd},            HW_Category_MemoryLoad,             HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE2_LoadHigh,                                   "LoadHigh",                                     SSE2,         -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_movhpd},            HW_Category_MemoryLoad,             HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE2_LoadLow,                                    "LoadLow",                                      SSE2,         -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_movlpd},            HW_Category_MemoryLoad,             HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE2_LoadScalarVector128,                        "LoadScalarVector128",                          SSE2,         -1,              16,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_movd,           INS_movd,           INS_movq,           INS_movq,           INS_invalid,        INS_movsdsse2},         HW_Category_MemoryLoad,             HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE2_LoadVector128,                              "LoadVector128",                                SSE2,         -1,              16,           1,     {INS_movdqu,            INS_movdqu,         INS_movdqu,         INS_movdqu,         INS_movdqu,         INS_movdqu,         INS_movdqu,         INS_movdqu,         INS_invalid,        INS_movupd},            HW_Category_MemoryLoad,             HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE2_MaskMove,                                   "MaskMove",                                     SSE2,         -1,              16,           3,     {INS_maskmovdqu,        INS_maskmovdqu,     INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_MemoryStore,            HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE2_Max,                                        "Max",                                          SSE2,         -1,              16,           2,     {INS_invalid,           INS_pmaxub,         INS_pmaxsw,         INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_maxpd},             HW_Category_SimpleSIMD,             HW_Flag_Commutative)
 HARDWARE_INTRINSIC(SSE2_MemoryFence,                                "MemoryFence",                                  SSE2,         -1,               0,           0,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_Special,                HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
@@ -260,13 +260,13 @@ HARDWARE_INTRINSIC(SSE2_ShuffleHigh,                                "ShuffleHigh
 HARDWARE_INTRINSIC(SSE2_ShuffleLow,                                 "ShuffleLow",                                   SSE2,         -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_pshuflw,        INS_pshuflw,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_IMM,                    HW_Flag_FullRangeIMM)
 HARDWARE_INTRINSIC(SSE2_Sqrt,                                       "Sqrt",                                         SSE2,         -1,              16,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_sqrtpd},            HW_Category_SimpleSIMD,             HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE2_SqrtScalar,                                 "SqrtScalar",                                   SSE2,         -1,              16,          -1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_sqrtsd},            HW_Category_SIMDScalar,             HW_Flag_CopyUpperBits)
-HARDWARE_INTRINSIC(SSE2_Store,                                      "Store",                                        SSE2,         -1,              16,           2,     {INS_movdqu,            INS_movdqu,         INS_movdqu,         INS_movdqu,         INS_movdqu,         INS_movdqu,         INS_movdqu,         INS_movdqu,         INS_invalid,        INS_movupd},            HW_Category_MemoryStore,            HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE2_StoreAligned,                               "StoreAligned",                                 SSE2,         -1,              16,           2,     {INS_movdqa,            INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_invalid,        INS_movapd},            HW_Category_MemoryStore,            HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE2_StoreAlignedNonTemporal,                    "StoreAlignedNonTemporal",                      SSE2,         -1,              16,           2,     {INS_movntdq,           INS_movntdq,        INS_movntdq,        INS_movntdq,        INS_movntdq,        INS_movntdq,        INS_movntdq,        INS_movntdq,        INS_invalid,        INS_movntpd},           HW_Category_MemoryStore,            HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE2_StoreHigh,                                  "StoreHigh",                                    SSE2,         -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_movhpd},            HW_Category_MemoryStore,            HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE2_StoreLow,                                   "StoreLow",                                     SSE2,         -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_movq,           INS_movq,           INS_invalid,        INS_movlpd},            HW_Category_MemoryStore,            HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE2_StoreNonTemporal,                           "StoreNonTemporal",                             SSE2,         -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_movnti,         INS_movnti,         INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_MemoryStore,            HW_Flag_NoContainment|HW_Flag_NoRMWSemantics|HW_Flag_SpecialCodeGen)
-HARDWARE_INTRINSIC(SSE2_StoreScalar,                                "StoreScalar",                                  SSE2,         -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_movsdsse2},         HW_Category_MemoryStore,            HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE2_Store,                                      "Store",                                        SSE2,         -1,              16,           2,     {INS_movdqu,            INS_movdqu,         INS_movdqu,         INS_movdqu,         INS_movdqu,         INS_movdqu,         INS_movdqu,         INS_movdqu,         INS_invalid,        INS_movupd},            HW_Category_MemoryStore,            HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE2_StoreAligned,                               "StoreAligned",                                 SSE2,         -1,              16,           2,     {INS_movdqa,            INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_invalid,        INS_movapd},            HW_Category_MemoryStore,            HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE2_StoreAlignedNonTemporal,                    "StoreAlignedNonTemporal",                      SSE2,         -1,              16,           2,     {INS_movntdq,           INS_movntdq,        INS_movntdq,        INS_movntdq,        INS_movntdq,        INS_movntdq,        INS_movntdq,        INS_movntdq,        INS_invalid,        INS_movntpd},           HW_Category_MemoryStore,            HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE2_StoreHigh,                                  "StoreHigh",                                    SSE2,         -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_movhpd},            HW_Category_MemoryStore,            HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE2_StoreLow,                                   "StoreLow",                                     SSE2,         -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_movq,           INS_movq,           INS_invalid,        INS_movlpd},            HW_Category_MemoryStore,            HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE2_StoreNonTemporal,                           "StoreNonTemporal",                             SSE2,         -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_movnti,         INS_movnti,         INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_MemoryStore,            HW_Flag_NoRMWSemantics|HW_Flag_SpecialCodeGen)
+HARDWARE_INTRINSIC(SSE2_StoreScalar,                                "StoreScalar",                                  SSE2,         -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_movsdsse2},         HW_Category_MemoryStore,            HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE2_Subtract,                                   "Subtract",                                     SSE2,         -1,              16,           2,     {INS_psubb,             INS_psubb,          INS_psubw,          INS_psubw,          INS_psubd,          INS_psubd,          INS_psubq,          INS_psubq,          INS_invalid,        INS_subpd},             HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSE2_SubtractSaturate,                           "SubtractSaturate",                             SSE2,         -1,              16,           2,     {INS_psubsb,            INS_psubusb,        INS_psubsw,         INS_psubusw,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSE2_SubtractScalar,                             "SubtractScalar",                               SSE2,         -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_subsd},             HW_Category_SIMDScalar,             HW_Flag_CopyUpperBits)
@@ -286,7 +286,7 @@ HARDWARE_INTRINSIC(SSE2_X64_ConvertToUInt64,                        "ConvertToUI
 HARDWARE_INTRINSIC(SSE2_X64_ConvertScalarToVector128Double,         "ConvertScalarToVector128Double",               SSE2_X64,     -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_cvtsi2sd,       INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromSecondArg)
 HARDWARE_INTRINSIC(SSE2_X64_ConvertScalarToVector128Int64,          "ConvertScalarToVector128Int64",                SSE2_X64,     -1,              16,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_mov_i2xmm,      INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_NoRMWSemantics|HW_Flag_SpecialCodeGen)
 HARDWARE_INTRINSIC(SSE2_X64_ConvertScalarToVector128UInt64,         "ConvertScalarToVector128UInt64",               SSE2_X64,     -1,              16,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_mov_i2xmm,      INS_invalid,        INS_invalid},           HW_Category_SIMDScalar,             HW_Flag_NoRMWSemantics|HW_Flag_SpecialCodeGen)
-HARDWARE_INTRINSIC(SSE2_X64_StoreNonTemporal,                       "StoreNonTemporal",                             SSE2_X64,     -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_movnti,         INS_movnti,         INS_invalid,        INS_invalid},           HW_Category_MemoryStore,            HW_Flag_NoContainment|HW_Flag_NoRMWSemantics|HW_Flag_SpecialCodeGen)
+HARDWARE_INTRINSIC(SSE2_X64_StoreNonTemporal,                       "StoreNonTemporal",                             SSE2_X64,     -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_movnti,         INS_movnti,         INS_invalid,        INS_invalid},           HW_Category_MemoryStore,            HW_Flag_NoRMWSemantics|HW_Flag_SpecialCodeGen)
 
 // ***************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
 //                 Intrinsic ID                                     Function name                                   ISA         ival        SIMD size       NumArg                                                                                                     instructions                                                                                                     Category                            Flags
@@ -297,8 +297,8 @@ HARDWARE_INTRINSIC(SSE3_IsSupported,                                "get_IsSuppo
 HARDWARE_INTRINSIC(SSE3_AddSubtract,                                "AddSubtract",                                  SSE3,         -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_addsubps,       INS_addsubpd},          HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSE3_HorizontalAdd,                              "HorizontalAdd",                                SSE3,         -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_haddps,         INS_haddpd},            HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSE3_HorizontalSubtract,                         "HorizontalSubtract",                           SSE3,         -1,              16,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_hsubps,         INS_hsubpd},            HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(SSE3_LoadAndDuplicateToVector128,                "LoadAndDuplicateToVector128",                  SSE3,         -1,              16,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_lddqu,          INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_movddup},           HW_Category_MemoryLoad,             HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(SSE3_LoadDquVector128,                           "LoadDquVector128",                             SSE3,         -1,              16,           1,     {INS_lddqu,             INS_lddqu,          INS_lddqu,          INS_lddqu,          INS_lddqu,          INS_lddqu,          INS_lddqu,          INS_lddqu,          INS_invalid,        INS_invalid},           HW_Category_MemoryLoad,             HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE3_LoadAndDuplicateToVector128,                "LoadAndDuplicateToVector128",                  SSE3,         -1,              16,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_lddqu,          INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_movddup},           HW_Category_MemoryLoad,             HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE3_LoadDquVector128,                           "LoadDquVector128",                             SSE3,         -1,              16,           1,     {INS_lddqu,             INS_lddqu,          INS_lddqu,          INS_lddqu,          INS_lddqu,          INS_lddqu,          INS_lddqu,          INS_lddqu,          INS_invalid,        INS_invalid},           HW_Category_MemoryLoad,             HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE3_MoveAndDuplicate,                           "MoveAndDuplicate",                             SSE3,         -1,              16,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_movddup},           HW_Category_SimpleSIMD,             HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE3_MoveHighAndDuplicate,                       "MoveHighAndDuplicate",                         SSE3,         -1,              16,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_movshdup,       INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE3_MoveLowAndDuplicate,                        "MoveLowAndDuplicate",                          SSE3,         -1,              16,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_movsldup,       INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoRMWSemantics)
@@ -339,7 +339,7 @@ HARDWARE_INTRINSIC(SSE41_Extract,                                   "Extract",
 HARDWARE_INTRINSIC(SSE41_Floor,                                     "Floor",                                        SSE41,         9,              16,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_roundps,        INS_roundpd},           HW_Category_SimpleSIMD,             HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE41_FloorScalar,                               "FloorScalar",                                  SSE41,         9,              16,          -1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_roundss,        INS_roundsd},           HW_Category_SIMDScalar,             HW_Flag_CopyUpperBits)
 HARDWARE_INTRINSIC(SSE41_Insert,                                    "Insert",                                       SSE41,        -1,              16,           3,     {INS_pinsrb,            INS_pinsrb,         INS_invalid,        INS_invalid,        INS_pinsrd,         INS_pinsrd,         INS_invalid,        INS_invalid,        INS_insertps,       INS_invalid},           HW_Category_IMM,                    HW_Flag_FullRangeIMM)
-HARDWARE_INTRINSIC(SSE41_LoadAlignedVector128NonTemporal,           "LoadAlignedVector128NonTemporal",              SSE41,        -1,              16,           1,     {INS_movntdqa,          INS_movntdqa,       INS_movntdqa,       INS_movntdqa,       INS_movntdqa,       INS_movntdqa,       INS_movntdqa,       INS_movntdqa,       INS_invalid,        INS_invalid},           HW_Category_MemoryLoad,             HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(SSE41_LoadAlignedVector128NonTemporal,           "LoadAlignedVector128NonTemporal",              SSE41,        -1,              16,           1,     {INS_movntdqa,          INS_movntdqa,       INS_movntdqa,       INS_movntdqa,       INS_movntdqa,       INS_movntdqa,       INS_movntdqa,       INS_movntdqa,       INS_invalid,        INS_invalid},           HW_Category_MemoryLoad,             HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(SSE41_Max,                                       "Max",                                          SSE41,        -1,              16,           2,     {INS_pmaxsb,            INS_invalid,        INS_invalid,        INS_pmaxuw,         INS_pmaxsd,         INS_pmaxud,         INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSE41_Min,                                       "Min",                                          SSE41,        -1,              16,           2,     {INS_pminsb,            INS_invalid,        INS_invalid,        INS_pminuw,         INS_pminsd,         INS_pminud,         INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSE41_MinHorizontal,                             "MinHorizontal",                                SSE41,        -1,              16,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_phminposuw,     INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoRMWSemantics)
@@ -403,9 +403,9 @@ HARDWARE_INTRINSIC(AVX_AndNot,                                      "AndNot",
 HARDWARE_INTRINSIC(AVX_Blend,                                       "Blend",                                        AVX,          -1,              32,           3,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_blendps,        INS_blendpd},           HW_Category_IMM,                    HW_Flag_FullRangeIMM)
 HARDWARE_INTRINSIC(AVX_BlendVariable,                               "BlendVariable",                                AVX,          -1,              32,           3,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_vblendvps,      INS_vblendvpd},         HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX_Ceiling,                                     "Ceiling",                                      AVX,          10,              32,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_roundps,        INS_roundpd},           HW_Category_SimpleSIMD,             HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX_BroadcastScalarToVector128,                  "BroadcastScalarToVector128",                   AVX,          -1,              16,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_vbroadcastss,   INS_invalid},           HW_Category_MemoryLoad,             HW_Flag_NoContainment)
-HARDWARE_INTRINSIC(AVX_BroadcastScalarToVector256,                  "BroadcastScalarToVector256",                   AVX,          -1,              32,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_vbroadcastss,   INS_vbroadcastsd},      HW_Category_MemoryLoad,             HW_Flag_NoContainment)
-HARDWARE_INTRINSIC(AVX_BroadcastVector128ToVector256,               "BroadcastVector128ToVector256",                AVX,          -1,              32,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_vbroadcastf128, INS_vbroadcastf128},    HW_Category_MemoryLoad,             HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(AVX_BroadcastScalarToVector128,                  "BroadcastScalarToVector128",                   AVX,          -1,              16,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_vbroadcastss,   INS_invalid},           HW_Category_MemoryLoad,             HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(AVX_BroadcastScalarToVector256,                  "BroadcastScalarToVector256",                   AVX,          -1,              32,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_vbroadcastss,   INS_vbroadcastsd},      HW_Category_MemoryLoad,             HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(AVX_BroadcastVector128ToVector256,               "BroadcastVector128ToVector256",                AVX,          -1,              32,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_vbroadcastf128, INS_vbroadcastf128},    HW_Category_MemoryLoad,             HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX_Compare,                                     "Compare",                                      AVX,          -1,              32,           3,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_cmpps,          INS_cmppd},             HW_Category_IMM,                    HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX_CompareScalar,                               "CompareScalar",                                AVX,          -1,              16,           3,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_cmpss,          INS_cmpsd},             HW_Category_IMM,                    HW_Flag_CopyUpperBits)
 HARDWARE_INTRINSIC(AVX_ConvertToVector128Int32,                     "ConvertToVector128Int32",                      AVX,          -1,              32,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_cvtpd2dq,       INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoRMWSemantics)
@@ -424,12 +424,12 @@ HARDWARE_INTRINSIC(AVX_Floor,                                       "Floor",
 HARDWARE_INTRINSIC(AVX_HorizontalAdd,                               "HorizontalAdd",                                AVX,          -1,              32,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_haddps,         INS_haddpd},            HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX_HorizontalSubtract,                          "HorizontalSubtract",                           AVX,          -1,              32,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_hsubps,         INS_hsubpd},            HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX_InsertVector128,                             "InsertVector128",                              AVX,          -1,              32,           3,     {INS_vinsertf128,       INS_vinsertf128,    INS_vinsertf128,    INS_vinsertf128,    INS_vinsertf128,    INS_vinsertf128,    INS_vinsertf128,    INS_vinsertf128,    INS_vinsertf128,    INS_vinsertf128},       HW_Category_IMM,                    HW_Flag_FullRangeIMM)
-HARDWARE_INTRINSIC(AVX_LoadAlignedVector256,                        "LoadAlignedVector256",                         AVX,          -1,              32,           1,     {INS_movdqa,            INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movaps,         INS_movapd},            HW_Category_MemoryLoad,             HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX_LoadDquVector256,                            "LoadDquVector256",                             AVX,          -1,              32,           1,     {INS_lddqu,             INS_lddqu,          INS_lddqu,          INS_lddqu,          INS_lddqu,          INS_lddqu,          INS_lddqu,          INS_lddqu,          INS_invalid,        INS_invalid},           HW_Category_MemoryLoad,             HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX_LoadVector256,                               "LoadVector256",                                AVX,          -1,              32,           1,     {INS_movdqu,            INS_movdqu,         INS_movdqu,         INS_movdqu,         INS_movdqu,         INS_movdqu,         INS_movdqu,         INS_movdqu,         INS_movups,         INS_movupd},            HW_Category_MemoryLoad,             HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(AVX_LoadAlignedVector256,                        "LoadAlignedVector256",                         AVX,          -1,              32,           1,     {INS_movdqa,            INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movaps,         INS_movapd},            HW_Category_MemoryLoad,             HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(AVX_LoadDquVector256,                            "LoadDquVector256",                             AVX,          -1,              32,           1,     {INS_lddqu,             INS_lddqu,          INS_lddqu,          INS_lddqu,          INS_lddqu,          INS_lddqu,          INS_lddqu,          INS_lddqu,          INS_invalid,        INS_invalid},           HW_Category_MemoryLoad,             HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(AVX_LoadVector256,                               "LoadVector256",                                AVX,          -1,              32,           1,     {INS_movdqu,            INS_movdqu,         INS_movdqu,         INS_movdqu,         INS_movdqu,         INS_movdqu,         INS_movdqu,         INS_movdqu,         INS_movups,         INS_movupd},            HW_Category_MemoryLoad,             HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(AVX_Max,                                         "Max",                                          AVX,          -1,              32,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_maxps,          INS_maxpd},             HW_Category_SimpleSIMD,             HW_Flag_Commutative)
 HARDWARE_INTRINSIC(AVX_Min,                                         "Min",                                          AVX,          -1,              32,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_minps,          INS_minpd},             HW_Category_SimpleSIMD,             HW_Flag_Commutative)
-HARDWARE_INTRINSIC(AVX_MaskLoad,                                    "MaskLoad",                                     AVX,          -1,               0,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_vmaskmovps,     INS_vmaskmovpd},        HW_Category_MemoryLoad,             HW_Flag_NoContainment|HW_Flag_UnfixedSIMDSize)
+HARDWARE_INTRINSIC(AVX_MaskLoad,                                    "MaskLoad",                                     AVX,          -1,               0,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_vmaskmovps,     INS_vmaskmovpd},        HW_Category_MemoryLoad,             HW_Flag_UnfixedSIMDSize)
 HARDWARE_INTRINSIC(AVX_MaskStore,                                   "MaskStore",                                    AVX,          -1,               0,           3,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_vmaskmovps,     INS_vmaskmovpd},        HW_Category_MemoryStore,            HW_Flag_NoContainment|HW_Flag_UnfixedSIMDSize|HW_Flag_BaseTypeFromSecondArg)
 HARDWARE_INTRINSIC(AVX_MoveMask,                                    "MoveMask",                                     AVX,          -1,              32,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_movmskps,       INS_movmskpd},          HW_Category_SimpleSIMD,             HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg)
 HARDWARE_INTRINSIC(AVX_Multiply,                                    "Multiply",                                     AVX,          -1,              32,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_mulps,          INS_mulpd},             HW_Category_SimpleSIMD,             HW_Flag_Commutative)
@@ -446,9 +446,9 @@ HARDWARE_INTRINSIC(AVX_RoundToPositiveInfinity,                     "RoundToPosi
 HARDWARE_INTRINSIC(AVX_RoundToZero,                                 "RoundToZero",                                  AVX,          11,              32,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_roundps,        INS_roundpd},           HW_Category_SimpleSIMD,             HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(AVX_Shuffle,                                     "Shuffle",                                      AVX,          -1,              32,           3,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_shufps,         INS_shufpd},            HW_Category_IMM,                    HW_Flag_NoRMWSemantics|HW_Flag_FullRangeIMM)
 HARDWARE_INTRINSIC(AVX_Sqrt,                                        "Sqrt",                                         AVX,          -1,              32,           1,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_sqrtps,         INS_sqrtpd},            HW_Category_SimpleSIMD,             HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX_Store,                                       "Store",                                        AVX,          -1,              32,           2,     {INS_movdqu,            INS_movdqu,         INS_movdqu,         INS_movdqu,         INS_movdqu,         INS_movdqu,         INS_movdqu,         INS_movdqu,         INS_movups,         INS_movupd},            HW_Category_MemoryStore,            HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX_StoreAligned,                                "StoreAligned",                                 AVX,          -1,              32,           2,     {INS_movdqa,            INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movaps,         INS_movapd},            HW_Category_MemoryStore,            HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
-HARDWARE_INTRINSIC(AVX_StoreAlignedNonTemporal,                     "StoreAlignedNonTemporal",                      AVX,          -1,              32,           2,     {INS_movntdq,           INS_movntdq,        INS_movntdq,        INS_movntdq,        INS_movntdq,        INS_movntdq,        INS_movntdq,        INS_movntdq,        INS_movntps,        INS_movntpd},           HW_Category_MemoryStore,            HW_Flag_NoContainment|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(AVX_Store,                                       "Store",                                        AVX,          -1,              32,           2,     {INS_movdqu,            INS_movdqu,         INS_movdqu,         INS_movdqu,         INS_movdqu,         INS_movdqu,         INS_movdqu,         INS_movdqu,         INS_movups,         INS_movupd},            HW_Category_MemoryStore,            HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(AVX_StoreAligned,                                "StoreAligned",                                 AVX,          -1,              32,           2,     {INS_movdqa,            INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movdqa,         INS_movaps,         INS_movapd},            HW_Category_MemoryStore,            HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(AVX_StoreAlignedNonTemporal,                     "StoreAlignedNonTemporal",                      AVX,          -1,              32,           2,     {INS_movntdq,           INS_movntdq,        INS_movntdq,        INS_movntdq,        INS_movntdq,        INS_movntdq,        INS_movntdq,        INS_movntdq,        INS_movntps,        INS_movntpd},           HW_Category_MemoryStore,            HW_Flag_NoRMWSemantics)
 HARDWARE_INTRINSIC(AVX_Subtract,                                    "Subtract",                                     AVX,          -1,              32,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_subps,          INS_subpd},             HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX_TestC,                                       "TestC",                                        AVX,          -1,               0,           2,     {INS_ptest,             INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_vtestps,        INS_vtestpd},           HW_Category_SimpleSIMD,             HW_Flag_UnfixedSIMDSize|HW_Flag_MultiIns|HW_Flag_BaseTypeFromFirstArg)
 HARDWARE_INTRINSIC(AVX_TestNotZAndNotC,                             "TestNotZAndNotC",                              AVX,          -1,               0,           2,     {INS_ptest,             INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_ptest,          INS_vtestps,        INS_vtestpd},           HW_Category_SimpleSIMD,             HW_Flag_UnfixedSIMDSize|HW_Flag_MultiIns|HW_Flag_BaseTypeFromFirstArg)
@@ -472,9 +472,9 @@ HARDWARE_INTRINSIC(AVX2_AndNot,                                     "AndNot",
 HARDWARE_INTRINSIC(AVX2_Average,                                    "Average",                                      AVX2,         -1,              32,           2,     {INS_invalid,           INS_pavgb,          INS_invalid,        INS_pavgw,          INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_Commutative)
 HARDWARE_INTRINSIC(AVX2_Blend,                                      "Blend",                                        AVX2,         -1,               0,           3,     {INS_invalid,           INS_invalid,        INS_pblendw,        INS_pblendw,        INS_vpblendd,       INS_vpblendd,       INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_IMM,                    HW_Flag_UnfixedSIMDSize|HW_Flag_FullRangeIMM)
 HARDWARE_INTRINSIC(AVX2_BlendVariable,                              "BlendVariable",                                AVX2,         -1,              32,           3,     {INS_vpblendvb,         INS_vpblendvb,      INS_vpblendvb,      INS_vpblendvb,      INS_vpblendvb,      INS_vpblendvb,      INS_vpblendvb,      INS_vpblendvb,      INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
-HARDWARE_INTRINSIC(AVX2_BroadcastScalarToVector128,                 "BroadcastScalarToVector128",                   AVX2,         -1,              16,           1,     {INS_vpbroadcastb,      INS_vpbroadcastb,   INS_vpbroadcastw,   INS_vpbroadcastw,   INS_vpbroadcastd,   INS_vpbroadcastd,   INS_vpbroadcastq,   INS_vpbroadcastq,   INS_vbroadcastss,   INS_movddup},           HW_Category_SIMDScalar,             HW_Flag_NoContainment|HW_Flag_MaybeMemoryLoad)
-HARDWARE_INTRINSIC(AVX2_BroadcastScalarToVector256,                 "BroadcastScalarToVector256",                   AVX2,         -1,              32,           1,     {INS_vpbroadcastb,      INS_vpbroadcastb,   INS_vpbroadcastw,   INS_vpbroadcastw,   INS_vpbroadcastd,   INS_vpbroadcastd,   INS_vpbroadcastq,   INS_vpbroadcastq,   INS_vbroadcastss,   INS_vbroadcastsd},      HW_Category_SIMDScalar,             HW_Flag_NoContainment|HW_Flag_MaybeMemoryLoad)
-HARDWARE_INTRINSIC(AVX2_BroadcastVector128ToVector256,              "BroadcastVector128ToVector256",                AVX2,         -1,              32,           1,     {INS_vbroadcasti128,    INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_invalid,        INS_invalid},           HW_Category_MemoryLoad,             HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(AVX2_BroadcastScalarToVector128,                 "BroadcastScalarToVector128",                   AVX2,         -1,              16,           1,     {INS_vpbroadcastb,      INS_vpbroadcastb,   INS_vpbroadcastw,   INS_vpbroadcastw,   INS_vpbroadcastd,   INS_vpbroadcastd,   INS_vpbroadcastq,   INS_vpbroadcastq,   INS_vbroadcastss,   INS_movddup},           HW_Category_SIMDScalar,             HW_Flag_MaybeMemoryLoad)
+HARDWARE_INTRINSIC(AVX2_BroadcastScalarToVector256,                 "BroadcastScalarToVector256",                   AVX2,         -1,              32,           1,     {INS_vpbroadcastb,      INS_vpbroadcastb,   INS_vpbroadcastw,   INS_vpbroadcastw,   INS_vpbroadcastd,   INS_vpbroadcastd,   INS_vpbroadcastq,   INS_vpbroadcastq,   INS_vbroadcastss,   INS_vbroadcastsd},      HW_Category_SIMDScalar,             HW_Flag_MaybeMemoryLoad)
+HARDWARE_INTRINSIC(AVX2_BroadcastVector128ToVector256,              "BroadcastVector128ToVector256",                AVX2,         -1,              32,           1,     {INS_vbroadcasti128,    INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_vbroadcasti128, INS_invalid,        INS_invalid},           HW_Category_MemoryLoad,             HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX2_CompareEqual,                               "CompareEqual",                                 AVX2,         -1,              32,           2,     {INS_pcmpeqb,           INS_pcmpeqb,        INS_pcmpeqw,        INS_pcmpeqw,        INS_pcmpeqd,        INS_pcmpeqd,        INS_pcmpeqq,        INS_pcmpeqq,        INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_Commutative)
 HARDWARE_INTRINSIC(AVX2_CompareGreaterThan,                         "CompareGreaterThan",                           AVX2,         -1,              32,           2,     {INS_pcmpgtb,           INS_invalid,        INS_pcmpgtw,        INS_invalid,        INS_pcmpgtd,        INS_invalid,        INS_pcmpgtq,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX2_ExtractVector128,                           "ExtractVector128",                             AVX2,         -1,              32,           2,     {INS_vextracti128,      INS_vextracti128,   INS_vextracti128,   INS_vextracti128,   INS_vextracti128,   INS_vextracti128,   INS_vextracti128,   INS_vextracti128,   INS_invalid,        INS_invalid},           HW_Category_IMM,                    HW_Flag_FullRangeIMM)
@@ -495,9 +495,9 @@ HARDWARE_INTRINSIC(AVX2_HorizontalAddSaturate,                      "HorizontalA
 HARDWARE_INTRINSIC(AVX2_HorizontalSubtract,                         "HorizontalSubtract",                           AVX2,         -1,              32,           2,     {INS_invalid,           INS_invalid,        INS_phsubw,         INS_invalid,        INS_phsubd,         INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX2_HorizontalSubtractSaturate,                 "HorizontalSubtractSaturate",                   AVX2,         -1,              32,           2,     {INS_invalid,           INS_invalid,        INS_phsubsw,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(AVX2_InsertVector128,                            "InsertVector128",                              AVX2,         -1,              32,           3,     {INS_vinserti128,       INS_vinserti128,    INS_vinserti128,    INS_vinserti128,    INS_vinserti128,    INS_vinserti128,    INS_vinserti128,    INS_vinserti128,    INS_invalid,        INS_invalid},           HW_Category_IMM,                    HW_Flag_FullRangeIMM)
-HARDWARE_INTRINSIC(AVX2_LoadAlignedVector256NonTemporal,            "LoadAlignedVector256NonTemporal",              AVX2,         -1,              32,           1,     {INS_movntdqa,          INS_movntdqa,       INS_movntdqa,       INS_movntdqa,       INS_movntdqa,       INS_movntdqa,       INS_movntdqa,       INS_movntdqa,       INS_invalid,        INS_invalid},           HW_Category_MemoryLoad,             HW_Flag_NoContainment)
-HARDWARE_INTRINSIC(AVX2_MaskLoad,                                   "MaskLoad",                                     AVX2,         -1,               0,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_vpmaskmovd,     INS_vpmaskmovd,     INS_vpmaskmovq,     INS_vpmaskmovq,     INS_invalid,        INS_invalid},           HW_Category_MemoryLoad,             HW_Flag_NoContainment|HW_Flag_UnfixedSIMDSize)
-HARDWARE_INTRINSIC(AVX2_MaskStore,                                  "MaskStore",                                    AVX2,         -1,               0,           3,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_vpmaskmovd,     INS_vpmaskmovd,     INS_vpmaskmovq,     INS_vpmaskmovq,     INS_invalid,        INS_invalid},           HW_Category_MemoryStore,            HW_Flag_NoContainment|HW_Flag_UnfixedSIMDSize|HW_Flag_BaseTypeFromSecondArg)
+HARDWARE_INTRINSIC(AVX2_LoadAlignedVector256NonTemporal,            "LoadAlignedVector256NonTemporal",              AVX2,         -1,              32,           1,     {INS_movntdqa,          INS_movntdqa,       INS_movntdqa,       INS_movntdqa,       INS_movntdqa,       INS_movntdqa,       INS_movntdqa,       INS_movntdqa,       INS_invalid,        INS_invalid},           HW_Category_MemoryLoad,             HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(AVX2_MaskLoad,                                   "MaskLoad",                                     AVX2,         -1,               0,           2,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_vpmaskmovd,     INS_vpmaskmovd,     INS_vpmaskmovq,     INS_vpmaskmovq,     INS_invalid,        INS_invalid},           HW_Category_MemoryLoad,             HW_Flag_UnfixedSIMDSize)
+HARDWARE_INTRINSIC(AVX2_MaskStore,                                  "MaskStore",                                    AVX2,         -1,               0,           3,     {INS_invalid,           INS_invalid,        INS_invalid,        INS_invalid,        INS_vpmaskmovd,     INS_vpmaskmovd,     INS_vpmaskmovq,     INS_vpmaskmovq,     INS_invalid,        INS_invalid},           HW_Category_MemoryStore,            HW_Flag_UnfixedSIMDSize|HW_Flag_BaseTypeFromSecondArg)
 HARDWARE_INTRINSIC(AVX2_Max,                                        "Max",                                          AVX2,         -1,              32,           2,     {INS_pmaxsb,            INS_pmaxub,         INS_pmaxsw,         INS_pmaxuw,         INS_pmaxsd,         INS_pmaxud,         INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_Commutative)
 HARDWARE_INTRINSIC(AVX2_Min,                                        "Min",                                          AVX2,         -1,              32,           2,     {INS_pminsb,            INS_pminub,         INS_pminsw,         INS_pminuw,         INS_pminsd,         INS_pminud,         INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_Commutative)
 HARDWARE_INTRINSIC(AVX2_MoveMask,                                   "MoveMask",                                     AVX2,         -1,              32,           1,     {INS_pmovmskb,          INS_pmovmskb,       INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid,        INS_invalid},           HW_Category_SimpleSIMD,             HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg)
index fcc20e0..dc298ed 100644 (file)
@@ -847,6 +847,15 @@ GenTree* Compiler::impHWIntrinsic(NamedIntrinsic        intrinsic,
             case 1:
                 argType = JITtype2varType(strip(info.compCompHnd->getArgType(sig, argList, &argClass)));
                 op1     = getArgForHWIntrinsic(argType, argClass);
+                if ((category == HW_Category_MemoryLoad) && op1->OperIs(GT_CAST))
+                {
+                    // Although the API specifies a pointer, if what we have is a BYREF, that's what
+                    // we really want, so throw away the cast.
+                    if (op1->gtGetOp1()->TypeGet() == TYP_BYREF)
+                    {
+                        op1 = op1->gtGetOp1();
+                    }
+                }
                 retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, baseType, simdSize);
                 break;
             case 2:
index cbab22b..72a3fb9 100644 (file)
@@ -1039,7 +1039,126 @@ void CodeGen::inst_RV_RV_IV(instruction ins, emitAttr size, regNumber reg1, regN
 
     getEmitter()->emitIns_R_R_I(ins, size, reg1, reg2, ival);
 }
-#endif
+
+#ifdef FEATURE_HW_INTRINSICS
+//------------------------------------------------------------------------
+// inst_RV_TT_IV: Generates an instruction that takes 3 operands:
+//                a register operand, an operand that may be in memory or in a register, and an immediate,
+//                and that returns a value in a register
+//
+// Arguments:
+//    ins       -- The instruction being emitted
+//    attr      -- The emit attribute; its size is used to verify that the instruction is encodable
+//    reg1      -- The first operand, a register
+//    rmOp      -- The second operand: a contained memory node, a spill temp use, or a node producing a register
+//    ival      -- The immediate operand
+//
+// Notes:
+//    This isn't really specific to HW intrinsics, but depends on other methods that are
+//    only defined for FEATURE_HW_INTRINSICS, and is currently only used in that context.
+//
+void CodeGen::inst_RV_TT_IV(instruction ins, emitAttr attr, regNumber reg1, GenTree* rmOp, int ival)
+{
+    noway_assert(getEmitter()->emitVerifyEncodable(ins, EA_SIZE(attr), reg1));
+
+    if (rmOp->isContained() || rmOp->isUsedFromSpillTemp())
+    {
+        TempDsc* tmpDsc = nullptr;
+        unsigned varNum = BAD_VAR_NUM;
+        unsigned offset = (unsigned)-1;
+
+        if (rmOp->isUsedFromSpillTemp())
+        {
+            assert(rmOp->IsRegOptional());
+
+            tmpDsc = getSpillTempDsc(rmOp);
+            varNum = tmpDsc->tdTempNum();
+            offset = 0;
+
+            regSet.tmpRlsTemp(tmpDsc);
+        }
+        else if (rmOp->OperIsHWIntrinsic())
+        {
+            getEmitter()->emitIns_R_AR_I(ins, attr, reg1, rmOp->gtGetOp1()->gtRegNum, 0, ival);
+            return;
+        }
+        else if (rmOp->isIndir())
+        {
+            GenTreeIndir* memIndir = rmOp->AsIndir();
+            GenTree*      memBase  = memIndir->gtOp1;
+
+            switch (memBase->OperGet())
+            {
+                case GT_LCL_VAR_ADDR:
+                {
+                    varNum = memBase->AsLclVarCommon()->GetLclNum();
+                    offset = 0;
+
+                    // Only varNum and a zero offset are passed on, so the indir must have no index, scale or offset.
+                    assert(!memIndir->HasIndex());
+                    assert(memIndir->Scale() == 1);
+                    assert(memIndir->Offset() == 0);
+
+                    break;
+                }
+
+                case GT_CLS_VAR_ADDR:
+                {
+                    getEmitter()->emitIns_R_C_I(ins, attr, reg1, memBase->gtClsVar.gtClsVarHnd, 0, ival);
+                    return;
+                }
+
+                default:
+                {
+                    getEmitter()->emitIns_R_A_I(ins, attr, reg1, memIndir, ival);
+                    return;
+                }
+            }
+        }
+        else
+        {
+            switch (rmOp->OperGet())
+            {
+                case GT_LCL_FLD:
+                {
+                    GenTreeLclFld* lclField = rmOp->AsLclFld();
+
+                    varNum = lclField->GetLclNum();
+                    offset = lclField->gtLclFld.gtLclOffs;
+                    break;
+                }
+
+                case GT_LCL_VAR:
+                {
+                    assert(rmOp->IsRegOptional() || !compiler->lvaGetDesc(rmOp->gtLclVar.gtLclNum)->lvIsRegCandidate());
+                    varNum = rmOp->AsLclVar()->GetLclNum();
+                    offset = 0;
+                    break;
+                }
+
+                default:
+                    unreached();
+                    break;
+            }
+        }
+
+        // Ensure we got a good varNum and offset.
+        // We also need to check for `tmpDsc != nullptr` since spill temp numbers
+        // are negative and start with -1, which also happens to be BAD_VAR_NUM.
+        assert((varNum != BAD_VAR_NUM) || (tmpDsc != nullptr));
+        assert(offset != (unsigned)-1);
+
+        getEmitter()->emitIns_R_S_I(ins, attr, reg1, varNum, offset, ival);
+    }
+    else
+    {
+        regNumber rmOpReg = rmOp->gtRegNum;
+        getEmitter()->emitIns_SIMD_R_R_I(ins, attr, reg1, rmOpReg, ival);
+    }
+}
+#endif // FEATURE_HW_INTRINSICS
+
+#endif // _TARGET_XARCH_
 
 /*****************************************************************************
  *
index e29bb9c..e0a7c64 100644 (file)
@@ -104,6 +104,7 @@ private:
     void ContainCheckSIMD(GenTreeSIMD* simdNode);
 #endif // FEATURE_SIMD
 #ifdef FEATURE_HW_INTRINSICS
+    void ContainCheckHWIntrinsicAddr(GenTreeHWIntrinsic* node, GenTree** pAddr);
     void ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node);
 #endif // FEATURE_HW_INTRINSICS
 
index 292fb93..373f881 100644 (file)
@@ -2488,6 +2488,10 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* containingNode, Ge
 
     switch (category)
     {
+        case HW_Category_MemoryLoad:
+            supportsGeneralLoads = (!node->OperIsHWIntrinsic());
+            break;
+
         case HW_Category_SimpleSIMD:
         {
             // These intrinsics only expect 16 or 32-byte nodes for containment
@@ -2664,6 +2668,15 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* containingNode, Ge
                     break;
                 }
 
+                case NI_AVX2_BroadcastScalarToVector128:
+                case NI_AVX2_BroadcastScalarToVector256:
+                {
+                    // The memory form of this already takes a pointer, and cannot be further contained.
+                    // The containable form is the one that takes a SIMD value, that may be in memory.
+                    supportsGeneralLoads = (node->TypeGet() == TYP_SIMD16);
+                    break;
+                }
+
                 case NI_SSE_ConvertScalarToVector128Single:
                 case NI_SSE2_ConvertScalarToVector128Double:
                 case NI_SSE2_ConvertScalarToVector128Int32:
@@ -2782,6 +2795,28 @@ bool Lowering::IsContainableHWIntrinsicOp(GenTreeHWIntrinsic* containingNode, Ge
 }
 
 //----------------------------------------------------------------------------------------------
+// ContainCheckHWIntrinsicAddr: Perform containment analysis for an address operand of a hardware
+//                              intrinsic node: try to form an address mode, then contain it if safe.
+//
+//  Arguments:
+//     node  - The hardware intrinsic node that uses the address
+//     pAddr - The "parent" pointer to the address operand (TYP_I_IMPL or TYP_BYREF), so that we can
+//             update the operand of the parent if an address mode is created.
+//
+void Lowering::ContainCheckHWIntrinsicAddr(GenTreeHWIntrinsic* node, GenTree** pAddr)
+{
+    assert(((*pAddr)->TypeGet() == TYP_I_IMPL) || ((*pAddr)->TypeGet() == TYP_BYREF));
+    TryCreateAddrMode(LIR::Use(BlockRange(), pAddr, node), true);
+    GenTree* addr = *pAddr;
+    if ((addr->OperIs(GT_CLS_VAR_ADDR, GT_LCL_VAR_ADDR) ||
+         (addr->IsCnsIntOrI() && addr->AsIntConCommon()->FitsInAddrBase(comp)) || (addr->OperGet() == GT_LEA)) &&
+        IsSafeToContainMem(node, addr))
+    {
+        MakeSrcContained(node, addr);
+    }
+}
+
+//----------------------------------------------------------------------------------------------
 // ContainCheckHWIntrinsic: Perform containment analysis for a hardware intrinsic node.
 //
 //  Arguments:
@@ -2800,7 +2835,7 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
 
     if (!HWIntrinsicInfo::SupportsContainment(intrinsicId))
     {
-        // AVX2 gather are not contaibable and always have constant IMM argument
+        // AVX2 gather are not containable and always have constant IMM argument
         if (HWIntrinsicInfo::isAVX2GatherIntrinsic(intrinsicId))
         {
             GenTree* lastOp = HWIntrinsicInfo::lookupLastOp(node);
@@ -2825,6 +2860,12 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
 
         switch (category)
         {
+            case HW_Category_MemoryLoad:
+            {
+                GenTree** pAddr = &node->gtOp1;
+                ContainCheckHWIntrinsicAddr(node, pAddr);
+                break;
+            }
             case HW_Category_SimpleSIMD:
             case HW_Category_SIMDScalar:
             case HW_Category_Scalar:
@@ -2905,6 +2946,26 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
 
             switch (category)
             {
+                case HW_Category_MemoryLoad:
+                {
+                    GenTree** pAddr = nullptr;
+                    if ((intrinsicId == NI_AVX_MaskLoad) || (intrinsicId == NI_AVX2_MaskLoad))
+                    {
+                        pAddr = &node->gtOp.gtOp1;
+                    }
+                    else
+                    {
+                        pAddr = &node->gtOp.gtOp2;
+                    }
+                    ContainCheckHWIntrinsicAddr(node, pAddr);
+                    break;
+                }
+                case HW_Category_MemoryStore:
+                {
+                    GenTree** pAddr = &node->gtOp1;
+                    ContainCheckHWIntrinsicAddr(node, pAddr);
+                    break;
+                }
                 case HW_Category_SimpleSIMD:
                 case HW_Category_SIMDScalar:
                 case HW_Category_Scalar:
@@ -3113,6 +3174,12 @@ void Lowering::ContainCheckHWIntrinsic(GenTreeHWIntrinsic* node)
 
             switch (category)
             {
+                case HW_Category_MemoryStore:
+                {
+                    GenTree** pAddr = &node->gtOp.gtOp1->gtOp.gtOp1;
+                    ContainCheckHWIntrinsicAddr(node, pAddr);
+                    break;
+                }
                 case HW_Category_SimpleSIMD:
                 case HW_Category_SIMDScalar:
                 case HW_Category_Scalar:
index c0ce4fb..8494699 100644 (file)
@@ -1535,6 +1535,7 @@ private:
     int BuildOperandUses(GenTree* node, regMaskTP candidates = RBM_NONE);
     int BuildDelayFreeUses(GenTree* node, regMaskTP candidates = RBM_NONE);
     int BuildIndirUses(GenTreeIndir* indirTree, regMaskTP candidates = RBM_NONE);
+    int BuildAddrUses(GenTree* addr, regMaskTP candidates = RBM_NONE);
     void HandleFloatVarArgs(GenTreeCall* call, GenTree* argNode, bool* callHasFloatRegArgs);
     RefPosition* BuildDef(GenTree* tree, regMaskTP dstCandidates = RBM_NONE, int multiRegIdx = 0);
     void BuildDefs(GenTree* tree, int dstCount, regMaskTP dstCandidates = RBM_NONE);
index 38dfca6..aada9b2 100644 (file)
@@ -2672,6 +2672,11 @@ RefPosition* LinearScan::BuildUse(GenTree* operand, regMaskTP candidates, int mu
 int LinearScan::BuildIndirUses(GenTreeIndir* indirTree, regMaskTP candidates)
 {
     GenTree* const addr = indirTree->gtOp1;
+    return BuildAddrUses(addr, candidates);
+}
+
+int LinearScan::BuildAddrUses(GenTree* addr, regMaskTP candidates)
+{
     if (!addr->isContained())
     {
         BuildUse(addr, candidates);
@@ -2725,11 +2730,17 @@ int LinearScan::BuildOperandUses(GenTree* node, regMaskTP candidates)
     {
         return BuildIndirUses(node->AsIndir(), candidates);
     }
+#ifdef FEATURE_HW_INTRINSICS
     if (node->OperIsHWIntrinsic())
     {
+        if (node->AsHWIntrinsic()->OperIsMemoryLoad())
+        {
+            return BuildAddrUses(node->gtGetOp1());
+        }
         BuildUse(node->gtGetOp1(), candidates);
         return 1;
     }
+#endif // FEATURE_HW_INTRINSICS
 
     return 0;
 }
index 364a4b7..c430db0 100644 (file)
@@ -2699,11 +2699,29 @@ int LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree)
         {
             assert((numArgs > 0) && (numArgs < 4));
 
-            srcCount += BuildOperandUses(op1);
+            if (intrinsicTree->OperIsMemoryLoadOrStore())
+            {
+                srcCount += BuildAddrUses(op1);
+            }
+            else
+            {
+                srcCount += BuildOperandUses(op1);
+            }
 
             if (op2 != nullptr)
             {
-                srcCount += (isRMW) ? BuildDelayFreeUses(op2) : BuildOperandUses(op2);
+                if (op2->OperIs(GT_HWIntrinsic) && op2->AsHWIntrinsic()->OperIsMemoryLoad() && op2->isContained())
+                {
+                    srcCount += BuildAddrUses(op2->gtGetOp1());
+                }
+                else if (isRMW)
+                {
+                    srcCount += BuildDelayFreeUses(op2);
+                }
+                else
+                {
+                    srcCount += BuildOperandUses(op2);
+                }
 
                 if (op3 != nullptr)
                 {
diff --git a/tests/src/JIT/Regression/JitBlue/GitHub_19550/GitHub_19550.cs b/tests/src/JIT/Regression/JitBlue/GitHub_19550/GitHub_19550.cs
new file mode 100644 (file)
index 0000000..0e8f03c
--- /dev/null
@@ -0,0 +1,143 @@
+// Licensed to the .NET Foundation under one or more agreements.
+// The .NET Foundation licenses this file to you under the MIT license.
+// See the LICENSE file in the project root for more information.
+
+using System;
+using System.Runtime.CompilerServices;
+using System.Runtime.Intrinsics;
+using System.Runtime.Intrinsics.X86;
+using System.Threading;
+
+// Test folding of addressing expressions
+
+public class Program
+{
+    struct S
+    {
+        public float f0;
+        public float f1;
+        public float f2;
+        public float f3;
+        public float f4;
+        public float f5;
+        public float f6;
+        public float f7;
+        public float f8;
+        public float f9;
+        public float f10;
+        public float f11;
+        public float f12;
+        public float f13;
+        public float f14;
+        public float f15;
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static unsafe int Test(ref S s, Vector128<float> v, int offset)
+    {
+        int returnVal = 100;
+
+        if (Sse2.IsSupported)
+        {
+            fixed (float* p = &s.f0)
+            {
+                // We need an address aligned on 16 bytes, so we need to add a *float* offset to get there.
+                int alignmentOffset = (0x10 - ((int)p & 0xc)) >> 2;
+                try
+                {
+                    // This is the aligned case.
+                    // We're going to store a scalar at an offset of 2 from the aligned location.
+                    // As it happens, we know that the struct has been initialized to all zeros,
+                    // and the vector passed in was all ones, so now we have a one at offset 2.
+                    Sse2.StoreScalar(p + alignmentOffset + 2, Sse2.Subtract(v, Sse2.LoadAlignedVector128(p + offset + alignmentOffset + 4)));
+
+                    // Now do a load from the aligned location.
+                    // That should give us {0, 0, 1, 0}.
+                    Vector128<float> v2;
+                    if (Sse41.IsSupported)
+                    {
+                        v2 = Sse41.LoadAlignedVector128NonTemporal((byte*)(p + alignmentOffset)).AsSingle();
+                    }
+                    else
+                    {
+                        v2 = Sse2.LoadVector128((byte*)(p + alignmentOffset)).AsSingle();
+                    }
+                    if (!v2.Equals(Vector128.Create(0.0F, 0.0F, 1.0F, 0.0F)))
+                    {
+                        Console.WriteLine("Aligned case FAILED: v2 = " + v2);
+                        returnVal = -1;
+                    }
+
+                    // This is the unaligned case. The value we're loading to subtract is one element earlier than what we just stored.
+                    // So we're doing { 1, 1, 1, 1 } - { 0, 1, 0, 0 } = { 1, 0, 1, 1 }
+                    Sse2.Store(p + alignmentOffset + 1, Sse2.Subtract(v, Sse2.LoadVector128(p + offset + alignmentOffset + 1)));
+                    // Now do an unaligned load from that location.
+                    v2 = Sse2.LoadVector128(p + alignmentOffset + 1);
+                    if (!v2.Equals(Vector128.Create(1.0F, 0.0F, 1.0F, 1.0F)))
+                    {
+                        Console.WriteLine("Unaligned case FAILED: v2 = " + v2);
+                        returnVal = -1;
+                    }
+
+                }
+                catch (Exception e)
+                {
+                    Console.WriteLine("Unexpected exception: " + e.Message);
+                    returnVal = -1;
+                }
+            }
+        }
+        return returnVal;
+    }
+
+    [MethodImpl(MethodImplOptions.NoInlining)]
+    static unsafe int Test256(ref S s, Vector256<float> v, int offset)
+    {
+        int returnVal = 100;
+        if (Avx.IsSupported)
+        {
+            // Round offset down to a multiple of 4 floats (the low two bits are cleared).
+            offset &= ~3;
+            fixed (float* p = &s.f0)
+            {
+                try
+                {
+                    Avx.Store(p + 1, Avx.Subtract(v, Avx.LoadVector256(p + offset + 1)));
+                    Vector256<float> v2 = Avx.LoadVector256(p + 1);
+                    if (!v2.Equals(v))
+                    {
+                        Console.WriteLine("Vector256 case FAILED: v = " + v + ", v2 = " + v2);
+                        returnVal = -1;
+                    }
+                }
+                catch (Exception e)
+                {
+                    Console.WriteLine("Unexpected exception: " + e.Message);
+                    returnVal = -1;
+                }
+            }
+        }
+        return returnVal;
+    }
+
+    static int Main()
+    {
+        S s = new S();
+        Vector128<float> v = Vector128.Create(1.0F);
+        int returnVal = Test(ref s, v, 0);
+        if (returnVal != 100)
+        {
+            Console.WriteLine("Vector128 test failed.");
+        }
+
+        // Get a new struct initialized to zeros.
+        S s2 = new S();
+        Vector256<float> v2 = Vector256.Create(1.0F);
+        if (Test256(ref s2, v2, 4) != 100)
+        {
+            Console.WriteLine("Vector256 test failed.");
+            returnVal = -1;
+        }
+        return returnVal;
+    }
+}
diff --git a/tests/src/JIT/Regression/JitBlue/GitHub_19550/GitHub_19550.csproj b/tests/src/JIT/Regression/JitBlue/GitHub_19550/GitHub_19550.csproj
new file mode 100644 (file)
index 0000000..1ef9989
--- /dev/null
@@ -0,0 +1,16 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project ToolsVersion="12.0" DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+  <Import Project="$([MSBuild]::GetDirectoryNameOfFileAbove($(MSBuildThisFileDirectory), dir.props))\dir.props" />
+  <PropertyGroup>
+    <AssemblyName>$(MSBuildProjectName)</AssemblyName>
+    <OutputType>Exe</OutputType>
+    <DebugType>None</DebugType>
+    <Optimize>True</Optimize>
+    <AllowUnsafeBlocks>True</AllowUnsafeBlocks>
+  </PropertyGroup>
+  <ItemGroup>
+    <Compile Include="$(MSBuildProjectName).cs" />
+  </ItemGroup>
+  <Import Project="$([MSBuild]::GetDirectoryNameOfFileAbove($(MSBuildThisFileDirectory), dir.targets))\dir.targets" />
+  <PropertyGroup Condition=" '$(MsBuildProjectDirOverride)' != '' "></PropertyGroup>
+</Project>