Implement scalar Sse2 hardware intrinsics
author Jacek Blaszczynski <biosciencenow@outlook.com>
Sat, 27 Jan 2018 21:18:05 +0000 (22:18 +0100)
committer Tanner Gooding <tagoo@outlook.com>
Tue, 13 Feb 2018 23:08:12 +0000 (15:08 -0800)
src/jit/emitxarch.cpp
src/jit/hwintrinsiccodegenxarch.cpp
src/jit/hwintrinsiclistxarch.h
src/jit/hwintrinsicxarch.cpp
src/jit/lsraxarch.cpp
tests/src/JIT/HardwareIntrinsics/X86/Sse2/TestTableSse2.cs

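diff --git a/src/jit/emitxarch.cpp b/src/jit/emitxarch.cpp
index 3555a0c..f20573c 100644
--- a/src/jit/emitxarch.cpp
+++ b/src/jit/emitxarch.cpp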
@@ -225,6 +225,7 @@ bool emitter::IsDstSrcSrcAVXInstruction(instruction ins)
         case INS_movhps:
         case INS_movlpd:
         case INS_movlps:
+        case INS_movsdsse2:
         case INS_movss:
         case INS_rcpss:
         case INS_roundsd:
@@ -284,7 +285,7 @@ bool emitter::Is4ByteSSE4OrAVXInstruction(instruction ins)
 bool emitter::TakesVexPrefix(instruction ins)
 {
     // special case vzeroupper as it requires 2-byte VEX prefix
-    // special case (l|m|s)fence and the prefetch instructions as they never take a VEX prefix
+    // special case the fencing and the prefetch instructions as they never take a VEX prefix
     switch (ins)
     {
         case INS_lfence:
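diff --git a/src/jit/hwintrinsiccodegenxarch.cpp b/src/jit/hwintrinsiccodegenxarch.cpp
index 597a72b..4264059 100644
--- a/src/jit/hwintrinsiccodegenxarch.cpp
+++ b/src/jit/hwintrinsiccodegenxarch.cpp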
@@ -744,7 +744,6 @@ void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node)
     regNumber      targetReg   = node->gtRegNum;
     var_types      targetType  = node->TypeGet();
     var_types      baseType    = node->gtSIMDBaseType;
-    instruction    ins         = INS_invalid;
     regNumber      op1Reg      = REG_NA;
     regNumber      op2Reg      = REG_NA;
     emitter*       emit        = getEmitter();
@@ -764,15 +763,161 @@ void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node)
             assert(op1 != nullptr);
             assert(op2 != nullptr);
             assert(baseType == TYP_DOUBLE);
-
-            ins    = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
-            op2Reg = op2->gtRegNum;
-            ival   = Compiler::ivalOfHWIntrinsic(intrinsicID);
+            instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
+            op2Reg          = op2->gtRegNum;
+            ival            = Compiler::ivalOfHWIntrinsic(intrinsicID);
+            assert(ival != -1);
             emit->emitIns_SIMD_R_R_R_I(ins, emitTypeSize(TYP_SIMD16), targetReg, op1Reg, op2Reg, ival);
 
             break;
         }
 
+        case NI_SSE2_CompareEqualOrderedScalar:
+        case NI_SSE2_CompareEqualUnorderedScalar:
+        {
+            assert(baseType == TYP_DOUBLE);
+            op2Reg             = op2->gtRegNum;
+            regNumber   tmpReg = node->GetSingleTempReg();
+            instruction ins    = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
+
+            emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
+            emit->emitIns_R(INS_setpo, EA_1BYTE, targetReg);
+            emit->emitIns_R(INS_sete, EA_1BYTE, tmpReg);
+            emit->emitIns_R_R(INS_and, EA_1BYTE, tmpReg, targetReg);
+            emit->emitIns_R(INS_setne, EA_1BYTE, targetReg);
+            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
+            break;
+        }
+
+        case NI_SSE2_CompareGreaterThanOrderedScalar:
+        case NI_SSE2_CompareGreaterThanUnorderedScalar:
+        {
+            assert(baseType == TYP_DOUBLE);
+            op2Reg          = op2->gtRegNum;
+            instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
+
+            emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
+            emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
+            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
+            break;
+        }
+
+        case NI_SSE2_CompareGreaterThanOrEqualOrderedScalar:
+        case NI_SSE2_CompareGreaterThanOrEqualUnorderedScalar:
+        {
+            assert(baseType == TYP_DOUBLE);
+            op2Reg          = op2->gtRegNum;
+            instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
+
+            emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
+            emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
+            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
+            break;
+        }
+
+        case NI_SSE2_CompareLessThanOrderedScalar:
+        case NI_SSE2_CompareLessThanUnorderedScalar:
+        {
+            assert(baseType == TYP_DOUBLE);
+            op2Reg          = op2->gtRegNum;
+            instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
+
+            emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op2Reg, op1Reg);
+            emit->emitIns_R(INS_seta, EA_1BYTE, targetReg);
+            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
+            break;
+        }
+
+        case NI_SSE2_CompareLessThanOrEqualOrderedScalar:
+        case NI_SSE2_CompareLessThanOrEqualUnorderedScalar:
+        {
+            assert(baseType == TYP_DOUBLE);
+            op2Reg          = op2->gtRegNum;
+            instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
+
+            emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op2Reg, op1Reg);
+            emit->emitIns_R(INS_setae, EA_1BYTE, targetReg);
+            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
+            break;
+        }
+
+        case NI_SSE2_CompareNotEqualOrderedScalar:
+        case NI_SSE2_CompareNotEqualUnorderedScalar:
+        {
+            assert(baseType == TYP_DOUBLE);
+            op2Reg             = op2->gtRegNum;
+            instruction ins    = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
+            regNumber   tmpReg = node->GetSingleTempReg();
+
+            emit->emitIns_R_R(ins, emitTypeSize(TYP_SIMD16), op1Reg, op2Reg);
+            emit->emitIns_R(INS_setpe, EA_1BYTE, targetReg);
+            emit->emitIns_R(INS_setne, EA_1BYTE, tmpReg);
+            emit->emitIns_R_R(INS_or, EA_1BYTE, tmpReg, targetReg);
+            emit->emitIns_R(INS_setne, EA_1BYTE, targetReg);
+            emit->emitIns_R_R(INS_movzx, EA_1BYTE, targetReg, targetReg);
+            break;
+        }
+
+        case NI_SSE2_ConvertScalarToVector128Double:
+        case NI_SSE2_ConvertScalarToVector128Single:
+        {
+            assert(baseType == TYP_INT || baseType == TYP_LONG || baseType == TYP_FLOAT || baseType == TYP_DOUBLE);
+            assert(op1 != nullptr);
+            assert(op2 != nullptr);
+            instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
+            genHWIntrinsic_R_R_RM(node, ins);
+            break;
+        }
+
+        case NI_SSE2_ConvertScalarToVector128Int64:
+        case NI_SSE2_ConvertScalarToVector128UInt64:
+        {
+            assert(baseType == TYP_LONG || baseType == TYP_ULONG);
+            assert(op1 != nullptr);
+            assert(op2 == nullptr);
+            instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
+            // TODO-XArch-CQ -> use of type size of TYP_SIMD16 leads to
+            // instruction register encoding errors for SSE legacy encoding
+            emit->emitIns_R_R(ins, emitTypeSize(baseType), targetReg, op1Reg);
+            break;
+        }
+
+        case NI_SSE2_ConvertToDouble:
+        {
+            assert(op2 == nullptr);
+            if (op1Reg != targetReg)
+            {
+                instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
+                emit->emitIns_R_R(ins, emitTypeSize(targetType), targetReg, op1Reg);
+            }
+            break;
+        }
+
+        case NI_SSE2_ConvertToInt32:
+        case NI_SSE2_ConvertToInt64:
+        case NI_SSE2_ConvertToUInt32:
+        case NI_SSE2_ConvertToUInt64:
+        {
+            assert(op2 == nullptr);
+            assert(baseType == TYP_DOUBLE || baseType == TYP_FLOAT || baseType == TYP_INT || baseType == TYP_UINT ||
+                   baseType == TYP_LONG || baseType == TYP_ULONG);
+            if (op1Reg != targetReg)
+            {
+                instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
+                if (baseType == TYP_DOUBLE || baseType == TYP_FLOAT)
+                {
+                    emit->emitIns_R_R(ins, emitTypeSize(targetType), targetReg, op1Reg);
+                }
+                else
+                {
+                    // TODO-XArch-Bug https://github.com/dotnet/coreclr/issues/16329
+                    // using hardcoded instruction as workaround for inexact type conversions
+                    emit->emitIns_R_R(INS_mov_xmm2i, emitActualTypeSize(baseType), op1Reg, targetReg);
+                }
+            }
+            break;
+        }
+
         case NI_SSE2_LoadFence:
         {
             assert(baseType == TYP_VOID);
@@ -781,6 +926,7 @@ void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node)
             emit->emitIns(INS_lfence);
             break;
         }
+
         case NI_SSE2_MemoryFence:
         {
             assert(baseType == TYP_VOID);
@@ -795,7 +941,7 @@ void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node)
             assert(op2 == nullptr);
             assert(baseType == TYP_BYTE || baseType == TYP_UBYTE || baseType == TYP_DOUBLE);
 
-            ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
+            instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
             emit->emitIns_R_R(ins, emitTypeSize(TYP_INT), targetReg, op1Reg);
             break;
         }
@@ -807,7 +953,7 @@ void CodeGen::genSSE2Intrinsic(GenTreeHWIntrinsic* node)
             assert(op1 == nullptr);
             assert(op2 == nullptr);
 
-            ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
+            instruction ins = Compiler::insOfHWIntrinsic(intrinsicID, baseType);
             emit->emitIns_SIMD_R_R_R(ins, emitTypeSize(TYP_SIMD16), targetReg, targetReg, targetReg);
             break;
         }
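diff --git a/src/jit/hwintrinsiclistxarch.h b/src/jit/hwintrinsiclistxarch.h
index 1b68ea8..54b1ac0 100644
--- a/src/jit/hwintrinsiclistxarch.h
+++ b/src/jit/hwintrinsiclistxarch.h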
@@ -129,44 +129,88 @@ HARDWARE_INTRINSIC(SSE_Xor,                                          "Xor",
 HARDWARE_INTRINSIC(SSE2_IsSupported,                                 "get_IsSupported",                                  SSE2,       -1,            0,          0,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_IsSupportedProperty,               HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSE2_Add,                                         "Add",                                              SSE2,       -1,           16,          2,            {INS_paddb,     INS_paddb,     INS_paddw,     INS_paddw,     INS_paddd,     INS_paddd,     INS_paddq,     INS_paddq,     INS_invalid,   INS_addpd},             HW_Category_SimpleSIMD,                        HW_Flag_Commutative)
 HARDWARE_INTRINSIC(SSE2_AddSaturate,                                 "AddSaturate",                                      SSE2,       -1,           16,          2,            {INS_paddsb,    INS_paddusb,   INS_paddsw,    INS_paddusw,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_Commutative)
+HARDWARE_INTRINSIC(SSE2_AddScalar,                                   "AddScalar",                                        SSE2,       -1,           16,          2,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_addsd},             HW_Category_SIMDScalar,                        HW_Flag_CopyUpperBits)
 HARDWARE_INTRINSIC(SSE2_And,                                         "And",                                              SSE2,       -1,           16,          2,            {INS_pand,      INS_pand,      INS_pand,      INS_pand,      INS_pand,      INS_pand,      INS_pand,      INS_pand,      INS_invalid,   INS_andpd},             HW_Category_SimpleSIMD,                        HW_Flag_Commutative)
 HARDWARE_INTRINSIC(SSE2_AndNot,                                      "AndNot",                                           SSE2,       -1,           16,          2,            {INS_pandn,     INS_pandn,     INS_pandn,     INS_pandn,     INS_pandn,     INS_pandn,     INS_pandn,     INS_pandn,     INS_invalid,   INS_andnpd},            HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSE2_Average,                                     "Average",                                          SSE2,       -1,           16,          2,            {INS_invalid,   INS_pavgb,     INS_invalid,   INS_pavgw,     INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_Commutative)
 HARDWARE_INTRINSIC(SSE2_CompareEqual,                                "CompareEqual",                                     SSE2,        0,           16,          2,            {INS_pcmpeqb,   INS_pcmpeqb,   INS_pcmpeqw,   INS_pcmpeqw,   INS_pcmpeqd,   INS_pcmpeqd,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmppd},             HW_Category_SimpleSIMD,                        HW_Flag_Commutative)
+HARDWARE_INTRINSIC(SSE2_CompareEqualOrderedScalar,                   "CompareEqualOrderedScalar",                        SSE2,       -1,           16,          2,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_comisd},            HW_Category_SIMDScalar,                        HW_Flag_MultiIns|HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(SSE2_CompareEqualScalar,                          "CompareEqualScalar",                               SSE2,        0,           16,          2,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpsd},             HW_Category_SIMDScalar,                        HW_Flag_CopyUpperBits)
+HARDWARE_INTRINSIC(SSE2_CompareEqualUnorderedScalar,                 "CompareEqualUnorderedScalar",                      SSE2,       -1,           16,          2,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_ucomisd},           HW_Category_SIMDScalar,                        HW_Flag_MultiIns|HW_Flag_NoContainment)
 HARDWARE_INTRINSIC(SSE2_CompareGreaterThan,                          "CompareGreaterThan",                               SSE2,        6,           16,          2,            {INS_pcmpgtb,   INS_invalid,   INS_pcmpgtw,   INS_invalid,   INS_pcmpgtd,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmppd},             HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE2_CompareGreaterThanOrderedScalar,             "CompareGreaterThanOrderedScalar",                  SSE2,       -1,           16,          2,            {INS_pcmpgtb,   INS_invalid,   INS_pcmpgtw,   INS_invalid,   INS_pcmpgtd,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_comisd},            HW_Category_SIMDScalar,                        HW_Flag_MultiIns|HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(SSE2_CompareGreaterThanScalar,                    "CompareGreaterThanScalar",                         SSE2,        6,           16,          2,            {INS_pcmpgtb,   INS_invalid,   INS_pcmpgtw,   INS_invalid,   INS_pcmpgtd,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpsd},             HW_Category_SIMDScalar,                        HW_Flag_CopyUpperBits)
+HARDWARE_INTRINSIC(SSE2_CompareGreaterThanUnorderedScalar,           "CompareGreaterThanUnorderedScalar",                SSE2,       -1,           16,          2,            {INS_pcmpgtb,   INS_invalid,   INS_pcmpgtw,   INS_invalid,   INS_pcmpgtd,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_ucomisd},           HW_Category_SIMDScalar,                        HW_Flag_MultiIns|HW_Flag_NoContainment)
 HARDWARE_INTRINSIC(SSE2_CompareGreaterThanOrEqual,                   "CompareGreaterThanOrEqual",                        SSE2,        5,           16,          2,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmppd},             HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE2_CompareGreaterThanOrEqualOrderedScalar,      "CompareGreaterThanOrEqualOrderedScalar",           SSE2,       -1,           16,          2,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_comisd},            HW_Category_SIMDScalar,                        HW_Flag_MultiIns|HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(SSE2_CompareGreaterThanOrEqualScalar,             "CompareGreaterThanOrEqualScalar",                  SSE2,        5,           16,          2,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpsd},             HW_Category_SIMDScalar,                        HW_Flag_CopyUpperBits)
+HARDWARE_INTRINSIC(SSE2_CompareGreaterThanOrEqualUnorderedScalar,    "CompareGreaterThanOrEqualUnorderedScalar",         SSE2,       -1,           16,          2,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_ucomisd},           HW_Category_SIMDScalar,                        HW_Flag_MultiIns|HW_Flag_NoContainment)
 HARDWARE_INTRINSIC(SSE2_CompareLessThan,                             "CompareLessThan",                                  SSE2,        1,           16,          2,            {INS_pcmpgtb,   INS_invalid,   INS_pcmpgtw,   INS_invalid,   INS_pcmpgtd,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmppd},             HW_Category_Special,                           HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE2_CompareLessThanOrderedScalar,                "CompareLessThanOrderedScalar",                     SSE2,       -1,           16,          2,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_comisd},            HW_Category_SIMDScalar,                        HW_Flag_MultiIns|HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(SSE2_CompareLessThanScalar,                       "CompareLessThanScalar",                            SSE2,        1,           16,          2,            {INS_pcmpgtb,   INS_invalid,   INS_pcmpgtw,   INS_invalid,   INS_pcmpgtd,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpsd},             HW_Category_SIMDScalar,                        HW_Flag_BaseTypeFromArg|HW_Flag_CopyUpperBits)
+HARDWARE_INTRINSIC(SSE2_CompareLessThanUnorderedScalar,              "CompareLessThanUnorderedScalar",                   SSE2,       -1,           16,          2,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_ucomisd},           HW_Category_SIMDScalar,                        HW_Flag_MultiIns|HW_Flag_NoContainment)
 HARDWARE_INTRINSIC(SSE2_CompareLessThanOrEqual,                      "CompareLessThanOrEqual",                           SSE2,        2,           16,          2,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmppd},             HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE2_CompareLessThanOrEqualOrderedScalar,         "CompareLessThanOrEqualOrderedScalar",              SSE2,       -1,           16,          2,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_comisd},            HW_Category_SIMDScalar,                        HW_Flag_MultiIns|HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(SSE2_CompareLessThanOrEqualScalar,                "CompareLessThanOrEqualScalar",                     SSE2,        2,           16,          2,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpsd},             HW_Category_SIMDScalar,                        HW_Flag_CopyUpperBits)
+HARDWARE_INTRINSIC(SSE2_CompareLessThanOrEqualUnorderedScalar,       "CompareLessThanOrEqualUnorderedScalar",            SSE2,       -1,           16,          2,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_ucomisd},           HW_Category_SIMDScalar,                        HW_Flag_MultiIns|HW_Flag_NoContainment)
 HARDWARE_INTRINSIC(SSE2_CompareNotEqual,                             "CompareNotEqual",                                  SSE2,        4,           16,          2,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmppd},             HW_Category_SimpleSIMD,                        HW_Flag_Commutative)
+HARDWARE_INTRINSIC(SSE2_CompareNotEqualOrderedScalar,                "CompareNotEqualOrderedScalar",                     SSE2,       -1,           16,          2,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_comisd},            HW_Category_SIMDScalar,                        HW_Flag_MultiIns|HW_Flag_NoContainment)
+HARDWARE_INTRINSIC(SSE2_CompareNotEqualScalar,                       "CompareNotEqualScalar",                            SSE2,        4,           16,          2,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpsd},             HW_Category_SIMDScalar,                        HW_Flag_CopyUpperBits)
+HARDWARE_INTRINSIC(SSE2_CompareNotEqualUnorderedScalar,              "CompareNotEqualUnorderedScalar",                   SSE2,       -1,           16,          2,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_ucomisd},           HW_Category_SIMDScalar,                        HW_Flag_MultiIns|HW_Flag_NoContainment)
 HARDWARE_INTRINSIC(SSE2_CompareNotGreaterThan,                       "CompareNotGreaterThan",                            SSE2,        2,           16,          2,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmppd},             HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE2_CompareNotGreaterThanScalar,                 "CompareNotGreaterThanScalar",                      SSE2,        2,           16,          2,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpsd},             HW_Category_SIMDScalar,                        HW_Flag_CopyUpperBits)
 HARDWARE_INTRINSIC(SSE2_CompareNotGreaterThanOrEqual,                "CompareNotGreaterThanOrEqual",                     SSE2,        1,           16,          2,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmppd},             HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE2_CompareNotGreaterThanOrEqualScalar,          "CompareNotGreaterThanOrEqualScalar",               SSE2,        1,           16,          2,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpsd},             HW_Category_SIMDScalar,                        HW_Flag_CopyUpperBits)
 HARDWARE_INTRINSIC(SSE2_CompareNotLessThan,                          "CompareNotLessThan",                               SSE2,        5,           16,          2,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmppd},             HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE2_CompareNotLessThanScalar,                    "CompareNotLessThanScalar",                         SSE2,        5,           16,          2,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpsd},             HW_Category_SIMDScalar,                        HW_Flag_CopyUpperBits)
 HARDWARE_INTRINSIC(SSE2_CompareNotLessThanOrEqual,                   "CompareNotLessThanOrEqual",                        SSE2,        6,           16,          2,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmppd},             HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE2_CompareNotLessThanOrEqualScalar,             "CompareNotLessThanOrEqualScalar",                  SSE2,        6,           16,          2,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpsd},             HW_Category_SIMDScalar,                        HW_Flag_CopyUpperBits)
 HARDWARE_INTRINSIC(SSE2_CompareOrdered,                              "CompareOrdered",                                   SSE2,        7,           16,          2,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmppd},             HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE2_CompareOrderedScalar,                        "CompareOrderedScalar",                             SSE2,        7,           16,          2,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpsd},             HW_Category_SIMDScalar,                        HW_Flag_CopyUpperBits)
 HARDWARE_INTRINSIC(SSE2_CompareUnordered,                            "CompareUnordered",                                 SSE2,        3,           16,          2,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmppd},             HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE2_CompareUnorderedScalar,                      "CompareUnorderedScalar",                           SSE2,        3,           16,          2,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cmpsd},             HW_Category_SIMDScalar,                        HW_Flag_CopyUpperBits)
+HARDWARE_INTRINSIC(SSE2_ConvertToDouble,                             "ConvertToDouble",                                  SSE2,       -1,           16,          1,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movsd},             HW_Category_Helper,                            HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE2_ConvertToInt32,                              "ConvertToInt32",                                   SSE2,       -1,           16,          1,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_mov_xmm2i, INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cvtsd2si},          HW_Category_Special,                           HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE2_ConvertToInt32WithTruncation,                "ConvertToInt32WithTruncation",                     SSE2,       -1,           16,          1,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cvttsd2si},         HW_Category_SIMDScalar,                        HW_Flag_BaseTypeFromArg)
+HARDWARE_INTRINSIC(SSE2_ConvertToInt64,                              "ConvertToInt64",                                   SSE2,       -1,           16,          1,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_mov_xmm2i, INS_invalid,   INS_invalid,   INS_cvtsd2si},          HW_Category_Special,                           HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE2_ConvertToInt64WithTruncation,                "ConvertToInt64WithTruncation",                     SSE2,       -1,           16,          1,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cvttsd2si},         HW_Category_SIMDScalar,                        HW_Flag_BaseTypeFromArg)
+HARDWARE_INTRINSIC(SSE2_ConvertToUInt32,                             "ConvertToUInt32",                                  SSE2,       -1,           16,          1,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_mov_xmm2i, INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_Special,                           HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE2_ConvertToUInt64,                             "ConvertToUInt64",                                  SSE2,       -1,           16,          1,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_mov_xmm2i, INS_invalid,   INS_invalid},           HW_Category_Special,                           HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSE2_ConvertToVector128Double,                    "ConvertToVector128Double",                         SSE2,       -1,           16,          1,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cvtdq2pd,  INS_invalid,   INS_invalid,   INS_invalid,   INS_cvtps2pd,  INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_BaseTypeFromArg)
+HARDWARE_INTRINSIC(SSE2_ConvertScalarToVector128Double,              "ConvertScalarToVector128Double",                   SSE2,       -1,           16,          2,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cvtsi2sd,  INS_invalid,   INS_cvtsi2sd,  INS_invalid,   INS_cvtss2sd,  INS_invalid},           HW_Category_Special,                           HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSE2_ConvertToVector128Int32,                     "ConvertToVector128Int32",                          SSE2,       -1,           16,          1,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cvtps2dq,  INS_cvtpd2dq},          HW_Category_SimpleSIMD,                        HW_Flag_BaseTypeFromArg)
+HARDWARE_INTRINSIC(SSE2_ConvertScalarToVector128Int32,               "ConvertScalarToVector128Int32",                    SSE2,       -1,           16,          1,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_mov_i2xmm, INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_SIMDScalar,                        HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSE2_ConvertToVector128Int32WithTruncation,       "ConvertToVector128Int32WithTruncation",            SSE2,       -1,           16,          1,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cvttps2dq, INS_cvttpd2dq},         HW_Category_SimpleSIMD,                        HW_Flag_BaseTypeFromArg)
+HARDWARE_INTRINSIC(SSE2_ConvertScalarToVector128Int64,               "ConvertScalarToVector128Int64",                    SSE2,       -1,           16,          1,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_mov_i2xmm, INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_Special,                           HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSE2_ConvertToVector128Single,                    "ConvertToVector128Single",                         SSE2,       -1,           16,          1,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cvtdq2ps,  INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cvtpd2ps},          HW_Category_SimpleSIMD,                        HW_Flag_BaseTypeFromArg)
+HARDWARE_INTRINSIC(SSE2_ConvertScalarToVector128Single,              "ConvertScalarToVector128Single",                   SSE2,       -1,           16,          1,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_cvtsd2ss},          HW_Category_Special,                           HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE2_ConvertScalarToVector128UInt32,              "ConvertScalarToVector128UInt32",                   SSE2,       -1,           16,          1,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_mov_i2xmm, INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_SIMDScalar,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE2_ConvertScalarToVector128UInt64,              "ConvertScalarToVector128UInt64",                   SSE2,       -1,           16,          1,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_mov_i2xmm, INS_invalid,   INS_invalid},           HW_Category_Special,                           HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSE2_Divide,                                      "Divide",                                           SSE2,       -1,           16,          2,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_divpd},             HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE2_DivideScalar,                                "DivideScalar",                                     SSE2,       -1,           16,          2,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_divsd},             HW_Category_SIMDScalar,                        HW_Flag_CopyUpperBits)
 HARDWARE_INTRINSIC(SSE2_LoadAlignedVector128,                        "LoadAlignedVector128",                             SSE2,       -1,           16,          1,            {INS_movdqa,    INS_movdqa,    INS_movdqa,    INS_movdqa,    INS_movdqa,    INS_movdqa,    INS_movdqa,    INS_movdqa,    INS_invalid,   INS_movapd},            HW_Category_MemoryLoad,                        HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSE2_LoadFence,                                   "LoadFence",                                        SSE2,       -1,            0,          0,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_Special,                           HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSE2_LoadScalarVector128,                         "LoadScalarVector128",                              SSE2,       -1,           16,          1,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movd,      INS_movd,      INS_movq,      INS_movq,      INS_invalid,   INS_movsdsse2},         HW_Category_MemoryLoad,                        HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSE2_LoadVector128,                               "LoadVector128",                                    SSE2,       -1,           16,          1,            {INS_movdqu,    INS_movdqu,    INS_movdqu,    INS_movdqu,    INS_movdqu,    INS_movdqu,    INS_movdqu,    INS_movdqu,    INS_invalid,   INS_movupd},            HW_Category_MemoryLoad,                        HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSE2_Max,                                         "Max",                                              SSE2,       -1,           16,          2,            {INS_invalid,   INS_pmaxub,    INS_pmaxsw,    INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_maxpd},             HW_Category_SimpleSIMD,                        HW_Flag_Commutative)
 HARDWARE_INTRINSIC(SSE2_MemoryFence,                                 "MemoryFence",                                      SSE2,       -1,            0,          0,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_Special,                           HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE2_MaxScalar,                                   "MaxScalar",                                        SSE2,       -1,           16,          2,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_maxsd},             HW_Category_SIMDScalar,                        HW_Flag_CopyUpperBits)
 HARDWARE_INTRINSIC(SSE2_Min,                                         "Min",                                              SSE2,       -1,           16,          2,            {INS_invalid,   INS_pminub,    INS_pminsw,    INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_minpd},             HW_Category_SimpleSIMD,                        HW_Flag_Commutative)
+HARDWARE_INTRINSIC(SSE2_MinScalar,                                   "MinScalar",                                        SSE2,       -1,           16,          2,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_minsd},             HW_Category_SIMDScalar,                        HW_Flag_CopyUpperBits)
 HARDWARE_INTRINSIC(SSE2_MoveMask,                                    "MoveMask",                                         SSE2,       -1,           16,          1,            {INS_pmovmskb,  INS_pmovmskb,  INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movmskpd},          HW_Category_Special,                           HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE2_MoveScalar,                                  "MoveScalar",                                       SSE2,       -1,           16,          2,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movsdsse2},         HW_Category_SIMDScalar,                        HW_Flag_NoContainment)
 HARDWARE_INTRINSIC(SSE2_Multiply,                                    "Multiply",                                         SSE2,       -1,           16,          2,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_pmuludq,   INS_invalid,   INS_invalid,   INS_invalid,   INS_mulpd},             HW_Category_SimpleSIMD,                        HW_Flag_Commutative|HW_Flag_BaseTypeFromArg)
 HARDWARE_INTRINSIC(SSE2_MultiplyHigh,                                "MultiplyHigh",                                     SSE2,       -1,           16,          2,            {INS_invalid,   INS_invalid,   INS_pmulhw,    INS_pmulhuw,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_Commutative)
 HARDWARE_INTRINSIC(SSE2_MultiplyHorizontalAdd,                       "MultiplyHorizontalAdd",                            SSE2,       -1,           16,          2,            {INS_invalid,   INS_invalid,   INS_pmaddwd,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_Commutative|HW_Flag_BaseTypeFromArg)
 HARDWARE_INTRINSIC(SSE2_MultiplyLow,                                 "MultiplyLow",                                      SSE2,       -1,           16,          2,            {INS_invalid,   INS_invalid,   INS_pmullw,    INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_Commutative)
+HARDWARE_INTRINSIC(SSE2_MultiplyScalar,                              "MultiplyScalar",                                   SSE2,       -1,           16,          2,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_mulsd},             HW_Category_SIMDScalar,                        HW_Flag_CopyUpperBits)
 HARDWARE_INTRINSIC(SSE2_Or,                                          "Or",                                               SSE2,       -1,           16,          2,            {INS_por,       INS_por,       INS_por,       INS_por,       INS_por,       INS_por,       INS_por,       INS_por,       INS_invalid,   INS_orpd},              HW_Category_SimpleSIMD,                        HW_Flag_Commutative)
 HARDWARE_INTRINSIC(SSE2_PackSignedSaturate,                          "PackSignedSaturate",                               SSE2,       -1,           16,          2,            {INS_invalid,   INS_invalid,   INS_packsswb,  INS_invalid,   INS_packssdw,  INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_BaseTypeFromArg)
 HARDWARE_INTRINSIC(SSE2_PackUnsignedSaturate,                        "PackUnsignedSaturate",                             SSE2,       -1,           16,          2,            {INS_invalid,   INS_invalid,   INS_packuswb,  INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_BaseTypeFromArg)
 HARDWARE_INTRINSIC(SSE2_SetZeroVector128,                            "SetZeroVector128",                                 SSE2,       -1,           16,          0,            {INS_pxor,      INS_pxor,      INS_pxor,      INS_pxor,      INS_pxor,      INS_pxor,      INS_pxor,      INS_pxor,      INS_invalid,   INS_xorpd},             HW_Category_Helper,                            HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSE2_SumAbsoluteDifferences,                      "SumAbsoluteDifferences",                           SSE2,       -1,           16,          2,            {INS_invalid,   INS_psadbw,    INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_BaseTypeFromArg)
 HARDWARE_INTRINSIC(SSE2_Sqrt,                                        "Sqrt",                                             SSE2,       -1,           16,          1,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_sqrtpd},            HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE2_SqrtScalar,                                  "SqrtScalar",                                       SSE2,       -1,           16,          1,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_sqrtsd},            HW_Category_SIMDScalar,                        HW_Flag_CopyUpperBits)
 HARDWARE_INTRINSIC(SSE2_Store,                                       "Store",                                            SSE2,       -1,           16,          2,            {INS_movdqu,    INS_movdqu,    INS_movdqu,    INS_movdqu,    INS_movdqu,    INS_movdqu,    INS_movdqu,    INS_movdqu,    INS_invalid,   INS_movupd},            HW_Category_MemoryStore,                       HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSE2_StoreAligned,                                "StoreAligned",                                     SSE2,       -1,           16,          2,            {INS_movdqa,    INS_movdqa,    INS_movdqa,    INS_movdqa,    INS_movdqa,    INS_movdqa,    INS_movdqa,    INS_movdqa,    INS_invalid,   INS_movapd},            HW_Category_MemoryStore,                       HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSE2_StoreAlignedNonTemporal,                     "StoreAlignedNonTemporal",                          SSE2,       -1,           16,          2,            {INS_movntdq,   INS_movntdq,   INS_movntdq,   INS_movntdq,   INS_movntdq,   INS_movntdq,   INS_movntdq,   INS_movntdq,   INS_invalid,   INS_movntpd},           HW_Category_MemoryStore,                       HW_Flag_NoFlag)
@@ -175,6 +219,7 @@ HARDWARE_INTRINSIC(SSE2_StoreLow,                                    "StoreLow",
 HARDWARE_INTRINSIC(SSE2_StoreHigh,                                   "StoreHigh",                                        SSE2,       -1,           16,          2,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_movhpd},            HW_Category_MemoryStore,                       HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSE2_Subtract,                                    "Subtract",                                         SSE2,       -1,           16,          2,            {INS_psubb,     INS_psubb,     INS_psubw,     INS_psubw,     INS_psubd,     INS_psubd,     INS_psubq,     INS_psubq,     INS_invalid,   INS_subpd},             HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSE2_SubtractSaturate,                            "SubtractSaturate",                                 SSE2,       -1,           16,          2,            {INS_psubsb,    INS_psubusb,   INS_psubsw,    INS_psubusw,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid},           HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
+HARDWARE_INTRINSIC(SSE2_SubtractScalar,                              "SubtractScalar",                                   SSE2,       -1,           16,          2,            {INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_invalid,   INS_subsd},             HW_Category_SIMDScalar,                        HW_Flag_CopyUpperBits)
 HARDWARE_INTRINSIC(SSE2_UnpackHigh,                                  "UnpackHigh",                                       SSE2,       -1,           16,          2,            {INS_punpckhbw, INS_punpckhbw, INS_punpckhwd, INS_punpckhwd, INS_punpckhdq, INS_punpckhdq, INS_punpckhqdq,INS_punpckhqdq,INS_invalid,   INS_unpckhpd},          HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSE2_UnpackLow,                                   "UnpackLow",                                        SSE2,       -1,           16,          2,            {INS_punpcklbw, INS_punpcklbw, INS_punpcklwd, INS_punpcklwd, INS_punpckldq, INS_punpckldq, INS_punpcklqdq,INS_punpcklqdq,INS_invalid,   INS_unpcklpd},          HW_Category_SimpleSIMD,                        HW_Flag_NoFlag)
 HARDWARE_INTRINSIC(SSE2_Xor,                                         "Xor",                                              SSE2,       -1,           16,          2,            {INS_pxor,      INS_pxor,      INS_pxor,      INS_pxor,      INS_pxor,      INS_pxor,      INS_pxor,      INS_pxor,      INS_invalid,   INS_xorpd},             HW_Category_SimpleSIMD,                        HW_Flag_Commutative)
index 0fc2251..383c2c1 100644 (file)
@@ -736,11 +736,17 @@ GenTree* Compiler::impSSE2Intrinsic(NamedIntrinsic        intrinsic,
     var_types baseType = TYP_UNKNOWN;
     var_types retType  = TYP_UNKNOWN;
 
+    // The fencing intrinsics take no operands, so their simdSize is 0
     assert((simdSize == 16) || (simdSize == 0));
 
+    CORINFO_ARG_LIST_HANDLE argList = sig->args;
+    CORINFO_CLASS_HANDLE    argClass;
+    var_types               argType = TYP_UNKNOWN;
+
     switch (intrinsic)
     {
         case NI_SSE2_CompareLessThan:
+        {
             assert(sig->numArgs == 2);
             op2      = impSIMDPopStack(TYP_SIMD16);
             op1      = impSIMDPopStack(TYP_SIMD16);
@@ -755,6 +761,89 @@ GenTree* Compiler::impSSE2Intrinsic(NamedIntrinsic        intrinsic,
                     gtNewSimdHWIntrinsicNode(TYP_SIMD16, op2, op1, NI_SSE2_CompareGreaterThan, baseType, simdSize);
             }
             break;
+        }
+
+        case NI_SSE2_ConvertScalarToVector128Double:
+        {
+            assert(sig->numArgs == 2);
+            assert(getBaseTypeOfSIMDType(sig->retTypeSigClass) == TYP_DOUBLE);
+
+            argList = info.compCompHnd->getArgNext(sig->args);
+            CorInfoType corType =
+                strip(info.compCompHnd->getArgType(sig, argList, &argClass)); // type of the second argument
+
+            baseType = JITtype2varType(corType);
+
+#ifdef _TARGET_X86_
+            if (varTypeIsLong(JITtype2varType(corType)))
+            {
+                return impUnsupportedHWIntrinsic(CORINFO_HELP_THROW_PLATFORM_NOT_SUPPORTED, method, sig, mustExpand);
+            }
+#endif // _TARGET_X86_
+
+            if (baseType == TYP_STRUCT)
+            {
+                baseType = TYP_FLOAT; // it is the only type passed as Vector
+                op2      = impSIMDPopStack(TYP_SIMD16);
+            }
+            else
+            {
+                op2 = impPopStack().val;
+            }
+
+            op1     = impSIMDPopStack(TYP_SIMD16);
+            retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, op2, intrinsic, baseType, simdSize);
+
+            break;
+        }
+
+        case NI_SSE2_ConvertScalarToVector128Int64:
+        case NI_SSE2_ConvertScalarToVector128UInt64:
+        {
+            assert(sig->numArgs == 1);
+            baseType = getBaseTypeOfSIMDType(sig->retTypeSigClass);
+            assert(baseType == TYP_LONG || baseType == TYP_ULONG);
+
+#ifdef _TARGET_X86_
+            return impUnsupportedHWIntrinsic(CORINFO_HELP_THROW_PLATFORM_NOT_SUPPORTED, method, sig, mustExpand);
+#endif // _TARGET_X86_
+
+            op1     = impPopStack().val;
+            retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, intrinsic, baseType, simdSize);
+            break;
+        }
+
+        case NI_SSE2_ConvertScalarToVector128Single:
+        {
+            assert(sig->numArgs == 2);
+            assert(getBaseTypeOfSIMDType(sig->retTypeSigClass) == TYP_FLOAT);
+
+            op2     = impSIMDPopStack(TYP_SIMD16);
+            op1     = impSIMDPopStack(TYP_SIMD16);
+            retNode = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, op2, intrinsic, TYP_DOUBLE, simdSize);
+            break;
+        }
+
+        case NI_SSE2_ConvertToInt32:
+        case NI_SSE2_ConvertToInt64:
+        {
+            assert(sig->numArgs == 1);
+            op1      = impSIMDPopStack(TYP_SIMD16);
+            retType  = JITtype2varType(sig->retType);
+            baseType = getBaseTypeOfSIMDType(info.compCompHnd->getArgClass(sig, sig->args));
+            retNode  = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, baseType, simdSize);
+            break;
+        }
+
+        case NI_SSE2_ConvertToUInt32:
+        case NI_SSE2_ConvertToUInt64:
+        {
+            assert(sig->numArgs == 1);
+            op1      = impSIMDPopStack(TYP_SIMD16);
+            baseType = JITtype2varType(sig->retType);
+            retNode  = gtNewSimdHWIntrinsicNode(baseType, op1, intrinsic, baseType, simdSize);
+            break;
+        }
 
         case NI_SSE2_LoadFence:
         case NI_SSE2_MemoryFence:
@@ -768,6 +857,7 @@ GenTree* Compiler::impSSE2Intrinsic(NamedIntrinsic        intrinsic,
         }
 
         case NI_SSE2_MoveMask:
+        {
             assert(sig->numArgs == 1);
             retType = JITtype2varType(sig->retType);
             assert(retType == TYP_INT);
@@ -775,6 +865,7 @@ GenTree* Compiler::impSSE2Intrinsic(NamedIntrinsic        intrinsic,
             baseType = getBaseTypeOfSIMDType(info.compCompHnd->getArgClass(sig, sig->args));
             retNode  = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, baseType, simdSize);
             break;
+        }
 
         default:
             JITDUMP("Not implemented hardware intrinsic");
index e409573..63db1b0 100644 (file)
@@ -2286,6 +2286,10 @@ void LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree)
         case NI_SSE_CompareEqualUnorderedScalar:
         case NI_SSE_CompareNotEqualOrderedScalar:
         case NI_SSE_CompareNotEqualUnorderedScalar:
+        case NI_SSE2_CompareEqualOrderedScalar:
+        case NI_SSE2_CompareEqualUnorderedScalar:
+        case NI_SSE2_CompareNotEqualOrderedScalar:
+        case NI_SSE2_CompareNotEqualUnorderedScalar:
             info->internalIntCount = 1;
             info->setInternalCandidates(this, RBM_BYTE_REGS);
             break;
@@ -2317,6 +2321,7 @@ void LinearScan::BuildHWIntrinsic(GenTreeHWIntrinsic* intrinsicTree)
 
         case NI_SSE_ConvertToSingle:
         case NI_SSE_StaticCast:
+        case NI_SSE2_ConvertToDouble:
             assert(info->srcCount == 1);
             assert(info->dstCount == 1);
             useList.Last()->info.isTgtPref = true;
index 8233c16..e3fc3e8 100644 (file)
@@ -4,11 +4,11 @@
 //
 
 using System;
-using System.Globalization;
 using System.Runtime.CompilerServices;
 using System.Runtime.InteropServices;
 using System.Runtime.Intrinsics;
 using System.Runtime.Intrinsics.X86;
+using System.Text;
 
 namespace IntelHardwareIntrinsicTest
 {
@@ -36,6 +36,9 @@ namespace IntelHardwareIntrinsicTest
         ValueTuple<U, U, U, U, U, U, U, ValueTuple<U>> z,
         ref U a1, ref U a2, ref U a3, ref U a4, ref U a5, ref U a6, ref U a7, ref U a8);
 
+    // Validation callback consumed by TestTableScalarSse2.CheckResult. It is invoked once per
+    // slice with x and y spanning the two input arrays and z and a spanning the output and
+    // check arrays; returning false marks the overall check as failed.
+    public delegate bool CheckMethodEight<T, U>(
+     Span<T> x, Span<T> y, Span<U> z, Span<U> a);
+
     public delegate bool CheckMethodEightOne<T, U>(
         Span<T> x, Span<T> y, U z, ref U a);
 
@@ -106,6 +109,12 @@ namespace IntelHardwareIntrinsicTest
             Unsafe.Write((byte*)OutArrayPtr + (index * _stepSize), value);
         }
 
+        // Writes value2 into the output buffer at the given vector slot. A negative index
+        // (the default) selects the table's current index. value1 is accepted but not stored.
+        public void SetOutArray(bool value1, Vector128<T> value2, int index = -1)
+        {
+            int slot = (index >= 0) ? index : _index;
+            byte* destination = (byte*)OutArrayPtr + (slot * _stepSize);
+            Unsafe.Write(destination, value2);
+        }
+
         public (Vector128<T>, Vector128<T>, Vector128<T>) this[int index]
         {
             get
@@ -1040,7 +1049,8 @@ namespace IntelHardwareIntrinsicTest
     public unsafe struct TestTableImmSse2<T, U, V> : IDisposable where T : struct where U : struct where V : struct
     {
         private const int _stepSize = 16;
-        private int _tSize;
+        private static int s_tSize;
+        private static int s_ElementCount;
 
         private GCHandle _inHandle1;
         private GCHandle _inHandle2;
@@ -1100,8 +1110,9 @@ namespace IntelHardwareIntrinsicTest
 
         public TestTableImmSse2(int lengthInVectors, double tSizeMultiplier = 1.0, bool initialize = true)
         {
-            _tSize = Marshal.SizeOf<T>();
-            int length = _stepSize / _tSize * lengthInVectors;
+            s_tSize = Marshal.SizeOf<T>();
+            s_ElementCount = _stepSize / s_tSize;
+            int length = s_ElementCount * lengthInVectors;
             inArray1 = new T[length];
             inArray2 = new T[lengthInVectors];
             immArray = new V[lengthInVectors];
@@ -1147,7 +1158,7 @@ namespace IntelHardwareIntrinsicTest
                 }
                 else
                 {
-                    random.NextBytes(new Span<byte>(((byte*)InArray1Ptr), inArray1.Length * _tSize));
+                    random.NextBytes(new Span<byte>(((byte*)InArray1Ptr), inArray1.Length * s_tSize));
                 }
             }
             else if (mode == InitMode.NumberFirstVectors)
@@ -1237,7 +1248,7 @@ namespace IntelHardwareIntrinsicTest
             bool result = true;
             for (int i = 0; i < inArray1.Length; i++)
             {
-                int elNo = _stepSize / _tSize;
+                int elNo = _stepSize / s_tSize;
                 if (!check(
                     new Span<T>(inArray1, Index * elNo, elNo),
                     inArray2[i], immArray[i],
@@ -1260,6 +1271,257 @@ namespace IntelHardwareIntrinsicTest
         }
     }
 
+    public unsafe struct TestTableScalarSse2<T, U> : IDisposable where T : struct where U : struct
+    {
+        // Byte stride between consecutive vector slots in the pinned arrays.
+        private const int _stepSize = 16;
+        // Marshal.SizeOf<T>(); static, but (re)assigned by every constructor call.
+        private static int s_tSize;
+        // Number of T elements per vector slot (_stepSize / s_tSize); also assigned by the constructor.
+        public static int s_ElementCount;
+
+        // Pinned GC handles for the four arrays below, allocated in the constructor.
+        private GCHandle _inHandle1;
+        private GCHandle _inHandle2;
+        private GCHandle _outHandle;
+        private GCHandle _checkHandle;
+
+        // Current slot index used by Vector1/Vector2 and as the default target of SetOutArray.
+        private int _index;
+
+        // Input operand arrays (inArray1/inArray2) and result arrays (outArray/checkArray).
+        public T[] inArray1;
+        public T[] inArray2;
+        public U[] outArray;
+        public U[] checkArray;
+
+        // Raw addresses of the pinned arrays.
+        public void* InArray1Ptr => _inHandle1.AddrOfPinnedObject().ToPointer();
+        public void* InArray2Ptr => _inHandle2.AddrOfPinnedObject().ToPointer();
+        public void* OutArrayPtr => _outHandle.AddrOfPinnedObject().ToPointer();
+        public void* CheckArrayPtr => _checkHandle.AddrOfPinnedObject().ToPointer();
+
+        // Input vectors read from byte offset (_index * _stepSize) of the respective input array.
+        public Vector128<T> Vector1 => Unsafe.Read<Vector128<T>>((byte*)InArray1Ptr + (_index * _stepSize));
+        public Vector128<T> Vector2 => Unsafe.Read<Vector128<T>>((byte*)InArray2Ptr + (_index * _stepSize));
+
+        // Gets or sets the current slot index.
+        public int Index { get => _index; set => _index = value; }
+
+        // Makes the given slot the current one for Vector1, Vector2 and the SetOutArray defaults.
+        public void SetIndex(int index) => _index = index;
+
+        // Stores a scalar result in outArray. A negative index (the default) targets the
+        // table's current index; any other value selects that element explicitly.
+        public void SetOutArray(U value, int index = -1)
+        {
+            index = index < 0 ? _index : index;
+            // Write through the resolved index so an explicitly supplied index is honoured.
+            outArray[index] = value;
+        }
+
+        // Stores a vector result in the pinned output buffer at byte offset index * _stepSize.
+        // A negative index (the default) targets the table's current index.
+        public void SetOutArray(Vector128<U> value, int index = -1)
+        {
+            index = index < 0 ? _index : index;
+            // Compute the offset from the resolved index so an explicitly supplied index is honoured.
+            Unsafe.Write((byte*)OutArrayPtr + (index * _stepSize), value);
+        }
+
+        // Moves the current index to the requested slot and returns both input vectors read there.
+        public (Vector128<T>, Vector128<T>) this[int index]
+        {
+            get
+            {
+                Index = index;
+                Vector128<T> first = Vector1;
+                Vector128<T> second = Vector2;
+                return (first, second);
+            }
+        }
+
+        // Returns s_ElementCount-long Memory windows over all four arrays, each starting at index.
+        public ValueTuple<Memory<T>, Memory<T>, Memory<U>, Memory<U>> GetMemoryDataPoint(int index)
+        {
+            var input1 = new Memory<T>(inArray1, index, s_ElementCount);
+            var input2 = new Memory<T>(inArray2, index, s_ElementCount);
+            var output = new Memory<U>(outArray, index, s_ElementCount);
+            var expected = new Memory<U>(checkArray, index, s_ElementCount);
+            return (input1, input2, output, expected);
+        }
+
+        // Returns Memory windows over the two input arrays starting at index, together with the
+        // single output and check values stored at index / s_ElementCount.
+        public ValueTuple<Memory<T>, Memory<T>, U, U> GetMemoryValueDataPoint(int index)
+        {
+            var input1 = new Memory<T>(inArray1, index, s_ElementCount);
+            var input2 = new Memory<T>(inArray2, index, s_ElementCount);
+            U actual = outArray[index / s_ElementCount];
+            U expected = checkArray[index / s_ElementCount];
+            return (input1, input2, actual, expected);
+        }
+
+        // Factory wrapper over the constructor; the table is initialized with the constructor's default.
+        public static TestTableScalarSse2<T, U> Create(int lengthInVectors, double tSizeMultiplier = 1.0)
+            => new TestTableScalarSse2<T, U>(lengthInVectors, tSizeMultiplier);
+
+        // Allocates and pins the input, output and check arrays for lengthInVectors vector slots.
+        // The output/check arrays hold (input length / tSizeMultiplier) elements. When initialize
+        // is true the input arrays are filled via Initialize().
+        public TestTableScalarSse2(int lengthInVectors, double tSizeMultiplier = 1.0, bool initialize = true)
+        {
+            s_tSize = Marshal.SizeOf<T>();
+            s_ElementCount = _stepSize / s_tSize;
+
+            int inputLength = s_ElementCount * lengthInVectors;
+            inArray1 = new T[inputLength];
+            inArray2 = new T[inputLength];
+
+            int outputLength = (int)(inputLength / tSizeMultiplier);
+            outArray = new U[outputLength];
+            checkArray = new U[outputLength];
+
+            _index = 0;
+
+            _inHandle1 = GCHandle.Alloc(inArray1, GCHandleType.Pinned);
+            _inHandle2 = GCHandle.Alloc(inArray2, GCHandleType.Pinned);
+            _outHandle = GCHandle.Alloc(outArray, GCHandleType.Pinned);
+            _checkHandle = GCHandle.Alloc(checkArray, GCHandleType.Pinned);
+
+            if (!initialize)
+            {
+                return;
+            }
+
+            Initialize();
+        }
+
+        // Fills the input arrays using the InitMode.Undefined (random data) path.
+        public void Initialize() => Initialize(InitMode.Undefined);
+
+        // Fills the input arrays according to mode:
+        //  - InitMode.Undefined: random data. double and float inputs get products of
+        //    NextDouble() and Next(); every other element type gets raw random bytes.
+        //  - InitMode.NumberFirstVectors: delegates to InitializeWithVectorNumbering().
+        // Any other mode leaves the arrays untouched.
+        public void Initialize(InitMode mode = InitMode.Undefined)
+        {
+            if (mode == InitMode.Undefined)
+            {
+                // Seed from the low 32 bits of the current UTC tick count.
+                Random random = new Random(unchecked((int)(DateTime.UtcNow.Ticks & 0x00000000ffffffffL)));
+                if (inArray1 is double[] array1 && inArray2 is double[] array2)
+                {
+                    for (int i = 0; i < array1.Length; i++)
+                    {
+                        array1[i] = random.NextDouble() * random.Next();
+                        array2[i] = random.NextDouble() * random.Next();
+                    }
+                }
+                else if (inArray1 is float[] arrayFloat1 && inArray2 is float[] arrayFloat2)
+                {
+                    for (int i = 0; i < arrayFloat1.Length; i++)
+                    {
+                        // Bound the integer factor so the product stays well inside float range.
+                        arrayFloat1[i] = (float)(random.NextDouble() * random.Next(ushort.MaxValue));
+                        arrayFloat2[i] = (float)(random.NextDouble() * random.Next(ushort.MaxValue));
+                    }
+                }
+                else
+                {
+                    random.NextBytes(new Span<byte>(((byte*)InArray1Ptr), inArray1.Length * s_tSize));
+                    random.NextBytes(new Span<byte>(((byte*)InArray2Ptr), inArray2.Length * s_tSize));
+                }
+            }
+            else if (mode == InitMode.NumberFirstVectors)
+            {
+                InitializeWithVectorNumbering();
+            }
+        }
+
+        // Fills inArray1 with the ascending sequence 0, 1, 2, ... converted to the element type.
+        // Only inArray1 is written; inArray2 is left as it was. Element types not listed below
+        // are left untouched.
+        //
+        // The loops count with an int index rather than an element-typed counter, so narrow
+        // element types (byte, sbyte, short, ushort) cannot overflow the counter on long arrays;
+        // the stored value simply wraps around via the unchecked cast.
+        private void InitializeWithVectorNumbering()
+        {
+            if (inArray1 is double[] doubleArray1)
+            {
+                for (int i = 0; i < doubleArray1.Length; i++)
+                {
+                    doubleArray1[i] = i;
+                }
+            }
+            else if (inArray1 is float[] floatArray1)
+            {
+                for (int i = 0; i < floatArray1.Length; i++)
+                {
+                    floatArray1[i] = i;
+                }
+            }
+            else if (inArray1 is byte[] byteArray1)
+            {
+                for (int i = 0; i < byteArray1.Length; i++)
+                {
+                    byteArray1[i] = unchecked((byte)i);
+                }
+            }
+            else if (inArray1 is sbyte[] sbyteArray1)
+            {
+                for (int i = 0; i < sbyteArray1.Length; i++)
+                {
+                    sbyteArray1[i] = unchecked((sbyte)i);
+                }
+            }
+            else if (inArray1 is short[] shortArray1)
+            {
+                for (int i = 0; i < shortArray1.Length; i++)
+                {
+                    shortArray1[i] = unchecked((short)i);
+                }
+            }
+            else if (inArray1 is ushort[] ushortArray1)
+            {
+                for (int i = 0; i < ushortArray1.Length; i++)
+                {
+                    ushortArray1[i] = unchecked((ushort)i);
+                }
+            }
+            else if (inArray1 is int[] intArray1)
+            {
+                for (int i = 0; i < intArray1.Length; i++)
+                {
+                    intArray1[i] = i;
+                }
+            }
+            else if (inArray1 is uint[] uintArray1)
+            {
+                for (int i = 0; i < uintArray1.Length; i++)
+                {
+                    uintArray1[i] = (uint)i;
+                }
+            }
+            else if (inArray1 is long[] longArray1)
+            {
+                for (int i = 0; i < longArray1.Length; i++)
+                {
+                    longArray1[i] = i;
+                }
+            }
+            else if (inArray1 is ulong[] ulongArray1)
+            {
+                for (int i = 0; i < ulongArray1.Length; i++)
+                {
+                    ulongArray1[i] = (ulong)i;
+                }
+            }
+        }
+
+        /// <summary>
+        /// Walks the arrays in steps of <c>s_ElementCount</c> and invokes <paramref name="check"/>
+        /// on the matching slices of both input arrays, the output array and the check array.
+        /// Every slice is checked even after a failure has been seen.
+        /// </summary>
+        /// <param name="check">Callback that validates one group of slices.</param>
+        /// <returns><c>true</c> only if the callback returned <c>true</c> for every group.</returns>
+        public bool CheckResult(CheckMethodEight<T, U> check)
+        {
+            bool allPassed = true;
+            int offset = 0;
+            while (offset < inArray1.Length - 1)
+            {
+                var firstSlice = new Span<T>(inArray1, offset, s_ElementCount);
+                var secondSlice = new Span<T>(inArray2, offset, s_ElementCount);
+                var outSlice = new Span<U>(outArray, offset, s_ElementCount);
+                var checkSlice = new Span<U>(checkArray, offset, s_ElementCount);
+
+                // Non-short-circuit '&=' so the callback still runs after an earlier failure.
+                allPassed &= check(firstSlice, secondSlice, outSlice, checkSlice);
+                offset += s_ElementCount;
+            }
+            return allPassed;
+        }
+
+        /// <summary>
+        /// Walks the input arrays in steps of <c>s_ElementCount</c> and invokes
+        /// <paramref name="check"/> with the matching input slices plus one output value and one
+        /// check value (by reference) per step. Every step is checked even after a failure.
+        /// </summary>
+        /// <param name="check">Callback that validates one pair of input slices against a single result.</param>
+        /// <returns><c>true</c> only if the callback returned <c>true</c> for every step.</returns>
+        public bool CheckResult(CheckMethodEightOne<T, U> check)
+        {
+            bool allPassed = true;
+            int offset = 0;
+            int resultIndex = 0;
+            while (offset < inArray1.Length - 1)
+            {
+                var firstSlice = new Span<T>(inArray1, offset, s_ElementCount);
+                var secondSlice = new Span<T>(inArray2, offset, s_ElementCount);
+
+                // Non-short-circuit '&=' so the callback still runs after an earlier failure.
+                allPassed &= check(firstSlice, secondSlice, outArray[resultIndex], ref checkArray[resultIndex]);
+
+                offset += s_ElementCount;
+                resultIndex++;
+            }
+            return allPassed;
+        }
+
+        // Releases the table's four handles by calling Free() on each one:
+        // the two input handles, the output handle and the check handle.
+        // NOTE(review): presumably these are GCHandles pinning the corresponding arrays —
+        // confirm against the field declarations. Nothing here guards against a second call.
+        public void Dispose()
+        {
+            _inHandle1.Free();
+            _inHandle2.Free();
+            _outHandle.Free();
+            _checkHandle.Free();
+        }
+    }
+
     public enum SpecialCheck
     {
         Undefined = 0,
@@ -1282,6 +1544,19 @@ namespace IntelHardwareIntrinsicTest
             Console.WriteLine($"{ typeof(Sse2)}.{functionName} test tuples:");
         }
 
+        // Formats the contents of a Memory<T> as "(e0, e1, ...)"; an empty memory yields "()".
+        private static string PrintMemory<T>(Memory<T> x)
+        {
+            Span<T> elements = x.Span;
+            int count = x.Length;
+            var text = new StringBuilder();
+            text.Append("(");
+            for (int index = 0; index < count; index++)
+            {
+                // Separator goes before every element except the first.
+                if (index > 0)
+                {
+                    text.Append(", ");
+                }
+                text.Append(elements[index]);
+            }
+            text.Append(")");
+            return text.ToString();
+        }
+
         private static void PrintError<T>(TestTableSse2<T> testTable, string functionName = "", string testFuncString = "",
             CheckMethod<T> check = null) where T : struct
         {
@@ -1391,6 +1666,30 @@ namespace IntelHardwareIntrinsicTest
             Console.WriteLine();
         }
 
+        // Writes the error header, then one "( x(...), y(...), z(...), a(...))" tuple per data point
+        // of the scalar test table, all on a single console line, followed by a newline.
+        // The 'check' parameter is not used in the body.
+        private static void PrintError<T, U>(TestTableScalarSse2<T, U> testTable, string functionName = "", string testFuncString = "",
+            CheckMethodEight<T, U> check = null) where T : struct where U : struct
+        {
+            PrintErrorHeaderTu<T>(functionName, testFuncString);
+
+            int index = 0;
+            while (index < testTable.inArray1.Length - 1)
+            {
+                var point = testTable.GetMemoryDataPoint(index);
+                string xText = PrintMemory(point.Item1);
+                string yText = PrintMemory(point.Item2);
+                string zText = PrintMemory(point.Item3);
+                string aText = PrintMemory(point.Item4);
+                Console.Write($"( x{xText}, y{yText}, z{zText}, a{aText})");
+
+                index += TestTableScalarSse2<T, U>.s_ElementCount;
+            }
+
+            Console.WriteLine();
+        }
+
+        // Writes the error header, then one "( x(...), y(...), z(v), a(v))" tuple per data point
+        // of the scalar test table, all on a single console line, followed by a newline.
+        // Here the third and fourth items are single values rather than memory blocks.
+        // The 'check' parameter is not used in the body.
+        private static void PrintError<T, U>(TestTableScalarSse2<T, U> testTable, string functionName = "", string testFuncString = "",
+            CheckMethodEightOne<T, U> check = null) where T : struct where U : struct
+        {
+            PrintErrorHeaderTu<T>(functionName, testFuncString);
+
+            int index = 0;
+            while (index < testTable.inArray1.Length - 1)
+            {
+                var point = testTable.GetMemoryValueDataPoint(index);
+                string xText = PrintMemory(point.Item1);
+                string yText = PrintMemory(point.Item2);
+                var zValue = point.Item3;
+                var aValue = point.Item4;
+                Console.Write($"( x{xText}, y{yText}, z({zValue}), a({aValue}))");
+
+                index += TestTableScalarSse2<T, U>.s_ElementCount;
+            }
+
+            Console.WriteLine();
+        }
+
         private static void PrintError<T, U>(TestTableSse2<T, U> testTable, string functionName = "", string testFuncString = "",
             CheckMethodEightOne<T, U> check = null) where T : struct where U : struct
         {