From: Tanner Gooding Date: Fri, 21 Jan 2022 19:46:21 +0000 (-0800) Subject: Ensure several helper intrinsics are correctly imported and handled (#63972) X-Git-Tag: accepted/tizen/unified/riscv/20231226.055536~11334 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=96a7478a29d1711eae808746db9d409ed83104e4;p=platform%2Fupstream%2Fdotnet%2Fruntime.git Ensure several helper intrinsics are correctly imported and handled (#63972) * Ensure several helper intrinsics are correctly imported and handled * Ensure that Sum for TYP_INT/UINT on Arm64 is correctly handled * Respond to PR feedback and ensure ExtractMostSignificantBits for Vector64 on Arm64 also uses AddPairwise * Applying formatting patch * Ensure the clsHnd is correct * Fix the remaining musl failures * Ensure that we aren't sign-extending TYP_BYTE (System.SByte) for ExtractMostSignificantBits * Ensure an assert is correct on x64 * Ensure Vector64.Dot on Arm64 uses AddPairwise, not AddAcross * Apply formatting patch --- diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp index be664b4..b8897e8 100644 --- a/src/coreclr/jit/gentree.cpp +++ b/src/coreclr/jit/gentree.cpp @@ -17948,7 +17948,7 @@ GenTree* Compiler::gtNewSimdAbsNode( if ((simdBaseType != TYP_LONG) && ((simdSize == 32) || compOpportunisticallyDependsOn(InstructionSet_SSSE3))) { NamedIntrinsic intrinsic = (simdSize == 32) ? 
NI_AVX2_Abs : NI_SSSE3_Abs; - return gtNewSimdAsHWIntrinsicNode(type, op1, intrinsic, simdBaseJitType, simdSize); + return gtNewSimdHWIntrinsicNode(type, op1, intrinsic, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); } else { @@ -19414,7 +19414,7 @@ GenTree* Compiler::gtNewSimdCmpOpAllNode(genTreeOps op, getAllBitsSet = NI_Vector128_get_AllBitsSet; } - op1 = gtNewSimdCmpOpNode(op, simdBaseType, op1, op2, simdBaseJitType, simdSize, + op1 = gtNewSimdCmpOpNode(op, simdType, op1, op2, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ false); if (simdBaseType == TYP_FLOAT) @@ -19428,7 +19428,7 @@ GenTree* Compiler::gtNewSimdCmpOpAllNode(genTreeOps op, simdBaseJitType = CORINFO_TYPE_LONG; } - op2 = gtNewSimdHWIntrinsicNode(simdBaseType, getAllBitsSet, simdBaseJitType, simdSize); + op2 = gtNewSimdHWIntrinsicNode(simdType, getAllBitsSet, simdBaseJitType, simdSize); break; } #elif defined(TARGET_ARM64) @@ -19459,7 +19459,7 @@ GenTree* Compiler::gtNewSimdCmpOpAllNode(genTreeOps op, getAllBitsSet = NI_Vector128_get_AllBitsSet; } - op1 = gtNewSimdCmpOpNode(op, simdBaseType, op1, op2, simdBaseJitType, simdSize, + op1 = gtNewSimdCmpOpNode(op, simdType, op1, op2, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ false); if (simdBaseType == TYP_FLOAT) @@ -19473,7 +19473,7 @@ GenTree* Compiler::gtNewSimdCmpOpAllNode(genTreeOps op, simdBaseJitType = CORINFO_TYPE_LONG; } - op2 = gtNewSimdHWIntrinsicNode(simdBaseType, getAllBitsSet, simdBaseJitType, simdSize); + op2 = gtNewSimdHWIntrinsicNode(simdType, getAllBitsSet, simdBaseJitType, simdSize); break; } #else @@ -19537,7 +19537,7 @@ GenTree* Compiler::gtNewSimdCmpOpAnyNode(genTreeOps op, intrinsic = (simdSize == 32) ? 
NI_Vector256_op_Inequality : NI_Vector128_op_Inequality; - op1 = gtNewSimdCmpOpNode(op, simdBaseType, op1, op2, simdBaseJitType, simdSize, + op1 = gtNewSimdCmpOpNode(op, simdType, op1, op2, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ false); if (simdBaseType == TYP_FLOAT) @@ -19551,7 +19551,7 @@ GenTree* Compiler::gtNewSimdCmpOpAnyNode(genTreeOps op, simdBaseJitType = CORINFO_TYPE_LONG; } - op2 = gtNewSimdZeroNode(simdBaseType, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ false); + op2 = gtNewSimdZeroNode(simdType, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ false); break; } @@ -19572,7 +19572,7 @@ GenTree* Compiler::gtNewSimdCmpOpAnyNode(genTreeOps op, intrinsic = (simdSize == 8) ? NI_Vector64_op_Inequality : NI_Vector128_op_Inequality; - op1 = gtNewSimdCmpOpNode(op, simdBaseType, op1, op2, simdBaseJitType, simdSize, + op1 = gtNewSimdCmpOpNode(op, simdType, op1, op2, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ false); if (simdBaseType == TYP_FLOAT) @@ -19586,7 +19586,7 @@ GenTree* Compiler::gtNewSimdCmpOpAnyNode(genTreeOps op, simdBaseJitType = CORINFO_TYPE_LONG; } - op2 = gtNewSimdZeroNode(simdBaseType, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ false); + op2 = gtNewSimdZeroNode(simdType, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ false); break; } @@ -19659,7 +19659,8 @@ GenTree* Compiler::gtNewSimdCndSelNode(var_types type, // result = op2 | op3 return gtNewSimdBinOpNode(GT_OR, type, op2, op3, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); #elif defined(TARGET_ARM64) - return gtNewSimdAsHWIntrinsicNode(type, op1, op2, op3, NI_AdvSimd_BitwiseSelect, simdBaseJitType, simdSize); + return gtNewSimdHWIntrinsicNode(type, op1, op2, op3, NI_AdvSimd_BitwiseSelect, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); #else #error Unsupported platform #endif // !TARGET_XARCH && !TARGET_ARM64 @@ -19718,7 +19719,7 @@ GenTree* Compiler::gtNewSimdDotProdNode(var_types type, assert(op2->TypeIs(simdType)); var_types 
simdBaseType = JitType2PreciseVarType(simdBaseJitType); - assert(genActualType(simdBaseType) == type); + assert(JITtype2varType(simdBaseJitType) == type); NamedIntrinsic intrinsic = NI_Illegal; @@ -20751,7 +20752,7 @@ GenTree* Compiler::gtNewSimdSumNode( for (int i = 0; i < haddCount; i++) { op1 = impCloneExpr(op1, &tmp, clsHnd, (unsigned)CHECK_SPILL_ALL, nullptr DEBUGARG("Clone op1 for vector sum")); - op1 = gtNewSimdAsHWIntrinsicNode(simdType, op1, tmp, intrinsic, simdBaseJitType, simdSize); + op1 = gtNewSimdHWIntrinsicNode(simdType, op1, tmp, intrinsic, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); } if (simdSize == 32) @@ -20759,14 +20760,15 @@ GenTree* Compiler::gtNewSimdSumNode( intrinsic = (simdBaseType == TYP_FLOAT) ? NI_SSE_Add : NI_SSE2_Add; op1 = impCloneExpr(op1, &tmp, clsHnd, (unsigned)CHECK_SPILL_ALL, nullptr DEBUGARG("Clone op1 for vector sum")); - op1 = gtNewSimdAsHWIntrinsicNode(TYP_SIMD16, op1, gtNewIconNode(0x01, TYP_INT), NI_AVX_ExtractVector128, - simdBaseJitType, simdSize); + op1 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, gtNewIconNode(0x01, TYP_INT), NI_AVX_ExtractVector128, + simdBaseJitType, simdSize, isSimdAsHWIntrinsic); - tmp = gtNewSimdAsHWIntrinsicNode(simdType, tmp, NI_Vector256_GetLower, simdBaseJitType, simdSize); - op1 = gtNewSimdAsHWIntrinsicNode(TYP_SIMD16, op1, tmp, intrinsic, simdBaseJitType, 16); + tmp = gtNewSimdHWIntrinsicNode(simdType, tmp, NI_Vector256_GetLower, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + op1 = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, tmp, intrinsic, simdBaseJitType, 16, isSimdAsHWIntrinsic); } - return gtNewSimdAsHWIntrinsicNode(type, op1, NI_Vector128_ToScalar, simdBaseJitType, simdSize); + return gtNewSimdHWIntrinsicNode(type, op1, NI_Vector128_ToScalar, simdBaseJitType, simdSize, isSimdAsHWIntrinsic); #elif defined(TARGET_ARM64) switch (simdBaseType) { @@ -20774,34 +20776,64 @@ GenTree* Compiler::gtNewSimdSumNode( case TYP_UBYTE: case TYP_SHORT: case TYP_USHORT: + { + tmp = 
gtNewSimdHWIntrinsicNode(simdType, op1, NI_AdvSimd_Arm64_AddAcross, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + return gtNewSimdHWIntrinsicNode(type, tmp, NI_Vector64_ToScalar, simdBaseJitType, 8, isSimdAsHWIntrinsic); + } + case TYP_INT: case TYP_UINT: { - tmp = gtNewSimdAsHWIntrinsicNode(simdType, op1, NI_AdvSimd_Arm64_AddAcross, simdBaseJitType, simdSize); - return gtNewSimdAsHWIntrinsicNode(type, tmp, NI_Vector64_ToScalar, simdBaseJitType, 8); + if (simdSize == 8) + { + op1 = impCloneExpr(op1, &tmp, clsHnd, (unsigned)CHECK_SPILL_ALL, + nullptr DEBUGARG("Clone op1 for vector sum")); + tmp = gtNewSimdHWIntrinsicNode(simdType, op1, tmp, NI_AdvSimd_AddPairwise, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); + } + else + { + tmp = gtNewSimdHWIntrinsicNode(TYP_SIMD16, op1, NI_AdvSimd_Arm64_AddAcross, simdBaseJitType, 16, + isSimdAsHWIntrinsic); + } + return gtNewSimdHWIntrinsicNode(type, tmp, NI_Vector64_ToScalar, simdBaseJitType, 8, isSimdAsHWIntrinsic); } + case TYP_FLOAT: { - unsigned vectorLength = getSIMDVectorLength(simdSize, simdBaseType); - int haddCount = genLog2(vectorLength); - - for (int i = 0; i < haddCount; i++) + if (simdSize == 8) { - op1 = impCloneExpr(op1, &tmp, clsHnd, (unsigned)CHECK_SPILL_ALL, - nullptr DEBUGARG("Clone op1 for vector sum")); - op1 = gtNewSimdAsHWIntrinsicNode(simdType, op1, tmp, NI_AdvSimd_Arm64_AddPairwise, simdBaseJitType, - simdSize); + op1 = gtNewSimdHWIntrinsicNode(TYP_SIMD8, op1, NI_AdvSimd_Arm64_AddPairwiseScalar, simdBaseJitType, + simdSize, isSimdAsHWIntrinsic); } + else + { + unsigned vectorLength = getSIMDVectorLength(simdSize, simdBaseType); + int haddCount = genLog2(vectorLength); - return gtNewSimdAsHWIntrinsicNode(type, op1, NI_Vector128_ToScalar, simdBaseJitType, simdSize); + for (int i = 0; i < haddCount; i++) + { + op1 = impCloneExpr(op1, &tmp, clsHnd, (unsigned)CHECK_SPILL_ALL, + nullptr DEBUGARG("Clone op1 for vector sum")); + op1 = gtNewSimdHWIntrinsicNode(simdType, op1, tmp, 
NI_AdvSimd_Arm64_AddPairwise, simdBaseJitType, + simdSize, isSimdAsHWIntrinsic); + } + } + return gtNewSimdHWIntrinsicNode(type, op1, NI_Vector128_ToScalar, simdBaseJitType, simdSize, + isSimdAsHWIntrinsic); } + case TYP_DOUBLE: case TYP_LONG: case TYP_ULONG: { - op1 = gtNewSimdAsHWIntrinsicNode(TYP_SIMD8, op1, NI_AdvSimd_Arm64_AddPairwiseScalar, simdBaseJitType, - simdSize); - return gtNewSimdAsHWIntrinsicNode(type, op1, NI_Vector64_ToScalar, simdBaseJitType, 8); + if (simdSize == 16) + { + op1 = gtNewSimdHWIntrinsicNode(TYP_SIMD8, op1, NI_AdvSimd_Arm64_AddPairwiseScalar, simdBaseJitType, + simdSize, isSimdAsHWIntrinsic); + } + return gtNewSimdHWIntrinsicNode(type, op1, NI_Vector64_ToScalar, simdBaseJitType, 8, isSimdAsHWIntrinsic); } default: { diff --git a/src/coreclr/jit/hwintrinsicarm64.cpp b/src/coreclr/jit/hwintrinsicarm64.cpp index 0c1b01e..3e5c612 100644 --- a/src/coreclr/jit/hwintrinsicarm64.cpp +++ b/src/coreclr/jit/hwintrinsicarm64.cpp @@ -339,6 +339,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, GenTree* op1 = nullptr; GenTree* op2 = nullptr; GenTree* op3 = nullptr; + GenTree* op4 = nullptr; switch (intrinsic) { @@ -601,11 +602,16 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, { assert(sig->numArgs == 2); - op2 = impSIMDPopStack(retType); - op1 = impSIMDPopStack(retType); + if (!varTypeIsLong(simdBaseType)) + { + var_types simdType = getSIMDTypeForSize(simdSize); - retNode = - gtNewSimdDotProdNode(retType, op1, op2, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ false); + op2 = impSIMDPopStack(simdType); + op1 = impSIMDPopStack(simdType); + + retNode = gtNewSimdDotProdNode(retType, op1, op2, simdBaseJitType, simdSize, + /* isSimdAsHWIntrinsic */ false); + } break; } @@ -681,6 +687,8 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, case TYP_UBYTE: { op2 = gtNewIconNode(0x80); + simdBaseType = TYP_UBYTE; + simdBaseJitType = CORINFO_TYPE_UBYTE; vectorCreateOp1 = 
gtNewLconNode(0x00FFFEFDFCFBFAF9); if (simdSize == 16) @@ -705,30 +713,6 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, case TYP_INT: case TYP_UINT: - { - op2 = gtNewIconNode(0x80000000); - vectorCreateOp1 = gtNewLconNode(0xFFFFFFE2FFFFFFE1); - - if (simdSize == 16) - { - vectorCreateOp2 = gtNewLconNode(0xFFFFFFE4FFFFFFE3); - } - break; - } - - case TYP_LONG: - case TYP_ULONG: - { - op2 = gtNewLconNode(0x8000000000000000); - vectorCreateOp1 = gtNewLconNode(0xFFFFFFFFFFFFFFC1); - - if (simdSize == 16) - { - vectorCreateOp2 = gtNewLconNode(0xFFFFFFFFFFFFFFC2); - } - break; - } - case TYP_FLOAT: { op2 = gtNewIconNode(0x80000000); @@ -743,6 +727,8 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, break; } + case TYP_LONG: + case TYP_ULONG: case TYP_DOUBLE: { op2 = gtNewLconNode(0x8000000000000000); @@ -774,17 +760,27 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, gtNewSimdHWIntrinsicNode(simdType, vectorCreateOp1, NI_Vector64_Create, vectorCreateType, simdSize); } + op2 = + gtNewSimdCreateBroadcastNode(simdType, op2, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ false); + op1 = gtNewSimdHWIntrinsicNode(simdType, op1, op2, NI_AdvSimd_And, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ false); - op1 = gtNewSimdHWIntrinsicNode(simdType, op1, op3, NI_AdvSimd_ShiftLogical, simdBaseJitType, simdSize, + NamedIntrinsic shiftIntrinsic = NI_AdvSimd_ShiftLogical; + + if ((simdSize == 8) && varTypeIsLong(simdBaseType)) + { + shiftIntrinsic = NI_AdvSimd_ShiftLogicalScalar; + } + + op1 = gtNewSimdHWIntrinsicNode(simdType, op1, op3, shiftIntrinsic, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ false); if (varTypeIsByte(simdBaseType) && (simdSize == 16)) { - CORINFO_CLASS_HANDLE clsHnd = gtGetStructHandleForSIMD(simdType, simdBaseJitType); + CORINFO_CLASS_HANDLE simdClsHnd = gtGetStructHandleForSIMD(simdType, simdBaseJitType); - op1 = impCloneExpr(op1, &op2, clsHnd, (unsigned)CHECK_SPILL_ALL, + op1 = 
impCloneExpr(op1, &op2, simdClsHnd, (unsigned)CHECK_SPILL_ALL, nullptr DEBUGARG("Clone op1 for vector extractmostsignificantbits")); op1 = gtNewSimdHWIntrinsicNode(TYP_SIMD8, op1, NI_Vector128_GetLower, simdBaseJitType, simdSize, @@ -793,9 +789,9 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, /* isSimdAsHWIntrinsic */ false); op1 = gtNewSimdHWIntrinsicNode(simdBaseType, op1, NI_Vector64_ToScalar, simdBaseJitType, 8, /* isSimdAsHWIntrinsic */ false); - op1 = gtNewCastNode(TYP_INT, op1, /* isUnsigned */ true, simdBaseType); + op1 = gtNewCastNode(TYP_INT, op1, /* isUnsigned */ true, TYP_INT); - GenTree* zero = gtNewSimdHWIntrinsicNode(retType, NI_Vector128_get_Zero, simdBaseJitType, simdSize); + GenTree* zero = gtNewSimdZeroNode(simdType, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ false); ssize_t index = 8 / genTypeSize(simdBaseType); op2 = gtNewSimdHWIntrinsicNode(simdType, op2, zero, gtNewIconNode(index), NI_AdvSimd_ExtractVector128, @@ -806,7 +802,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, /* isSimdAsHWIntrinsic */ false); op2 = gtNewSimdHWIntrinsicNode(simdBaseType, op2, NI_Vector64_ToScalar, simdBaseJitType, 8, /* isSimdAsHWIntrinsic */ false); - op2 = gtNewCastNode(TYP_INT, op2, /* isUnsigned */ true, simdBaseType); + op2 = gtNewCastNode(TYP_INT, op2, /* isUnsigned */ true, TYP_INT); op2 = gtNewOperNode(GT_LSH, TYP_INT, op2, gtNewIconNode(8)); retNode = gtNewOperNode(GT_OR, TYP_INT, op1, op2); @@ -815,8 +811,20 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, { if (!varTypeIsLong(simdBaseType)) { - op1 = gtNewSimdHWIntrinsicNode(TYP_SIMD8, op1, NI_AdvSimd_Arm64_AddAcross, simdBaseJitType, - simdSize, /* isSimdAsHWIntrinsic */ false); + if ((simdSize == 8) && ((simdBaseType == TYP_INT) || (simdBaseType == TYP_UINT))) + { + CORINFO_CLASS_HANDLE simdClsHnd = gtGetStructHandleForSIMD(simdType, simdBaseJitType); + + op1 = impCloneExpr(op1, &op2, simdClsHnd, (unsigned)CHECK_SPILL_ALL, + 
nullptr DEBUGARG("Clone op1 for vector extractmostsignificantbits")); + op1 = gtNewSimdHWIntrinsicNode(TYP_SIMD8, op1, op2, NI_AdvSimd_AddPairwise, simdBaseJitType, + simdSize, /* isSimdAsHWIntrinsic */ false); + } + else + { + op1 = gtNewSimdHWIntrinsicNode(TYP_SIMD8, op1, NI_AdvSimd_Arm64_AddAcross, simdBaseJitType, + simdSize, /* isSimdAsHWIntrinsic */ false); + } } else if (simdSize == 16) { @@ -829,7 +837,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, if ((simdBaseType != TYP_INT) && (simdBaseType != TYP_UINT)) { - retNode = gtNewCastNode(TYP_INT, retNode, /* isUnsigned */ true, simdBaseType); + retNode = gtNewCastNode(TYP_INT, retNode, /* isUnsigned */ true, TYP_INT); } } break; @@ -1398,9 +1406,10 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, case NI_Vector128_Store: { assert(sig->numArgs == 2); + var_types simdType = getSIMDTypeForSize(simdSize); op2 = impPopStack().val; - op1 = impSIMDPopStack(retType); + op1 = impSIMDPopStack(simdType); retNode = gtNewSimdHWIntrinsicNode(retType, op2, op1, NI_AdvSimd_Store, simdBaseJitType, simdSize); break; @@ -1410,6 +1419,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, case NI_Vector128_StoreAligned: { assert(sig->numArgs == 2); + var_types simdType = getSIMDTypeForSize(simdSize); if (!opts.MinOpts()) { @@ -1419,7 +1429,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, } op2 = impPopStack().val; - op1 = impSIMDPopStack(retType); + op1 = impSIMDPopStack(simdType); retNode = gtNewSimdHWIntrinsicNode(retType, op2, op1, NI_AdvSimd_Store, simdBaseJitType, simdSize); break; @@ -1429,6 +1439,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, case NI_Vector128_StoreAlignedNonTemporal: { assert(sig->numArgs == 2); + var_types simdType = getSIMDTypeForSize(simdSize); if (!opts.MinOpts()) { @@ -1438,7 +1449,7 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, } op2 = impPopStack().val; - op1 = 
impSIMDPopStack(retType); + op1 = impSIMDPopStack(simdType); // ARM64 has non-temporal stores (STNP) but we don't currently support them @@ -1449,6 +1460,8 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, case NI_Vector64_StoreUnsafe: case NI_Vector128_StoreUnsafe: { + var_types simdType = getSIMDTypeForSize(simdSize); + if (sig->numArgs == 3) { op3 = impPopStack().val; @@ -1459,13 +1472,13 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, } op2 = impPopStack().val; - op1 = impSIMDPopStack(retType); + op1 = impSIMDPopStack(simdType); if (sig->numArgs == 3) { - op3 = gtNewIconNode(genTypeSize(simdBaseType), op2->TypeGet()); - op2 = gtNewOperNode(GT_MUL, op2->TypeGet(), op2, op3); - op2 = gtNewOperNode(GT_ADD, op1->TypeGet(), op1, op2); + op4 = gtNewIconNode(genTypeSize(simdBaseType), op3->TypeGet()); + op3 = gtNewOperNode(GT_MUL, op3->TypeGet(), op3, op4); + op2 = gtNewOperNode(GT_ADD, op2->TypeGet(), op2, op3); } retNode = gtNewSimdHWIntrinsicNode(retType, op2, op1, NI_AdvSimd_Store, simdBaseJitType, simdSize); @@ -1476,8 +1489,9 @@ GenTree* Compiler::impSpecialIntrinsic(NamedIntrinsic intrinsic, case NI_Vector128_Sum: { assert(sig->numArgs == 1); + var_types simdType = getSIMDTypeForSize(simdSize); - op1 = impSIMDPopStack(retType); + op1 = impSIMDPopStack(simdType); retNode = gtNewSimdSumNode(retType, op1, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ false); break; } diff --git a/src/coreclr/jit/hwintrinsiclistarm64.h b/src/coreclr/jit/hwintrinsiclistarm64.h index e119a38..7d4582d 100644 --- a/src/coreclr/jit/hwintrinsiclistarm64.h +++ b/src/coreclr/jit/hwintrinsiclistarm64.h @@ -42,28 +42,28 @@ HARDWARE_INTRINSIC(Vector64, ConvertToUInt64, HARDWARE_INTRINSIC(Vector64, Create, 8, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_mov, INS_mov, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoCodeGen|HW_Flag_SpecialImport) HARDWARE_INTRINSIC(Vector64, CreateScalarUnsafe, 8, 1, 
{INS_ins, INS_ins, INS_ins, INS_ins, INS_ins, INS_ins, INS_invalid, INS_invalid, INS_fmov, INS_invalid}, HW_Category_SIMD, HW_Flag_SpecialCodeGen|HW_Flag_SupportsContainment) HARDWARE_INTRINSIC(Vector64, Divide, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector64, Dot, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector64, Dot, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector64, Equals, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector64, EqualsAll, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector64, EqualsAny, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector64, ExtractMostSignificantBits, 8, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector64, EqualsAll, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, 
INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector64, EqualsAny, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector64, ExtractMostSignificantBits, 8, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector64, Floor, 8, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector64, get_AllBitsSet, 8, 0, {INS_mvni, INS_mvni, INS_mvni, INS_mvni, INS_mvni, INS_mvni, INS_mvni, INS_mvni, INS_mvni, INS_mvni}, HW_Category_Helper, HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport) HARDWARE_INTRINSIC(Vector64, get_Count, 8, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoCodeGen|HW_Flag_SpecialImport) HARDWARE_INTRINSIC(Vector64, get_Zero, 8, 0, {INS_movi, INS_movi, INS_movi, INS_movi, INS_movi, INS_movi, INS_movi, INS_movi, INS_movi, INS_movi}, HW_Category_Helper, HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport) HARDWARE_INTRINSIC(Vector64, GetElement, 8, 2, {INS_smov, INS_umov, INS_smov, INS_umov, INS_smov, INS_umov, INS_umov, INS_umov, INS_dup, INS_dup}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector64, GreaterThan, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, 
INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector64, GreaterThanAll, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector64, GreaterThanAny, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector64, GreaterThanAll, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector64, GreaterThanAny, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector64, GreaterThanOrEqual, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector64, GreaterThanOrEqualAll, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector64, GreaterThanOrEqualAny, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector64, GreaterThanOrEqualAll, 8, 2, {INS_invalid, INS_invalid, INS_invalid, 
INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector64, GreaterThanOrEqualAny, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector64, LessThan, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector64, LessThanAll, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector64, LessThanAny, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector64, LessThanAll, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector64, LessThanAny, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector64, LessThanOrEqual, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) 
-HARDWARE_INTRINSIC(Vector64, LessThanOrEqualAll, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector64, LessThanOrEqualAny, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector64, LessThanOrEqualAll, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector64, LessThanOrEqualAny, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector64, Load, 8, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector64, LoadAligned, 8, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector64, LoadAlignedNonTemporal, 8, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) @@ -90,12 +90,12 @@ HARDWARE_INTRINSIC(Vector64, ShiftLeft, HARDWARE_INTRINSIC(Vector64, ShiftRightArithmetic, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, 
INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector64, ShiftRightLogical, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector64, Sqrt, 8, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector64, Store, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector64, StoreAligned, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector64, StoreAlignedNonTemporal, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector64, StoreUnsafe, 8, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector64, Store, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector64, StoreAligned, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, 
INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector64, StoreAlignedNonTemporal, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector64, StoreUnsafe, 8, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector64, Subtract, 8, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector64, Sum, 8, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector64, Sum, 8, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector64, ToScalar, 8, 1, {INS_smov, INS_umov, INS_smov, INS_umov, INS_smov, INS_umov, INS_umov, INS_umov, INS_dup, INS_dup}, HW_Category_SIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SIMDScalar|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(Vector64, ToVector128, 8, 1, {INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov}, HW_Category_SIMD, HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(Vector64, ToVector128Unsafe, 8, 1, {INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov}, HW_Category_SIMD, HW_Flag_SpecialCodeGen) 
@@ -140,11 +140,11 @@ HARDWARE_INTRINSIC(Vector128, ConvertToUInt64, HARDWARE_INTRINSIC(Vector128, Create, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoCodeGen|HW_Flag_SpecialImport) HARDWARE_INTRINSIC(Vector128, CreateScalarUnsafe, 16, 1, {INS_ins, INS_ins, INS_ins, INS_ins, INS_ins, INS_ins, INS_ins, INS_ins, INS_fmov, INS_fmov}, HW_Category_SIMD, HW_Flag_SpecialCodeGen|HW_Flag_SupportsContainment) HARDWARE_INTRINSIC(Vector128, Divide, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector128, Dot, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector128, Dot, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, Equals, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector128, EqualsAll, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector128, EqualsAny, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) 
-HARDWARE_INTRINSIC(Vector128, ExtractMostSignificantBits, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector128, EqualsAll, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector128, EqualsAny, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector128, ExtractMostSignificantBits, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, Floor, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, get_AllBitsSet, 16, 0, {INS_mvni, INS_mvni, INS_mvni, INS_mvni, INS_mvni, INS_mvni, INS_mvni, INS_mvni, INS_mvni, INS_mvni}, HW_Category_Helper, HW_Flag_SpecialCodeGen|HW_Flag_SpecialImport) HARDWARE_INTRINSIC(Vector128, get_Count, 16, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_NoCodeGen|HW_Flag_SpecialImport) @@ -153,17 +153,17 @@ HARDWARE_INTRINSIC(Vector128, GetElement, HARDWARE_INTRINSIC(Vector128, GetLower, 16, 1, {INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov, INS_mov}, 
HW_Category_SIMD, HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(Vector128, GetUpper, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SpecialImport) HARDWARE_INTRINSIC(Vector128, GreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector128, GreaterThanAll, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector128, GreaterThanAny, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector128, GreaterThanAll, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector128, GreaterThanAny, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, GreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector128, GreaterThanOrEqualAll, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, 
INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector128, GreaterThanOrEqualAny, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector128, GreaterThanOrEqualAll, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector128, GreaterThanOrEqualAny, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, LessThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector128, LessThanAll, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector128, LessThanAny, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector128, LessThanAll, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector128, LessThanAny, 16, 2, {INS_invalid, INS_invalid, INS_invalid, 
INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, LessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector128, LessThanOrEqualAll, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector128, LessThanOrEqualAny, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector128, LessThanOrEqualAll, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector128, LessThanOrEqualAny, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, Load, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, LoadAligned, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, 
LoadAlignedNonTemporal, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) @@ -190,12 +190,12 @@ HARDWARE_INTRINSIC(Vector128, ShiftLeft, HARDWARE_INTRINSIC(Vector128, ShiftRightArithmetic, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, ShiftRightLogical, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, Sqrt, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector128, Store, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector128, StoreAligned, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector128, StoreAlignedNonTemporal, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector128, StoreUnsafe, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, 
HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector128, Store, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector128, StoreAligned, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector128, StoreAlignedNonTemporal, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector128, StoreUnsafe, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, Subtract, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector128, Sum, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector128, Sum, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, ToScalar, 16, 1, {INS_smov, INS_umov, INS_smov, INS_umov, INS_smov, INS_umov, INS_umov, 
INS_umov, INS_dup, INS_dup}, HW_Category_SIMD, HW_Flag_BaseTypeFromFirstArg|HW_Flag_SIMDScalar|HW_Flag_SpecialCodeGen) HARDWARE_INTRINSIC(Vector128, WidenLower, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector128, WidenUpper, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) diff --git a/src/coreclr/jit/hwintrinsiclistxarch.h b/src/coreclr/jit/hwintrinsiclistxarch.h index b0571e1..7c80de3 100644 --- a/src/coreclr/jit/hwintrinsiclistxarch.h +++ b/src/coreclr/jit/hwintrinsiclistxarch.h @@ -59,28 +59,28 @@ HARDWARE_INTRINSIC(Vector128, ConvertToUInt64, HARDWARE_INTRINSIC(Vector128, Create, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, CreateScalarUnsafe, 16, 1, {INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movss, INS_movsdsse2}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector128, Divide, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector128, Dot, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector128, Dot, 16, 2, {INS_invalid, INS_invalid, INS_invalid, 
INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, Equals, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector128, EqualsAll, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector128, EqualsAny, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector128, ExtractMostSignificantBits, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector128, EqualsAll, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector128, EqualsAny, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector128, ExtractMostSignificantBits, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) 
HARDWARE_INTRINSIC(Vector128, Floor, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, get_AllBitsSet, 16, 0, {INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_cmpps, INS_cmpps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector128, get_Count, 16, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, get_Zero, 16, 0, {INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps, INS_xorps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector128, GetElement, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector128, GreaterThan, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector128, GreaterThanAll, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector128, GreaterThanAny, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, 
HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector128, GreaterThanAll, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector128, GreaterThanAny, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, GreaterThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector128, GreaterThanOrEqualAll, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector128, GreaterThanOrEqualAny, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector128, GreaterThanOrEqualAll, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector128, GreaterThanOrEqualAny, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, LessThan, 16, 2, {INS_invalid, INS_invalid, 
INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector128, LessThanAll, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector128, LessThanAny, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector128, LessThanAll, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector128, LessThanAny, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, LessThanOrEqual, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector128, LessThanOrEqualAll, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector128, LessThanOrEqualAny, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector128, LessThanOrEqualAll, 
16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector128, LessThanOrEqualAny, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, Load, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, LoadAligned, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, LoadAlignedNonTemporal, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) @@ -107,12 +107,12 @@ HARDWARE_INTRINSIC(Vector128, ShiftLeft, HARDWARE_INTRINSIC(Vector128, ShiftRightArithmetic, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, ShiftRightLogical, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, Sqrt, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, 
HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector128, Store, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector128, StoreAligned, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector128, StoreAlignedNonTemporal, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector128, StoreUnsafe, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector128, Store, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector128, StoreAligned, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector128, StoreAlignedNonTemporal, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector128, StoreUnsafe, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, 
INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, Subtract, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector128, Sum, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector128, Sum, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector128, ToScalar, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_movsdsse2}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector128, ToVector256, 16, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector128, ToVector256Unsafe, 16, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) @@ -155,11 +155,11 @@ HARDWARE_INTRINSIC(Vector256, ConvertToUInt64, HARDWARE_INTRINSIC(Vector256, Create, 32, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, 
INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector256, CreateScalarUnsafe, 32, 1, {INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movd, INS_movss, INS_movsdsse2}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector256, Divide, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector256, Dot, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector256, Dot, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector256, Equals, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector256, EqualsAll, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector256, EqualsAny, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector256, ExtractMostSignificantBits, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, 
INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector256, EqualsAll, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector256, EqualsAny, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector256, ExtractMostSignificantBits, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector256, Floor, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector256, get_AllBitsSet, 32, 0, {INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_pcmpeqd, INS_cmpps, INS_cmpps}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector256, get_Count, 32, 0, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) @@ -167,17 +167,17 @@ HARDWARE_INTRINSIC(Vector256, get_Zero, HARDWARE_INTRINSIC(Vector256, GetElement, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg) 
HARDWARE_INTRINSIC(Vector256, GetLower, 32, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector256, GreaterThan, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector256, GreaterThanAll, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector256, GreaterThanAny, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector256, GreaterThanAll, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector256, GreaterThanAny, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector256, GreaterThanOrEqual, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector256, GreaterThanOrEqualAll, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, 
INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector256, GreaterThanOrEqualAny, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector256, GreaterThanOrEqualAll, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector256, GreaterThanOrEqualAny, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector256, LessThan, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector256, LessThanAll, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector256, LessThanAny, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector256, LessThanAll, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector256, LessThanAny, 32, 2, {INS_invalid, INS_invalid, INS_invalid, 
INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector256, LessThanOrEqual, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector256, LessThanOrEqualAll, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector256, LessThanOrEqualAny, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector256, LessThanOrEqualAll, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector256, LessThanOrEqualAny, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector256, Load, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector256, LoadAligned, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector256, 
LoadAlignedNonTemporal, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) @@ -204,12 +204,12 @@ HARDWARE_INTRINSIC(Vector256, ShiftLeft, HARDWARE_INTRINSIC(Vector256, ShiftRightArithmetic, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector256, ShiftRightLogical, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector256, Sqrt, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector256, Store, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector256, StoreAligned, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector256, StoreAlignedNonTemporal, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector256, StoreUnsafe, 32, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, 
HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector256, Store, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector256, StoreAligned, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector256, StoreAlignedNonTemporal, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector256, StoreUnsafe, 32, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector256, Subtract, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) -HARDWARE_INTRINSIC(Vector256, Sum, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen) +HARDWARE_INTRINSIC(Vector256, Sum, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen) HARDWARE_INTRINSIC(Vector256, ToScalar, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, 
INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_movsdsse2}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics) HARDWARE_INTRINSIC(Vector256, WidenLower, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) HARDWARE_INTRINSIC(Vector256, WidenUpper, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg) diff --git a/src/coreclr/jit/hwintrinsicxarch.cpp b/src/coreclr/jit/hwintrinsicxarch.cpp index e6f4995..f1b8482 100644 --- a/src/coreclr/jit/hwintrinsicxarch.cpp +++ b/src/coreclr/jit/hwintrinsicxarch.cpp @@ -535,6 +535,7 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic, GenTree* op1 = nullptr; GenTree* op2 = nullptr; GenTree* op3 = nullptr; + GenTree* op4 = nullptr; if (!featureSIMD || !IsBaselineSimdIsaSupported()) { @@ -947,6 +948,7 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic, case NI_Vector256_Dot: { assert(sig->numArgs == 2); + var_types simdType = getSIMDTypeForSize(simdSize); if (varTypeIsByte(simdBaseType) || varTypeIsLong(simdBaseType)) { @@ -973,8 +975,8 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic, } } - op2 = impSIMDPopStack(retType); - op1 = impSIMDPopStack(retType); + op2 = impSIMDPopStack(simdType); + op1 = impSIMDPopStack(simdType); retNode = gtNewSimdDotProdNode(retType, op1, op2, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ false); @@ -1044,24 +1046,137 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic, { var_types simdType = getSIMDTypeForSize(simdSize); - op1 = impSIMDPopStack(simdType); - NamedIntrinsic moveMaskIntrinsic = NI_Illegal; + NamedIntrinsic 
shuffleIntrinsic = NI_Illegal; + NamedIntrinsic createIntrinsic = NI_Illegal; - if (simdBaseType == TYP_FLOAT) - { - moveMaskIntrinsic = (simdSize == 32) ? NI_AVX_MoveMask : NI_SSE_MoveMask; - } - else if (simdBaseType == TYP_DOUBLE) - { - moveMaskIntrinsic = (simdSize == 32) ? NI_AVX_MoveMask : NI_SSE2_MoveMask; - } - else + switch (simdBaseType) { - moveMaskIntrinsic = (simdSize == 32) ? NI_AVX2_MoveMask : NI_SSE2_MoveMask; - simdBaseJitType = varTypeIsUnsigned(simdBaseType) ? CORINFO_TYPE_UBYTE : CORINFO_TYPE_BYTE; + case TYP_BYTE: + case TYP_UBYTE: + { + op1 = impSIMDPopStack(simdType); + moveMaskIntrinsic = (simdSize == 32) ? NI_AVX2_MoveMask : NI_SSE2_MoveMask; + break; + } + + case TYP_SHORT: + case TYP_USHORT: + { + simdBaseJitType = varTypeIsUnsigned(simdBaseType) ? CORINFO_TYPE_UBYTE : CORINFO_TYPE_BYTE; + + assert((simdSize == 16) || (simdSize == 32)); + IntrinsicNodeBuilder nodeBuilder(getAllocator(CMK_ASTNode), simdSize); + + // We want to tightly pack the most significant byte of each short/ushort + // and then zero the tightly packed least significant bytes + + nodeBuilder.AddOperand(0x00, gtNewIconNode(0x01)); + nodeBuilder.AddOperand(0x01, gtNewIconNode(0x03)); + nodeBuilder.AddOperand(0x02, gtNewIconNode(0x05)); + nodeBuilder.AddOperand(0x03, gtNewIconNode(0x07)); + nodeBuilder.AddOperand(0x04, gtNewIconNode(0x09)); + nodeBuilder.AddOperand(0x05, gtNewIconNode(0x0B)); + nodeBuilder.AddOperand(0x06, gtNewIconNode(0x0D)); + nodeBuilder.AddOperand(0x07, gtNewIconNode(0x0F)); + + for (unsigned i = 0x08; i < 0x10; i++) + { + // The most significant bit being set means zero the value + nodeBuilder.AddOperand(i, gtNewIconNode(0x80)); + } + + if (simdSize == 32) + { + // Vector256 works on 2x128-bit lanes, so repeat the same indices for the upper lane + + nodeBuilder.AddOperand(0x10, gtNewIconNode(0x01)); + nodeBuilder.AddOperand(0x11, gtNewIconNode(0x03)); + nodeBuilder.AddOperand(0x12, gtNewIconNode(0x05)); + nodeBuilder.AddOperand(0x13, 
gtNewIconNode(0x07)); + nodeBuilder.AddOperand(0x14, gtNewIconNode(0x09)); + nodeBuilder.AddOperand(0x15, gtNewIconNode(0x0B)); + nodeBuilder.AddOperand(0x16, gtNewIconNode(0x0D)); + nodeBuilder.AddOperand(0x17, gtNewIconNode(0x0F)); + + for (unsigned i = 0x18; i < 0x20; i++) + { + // The most significant bit being set means zero the value + nodeBuilder.AddOperand(i, gtNewIconNode(0x80)); + } + + createIntrinsic = NI_Vector256_Create; + shuffleIntrinsic = NI_AVX2_Shuffle; + moveMaskIntrinsic = NI_AVX2_MoveMask; + } + else if (compOpportunisticallyDependsOn(InstructionSet_SSSE3)) + { + createIntrinsic = NI_Vector128_Create; + shuffleIntrinsic = NI_SSSE3_Shuffle; + moveMaskIntrinsic = NI_SSE2_MoveMask; + } + else + { + return nullptr; + } + + op2 = gtNewSimdHWIntrinsicNode(simdType, std::move(nodeBuilder), createIntrinsic, + simdBaseJitType, simdSize); + + op1 = impSIMDPopStack(simdType); + op1 = gtNewSimdHWIntrinsicNode(simdType, op1, op2, shuffleIntrinsic, simdBaseJitType, simdSize); + + if (simdSize == 32) + { + CorInfoType simdOtherJitType; + + // Since Vector256 is 2x128-bit lanes we need a full width permutation so we get the lower + // 64-bits of each lane next to each other. The upper bits should be zero, but also don't + // matter so we can also then simplify down to a 128-bit move mask. + + simdOtherJitType = (simdBaseType == TYP_USHORT) ? CORINFO_TYPE_ULONG : CORINFO_TYPE_LONG; + + op1 = gtNewSimdHWIntrinsicNode(simdType, op1, gtNewIconNode(0xD8), NI_AVX2_Permute4x64, + simdOtherJitType, simdSize); + + simdSize = 16; + simdType = TYP_SIMD16; + + op1 = gtNewSimdHWIntrinsicNode(simdType, op1, NI_Vector256_GetLower, simdBaseJitType, + simdSize); + } + break; + } + + case TYP_INT: + case TYP_UINT: + case TYP_FLOAT: + { + simdBaseJitType = CORINFO_TYPE_FLOAT; + op1 = impSIMDPopStack(simdType); + moveMaskIntrinsic = (simdSize == 32) ?
NI_AVX_MoveMask : NI_SSE_MoveMask; + break; + } + + case TYP_LONG: + case TYP_ULONG: + case TYP_DOUBLE: + { + simdBaseJitType = CORINFO_TYPE_DOUBLE; + op1 = impSIMDPopStack(simdType); + moveMaskIntrinsic = (simdSize == 32) ? NI_AVX_MoveMask : NI_SSE2_MoveMask; + break; + } + + default: + { + unreached(); + } } + assert(moveMaskIntrinsic != NI_Illegal); + assert(op1 != nullptr); + retNode = gtNewSimdHWIntrinsicNode(retType, op1, moveMaskIntrinsic, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ false); } @@ -1780,9 +1895,10 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic, case NI_Vector256_Store: { assert(sig->numArgs == 2); + var_types simdType = getSIMDTypeForSize(simdSize); op2 = impPopStack().val; - op1 = impSIMDPopStack(retType); + op1 = impSIMDPopStack(simdType); NamedIntrinsic storeIntrinsic = NI_Illegal; @@ -1807,9 +1923,10 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic, case NI_Vector256_StoreAligned: { assert(sig->numArgs == 2); + var_types simdType = getSIMDTypeForSize(simdSize); op2 = impPopStack().val; - op1 = impSIMDPopStack(retType); + op1 = impSIMDPopStack(simdType); NamedIntrinsic storeIntrinsic = NI_Illegal; @@ -1834,9 +1951,10 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic, case NI_Vector256_StoreAlignedNonTemporal: { assert(sig->numArgs == 2); + var_types simdType = getSIMDTypeForSize(simdSize); op2 = impPopStack().val; - op1 = impSIMDPopStack(retType); + op1 = impSIMDPopStack(simdType); NamedIntrinsic storeIntrinsic = NI_Illegal; @@ -1860,6 +1978,8 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic, case NI_Vector128_StoreUnsafe: case NI_Vector256_StoreUnsafe: { + var_types simdType = getSIMDTypeForSize(simdSize); + if (sig->numArgs == 3) { op3 = impPopStack().val; @@ -1870,13 +1990,13 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic, } op2 = impPopStack().val; - op1 = impSIMDPopStack(retType); + op1 = impSIMDPopStack(simdType); if (sig->numArgs == 3) { - op3 = 
gtNewIconNode(genTypeSize(simdBaseType), op2->TypeGet()); - op2 = gtNewOperNode(GT_MUL, op2->TypeGet(), op2, op3); - op2 = gtNewOperNode(GT_ADD, op1->TypeGet(), op1, op2); + op4 = gtNewIconNode(genTypeSize(simdBaseType), op3->TypeGet()); + op3 = gtNewOperNode(GT_MUL, op3->TypeGet(), op3, op4); + op2 = gtNewOperNode(GT_ADD, op2->TypeGet(), op2, op3); } NamedIntrinsic storeIntrinsic = NI_Illegal; @@ -1902,6 +2022,7 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic, case NI_Vector256_Sum: { assert(sig->numArgs == 1); + var_types simdType = getSIMDTypeForSize(simdSize); if (varTypeIsFloating(simdBaseType)) { @@ -1922,7 +2043,7 @@ GenTree* Compiler::impBaseIntrinsic(NamedIntrinsic intrinsic, break; } - op1 = impSIMDPopStack(retType); + op1 = impSIMDPopStack(simdType); retNode = gtNewSimdSumNode(retType, op1, simdBaseJitType, simdSize, /* isSimdAsHWIntrinsic */ false); break; } diff --git a/src/coreclr/jit/lowerarmarch.cpp b/src/coreclr/jit/lowerarmarch.cpp index 192c4ab..e1812d7 100644 --- a/src/coreclr/jit/lowerarmarch.cpp +++ b/src/coreclr/jit/lowerarmarch.cpp @@ -874,8 +874,8 @@ void Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cmpOp) GenTree* insCns = comp->gtNewIconNode(-1, TYP_INT); BlockRange().InsertAfter(idxCns, insCns); - GenTree* tmp = comp->gtNewSimdAsHWIntrinsicNode(simdType, cmp, idxCns, insCns, NI_AdvSimd_Insert, - CORINFO_TYPE_INT, simdSize); + GenTree* tmp = comp->gtNewSimdHWIntrinsicNode(simdType, cmp, idxCns, insCns, NI_AdvSimd_Insert, + CORINFO_TYPE_INT, simdSize); BlockRange().InsertAfter(insCns, tmp); LowerNode(tmp); @@ -891,7 +891,7 @@ void Lowering::LowerHWIntrinsicCmpOp(GenTreeHWIntrinsic* node, genTreeOps cmpOp) BlockRange().InsertAfter(msk, zroCns); GenTree* val = - comp->gtNewSimdAsHWIntrinsicNode(TYP_UBYTE, msk, zroCns, NI_AdvSimd_Extract, CORINFO_TYPE_UBYTE, simdSize); + comp->gtNewSimdHWIntrinsicNode(TYP_UBYTE, msk, zroCns, NI_AdvSimd_Extract, CORINFO_TYPE_UBYTE, simdSize); 
BlockRange().InsertAfter(zroCns, val); LowerNode(val); @@ -1169,7 +1169,7 @@ void Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node) BlockRange().InsertAfter(idx, tmp1); LowerNode(tmp1); - op1 = comp->gtNewSimdAsHWIntrinsicNode(simdType, op1, idx, tmp1, NI_AdvSimd_Insert, simdBaseJitType, simdSize); + op1 = comp->gtNewSimdHWIntrinsicNode(simdType, op1, idx, tmp1, NI_AdvSimd_Insert, simdBaseJitType, simdSize); BlockRange().InsertAfter(tmp1, op1); LowerNode(op1); @@ -1180,7 +1180,7 @@ void Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node) BlockRange().InsertAfter(idx, tmp2); LowerNode(tmp2); - op2 = comp->gtNewSimdAsHWIntrinsicNode(simdType, op2, idx, tmp2, NI_AdvSimd_Insert, simdBaseJitType, simdSize); + op2 = comp->gtNewSimdHWIntrinsicNode(simdType, op2, idx, tmp2, NI_AdvSimd_Insert, simdBaseJitType, simdSize); BlockRange().InsertAfter(tmp2, op2); LowerNode(op2); } @@ -1205,32 +1205,35 @@ void Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node) } assert(!varTypeIsLong(simdBaseType)); - tmp1 = comp->gtNewSimdAsHWIntrinsicNode(simdType, op1, op2, multiply, simdBaseJitType, simdSize); + tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, op1, op2, multiply, simdBaseJitType, simdSize); BlockRange().InsertBefore(node, tmp1); LowerNode(tmp1); if (varTypeIsFloating(simdBaseType)) { - // We will be constructing the following parts: - // ... - // /--* tmp1 simd16 - // * STORE_LCL_VAR simd16 - // tmp1 = LCL_VAR simd16 - // tmp2 = LCL_VAR simd16 - // ... + if ((simdSize != 8) || (simdBaseType == TYP_FLOAT)) + { + // We will be constructing the following parts: + // ... + // /--* tmp1 simd16 + // * STORE_LCL_VAR simd16 + // tmp1 = LCL_VAR simd16 + // tmp2 = LCL_VAR simd16 + // ... - // This is roughly the following managed code: - // ... - // var tmp2 = tmp1; - // ... + // This is roughly the following managed code: + // ... + // var tmp2 = tmp1; + // ... 
- node->Op(1) = tmp1; - LIR::Use tmp1Use(BlockRange(), &node->Op(1), node); - ReplaceWithLclVar(tmp1Use); - tmp1 = node->Op(1); + node->Op(1) = tmp1; + LIR::Use tmp1Use(BlockRange(), &node->Op(1), node); + ReplaceWithLclVar(tmp1Use); + tmp1 = node->Op(1); - tmp2 = comp->gtClone(tmp1); - BlockRange().InsertAfter(tmp1, tmp2); + tmp2 = comp->gtClone(tmp1); + BlockRange().InsertAfter(tmp1, tmp2); + } if (simdSize == 8) { @@ -1248,8 +1251,8 @@ void Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node) // var tmp1 = AdvSimd.AddPairwise(tmp1, tmp2); // ... - tmp1 = comp->gtNewSimdAsHWIntrinsicNode(simdType, tmp1, tmp2, NI_AdvSimd_AddPairwise, simdBaseJitType, - simdSize); + tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, tmp1, tmp2, NI_AdvSimd_AddPairwise, simdBaseJitType, + simdSize); BlockRange().InsertAfter(tmp2, tmp1); LowerNode(tmp1); } @@ -1274,8 +1277,8 @@ void Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node) // var tmp1 = AdvSimd.Arm64.AddPairwise(tmp1, tmp2); // ... - tmp1 = comp->gtNewSimdAsHWIntrinsicNode(simdType, tmp1, tmp2, NI_AdvSimd_Arm64_AddPairwise, simdBaseJitType, - simdSize); + tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, tmp1, tmp2, NI_AdvSimd_Arm64_AddPairwise, simdBaseJitType, + simdSize); BlockRange().InsertAfter(tmp2, tmp1); LowerNode(tmp1); @@ -1313,8 +1316,8 @@ void Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node) tmp2 = comp->gtClone(tmp1); BlockRange().InsertAfter(tmp1, tmp2); - tmp1 = comp->gtNewSimdAsHWIntrinsicNode(simdType, tmp1, tmp2, NI_AdvSimd_Arm64_AddPairwise, - simdBaseJitType, simdSize); + tmp1 = comp->gtNewSimdHWIntrinsicNode(simdType, tmp1, tmp2, NI_AdvSimd_Arm64_AddPairwise, + simdBaseJitType, simdSize); BlockRange().InsertAfter(tmp2, tmp1); LowerNode(tmp1); } @@ -1326,20 +1329,66 @@ void Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node) { assert(varTypeIsIntegral(simdBaseType)); - // We will be constructing the following parts: - // ... 
- // /--* tmp1 simd16 - // tmp2 = * HWINTRINSIC simd16 T AddAcross - // ... + if ((simdSize == 8) && ((simdBaseType == TYP_INT) || (simdBaseType == TYP_UINT))) + { + // We will be constructing the following parts: + // ... + // /--* tmp1 simd16 + // * STORE_LCL_VAR simd16 + // tmp1 = LCL_VAR simd16 + // tmp2 = LCL_VAR simd16 + // ... - // This is roughly the following managed code: - // ... - // var tmp2 = AdvSimd.Arm64.AddAcross(tmp1); - // ... + // This is roughly the following managed code: + // ... + // var tmp2 = tmp1; + // ... - tmp2 = comp->gtNewSimdAsHWIntrinsicNode(simdType, tmp1, NI_AdvSimd_Arm64_AddAcross, simdBaseJitType, simdSize); - BlockRange().InsertAfter(tmp1, tmp2); - LowerNode(tmp2); + node->Op(1) = tmp1; + LIR::Use tmp1Use(BlockRange(), &node->Op(1), node); + ReplaceWithLclVar(tmp1Use); + tmp1 = node->Op(1); + + tmp2 = comp->gtClone(tmp1); + BlockRange().InsertAfter(tmp1, tmp2); + + // We will be constructing the following parts: + // ... + // /--* tmp1 simd16 + // /--* tmp2 simd16 + // tmp2 = * HWINTRINSIC simd8 T AddPairwise + // ... + + // This is roughly the following managed code: + // ... + // var tmp2 = AdvSimd.AddPairwise(tmp1, tmp2); + // ... + + tmp1 = + comp->gtNewSimdHWIntrinsicNode(simdType, tmp1, tmp2, NI_AdvSimd_AddPairwise, simdBaseJitType, simdSize); + BlockRange().InsertAfter(tmp2, tmp1); + LowerNode(tmp1); + + tmp2 = tmp1; + } + else + { + // We will be constructing the following parts: + // ... + // /--* tmp1 simd16 + // tmp2 = * HWINTRINSIC simd16 T AddAcross + // ... + + // This is roughly the following managed code: + // ... + // var tmp2 = AdvSimd.Arm64.AddAcross(tmp1); + // ... 
+ + tmp2 = + comp->gtNewSimdHWIntrinsicNode(simdType, tmp1, NI_AdvSimd_Arm64_AddAcross, simdBaseJitType, simdSize); + BlockRange().InsertAfter(tmp1, tmp2); + LowerNode(tmp2); + } } // We will be constructing the following parts: diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp index 0d2acef..413e4ba 100644 --- a/src/coreclr/jit/lowerxarch.cpp +++ b/src/coreclr/jit/lowerxarch.cpp @@ -3047,8 +3047,6 @@ void Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node) if (simdSize == 32) { - assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX2)); - switch (simdBaseType) { case TYP_SHORT: @@ -3056,6 +3054,8 @@ void Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node) case TYP_INT: case TYP_UINT: { + assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX2)); + multiply = NI_AVX2_MultiplyLow; horizontalAdd = NI_AVX2_HorizontalAdd; add = NI_AVX2_Add; @@ -3064,6 +3064,8 @@ void Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node) case TYP_FLOAT: { + assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX)); + // We will be constructing the following parts: // idx = CNS_INT int 0xF1 // /--* op1 simd16 @@ -3127,6 +3129,8 @@ void Lowering::LowerHWIntrinsicDot(GenTreeHWIntrinsic* node) case TYP_DOUBLE: { + assert(comp->compIsaSupportedDebugOnly(InstructionSet_AVX)); + multiply = NI_AVX_Multiply; horizontalAdd = NI_AVX_HorizontalAdd; add = NI_AVX_Add; diff --git a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs index 6d82ccc..9dba4bc 100644 --- a/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs +++ b/src/libraries/System.Private.CoreLib/src/System/Runtime/Intrinsics/Vector256.cs @@ -2389,11 +2389,12 @@ namespace System.Runtime.Intrinsics { T result = default; - for (int index = 0; index < Vector256.Count; index++) - { - T value = Scalar.Multiply(left.GetElementUnsafe(index), 
right.GetElementUnsafe(index)); - result = Scalar.Add(result, value); - } + // Doing this as Dot(lower) + Dot(upper) is important for floating-point determinism + // This is because the underlying dpps instruction on x86/x64 will do this equivalently + // and otherwise the software vs accelerated implementations may differ in returned result. + + result = Scalar.Add(result, Vector128.Dot(left.GetLower(), right.GetLower())); + result = Scalar.Add(result, Vector128.Dot(left.GetUpper(), right.GetUpper())); return result; } diff --git a/src/tests/JIT/HardwareIntrinsics/General/Shared/VectorDotTest.template b/src/tests/JIT/HardwareIntrinsics/General/Shared/VectorDotTest.template index 1ce27af..4c1a118 100644 --- a/src/tests/JIT/HardwareIntrinsics/General/Shared/VectorDotTest.template +++ b/src/tests/JIT/HardwareIntrinsics/General/Shared/VectorDotTest.template @@ -292,12 +292,20 @@ namespace JIT.HardwareIntrinsics.General bool succeeded = true; {RetBaseType} actualResult = default; + {RetBaseType} intermResult = default; for (var i = 0; i < Op1ElementCount; i++) { - actualResult += ({RetBaseType})(left[i] * right[i]); + if ((i % Vector128<{Op1BaseType}>.Count) == 0) + { + actualResult += intermResult; + intermResult = default; + } + intermResult += ({RetBaseType})(left[i] * right[i]); } + actualResult += intermResult; + if (actualResult != result) { succeeded = false; diff --git a/src/tests/JIT/HardwareIntrinsics/General/Vector128/Dot.Byte.cs b/src/tests/JIT/HardwareIntrinsics/General/Vector128/Dot.Byte.cs index b47e148..1b5ea70 100644 --- a/src/tests/JIT/HardwareIntrinsics/General/Vector128/Dot.Byte.cs +++ b/src/tests/JIT/HardwareIntrinsics/General/Vector128/Dot.Byte.cs @@ -292,12 +292,20 @@ namespace JIT.HardwareIntrinsics.General bool succeeded = true; Byte actualResult = default; + Byte intermResult = default; for (var i = 0; i < Op1ElementCount; i++) { - actualResult += (Byte)(left[i] * right[i]); + if ((i % Vector128<Byte>.Count) == 0) + { + actualResult += intermResult;
+ intermResult = default; + } + intermResult += (Byte)(left[i] * right[i]); } + actualResult += intermResult; + if (actualResult != result) { succeeded = false; diff --git a/src/tests/JIT/HardwareIntrinsics/General/Vector128/Dot.Double.cs b/src/tests/JIT/HardwareIntrinsics/General/Vector128/Dot.Double.cs index 83c5af2..3536b17 100644 --- a/src/tests/JIT/HardwareIntrinsics/General/Vector128/Dot.Double.cs +++ b/src/tests/JIT/HardwareIntrinsics/General/Vector128/Dot.Double.cs @@ -292,12 +292,20 @@ namespace JIT.HardwareIntrinsics.General bool succeeded = true; Double actualResult = default; + Double intermResult = default; for (var i = 0; i < Op1ElementCount; i++) { - actualResult += (Double)(left[i] * right[i]); + if ((i % Vector128<Double>.Count) == 0) + { + actualResult += intermResult; + intermResult = default; + } + intermResult += (Double)(left[i] * right[i]); } + actualResult += intermResult; + if (actualResult != result) { succeeded = false; diff --git a/src/tests/JIT/HardwareIntrinsics/General/Vector128/Dot.Int16.cs b/src/tests/JIT/HardwareIntrinsics/General/Vector128/Dot.Int16.cs index 76fda4a..8130987 100644 --- a/src/tests/JIT/HardwareIntrinsics/General/Vector128/Dot.Int16.cs +++ b/src/tests/JIT/HardwareIntrinsics/General/Vector128/Dot.Int16.cs @@ -292,12 +292,20 @@ namespace JIT.HardwareIntrinsics.General bool succeeded = true; Int16 actualResult = default; + Int16 intermResult = default; for (var i = 0; i < Op1ElementCount; i++) { - actualResult += (Int16)(left[i] * right[i]); + if ((i % Vector128<Int16>.Count) == 0) + { + actualResult += intermResult; + intermResult = default; + } + intermResult += (Int16)(left[i] * right[i]); } + actualResult += intermResult; + if (actualResult != result) { succeeded = false; diff --git a/src/tests/JIT/HardwareIntrinsics/General/Vector128/Dot.Int32.cs b/src/tests/JIT/HardwareIntrinsics/General/Vector128/Dot.Int32.cs index 4c00908..8d43f64 100644 --- a/src/tests/JIT/HardwareIntrinsics/General/Vector128/Dot.Int32.cs +++
b/src/tests/JIT/HardwareIntrinsics/General/Vector128/Dot.Int32.cs @@ -292,12 +292,20 @@ namespace JIT.HardwareIntrinsics.General bool succeeded = true; Int32 actualResult = default; + Int32 intermResult = default; for (var i = 0; i < Op1ElementCount; i++) { - actualResult += (Int32)(left[i] * right[i]); + if ((i % Vector128<Int32>.Count) == 0) + { + actualResult += intermResult; + intermResult = default; + } + intermResult += (Int32)(left[i] * right[i]); } + actualResult += intermResult; + if (actualResult != result) { succeeded = false; diff --git a/src/tests/JIT/HardwareIntrinsics/General/Vector128/Dot.Int64.cs b/src/tests/JIT/HardwareIntrinsics/General/Vector128/Dot.Int64.cs index fb06bc4..58290b2 100644 --- a/src/tests/JIT/HardwareIntrinsics/General/Vector128/Dot.Int64.cs +++ b/src/tests/JIT/HardwareIntrinsics/General/Vector128/Dot.Int64.cs @@ -292,12 +292,20 @@ namespace JIT.HardwareIntrinsics.General bool succeeded = true; Int64 actualResult = default; + Int64 intermResult = default; for (var i = 0; i < Op1ElementCount; i++) { - actualResult += (Int64)(left[i] * right[i]); + if ((i % Vector128<Int64>.Count) == 0) + { + actualResult += intermResult; + intermResult = default; + } + intermResult += (Int64)(left[i] * right[i]); } + actualResult += intermResult; + if (actualResult != result) { succeeded = false; diff --git a/src/tests/JIT/HardwareIntrinsics/General/Vector128/Dot.SByte.cs b/src/tests/JIT/HardwareIntrinsics/General/Vector128/Dot.SByte.cs index f48ccd4..cff4519 100644 --- a/src/tests/JIT/HardwareIntrinsics/General/Vector128/Dot.SByte.cs +++ b/src/tests/JIT/HardwareIntrinsics/General/Vector128/Dot.SByte.cs @@ -292,12 +292,20 @@ namespace JIT.HardwareIntrinsics.General bool succeeded = true; SByte actualResult = default; + SByte intermResult = default; for (var i = 0; i < Op1ElementCount; i++) { - actualResult += (SByte)(left[i] * right[i]); + if ((i % Vector128<SByte>.Count) == 0) + { + actualResult += intermResult; + intermResult = default; + } + intermResult +=
(SByte)(left[i] * right[i]); } + actualResult += intermResult; + if (actualResult != result) { succeeded = false; diff --git a/src/tests/JIT/HardwareIntrinsics/General/Vector128/Dot.Single.cs b/src/tests/JIT/HardwareIntrinsics/General/Vector128/Dot.Single.cs index fa8cc2f..7d8da79 100644 --- a/src/tests/JIT/HardwareIntrinsics/General/Vector128/Dot.Single.cs +++ b/src/tests/JIT/HardwareIntrinsics/General/Vector128/Dot.Single.cs @@ -292,12 +292,20 @@ namespace JIT.HardwareIntrinsics.General bool succeeded = true; Single actualResult = default; + Single intermResult = default; for (var i = 0; i < Op1ElementCount; i++) { - actualResult += (Single)(left[i] * right[i]); + if ((i % Vector128<Single>.Count) == 0) + { + actualResult += intermResult; + intermResult = default; + } + intermResult += (Single)(left[i] * right[i]); } + actualResult += intermResult; + if (actualResult != result) { succeeded = false; diff --git a/src/tests/JIT/HardwareIntrinsics/General/Vector128/Dot.UInt16.cs b/src/tests/JIT/HardwareIntrinsics/General/Vector128/Dot.UInt16.cs index 69a9398..24b2373 100644 --- a/src/tests/JIT/HardwareIntrinsics/General/Vector128/Dot.UInt16.cs +++ b/src/tests/JIT/HardwareIntrinsics/General/Vector128/Dot.UInt16.cs @@ -292,12 +292,20 @@ namespace JIT.HardwareIntrinsics.General bool succeeded = true; UInt16 actualResult = default; + UInt16 intermResult = default; for (var i = 0; i < Op1ElementCount; i++) { - actualResult += (UInt16)(left[i] * right[i]); + if ((i % Vector128<UInt16>.Count) == 0) + { + actualResult += intermResult; + intermResult = default; + } + intermResult += (UInt16)(left[i] * right[i]); } + actualResult += intermResult; + if (actualResult != result) { succeeded = false; diff --git a/src/tests/JIT/HardwareIntrinsics/General/Vector128/Dot.UInt32.cs b/src/tests/JIT/HardwareIntrinsics/General/Vector128/Dot.UInt32.cs index a64294f..2d3938e 100644 --- a/src/tests/JIT/HardwareIntrinsics/General/Vector128/Dot.UInt32.cs +++
b/src/tests/JIT/HardwareIntrinsics/General/Vector128/Dot.UInt32.cs @@ -292,12 +292,20 @@ namespace JIT.HardwareIntrinsics.General bool succeeded = true; UInt32 actualResult = default; + UInt32 intermResult = default; for (var i = 0; i < Op1ElementCount; i++) { - actualResult += (UInt32)(left[i] * right[i]); + if ((i % Vector128<UInt32>.Count) == 0) + { + actualResult += intermResult; + intermResult = default; + } + intermResult += (UInt32)(left[i] * right[i]); } + actualResult += intermResult; + if (actualResult != result) { succeeded = false; diff --git a/src/tests/JIT/HardwareIntrinsics/General/Vector128/Dot.UInt64.cs b/src/tests/JIT/HardwareIntrinsics/General/Vector128/Dot.UInt64.cs index 107e290..857fc0b 100644 --- a/src/tests/JIT/HardwareIntrinsics/General/Vector128/Dot.UInt64.cs +++ b/src/tests/JIT/HardwareIntrinsics/General/Vector128/Dot.UInt64.cs @@ -292,12 +292,20 @@ namespace JIT.HardwareIntrinsics.General bool succeeded = true; UInt64 actualResult = default; + UInt64 intermResult = default; for (var i = 0; i < Op1ElementCount; i++) { - actualResult += (UInt64)(left[i] * right[i]); + if ((i % Vector128<UInt64>.Count) == 0) + { + actualResult += intermResult; + intermResult = default; + } + intermResult += (UInt64)(left[i] * right[i]); } + actualResult += intermResult; + if (actualResult != result) { succeeded = false; diff --git a/src/tests/JIT/HardwareIntrinsics/General/Vector256/Dot.Byte.cs b/src/tests/JIT/HardwareIntrinsics/General/Vector256/Dot.Byte.cs index 83747a6..5caa395 100644 --- a/src/tests/JIT/HardwareIntrinsics/General/Vector256/Dot.Byte.cs +++ b/src/tests/JIT/HardwareIntrinsics/General/Vector256/Dot.Byte.cs @@ -292,12 +292,20 @@ namespace JIT.HardwareIntrinsics.General bool succeeded = true; Byte actualResult = default; + Byte intermResult = default; for (var i = 0; i < Op1ElementCount; i++) { - actualResult += (Byte)(left[i] * right[i]); + if ((i % Vector128<Byte>.Count) == 0) + { + actualResult += intermResult; + intermResult = default; + } + intermResult +=
(Byte)(left[i] * right[i]); } + actualResult += intermResult; + if (actualResult != result) { succeeded = false; diff --git a/src/tests/JIT/HardwareIntrinsics/General/Vector256/Dot.Double.cs b/src/tests/JIT/HardwareIntrinsics/General/Vector256/Dot.Double.cs index ab68e69..a23b124 100644 --- a/src/tests/JIT/HardwareIntrinsics/General/Vector256/Dot.Double.cs +++ b/src/tests/JIT/HardwareIntrinsics/General/Vector256/Dot.Double.cs @@ -292,12 +292,20 @@ namespace JIT.HardwareIntrinsics.General bool succeeded = true; Double actualResult = default; + Double intermResult = default; for (var i = 0; i < Op1ElementCount; i++) { - actualResult += (Double)(left[i] * right[i]); + if ((i % Vector128<Double>.Count) == 0) + { + actualResult += intermResult; + intermResult = default; + } + intermResult += (Double)(left[i] * right[i]); } + actualResult += intermResult; + if (actualResult != result) { succeeded = false; diff --git a/src/tests/JIT/HardwareIntrinsics/General/Vector256/Dot.Int16.cs b/src/tests/JIT/HardwareIntrinsics/General/Vector256/Dot.Int16.cs index 2ddda2b..9a7eabe 100644 --- a/src/tests/JIT/HardwareIntrinsics/General/Vector256/Dot.Int16.cs +++ b/src/tests/JIT/HardwareIntrinsics/General/Vector256/Dot.Int16.cs @@ -292,12 +292,20 @@ namespace JIT.HardwareIntrinsics.General bool succeeded = true; Int16 actualResult = default; + Int16 intermResult = default; for (var i = 0; i < Op1ElementCount; i++) { - actualResult += (Int16)(left[i] * right[i]); + if ((i % Vector128<Int16>.Count) == 0) + { + actualResult += intermResult; + intermResult = default; + } + intermResult += (Int16)(left[i] * right[i]); } + actualResult += intermResult; + if (actualResult != result) { succeeded = false; diff --git a/src/tests/JIT/HardwareIntrinsics/General/Vector256/Dot.Int32.cs b/src/tests/JIT/HardwareIntrinsics/General/Vector256/Dot.Int32.cs index 2ccb11f..50c9043 100644 --- a/src/tests/JIT/HardwareIntrinsics/General/Vector256/Dot.Int32.cs +++
b/src/tests/JIT/HardwareIntrinsics/General/Vector256/Dot.Int32.cs @@ -292,12 +292,20 @@ namespace JIT.HardwareIntrinsics.General bool succeeded = true; Int32 actualResult = default; + Int32 intermResult = default; for (var i = 0; i < Op1ElementCount; i++) { - actualResult += (Int32)(left[i] * right[i]); + if ((i % Vector128<Int32>.Count) == 0) + { + actualResult += intermResult; + intermResult = default; + } + intermResult += (Int32)(left[i] * right[i]); } + actualResult += intermResult; + if (actualResult != result) { succeeded = false; diff --git a/src/tests/JIT/HardwareIntrinsics/General/Vector256/Dot.Int64.cs b/src/tests/JIT/HardwareIntrinsics/General/Vector256/Dot.Int64.cs index 986f169..f60de06 100644 --- a/src/tests/JIT/HardwareIntrinsics/General/Vector256/Dot.Int64.cs +++ b/src/tests/JIT/HardwareIntrinsics/General/Vector256/Dot.Int64.cs @@ -292,12 +292,20 @@ namespace JIT.HardwareIntrinsics.General bool succeeded = true; Int64 actualResult = default; + Int64 intermResult = default; for (var i = 0; i < Op1ElementCount; i++) { - actualResult += (Int64)(left[i] * right[i]); + if ((i % Vector128<Int64>.Count) == 0) + { + actualResult += intermResult; + intermResult = default; + } + intermResult += (Int64)(left[i] * right[i]); } + actualResult += intermResult; + if (actualResult != result) { succeeded = false; diff --git a/src/tests/JIT/HardwareIntrinsics/General/Vector256/Dot.SByte.cs b/src/tests/JIT/HardwareIntrinsics/General/Vector256/Dot.SByte.cs index b911d6f..ee6db7a 100644 --- a/src/tests/JIT/HardwareIntrinsics/General/Vector256/Dot.SByte.cs +++ b/src/tests/JIT/HardwareIntrinsics/General/Vector256/Dot.SByte.cs @@ -292,12 +292,20 @@ namespace JIT.HardwareIntrinsics.General bool succeeded = true; SByte actualResult = default; + SByte intermResult = default; for (var i = 0; i < Op1ElementCount; i++) { - actualResult += (SByte)(left[i] * right[i]); + if ((i % Vector128<SByte>.Count) == 0) + { + actualResult += intermResult; + intermResult = default; + } + intermResult +=
(SByte)(left[i] * right[i]); } + actualResult += intermResult; + if (actualResult != result) { succeeded = false; diff --git a/src/tests/JIT/HardwareIntrinsics/General/Vector256/Dot.Single.cs b/src/tests/JIT/HardwareIntrinsics/General/Vector256/Dot.Single.cs index b79bd66..a67955e 100644 --- a/src/tests/JIT/HardwareIntrinsics/General/Vector256/Dot.Single.cs +++ b/src/tests/JIT/HardwareIntrinsics/General/Vector256/Dot.Single.cs @@ -292,12 +292,20 @@ namespace JIT.HardwareIntrinsics.General bool succeeded = true; Single actualResult = default; + Single intermResult = default; for (var i = 0; i < Op1ElementCount; i++) { - actualResult += (Single)(left[i] * right[i]); + if ((i % Vector128<Single>.Count) == 0) + { + actualResult += intermResult; + intermResult = default; + } + intermResult += (Single)(left[i] * right[i]); } + actualResult += intermResult; + if (actualResult != result) { succeeded = false; diff --git a/src/tests/JIT/HardwareIntrinsics/General/Vector256/Dot.UInt16.cs b/src/tests/JIT/HardwareIntrinsics/General/Vector256/Dot.UInt16.cs index 8f1dd9d..41d65f3 100644 --- a/src/tests/JIT/HardwareIntrinsics/General/Vector256/Dot.UInt16.cs +++ b/src/tests/JIT/HardwareIntrinsics/General/Vector256/Dot.UInt16.cs @@ -292,12 +292,20 @@ namespace JIT.HardwareIntrinsics.General bool succeeded = true; UInt16 actualResult = default; + UInt16 intermResult = default; for (var i = 0; i < Op1ElementCount; i++) { - actualResult += (UInt16)(left[i] * right[i]); + if ((i % Vector128<UInt16>.Count) == 0) + { + actualResult += intermResult; + intermResult = default; + } + intermResult += (UInt16)(left[i] * right[i]); } + actualResult += intermResult; + if (actualResult != result) { succeeded = false; diff --git a/src/tests/JIT/HardwareIntrinsics/General/Vector256/Dot.UInt32.cs b/src/tests/JIT/HardwareIntrinsics/General/Vector256/Dot.UInt32.cs index 6e48960..51bead4 100644 --- a/src/tests/JIT/HardwareIntrinsics/General/Vector256/Dot.UInt32.cs +++
b/src/tests/JIT/HardwareIntrinsics/General/Vector256/Dot.UInt32.cs @@ -292,12 +292,20 @@ namespace JIT.HardwareIntrinsics.General bool succeeded = true; UInt32 actualResult = default; + UInt32 intermResult = default; for (var i = 0; i < Op1ElementCount; i++) { - actualResult += (UInt32)(left[i] * right[i]); + if ((i % Vector128<UInt32>.Count) == 0) + { + actualResult += intermResult; + intermResult = default; + } + intermResult += (UInt32)(left[i] * right[i]); } + actualResult += intermResult; + if (actualResult != result) { succeeded = false; diff --git a/src/tests/JIT/HardwareIntrinsics/General/Vector256/Dot.UInt64.cs b/src/tests/JIT/HardwareIntrinsics/General/Vector256/Dot.UInt64.cs index 06efced..dd4bbf8 100644 --- a/src/tests/JIT/HardwareIntrinsics/General/Vector256/Dot.UInt64.cs +++ b/src/tests/JIT/HardwareIntrinsics/General/Vector256/Dot.UInt64.cs @@ -292,12 +292,20 @@ namespace JIT.HardwareIntrinsics.General bool succeeded = true; UInt64 actualResult = default; + UInt64 intermResult = default; for (var i = 0; i < Op1ElementCount; i++) { - actualResult += (UInt64)(left[i] * right[i]); + if ((i % Vector128<UInt64>.Count) == 0) + { + actualResult += intermResult; + intermResult = default; + } + intermResult += (UInt64)(left[i] * right[i]); } + actualResult += intermResult; + if (actualResult != result) { succeeded = false; diff --git a/src/tests/JIT/HardwareIntrinsics/General/Vector64/Dot.Byte.cs b/src/tests/JIT/HardwareIntrinsics/General/Vector64/Dot.Byte.cs index ff7cb00..4a8fedd 100644 --- a/src/tests/JIT/HardwareIntrinsics/General/Vector64/Dot.Byte.cs +++ b/src/tests/JIT/HardwareIntrinsics/General/Vector64/Dot.Byte.cs @@ -292,12 +292,20 @@ namespace JIT.HardwareIntrinsics.General bool succeeded = true; Byte actualResult = default; + Byte intermResult = default; for (var i = 0; i < Op1ElementCount; i++) { - actualResult += (Byte)(left[i] * right[i]); + if ((i % Vector128<Byte>.Count) == 0) + { + actualResult += intermResult; + intermResult = default; + } + intermResult +=
(Byte)(left[i] * right[i]); } + actualResult += intermResult; + if (actualResult != result) { succeeded = false; diff --git a/src/tests/JIT/HardwareIntrinsics/General/Vector64/Dot.Double.cs b/src/tests/JIT/HardwareIntrinsics/General/Vector64/Dot.Double.cs index 0d45f61..f2dc915 100644 --- a/src/tests/JIT/HardwareIntrinsics/General/Vector64/Dot.Double.cs +++ b/src/tests/JIT/HardwareIntrinsics/General/Vector64/Dot.Double.cs @@ -292,12 +292,20 @@ namespace JIT.HardwareIntrinsics.General bool succeeded = true; Double actualResult = default; + Double intermResult = default; for (var i = 0; i < Op1ElementCount; i++) { - actualResult += (Double)(left[i] * right[i]); + if ((i % Vector128<Double>.Count) == 0) + { + actualResult += intermResult; + intermResult = default; + } + intermResult += (Double)(left[i] * right[i]); } + actualResult += intermResult; + if (actualResult != result) { succeeded = false; diff --git a/src/tests/JIT/HardwareIntrinsics/General/Vector64/Dot.Int16.cs b/src/tests/JIT/HardwareIntrinsics/General/Vector64/Dot.Int16.cs index 9157e7e..5284a97 100644 --- a/src/tests/JIT/HardwareIntrinsics/General/Vector64/Dot.Int16.cs +++ b/src/tests/JIT/HardwareIntrinsics/General/Vector64/Dot.Int16.cs @@ -292,12 +292,20 @@ namespace JIT.HardwareIntrinsics.General bool succeeded = true; Int16 actualResult = default; + Int16 intermResult = default; for (var i = 0; i < Op1ElementCount; i++) { - actualResult += (Int16)(left[i] * right[i]); + if ((i % Vector128<Int16>.Count) == 0) + { + actualResult += intermResult; + intermResult = default; + } + intermResult += (Int16)(left[i] * right[i]); } + actualResult += intermResult; + if (actualResult != result) { succeeded = false; diff --git a/src/tests/JIT/HardwareIntrinsics/General/Vector64/Dot.Int32.cs b/src/tests/JIT/HardwareIntrinsics/General/Vector64/Dot.Int32.cs index a679da2..475030d 100644 --- a/src/tests/JIT/HardwareIntrinsics/General/Vector64/Dot.Int32.cs +++ b/src/tests/JIT/HardwareIntrinsics/General/Vector64/Dot.Int32.cs @@
-292,12 +292,20 @@ namespace JIT.HardwareIntrinsics.General bool succeeded = true; Int32 actualResult = default; + Int32 intermResult = default; for (var i = 0; i < Op1ElementCount; i++) { - actualResult += (Int32)(left[i] * right[i]); + if ((i % Vector128<Int32>.Count) == 0) + { + actualResult += intermResult; + intermResult = default; + } + intermResult += (Int32)(left[i] * right[i]); } + actualResult += intermResult; + if (actualResult != result) { succeeded = false; diff --git a/src/tests/JIT/HardwareIntrinsics/General/Vector64/Dot.Int64.cs b/src/tests/JIT/HardwareIntrinsics/General/Vector64/Dot.Int64.cs index d75a81f..4a4a585 100644 --- a/src/tests/JIT/HardwareIntrinsics/General/Vector64/Dot.Int64.cs +++ b/src/tests/JIT/HardwareIntrinsics/General/Vector64/Dot.Int64.cs @@ -292,12 +292,20 @@ namespace JIT.HardwareIntrinsics.General bool succeeded = true; Int64 actualResult = default; + Int64 intermResult = default; for (var i = 0; i < Op1ElementCount; i++) { - actualResult += (Int64)(left[i] * right[i]); + if ((i % Vector128<Int64>.Count) == 0) + { + actualResult += intermResult; + intermResult = default; + } + intermResult += (Int64)(left[i] * right[i]); } + actualResult += intermResult; + if (actualResult != result) { succeeded = false; diff --git a/src/tests/JIT/HardwareIntrinsics/General/Vector64/Dot.SByte.cs b/src/tests/JIT/HardwareIntrinsics/General/Vector64/Dot.SByte.cs index 8e0702b..ede7558 100644 --- a/src/tests/JIT/HardwareIntrinsics/General/Vector64/Dot.SByte.cs +++ b/src/tests/JIT/HardwareIntrinsics/General/Vector64/Dot.SByte.cs @@ -292,12 +292,20 @@ namespace JIT.HardwareIntrinsics.General bool succeeded = true; SByte actualResult = default; + SByte intermResult = default; for (var i = 0; i < Op1ElementCount; i++) { - actualResult += (SByte)(left[i] * right[i]); + if ((i % Vector128<SByte>.Count) == 0) + { + actualResult += intermResult; + intermResult = default; + } + intermResult += (SByte)(left[i] * right[i]); } + actualResult += intermResult; + if (actualResult !=
result) { succeeded = false; diff --git a/src/tests/JIT/HardwareIntrinsics/General/Vector64/Dot.Single.cs b/src/tests/JIT/HardwareIntrinsics/General/Vector64/Dot.Single.cs index 5ec606f..35458c0 100644 --- a/src/tests/JIT/HardwareIntrinsics/General/Vector64/Dot.Single.cs +++ b/src/tests/JIT/HardwareIntrinsics/General/Vector64/Dot.Single.cs @@ -292,12 +292,20 @@ namespace JIT.HardwareIntrinsics.General bool succeeded = true; Single actualResult = default; + Single intermResult = default; for (var i = 0; i < Op1ElementCount; i++) { - actualResult += (Single)(left[i] * right[i]); + if ((i % Vector128<Single>.Count) == 0) + { + actualResult += intermResult; + intermResult = default; + } + intermResult += (Single)(left[i] * right[i]); } + actualResult += intermResult; + if (actualResult != result) { succeeded = false; diff --git a/src/tests/JIT/HardwareIntrinsics/General/Vector64/Dot.UInt16.cs b/src/tests/JIT/HardwareIntrinsics/General/Vector64/Dot.UInt16.cs index 44bb84a..48281f7 100644 --- a/src/tests/JIT/HardwareIntrinsics/General/Vector64/Dot.UInt16.cs +++ b/src/tests/JIT/HardwareIntrinsics/General/Vector64/Dot.UInt16.cs @@ -292,12 +292,20 @@ namespace JIT.HardwareIntrinsics.General bool succeeded = true; UInt16 actualResult = default; + UInt16 intermResult = default; for (var i = 0; i < Op1ElementCount; i++) { - actualResult += (UInt16)(left[i] * right[i]); + if ((i % Vector128<UInt16>.Count) == 0) + { + actualResult += intermResult; + intermResult = default; + } + intermResult += (UInt16)(left[i] * right[i]); } + actualResult += intermResult; + if (actualResult != result) { succeeded = false; diff --git a/src/tests/JIT/HardwareIntrinsics/General/Vector64/Dot.UInt32.cs b/src/tests/JIT/HardwareIntrinsics/General/Vector64/Dot.UInt32.cs index 47cf417..f5fad88 100644 --- a/src/tests/JIT/HardwareIntrinsics/General/Vector64/Dot.UInt32.cs +++ b/src/tests/JIT/HardwareIntrinsics/General/Vector64/Dot.UInt32.cs @@ -292,12 +292,20 @@ namespace JIT.HardwareIntrinsics.General bool succeeded =
true; UInt32 actualResult = default; + UInt32 intermResult = default; for (var i = 0; i < Op1ElementCount; i++) { - actualResult += (UInt32)(left[i] * right[i]); + if ((i % Vector128<UInt32>.Count) == 0) + { + actualResult += intermResult; + intermResult = default; + } + intermResult += (UInt32)(left[i] * right[i]); } + actualResult += intermResult; + if (actualResult != result) { succeeded = false; diff --git a/src/tests/JIT/HardwareIntrinsics/General/Vector64/Dot.UInt64.cs b/src/tests/JIT/HardwareIntrinsics/General/Vector64/Dot.UInt64.cs index 448ae87..d3ad366 100644 --- a/src/tests/JIT/HardwareIntrinsics/General/Vector64/Dot.UInt64.cs +++ b/src/tests/JIT/HardwareIntrinsics/General/Vector64/Dot.UInt64.cs @@ -292,12 +292,20 @@ namespace JIT.HardwareIntrinsics.General bool succeeded = true; UInt64 actualResult = default; + UInt64 intermResult = default; for (var i = 0; i < Op1ElementCount; i++) { - actualResult += (UInt64)(left[i] * right[i]); + if ((i % Vector128<UInt64>.Count) == 0) + { + actualResult += intermResult; + intermResult = default; + } + intermResult += (UInt64)(left[i] * right[i]); } + actualResult += intermResult; + if (actualResult != result) { succeeded = false;