switch (intrinsicId)
{
+ case NI_Vector128_ToScalar:
+ case NI_Vector256_ToScalar:
+ case NI_Vector512_ToScalar:
case NI_SSE2_ConvertToInt32:
case NI_SSE2_ConvertToUInt32:
case NI_SSE2_X64_ConvertToInt64:
case NI_Vector128_GetElement:
case NI_Vector256_GetElement:
case NI_Vector512_GetElement:
+ {
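+ // These all read element 0, so the GetElement decomposition handles them
+ // by splitting the long into its two 32-bit halves.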
return DecomposeHWIntrinsicGetElement(use, hwintrinsicTree);
+ }
default:
+ {
noway_assert(!"unexpected GT_HWINTRINSIC node in long decomposition");
break;
+ }
}
return nullptr;
}
case NI_Vector128_GetElement:
+ case NI_Vector128_ToScalar:
+ case NI_Vector256_ToScalar:
+ case NI_Vector512_ToScalar:
case NI_SSE2_ConvertToInt32:
case NI_SSE2_ConvertToUInt32:
case NI_SSE2_X64_ConvertToInt64:
assert(varTypeIsArithmetic(simdBaseType));
#if defined(TARGET_XARCH)
+ bool useToScalar = op2->IsIntegralConst(0);
+
+#if defined(TARGET_X86)
+ // A long doesn't fit in a single 32-bit register, so keep the GetElement
+ // shape and let long decomposition split it into two 32-bit reads
+ useToScalar &= !varTypeIsLong(simdBaseType);
+#endif // TARGET_X86
+
+ if (useToScalar)
+ {
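+ // With a constant-zero index, the element can be read directly from the
+ // low bits of the register, typically a single movd/movss/movsd rather
+ // than an extractps/pextr sequence.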
+ intrinsicId = NI_Vector128_ToScalar;
+
+ if (simdSize == 64)
+ {
+ intrinsicId = NI_Vector512_ToScalar;
+ }
+ else if (simdSize == 32)
+ {
+ intrinsicId = NI_Vector256_ToScalar;
+ }
+
+ return gtNewSimdHWIntrinsicNode(type, op1, intrinsicId, simdBaseJitType, simdSize);
+ }
+
switch (simdBaseType)
{
- // Using software fallback if simdBaseType is not supported by hardware
case TYP_BYTE:
case TYP_UBYTE:
case TYP_INT:
case TYP_UINT:
case TYP_LONG:
case TYP_ULONG:
+ {
+ // Using software fallback if simdBaseType is not supported by hardware
assert(compIsaSupportedDebugOnly(InstructionSet_SSE41));
break;
+ }
case TYP_DOUBLE:
case TYP_FLOAT:
case TYP_SHORT:
case TYP_USHORT:
+ {
assert(compIsaSupportedDebugOnly(InstructionSet_SSE2));
break;
+ }
default:
+ {
unreached();
+ }
}
if (simdSize == 64)
{
    intrinsicId = NI_Vector512_GetElement;
}
else if (simdSize == 32)
{
    intrinsicId = NI_Vector256_GetElement;
}
}
#elif defined(TARGET_ARM64)
+ if (op2->IsIntegralConst(0))
+ {
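+ // On arm64, element 0 aliases the low bits of the vector register, so
+ // ToScalar is typically a single mov to a general register (e.g. umov/fmov)
+ // and is effectively free for float/double, since s0/d0 overlap v0.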
+ intrinsicId = NI_Vector128_ToScalar;
+
+ if (simdSize == 8)
+ {
+ intrinsicId = NI_Vector64_ToScalar;
+ }
+
+ return gtNewSimdHWIntrinsicNode(type, op1, intrinsicId, simdBaseJitType, simdSize);
+ }
+
if (simdSize == 8)
{
intrinsicId = NI_Vector64_GetElement;
case NI_Vector256_ToScalar:
case NI_Vector512_ToScalar:
{
- assert(varTypeIsFloating(baseType));
-
if (op1->isContained() || op1->isUsedFromSpillTemp())
{
+ if (varTypeIsIntegral(baseType))
+ {
+ // We just want to emit a standard read from memory
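+ // e.g. Vector128<int>.ToScalar() with a memory operand becomes a plain
+ //   mov eax, dword ptr [mem]     (illustrative)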
+ ins = ins_Move_Extend(baseType, false);
+ attr = emitTypeSize(baseType);
+ }
genHWIntrinsic_R_RM(node, ins, attr, targetReg, op1);
}
else
{
+ assert(varTypeIsFloating(baseType));
+
// Just use movaps for reg->reg moves as it has zero-latency on modern CPUs
emit->emitIns_Mov(INS_movaps, attr, targetReg, op1Reg, /* canSkip */ true);
}
HARDWARE_INTRINSIC(Vector128, StoreUnsafe, 16, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(Vector128, Subtract, 16, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(Vector128, Sum, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen)
-HARDWARE_INTRINSIC(Vector128, ToScalar, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_movsd_simd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
+HARDWARE_INTRINSIC(Vector128, ToScalar, 16, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_movsd_simd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(Vector128, ToVector256, 16, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(Vector128, ToVector256Unsafe, 16, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(Vector128, ToVector512, 16, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_vmovdqu64, INS_vmovdqu64, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoRMWSemantics)
HARDWARE_INTRINSIC(Vector256, StoreUnsafe, 32, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen|HW_Flag_AvxOnlyCompatible)
HARDWARE_INTRINSIC(Vector256, Subtract, 32, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(Vector256, Sum, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen)
-HARDWARE_INTRINSIC(Vector256, ToScalar, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_movsd_simd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_AvxOnlyCompatible)
+HARDWARE_INTRINSIC(Vector256, ToScalar, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_movsd_simd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg|HW_Flag_AvxOnlyCompatible)
HARDWARE_INTRINSIC(Vector256, ToVector512, 32, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_vmovdqu64, INS_vmovdqu64, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(Vector256, ToVector512Unsafe, 32, 1, {INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_movdqu, INS_vmovdqu64, INS_vmovdqu64, INS_movups, INS_movupd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(Vector256, WidenLower, 32, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(Vector512, StoreAlignedNonTemporal, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(Vector512, StoreUnsafe, 64, -1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_BaseTypeFromFirstArg|HW_Flag_NoCodeGen)
HARDWARE_INTRINSIC(Vector512, Subtract, 64, 2, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen)
-HARDWARE_INTRINSIC(Vector512, ToScalar, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_movsd_simd}, HW_Category_SimpleSIMD, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg)
+HARDWARE_INTRINSIC(Vector512, ToScalar, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_movss, INS_movsd_simd}, HW_Category_SIMDScalar, HW_Flag_SpecialImport|HW_Flag_SpecialCodeGen|HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(Vector512, WidenLower, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(Vector512, WidenUpper, 64, 1, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoCodeGen|HW_Flag_BaseTypeFromFirstArg)
HARDWARE_INTRINSIC(Vector512, WithElement, 64, 3, {INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid, INS_invalid}, HW_Category_Helper, HW_Flag_SpecialImport|HW_Flag_NoContainment|HW_Flag_BaseTypeFromFirstArg)
{
assert(sig->numArgs == 2);
+ op2 = impStackTop(0).val;
+
switch (simdBaseType)
{
- // Using software fallback if simdBaseType is not supported by hardware
case TYP_BYTE:
case TYP_UBYTE:
case TYP_INT:
case TYP_UINT:
case TYP_LONG:
case TYP_ULONG:
- if (!compExactlyDependsOn(InstructionSet_SSE41))
+ {
+ bool useToScalar = op2->IsIntegralConst(0);
+
+#if defined(TARGET_X86)
+ useToScalar &= !varTypeIsLong(simdBaseType);
+#endif // TARGET_X86
+
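+ // ToScalar of element 0 only needs SSE2 (movd/movss/movsd), so the
+ // SSE4.1 fallback check below applies only to nonzero indices.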
+ if (!useToScalar && !compExactlyDependsOn(InstructionSet_SSE41))
{
+ // Using software fallback if simdBaseType is not supported by hardware
return nullptr;
}
break;
+ }
case TYP_DOUBLE:
case TYP_FLOAT:
case TYP_SHORT:
case TYP_USHORT:
+ {
// short/ushort/float/double are supported by SSE2
break;
+ }
default:
+ {
unreached();
+ }
}
- GenTree* op2 = impPopStack().val;
- GenTree* op1 = impSIMDPopStack();
+ impPopStack();
+ op1 = impSIMDPopStack();
retNode = gtNewSimdGetElementNode(retType, op1, op2, simdBaseJitType, simdSize);
break;
{
assert(sig->numArgs == 1);
+ op1 = impSIMDPopStack();
+
#if defined(TARGET_X86)
if (varTypeIsLong(simdBaseType))
{
- // TODO-XARCH-CQ: It may be beneficial to decompose this operation
+ // Create a GetElement node which handles decomposition
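+ // DecomposeLongs will split the read into two 32-bit halves, roughly:
+ //   movd   eax, xmm0         ; lo 32 bits
+ //   pextrd edx, xmm0, 1      ; hi 32 bits (SSE4.1)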
+ op2 = gtNewIconNode(0);
+ retNode = gtNewSimdGetElementNode(retType, op1, op2, simdBaseJitType, simdSize);
break;
}
#endif // TARGET_X86
- // TODO-XARCH-CQ: It may be beneficial to import this as GetElement(0)
- op1 = impSIMDPopStack();
retNode = gtNewSimdHWIntrinsicNode(retType, op1, intrinsic, simdBaseJitType, simdSize);
break;
}
var_types simdBaseType = node->GetSimdBaseType();
unsigned simdSize = node->GetSimdSize();
+ assert((intrinsicId == NI_Vector128_GetElement) || (intrinsicId == NI_Vector256_GetElement) ||
+ (intrinsicId == NI_Vector512_GetElement));
+
assert(!varTypeIsSIMD(simdType));
assert(varTypeIsArithmetic(simdBaseType));
assert(simdSize != 0);
GenTree* op1 = node->Op(1);
GenTree* op2 = node->Op(2);
+ if (op2->IsIntegralConst(0))
+ {
+ // Specially handle as ToScalar
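+ // e.g. GetElement(v, 0) for float becomes ToScalar(v), which is at
+ // worst a zero-latency reg->reg movaps.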
+ BlockRange().Remove(op2);
+
+ if (simdSize == 64)
+ {
+ intrinsicId = NI_Vector512_ToScalar;
+ }
+ else if (simdSize == 32)
+ {
+ intrinsicId = NI_Vector256_ToScalar;
+ }
+ else
+ {
+ intrinsicId = NI_Vector128_ToScalar;
+ }
+
+ node->ResetHWIntrinsicId(intrinsicId, op1);
+ LowerNode(node);
+
+ return;
+ }
+
if (op1->OperIs(GT_IND))
{
// If the vector is already in memory, we force its
switch (simdBaseType)
{
- // Using software fallback if simdBaseType is not supported by hardware
case TYP_BYTE:
case TYP_UBYTE:
case TYP_INT:
case TYP_UINT:
- assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE41));
- break;
-
+#if defined(TARGET_AMD64)
case TYP_LONG:
case TYP_ULONG:
- // We either support TYP_LONG or we have been decomposed into two TYP_INT inserts
- assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE41_X64));
+#endif // TARGET_AMD64
+ {
+ // Using software fallback if simdBaseType is not supported by hardware
+ assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE41));
break;
+ }
case TYP_DOUBLE:
case TYP_FLOAT:
case TYP_SHORT:
case TYP_USHORT:
+ {
assert(comp->compIsaSupportedDebugOnly(InstructionSet_SSE2));
break;
+ }
default:
+ {
unreached();
+ }
}
// Remove the index node up front to simplify downstream logic
NamedIntrinsic resIntrinsic = NI_Illegal;
- if ((imm8 == 0) && (genTypeSize(simdBaseType) >= 4))
+ if (imm8 == 0)
{
- switch (simdBaseType)
- {
- case TYP_LONG:
- resIntrinsic = NI_SSE2_X64_ConvertToInt64;
- break;
-
- case TYP_ULONG:
- resIntrinsic = NI_SSE2_X64_ConvertToUInt64;
- break;
-
- case TYP_INT:
- resIntrinsic = NI_SSE2_ConvertToInt32;
- break;
-
- case TYP_UINT:
- resIntrinsic = NI_SSE2_ConvertToUInt32;
- break;
-
- case TYP_FLOAT:
- case TYP_DOUBLE:
- resIntrinsic = NI_Vector128_ToScalar;
- break;
-
- default:
- unreached();
- }
+ // Specially handle as ToScalar
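+ // Element 0 always lives in the low 128 bits, so it is safe to shrink
+ // the node to Vector128 regardless of the original vector width.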
node->SetSimdSize(16);
- node->ResetHWIntrinsicId(resIntrinsic, op1);
+ node->ResetHWIntrinsicId(NI_Vector128_ToScalar, op1);
+
+ LowerNode(node);
+ return;
}
else
{
if ((simdBaseType == TYP_BYTE) || (simdBaseType == TYP_SHORT))
{
- // The intrinsic zeros the upper bits, so we need an explicit
+ // The extract intrinsics zero the upper bits, so we need an explicit
// cast to ensure the result is properly sign extended
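+ // e.g. for sbyte (roughly):
+ //   pextrb eax, xmm0, imm8   ; zero-extends into eax
+ //   movsx  eax, al           ; explicit sign extension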
LIR::Use use;
- bool foundUse = BlockRange().TryGetUse(node, &use);
- GenTreeCast* cast = comp->gtNewCastNode(TYP_INT, node, /* isUnsigned */ true, simdBaseType);
+ bool foundUse = BlockRange().TryGetUse(node, &use);
+ bool fromUnsigned = false;
+
+ GenTreeCast* cast = comp->gtNewCastNode(TYP_INT, node, fromUnsigned, simdBaseType);
BlockRange().InsertAfter(node, cast);
if (foundUse)
assert((intrinsicId == NI_Vector128_ToScalar) || (intrinsicId == NI_Vector256_ToScalar) ||
(intrinsicId == NI_Vector512_ToScalar));
+
assert(varTypeIsSIMD(simdType));
assert(varTypeIsArithmetic(simdBaseType));
assert(simdSize != 0);
+ GenTree* op1 = node->Op(1);
+
+ if (IsContainableMemoryOp(op1) && IsSafeToContainMem(node, op1))
+ {
+ // We will specially handle ToScalar in codegen when op1 is already in memory
+ return;
+ }
+
switch (simdBaseType)
{
case TYP_BYTE:
if (genTypeSize(simdBaseType) < 4)
{
+ // The move intrinsics do not touch the upper bits, so we need an explicit
+ // cast to ensure the result is properly sign extended
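+ // e.g. for short (roughly):
+ //   movd  eax, xmm0          ; copies the low 32 bits
+ //   movsx eax, ax            ; extend just the 16-bit element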
+
LIR::Use use;
- bool foundUse = BlockRange().TryGetUse(node, &use);
- GenTreeCast* cast = comp->gtNewCastNode(simdBaseType, node, node->IsUnsigned(), simdBaseType);
+ bool foundUse = BlockRange().TryGetUse(node, &use);
+ bool fromUnsigned = varTypeIsUnsigned(simdBaseType);
+
+ GenTreeCast* cast = comp->gtNewCastNode(TYP_INT, node, fromUnsigned, simdBaseType);
BlockRange().InsertAfter(node, cast);
if (foundUse)
{
use.ReplaceWith(cast);
}
-
LowerNode(cast);
}
}
#if defined(FEATURE_HW_INTRINSICS)
else if (src->OperIsHWIntrinsic())
{
- GenTreeHWIntrinsic* hwintrinsic = src->AsHWIntrinsic();
- NamedIntrinsic intrinsicId = hwintrinsic->GetHWIntrinsicId();
- var_types simdBaseType = hwintrinsic->GetSimdBaseType();
- bool isContainable = false;
+ GenTreeHWIntrinsic* hwintrinsic = src->AsHWIntrinsic();
+ NamedIntrinsic intrinsicId = hwintrinsic->GetHWIntrinsicId();
+ var_types simdBaseType = hwintrinsic->GetSimdBaseType();
+ bool isContainable = false;
+ GenTree* clearContainedNode = nullptr;
switch (intrinsicId)
{
+ case NI_Vector128_ToScalar:
+ case NI_Vector256_ToScalar:
+ case NI_Vector512_ToScalar:
+ {
+ if (varTypeIsFloating(simdBaseType))
+ {
+ // These intrinsics are "ins reg/mem, xmm" or "ins xmm, reg/mem"
+ //
+ // In the case we are coming from and going to memory, we want to
+ // preserve the original containment as we'll end up emitting:
+ // movss xmm0, [addr1] ; Size: 4, Latency: 4-7, TP: 0.5
+ // movss [addr2], xmm0 ; Size: 4, Latency: 4-10, TP: 1
+ //
+ // However, we want to prefer containing the store over allowing the
+ // input to be regOptional, so track and clear containment if required.
+
+ clearContainedNode = hwintrinsic->Op(1);
+ isContainable = !clearContainedNode->isContained();
+ }
+ break;
+ }
+
case NI_SSE2_ConvertToInt32:
case NI_SSE2_ConvertToUInt32:
case NI_SSE2_X64_ConvertToInt64:
// However, we still want to do the efficient thing and write directly
// to memory in the case where the extract is immediately used by a store
- // TODO-XArch-CQ: We really should specially handle TYP_DOUBLE here but
- // it requires lowering GetElement(1) the GT_STOREIND to NI_SSE2_StoreHigh
- // while leaving GetElement(0) alone (it is already converted to ToScalar)
-
- if (simdBaseType == TYP_FLOAT)
+ if (varTypeIsFloating(simdBaseType) && hwintrinsic->Op(2)->IsCnsIntOrI())
{
- // SSE41_Extract is "extractps reg/mem, xmm, imm8"
- isContainable = hwintrinsic->Op(2)->IsCnsIntOrI() &&
- comp->compOpportunisticallyDependsOn(InstructionSet_SSE41);
+ assert(!hwintrinsic->Op(2)->IsIntegralConst(0));
+
+ if (simdBaseType == TYP_FLOAT)
+ {
+ // SSE41_Extract is "extractps reg/mem, xmm, imm8"
+ //
+ // In the case we are coming from and going to memory, we want to
+ // preserve the original containment as we'll end up emitting:
+ // movss xmm0, [addr1] ; Size: 4, Latency: 4-7, TP: 0.5
+ // movss [addr2], xmm0 ; Size: 4, Latency: 4-10, TP: 1
+ //
+ // The alternative would be emitting the slightly more expensive
+ // movups xmm0, [addr1] ; Size: 4, Latency: 4-7, TP: 0.5
+ // extractps [addr2], xmm0, cns ; Size: 6, Latency: 5-10, TP: 1
+ //
+ // However, we want to prefer containing the store over allowing the
+ // input to be regOptional, so track and clear containment if required.
+
+ if (comp->compOpportunisticallyDependsOn(InstructionSet_SSE41))
+ {
+ clearContainedNode = hwintrinsic->Op(1);
+ isContainable = !clearContainedNode->isContained();
+ }
+ }
+ else
+ {
+ // TODO-XArch-CQ: We really should specially handle TYP_DOUBLE here but
+ // it requires handling GetElement(1) and GT_STOREIND as NI_SSE2_StoreHigh
+ assert(!isContainable);
+ }
}
break;
}
{
MakeSrcContained(node, src);
- if (intrinsicId == NI_Vector128_GetElement)
+ if (clearContainedNode != nullptr)
{
- hwintrinsic->Op(1)->ClearContained();
+ // Ensure we aren't marked contained or regOptional
+ clearContainedNode->ClearContained();
}
}
}
}
case NI_Vector128_GetElement:
+ case NI_Vector128_ToScalar:
+ case NI_Vector256_ToScalar:
+ case NI_Vector512_ToScalar:
case NI_AVX_ExtractVector128:
case NI_AVX2_ExtractVector128:
case NI_AVX512F_ExtractVector128:
if ((simdSize == 8) || (simdSize == 12))
{
- // We want to handle GetElement still for Vector2/3
- if ((intrinsicId != NI_Vector128_GetElement) && (intrinsicId != NI_Vector256_GetElement) &&
- (intrinsicId != NI_Vector512_GetElement))
+ // We still want to handle GetElement/ToScalar for Vector2/3
+ if ((intrinsicId != NI_Vector128_GetElement) && (intrinsicId != NI_Vector128_ToScalar) &&
+ (intrinsicId != NI_Vector256_GetElement) && (intrinsicId != NI_Vector256_ToScalar) &&
+ (intrinsicId != NI_Vector512_GetElement) && (intrinsicId != NI_Vector512_ToScalar))
{
// TODO-XArch-CQ: Ideally we would key this off of the size the containing node
// expects vs the size node actually is or would be if spilled to the stack
case NI_VectorT128_GetElement:
case NI_VectorT256_GetElement:
{
+ op2 = impStackTop(0).val;
+
switch (simdBaseType)
{
- // Using software fallback if simdBaseType is not supported by hardware
case TYP_BYTE:
case TYP_UBYTE:
case TYP_INT:
case TYP_UINT:
case TYP_LONG:
case TYP_ULONG:
- if (!compExactlyDependsOn(InstructionSet_SSE41))
+ {
+ bool useToScalar = op2->IsIntegralConst(0);
+
+#if defined(TARGET_X86)
+ useToScalar &= !varTypeIsLong(simdBaseType);
+#endif // TARGET_X86
+
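+ // As in the Vector128 importer: ToScalar of element 0 only needs SSE2,
+ // so the SSE4.1 check applies only to nonzero indices.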
+ if (!useToScalar && !compExactlyDependsOn(InstructionSet_SSE41))
{
+ // Using software fallback if simdBaseType is not supported by hardware
return nullptr;
}
break;
+ }
case TYP_DOUBLE:
case TYP_FLOAT:
case TYP_SHORT:
case TYP_USHORT:
+ {
// short/ushort/float/double are supported by SSE2
break;
+ }
default:
+ {
unreached();
+ }
}
break;
}
break;
}
- case NI_VectorT128_ToScalar:
- case NI_VectorT256_ToScalar:
- {
-#if defined(TARGET_X86)
- if (varTypeIsLong(simdBaseType))
- {
- // TODO-XARCH-CQ: It may be beneficial to decompose this operation
- return nullptr;
- }
-#endif // TARGET_X86
- break;
- }
-
case NI_VectorT128_WithElement:
case NI_VectorT256_WithElement:
{
case NI_VectorT128_ToScalar:
{
+#if defined(TARGET_X86)
+ if (varTypeIsLong(simdBaseType))
+ {
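+ // As with Vector128<long>.ToScalar on x86, import as GetElement(0) so
+ // that DecomposeLongs can split the read into two 32-bit halves.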
+ op2 = gtNewIconNode(0);
+ return gtNewSimdGetElementNode(retType, op1, op2, simdBaseJitType, simdSize);
+ }
+#endif // TARGET_X86
+
return gtNewSimdHWIntrinsicNode(retType, op1, NI_Vector128_ToScalar, simdBaseJitType, simdSize);
}
case NI_VectorT256_ToScalar:
{
+#if defined(TARGET_X86)
+ if (varTypeIsLong(simdBaseType))
+ {
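+ // Same x86 long handling as VectorT128_ToScalar above.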
+ op2 = gtNewIconNode(0);
+ return gtNewSimdGetElementNode(retType, op1, op2, simdBaseJitType, simdSize);
+ }
+#endif // TARGET_X86
+
return gtNewSimdHWIntrinsicNode(retType, op1, NI_Vector256_ToScalar, simdBaseJitType, simdSize);
}
#elif defined(TARGET_ARM64)
}
#endif // TARGET_XARCH
+ case NI_Vector128_ToScalar:
+#ifdef TARGET_ARM64
+ case NI_Vector64_ToScalar:
+#else
+ case NI_Vector256_ToScalar:
+ case NI_Vector512_ToScalar:
+#endif
+ {
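+ // ToScalar is GetElement(0), so reuse the element evaluator; e.g.
+ // Vector128.Create(42).ToScalar() folds to the constant 42.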
+ return EvaluateSimdGetElement(this, type, baseType, arg0VN, 0);
+ }
+
default:
break;
}