CONFIG_DWORD_INFO_EX(INTERNAL_JitDebugLogLoopCloning, W("JitDebugLogLoopCloning"), 0, "In debug builds log places where loop cloning optimizations are performed on the fast path.", CLRConfig::REGUTIL_default);
CONFIG_DWORD_INFO_EX(INTERNAL_JitVNMapSelLimit, W("JitVNMapSelLimit"), 0, "If non-zero, assert if # of VNF_MapSelect applications considered reaches this", CLRConfig::REGUTIL_default)
RETAIL_CONFIG_DWORD_INFO(INTERNAL_JitVNMapSelBudget, W("JitVNMapSelBudget"), 100, "Max # of MapSelect's considered for a particular top-level invocation.")
-#if defined(_TARGET_AMD64_)
+#if defined(_TARGET_AMD64_) || defined(_TARGET_X86_)
#define EXTERNAL_FeatureSIMD_Default 1
#define EXTERNAL_JitEnableAVX_Default 1
-#else // !defined(_TARGET_AMD64_)
+#else // !defined(_TARGET_AMD64_) && !defined(_TARGET_X86_)
#define EXTERNAL_FeatureSIMD_Default 0
#define EXTERNAL_JitEnableAVX_Default 0
-#endif // !defined(_TARGET_AMD64_)
+#endif // !defined(_TARGET_AMD64_) && !defined(_TARGET_X86_)
RETAIL_CONFIG_DWORD_INFO_EX(EXTERNAL_FeatureSIMD, W("FeatureSIMD"), EXTERNAL_FeatureSIMD_Default, "Enable SIMD support with companion SIMDVector.dll", CLRConfig::REGUTIL_default)
RETAIL_CONFIG_DWORD_INFO_EX(EXTERNAL_EnableAVX, W("EnableAVX"), EXTERNAL_JitEnableAVX_Default, "Enable AVX instruction set for wide operations as default", CLRConfig::REGUTIL_default)
// their size rounded to TARGET_POINTER_SIZE (which is 8 bytes on 64-bit targets) and hence
// Vector3 locals could be treated as TYP_SIMD16 while reading/writing.
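// Since 12 bytes is not a hardware-supported load/store size, the helpers below perform such
// accesses as an 8-byte operation followed by a 4-byte operation.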
void genStoreIndTypeSIMD12(GenTree* treeNode);
-void genStoreLclFldTypeSIMD12(GenTree* treeNode);
void genLoadIndTypeSIMD12(GenTree* treeNode);
+void genStoreLclTypeSIMD12(GenTree* treeNode);
void genLoadLclTypeSIMD12(GenTree* treeNode);
#ifdef _TARGET_X86_
+void genStoreSIMD12ToStack(regNumber operandReg, regNumber tmpReg);
void genPutArgStkSIMD12(GenTree* treeNode);
#endif // _TARGET_X86_
#endif // FEATURE_SIMD
// storing of TYP_SIMD12 (i.e. Vector3) field
if (treeNode->TypeGet() == TYP_SIMD12)
{
- genStoreLclFldTypeSIMD12(treeNode);
+ genStoreLclTypeSIMD12(treeNode);
break;
}
-#endif
+#endif // FEATURE_SIMD
+
GenTreePtr op1 = treeNode->gtGetOp1();
genConsumeRegs(op1);
emit->emitInsBinary(ins_Store(targetType), emitTypeSize(treeNode), treeNode, op1);
#endif // !defined(_TARGET_64BIT_)
#ifdef FEATURE_SIMD
+ // storing of TYP_SIMD12 (i.e. Vector3) field
+ if (treeNode->TypeGet() == TYP_SIMD12)
+ {
+ genStoreLclTypeSIMD12(treeNode);
+ break;
+ }
+
if (varTypeIsSIMD(targetType) && (targetReg != REG_NA) && op1->IsCnsIntOrI())
{
// This is only possible for a zero-init.
#ifdef _TARGET_X86_
//---------------------------------------------------------------------
-// adjustStackForPutArgStk:
+// genAdjustStackForPutArgStk:
// adjust the stack pointer for a putArgStk node if necessary.
//
// Arguments:
//
// Returns: true if the stack pointer was adjusted; false otherwise.
//
+// Notes:
+// Sets `m_pushStkArg` to true if the stack arg needs to be pushed,
+// false if the stack arg needs to be stored at the current stack
+// pointer address. This is exactly the opposite of the return value
+// of this function.
+//
bool CodeGen::genAdjustStackForPutArgStk(GenTreePutArgStk* putArgStk)
{
#ifdef FEATURE_SIMD
}
//---------------------------------------------------------------------
-// genPutArgStkFieldList - generate code for passing an arg on the stack.
+// genPutArgStkFieldList - generate code for passing a GT_FIELD_LIST arg on the stack.
//
// Arguments
-// treeNode - the GT_PUTARG_STK node
-// targetType - the type of the treeNode
+// treeNode - the GT_PUTARG_STK node whose op1 is a GT_FIELD_LIST
//
// Return value:
// None
// Set m_pushStkArg and pre-adjust the stack if necessary.
const bool preAdjustedStack = genAdjustStackForPutArgStk(putArgStk);
+
// For now, we only support the "push" case; we will push a full slot for the first field of each slot
// within the struct.
assert((putArgStk->isPushKind()) && !preAdjustedStack && m_pushStkArg);
- // If we have pre-adjusted the stack and are simply storing the fields in order) set the offset to 0.
+ // If we have pre-adjusted the stack and are simply storing the fields in order, set the offset to 0.
// (Note that this mode is not currently being used.)
// If we are pushing the arguments (i.e. we have not pre-adjusted the stack), then we are pushing them
// in reverse order, so we start with the current field offset at the size of the struct arg (which must be
// a multiple of the target pointer size).
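// For example, for a 12-byte struct of three ints, the fields are visited at offsets 8, 4 and then 0;
// each push moves currentOffset down by one slot, and after the last push ESP points at the field at
// offset 0, so the argument ends up laid out in memory order.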
unsigned currentOffset = (preAdjustedStack) ? 0 : putArgStk->getArgSize();
unsigned prevFieldOffset = currentOffset;
- regNumber tmpReg = REG_NA;
+ regNumber intTmpReg = REG_NA;
+ regNumber simdTmpReg = REG_NA;
if (putArgStk->gtRsvdRegs != RBM_NONE)
{
- assert(genCountBits(putArgStk->gtRsvdRegs) == 1);
- tmpReg = genRegNumFromMask(putArgStk->gtRsvdRegs);
- assert(genIsValidIntReg(tmpReg));
+ regMaskTP rsvdRegs = putArgStk->gtRsvdRegs;
+ if ((rsvdRegs & RBM_ALLINT) != 0)
+ {
+ intTmpReg = genRegNumFromMask(rsvdRegs & RBM_ALLINT);
+ assert(genIsValidIntReg(intTmpReg));
+ }
+ if ((rsvdRegs & RBM_ALLFLOAT) != 0)
+ {
+ simdTmpReg = genRegNumFromMask(rsvdRegs & RBM_ALLFLOAT);
+ assert(genIsValidFloatReg(simdTmpReg));
+ }
+ assert(genCountBits(rsvdRegs) == ((intTmpReg == REG_NA) ? 0 : 1) + ((simdTmpReg == REG_NA) ? 0 : 1));
}
+
for (GenTreeFieldList* current = fieldList; current != nullptr; current = current->Rest())
{
GenTree* const fieldNode = current->Current();
// able to detect stores into the outgoing argument area of the stack on x86.
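// A field counts as a 'slot' if it starts on a 4-byte boundary and the previously processed
// (higher-offset) field is at least 4 bytes away, so the full 4-byte slot at this offset belongs
// entirely to this field.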
const bool fieldIsSlot = ((fieldOffset % 4) == 0) && ((prevFieldOffset - fieldOffset) >= 4);
int adjustment = roundUp(currentOffset - fieldOffset, 4);
- if (fieldIsSlot)
+ if (fieldIsSlot && !varTypeIsSIMD(fieldType))
{
fieldType = genActualType(fieldType);
unsigned pushSize = genTypeSize(fieldType);
else
{
m_pushStkArg = false;
+
// We always "push" floating point fields (i.e. they are full slot values that don't
// require special handling).
- assert(varTypeIsIntegralOrI(fieldNode));
+ assert(varTypeIsIntegralOrI(fieldNode) || varTypeIsSIMD(fieldNode));
+
// If we can't push this field, it needs to be in a register so that we can store
// it to the stack location.
- assert(tmpReg != REG_NA);
if (adjustment != 0)
{
// This moves the stack pointer to fieldOffset.
}
// Does it need to be in a byte register?
- // If so, we'll use tmpReg, which must have been allocated as a byte register.
+ // If so, we'll use intTmpReg, which must have been allocated as a byte register.
// If it's already in a register, but not a byteable one, then move it.
if (varTypeIsByte(fieldType) && ((argReg == REG_NA) || ((genRegMask(argReg) & RBM_BYTE_REGS) == 0)))
{
- noway_assert((genRegMask(tmpReg) & RBM_BYTE_REGS) != 0);
+ assert(intTmpReg != REG_NA);
+ noway_assert((genRegMask(intTmpReg) & RBM_BYTE_REGS) != 0);
if (argReg != REG_NA)
{
- inst_RV_RV(INS_mov, tmpReg, argReg, fieldType);
- argReg = tmpReg;
+ inst_RV_RV(INS_mov, intTmpReg, argReg, fieldType);
+ argReg = intTmpReg;
}
}
}
{
if (fieldNode->isUsedFromSpillTemp())
{
+ assert(!varTypeIsSIMD(fieldType)); // Q: can we get here with SIMD?
assert(fieldNode->IsRegOptional());
TempDsc* tmp = getSpillTempDsc(fieldNode);
getEmitter()->emitIns_S(INS_push, emitActualTypeSize(fieldNode->TypeGet()), tmp->tdTempNum(), 0);
}
else
{
- // The stack has been adjusted and we will load the field to tmpReg and then store it on the stack.
+ // The stack has been adjusted and we will load the field to intTmpReg and then store it on the stack.
assert(varTypeIsIntegralOrI(fieldNode));
switch (fieldNode->OperGet())
{
case GT_LCL_VAR:
- inst_RV_TT(INS_mov, tmpReg, fieldNode);
+ inst_RV_TT(INS_mov, intTmpReg, fieldNode);
break;
case GT_CNS_INT:
- genSetRegToConst(tmpReg, fieldNode->TypeGet(), fieldNode);
+ genSetRegToConst(intTmpReg, fieldNode->TypeGet(), fieldNode);
break;
default:
unreached();
}
- genStoreRegToStackArg(fieldType, tmpReg, fieldOffset - currentOffset);
+ genStoreRegToStackArg(fieldType, intTmpReg, fieldOffset - currentOffset);
}
}
else
{
- genStoreRegToStackArg(fieldType, argReg, fieldOffset - currentOffset);
+#if defined(_TARGET_X86_) && defined(FEATURE_SIMD)
+ if (fieldType == TYP_SIMD12)
+ {
+ assert(genIsValidFloatReg(simdTmpReg));
+ genStoreSIMD12ToStack(argReg, simdTmpReg);
+ }
+ else
+#endif // defined(_TARGET_X86_) && defined(FEATURE_SIMD)
+ {
+ genStoreRegToStackArg(fieldType, argReg, fieldOffset - currentOffset);
+ }
if (m_pushStkArg)
{
// We always push a slot-rounded size
#ifdef _TARGET_X86_
-#ifdef FEATURE_SIMD
- if (targetType == TYP_SIMD12)
- {
- genPutArgStkSIMD12(putArgStk);
- return;
- }
-#endif // FEATURE_SIMD
-
if (varTypeIsStruct(targetType))
{
(void)genAdjustStackForPutArgStk(putArgStk);
{
var_types targetType = putArgStk->TypeGet();
+#if defined(_TARGET_X86_) && defined(FEATURE_SIMD)
+ if (targetType == TYP_SIMD12)
+ {
+ genPutArgStkSIMD12(putArgStk);
+ return;
+ }
+#endif // defined(_TARGET_X86_) && defined(FEATURE_SIMD)
+
if (varTypeIsSIMD(targetType))
{
regNumber srcReg = genConsumeReg(putArgStk->gtGetOp1());
#endif // defined(_TARGET_64BIT_)
}
- unsigned lvSize() // Size needed for storage representation. Only used for structs or TYP_BLK.
+ unsigned lvSize() const // Size needed for storage representation. Only used for structs or TYP_BLK.
{
// TODO-Review: Sometimes we get called on ARM with HFA struct variables that have been promoted,
// where the struct itself is no longer used because all access is via its member fields.
#if defined(FEATURE_SIMD) && !defined(_TARGET_64BIT_)
// For 32-bit architectures, we make local variable SIMD12 types 16 bytes instead of just 12. We can't do
- // this for arguments, which must be passed according the defined ABI.
+ // this for arguments, which must be passed according to the defined ABI. We don't want to do this for
+ // dependently promoted struct fields, but we don't know that here. See lvaMapSimd12ToSimd16().
if ((lvType == TYP_SIMD12) && !lvIsParam)
{
assert(lvExactSize == 12);
SIMDIntrinsicID simdIntrinsicID,
var_types baseType,
unsigned size);
+ void SetOpLclRelatedToSIMDIntrinsic(GenTreePtr op);
#endif
GenTreePtr gtNewLclLNode(unsigned lnum, var_types type, IL_OFFSETX ILoffs = BAD_IL_OFFSET);
bool lvaIsFieldOfDependentlyPromotedStruct(const LclVarDsc* varDsc);
bool lvaIsGCTracked(const LclVarDsc* varDsc);
+#if defined(FEATURE_SIMD)
+ bool lvaMapSimd12ToSimd16(const LclVarDsc* varDsc)
+ {
+ assert(varDsc->lvType == TYP_SIMD12);
+ assert(varDsc->lvExactSize == 12);
+
+#if defined(_TARGET_64BIT_)
+ assert(varDsc->lvSize() == 16);
+ return true;
+#else // !defined(_TARGET_64BIT_)
+
+ // For 32-bit architectures, we make local variable SIMD12 types 16 bytes instead of just 12. lvSize()
+ // already does this calculation. However, we also need to avoid mapping the type if the var is a
+ // dependently promoted struct field, which must retain its exact size within its parent struct.
+ // We don't know this until late, though, so we may have already pretended the field is bigger
+ // before that.
+ if ((varDsc->lvSize() == 16) && !lvaIsFieldOfDependentlyPromotedStruct(varDsc))
+ {
+ return true;
+ }
+ else
+ {
+ return false;
+ }
+
+#endif // !defined(_TARGET_64BIT_)
+ }
+#endif // defined(FEATURE_SIMD)
+
BYTE* lvaGetGcLayout(unsigned varNum);
bool lvaTypeIsGC(unsigned varNum);
unsigned lvaGSSecurityCookie; // LclVar number
nextNode = DecomposeRotate(use);
break;
+#ifdef FEATURE_SIMD
+ case GT_SIMD:
+ nextNode = DecomposeSimd(use);
+ break;
+#endif // FEATURE_SIMD
+
case GT_LOCKADD:
case GT_XADD:
case GT_XCHG:
return FinalizeDecomposition(use, loResult, hiResult, hiResult);
}
+#ifdef FEATURE_SIMD
+
+//------------------------------------------------------------------------
+// DecomposeSimd: Decompose GT_SIMD.
+//
+// Arguments:
+// use - the LIR::Use object for the def that needs to be decomposed.
+//
+// Return Value:
+// The next node to process.
+//
+GenTree* DecomposeLongs::DecomposeSimd(LIR::Use& use)
+{
+ GenTree* tree = use.Def();
+ genTreeOps oper = tree->OperGet();
+
+ assert(oper == GT_SIMD);
+
+ GenTreeSIMD* simdTree = tree->AsSIMD();
+
+ switch (simdTree->gtSIMDIntrinsicID)
+ {
+ case SIMDIntrinsicGetItem:
+ return DecomposeSimdGetItem(use);
+
+ default:
+ noway_assert(!"unexpected GT_SIMD node in long decomposition");
+ break;
+ }
+
+ return nullptr;
+}
+
+//------------------------------------------------------------------------
+// DecomposeSimdGetItem: Decompose GT_SIMD -- SIMDIntrinsicGetItem.
+//
+// Decompose a get[i] node on Vector<long>. For:
+//
+// GT_SIMD{get_item}[long](simd_var, index)
+//
+// create:
+//
+// tmp_simd_var = simd_var
+// tmp_index = index
+// loResult = GT_SIMD{get_item}[int](tmp_simd_var, tmp_index * 2)
+// hiResult = GT_SIMD{get_item}[int](tmp_simd_var, tmp_index * 2 + 1)
+// return: GT_LONG(loResult, hiResult)
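+//
+// For example, on little-endian x86 with a 16-byte vector, element 1 of a Vector<long> is read
+// as int element 2 (the low half) and int element 3 (the high half).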
+//
+// This isn't optimal codegen, since SIMDIntrinsicGetItem sometimes requires
+// temps that could be shared, for example.
+//
+// Arguments:
+// use - the LIR::Use object for the def that needs to be decomposed.
+//
+// Return Value:
+// The next node to process.
+//
+GenTree* DecomposeLongs::DecomposeSimdGetItem(LIR::Use& use)
+{
+ GenTree* tree = use.Def();
+ genTreeOps oper = tree->OperGet();
+
+ assert(oper == GT_SIMD);
+
+ GenTreeSIMD* simdTree = tree->AsSIMD();
+ var_types baseType = simdTree->gtSIMDBaseType;
+ unsigned simdSize = simdTree->gtSIMDSize;
+
+ assert(simdTree->gtSIMDIntrinsicID == SIMDIntrinsicGetItem);
+ assert(varTypeIsLong(baseType));
+ assert(varTypeIsLong(simdTree));
+ assert(varTypeIsSIMD(simdTree->gtOp.gtOp1->gtType));
+ assert(simdTree->gtOp.gtOp2->gtType == TYP_INT);
+
+ LIR::Use op1(Range(), &simdTree->gtOp.gtOp1, simdTree);
+ unsigned simdTmpVarNum = op1.ReplaceWithLclVar(m_compiler, m_blockWeight);
+ JITDUMP("[DecomposeSimdGetItem]: Saving op1 tree to a temp var:\n");
+ DISPTREERANGE(Range(), op1.Def());
+
+ LIR::Use op2(Range(), &simdTree->gtOp.gtOp2, simdTree);
+ unsigned indexTmpVarNum = op2.ReplaceWithLclVar(m_compiler, m_blockWeight);
+ JITDUMP("[DecomposeSimdGetItem]: Saving op2 tree to a temp var:\n");
+ DISPTREERANGE(Range(), op2.Def());
+
+ // TODO-CQ: if the index is constant, we don't need to do the computation dynamically.
+
+ // Create:
+ // loResult = GT_SIMD{get_item}[int](tmp_simd_var, tmp_index * 2)
+
+ GenTree* simdTmpVar1 = m_compiler->gtNewLclLNode(simdTmpVarNum, simdTree->gtOp.gtOp1->gtType);
+ GenTree* indexTmpVar1 = m_compiler->gtNewLclLNode(indexTmpVarNum, TYP_INT);
+ GenTree* two1 = m_compiler->gtNewIconNode(2, TYP_INT);
+ GenTree* indexTimesTwo1 = m_compiler->gtNewOperNode(GT_MUL, TYP_INT, indexTmpVar1, two1);
+
+ GenTree* loResult =
+ m_compiler->gtNewSIMDNode(TYP_INT, simdTmpVar1, indexTimesTwo1, SIMDIntrinsicGetItem, TYP_INT, simdSize);
+
+ // Create:
+ // hiResult = GT_SIMD{get_item}[int](tmp_simd_var, tmp_index * 2 + 1)
+
+ GenTree* simdTmpVar2 = m_compiler->gtNewLclLNode(simdTmpVarNum, simdTree->gtOp.gtOp1->gtType);
+ GenTree* indexTmpVar2 = m_compiler->gtNewLclLNode(indexTmpVarNum, TYP_INT);
+ GenTree* two2 = m_compiler->gtNewIconNode(2, TYP_INT);
+ GenTree* indexTimesTwo2 = m_compiler->gtNewOperNode(GT_MUL, TYP_INT, indexTmpVar2, two2);
+ GenTree* one = m_compiler->gtNewIconNode(1, TYP_INT);
+ GenTree* indexTimesTwoPlusOne = m_compiler->gtNewOperNode(GT_ADD, TYP_INT, indexTimesTwo2, one);
+
+ GenTree* hiResult =
+ m_compiler->gtNewSIMDNode(TYP_INT, simdTmpVar2, indexTimesTwoPlusOne, SIMDIntrinsicGetItem, TYP_INT, simdSize);
+
+ // Put all the new nodes in execution order.
+
+ Range().InsertBefore(tree, simdTmpVar1, indexTmpVar1, two1, indexTimesTwo1);
+ Range().InsertBefore(tree, loResult, simdTmpVar2, indexTmpVar2, two2);
+ Range().InsertBefore(tree, indexTimesTwo2, one, indexTimesTwoPlusOne, hiResult);
+
+ Range().Remove(tree);
+
+ return FinalizeDecomposition(use, loResult, hiResult, hiResult);
+}
+
+#endif // FEATURE_SIMD
+
//------------------------------------------------------------------------
// StoreNodeToVar: Check if the user is a STORE_LCL_VAR, and if it isn't,
// store the node to a var. Then decompose the new LclVar.
GenTree* DecomposeRotate(LIR::Use& use);
GenTree* DecomposeMul(LIR::Use& use);
GenTree* DecomposeUMod(LIR::Use& use);
+ GenTree* DecomposeSimd(LIR::Use& use);
+ GenTree* DecomposeSimdGetItem(LIR::Use& use);
// Helper functions
GenTree* FinalizeDecomposition(LIR::Use& use, GenTree* loResult, GenTree* hiResult, GenTree* insertResultAfter);
{
if (JitConfig.EnableAVX() != 0)
{
+ JITDUMP("getMaxIntrinsicSIMDVectorLength: returning 32\n");
return 32;
}
}
#endif // FEATURE_AVX_SUPPORT
+ JITDUMP("getMaxIntrinsicSIMDVectorLength: returning 16\n");
return 16;
#endif // _TARGET_XARCH_
#else // !FEATURE_SIMD
+ JITDUMP("getMaxIntrinsicSIMDVectorLength: returning 0\n");
return 0;
#endif // !FEATURE_SIMD
}
#endif
}
-#ifdef _TARGET_AMD64_
-#define REX_PREFIX_MASK 0xFF00000000LL
-#endif // _TARGET_AMD64_
-
#ifdef FEATURE_AVX_SUPPORT
// Returns true if the AVX instruction is a binary operator that requires 3 operands.
// When we emit an instruction with only two operands, we will duplicate the destination
return 3;
}
-#ifdef _TARGET_AMD64_
- if (code & REX_PREFIX_MASK)
+ if (hasRexPrefix(code))
{
return 1;
}
-#endif // _TARGET_AMD64_
return 0;
}
}
}
-#ifdef _TARGET_AMD64_
size += emitGetVexPrefixAdjustedSize(ins, attrSize, code);
- if (code & REX_PREFIX_MASK)
+ if (hasRexPrefix(code))
{
// REX prefix
size += emitGetRexPrefixSize(ins);
// Should have a REX byte
size += emitGetRexPrefixSize(ins);
}
-#endif // _TARGET_AMD64_
if (rgx == REG_NA)
{
}
#endif // DEBUG
-#ifdef _TARGET_AMD64_
- assert((code & REX_PREFIX_MASK) == 0); // Can't have a REX bit with no operands, right?
-#endif // _TARGET_AMD64_
+ assert(!hasRexPrefix(code)); // Can't have a REX bit with no operands, right?
if (code & 0xFF000000)
{
code_t code = insCodeMI(ins);
UNATIVE_OFFSET sz = emitInsSizeCV(id, code, val);
-#ifdef _TARGET_AMD64_
// Vex prefix
sz += emitGetVexPrefixAdjustedSize(ins, attr, insCodeMI(ins));
// REX prefix, if not already included in "code"
- if (TakesRexWPrefix(ins, attr) && (code & REX_PREFIX_MASK) == 0)
+ if (TakesRexWPrefix(ins, attr) && !hasRexPrefix(code))
{
sz += emitGetRexPrefixSize(ins);
}
-#endif // _TARGET_AMD64_
id->idAddr()->iiaFieldHnd = fldHnd;
id->idCodeSize(sz);
}
bool Is4ByteSSE4Instruction(instruction ins);
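+// hasRexPrefix: returns true if the given instruction encoding already carries a REX prefix.
+// On AMD64 the emitter keeps the REX prefix bits above the low four opcode bytes of the code,
+// which is what the 0xFF00000000 mask below tests; on x86 there is never a REX prefix.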
+bool hasRexPrefix(code_t code)
+{
+#ifdef _TARGET_AMD64_
+ const code_t REX_PREFIX_MASK = 0xFF00000000LL;
+ return (code & REX_PREFIX_MASK) != 0;
+#else // !_TARGET_AMD64_
+ return false;
+#endif // !_TARGET_AMD64_
+}
+
#ifdef FEATURE_AVX_SUPPORT
// 3-byte VEX prefix starts with byte 0xC4
}
bool Is4ByteAVXInstruction(instruction ins);
#else // !FEATURE_AVX_SUPPORT
-bool UseAVX()
+bool UseAVX()
{
return false;
}
if (dst->OperIsLocal() && varTypeIsStruct(dst))
{
- unsigned lclNum = dst->AsLclVarCommon()->GetLclNum();
- LclVarDsc* lclVarDsc = &lvaTable[lclNum];
- lclVarDsc->lvUsedInSIMDIntrinsic = true;
+ setLclRelatedToSIMDIntrinsic(dst);
}
}
}
GenTreeSIMD* Compiler::gtNewSIMDNode(
var_types type, GenTreePtr op1, SIMDIntrinsicID simdIntrinsicID, var_types baseType, unsigned size)
{
- // TODO-CQ: An operand may be a GT_OBJ(GT_ADDR(GT_LCL_VAR))), in which case it should be
- // marked lvUsedInSIMDIntrinsic.
assert(op1 != nullptr);
- if (op1->OperGet() == GT_LCL_VAR)
- {
- unsigned lclNum = op1->AsLclVarCommon()->GetLclNum();
- LclVarDsc* lclVarDsc = &lvaTable[lclNum];
- lclVarDsc->lvUsedInSIMDIntrinsic = true;
- }
+ SetOpLclRelatedToSIMDIntrinsic(op1);
return new (this, GT_SIMD) GenTreeSIMD(type, op1, simdIntrinsicID, baseType, size);
}
GenTreeSIMD* Compiler::gtNewSIMDNode(
var_types type, GenTreePtr op1, GenTreePtr op2, SIMDIntrinsicID simdIntrinsicID, var_types baseType, unsigned size)
{
- // TODO-CQ: An operand may be a GT_OBJ(GT_ADDR(GT_LCL_VAR))), in which case it should be
- // marked lvUsedInSIMDIntrinsic.
assert(op1 != nullptr);
- if (op1->OperIsLocal())
+ SetOpLclRelatedToSIMDIntrinsic(op1);
+ if (op2 != nullptr)
{
- unsigned lclNum = op1->AsLclVarCommon()->GetLclNum();
- LclVarDsc* lclVarDsc = &lvaTable[lclNum];
- lclVarDsc->lvUsedInSIMDIntrinsic = true;
+ SetOpLclRelatedToSIMDIntrinsic(op2);
}
- if (op2 != nullptr && op2->OperIsLocal())
+ return new (this, GT_SIMD) GenTreeSIMD(type, op1, op2, simdIntrinsicID, baseType, size);
+}
+
+//-------------------------------------------------------------------
+// SetOpLclRelatedToSIMDIntrinsic: Determine if the tree has a local var that needs to be set
+// as used by a SIMD intrinsic, and if so, set that local var appropriately.
+//
+// Arguments:
+// op - The tree, to be an operand of a new GT_SIMD node, to check.
+//
+void Compiler::SetOpLclRelatedToSIMDIntrinsic(GenTreePtr op)
+{
+ if (op->OperIsLocal())
{
- unsigned lclNum = op2->AsLclVarCommon()->GetLclNum();
- LclVarDsc* lclVarDsc = &lvaTable[lclNum];
- lclVarDsc->lvUsedInSIMDIntrinsic = true;
+ setLclRelatedToSIMDIntrinsic(op);
+ }
+ else if ((op->OperGet() == GT_OBJ) && (op->gtOp.gtOp1->OperGet() == GT_ADDR) &&
+ op->gtOp.gtOp1->gtOp.gtOp1->OperIsLocal())
+ {
+ setLclRelatedToSIMDIntrinsic(op->gtOp.gtOp1->gtOp.gtOp1);
}
-
- return new (this, GT_SIMD) GenTreeSIMD(type, op1, op2, simdIntrinsicID, baseType, size);
}
bool GenTree::isCommutativeSIMDIntrinsic()
{
*pSimdBaseType = simdBaseType;
}
-#ifdef _TARGET_AMD64_
- // Amd64: also indicate that we use floating point registers
+ // Also indicate that we use floating point registers.
compFloatingPointUsed = true;
-#endif
}
}
}
{
// On SSE2/AVX - the same instruction is used for moving double/quad word to XMM/YMM register.
assert((srcType == TYP_INT) || (srcType == TYP_UINT) || (srcType == TYP_LONG) || (srcType == TYP_ULONG));
+
+#if !defined(_TARGET_64BIT_)
+ // No 64-bit registers on x86.
+ assert((srcType != TYP_LONG) && (srcType != TYP_ULONG));
+#endif // !defined(_TARGET_64BIT_)
+
return INS_mov_i2xmm;
}
{
// On SSE2/AVX - the same instruction is used for moving double/quad word of XMM/YMM to an integer register.
assert((dstType == TYP_INT) || (dstType == TYP_UINT) || (dstType == TYP_LONG) || (dstType == TYP_ULONG));
+
+#if !defined(_TARGET_64BIT_)
+ // No 64-bit registers on x86.
+ assert((dstType != TYP_LONG) && (dstType != TYP_ULONG));
+#endif // !defined(_TARGET_64BIT_)
+
return INS_mov_xmm2i;
}
CONFIG_INTEGER(EnableSSE3_4, W("EnableSSE3_4"), 1) // Enable SSE3, SSSE3, SSE 4.1 and 4.2 instruction set as default
#endif
-#if defined(_TARGET_AMD64_)
-CONFIG_INTEGER(EnableAVX, W("EnableAVX"), 1) // Enable AVX instruction set for wide operations as default.
-// When both AVX and SSE3_4 are set, we will use the most capable instruction set available
-// which will prefer AVX over SSE3/4.
-#else // !defined(_TARGET_AMD64_)
-CONFIG_INTEGER(EnableAVX, W("EnableAVX"), 0) // Enable AVX instruction set for wide operations as default
-#endif // defined(_TARGET_AMD64_)
+#if defined(_TARGET_AMD64_) || defined(_TARGET_X86_)
+// Enable AVX instruction set for wide operations as default. When both AVX and SSE3_4 are set, we will use the most
+// capable instruction set available which will prefer AVX over SSE3/4.
+CONFIG_INTEGER(EnableAVX, W("EnableAVX"), 1)
+#else // !defined(_TARGET_AMD64_) && !defined(_TARGET_X86_)
+// Enable AVX instruction set for wide operations as default
+CONFIG_INTEGER(EnableAVX, W("EnableAVX"), 0)
+#endif // !defined(_TARGET_AMD64_) && !defined(_TARGET_X86_)
#if !defined(DEBUG) && !defined(_DEBUG)
CONFIG_INTEGER(JitEnableNoWayAssert, W("JitEnableNoWayAssert"), 0)
unsigned varNum = node->AsLclVarCommon()->GetLclNum();
LclVarDsc* varDsc = &comp->lvaTable[varNum];
-#if defined(_TARGET_64BIT_)
- assert(varDsc->lvSize() == 16);
- node->gtType = TYP_SIMD16;
-#else // !_TARGET_64BIT_
- if (varDsc->lvSize() == 16)
+ if (comp->lvaMapSimd12ToSimd16(varDsc))
{
+ JITDUMP("Mapping TYP_SIMD12 lclvar node to TYP_SIMD16:\n");
+ DISPNODE(node);
+ JITDUMP("============");
+
node->gtType = TYP_SIMD16;
}
- else
- {
- // The following assert is guaranteed by lvSize().
- assert(varDsc->lvIsParam);
- }
-#endif // !_TARGET_64BIT_
}
#endif // FEATURE_SIMD
__fallthrough;
m_block = block;
for (GenTree* node : BlockRange().NonPhiNodes())
{
-/* We increment the number position of each tree node by 2 to
-* simplify the logic when there's the case of a tree that implicitly
-* does a dual-definition of temps (the long case). In this case
-* is easier to already have an idle spot to handle a dual-def instead
-* of making some messy adjustments if we only increment the
-* number position by one.
-*/
+ // We increment the number position of each tree node by 2 to simplify the logic when there's the case of
+ // a tree that implicitly does a dual-definition of temps (the long case). In this case it is easier to
+ // already have an idle spot to handle a dual-def instead of making some messy adjustments if we only
+ // increment the number position by one.
+ CLANG_FORMAT_COMMENT_ANCHOR;
+
#ifdef DEBUG
node->gtSeqNum = currentLoc;
#endif
// ComputeAvailableSrcCount: computes the number of registers available as
// sources for a node.
//
-// This is simply the sum of the number of registers prduced by each
+// This is simply the sum of the number of registers produced by each
// operand to the node.
//
// Arguments:
return numSources;
}
-#endif
+#endif // DEBUG
void LinearScan::buildRefPositionsForNode(GenTree* tree,
BasicBlock* block,
// InitBlk
MakeSrcContained(storeLoc, op1);
}
- else if ((storeLoc->TypeGet() == TYP_SIMD12) && (storeLoc->OperGet() == GT_STORE_LCL_FLD))
+ else if (storeLoc->TypeGet() == TYP_SIMD12)
{
// Need an additional register to extract upper 4 bytes of Vector3.
info->internalFloatCount = 1;
{
unsigned fieldCount = 0;
bool needsByteTemp = false;
+ bool needsSimdTemp = false;
unsigned prevOffset = putArgStk->getArgSize();
for (GenTreeFieldList* current = putArgStk->gtOp1->AsFieldList(); current != nullptr; current = current->Rest())
{
SetRegOptional(fieldNode);
}
}
+#if defined(FEATURE_SIMD)
+ // Note that we need to check the GT_FIELD_LIST type, not the fieldType. This is because the
+ // GT_FIELD_LIST will be TYP_SIMD12 whereas the fieldType might be TYP_SIMD16 for lclVar, where
+ // we "round up" to 16.
+ else if (current->gtFieldType == TYP_SIMD12)
+ {
+ needsSimdTemp = true;
+ }
+#endif // defined(FEATURE_SIMD)
else
{
- assert(varTypeIsFloating(fieldNode));
+ assert(varTypeIsFloating(fieldNode) || varTypeIsSIMD(fieldNode));
}
// We can treat as a slot any field that is stored at a slot boundary, where the previous
}
info->setInternalCandidates(l, regMask);
}
+
+#if defined(FEATURE_SIMD)
+ // For PutArgStk of a TYP_SIMD12, we need a SIMD temp register.
+ if (needsSimdTemp)
+ {
+ info->internalFloatCount += 1;
+ info->addInternalCandidates(l, l->allSIMDRegs());
+ }
+#endif // defined(FEATURE_SIMD)
+
return;
}
#endif // _TARGET_X86_
case SIMDIntrinsicInit:
{
- info->srcCount = 1;
- op1 = tree->gtOp.gtOp1;
+ op1 = tree->gtOp.gtOp1;
+
+#if !defined(_TARGET_64BIT_)
+ if (op1->OperGet() == GT_LONG)
+ {
+ info->srcCount = 2;
+ }
+ else
+#endif // !defined(_TARGET_64BIT_)
+ {
+ info->srcCount = 1;
+ }
// This sets all fields of a SIMD struct to the given value.
// Mark op1 as contained if it is either zero or int constant of all 1's,
// Should never see small int base type vectors except for zero initialization.
assert(!varTypeIsSmallInt(simdTree->gtSIMDBaseType) || op1->IsIntegralConst(0));
- if (op1->IsFPZero() || op1->IsIntegralConst(0) ||
- (varTypeIsIntegral(simdTree->gtSIMDBaseType) && op1->IsIntegralConst(-1)))
+#if !defined(_TARGET_64BIT_)
+ if (op1->OperGet() == GT_LONG)
{
- MakeSrcContained(tree, tree->gtOp.gtOp1);
+ GenTree* op1lo = op1->gtGetOp1();
+ GenTree* op1hi = op1->gtGetOp2();
+
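+ // If both halves are the all-zeros or all-ones constant, codegen materializes the vector
+ // directly (see the GT_LONG case in genSIMDIntrinsicInit), so neither the GT_LONG nor its
+ // halves need to produce a register; otherwise a SIMD temp is needed to assemble the halves.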
+ if ((op1lo->IsIntegralConst(0) && op1hi->IsIntegralConst(0)) ||
+ (op1lo->IsIntegralConst(-1) && op1hi->IsIntegralConst(-1)))
+ {
+ assert(op1->gtLsraInfo.srcCount == 0);
+ assert(op1->gtLsraInfo.dstCount == 0);
+ assert(op1lo->gtLsraInfo.srcCount == 0);
+ assert(op1lo->gtLsraInfo.dstCount == 1);
+ assert(op1hi->gtLsraInfo.srcCount == 0);
+ assert(op1hi->gtLsraInfo.dstCount == 1);
+
+ op1lo->gtLsraInfo.dstCount = 0;
+ op1hi->gtLsraInfo.dstCount = 0;
+ info->srcCount = 0;
+ }
+ else
+ {
+ // need a temp
+ info->internalFloatCount = 1;
+ info->setInternalCandidates(lsra, lsra->allSIMDRegs());
+ info->isInternalRegDelayFree = true;
+ }
+ }
+ else
+#endif // !defined(_TARGET_64BIT_)
+ if (op1->IsFPZero() || op1->IsIntegralConst(0) ||
+ (varTypeIsIntegral(simdTree->gtSIMDBaseType) && op1->IsIntegralConst(-1)))
+ {
+ MakeSrcContained(tree, op1);
info->srcCount = 0;
}
else if ((comp->getSIMDInstructionSet() == InstructionSet_AVX) &&
// Either op1 is a float or dbl constant or an addr
if (op1->IsCnsFltOrDbl() || op1->OperIsLocalAddr())
{
- MakeSrcContained(tree, tree->gtOp.gtOp1);
+ MakeSrcContained(tree, op1);
info->srcCount = 0;
}
}
info->srcCount = 2;
// On SSE4/AVX, we can generate optimal code for (in)equality
- // against zero using ptest. We can safely do the this optimization
+ // against zero using ptest. We can safely do this optimization
// for integral vectors but not for floating-point for the reason
// that we have +0.0 and -0.0 and +0.0 == -0.0
op2 = tree->gtGetOp2();
}
else
{
-
// Need one SIMD register as scratch.
// See genSIMDIntrinsicRelOp() for details on code sequence generated and
// the need for one scratch register.
return false;
}
}
+#ifdef FEATURE_SIMD
+ else if (tree->OperGet() == GT_SIMD)
+ {
+ GenTreeSIMD* simdNode = tree->AsSIMD();
+ switch (simdNode->gtSIMDIntrinsicID)
+ {
+ case SIMDIntrinsicOpEquality:
+ case SIMDIntrinsicOpInEquality:
+ // We manifest it into a byte register, so the target must be byteable.
+ return true;
+
+ case SIMDIntrinsicGetItem:
+ {
+ // This logic is duplicated from genSIMDIntrinsicGetItem().
+ // When we generate code for a SIMDIntrinsicGetItem, under certain circumstances we need to
+ // generate a movzx/movsx. On x86, these require byteable registers. So figure out which
+ // cases will require this, so the non-byteable registers can be excluded.
+
+ GenTree* op1 = simdNode->gtGetOp1();
+ GenTree* op2 = simdNode->gtGetOp2();
+ var_types baseType = simdNode->gtSIMDBaseType;
+ if (!op1->isMemoryOp() && op2->IsCnsIntOrI() && varTypeIsSmallInt(baseType))
+ {
+ bool ZeroOrSignExtnReqd = true;
+ unsigned baseSize = genTypeSize(baseType);
+ if (baseSize == 1)
+ {
+ if ((op2->gtIntCon.gtIconVal % 2) == 1)
+ {
+ ZeroOrSignExtnReqd = (baseType == TYP_BYTE);
+ }
+ }
+ else
+ {
+ assert(baseSize == 2);
+ ZeroOrSignExtnReqd = (baseType == TYP_SHORT);
+ }
+ return ZeroOrSignExtnReqd;
+ }
+ break;
+ }
+
+ default:
+ break;
+ }
+ return false;
+ }
+#endif // FEATURE_SIMD
else
{
return false;
return;
}
+#ifdef DEBUG
+ if (verbose)
+ {
+ printf("\nlvaTable before fgPromoteStructs\n");
+ lvaTableDump();
+ }
+#endif // DEBUG
+
// The lvaTable might grow as we grab temps. Make a local copy here.
unsigned startLvaCount = lvaCount;
bool promotedVar = false;
LclVarDsc* varDsc = &lvaTable[lclNum];
-#ifdef FEATURE_SIMD
- if (varDsc->lvSIMDType && varDsc->lvUsedInSIMDIntrinsic)
+ if (varDsc->lvIsSIMDType() && varDsc->lvIsUsedInSIMDIntrinsic())
{
// If we have marked this as lvUsedInSIMDIntrinsic, then we do not want to promote
// its fields. Instead, we will attempt to enregister the entire struct.
varDsc->lvRegStruct = true;
}
- else
-#endif // FEATURE_SIMD
- // Don't promote if we have reached the tracking limit.
- if (lvaHaveManyLocals())
+ else if (lvaHaveManyLocals()) // Don't promote if we have reached the tracking limit.
{
// Print the message first time when we detected this condition
if (!tooManyLocals)
if (canPromote)
{
-
// We *can* promote; *should* we promote?
// We should only do so if promotion has potential savings. One source of savings
// is if a field of the struct is accessed, since this access will be turned into
}
#endif // FEATURE_SIMD
}
+
+#ifdef DEBUG
+ if (verbose)
+ {
+ printf("\nlvaTable after fgPromoteStructs\n");
+ lvaTableDump();
+ }
+#endif // DEBUG
}
Compiler::fgWalkResult Compiler::fgMorphStructField(GenTreePtr tree, fgWalkData* fgWalkPre)
return nullptr;
}
-#ifdef _TARGET_X86_
- // NYI: support LONG type SIMD intrinsics. Need support in long decomposition.
- // (Don't use NYI fallback mechanism; just call the function.)
- if ((*baseType == TYP_LONG) || (*baseType == TYP_ULONG))
- {
- JITDUMP("NYI: x86 long base type SIMD intrinsics\n");
- return nullptr;
- }
-#endif // _TARGET_X86_
-
// account for implicit "this" arg
*argCount = sig->numArgs;
if (sig->hasThis())
result = INS_vbroadcastsd;
break;
case TYP_ULONG:
- __fallthrough;
case TYP_LONG:
+ // NOTE: for x86, this instruction is valid if the src is xmm2/m64, but NOT if it is supposed
+ // to be a TYP_LONG reg.
result = INS_vpbroadcastq;
break;
case TYP_UINT:
- __fallthrough;
case TYP_INT:
result = INS_vpbroadcastd;
break;
case TYP_CHAR:
- __fallthrough;
case TYP_SHORT:
result = INS_vpbroadcastw;
break;
case TYP_UBYTE:
- __fallthrough;
case TYP_BYTE:
result = INS_vpbroadcastb;
break;
}
break;
}
+
// For SSE, SIMDIntrinsicInit uses the same instruction as the SIMDIntrinsicShuffleSSE2 intrinsic.
__fallthrough;
+
case SIMDIntrinsicShuffleSSE2:
if (baseType == TYP_FLOAT)
{
}
else if (baseType == TYP_LONG || baseType == TYP_ULONG)
{
- // We don't have a seperate SSE2 instruction and will
+ // We don't have a separate SSE2 instruction and will
// use the instruction meant for doubles since it is
// of the same size as a long.
result = INS_shufpd;
noway_assert(!varTypeIsSmallInt(baseType) || op1->IsIntegralConst(0));
instruction ins = INS_invalid;
- if (op1->isContained())
+
+#if !defined(_TARGET_64BIT_)
+ if (op1->OperGet() == GT_LONG)
+ {
+ assert(varTypeIsLong(baseType));
+
+ GenTree* op1lo = op1->gtGetOp1();
+ GenTree* op1hi = op1->gtGetOp2();
+
+ if (op1lo->IsIntegralConst(0) && op1hi->IsIntegralConst(0))
+ {
+ genSIMDZero(targetType, baseType, targetReg);
+ }
+ else if (op1lo->IsIntegralConst(-1) && op1hi->IsIntegralConst(-1))
+ {
+ // Initialize elements of vector with all 1's: generate pcmpeqd reg, reg.
+ ins = getOpForSIMDIntrinsic(SIMDIntrinsicEqual, TYP_INT);
+ inst_RV_RV(ins, targetReg, targetReg, targetType, emitActualTypeSize(targetType));
+ }
+ else
+ {
+ // Generate:
+ // mov_i2xmm targetReg, op1lo
+ // mov_i2xmm xmmtmp, op1hi
+ // shl xmmtmp, 4 bytes
+ // por targetReg, xmmtmp
+ // Now, targetReg has the long in the low 64 bits. For SSE2, move it to the high 64 bits using:
+ // shufpd targetReg, targetReg, 0 // move the long to all the lanes
+ // For AVX2, move it to all 4 of the 64-bit lanes using:
+ // vpbroadcastq targetReg, targetReg
+
+ instruction ins;
+
+ regNumber op1loReg = genConsumeReg(op1lo);
+ ins = ins_CopyIntToFloat(TYP_INT, TYP_FLOAT);
+ inst_RV_RV(ins, targetReg, op1loReg, TYP_INT, emitTypeSize(TYP_INT));
+
+ assert(simdNode->gtRsvdRegs != RBM_NONE);
+ assert(genCountBits(simdNode->gtRsvdRegs) == 1);
+ regNumber tmpReg = genRegNumFromMask(simdNode->gtRsvdRegs);
+
+ regNumber op1hiReg = genConsumeReg(op1hi);
+ ins = ins_CopyIntToFloat(TYP_INT, TYP_FLOAT);
+ inst_RV_RV(ins, tmpReg, op1hiReg, TYP_INT, emitTypeSize(TYP_INT));
+
+ ins = getOpForSIMDIntrinsic(SIMDIntrinsicShiftLeftInternal, baseType);
+ getEmitter()->emitIns_R_I(ins, EA_16BYTE, tmpReg, 4); // shift left by 4 bytes
+
+ ins = getOpForSIMDIntrinsic(SIMDIntrinsicBitwiseOr, baseType);
+ inst_RV_RV(ins, targetReg, tmpReg, targetType, emitActualTypeSize(targetType));
+
+#ifdef FEATURE_AVX_SUPPORT
+ if (compiler->canUseAVX())
+ {
+ inst_RV_RV(INS_vpbroadcastq, targetReg, targetReg, TYP_SIMD32, emitTypeSize(TYP_SIMD32));
+ }
+ else
+#endif // FEATURE_AVX_SUPPORT
+ {
+ ins = getOpForSIMDIntrinsic(SIMDIntrinsicShuffleSSE2, baseType);
+ getEmitter()->emitIns_R_R_I(ins, emitActualTypeSize(targetType), targetReg, targetReg, 0);
+ }
+ }
+ }
+ else
+#endif // !defined(_TARGET_64BIT_)
+ if (op1->isContained())
{
if (op1->IsIntegralConst(0) || op1->IsFPZero())
{
}
noway_assert(op2->isContained());
+ noway_assert(op2->IsCnsIntOrI());
unsigned int index = (unsigned int)op2->gtIntCon.gtIconVal;
unsigned int byteShiftCnt = index * genTypeSize(baseType);
assert(tmpReg != REG_NA);
ins = ins_CopyFloatToInt(TYP_FLOAT, baseType);
- // (Note that for mov_xmm2i, the int register is always in the reg2 position.
+ // (Note that for mov_xmm2i, the int register is always in the reg2 position.)
inst_RV_RV(ins, tmpReg, targetReg, baseType);
}
}
}
//-----------------------------------------------------------------------------
-// genStoreLclFldTypeSIMD12: store a TYP_SIMD12 (i.e. Vector3) type field.
+// genStoreLclTypeSIMD12: store a TYP_SIMD12 (i.e. Vector3) type local variable or field.
// Since Vector3 is not a hardware supported write size, it is performed
// as two stores: 8 byte followed by 4-byte.
//
// Return Value:
// None.
//
-void CodeGen::genStoreLclFldTypeSIMD12(GenTree* treeNode)
+void CodeGen::genStoreLclTypeSIMD12(GenTree* treeNode)
{
- assert(treeNode->OperGet() == GT_STORE_LCL_FLD);
+ assert((treeNode->OperGet() == GT_STORE_LCL_FLD) || (treeNode->OperGet() == GT_STORE_LCL_VAR));
- unsigned offs = treeNode->gtLclFld.gtLclOffs;
+ unsigned offs = 0;
unsigned varNum = treeNode->gtLclVarCommon.gtLclNum;
assert(varNum < compiler->lvaCount);
+ if (treeNode->OperGet() == GT_STORE_LCL_FLD)
+ {
+ offs = treeNode->gtLclFld.gtLclOffs;
+ }
+
GenTreePtr op1 = treeNode->gtOp.gtOp1;
assert(!op1->isContained());
regNumber operandReg = genConsumeReg(op1);
#ifdef _TARGET_X86_
//-----------------------------------------------------------------------------
+// genStoreSIMD12ToStack: store a TYP_SIMD12 (i.e. Vector3) type field to the stack.
+// Since Vector3 is not a hardware supported write size, it is performed
+// as two stores: 8 byte followed by 4-byte. The stack is assumed to have
+// already been adjusted.
+//
+// Arguments:
+// operandReg - the xmm register containing the SIMD12 to store.
+// tmpReg - an xmm register that can be used as a temporary for the operation.
+//
+// Return Value:
+// None.
+//
+void CodeGen::genStoreSIMD12ToStack(regNumber operandReg, regNumber tmpReg)
+{
+ assert(genIsValidFloatReg(operandReg));
+ assert(genIsValidFloatReg(tmpReg));
+
+ // 8-byte write
+ getEmitter()->emitIns_AR_R(ins_Store(TYP_DOUBLE), EA_8BYTE, operandReg, REG_SPBASE, 0);
+
+ // Extract upper 4-bytes from data
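+ // (The 0x02 shuffle immediate selects element 2, the Vector3 Z component, into the low lane of tmpReg.)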
+ getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(TYP_SIMD16), tmpReg, operandReg, 0x02);
+
+ // 4-byte write
+ getEmitter()->emitIns_AR_R(ins_Store(TYP_FLOAT), EA_4BYTE, tmpReg, REG_SPBASE, 8);
+}
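+
+// For reference, a user-level sketch of the same pattern using SSE2 intrinsics (not JIT code),
+// with 'dst' and 'value' as placeholder names:
+//   _mm_store_sd((double*)dst, _mm_castps_pd(value));                  // 8-byte write of X and Y
+//   _mm_store_ss((float*)dst + 2, _mm_shuffle_ps(value, value, 0x02)); // 4-byte write of Z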
+
+//-----------------------------------------------------------------------------
// genPutArgStkSIMD12: store a TYP_SIMD12 (i.e. Vector3) type field.
// Since Vector3 is not a hardware supported write size, it is performed
-// as two stores: 8 byte followed by 4-byte.
+// as two stores: 8 byte followed by 4-byte. The stack is assumed to have
+// already been adjusted.
//
// Arguments:
// treeNode - tree node that is attempting to store TYP_SIMD12 field
assert(genCountBits(treeNode->gtRsvdRegs) == 1);
regNumber tmpReg = genRegNumFromMask(treeNode->gtRsvdRegs);
- // Subtract from ESP; create space for argument.
- // TODO-CQ: use 'push' instead?
- inst_RV_IV(INS_sub, REG_SPBASE, 12, EA_PTRSIZE);
- genStackLevel += 12;
-
- // 8-byte write
- getEmitter()->emitIns_AR_R(ins_Store(TYP_DOUBLE), EA_8BYTE, operandReg, REG_SPBASE, 0);
-
- // Extract upper 4-bytes from data
- getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(TYP_SIMD16), tmpReg, operandReg, 0x02);
-
- // 4-byte write
- getEmitter()->emitIns_AR_R(ins_Store(TYP_FLOAT), EA_4BYTE, tmpReg, REG_SPBASE, 8);
+ genStoreSIMD12ToStack(operandReg, tmpReg);
}
#endif // _TARGET_X86_
}
if (returnVal == false)
{
- Console.WriteLine("CheckValue failed for " + expectedValue + " of type " + typeof(T).ToString());
+ Console.WriteLine("CheckValue failed for type " + typeof(T).ToString() + ". Expected: {0} (0x{0:X}), Got: {1} (0x{1:X})", expectedValue, value);
}
return returnVal;
}