We need to relax the assert as our estimation won't include code-gen
stack changes (which we know don't affect fgAddCodeRef()) */
noway_assert(getEmitter()->emitMaxStackDepth <=
- (compiler->fgPtrArgCntMax + compiler->compHndBBtabCount + // Return address for locally-called finallys
+ (compiler->fgPtrArgCntMax + // Max number of pointer-sized stack arguments.
+ compiler->compHndBBtabCount + // Return address for locally-called finallys
genTypeStSz(TYP_LONG) + // longs/doubles may be transferred via stack, etc
(compiler->compTailCallUsed ? 4 : 0))); // CORINFO_HELP_TAILCALL args
#endif
GenTreeLclVarCommon* lcl = unspillTree->AsLclVarCommon();
LclVarDsc* varDsc = &compiler->lvaTable[lcl->gtLclNum];
-// TODO-Cleanup: The following code could probably be further merged and cleand up.
+// TODO-Cleanup: The following code could probably be further merged and cleaned up.
#ifdef _TARGET_XARCH_
// Load local variable from its home location.
// In most cases the tree type will indicate the correct type to use for the load.
void genStoreIndTypeSIMD12(GenTree* treeNode);
void genStoreLclFldTypeSIMD12(GenTree* treeNode);
void genLoadIndTypeSIMD12(GenTree* treeNode);
-void genLoadLclFldTypeSIMD12(GenTree* treeNode);
+void genLoadLclTypeSIMD12(GenTree* treeNode);
+#ifdef _TARGET_X86_
+void genPutArgStkSIMD12(GenTree* treeNode);
+#endif // _TARGET_X86_
#endif // FEATURE_SIMD
#if !defined(_TARGET_64BIT_)
if (isRegCandidate && !(treeNode->gtFlags & GTF_VAR_DEATH))
{
- assert((treeNode->InReg()) || (treeNode->gtFlags & GTF_SPILLED));
+ assert(treeNode->InReg() || (treeNode->gtFlags & GTF_SPILLED));
}
// If this is a register candidate that has been spilled, genConsumeReg() will
{
assert(!isRegCandidate);
+#if defined(FEATURE_SIMD) && defined(_TARGET_X86_)
+ // Loading of TYP_SIMD12 (i.e. Vector3) variable
+ if (treeNode->TypeGet() == TYP_SIMD12)
+ {
+ genLoadLclTypeSIMD12(treeNode);
+ break;
+ }
+#endif // defined(FEATURE_SIMD) && defined(_TARGET_X86_)
+
emit->emitIns_R_S(ins_Load(treeNode->TypeGet(), compiler->isSIMDTypeLocalAligned(lcl->gtLclNum)),
emitTypeSize(treeNode), treeNode->gtRegNum, lcl->gtLclNum, 0);
genProduceReg(treeNode);
// Loading of TYP_SIMD12 (i.e. Vector3) field
if (treeNode->TypeGet() == TYP_SIMD12)
{
- genLoadLclFldTypeSIMD12(treeNode);
+ genLoadLclTypeSIMD12(treeNode);
break;
}
#endif
var_types targetType = putArgStk->TypeGet();
#ifdef _TARGET_X86_
+
+#ifdef FEATURE_SIMD
+ if (targetType == TYP_SIMD12)
+ {
+ genPutArgStkSIMD12(putArgStk);
+ return;
+ }
+#endif // FEATURE_SIMD
+
if (varTypeIsStruct(targetType))
{
(void)genAdjustStackForPutArgStk(putArgStk);
instruction ins;
emitAttr attr;
unsigned size;
+
if (type == TYP_STRUCT)
{
ins = INS_movdqu;
if (varTypeIsSIMD(type))
{
assert(genIsValidFloatReg(srcReg));
- ins = ins_Store(type);
+ ins = ins_Store(type); // TODO-CQ: pass 'aligned' correctly
}
else
#endif // FEATURE_SIMD
attr = emitTypeSize(type);
size = genTypeSize(type);
}
+
#ifdef _TARGET_X86_
if (m_pushStkArg)
{
// is now TYP_INT in the local variable table. It's not really unused, because it's in the tree.
assert(varTypeIsStruct(lvType) || (lvType == TYP_BLK) || (lvPromoted && lvUnusedStruct));
+
+#if defined(FEATURE_SIMD) && !defined(_TARGET_64BIT_)
+ // For 32-bit architectures, we make local variable SIMD12 types 16 bytes instead of just 12. We can't do
+ // this for arguments, which must be passed according to the defined ABI.
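+ // For example, a non-parameter Vector3 local occupies 16 bytes of frame space, which lets
+ // Lowering retype it as TYP_SIMD16 and use full 16-byte loads/stores, while a Vector3
+ // parameter keeps its ABI-mandated 12-byte size.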
+ if ((lvType == TYP_SIMD12) && !lvIsParam)
+ {
+ assert(lvExactSize == 12);
+ return 16;
+ }
+#endif // defined(FEATURE_SIMD) && !defined(_TARGET_64BIT_)
+
return (unsigned)(roundUp(lvExactSize, TARGET_POINTER_SIZE));
}
void lvaInit();
- unsigned lvaArgSize(const void* argTok);
unsigned lvaLclSize(unsigned varNum);
unsigned lvaLclExactSize(unsigned varNum);
// Returns true if the TYP_SIMD locals on stack are aligned at their
// preferred byte boundary specified by getSIMDTypeAlignment().
+ //
+ // As per the Intel manual, the preferred alignment for AVX vectors is 32 bytes. On Amd64,
+ // RSP/EBP is aligned at 16 bytes, so to align SIMD types at 32 bytes we would also need
+ // RSP/EBP to be 32-byte aligned. It is not clear whether the additional stack space used
+ // to align the stack is worth the benefit, so for now we use 16-byte alignment for AVX
+ // 256-bit vectors with unaligned loads/stores to/from memory. On x86, the stack frame
+ // is aligned to 4 bytes. We would need to extend the existing support for double (8-byte)
+ // alignment to 16- or 32-byte alignment for frames with local SIMD vars, if that is
+ // determined to be profitable.
+ //
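+ // For example, a TYP_SIMD32 local has a preferred alignment of 32 bytes, which exceeds
+ // STACK_ALIGN, so isSIMDTypeLocalAligned() reports it as unaligned and unaligned
+ // loads/stores are used for it.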
bool isSIMDTypeLocalAligned(unsigned varNum)
{
#if defined(FEATURE_SIMD) && ALIGN_SIMD_TYPES
int off = lvaFrameAddress(varNum, &ebpBased);
// TODO-Cleanup: Can't this use the lvExactSize on the varDsc?
int alignment = getSIMDTypeAlignment(lvaTable[varNum].lvType);
- bool isAligned = ((off % alignment) == 0);
- noway_assert(isAligned || lvaTable[varNum].lvIsParam);
+ bool isAligned = (alignment <= STACK_ALIGN) && ((off % alignment) == 0);
return isAligned;
}
#endif // FEATURE_SIMD
*
* Parameters
* srcType - source type
- * aligned - whether source is 16-byte aligned if srcType is a SIMD type
+ * aligned - whether source is properly aligned if srcType is a SIMD type
*/
instruction CodeGenInterface::ins_Load(var_types srcType, bool aligned /*=false*/)
{
#endif // FEATURE_SIMD
if (compiler->canUseAVX())
{
- // TODO-CQ: consider alignment of AVX vectors.
- return INS_movupd;
+ return (aligned) ? INS_movapd : INS_movupd;
}
else
{
*
* Parameters
* dstType - destination type
- * aligned - whether destination is 16-byte aligned if dstType is a SIMD type
+ * aligned - whether destination is properly aligned if dstType is a SIMD type
*/
instruction CodeGenInterface::ins_Store(var_types dstType, bool aligned /*=false*/)
{
#endif // FEATURE_SIMD
if (compiler->canUseAVX())
{
- // TODO-CQ: consider alignment of AVX vectors.
- return INS_movupd;
+ return (aligned) ? INS_movapd : INS_movupd;
}
else
{
}
#ifndef _TARGET_64BIT_
- bool fDoubleAlignHint = FALSE;
+ BOOL fDoubleAlignHint = FALSE;
#ifdef _TARGET_X86_
fDoubleAlignHint = TRUE;
#endif
case GT_STORE_LCL_VAR:
if (node->TypeGet() == TYP_SIMD12)
{
-#ifdef _TARGET_64BIT_
// Assumption 1:
// RyuJit backend depends on the assumption that on 64-Bit targets Vector3 size is rounded off
// to TARGET_POINTER_SIZE and hence Vector3 locals on stack can be treated as TYP_SIMD16 for
// Vector3 return values are returned in two return registers and the caller assembles them into a
// single xmm reg. Hence RyuJIT explicitly generates code to clear the upper 4 bytes of Vector3
// type args in the prolog and of the Vector3 type return value of a call.
+ //
+ // RyuJIT x86 Windows: all non-param Vector3 local vars are allocated as 16 bytes. Vector3 arguments
+ // are pushed as 12 bytes. For return values, a 16-byte local is allocated and the address passed
+ // as a return buffer pointer. The callee doesn't write the high 4 bytes, and we don't need to clear
+ // them either.
+
+ unsigned varNum = node->AsLclVarCommon()->GetLclNum();
+ LclVarDsc* varDsc = &comp->lvaTable[varNum];
+
+#if defined(_TARGET_64BIT_)
+ assert(varDsc->lvSize() == 16);
node->gtType = TYP_SIMD16;
-#else
- NYI("Lowering of TYP_SIMD12 locals");
-#endif // _TARGET_64BIT_
+#else // !_TARGET_64BIT_
+ if (varDsc->lvSize() == 16)
+ {
+ node->gtType = TYP_SIMD16;
+ }
+ else
+ {
+ // The following assert is guaranteed by lvSize().
+ assert(varDsc->lvIsParam);
+ }
+#endif // !_TARGET_64BIT_
}
#endif // FEATURE_SIMD
__fallthrough;
// Arguments:
// call - the call whose arg is being rewritten.
// arg - the arg being rewritten.
-// info - the ArgTabEntry information for the argument.
+// info - the fgArgTabEntry information for the argument.
// type - the type of the argument.
//
// Return Value:
// for two eightbyte structs.
//
// For STK passed structs the method generates GT_PUTARG_STK tree. For System V systems with native struct passing
-// (i.e. FEATURE_UNIX_AMD64_STRUCT_PASSING defined) this method also sets the GP pointers count and the pointers
+// (i.e. FEATURE_UNIX_AMD64_STRUCT_PASSING defined) this method also sets the GC pointers count and the pointers
// layout object, so the codegen of the GT_PUTARG_STK could use this for optimizing copying to the stack by value.
// (using block copy primitives for non GC pointers and a single TARGET_POINTER_SIZE copy with recording GC info.)
//
// pair copying using XMM registers or rep mov instructions.
if (info->isStruct)
{
- unsigned numRefs = 0;
- BYTE* gcLayout = new (comp, CMK_Codegen) BYTE[info->numSlots];
// We use GT_OBJ for non-SIMD struct arguments. However, for
// SIMD arguments the GT_OBJ has already been transformed.
if (arg->gtOper != GT_OBJ)
}
else
{
+ unsigned numRefs = 0;
+ BYTE* gcLayout = new (comp, CMK_Codegen) BYTE[info->numSlots];
assert(!varTypeIsSIMD(arg));
numRefs = comp->info.compCompHnd->getClassGClayout(arg->gtObj.gtClass, gcLayout);
+ putArg->AsPutArgStk()->setGcPointers(numRefs, gcLayout);
}
-
- putArg->AsPutArgStk()->setGcPointers(numRefs, gcLayout);
}
#endif // FEATURE_PUT_STRUCT_ARG_STK
}
type = TYP_INT;
}
+#if defined(FEATURE_SIMD) && defined(_TARGET_X86_)
+ // Non-param TYP_SIMD12 local var nodes are massaged in Lower to TYP_SIMD16 to match their
+ // allocated size (see lvSize()). However, when passing the variables as arguments, and
+ // storing the variables to the outgoing argument area on the stack, we must use their
+ // actual TYP_SIMD12 type, so exactly 12 bytes is allocated and written.
+ if (type == TYP_SIMD16)
+ {
+ if ((arg->OperGet() == GT_LCL_VAR) || (arg->OperGet() == GT_STORE_LCL_VAR))
+ {
+ unsigned varNum = arg->AsLclVarCommon()->GetLclNum();
+ LclVarDsc* varDsc = &comp->lvaTable[varNum];
+ type = varDsc->lvType;
+ }
+ }
+#endif // defined(FEATURE_SIMD) && defined(_TARGET_X86_)
+
GenTreePtr putArg;
// If we hit this we are probably double-lowering.
break;
case GT_LCL_FLD:
+ case GT_LCL_VAR:
info->srcCount = 0;
info->dstCount = 1;
}
#endif // _TARGET_X86_
+#if defined(FEATURE_SIMD) && defined(_TARGET_X86_)
+ // For PutArgStk of a TYP_SIMD12, we need an extra register.
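+ // (The extra XMM register is used by genPutArgStkSIMD12 to extract the upper 4 bytes of the
+ // Vector3 via pshufd before the final 4-byte store.)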
+ if (putArgStk->TypeGet() == TYP_SIMD12)
+ {
+ info->srcCount = putArgStk->gtOp1->gtLsraInfo.dstCount;
+ info->dstCount = 0;
+ info->internalFloatCount = 1;
+ info->setInternalCandidates(l, l->allSIMDRegs());
+ return;
+ }
+#endif // defined(FEATURE_SIMD) && defined(_TARGET_X86_)
+
if (putArgStk->TypeGet() != TYP_STRUCT)
{
TreeNodeInfoInitSimple(putArgStk);
regNumber rotateBlockStartLocation(Interval* interval, regNumber targetReg, regMaskTP availableRegs);
// This controls whether we always insert a GT_RELOAD instruction after a spill
- // Note that this can be combined with LsraSpillAlways (or not)
+ // Note that this can be combined with LSRA_SPILL_ALWAYS (or not)
enum LsraReload{LSRA_NO_RELOAD_IF_SAME = 0, LSRA_ALWAYS_INSERT_RELOAD = 0x400, LSRA_RELOAD_MASK = 0x400};
LsraReload getLsraReload()
{
return;
}
- // If the operand of is a GT_ADDR(GT_LCL_VAR) and LclVar is known to be of simdType,
- // replace obj by GT_LCL_VAR.
+ // If we have GT_IND(GT_LCL_VAR_ADDR) and the GT_LCL_VAR_ADDR is TYP_BYREF/TYP_I_IMPL,
+ // and the var is a SIMD type, replace the expression by GT_LCL_VAR.
GenTree* addr = tree->AsIndir()->Addr();
if (addr->OperIsLocalAddr() && comp->isAddrOfSIMDType(addr))
{
addr->gtType = simdType;
use.ReplaceWith(comp, addr);
}
+#if defined(_TARGET_X86_)
+ // For x86, if we have GT_IND(GT_ADDR(GT_SIMD)), remove the GT_IND(GT_ADDR()), leaving just
+ // the GT_SIMD.
+ else if ((addr->OperGet() == GT_ADDR) && (addr->gtGetOp1()->OperGet() == GT_SIMD))
+ {
+ BlockRange().Remove(tree);
+ BlockRange().Remove(addr);
+
+ use.ReplaceWith(comp, addr->gtGetOp1());
+ }
+#endif // defined(_TARGET_X86_)
else if (!keepBlk)
{
tree->SetOper(GT_IND);
type = genActualType(type);
+#if defined(FEATURE_SIMD) && !defined(_TARGET_64BIT_)
+ // For SIMD on 32-bit platforms, we always spill SIMD12 to a 16-byte SIMD16 temp.
+ // This is because we don't have a single instruction to store 12 bytes. We also
+ // allocate non-argument locals as 16 bytes; see lvSize().
+ if (type == TYP_SIMD12)
+ {
+ type = TYP_SIMD16;
+ }
+#endif // defined(FEATURE_SIMD) && !defined(_TARGET_64BIT_)
+
#else // LEGACY_BACKEND
if (!varTypeIsGC(type))
{
int Compiler::getSIMDTypeAlignment(var_types simdType)
{
#ifdef _TARGET_XARCH_
- // TODO-x86: Need to figure out stack alignment for SIMD on x86.
// Fixed length vectors have the following alignment preference
- // Vector2/3 = 8 byte alignment
- // Vector4 = 16-byte alignment
+ // Vector2 = 8 byte alignment
+ // Vector3/4 = 16-byte alignment
unsigned size = genTypeSize(simdType);
// preferred alignment for SSE2 128-bit vectors is 16-bytes
{
return 8;
}
-
- // As per Intel manual, AVX vectors preferred alignment is 32-bytes but on Amd64
- // RSP/EBP is aligned at 16-bytes, therefore to align SIMD types at 32-bytes we need even
- // RSP/EBP to be 32-byte aligned. It is not clear whether additional stack space used in
- // aligning stack is worth the benefit and for now will use 16-byte alignment for AVX
- // 256-bit vectors with unaligned load/stores to/from memory.
- return 16;
+ else if (size <= 16)
+ {
+ assert((size == 12) || (size == 16));
+ return 16;
+ }
+ else
+ {
+ assert(size == 32);
+ return 32;
+ }
#else
assert(!"getSIMDTypeAlignment() unimplemented on target arch");
unreached();
}
#else // !_TARGET_XARCH_
- assert(!"Abs intrinsic on non-Amd64 target not implemented");
+ assert(!"Abs intrinsic on non-xarch target not implemented");
unreached();
#endif // !_TARGET_XARCH_
}
}
//-----------------------------------------------------------------------------
-// genLoadLclFldTypeSIMD12: load a TYP_SIMD12 (i.e. Vector3) type field.
-// Since Vector3 is not a hardware supported write size, it is performed
-// as two reads: 8 byte followed by 4-byte.
+// genLoadLclTypeSIMD12: load a TYP_SIMD12 (i.e. Vector3) type field.
+// Since Vector3 is not a hardware supported read size, it is performed
+// as two reads: a 4-byte read followed by an 8-byte read.
//
// Arguments:
// treeNode - tree node that is attempting to load TYP_SIMD12 field
// Return Value:
// None.
//
-void CodeGen::genLoadLclFldTypeSIMD12(GenTree* treeNode)
+void CodeGen::genLoadLclTypeSIMD12(GenTree* treeNode)
{
- assert(treeNode->OperGet() == GT_LCL_FLD);
+ assert((treeNode->OperGet() == GT_LCL_FLD) || (treeNode->OperGet() == GT_LCL_VAR));
regNumber targetReg = treeNode->gtRegNum;
- unsigned offs = treeNode->gtLclFld.gtLclOffs;
+ unsigned offs = 0;
unsigned varNum = treeNode->gtLclVarCommon.gtLclNum;
assert(varNum < compiler->lvaCount);
- // Need an addtional Xmm register that is different from
+ if (treeNode->OperGet() == GT_LCL_FLD)
+ {
+ offs = treeNode->gtLclFld.gtLclOffs;
+ }
+
+ // Need an additional Xmm register that is different from
// targetReg to read upper 4 bytes.
assert(treeNode->gtRsvdRegs != RBM_NONE);
assert(genCountBits(treeNode->gtRsvdRegs) == 1);
genProduceReg(treeNode);
}
+#ifdef _TARGET_X86_
+
+//-----------------------------------------------------------------------------
+// genPutArgStkSIMD12: store a TYP_SIMD12 (i.e. Vector3) argument to the stack.
+// Since Vector3 is not a hardware supported write size, it is performed
+// as two stores: an 8-byte store followed by a 4-byte store.
+//
+// Arguments:
+// treeNode - the GT_PUTARG_STK node for the TYP_SIMD12 argument
+//
+// Return Value:
+// None.
+//
+void CodeGen::genPutArgStkSIMD12(GenTree* treeNode)
+{
+ assert(treeNode->OperGet() == GT_PUTARG_STK);
+
+ GenTreePtr op1 = treeNode->gtOp.gtOp1;
+ assert(!op1->isContained());
+ regNumber operandReg = genConsumeReg(op1);
+
+ // Need an additional Xmm register to extract upper 4 bytes from data.
+ assert(treeNode->gtRsvdRegs != RBM_NONE);
+ assert(genCountBits(treeNode->gtRsvdRegs) == 1);
+ regNumber tmpReg = genRegNumFromMask(treeNode->gtRsvdRegs);
+
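+ // The intended sequence is roughly (illustrative; the exact instructions come from ins_Store()):
+ //   sub    esp, 12
+ //   movsd  qword ptr [esp], operandReg       ; lower 8 bytes
+ //   pshufd tmpReg, operandReg, 2             ; move element 2 into the low 4 bytes of tmpReg
+ //   movss  dword ptr [esp+8], tmpReg         ; upper 4 bytes
+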
+ // Subtract from ESP; create space for argument.
+ // TODO-CQ: use 'push' instead?
+ inst_RV_IV(INS_sub, REG_SPBASE, 12, EA_PTRSIZE);
+ genStackLevel += 12;
+
+ // 8-byte write
+ getEmitter()->emitIns_AR_R(ins_Store(TYP_DOUBLE), EA_8BYTE, operandReg, REG_SPBASE, 0);
+
+ // Extract upper 4-bytes from data
+ getEmitter()->emitIns_R_R_I(INS_pshufd, emitActualTypeSize(TYP_SIMD16), tmpReg, operandReg, 0x02);
+
+ // 4-byte write
+ getEmitter()->emitIns_AR_R(ins_Store(TYP_FLOAT), EA_4BYTE, tmpReg, REG_SPBASE, 8);
+}
+
+#endif // _TARGET_X86_
+
//-----------------------------------------------------------------------------
// genSIMDIntrinsicUpperSave: save the upper half of a TYP_SIMD32 vector to
// the given register, if any, or to memory.
#endif // !LEGACY_BACKEND
+#ifdef FEATURE_SIMD
+ #define ALIGN_SIMD_TYPES 1 // whether SIMD type locals are to be aligned
+#endif // FEATURE_SIMD
#define FEATURE_WRITE_BARRIER 1 // Generate the proper WriteBarrier calls for GC
#define FEATURE_FIXED_OUT_ARGS 0 // X86 uses push instructions to pass args