// Save/Restore callee saved float regs to stack
void genPreserveCalleeSavedFltRegs(unsigned lclFrameSize);
void genRestoreCalleeSavedFltRegs(unsigned lclFrameSize);
+ // Generate VZeroupper instruction to avoid AVX/SSE transition penalty
+ void genVzeroupperIfNeeded(bool check256bitOnly = true);
#endif // _TARGET_XARCH_ && FEATURE_STACK_FP_X87
// funclet frames: this will be FuncletInfo.fiSpDelta.
void CodeGen::genPreserveCalleeSavedFltRegs(unsigned lclFrameSize)
{
+ genVzeroupperIfNeeded(false);
regMaskTP regMask = compiler->compCalleeFPRegsSavedMask;
// Only callee saved floating point registers should be in regMask
offset -= XMM_REGSIZE_BYTES;
}
}
-
-#ifdef FEATURE_AVX_SUPPORT
- // Just before restoring float registers issue a Vzeroupper to zero out upper 128-bits of all YMM regs.
- // This is to avoid penalty if this routine is using AVX-256 and now returning to a routine that is
- // using SSE2.
- if (compiler->getFloatingPointInstructionSet() == InstructionSet_AVX)
- {
- instGen(INS_vzeroupper);
- }
-#endif
}
// Save/Restore compCalleeFPRegsPushed with the smallest register number saved at [RSP+offset], working
// fast path return
if (regMask == RBM_NONE)
{
+ genVzeroupperIfNeeded();
return;
}
assert((offset % 16) == 0);
#endif // _TARGET_AMD64_
-#ifdef FEATURE_AVX_SUPPORT
- // Just before restoring float registers issue a Vzeroupper to zero out upper 128-bits of all YMM regs.
- // This is to avoid penalty if this routine is using AVX-256 and now returning to a routine that is
- // using SSE2.
- if (compiler->getFloatingPointInstructionSet() == InstructionSet_AVX)
- {
- instGen(INS_vzeroupper);
- }
-#endif
-
for (regNumber reg = REG_FLT_CALLEE_SAVED_FIRST; regMask != RBM_NONE; reg = REG_NEXT(reg))
{
regMaskTP regBit = genRegMask(reg);
offset -= XMM_REGSIZE_BYTES;
}
}
+ genVzeroupperIfNeeded();
+}
+
+// genVzeroupperIfNeeded: Emit VZEROUPPER, when required, to zero out the upper 128 bits of all
+// YMM registers so that AVX/legacy SSE transition penalties can be avoided.
+//
+// This function is used by genPreserveCalleeSavedFltRegs (prolog) and
+// genRestoreCalleeSavedFltRegs (epilog):
+//   - Prolog: VZEROUPPER is issued if the method contains 128-bit or 256-bit AVX code, to avoid
+//     the legacy SSE to AVX transition penalty, which could happen when native code containing
+//     legacy SSE code calls into JIT AVX code (e.g. reverse pinvoke).
+//   - Epilog: VZEROUPPER is issued if the method contains 256-bit AVX code, to avoid the AVX to
+//     legacy SSE transition penalty.
+//
+// Arguments:
+//    check256bitOnly - true to emit VZEROUPPER only if the function contains a 256-bit AVX
+//                      instruction; false to emit it if the function contains any AVX
+//                      instruction (either 128-bit or 256-bit).
+//
+void CodeGen::genVzeroupperIfNeeded(bool check256bitOnly /* = true*/)
+{
+#ifdef FEATURE_AVX_SUPPORT
+    // Pick the emitter flag that matches the requested granularity.
+    const bool needsVzeroupper =
+        check256bitOnly ? getEmitter()->Contains256bitAVX() : getEmitter()->ContainsAVX();
+    if (!needsVzeroupper)
+    {
+        return;
+    }
+
+    // The flags are only expected to be set when AVX is the SIMD instruction set in use.
+    assert(compiler->getSIMDInstructionSet() == InstructionSet_AVX);
+    instGen(INS_vzeroupper);
+#endif
}
+
#endif // defined(_TARGET_XARCH_) && !FEATURE_STACK_FP_X87
//-----------------------------------------------------------------------------------
#endif // defined(_TARGET_X86_)
+#ifdef FEATURE_AVX_SUPPORT
+ // When it's a PInvoke call and the call type is USER function, we issue VZEROUPPER here
+ // if the function contains 256-bit AVX instructions. This avoids the AVX-256 to legacy SSE
+ // transition penalty, assuming the user function contains legacy SSE instructions.
+ // To limit the code size increase, we only issue VZEROUPPER before the PInvoke call and not
+ // after it, because the legacy SSE to AVX transition penalty only happens when there is a
+ // preceding 256-bit AVX to legacy SSE transition penalty.
+ if (call->IsPInvoke() && (call->gtCallType == CT_USER_FUNC) && getEmitter()->Contains256bitAVX())
+ {
+ assert(compiler->getSIMDInstructionSet() == InstructionSet_AVX);
+ instGen(INS_vzeroupper);
+ }
+#endif
+
if (target != nullptr)
{
#ifdef _TARGET_X86_
if (opts.compCanUseAVX)
{
codeGen->getEmitter()->SetUseAVX(true);
+ // Assume each JITted method does not contain AVX instruction at first
+ codeGen->getEmitter()->SetContainsAVX(false);
+ codeGen->getEmitter()->SetContains256bitAVX(false);
}
else
#endif // FEATURE_AVX_SUPPORT
useAVXEncodings = value;
}
+bool containsAVXInstruction = false; // set when the method is found to use AVX instructions (128-bit or 256-bit)
+bool ContainsAVX() // returns the current value of containsAVXInstruction
+{
+ return containsAVXInstruction;
+}
+void SetContainsAVX(bool value) // overwrites containsAVXInstruction with 'value'
+{
+ containsAVXInstruction = value;
+}
+
+bool contains256bitAVXInstruction = false; // set when the method is found to use 256-bit AVX instructions
+bool Contains256bitAVX() // returns the current value of contains256bitAVXInstruction
+{
+ return contains256bitAVXInstruction;
+}
+void SetContains256bitAVX(bool value) // overwrites contains256bitAVXInstruction with 'value'
+{
+ contains256bitAVXInstruction = value;
+}
+
bool IsThreeOperandBinaryAVXInstruction(instruction ins);
bool IsThreeOperandMoveAVXInstruction(instruction ins);
bool IsThreeOperandAVXInstruction(instruction ins)
{
return false;
}
+bool ContainsAVX() // constant-false variant; presumably the build without FEATURE_AVX_SUPPORT - TODO confirm
+{
+ return false;
+}
+bool Contains256bitAVX() // constant-false variant; presumably the build without FEATURE_AVX_SUPPORT - TODO confirm
+{
+ return false;
+}
bool hasVexPrefix(code_t code)
{
return false;
#if defined(_TARGET_XARCH_)
void SetMulOpCounts(GenTreePtr tree);
+ void SetContainsAVXFlags(bool isFloatingPointType = true, unsigned sizeOfSIMDVector = 0); // records AVX / 256-bit AVX usage on the emitter
#endif // defined(_TARGET_XARCH_)
#if !CPU_LOAD_STORE_ARCH
Compiler* compiler = comp;
TreeNodeInfo* info = &(tree->gtLsraInfo);
-
+ // floating type generates AVX instruction (vmovss etc.), set the flag
+ SetContainsAVXFlags(varTypeIsFloating(tree->TypeGet()));
switch (tree->OperGet())
{
GenTree* op1;
{
MakeSrcContained(blkNode, source);
}
+ // use XMM register to fill with constants, it's AVX instruction and set the flag
+ SetContainsAVXFlags();
}
blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll;
// series of 16-byte loads and stores.
blkNode->gtLsraInfo.internalFloatCount = 1;
blkNode->gtLsraInfo.addInternalCandidates(l, l->internalFloatRegCandidates());
+ // Uses XMM reg for load and store and hence check to see whether AVX instructions
+ // are used for codegen, set ContainsAVX flag
+ SetContainsAVXFlags();
}
// If src or dst are on stack, we don't have to generate the address into a register
TreeNodeInfo* info = &(tree->gtLsraInfo);
LinearScan* lsra = m_lsra;
info->dstCount = 1;
+ SetContainsAVXFlags(true, simdTree->gtSIMDSize);
switch (simdTree->gtSIMDIntrinsicID)
{
GenTree* op1;
}
//------------------------------------------------------------------------------
+// SetContainsAVXFlags: Set ContainsAVX flag when it is floating type, set
+// Contains256bitAVX flag when SIMD vector size is 32 bytes
+//
+// Arguments:
+// isFloatingPointType - true if it is floating point type
+// sizeOfSIMDVector - SIMD Vector size
+//
+void Lowering::SetContainsAVXFlags(bool isFloatingPointType /* = true */, unsigned sizeOfSIMDVector /* = 0*/)
+{
+#ifdef FEATURE_AVX_SUPPORT
+    // Neither flag is touched unless the caller reports a floating point type.
+    if (!isFloatingPointType)
+    {
+        return;
+    }
+
+    // Floating point code is emitted with AVX encodings when AVX is the floating point instruction set.
+    const bool usesAVXForFloatingPoint = (comp->getFloatingPointInstructionSet() == InstructionSet_AVX);
+    if (usesAVXForFloatingPoint)
+    {
+        comp->getEmitter()->SetContainsAVX(true);
+    }
+
+    // A 32-byte SIMD vector means 256-bit AVX instructions when AVX is the SIMD instruction set.
+    const bool uses256bitAVXVector =
+        (sizeOfSIMDVector == 32) && (comp->getSIMDInstructionSet() == InstructionSet_AVX);
+    if (uses256bitAVXVector)
+    {
+        comp->getEmitter()->SetContains256bitAVX(true);
+    }
+#endif
+}
+
+//------------------------------------------------------------------------------
// isRMWRegOper: Can this binary tree node be used in a Read-Modify-Write format
//
// Arguments: