From f9a81bc872421e54e32d79fd17541e5fa23d5689 Mon Sep 17 00:00:00 2001 From: Venkata Sivaramakrishna Ramadugu Date: Tue, 14 Jul 2015 14:46:01 -0700 Subject: [PATCH] Fix to issue- .Net 4.6 RC x64 is twice as slow as Legacy Jit64 and x86. Perf regression is due to a stall caused by "cvtsi2sd xmm1, rax" that partially writes lower 8-bytes of xmm1 but keeps other bytes unmodified. Since there are further uses of xmm1, it will introduce a false dependency and cause stalls. Fix is to emit "xorps targetReg, targetReg" before a cvtsi2ss/sd instruction. Fix #993 [tfs-changeset: 1501252] --- src/jit/codegenxarch.cpp | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/jit/codegenxarch.cpp b/src/jit/codegenxarch.cpp index 911e9d0..1f590ab 100644 --- a/src/jit/codegenxarch.cpp +++ b/src/jit/codegenxarch.cpp @@ -6059,13 +6059,22 @@ CodeGen::genIntToFloatCast(GenTreePtr treeNode) // Also we don't expect to see uint32 -> float/double and uint64 -> float conversions // here since they should have been lowered apropriately. noway_assert(srcType != TYP_UINT); - noway_assert((srcType != TYP_ULONG) || (dstType != TYP_FLOAT)); + noway_assert((srcType != TYP_ULONG) || (dstType != TYP_FLOAT)); - + // To convert int to a float/double, cvtsi2ss/sd SSE2 instruction is used + // which does a partial write to lower 4/8 bytes of xmm register keeping the other + // upper bytes unmodified. If "cvtsi2ss/sd xmmReg, r32/r64" occurs inside a loop, + // the partial write could introduce a false dependency and could cause a stall + // if there are further uses of xmmReg. We have such a case occurring with a + // customer reported version of SpectralNorm benchmark, resulting in 2x perf + // regression. To avoid false dependency, we emit "xorps xmmReg, xmmReg" before + // cvtsi2ss/sd instruction. 
+ + genConsumeOperands(treeNode->AsOp()); + getEmitter()->emitIns_R_R(INS_xorps, EA_4BYTE, treeNode->gtRegNum, treeNode->gtRegNum); // Note that here we need to specify srcType that will determine // the size of source reg/mem operand and rex.w prefix. - genConsumeOperands(treeNode->AsOp()); instruction ins = ins_FloatConv(dstType, TYP_INT); getEmitter()->emitInsBinary(ins, emitTypeSize(srcType), treeNode, op1); -- 2.7.4