From ddae375636ca6cdbdf781fcb748d4b160dbfcfc4 Mon Sep 17 00:00:00 2001
From: Egor Chesakov
Date: Tue, 15 Jun 2021 10:59:28 -0700
Subject: [PATCH] Split 16-byte SIMD store around GC struct fields into two
 8-byte SIMD stores (x86) / two 8-byte mov-s (x64) (#53116)

Fixes #51638 by:

1) Constructing `ASG(OBJ(addr), 0)` for structs that have GC fields and
   keeping the current IR (i.e. `ASG(BLK(addr), 0)`) for other types. This
   bookkeeping lets the JIT keep track of the class layout of the struct
   being zeroed.

2a) Emitting a sequence of `mov [m64],r64` instead of `movdqu [m128],xmm`
    when zeroing structs with GC fields that are not guaranteed to be on
    the stack on win-x64 or linux-x64.

2b) Emitting a sequence of `movq [m64],xmm` when zeroing such structs on
    win-x86.
---
 src/coreclr/jit/codegenxarch.cpp | 62 ++++++++++++++++++++++++++++++++--------
 src/coreclr/jit/gentree.h        |  7 +++++
 src/coreclr/jit/importer.cpp     | 24 ++++++++++++----
 src/coreclr/jit/lowerxarch.cpp   | 15 +++++++---
 src/coreclr/jit/lsraxarch.cpp    | 14 +++++++--
 5 files changed, 98 insertions(+), 24 deletions(-)

diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp
index 7fd371e..2bd0142 100644
--- a/src/coreclr/jit/codegenxarch.cpp
+++ b/src/coreclr/jit/codegenxarch.cpp
@@ -2752,26 +2752,47 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
         src = src->AsUnOp()->gtGetOp1();
     }
 
+    unsigned size = node->GetLayout()->GetSize();
+
+    // An SSE mov that accesses data larger than 8 bytes may be implemented using
+    // multiple memory accesses. Hence, the JIT must not use such stores when
+    // INITBLK zeroes a struct that contains GC pointers and can be observed by
+    // other threads (i.e. when dstAddr is not an address of a local).
+    // For example, this can happen when initializing a struct field of an object.
+    const bool canUse16BytesSimdMov = !node->IsOnHeapAndContainsReferences();
+
+#ifdef TARGET_AMD64
+    // On Amd64 the JIT will not use SIMD stores for such structs and instead
+    // will always allocate a GP register for src node.
+    const bool willUseSimdMov = canUse16BytesSimdMov && (size >= XMM_REGSIZE_BYTES);
+#else
+    // On X86 the JIT will use movq for structs that are larger than 16 bytes
+    // since it is more beneficial than using two mov-s from a GP register.
+    const bool willUseSimdMov = (size >= 16);
+#endif
+
     if (!src->isContained())
     {
         srcIntReg = genConsumeReg(src);
     }
     else
     {
-        // If src is contained then it must be 0 and the size must be a multiple
-        // of XMM_REGSIZE_BYTES so initialization can use only SSE2 instructions.
+        // If src is contained then it must be 0.
        assert(src->IsIntegralConst(0));
-        assert((node->GetLayout()->GetSize() % XMM_REGSIZE_BYTES) == 0);
+        assert(willUseSimdMov);
+#ifdef TARGET_AMD64
+        assert(size % 16 == 0);
+#else
+        assert(size % 8 == 0);
+#endif
     }
 
     emitter* emit = GetEmitter();
-    unsigned size = node->GetLayout()->GetSize();
 
     assert(size <= INT32_MAX);
     assert(dstOffset < (INT32_MAX - static_cast<int>(size)));
 
-    // Fill as much as possible using SSE2 stores.
-    if (size >= XMM_REGSIZE_BYTES)
+    if (willUseSimdMov)
     {
         regNumber srcXmmReg = node->GetSingleTempReg(RBM_ALLFLOAT);
 
@@ -2791,9 +2812,25 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
 #endif
         }
 
-        instruction simdMov = simdUnalignedMovIns();
-        for (unsigned regSize = XMM_REGSIZE_BYTES; size >= regSize; size -= regSize, dstOffset += regSize)
+        instruction simdMov      = simdUnalignedMovIns();
+        unsigned    regSize      = XMM_REGSIZE_BYTES;
+        unsigned    bytesWritten = 0;
+
+        while (bytesWritten < size)
         {
+#ifdef TARGET_X86
+            if (!canUse16BytesSimdMov || (bytesWritten + regSize > size))
+            {
+                simdMov = INS_movq;
+                regSize = 8;
+            }
+#endif
+            if (bytesWritten + regSize > size)
+            {
+                assert(srcIntReg != REG_NA);
+                break;
+            }
+
             if (dstLclNum != BAD_VAR_NUM)
             {
                 emit->emitIns_S_R(simdMov, EA_ATTR(regSize), srcXmmReg, dstLclNum, dstOffset);
@@ -2803,11 +2840,12 @@ void CodeGen::genCodeForInitBlkUnroll(GenTreeBlk* node)
                 emit->emitIns_ARX_R(simdMov, EA_ATTR(regSize), srcXmmReg, dstAddrBaseReg, dstAddrIndexReg,
                                     dstAddrIndexScale, dstOffset);
             }
+
+            dstOffset += regSize;
+            bytesWritten += regSize;
         }
 
-        // TODO-CQ-XArch: On x86 we could initialize 8 byte at once by using MOVQ instead of two 4 byte MOV stores.
-        // On x64 it may also be worth zero initializing a 4/8 byte remainder using MOVD/MOVQ, that avoids the need
-        // to allocate a GPR just for the remainder.
+        size -= bytesWritten;
     }
 
     // Fill the remainder using normal stores.
@@ -4604,7 +4642,7 @@ void CodeGen::genCodeForIndexAddr(GenTreeIndexAddr* node)
         // The VM doesn't allow such large array elements but let's be sure.
         noway_assert(scale <= INT32_MAX);
 #else  // !TARGET_64BIT
-        tmpReg = node->GetSingleTempReg();
+        tmpReg       = node->GetSingleTempReg();
 #endif // !TARGET_64BIT
 
         GetEmitter()->emitIns_R_I(emitter::inst3opImulForReg(tmpReg), EA_PTRSIZE, indexReg,
diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h
index ef65f7d..e2a5045 100644
--- a/src/coreclr/jit/gentree.h
+++ b/src/coreclr/jit/gentree.h
@@ -5686,6 +5686,13 @@ public:
     bool gtBlkOpGcUnsafe;
 #endif
 
+#ifdef TARGET_XARCH
+    bool IsOnHeapAndContainsReferences()
+    {
+        return (m_layout != nullptr) && m_layout->HasGCPtr() && !Addr()->OperIsLocalAddr();
+    }
+#endif
+
     GenTreeBlk(genTreeOps oper, var_types type, GenTree* addr, ClassLayout* layout)
         : GenTreeIndir(oper, type, addr, nullptr)
         , m_layout(layout)
diff --git a/src/coreclr/jit/importer.cpp b/src/coreclr/jit/importer.cpp
index f14d8d0..fc8544f 100644
--- a/src/coreclr/jit/importer.cpp
+++ b/src/coreclr/jit/importer.cpp
@@ -16286,11 +16286,25 @@ void Compiler::impImportBlockCode(BasicBlock* block)
                               "type operand incompatible with type of address");
                 }
 
-                size = info.compCompHnd->getClassSize(resolvedToken.hClass); // Size
-                op2  = gtNewIconNode(0);                                     // Value
-                op1  = impPopStack().val;                                    // Dest
-                op1  = gtNewBlockVal(op1, size);
-                op1  = gtNewBlkOpNode(op1, op2, (prefixFlags & PREFIX_VOLATILE) != 0, false);
+                op2 = gtNewIconNode(0);  // Value
+                op1 = impPopStack().val; // Dest
+
+                if (eeIsValueClass(resolvedToken.hClass))
+                {
+                    op1 = gtNewStructVal(resolvedToken.hClass, op1);
+                    if (op1->OperIs(GT_OBJ))
+                    {
+                        gtSetObjGcInfo(op1->AsObj());
+                    }
+                }
+                else
+                {
+                    size = info.compCompHnd->getClassSize(resolvedToken.hClass);
+                    assert(size == TARGET_POINTER_SIZE);
+                    op1 = gtNewBlockVal(op1, size);
+                }
+
+                op1 = gtNewBlkOpNode(op1, op2, (prefixFlags & PREFIX_VOLATILE) != 0, false);
                 goto SPILL_APPEND;
 
             case CEE_INITBLK:
diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
index 4d42530..55bfab9 100644
--- a/src/coreclr/jit/lowerxarch.cpp
+++ b/src/coreclr/jit/lowerxarch.cpp
@@ -216,11 +216,18 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode)
 
             if (fill == 0)
             {
-                // If the size is multiple of XMM register size there's no need to load 0 in a GPR,
-                // codegen will use xorps to generate 0 directly in the temporary XMM register.
-                if ((size % XMM_REGSIZE_BYTES) == 0)
+                if (size >= XMM_REGSIZE_BYTES)
                 {
-                    src->SetContained();
+                    const bool canUse16BytesSimdMov = !blkNode->IsOnHeapAndContainsReferences();
+#ifdef TARGET_AMD64
+                    const bool willUseOnlySimdMov = canUse16BytesSimdMov && (size % 16 == 0);
+#else
+                    const bool willUseOnlySimdMov = (size % 8 == 0);
+#endif
+                    if (willUseOnlySimdMov)
+                    {
+                        src->SetContained();
+                    }
                 }
             }
 #ifdef TARGET_AMD64
diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp
index 811010f..854e452 100644
--- a/src/coreclr/jit/lsraxarch.cpp
+++ b/src/coreclr/jit/lsraxarch.cpp
@@ -1069,7 +1069,7 @@ int LinearScan::BuildCall(GenTreeCall* call)
             // The return value will be on the X87 stack, and we will need to move it.
            dstCandidates = allRegs(registerType);
 #else  // !TARGET_X86
-            dstCandidates = RBM_FLOATRET;
+            dstCandidates     = RBM_FLOATRET;
 #endif // !TARGET_X86
         }
         else if (registerType == TYP_LONG)
@@ -1297,7 +1297,14 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
         switch (blkNode->gtBlkOpKind)
         {
             case GenTreeBlk::BlkOpKindUnroll:
-                if (size >= XMM_REGSIZE_BYTES)
+            {
+#ifdef TARGET_AMD64
+                const bool canUse16BytesSimdMov = !blkNode->IsOnHeapAndContainsReferences();
+                const bool willUseSimdMov       = canUse16BytesSimdMov && (size >= 16);
+#else
+                const bool willUseSimdMov = (size >= 16);
+#endif
+                if (willUseSimdMov)
                 {
                     buildInternalFloatRegisterDefForNode(blkNode, internalFloatRegCandidates());
                     SetContainsAVXFlags();
@@ -1310,7 +1317,8 @@ int LinearScan::BuildBlockStore(GenTreeBlk* blkNode)
                     srcRegMask = allByteRegs();
                 }
 #endif
-                break;
+            }
+            break;
 
             case GenTreeBlk::BlkOpKindRepInstr:
                 dstAddrRegMask = RBM_RDI;
-- 
2.7.4
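
Illustrative note (not part of the patch): the sketch below models the store-width selection that genCodeForInitBlkUnroll performs after this change, assuming x64 rules. The helper name chooseZeroInitStoreWidths is hypothetical, and the 4/2/1-byte tail handling only approximates the JIT's "fill the remainder using normal stores" path.

```cpp
#include <cstdio>
#include <vector>

// Returns the sequence of store widths (in bytes) used to zero `size` bytes.
// A 16-byte SIMD store may be split into multiple memory accesses by the
// hardware, so it is avoided when the destination is on the heap and the
// layout contains GC references that other threads (and the GC) can observe.
static std::vector<unsigned> chooseZeroInitStoreWidths(unsigned size, bool onHeapWithGCRefs)
{
    std::vector<unsigned> widths;
    unsigned remaining = size;

    if (!onHeapWithGCRefs)
    {
        // Safe to fill with 16-byte SIMD stores (movdqu).
        while (remaining >= 16)
        {
            widths.push_back(16);
            remaining -= 16;
        }
    }

    // Otherwise (or for the remainder) fall back to pointer-sized and smaller
    // stores, e.g. mov [m64],r64 on x64, which do not tear aligned GC slots.
    for (unsigned width = 8; width >= 1; width /= 2)
    {
        while (remaining >= width)
        {
            widths.push_back(width);
            remaining -= width;
        }
    }
    return widths;
}

int main()
{
    // A 24-byte struct with GC fields stored through a heap pointer:
    // three 8-byte stores instead of movdqu + mov.
    for (unsigned w : chooseZeroInitStoreWidths(24, true))
        std::printf("%u ", w); // prints: 8 8 8
    std::printf("\n");

    // The same layout on the stack keeps the 16-byte SIMD store.
    for (unsigned w : chooseZeroInitStoreWidths(24, false))
        std::printf("%u ", w); // prints: 16 8
    std::printf("\n");
    return 0;
}
```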