From 87d6d584476890e3a89d626cc9eec59538c05951 Mon Sep 17 00:00:00 2001
From: SingleAccretion <62474226+SingleAccretion@users.noreply.github.com>
Date: Thu, 17 Feb 2022 20:18:35 +0300
Subject: [PATCH] Use `push` for 8/12 byte struct args on x86 (#65387)

* Refactor struct PUTARG_STK codegen

Two changes:

1) Outline the cases for GC pointers from "genPutStructArgStk" into
   their own functions. Helps with readability, and will be used in the
   substantive portion of the change.

2) Do not use "Kind::Push" on x86 for the case of a struct less than
   16 bytes in size that does not have GC pointers. With this, we will
   now always call "genStructPutArgUnroll" for the "Unroll" kind. This
   means "genAdjustStackForPutArgStk" has to special-case that. This
   will go away in the final version of the change.

No diffs on x86 or Unix-x64 (the only two affected platforms).

* Remove the LSRA quirk

We're not using XMMs on the "push" path.

* Use "push" for structs less than 16 bytes in size

It is smaller and a little faster than using an XMM register to first
load and then store the struct.

* Add an assert to genStructPutArgPush
---
 src/coreclr/jit/codegen.h        |   7 +-
 src/coreclr/jit/codegenxarch.cpp | 476 +++++++++++++++++++--------------------
 src/coreclr/jit/gentree.cpp      |   3 +
 src/coreclr/jit/gentree.h        |   7 +-
 src/coreclr/jit/lowerxarch.cpp   |  26 ++-
 src/coreclr/jit/lsraxarch.cpp    |  23 +-
 6 files changed, 277 insertions(+), 265 deletions(-)

diff --git a/src/coreclr/jit/codegen.h b/src/coreclr/jit/codegen.h
index 1e73419..f0dca5e 100644
--- a/src/coreclr/jit/codegen.h
+++ b/src/coreclr/jit/codegen.h
@@ -1231,9 +1231,14 @@ protected:
     unsigned genMove2IfNeeded(unsigned size, regNumber tmpReg, GenTree* srcAddr, unsigned offset);
     unsigned genMove1IfNeeded(unsigned size, regNumber tmpReg, GenTree* srcAddr, unsigned offset);
     void genCodeForLoadOffset(instruction ins, emitAttr size, regNumber dst, GenTree* base, unsigned offset);
+    void genStoreRegToStackArg(var_types type, regNumber reg, int offset);
     void genStructPutArgRepMovs(GenTreePutArgStk* putArgStkNode);
     void genStructPutArgUnroll(GenTreePutArgStk* putArgStkNode);
-    void genStoreRegToStackArg(var_types type, regNumber reg, int offset);
+#ifdef TARGET_X86
+    void genStructPutArgPush(GenTreePutArgStk* putArgStkNode);
+#else
+    void genStructPutArgPartialRepMovs(GenTreePutArgStk* putArgStkNode);
+#endif
 #endif // FEATURE_PUT_STRUCT_ARG_STK

     void genCodeForStoreBlk(GenTreeBlk* storeBlkNode);

diff --git a/src/coreclr/jit/codegenxarch.cpp b/src/coreclr/jit/codegenxarch.cpp
index 25cd51d..ab7f508 100644
--- a/src/coreclr/jit/codegenxarch.cpp
+++ b/src/coreclr/jit/codegenxarch.cpp
@@ -3337,108 +3337,64 @@ void CodeGen::genStructPutArgUnroll(GenTreePutArgStk* putArgNode)
     GenTree* src = putArgNode->AsOp()->gtOp1;
     // We will never call this method for SIMD types, which are stored directly
     // in genPutStructArgStk().
-    noway_assert(src->TypeGet() == TYP_STRUCT);
+    assert(src->isContained() && src->OperIs(GT_OBJ) && src->TypeIs(TYP_STRUCT));
+    assert(!src->AsObj()->GetLayout()->HasGCPtr());
+#ifdef TARGET_X86
+    assert(!m_pushStkArg);
+#endif

     unsigned size = putArgNode->GetStackByteSize();
-    assert(size <= CPBLK_UNROLL_LIMIT);
-
-    emitter* emit         = GetEmitter();
-    unsigned putArgOffset = putArgNode->getArgOffset();
-
-    assert(src->isContained());
-
-    assert(src->gtOper == GT_OBJ);
+    assert((XMM_REGSIZE_BYTES <= size) && (size <= CPBLK_UNROLL_LIMIT));

     if (src->AsOp()->gtOp1->isUsedFromReg())
     {
         genConsumeReg(src->AsOp()->gtOp1);
     }

-    unsigned offset = 0;
-
+    unsigned  offset     = 0;
     regNumber xmmTmpReg  = REG_NA;
     regNumber intTmpReg  = REG_NA;
     regNumber longTmpReg = REG_NA;

-#ifdef TARGET_X86
-    // On x86 we use an XMM register for both 16 and 8-byte chunks, but if it's
-    // less than 16 bytes, we will just be using pushes
-    if (size >= 8)
-    {
-        xmmTmpReg  = putArgNode->GetSingleTempReg(RBM_ALLFLOAT);
-        longTmpReg = xmmTmpReg;
-    }
-    if ((size & 0x7) != 0)
-    {
-        intTmpReg = putArgNode->GetSingleTempReg(RBM_ALLINT);
-    }
-#else  // !TARGET_X86
-    // On x64 we use an XMM register only for 16-byte chunks.
+
     if (size >= XMM_REGSIZE_BYTES)
     {
         xmmTmpReg = putArgNode->GetSingleTempReg(RBM_ALLFLOAT);
     }
-    if ((size & 0xf) != 0)
+    if ((size % XMM_REGSIZE_BYTES) != 0)
     {
-        intTmpReg  = putArgNode->GetSingleTempReg(RBM_ALLINT);
-        longTmpReg = intTmpReg;
+        intTmpReg = putArgNode->GetSingleTempReg(RBM_ALLINT);
     }
-#endif // !TARGET_X86

-    // If the size of this struct is larger than 16 bytes
-    // let's use SSE2 to be able to do 16 byte at a time
-    // loads and stores.
-    if (size >= XMM_REGSIZE_BYTES)
-    {
 #ifdef TARGET_X86
-        assert(!m_pushStkArg);
-#endif // TARGET_X86
-        size_t slots = size / XMM_REGSIZE_BYTES;
-
-        assert(putArgNode->gtGetOp1()->isContained());
-        assert(putArgNode->gtGetOp1()->AsOp()->gtOper == GT_OBJ);
+    longTmpReg = xmmTmpReg;
+#else
+    longTmpReg = intTmpReg;
+#endif

+    // Let's use SSE2 to be able to do 16-byte-at-a-time loads and stores.
+    size_t slots = size / XMM_REGSIZE_BYTES;
+    while (slots-- > 0)
+    {
         // TODO: In the below code the load and store instructions are for 16 bytes, but the
-        //       type is EA_8BYTE. The movdqa/u are 16 byte instructions, so it works, but
-        //       this probably needs to be changed.
-        while (slots-- > 0)
-        {
-            // Load
-            genCodeForLoadOffset(INS_movdqu, EA_8BYTE, xmmTmpReg, src->gtGetOp1(), offset);
+        //       type is EA_8BYTE. The movdqa/u are 16 byte instructions, so it works, but
+        //       this probably needs to be changed.

-            // Store
-            genStoreRegToStackArg(TYP_STRUCT, xmmTmpReg, offset);
+        // Load
+        genCodeForLoadOffset(INS_movdqu, EA_8BYTE, xmmTmpReg, src->gtGetOp1(), offset);
+        // Store
+        genStoreRegToStackArg(TYP_STRUCT, xmmTmpReg, offset);

-            offset += XMM_REGSIZE_BYTES;
-        }
+        offset += XMM_REGSIZE_BYTES;
     }

     // Fill the remainder (15 bytes or less) if there's one.
-    if ((size & 0xf) != 0)
+    if ((size % XMM_REGSIZE_BYTES) != 0)
     {
-#ifdef TARGET_X86
-        if (m_pushStkArg)
-        {
-            // This case is currently supported only for the case where the total size is
-            // less than XMM_REGSIZE_BYTES. We need to push the remaining chunks in reverse
-            // order. However, morph has ensured that we have a struct that is an even
-            // multiple of TARGET_POINTER_SIZE, so we don't need to worry about alignment.
-            assert(((size & 0xc) == size) && (offset == 0));
-            // If we have a 4 byte chunk, load it from either offset 0 or 8, depending on
-            // whether we've got an 8 byte chunk, and then push it on the stack.
-            unsigned pushedBytes = genMove4IfNeeded(size, intTmpReg, src->AsOp()->gtOp1, size & 0x8);
-            // Now if we have an 8 byte chunk, load it from offset 0 (it's the first chunk)
-            // and push it on the stack.
-            pushedBytes += genMove8IfNeeded(size, longTmpReg, src->AsOp()->gtOp1, 0);
-        }
-        else
-#endif // TARGET_X86
-        {
-            offset += genMove8IfNeeded(size, longTmpReg, src->AsOp()->gtOp1, offset);
-            offset += genMove4IfNeeded(size, intTmpReg, src->AsOp()->gtOp1, offset);
-            offset += genMove2IfNeeded(size, intTmpReg, src->AsOp()->gtOp1, offset);
-            offset += genMove1IfNeeded(size, intTmpReg, src->AsOp()->gtOp1, offset);
-            assert(offset == size);
-        }
+        offset += genMove8IfNeeded(size, longTmpReg, src->AsOp()->gtOp1, offset);
+        offset += genMove4IfNeeded(size, intTmpReg, src->AsOp()->gtOp1, offset);
+        offset += genMove2IfNeeded(size, intTmpReg, src->AsOp()->gtOp1, offset);
+        offset += genMove1IfNeeded(size, intTmpReg, src->AsOp()->gtOp1, offset);
+        assert(offset == size);
     }
 }

@@ -3453,18 +3409,180 @@ void CodeGen::genStructPutArgUnroll(GenTreePutArgStk* putArgNode)
 //
 void CodeGen::genStructPutArgRepMovs(GenTreePutArgStk* putArgNode)
 {
-    GenTree* srcAddr = putArgNode->gtGetOp1();
-    assert(srcAddr->TypeGet() == TYP_STRUCT);
+    GenTree* src = putArgNode->gtGetOp1();
+    assert(src->TypeGet() == TYP_STRUCT);
+    assert(!src->AsObj()->GetLayout()->HasGCPtr());

     // Make sure we got the arguments of the cpblk operation in the right registers, and that
-    // 'srcAddr' is contained as expected.
+    // 'src' is contained as expected.
     assert(putArgNode->gtRsvdRegs == (RBM_RDI | RBM_RCX | RBM_RSI));
-    assert(srcAddr->isContained());
+    assert(src->isContained());

     genConsumePutStructArgStk(putArgNode, REG_RDI, REG_RSI, REG_RCX);
     instGen(INS_r_movsb);
 }

+#ifdef TARGET_X86
+//------------------------------------------------------------------------
+// genStructPutArgPush: Generates code for passing a struct arg by value on the stack using "push".
+//
+// Arguments:
+//    putArgNode - the PutArgStk tree.
+//
+// Notes:
+//    Used only on x86, in two cases:
+//    - Structs 4, 8, or 12 bytes in size (less than XMM_REGSIZE_BYTES, multiple of TARGET_POINTER_SIZE).
+//    - Structs that contain GC pointers - they are guaranteed to be sized correctly by the VM.
+//
+void CodeGen::genStructPutArgPush(GenTreePutArgStk* putArgNode)
+{
+    // On x86, any struct that contains GC references must be stored to the stack using `push` instructions so
+    // that the emitter properly detects the need to update the method's GC information.
+    //
+    // Strictly speaking, it is only necessary to use "push" to store the GC references themselves, so for structs
+    // with large numbers of consecutive non-GC-ref-typed fields, we may be able to improve the code size in the
+    // future.
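+    //
+    // For illustration (hypothetical address register): for a 12-byte struct whose address is in
+    // EAX, the loop below emits "push dword ptr [eax+8]", then "push dword ptr [eax+4]", then
+    // "push dword ptr [eax]" - the slots are pushed in reverse order because x86 pushes move the
+    // stack pointer down, so this leaves the struct correctly laid out in the outgoing area.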
+    assert(m_pushStkArg);
+
+    GenTree* src     = putArgNode->Data();
+    GenTree* srcAddr = putArgNode->Data()->AsObj()->Addr();
+
+    regNumber  srcAddrReg   = srcAddr->GetRegNum();
+    const bool srcAddrInReg = srcAddrReg != REG_NA;
+
+    unsigned srcLclNum    = 0;
+    unsigned srcLclOffset = 0;
+    if (srcAddrInReg)
+    {
+        srcAddrReg = genConsumeReg(srcAddr);
+    }
+    else
+    {
+        assert(srcAddr->OperIsLocalAddr());
+
+        srcLclNum    = srcAddr->AsLclVarCommon()->GetLclNum();
+        srcLclOffset = srcAddr->AsLclVarCommon()->GetLclOffs();
+    }
+
+    ClassLayout*   layout   = src->AsObj()->GetLayout();
+    const unsigned byteSize = putArgNode->GetStackByteSize();
+    assert((byteSize % TARGET_POINTER_SIZE == 0) && ((byteSize < XMM_REGSIZE_BYTES) || layout->HasGCPtr()));
+    const unsigned numSlots = byteSize / TARGET_POINTER_SIZE;
+    assert(putArgNode->gtNumSlots == numSlots);
+
+    for (int i = numSlots - 1; i >= 0; --i)
+    {
+        emitAttr       slotAttr   = emitTypeSize(layout->GetGCPtrType(i));
+        const unsigned byteOffset = i * TARGET_POINTER_SIZE;
+        if (srcAddrInReg)
+        {
+            GetEmitter()->emitIns_AR_R(INS_push, slotAttr, REG_NA, srcAddrReg, byteOffset);
+        }
+        else
+        {
+            GetEmitter()->emitIns_S(INS_push, slotAttr, srcLclNum, srcLclOffset + byteOffset);
+        }
+
+        AddStackLevel(TARGET_POINTER_SIZE);
+    }
+}
+#endif // TARGET_X86
+
+#ifndef TARGET_X86
+//------------------------------------------------------------------------
+// genStructPutArgPartialRepMovs: Generates code for passing a struct arg by value on the stack using
+// a mix of pointer-sized stores, "movsq" and "rep movsd".
+//
+// Arguments:
+//    putArgNode - the PutArgStk tree.
+//
+// Notes:
+//    Used on non-x86 targets (Unix x64) for structs with GC pointers.
+//
+void CodeGen::genStructPutArgPartialRepMovs(GenTreePutArgStk* putArgNode)
+{
+    // Consume these registers.
+    // They may now contain gc pointers (depending on their type; gcMarkRegPtrVal will "do the right thing").
+    genConsumePutStructArgStk(putArgNode, REG_RDI, REG_RSI, REG_NA);
+
+    GenTreeObj*    src         = putArgNode->gtGetOp1()->AsObj();
+    ClassLayout*   layout      = src->GetLayout();
+    const bool     srcIsLocal  = src->Addr()->OperIsLocalAddr();
+    const emitAttr srcAddrAttr = srcIsLocal ? EA_PTRSIZE : EA_BYREF;
+
+#if DEBUG
+    unsigned numGCSlotsCopied = 0;
+#endif // DEBUG
+
+    assert(layout->HasGCPtr());
+    const unsigned byteSize = putArgNode->GetStackByteSize();
+    assert(byteSize % TARGET_POINTER_SIZE == 0);
+    const unsigned numSlots = byteSize / TARGET_POINTER_SIZE;
+    assert(putArgNode->gtNumSlots == numSlots);
+
+    // No need to disable GC the way COPYOBJ does. Here the refs are copied in atomic operations always.
+    for (unsigned i = 0; i < numSlots;)
+    {
+        if (!layout->IsGCPtr(i))
+        {
+            // Let's see if we can use rep movsp (alias for movsd or movsq for 32 and 64 bits respectively)
+            // instead of a sequence of movsp instructions to save cycles and code size.
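+            //
+            // For example (assuming CPOBJ_NONGC_SLOTS_LIMIT == 4): a run of three adjacent non-GC
+            // slots is copied with three "movsp" instructions, while a run of four or more becomes
+            // "mov ecx, <count>" followed by "rep movsp".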
+            unsigned adjacentNonGCSlotCount = 0;
+            do
+            {
+                adjacentNonGCSlotCount++;
+                i++;
+            } while ((i < numSlots) && !layout->IsGCPtr(i));
+
+            // If we have a very small contiguous non-ref region, it's better just to
+            // emit a sequence of movsp instructions
+            if (adjacentNonGCSlotCount < CPOBJ_NONGC_SLOTS_LIMIT)
+            {
+                for (; adjacentNonGCSlotCount > 0; adjacentNonGCSlotCount--)
+                {
+                    instGen(INS_movsp);
+                }
+            }
+            else
+            {
+                GetEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, REG_RCX, adjacentNonGCSlotCount);
+                instGen(INS_r_movsp);
+            }
+        }
+        else
+        {
+            // We have a GC (byref or ref) pointer
+            // TODO-Amd64-Unix: Here a better solution (for code size and CQ) would be to use movsp instruction,
+            // but the logic for emitting a GC info record is not available (it is internal for the emitter
+            // only.) See emitGCVarLiveUpd function. If we could call it separately, we could do
+            // instGen(INS_movsp); and emission of gc info.
+
+            var_types memType = layout->GetGCPtrType(i);
+            GetEmitter()->emitIns_R_AR(ins_Load(memType), emitTypeSize(memType), REG_RCX, REG_RSI, 0);
+            genStoreRegToStackArg(memType, REG_RCX, i * TARGET_POINTER_SIZE);
+#ifdef DEBUG
+            numGCSlotsCopied++;
+#endif // DEBUG
+
+            i++;
+            if (i < numSlots)
+            {
+                // Source for the copy operation.
+                // If a LocalAddr, use EA_PTRSIZE - copy from stack.
+                // If not a LocalAddr, use EA_BYREF - the source location is not on the stack.
+                GetEmitter()->emitIns_R_I(INS_add, srcAddrAttr, REG_RSI, TARGET_POINTER_SIZE);
+
+                // Always copying to the stack - outgoing arg area
+                // (or the outgoing arg area of the caller for a tail call) - use EA_PTRSIZE.
+                GetEmitter()->emitIns_R_I(INS_add, EA_PTRSIZE, REG_RDI, TARGET_POINTER_SIZE);
+            }
+        }
+    }
+
+    assert(numGCSlotsCopied == layout->GetGCPtrCount());
+}
+#endif // !TARGET_X86
+
 //------------------------------------------------------------------------
 // If any Vector3 args are on stack and they are not pass-by-ref, the upper 32bits
 // must be cleared to zeroes. The native compiler doesn't clear the upper bits

@@ -7413,40 +7531,31 @@ bool CodeGen::genAdjustStackForPutArgStk(GenTreePutArgStk* putArgStk)
     }
 #endif // FEATURE_SIMD

-    // If the gtPutArgStkKind is one of the push types, we do not pre-adjust the stack.
-    // This is set in Lowering, and is true if and only if:
-    // - This argument contains any GC pointers OR
-    // - It is a GT_FIELD_LIST OR
-    // - It is less than 16 bytes in size.
-    CLANG_FORMAT_COMMENT_ANCHOR;
-
 #ifdef DEBUG
     switch (putArgStk->gtPutArgStkKind)
     {
         case GenTreePutArgStk::Kind::RepInstr:
         case GenTreePutArgStk::Kind::Unroll:
-            assert(!source->AsObj()->GetLayout()->HasGCPtr() && (argSize >= 16));
+            assert(!source->AsObj()->GetLayout()->HasGCPtr());
             break;
+
         case GenTreePutArgStk::Kind::Push:
         case GenTreePutArgStk::Kind::PushAllSlots:
-            assert(source->OperIs(GT_FIELD_LIST) || source->AsObj()->GetLayout()->HasGCPtr() || (argSize < 16));
+            assert(source->OperIs(GT_FIELD_LIST) || source->AsObj()->GetLayout()->HasGCPtr() ||
+                   (argSize < XMM_REGSIZE_BYTES));
             break;
-        case GenTreePutArgStk::Kind::Invalid:
+
         default:
-            assert(!"Uninitialized GenTreePutArgStk::Kind");
-            break;
+            unreached();
     }
 #endif // DEBUG

-    if (putArgStk->isPushKind())
-    {
-        m_pushStkArg = true;
-        return false;
-    }
-    else
+    // In lowering (see "LowerPutArgStk") we have determined what sort of instructions
+    // are going to be used for this node. If we will not be using "push"es, the stack
+    // needs to be adjusted first (such that the SP points to the base of the outgoing arg).
+    //
+    if (!putArgStk->isPushKind())
     {
-        m_pushStkArg = false;
-
         // If argSize is large, we need to probe the stack like we do in the prolog (genAllocLclFrame)
         // or for localloc (genLclHeap), to ensure we touch the stack pages sequentially, and don't miss
         // the stack guard pages. The prolog probes, but we don't know at this point how much higher
@@ -7467,8 +7576,13 @@ bool CodeGen::genAdjustStackForPutArgStk(GenTreePutArgStk* putArgStk)
         }

         AddStackLevel(argSize);
+        m_pushStkArg = false;
         return true;
     }
+
+    // Otherwise, "push" will be adjusting the stack for us.
+    m_pushStkArg = true;
+    return false;
 }

@@ -7685,10 +7799,6 @@ void CodeGen::genPutArgStkFieldList(GenTreePutArgStk* putArgStk)
 //
 // Arguments
 //    treeNode  - the GT_PUTARG_STK node
-//    targetType - the type of the treeNode
-//
-// Return value:
-//    None
 //
 void CodeGen::genPutArgStk(GenTreePutArgStk* putArgStk)
 {
@@ -7947,6 +8057,7 @@ void CodeGen::genStoreRegToStackArg(var_types type, regNumber srcReg, int offset
 //    corresponding to the argument area (where we will put the argument on the stack).
 //    For tail calls this is the baseVarNum = 0.
 //    For non tail calls this is the outgoingArgSpace.
+//
 void CodeGen::genPutStructArgStk(GenTreePutArgStk* putArgStk)
 {
     GenTree* source = putArgStk->gtGetOp1();
@@ -7972,151 +8083,30 @@ void CodeGen::genPutStructArgStk(GenTreePutArgStk* putArgStk)

     ClassLayout* layout = source->AsObj()->GetLayout();

-    if (!layout->HasGCPtr())
-    {
-        switch (putArgStk->gtPutArgStkKind)
-        {
-            case GenTreePutArgStk::Kind::RepInstr:
-                genStructPutArgRepMovs(putArgStk);
-                break;
-            case GenTreePutArgStk::Kind::Unroll:
-                genStructPutArgUnroll(putArgStk);
-                break;
-            case GenTreePutArgStk::Kind::Push:
-                genStructPutArgUnroll(putArgStk);
-                break;
-            default:
-                unreached();
-        }
-    }
-    else
+    switch (putArgStk->gtPutArgStkKind)
     {
-        // No need to disable GC the way COPYOBJ does. Here the refs are copied in atomic operations always.
-        CLANG_FORMAT_COMMENT_ANCHOR;
-
-#ifdef TARGET_X86
-        // On x86, any struct that has contains GC references must be stored to the stack using `push` instructions so
-        // that the emitter properly detects the need to update the method's GC information.
-        //
-        // Strictly speaking, it is only necessary to use `push` to store the GC references themselves, so for structs
-        // with large numbers of consecutive non-GC-ref-typed fields, we may be able to improve the code size in the
-        // future.
-        assert(m_pushStkArg);
-
-        GenTree*       srcAddr  = source->gtGetOp1();
-        const unsigned byteSize = putArgStk->GetStackByteSize();
-        assert(byteSize % TARGET_POINTER_SIZE == 0);
-        const unsigned numSlots = byteSize / TARGET_POINTER_SIZE;
-        assert(putArgStk->gtNumSlots == numSlots);
-
-        regNumber  srcRegNum    = srcAddr->GetRegNum();
-        const bool srcAddrInReg = srcRegNum != REG_NA;
-
-        unsigned srcLclNum    = 0;
-        unsigned srcLclOffset = 0;
-        if (srcAddrInReg)
-        {
-            genConsumeReg(srcAddr);
-        }
-        else
-        {
-            assert(srcAddr->OperIsLocalAddr());
-
-            srcLclNum    = srcAddr->AsLclVarCommon()->GetLclNum();
-            srcLclOffset = srcAddr->AsLclVarCommon()->GetLclOffs();
-        }
-
-        for (int i = numSlots - 1; i >= 0; --i)
-        {
-            emitAttr       slotAttr   = emitTypeSize(layout->GetGCPtrType(i));
-            const unsigned byteOffset = i * TARGET_POINTER_SIZE;
-            if (srcAddrInReg)
-            {
-                GetEmitter()->emitIns_AR_R(INS_push, slotAttr, REG_NA, srcRegNum, byteOffset);
-            }
-            else
-            {
-                GetEmitter()->emitIns_S(INS_push, slotAttr, srcLclNum, srcLclOffset + byteOffset);
-            }
-            AddStackLevel(TARGET_POINTER_SIZE);
-        }
-#else // !defined(TARGET_X86)
-
-        // Consume these registers.
-        // They may now contain gc pointers (depending on their type; gcMarkRegPtrVal will "do the right thing").
-        genConsumePutStructArgStk(putArgStk, REG_RDI, REG_RSI, REG_NA);
-
-        const bool     srcIsLocal  = putArgStk->gtOp1->AsObj()->gtOp1->OperIsLocalAddr();
-        const emitAttr srcAddrAttr = srcIsLocal ? EA_PTRSIZE : EA_BYREF;
-
-#if DEBUG
-        unsigned numGCSlotsCopied = 0;
-#endif // DEBUG
-
-        const unsigned byteSize = putArgStk->GetStackByteSize();
-        assert(byteSize % TARGET_POINTER_SIZE == 0);
-        const unsigned numSlots = byteSize / TARGET_POINTER_SIZE;
-        assert(putArgStk->gtNumSlots == numSlots);
-        for (unsigned i = 0; i < numSlots;)
-        {
-            if (!layout->IsGCPtr(i))
-            {
-                // Let's see if we can use rep movsp (alias for movsd or movsq for 32 and 64 bits respectively)
-                // instead of a sequence of movsp instructions to save cycles and code size.
-                unsigned adjacentNonGCSlotCount = 0;
-                do
-                {
-                    adjacentNonGCSlotCount++;
-                    i++;
-                } while ((i < numSlots) && !layout->IsGCPtr(i));
+        case GenTreePutArgStk::Kind::RepInstr:
+            genStructPutArgRepMovs(putArgStk);
+            break;

-                // If we have a very small contiguous non-ref region, it's better just to
-                // emit a sequence of movsp instructions
-                if (adjacentNonGCSlotCount < CPOBJ_NONGC_SLOTS_LIMIT)
-                {
-                    for (; adjacentNonGCSlotCount > 0; adjacentNonGCSlotCount--)
-                    {
-                        instGen(INS_movsp);
-                    }
-                }
-                else
-                {
-                    GetEmitter()->emitIns_R_I(INS_mov, EA_4BYTE, REG_RCX, adjacentNonGCSlotCount);
-                    instGen(INS_r_movsp);
-                }
-            }
-            else
-            {
-                // We have a GC (byref or ref) pointer
-                // TODO-Amd64-Unix: Here a better solution (for code size and CQ) would be to use movsp instruction,
-                // but the logic for emitting a GC info record is not available (it is internal for the emitter
-                // only.) See emitGCVarLiveUpd function. If we could call it separately, we could do
-                // instGen(INS_movsp); and emission of gc info.
-
-                var_types memType = layout->GetGCPtrType(i);
-                GetEmitter()->emitIns_R_AR(ins_Load(memType), emitTypeSize(memType), REG_RCX, REG_RSI, 0);
-                genStoreRegToStackArg(memType, REG_RCX, i * TARGET_POINTER_SIZE);
-#ifdef DEBUG
-                numGCSlotsCopied++;
-#endif // DEBUG
+#ifndef TARGET_X86
+        case GenTreePutArgStk::Kind::PartialRepInstr:
+            genStructPutArgPartialRepMovs(putArgStk);
+            break;
+#endif // !TARGET_X86

-                i++;
-                if (i < numSlots)
-                {
-                    // Source for the copy operation.
-                    // If a LocalAddr, use EA_PTRSIZE - copy from stack.
-                    // If not a LocalAddr, use EA_BYREF - the source location is not on the stack.
-                    GetEmitter()->emitIns_R_I(INS_add, srcAddrAttr, REG_RSI, TARGET_POINTER_SIZE);
-
-                    // Always copying to the stack - outgoing arg area
-                    // (or the outgoing arg area of the caller for a tail call) - use EA_PTRSIZE.
-                    GetEmitter()->emitIns_R_I(INS_add, EA_PTRSIZE, REG_RDI, TARGET_POINTER_SIZE);
-                }
-            }
-        }
+        case GenTreePutArgStk::Kind::Unroll:
+            genStructPutArgUnroll(putArgStk);
+            break;

-        assert(numGCSlotsCopied == layout->GetGCPtrCount());
+#ifdef TARGET_X86
+        case GenTreePutArgStk::Kind::Push:
+            genStructPutArgPush(putArgStk);
+            break;
 #endif // TARGET_X86
+
+        default:
+            unreached();
     }
 }

 #endif // defined(FEATURE_PUT_STRUCT_ARG_STK)

diff --git a/src/coreclr/jit/gentree.cpp b/src/coreclr/jit/gentree.cpp
index 8e1dd02..c627788 100644
--- a/src/coreclr/jit/gentree.cpp
+++ b/src/coreclr/jit/gentree.cpp
@@ -10758,6 +10758,9 @@ void Compiler::gtDispTree(GenTree* tree,
             case GenTreePutArgStk::Kind::RepInstr:
                 printf(" (RepInstr)");
                 break;
+            case GenTreePutArgStk::Kind::PartialRepInstr:
+                printf(" (PartialRepInstr)");
+                break;
             case GenTreePutArgStk::Kind::Unroll:
                 printf(" (Unroll)");
                 break;

diff --git a/src/coreclr/jit/gentree.h b/src/coreclr/jit/gentree.h
index 2647cd6..a9dcd48 100644
--- a/src/coreclr/jit/gentree.h
+++ b/src/coreclr/jit/gentree.h
@@ -6775,7 +6775,7 @@ public:
     // block node.

     enum class Kind : __int8{
-        Invalid, RepInstr, Unroll, Push, PushAllSlots,
+        Invalid, RepInstr, PartialRepInstr, Unroll, Push, PushAllSlots,
     };
     Kind gtPutArgStkKind;
 #endif

@@ -6825,6 +6825,11 @@ public:
 #endif
     }

+    GenTree*& Data()
+    {
+        return gtOp1;
+    }
+
 #if FEATURE_FASTTAILCALL
     bool putInIncomingArgArea() const
     {

diff --git a/src/coreclr/jit/lowerxarch.cpp b/src/coreclr/jit/lowerxarch.cpp
index 46b48c4..2bf0090 100644
--- a/src/coreclr/jit/lowerxarch.cpp
+++ b/src/coreclr/jit/lowerxarch.cpp
@@ -600,31 +600,41 @@ void Lowering::LowerPutArgStk(GenTreePutArgStk* putArgStk)

         // TODO-X86-CQ: The helper call either is not supported on x86 or required more work
         // (I don't know which).
-        if (size <= CPBLK_UNROLL_LIMIT && !layout->HasGCPtr())
+        if (!layout->HasGCPtr())
         {
 #ifdef TARGET_X86
             if (size < XMM_REGSIZE_BYTES)
             {
+                // Codegen for "Kind::Push" will always load bytes in TARGET_POINTER_SIZE
+                // chunks. As such, the correctness of this code depends on the fact that
+                // morph will copy any "mis-sized" (too small) non-local OBJs into a temp,
+                // thus preventing any possible out-of-bounds memory reads.
+                assert(((layout->GetSize() % TARGET_POINTER_SIZE) == 0) || src->OperIsLocalRead() ||
+                       (src->OperIsIndir() && src->AsIndir()->Addr()->IsLocalAddrExpr()));
                 putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push;
             }
             else
 #endif // TARGET_X86
+                if (size <= CPBLK_UNROLL_LIMIT)
             {
                 putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Unroll;
             }
+            else
+            {
+                putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::RepInstr;
+            }
         }
-#ifdef TARGET_X86
-        else if (layout->HasGCPtr())
+        else // There are GC pointers.
         {
+#ifdef TARGET_X86
             // On x86, we must use `push` to store GC references to the stack in order for the emitter to properly update
             // the function's GC info. These `putargstk` nodes will generate a sequence of `push` instructions.
             putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::Push;
+#else  // !TARGET_X86
+            putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::PartialRepInstr;
+#endif // !TARGET_X86
         }
-#endif // TARGET_X86
-        else
-        {
-            putArgStk->gtPutArgStkKind = GenTreePutArgStk::Kind::RepInstr;
-        }
+
         // Always mark the OBJ and ADDR as contained trees by the putarg_stk. The codegen will deal with this tree.
         MakeSrcContained(putArgStk, src);
         if (haveLocalAddr)

diff --git a/src/coreclr/jit/lsraxarch.cpp b/src/coreclr/jit/lsraxarch.cpp
index 715c325..f9e6977 100644
--- a/src/coreclr/jit/lsraxarch.cpp
+++ b/src/coreclr/jit/lsraxarch.cpp
@@ -1576,41 +1576,40 @@ int LinearScan::BuildPutArgStk(GenTreePutArgStk* putArgStk)
         return BuildSimple(putArgStk);
     }

-    ClassLayout* layout = src->AsObj()->GetLayout();
-
     ssize_t size = putArgStk->GetStackByteSize();
     switch (putArgStk->gtPutArgStkKind)
     {
-        case GenTreePutArgStk::Kind::Push:
-        case GenTreePutArgStk::Kind::PushAllSlots:
         case GenTreePutArgStk::Kind::Unroll:
             // If we have a remainder smaller than XMM_REGSIZE_BYTES, we need an integer temp reg.
-            if (!layout->HasGCPtr() && (size & (XMM_REGSIZE_BYTES - 1)) != 0)
+            if ((size % XMM_REGSIZE_BYTES) != 0)
             {
                 regMaskTP regMask = allRegs(TYP_INT);
                 buildInternalIntRegisterDefForNode(putArgStk, regMask);
             }

-#ifdef TARGET_X86
-            if (size >= 8)
-#else  // !TARGET_X86
             if (size >= XMM_REGSIZE_BYTES)
-#endif // !TARGET_X86
             {
-                // If we have a buffer larger than or equal to XMM_REGSIZE_BYTES on x64/ux,
-                // or larger than or equal to 8 bytes on x86, reserve an XMM register to use it for a
-                // series of 16-byte loads and stores.
+                // If we have a buffer larger than or equal to XMM_REGSIZE_BYTES, reserve
+                // an XMM register to use it for a series of 16-byte loads and stores.
                 buildInternalFloatRegisterDefForNode(putArgStk, internalFloatRegCandidates());
                 SetContainsAVXFlags();
             }
             break;

         case GenTreePutArgStk::Kind::RepInstr:
+#ifndef TARGET_X86
        case GenTreePutArgStk::Kind::PartialRepInstr:
+#endif
             buildInternalIntRegisterDefForNode(putArgStk, RBM_RDI);
             buildInternalIntRegisterDefForNode(putArgStk, RBM_RCX);
             buildInternalIntRegisterDefForNode(putArgStk, RBM_RSI);
             break;

+#ifdef TARGET_X86
+        case GenTreePutArgStk::Kind::Push:
+            break;
+#endif // TARGET_X86
+
         default:
             unreached();
     }
--
2.7.4
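
Note: a minimal standalone sketch (plain C++, separate from the JIT sources above; the slot
values and the 12-byte size are made up for illustration) of the push-ordering invariant that
genStructPutArgPush relies on - pushing slots from last to first reproduces the struct's
in-memory layout on a downward-growing stack:

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main()
    {
        // A 12-byte struct viewed as three pointer-sized (4-byte on x86) slots.
        const uint32_t structSlots[3] = {0x11111111, 0x22222222, 0x33333333};
        const int      numSlots       = 3;

        uint32_t  stack[3];
        uint32_t* sp = stack + numSlots; // the stack grows towards lower addresses

        // Mirror of the codegen loop: push the slots in reverse order.
        for (int i = numSlots - 1; i >= 0; --i)
        {
            --sp;                 // "push" first moves the stack pointer down...
            *sp = structSlots[i]; // ...then stores the slot at the new top.
        }

        // The outgoing argument area now holds the struct in its original layout.
        assert(memcmp(sp, structSlots, sizeof(structSlots)) == 0);
        return 0;
    }

This is also why genAdjustStackForPutArgStk returns false for the push kinds: the pushes
themselves perform the stack adjustment, so no explicit SP subtraction is needed up front.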