From: Steve MacLean, Qualcomm Datacenter Technologies, Inc Date: Thu, 6 Apr 2017 17:58:14 +0000 (+0000) Subject: [Arm64] Enable cpblk loop unrolling X-Git-Tag: submit/tizen/20210909.063632~11030^2~7347^2~3 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=21887948b89612b5cb6fc68aa994e1f0812c119b;p=platform%2Fupstream%2Fdotnet%2Fruntime.git [Arm64] Enable cpblk loop unrolling Commit migrated from https://github.com/dotnet/coreclr/commit/e021cad8daf1464313dc3b3c603a519eb30f9d73 --- diff --git a/src/coreclr/src/jit/codegenarm64.cpp b/src/coreclr/src/jit/codegenarm64.cpp index c123136..421814a 100644 --- a/src/coreclr/src/jit/codegenarm64.cpp +++ b/src/coreclr/src/jit/codegenarm64.cpp @@ -3381,22 +3381,8 @@ void CodeGen::genCodeForLoadOffset(instruction ins, emitAttr size, regNumber dst // offset: distance from the base from which to load void CodeGen::genCodeForStoreOffset(instruction ins, emitAttr size, regNumber src, GenTree* base, unsigned offset) { -#if 0 - emitter *emit = getEmitter(); - - if (base->OperIsLocalAddr()) - { - if (base->gtOper == GT_LCL_FLD_ADDR) - offset += base->gtLclFld.gtLclOffs; - emit->emitIns_S_R(ins, size, src, base->gtLclVarCommon.gtLclNum, offset); - } - else - { - emit->emitIns_AR_R(ins, size, src, base->gtRegNum, offset); - } -#else // !0 - NYI("genCodeForStoreOffset"); -#endif // !0 + // For arm64 these functions are identical + genCodeForLoadOffset(ins, size, src, base, offset); } // Generates CpBlk code by performing a loop unroll @@ -3405,80 +3391,87 @@ void CodeGen::genCodeForStoreOffset(instruction ins, emitAttr size, regNumber sr // This may seem small but covers >95% of the cases in several framework assemblies. void CodeGen::genCodeForCpBlkUnroll(GenTreeBlk* cpBlkNode) { -#if 0 // Make sure we got the arguments of the cpblk operation in the right registers unsigned size = cpBlkNode->Size(); GenTreePtr dstAddr = cpBlkNode->Addr(); GenTreePtr source = cpBlkNode->Data(); - noway_assert(source->gtOper == GT_IND); - GenTreePtr srcAddr = source->gtGetOp1(); + GenTreePtr srcAddr = nullptr; assert((size != 0 ) && (size <= CPBLK_UNROLL_LIMIT)); emitter *emit = getEmitter(); - if (!srcAddr->isContained()) - genConsumeReg(srcAddr); + if (source->gtOper == GT_IND) + { + srcAddr = source->gtGetOp1(); + if (srcAddr->isUsedFromReg()) + { + genConsumeReg(srcAddr); + } + } + else + { + noway_assert(source->IsLocal()); + // TODO-Cleanup: Consider making the addrForm() method in Rationalize public, e.g. in GenTree. + // OR: transform source to GT_IND(GT_LCL_VAR_ADDR) + if (source->OperGet() == GT_LCL_VAR) + { + source->SetOper(GT_LCL_VAR_ADDR); + } + else + { + assert(source->OperGet() == GT_LCL_FLD); + source->SetOper(GT_LCL_FLD_ADDR); + } + srcAddr = source; + } - if (!dstAddr->isContained()) + if (dstAddr->isUsedFromReg()) + { genConsumeReg(dstAddr); + } unsigned offset = 0; - // If the size of this struct is larger than 16 bytes - // let's use SSE2 to be able to do 16 byte at a time - // loads and stores. - if (size >= XMM_REGSIZE_BYTES) + // Grab the integer temp register to emit the loads and stores. + regNumber tmpReg = genRegNumFromMask(cpBlkNode->gtRsvdRegs & RBM_ALLINT); + + if (size >= REGSIZE_BYTES) { - assert(cpBlkNode->gtRsvdRegs != RBM_NONE); - assert(genCountBits(cpBlkNode->gtRsvdRegs) == 1); - regNumber xmmReg = genRegNumFromMask(cpBlkNode->gtRsvdRegs); - assert(genIsValidFloatReg(xmmReg)); - size_t slots = size / XMM_REGSIZE_BYTES; + // TODO-ARM64-CQ: Consider using LDP/STP to save codesize. + size_t slots = size / REGSIZE_BYTES; while (slots-- > 0) { // Load - genCodeForLoadOffset(INS_movdqu, EA_8BYTE, xmmReg, srcAddr, offset); + genCodeForLoadOffset(INS_ldr, EA_8BYTE, tmpReg, srcAddr, offset); // Store - genCodeForStoreOffset(INS_movdqu, EA_8BYTE, xmmReg, dstAddr, offset); - offset += XMM_REGSIZE_BYTES; + genCodeForStoreOffset(INS_str, EA_8BYTE, tmpReg, dstAddr, offset); + offset += REGSIZE_BYTES; } } // Fill the remainder (15 bytes or less) if there's one. - if ((size & 0xf) != 0) + if ((size & 0x7) != 0) { - // Grab the integer temp register to emit the remaining loads and stores. - regNumber tmpReg = genRegNumFromMask(cpBlkNode->gtRsvdRegs & RBM_ALLINT); - - if ((size & 8) != 0) - { - genCodeForLoadOffset(INS_mov, EA_8BYTE, tmpReg, srcAddr, offset); - genCodeForStoreOffset(INS_mov, EA_8BYTE, tmpReg, dstAddr, offset); - offset += 8; - } if ((size & 4) != 0) { - genCodeForLoadOffset(INS_mov, EA_4BYTE, tmpReg, srcAddr, offset); - genCodeForStoreOffset(INS_mov, EA_4BYTE, tmpReg, dstAddr, offset); + genCodeForLoadOffset(INS_ldr, EA_4BYTE, tmpReg, srcAddr, offset); + genCodeForStoreOffset(INS_str, EA_4BYTE, tmpReg, dstAddr, offset); offset += 4; } if ((size & 2) != 0) { - genCodeForLoadOffset(INS_mov, EA_2BYTE, tmpReg, srcAddr, offset); - genCodeForStoreOffset(INS_mov, EA_2BYTE, tmpReg, dstAddr, offset); + genCodeForLoadOffset(INS_ldrh, EA_2BYTE, tmpReg, srcAddr, offset); + genCodeForStoreOffset(INS_strh, EA_2BYTE, tmpReg, dstAddr, offset); offset += 2; } if ((size & 1) != 0) { - genCodeForLoadOffset(INS_mov, EA_1BYTE, tmpReg, srcAddr, offset); - genCodeForStoreOffset(INS_mov, EA_1BYTE, tmpReg, dstAddr, offset); + genCodeForLoadOffset(INS_ldrb, EA_1BYTE, tmpReg, srcAddr, offset); + genCodeForStoreOffset(INS_strb, EA_1BYTE, tmpReg, dstAddr, offset); } } -#else // !0 - NYI("genCodeForCpBlkUnroll"); -#endif // !0 } // Generate code for CpObj nodes wich copy structs that have interleaved @@ -3618,13 +3611,10 @@ void CodeGen::genCodeForCpBlk(GenTreeBlk* cpBlkNode) genConsumeBlockOp(cpBlkNode, REG_ARG_0, REG_ARG_1, REG_ARG_2); -#if 0 - // Enable this when we support cpblk loop unrolling. if (blockSize != 0) { - assert(blockSize->gtIntCon.gtIconVal >= CPBLK_UNROLL_LIMIT); + assert(blockSize > CPBLK_UNROLL_LIMIT); } -#endif // 0 genEmitHelperCall(CORINFO_HELP_MEMCPY, 0, EA_UNKNOWN); } diff --git a/src/coreclr/src/jit/lowerarm64.cpp b/src/coreclr/src/jit/lowerarm64.cpp index f5bc55e..a29db38 100644 --- a/src/coreclr/src/jit/lowerarm64.cpp +++ b/src/coreclr/src/jit/lowerarm64.cpp @@ -207,19 +207,11 @@ void Lowering::LowerBlockStore(GenTreeBlk* blkNode) short internalIntCount = 0; regMaskTP internalIntCandidates = RBM_NONE; -#if 0 - // In case of a CpBlk with a constant size and less than CPBLK_UNROLL_LIMIT size - // we should unroll the loop to improve CQ. - // For reference see the code in lowerxarch.cpp. - - // TODO-ARM64-CQ: cpblk loop unrolling is currently not implemented. - if ((size != 0) && (size <= INITBLK_UNROLL_LIMIT)) { blkNode->gtBlkOpKind = GenTreeBlk::BlkOpKindUnroll; } else -#endif // 0 { // In case we have a constant integer this means we went beyond // CPBLK_UNROLL_LIMIT bytes of size, still we should never have the case of diff --git a/src/coreclr/src/jit/lsraarm64.cpp b/src/coreclr/src/jit/lsraarm64.cpp index cba99ba..93f01a3 100644 --- a/src/coreclr/src/jit/lsraarm64.cpp +++ b/src/coreclr/src/jit/lsraarm64.cpp @@ -1255,16 +1255,12 @@ void Lowering::TreeNodeInfoInitBlockStore(GenTreeBlk* blkNode) short internalIntCount = 0; regMaskTP internalIntCandidates = RBM_NONE; -#if 0 if (blkNode->gtBlkOpKind == GenTreeBlk::BlkOpKindUnroll) { - // TODO-ARM64-CQ: cpblk loop unrolling is currently not implemented. - // In case of a CpBlk with a constant size and less than CPBLK_UNROLL_LIMIT size - // we should unroll the loop to improve CQ. - // For reference see the code in lsraxarch.cpp. + internalIntCount = 1; + internalIntCandidates = RBM_ALLINT; } else -#endif // 0 { assert(blkNode->gtBlkOpKind == GenTreeBlk::BlkOpKindHelper); dstAddr->gtLsraInfo.setSrcCandidates(l, RBM_ARG_0);