if (genUseBlockInit)
{
assert(untrLclHi > untrLclLo);
-#ifdef TARGET_ARMARCH
- /*
- Generate the following code:
-
- For cnt less than 10
-
- mov rZero1, 0
- mov rZero2, 0
- mov rCnt, <cnt>
- stm <rZero1,rZero2>,[rAddr!]
- <optional> stm <rZero1,rZero2>,[rAddr!]
- <optional> stm <rZero1,rZero2>,[rAddr!]
- <optional> stm <rZero1,rZero2>,[rAddr!]
- <optional> str rZero1,[rAddr]
-
- For rCnt greater than or equal to 10
-
- mov rZero1, 0
- mov rZero2, 0
- mov rCnt, <cnt/2>
- sub rAddr, sp, OFFS
-
- loop:
- stm <rZero1,rZero2>,[rAddr!]
- sub rCnt,rCnt,1
- jnz loop
-
- <optional> str rZero1,[rAddr] // When cnt is odd
-
- NOTE: for ARM64, the instruction is stp, not stm. And we can use ZR instead of allocating registers.
- */
+#ifdef TARGET_ARM
+ // Generate the following code:
+ //
+ // For cnt less than 10
+ //
+ // mov rZero1, 0
+ // mov rZero2, 0
+ // mov rCnt, <cnt>
+ // stm <rZero1,rZero2>,[rAddr!]
+ // <optional> stm <rZero1,rZero2>,[rAddr!]
+ // <optional> stm <rZero1,rZero2>,[rAddr!]
+ // <optional> stm <rZero1,rZero2>,[rAddr!]
+ // <optional> str rZero1,[rAddr]
+ //
+ // For cnt greater than or equal to 10
+ //
+ // mov rZero1, 0
+ // mov rZero2, 0
+ // mov rCnt, <cnt/2>
+ // sub rAddr, sp, OFFS
+ //
+ // loop:
+ // stm <rZero1,rZero2>,[rAddr!]
+ // subs rCnt,rCnt,1
+ // bhi loop
+ //
+ // <optional> str rZero1,[rAddr] // When cnt is odd
regNumber rAddr;
regNumber rCnt = REG_NA; // Invalid
availMask &= ~genRegMask(initReg); // Remove the pre-calculated initReg as we will zero it and maybe use it for
// a large constant.
-#if defined(TARGET_ARM)
-
if (compiler->compLocallocUsed)
{
availMask &= ~RBM_SAVED_LOCALLOC_SP; // Remove the register reserved when we have a localloc frame
rAddr = genRegNumFromMask(regMask);
availMask &= ~regMask;
-#else // !define(TARGET_ARM)
-
- rAddr = initReg;
- *pInitRegZeroed = false;
-
-#endif // !defined(TARGET_ARM)
-
bool useLoop = false;
unsigned uCntBytes = untrLclHi - untrLclLo;
assert((uCntBytes % sizeof(int)) == 0); // The smallest stack slot is always 4 bytes.
// rAddr is not a live incoming argument reg
assert((genRegMask(rAddr) & intRegState.rsCalleeRegArgMaskLiveIn) == 0);
-#if defined(TARGET_ARM)
if (arm_Valid_Imm_For_Add(untrLclLo, INS_FLAGS_DONT_CARE))
-#else // !TARGET_ARM
- if (emitter::emitIns_valid_imm_for_add(untrLclLo, EA_PTRSIZE))
-#endif // !TARGET_ARM
{
GetEmitter()->emitIns_R_R_I(INS_add, EA_PTRSIZE, rAddr, genFramePointerReg(), untrLclLo);
}
instGen_Set_Reg_To_Imm(EA_PTRSIZE, rCnt, (ssize_t)uCntSlots / 2);
}
-#if defined(TARGET_ARM)
rZero1 = genGetZeroReg(initReg, pInitRegZeroed);
instGen_Set_Reg_To_Zero(EA_PTRSIZE, rZero2);
target_ssize_t stmImm = (target_ssize_t)(genRegMask(rZero1) | genRegMask(rZero2));
-#endif // TARGET_ARM
if (!useLoop)
{
while (uCntBytes >= REGSIZE_BYTES * 2)
{
-#ifdef TARGET_ARM
GetEmitter()->emitIns_R_I(INS_stm, EA_PTRSIZE, rAddr, stmImm);
-#else // !TARGET_ARM
- GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, REG_ZR, REG_ZR, rAddr, 2 * REGSIZE_BYTES,
- INS_OPTS_POST_INDEX);
-#endif // !TARGET_ARM
uCntBytes -= REGSIZE_BYTES * 2;
}
}
- else // useLoop is true
+ else
{
-#ifdef TARGET_ARM
GetEmitter()->emitIns_R_I(INS_stm, EA_PTRSIZE, rAddr, stmImm); // zero stack slots
GetEmitter()->emitIns_R_I(INS_sub, EA_PTRSIZE, rCnt, 1, INS_FLAGS_SET);
-#else // !TARGET_ARM
- GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, REG_ZR, REG_ZR, rAddr, 2 * REGSIZE_BYTES,
- INS_OPTS_POST_INDEX); // zero stack slots
- GetEmitter()->emitIns_R_R_I(INS_subs, EA_PTRSIZE, rCnt, rCnt, 1);
-#endif // !TARGET_ARM
GetEmitter()->emitIns_J(INS_bhi, NULL, -3);
uCntBytes %= REGSIZE_BYTES * 2;
}
if (uCntBytes >= REGSIZE_BYTES) // check and zero the last register-sized stack slot (left over when the slot count is odd)
{
-#ifdef TARGET_ARM
GetEmitter()->emitIns_R_R_I(INS_str, EA_PTRSIZE, rZero1, rAddr, 0);
-#else // TARGET_ARM
- if ((uCntBytes - REGSIZE_BYTES) == 0)
+ uCntBytes -= REGSIZE_BYTES;
+ }
+
+ noway_assert(uCntBytes == 0);
+
+#elif defined(TARGET_ARM64)
+ int bytesToWrite = untrLclHi - untrLclLo;
+
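+ // zeroSimdReg is zeroed lazily (tracked by simdRegZeroed); storing a pair of zeroed
+ // 16-byte q-registers with a single stp writes 32 bytes of zeros per instruction.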
+ const regNumber zeroSimdReg = REG_ZERO_INIT_FRAME_SIMD;
+ bool simdRegZeroed = false;
+ const int simdRegPairSizeBytes = 2 * FP_REGSIZE_BYTES;
+
+ regNumber addrReg = REG_ZERO_INIT_FRAME_REG1;
+
+ if (addrReg == initReg)
+ {
+ *pInitRegZeroed = false;
+ }
+
+ int addrOffset = 0;
+
+ // The following invariants hold below:
+ //
+ // 1) [addrReg, #addrOffset] points at the location where the next chunk of zero bytes will be written;
+ // 2) bytesToWrite specifies the number of bytes on the frame that remain to be initialized;
+ // 3) if simdRegZeroed is true then the 128-bit wide zeroSimdReg contains zeroes.
+
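+ // Regions of at least bytesUseZeroingLoop (192) bytes are zeroed with a loop;
+ // smaller regions use a straight-line sequence of stores.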
+ const int bytesUseZeroingLoop = 192;
+
+ if (bytesToWrite >= bytesUseZeroingLoop)
+ {
+ // Generates the following code:
+ //
+ // When the size of the region is greater than or equal to 256 bytes
+ // **and** DC ZVA instruction use is permitted
+ // **and** the instruction block size is configured to 64 bytes:
+ //
+ // movi v16.16b, #0
+ // add x9, fp, #(untrLclLo+64)
+ // add x10, fp, #(untrLclHi-64)
+ // stp q16, q16, [x9, #-64]
+ // stp q16, q16, [x9, #-32]
+ // bfm x9, xzr, #0, #5
+ //
+ // loop:
+ // dc zva, x9
+ // add x9, x9, #64
+ // cmp x9, x10
+ // blo loop
+ //
+ // stp q16, q16, [x10]
+ // stp q16, q16, [x10, #32]
+ //
+ // Otherwise:
+ //
+ // movi v16.16b, #0
+ // add x9, fp, #(untrLclLo-32)
+ // mov x10, #(bytesToWrite-64)
+ //
+ // loop:
+ // stp q16, q16, [x9, #32]
+ // stp q16, q16, [x9, #64]!
+ // subs x10, x10, #64
+ // bge loop
+
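+ // dc zva zeroes one entire 64-byte block per instruction; it is only used for
+ // regions of at least bytesUseDataCacheZeroInstruction (256) bytes.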
+ const int bytesUseDataCacheZeroInstruction = 256;
+
+ GetEmitter()->emitIns_R_I(INS_movi, EA_16BYTE, zeroSimdReg, 0, INS_OPTS_16B);
+ simdRegZeroed = true;
+
+ if ((bytesToWrite >= bytesUseDataCacheZeroInstruction) &&
+ compiler->compOpportunisticallyDependsOn(InstructionSet_Dczva))
{
- GetEmitter()->emitIns_R_R_I(INS_str, EA_PTRSIZE, REG_ZR, rAddr, 0);
+ // The first and the last 64 bytes are each written with two stp q-register instructions.
+ // This avoids dc zva (which always zeroes an entire 64-byte block) unintentionally
+ // clearing data outside of the [fp+untrLclLo, fp+untrLclHi) memory region.
+
+ genInstrWithConstant(INS_add, EA_PTRSIZE, addrReg, genFramePointerReg(), untrLclLo + 64, addrReg);
+ addrOffset = -64;
+
+ const regNumber endAddrReg = REG_ZERO_INIT_FRAME_REG2;
+
+ if (endAddrReg == initReg)
+ {
+ *pInitRegZeroed = false;
+ }
+
+ genInstrWithConstant(INS_add, EA_PTRSIZE, endAddrReg, genFramePointerReg(), untrLclHi - 64, endAddrReg);
+
+ GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_16BYTE, zeroSimdReg, zeroSimdReg, addrReg, addrOffset);
+ addrOffset += simdRegPairSizeBytes;
+
+ GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_16BYTE, zeroSimdReg, zeroSimdReg, addrReg, addrOffset);
+ addrOffset += simdRegPairSizeBytes;
+
+ assert(addrOffset == 0);
+
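+ // bfm with xzr as the source clears bits [5:0] of addrReg, rounding it down
+ // to the 64-byte block boundary required by dc zva.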
+ GetEmitter()->emitIns_R_R_I_I(INS_bfm, EA_PTRSIZE, addrReg, REG_ZR, 0, 5);
+ // addrReg points at the beginning of a cache line.
+
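+ // Zeroing loop: each iteration zeroes one 64-byte block with dc zva and advances
+ // addrReg by 64; blo branches back to the dc zva while addrReg is below endAddrReg.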
+ GetEmitter()->emitIns_R(INS_dczva, EA_PTRSIZE, addrReg);
+ GetEmitter()->emitIns_R_R_I(INS_add, EA_PTRSIZE, addrReg, addrReg, 64);
+ GetEmitter()->emitIns_R_R(INS_cmp, EA_PTRSIZE, addrReg, endAddrReg);
+ GetEmitter()->emitIns_J(INS_blo, NULL, -4);
+
+ addrReg = endAddrReg;
+ bytesToWrite = 64;
}
else
{
- GetEmitter()->emitIns_R_R_I(INS_str, EA_PTRSIZE, REG_ZR, rAddr, REGSIZE_BYTES, INS_OPTS_POST_INDEX);
+ genInstrWithConstant(INS_add, EA_PTRSIZE, addrReg, genFramePointerReg(), untrLclLo - 32, addrReg);
+ addrOffset = 32;
+
+ const regNumber countReg = REG_ZERO_INIT_FRAME_REG2;
+
+ if (countReg == initReg)
+ {
+ *pInitRegZeroed = false;
+ }
+
+ instGen_Set_Reg_To_Imm(EA_PTRSIZE, countReg, bytesToWrite - 64);
+
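+ // Store loop: each iteration writes 64 bytes of zeros with two q-register stp stores,
+ // the second using pre-index writeback to advance addrReg; bge repeats the loop while
+ // countReg, decremented by 64, stays non-negative.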
+ GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_16BYTE, zeroSimdReg, zeroSimdReg, addrReg, 32);
+ GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_16BYTE, zeroSimdReg, zeroSimdReg, addrReg, 64,
+ INS_OPTS_PRE_INDEX);
+
+ GetEmitter()->emitIns_R_R_I(INS_subs, EA_PTRSIZE, countReg, countReg, 64);
+ GetEmitter()->emitIns_J(INS_bge, NULL, -4);
+
+ bytesToWrite %= 64;
}
-#endif // !TARGET_ARM
- uCntBytes -= REGSIZE_BYTES;
}
-#ifdef TARGET_ARM64
- if (uCntBytes > 0)
+ else
{
- assert(uCntBytes == sizeof(int));
- GetEmitter()->emitIns_R_R_I(INS_str, EA_4BYTE, REG_ZR, rAddr, 0);
- uCntBytes -= sizeof(int);
+ genInstrWithConstant(INS_add, EA_PTRSIZE, addrReg, genFramePointerReg(), untrLclLo, addrReg);
+ }
+
+ if (bytesToWrite >= simdRegPairSizeBytes)
+ {
+ // Generates the following code:
+ //
+ // movi v16.16b, #0
+ // stp q16, q16, [x9, #addrOffset]
+ // stp q16, q16, [x9, #(addrOffset+32)]
+ // ...
+ // stp q16, q16, [x9, #(addrOffset+roundDown(bytesToWrite, 32)-32)]
+
+ if (!simdRegZeroed)
+ {
+ GetEmitter()->emitIns_R_I(INS_movi, EA_16BYTE, zeroSimdReg, 0, INS_OPTS_16B);
+ simdRegZeroed = true;
+ }
+
+ for (; bytesToWrite >= simdRegPairSizeBytes; bytesToWrite -= simdRegPairSizeBytes)
+ {
+ GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_16BYTE, zeroSimdReg, zeroSimdReg, addrReg, addrOffset);
+ addrOffset += simdRegPairSizeBytes;
+ }
+ }
+
+ const int regPairSizeBytes = 2 * REGSIZE_BYTES;
+
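+ // Write any remaining 16-byte chunk as a pair of zero (xzr) registers with a single stp.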
+ if (bytesToWrite >= regPairSizeBytes)
+ {
+ GetEmitter()->emitIns_R_R_R_I(INS_stp, EA_PTRSIZE, REG_ZR, REG_ZR, addrReg, addrOffset);
+ addrOffset += regPairSizeBytes;
+ bytesToWrite -= regPairSizeBytes;
+ }
+
+ if (bytesToWrite >= REGSIZE_BYTES)
+ {
+ GetEmitter()->emitIns_R_R_I(INS_str, EA_PTRSIZE, REG_ZR, addrReg, addrOffset);
+ addrOffset += REGSIZE_BYTES;
+ bytesToWrite -= REGSIZE_BYTES;
+ }
+
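+ // At this point at most one 4-byte slot can remain (the smallest stack slot is 4 bytes).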
+ if (bytesToWrite == sizeof(int))
+ {
+ GetEmitter()->emitIns_R_R_I(INS_str, EA_4BYTE, REG_ZR, addrReg, addrOffset);
+ bytesToWrite = 0;
}
-#endif // TARGET_ARM64
- noway_assert(uCntBytes == 0);
+ assert(bytesToWrite == 0);
#elif defined(TARGET_XARCH)
assert(compiler->getSIMDSupportLevel() >= SIMD_SSE2_Supported);
emitter* emit = GetEmitter();