From: Steve MacLean Date: Wed, 3 May 2017 22:52:42 +0000 (-0400) Subject: [Arm64/Unix] Revise JIT_MemSet (#11217) X-Git-Tag: accepted/tizen/base/20180629.140029~1190 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=cac4e2e82e63c3d892c935262cfb2b04090acae9;p=platform%2Fupstream%2Fcoreclr.git [Arm64/Unix] Revise JIT_MemSet (#11217) * [Arm64/Unix] Revise JIT_MemSet Use DC ZVA Use stp Correctly handle short set lengths Simplify code & pseudo code Use uint*_t to make pseudo code more readable --- diff --git a/src/vm/arm64/crthelpers.S b/src/vm/arm64/crthelpers.S index 51774ae..c8b108c 100644 --- a/src/vm/arm64/crthelpers.S +++ b/src/vm/arm64/crthelpers.S @@ -18,62 +18,109 @@ // //void JIT_MemSet(void *dst, int val, SIZE_T count) // -// uintptr_t valEx = (unsigned char)val; +// uint64_t valEx = (unsigned char)val; // valEx = valEx | valEx << 8; // valEx = valEx | valEx << 16; // valEx = valEx | valEx << 32; // +// size_t dc_zva_size = 4ULL << DCZID_EL0.BS; +// +// uint64_t use_dc_zva = (val == 0) && !DCZID_EL0.p ? count / (2 * dc_zva_size) : 0; // ~Minimum size (assumes worst case alignment) +// // // If not aligned then make it 8-byte aligned -// if(((uintptr_t)dst&0x7) != 0) +// if(((uint64_t)dst&0xf) != 0) // { -// if(((uintptr_t)dst&0x3) == 0) +// // Calculate alignment we can do without exceeding count +// // Use math to avoid introducing more unpredictable branches +// // Due to inherent mod in lsr, ~7 is used instead of ~0 to handle count == 0 +// // Note logic will fail is count >= (1 << 61). But this exceeds max physical memory for arm64 +// uint8_t align = (dst & 0x7) & (~uint64_t(7) >> (countLeadingZeros(count) mod 64)) +// +// if(align&0x1) +// { +// *(unit8_t*)dst = (unit8_t)valEx; +// dst = (unit8_t*)dst + 1; +// count-=1; +// } +// +// if(align&0x2) +// { +// *(unit16_t*)dst = (unit16_t)valEx; +// dst = (unit16_t*)dst + 1; +// count-=2; +// } +// +// if(align&0x4) // { -// *(UINT*)dst = (UINT)valEx; -// dst = (UINT*)dst + 1; +// *(unit32_t*)dst = (unit32_t)valEx; +// dst = (unit32_t*)dst + 1; // count-=4; // } -// else if(((uintptr_t)dst&0x1) == 0) +// } +// +// if(use_dc_zva) +// { +// // If not aligned then make it aligned to dc_zva_size +// if(dst&0x8) // { -// while(count > 0 && ((uintptr_t)dst&0x7) != 0) -// { -// *(short*)dst = (short)valEx; -// dst = (short*)dst + 1; -// count-=2; -// } +// *(uint64_t*)dst = (uint64_t)valEx; +// dst = (uint64_t*)dst + 1; +// count-=8; // } -// else +// +// while(dst & (dc_zva_size - 1)) +// { +// *(uint64_t*)dst = valEx; +// dst = (uint64_t*)dst + 1; +// *(uint64_t*)dst = valEx; +// dst = (uint64_t*)dst + 1; +// count-=16; +// } +// +// count -= dc_zva_size; +// +// while(count >= 0) // { -// while(count > 0 && ((uintptr_t)dst&0x7) != 0) -// { -// *(char*)dst = (char)valEx; -// dst = (char*)dst + 1; -// count--; -// } +// dc_zva(dst); +// dst = (uint8_t*)dst + dc_zva_size; +// count-=dc_zva_size; // } +// +// count += dc_zva_size; // } // -// while(count >= 8) +// count-=16; +// +// while(count >= 0) // { -// *(uintptr_t*)dst = valEx; -// dst = (uintptr_t*)dst + 1; -// count-=8; +// *(uint64_t*)dst = valEx; +// dst = (uint64_t*)dst + 1; +// *(uint64_t*)dst = valEx; +// dst = (uint64_t*)dst + 1; +// count-=16; +// } +// +// if(count & 8) +// { +// *(uint64_t*)dst = valEx; +// dst = (uint64_t*)dst + 1; // } // // if(count & 4) // { -// *(UINT*)dst = (UINT)valEx; -// dst = (UINT*)dst + 1; +// *(uint32_t*)dst = (uint32_t)valEx; +// dst = (uint32_t*)dst + 1; // } // // if(count & 2) // { -// *(short*)dst = (short)valEx; -// dst = (short*)dst + 1; +// *(uint16_t*)dst = (uint16_t)valEx; +// dst = (uint16_t*)dst + 1; // } // // if(count & 1) // { -// *(char*)dst = (char)valEx; +// *(uint8_t*)dst = (uint8_t)valEx; // } // // @@ -85,68 +132,89 @@ // as C++ method. LEAF_ENTRY JIT_MemSet, _TEXT - uxtb w8,w1 - sxtw x8,w8 - orr x8,x8,x8, lsl #8 - orr x8,x8,x8, lsl #0x10 - orr x9,x8,x8, lsl #0x20 - and x8,x0,#7 - cbz x8,LOCAL_LABEL(JIT_MemSet_0x7c) - and x8,x0,#3 - cbnz x8,LOCAL_LABEL(JIT_MemSet_0x38) - str w9,[x0] - add x0,x0,#4 - mov x8,#-4 - add x2,x2,x8 - b LOCAL_LABEL(JIT_MemSet_0x7c) -LOCAL_LABEL(JIT_MemSet_0x38): - cbz x2,LOCAL_LABEL(JIT_MemSet_0x7c) - tbnz x0,#0,LOCAL_LABEL(JIT_MemSet_0x60) -LOCAL_LABEL(JIT_MemSet_0x40): - and x8,x0,#7 - cbz x8,LOCAL_LABEL(JIT_MemSet_0x7c) - strh w9,[x0] - add x0,x0,#2 - mov x8,#-2 - add x2,x2,x8 - cbnz x2,LOCAL_LABEL(JIT_MemSet_0x40) - b LOCAL_LABEL(JIT_MemSet_0x7c) -LOCAL_LABEL(JIT_MemSet_0x60): - and x8,x0,#7 - cbz x8,LOCAL_LABEL(JIT_MemSet_0x7c) - strb w9,[x0] - add x0,x0,#1 - mov x8,#-1 - add x2,x2,x8 - cbnz x2,LOCAL_LABEL(JIT_MemSet_0x60) -LOCAL_LABEL(JIT_MemSet_0x7c): - cmp x2,#8 - blo LOCAL_LABEL(JIT_MemSet_0xb8) - lsr x8,x2,#3 - mov x11,x8 - mov x10,x0 - add x8,x10,x11, lsl #3 + ands w8, w1, #0xff + mrs x3, DCZID_EL0 // x3 = DCZID_EL0 + mov x6, #4 + lsr x11, x2, #3 // x11 = count >> 3 + + orr w8, w8, w8, lsl #8 + and x5, x3, #0xf // x5 = dczid_el0.bs + csel x11, x11, xzr, eq // x11 = (val == 0) ? count >> 3 : 0 + tst x3, (1 << 4) + + orr w8, w8, w8, lsl #0x10 + csel x11, x11, xzr, eq // x11 = (val == 0) && !DCZID_EL0.p ? count >> 3 : 0 + ands x3, x0, #7 // x3 = dst & 7 + lsl x9, x6, x5 // x9 = size + + orr x8, x8, x8, lsl #0x20 + lsr x11, x11, x5 // x11 = (val == 0) && !DCZID_EL0.p ? count >> (3 + DCZID_EL0.bs) : 0 + sub x10, x9, #1 // x10 = mask + + b.eq LOCAL_LABEL(JIT_MemSet_0x80) + + movn x4, #7 + clz x5, x2 + lsr x4, x4, x5 + and x3, x3, x4 + + tbz x3, #0, LOCAL_LABEL(JIT_MemSet_0x2c) + strb w8, [x0], #1 + sub x2, x2, #1 +LOCAL_LABEL(JIT_MemSet_0x2c): + tbz x3, #1, LOCAL_LABEL(JIT_MemSet_0x5c) + strh w8, [x0], #2 + sub x2, x2, #2 +LOCAL_LABEL(JIT_MemSet_0x5c): + tbz x3, #2, LOCAL_LABEL(JIT_MemSet_0x80) + str w8, [x0], #4 + sub x2, x2, #4 +LOCAL_LABEL(JIT_MemSet_0x80): + cbz x11, LOCAL_LABEL(JIT_MemSet_0x9c) + tbz x0, #3, LOCAL_LABEL(JIT_MemSet_0x84) + str x8, [x0], #8 + sub x2, x2, #8 + + b LOCAL_LABEL(JIT_MemSet_0x85) +LOCAL_LABEL(JIT_MemSet_0x84): + stp x8, x8, [x0], #16 + sub x2, x2, #16 +LOCAL_LABEL(JIT_MemSet_0x85): + tst x0, x10 + b.ne LOCAL_LABEL(JIT_MemSet_0x84) + + b LOCAL_LABEL(JIT_MemSet_0x8a) +LOCAL_LABEL(JIT_MemSet_0x88): + dc zva, x0 + add x0, x0, x9 +LOCAL_LABEL(JIT_MemSet_0x8a): + subs x2, x2, x9 + b.ge LOCAL_LABEL(JIT_MemSet_0x88) + +LOCAL_LABEL(JIT_MemSet_0x8c): + add x2, x2, x9 + LOCAL_LABEL(JIT_MemSet_0x9c): - cmp x10,x8 - beq LOCAL_LABEL(JIT_MemSet_0xac) - str x9,[x10],#8 - b LOCAL_LABEL(JIT_MemSet_0x9c) -LOCAL_LABEL(JIT_MemSet_0xac): - mov x8,#-8 - madd x2,x11,x8,x2 - add x0,x0,x11, lsl #3 -LOCAL_LABEL(JIT_MemSet_0xb8): - tbz x2,#2,LOCAL_LABEL(JIT_MemSet_0xc4) - str w9,[x0] - add x0,x0,#4 -LOCAL_LABEL(JIT_MemSet_0xc4): - tbz x2,#1,LOCAL_LABEL(JIT_MemSet_0xd0) - strh w9,[x0] - add x0,x0,#2 -LOCAL_LABEL(JIT_MemSet_0xd0): - tbz x2,#0,LOCAL_LABEL(JIT_MemSet_0xd8) - strb w9,[x0] -LOCAL_LABEL(JIT_MemSet_0xd8): + b LOCAL_LABEL(JIT_MemSet_0xa8) +LOCAL_LABEL(JIT_MemSet_0xa0): + stp x8, x8, [x0], #16 +LOCAL_LABEL(JIT_MemSet_0xa8): + subs x2, x2, #16 + b.ge LOCAL_LABEL(JIT_MemSet_0xa0) + +LOCAL_LABEL(JIT_MemSet_0xb0): + tbz x2, #3, LOCAL_LABEL(JIT_MemSet_0xb4) + str x8, [x0], #8 +LOCAL_LABEL(JIT_MemSet_0xb4): + tbz x2, #2, LOCAL_LABEL(JIT_MemSet_0xc8) + str w8, [x0], #4 +LOCAL_LABEL(JIT_MemSet_0xc8): + tbz x2, #1, LOCAL_LABEL(JIT_MemSet_0xdc) + strh w8, [x0], #2 +LOCAL_LABEL(JIT_MemSet_0xdc): + tbz x2, #0, LOCAL_LABEL(JIT_MemSet_0xe8) + strb w8, [x0] +LOCAL_LABEL(JIT_MemSet_0xe8): ret lr LEAF_END_MARKED JIT_MemSet, _TEXT