From 4a3e5664024add678a0147b8216f83b699eefcff Mon Sep 17 00:00:00 2001 From: Steve MacLean Date: Thu, 27 Apr 2017 00:50:37 -0400 Subject: [PATCH] [Arm64/Unix] Revise JIT_MemCpy (dotnet/coreclr#11143) * [Arm64/Unix] Revise JIT_MemCpy Use ldp/stp Correctly handle short copy lengths Simplify code & pseudo code Use uint*_t to make pseudo code more readable Commit migrated from https://github.com/dotnet/coreclr/commit/9a9f941d154b62eb66fd092e72c4ae0200facc42 --- src/coreclr/src/vm/arm64/crthelpers.S | 174 ++++++++++++++++------------------ 1 file changed, 84 insertions(+), 90 deletions(-) diff --git a/src/coreclr/src/vm/arm64/crthelpers.S b/src/coreclr/src/vm/arm64/crthelpers.S index 36eb4ee..51774ae 100644 --- a/src/coreclr/src/vm/arm64/crthelpers.S +++ b/src/coreclr/src/vm/arm64/crthelpers.S @@ -157,60 +157,74 @@ LEAF_END_MARKED JIT_MemSet, _TEXT // // If not aligned then make it 8-byte aligned // if(((uintptr_t)dst&0x7) != 0) // { -// if(((uintptr_t)dst&0x3) == 0) +// // Calculate alignment we can do without exceeding count +// // Use math to avoid introducing more unpredictable branches +// // Due to inherent mod in lsr, ~7 is used instead of ~0 to handle count == 0 +// // Note logic will fail if count >= (1 << 61). 
But this exceeds max physical memory for arm64 +// uint8_t align = (dst & 0x7) & (~uint64_t(7) >> (countLeadingZeros(count) mod 64)) +// +// if(align&0x1) // { -// *(UINT*)dst = *(UINT*)src; -// dst = (UINT*)dst + 1; -// src = (UINT*)src + 1; -// count-=4; +// *(uint8_t*)dst = *(uint8_t*)src; +// dst = (uint8_t*)dst + 1; +// src = (uint8_t*)src + 1; +// count-=1; // } -// else if(((uintptr_t)dst&0x1) == 0) +// +// if(align&0x2) // { -// while(count > 0 && ((uintptr_t)dst&0x7) != 0) -// { -// *(short*)dst = *(short*)src; -// dst = (short*)dst + 1; -// src = (short*)src + 1; -// count-=2; -// } +// *(uint16_t*)dst = *(uint16_t*)src; +// dst = (uint16_t*)dst + 1; +// src = (uint16_t*)src + 1; +// count-=2; // } -// else +// +// if(align&0x4) // { -// while(count > 0 && ((uintptr_t)dst&0x7) != 0) -// { -// *(char*)dst = *(char*)src; -// dst = (char*)dst + 1; -// src = (char*)src + 1; -// count--; -// } +// *(uint32_t*)dst = *(uint32_t*)src; +// dst = (uint32_t*)dst + 1; +// src = (uint32_t*)src + 1; +// count-=4; // } // } // -// while(count >= 8) +// count-=16; +// +// while(count >= 0) // { -// *(uintptr_t*)dst = *(uintptr_t*)src; -// dst = (uintptr_t*)dst + 1; -// src = (uintptr_t*)src + 1; -// count-=8; +// *(uint64_t*)dst = *(uint64_t*)src; +// dst = (uint64_t*)dst + 1; +// src = (uint64_t*)src + 1; +// *(uint64_t*)dst = *(uint64_t*)src; +// dst = (uint64_t*)dst + 1; +// src = (uint64_t*)src + 1; +// count-=16; +// } +// +// if(count & 8) +// { +// *(uint64_t*)dst = *(uint64_t*)src; +// dst = (uint64_t*)dst + 1; +// src = (uint64_t*)src + 1; // } // // if(count & 4) // { -// *(UINT*)dst = *(UINT*)src; -// dst = (UINT*)dst + 1; -// src = (UINT*)src + 1; +// *(uint32_t*)dst = *(uint32_t*)src; +// dst = (uint32_t*)dst + 1; +// src = (uint32_t*)src + 1; // } // // if(count & 2) // { -// *(short*)dst = *(short*)src; -// dst = (short*)dst + 1; -// src = (short*)src + 1; +// *(uint16_t*)dst = *(uint16_t*)src; +// dst = (uint16_t*)dst + 1; +// src = (uint16_t*)src + 1; // 
} // // if(count & 1) // { -// *(char*)dst = *(char*)src; +// *(uint8_t*)dst = *(uint8_t*)src; // } // // @@ -218,69 +232,49 @@ LEAF_END_MARKED JIT_MemSet, _TEXT // Assembly code corresponding to above C++ method. // See comments above for JIT_MemSet method LEAF_ENTRY JIT_MemCpy, _TEXT - and x8,x0,#7 - cbz x8,LOCAL_LABEL(JIT_MemCpy_0x80) - and x8,x0,#3 - cbnz x8,LOCAL_LABEL(JIT_MemCpy_0x2c) - ldr w8,[x1] - str w8,[x0] - add x0,x0,#4 - add x1,x1,#4 - mov x8,#-4 - add x2,x2,x8 - b LOCAL_LABEL(JIT_MemCpy_0x80) + ands x3, x0, #7 + movn x4, #7 + clz x5, x2 + b.eq LOCAL_LABEL(JIT_MemCpy_0xa8) + lsr x4, x4, x5 + and x3, x3, x4 + tbz x3, #0, LOCAL_LABEL(JIT_MemCpy_0x2c) + ldrsb w8, [x1], #1 + strb w8, [x0], #1 + sub x2, x2, #1 LOCAL_LABEL(JIT_MemCpy_0x2c): - cbz x2,LOCAL_LABEL(JIT_MemCpy_0x80) - tbnz x0,#0,LOCAL_LABEL(JIT_MemCpy_0x5c) -LOCAL_LABEL(JIT_MemCpy_0x34): - and x8,x0,#7 - cbz x8,LOCAL_LABEL(JIT_MemCpy_0x80) - ldrsh w8,[x1] - strh w8,[x0] - add x0,x0,#2 - add x1,x1,#2 - mov x8,#-2 - add x2,x2,x8 - cbnz x2,LOCAL_LABEL(JIT_MemCpy_0x34) - b LOCAL_LABEL(JIT_MemCpy_0x80) + tbz x3, #1, LOCAL_LABEL(JIT_MemCpy_0x5c) + ldrsh w8, [x1], #2 + strh w8, [x0], #2 + sub x2, x2, #2 LOCAL_LABEL(JIT_MemCpy_0x5c): - and x8,x0,#7 - cbz x8,LOCAL_LABEL(JIT_MemCpy_0x80) - ldrsb w8,[x1] - strb w8,[x0] - add x0,x0,#1 - add x1,x1,#1 - mov x8,#-1 - add x2,x2,x8 - cbnz x2,LOCAL_LABEL(JIT_MemCpy_0x5c) -LOCAL_LABEL(JIT_MemCpy_0x80): - cmp x2,#8 - blo LOCAL_LABEL(JIT_MemCpy_0xb4) - lsr x9,x2,#3 - mov x8,#-8 - madd x2,x9,x8,x2 + tbz x3, #2, LOCAL_LABEL(JIT_MemCpy_0xa8) + ldr w8, [x1], #4 + str w8, [x0], #4 + sub x2, x2, #4 + b LOCAL_LABEL(JIT_MemCpy_0xa8) LOCAL_LABEL(JIT_MemCpy_0xa0): - ldr x8,[x1],#8 - str x8,[x0],#8 - mov x8,#-1 - add x9,x9,x8 - cbnz x9,LOCAL_LABEL(JIT_MemCpy_0xa0) + ldp x8, x9, [x1], #16 + stp x8, x9, [x0], #16 +LOCAL_LABEL(JIT_MemCpy_0xa8): + subs x2, x2, #16 + b.ge LOCAL_LABEL(JIT_MemCpy_0xa0) +LOCAL_LABEL(JIT_MemCpy_0xb0): + tbz x2, #3, LOCAL_LABEL(JIT_MemCpy_0xb4) + ldr 
x8, [x1], #8 + str x8, [x0], #8 LOCAL_LABEL(JIT_MemCpy_0xb4): - tbz x2,#2,LOCAL_LABEL(JIT_MemCpy_0xc8) - ldr w8,[x1] - str w8,[x0] - add x0,x0,#4 - add x1,x1,#4 + tbz x2, #2, LOCAL_LABEL(JIT_MemCpy_0xc8) + ldr w8, [x1], #4 + str w8, [x0], #4 LOCAL_LABEL(JIT_MemCpy_0xc8): - tbz x2,#1,LOCAL_LABEL(JIT_MemCpy_0xdc) - ldrsh w8,[x1] - strh w8,[x0] - add x0,x0,#2 - add x1,x1,#2 + tbz x2, #1, LOCAL_LABEL(JIT_MemCpy_0xdc) + ldrsh w8, [x1], #2 + strh w8, [x0], #2 LOCAL_LABEL(JIT_MemCpy_0xdc): - tbz x2,#0,LOCAL_LABEL(JIT_MemCpy_0xe8) - ldrsb w8,[x1] - strb w8,[x0] + tbz x2, #0, LOCAL_LABEL(JIT_MemCpy_0xe8) + ldrsb w8, [x1] + strb w8, [x0] LOCAL_LABEL(JIT_MemCpy_0xe8): ret lr LEAF_END_MARKED JIT_MemCpy, _TEXT -- 2.7.4