// // If not aligned then make it 8-byte aligned
// if(((uintptr_t)dst&0x7) != 0)
// {
-// if(((uintptr_t)dst&0x3) == 0)
+// // Number of bytes needed to reach the next 8-byte boundary (1..7)
+// // Copies shorter than 8 bytes skip the alignment step so that it can never exceed count;
+// // the tail at the bottom copies exactly count bytes for any count < 16
+// // Note the amount is derived from -dst, not dst: e.g. dst&0x7 == 1 needs 7 bytes (1+2+4)
+// uint8_t align = (count >= 8) ? ((0 - (uintptr_t)dst) & 0x7) : 0;
+//
+// if(align&0x1)
// {
-// *(UINT*)dst = *(UINT*)src;
-// dst = (UINT*)dst + 1;
-// src = (UINT*)src + 1;
-// count-=4;
+// *(uint8_t*)dst = *(uint8_t*)src;
+// dst = (uint8_t*)dst + 1;
+// src = (uint8_t*)src + 1;
+// count-=1;
// }
-// else if(((uintptr_t)dst&0x1) == 0)
+//
+// if(align&0x2)
// {
-// while(count > 0 && ((uintptr_t)dst&0x7) != 0)
-// {
-// *(short*)dst = *(short*)src;
-// dst = (short*)dst + 1;
-// src = (short*)src + 1;
-// count-=2;
-// }
+// *(uint16_t*)dst = *(uint16_t*)src;
+// dst = (uint16_t*)dst + 1;
+// src = (uint16_t*)src + 1;
+// count-=2;
// }
-// else
+//
+// if(align&0x4)
// {
-// while(count > 0 && ((uintptr_t)dst&0x7) != 0)
-// {
-// *(char*)dst = *(char*)src;
-// dst = (char*)dst + 1;
-// src = (char*)src + 1;
-// count--;
-// }
+// *(uint32_t*)dst = *(uint32_t*)src;
+// dst = (uint32_t*)dst + 1;
+// src = (uint32_t*)src + 1;
+// count-=4;
// }
// }
//
-// while(count >= 8)
+// // count is treated as signed from here on; subtracting 16 leaves its low 4 bits unchanged
+// count-=16;
+//
+// while(count >= 0)
// {
-// *(uintptr_t*)dst = *(uintptr_t*)src;
-// dst = (uintptr_t*)dst + 1;
-// src = (uintptr_t*)src + 1;
-// count-=8;
+// *(uint64_t*)dst = *(uint64_t*)src;
+// dst = (uint64_t*)dst + 1;
+// src = (uint64_t*)src + 1;
+// *(uint64_t*)dst = *(uint64_t*)src;
+// dst = (uint64_t*)dst + 1;
+// src = (uint64_t*)src + 1;
+// count-=16;
+// }
+//
+// if(count & 8)
+// {
+// *(uint64_t*)dst = *(uint64_t*)src;
+// dst = (uint64_t*)dst + 1;
+// src = (uint64_t*)src + 1;
// }
//
// if(count & 4)
// {
-// *(UINT*)dst = *(UINT*)src;
-// dst = (UINT*)dst + 1;
-// src = (UINT*)src + 1;
+// *(uint32_t*)dst = *(uint32_t*)src;
+// dst = (uint32_t*)dst + 1;
+// src = (uint32_t*)src + 1;
// }
//
// if(count & 2)
// {
-// *(short*)dst = *(short*)src;
-// dst = (short*)dst + 1;
-// src = (short*)src + 1;
+// *(uint16_t*)dst = *(uint16_t*)src;
+// dst = (uint16_t*)dst + 1;
+// src = (uint16_t*)src + 1;
// }
//
// if(count & 1)
// {
-// *(char*)dst = *(char*)src;
+// *(uint8_t*)dst = *(uint8_t*)src;
// }
//
//
// Assembly code corresponding to above C++ method.
// See comments above for JIT_MemSet method
+//
+// Register use in the code below:
+// x0 = dst and x1 = src, both advanced past every byte copied (post-indexed accesses)
+// x2 = bytes remaining; from JIT_MemCpy_0xa8 onwards it holds (remaining - 16)
+// x3 = alignment byte count, only bits 2..0 are tested
+// x8, x9 = scratch for the data being moved
LEAF_ENTRY JIT_MemCpy, _TEXT
- and x8,x0,#7
- cbz x8,LOCAL_LABEL(JIT_MemCpy_0x80)
- and x8,x0,#3
- cbnz x8,LOCAL_LABEL(JIT_MemCpy_0x2c)
- ldr w8,[x1]
- str w8,[x0]
- add x0,x0,#4
- add x1,x1,#4
- mov x8,#-4
- add x2,x2,x8
- b LOCAL_LABEL(JIT_MemCpy_0x80)
+ tst x0, #7 // dst already 8-byte aligned?
+ b.eq LOCAL_LABEL(JIT_MemCpy_0xa8)
+ cmp x2, #8 // count < 8 (unsigned): skip alignment, it could copy more than count
+ b.lo LOCAL_LABEL(JIT_MemCpy_0xa8)
+ neg x3, x0 // x3 bits 2..0 = bytes to the next 8-byte boundary (1..7 < count)
+ tbz x3, #0, LOCAL_LABEL(JIT_MemCpy_0x2c)
+ ldrsb w8, [x1], #1 // sign extension is irrelevant, only the low byte is stored
+ strb w8, [x0], #1
+ sub x2, x2, #1
LOCAL_LABEL(JIT_MemCpy_0x2c):
- cbz x2,LOCAL_LABEL(JIT_MemCpy_0x80)
- tbnz x0,#0,LOCAL_LABEL(JIT_MemCpy_0x5c)
-LOCAL_LABEL(JIT_MemCpy_0x34):
- and x8,x0,#7
- cbz x8,LOCAL_LABEL(JIT_MemCpy_0x80)
- ldrsh w8,[x1]
- strh w8,[x0]
- add x0,x0,#2
- add x1,x1,#2
- mov x8,#-2
- add x2,x2,x8
- cbnz x2,LOCAL_LABEL(JIT_MemCpy_0x34)
- b LOCAL_LABEL(JIT_MemCpy_0x80)
+ tbz x3, #1, LOCAL_LABEL(JIT_MemCpy_0x5c)
+ ldrsh w8, [x1], #2
+ strh w8, [x0], #2
+ sub x2, x2, #2
LOCAL_LABEL(JIT_MemCpy_0x5c):
- and x8,x0,#7
- cbz x8,LOCAL_LABEL(JIT_MemCpy_0x80)
- ldrsb w8,[x1]
- strb w8,[x0]
- add x0,x0,#1
- add x1,x1,#1
- mov x8,#-1
- add x2,x2,x8
- cbnz x2,LOCAL_LABEL(JIT_MemCpy_0x5c)
-LOCAL_LABEL(JIT_MemCpy_0x80):
- cmp x2,#8
- blo LOCAL_LABEL(JIT_MemCpy_0xb4)
- lsr x9,x2,#3
- mov x8,#-8
- madd x2,x9,x8,x2
+ tbz x3, #2, LOCAL_LABEL(JIT_MemCpy_0xa8)
+ ldr w8, [x1], #4
+ str w8, [x0], #4
+ sub x2, x2, #4
+ b LOCAL_LABEL(JIT_MemCpy_0xa8)
LOCAL_LABEL(JIT_MemCpy_0xa0):
- ldr x8,[x1],#8
- str x8,[x0],#8
- mov x8,#-1
- add x9,x9,x8
- cbnz x9,LOCAL_LABEL(JIT_MemCpy_0xa0)
+ ldp x8, x9, [x1], #16 // main loop body: 16 bytes per iteration
+ stp x8, x9, [x0], #16
+LOCAL_LABEL(JIT_MemCpy_0xa8):
+ subs x2, x2, #16 // loop while at least 16 bytes remain (signed compare)
+ b.ge LOCAL_LABEL(JIT_MemCpy_0xa0)
+LOCAL_LABEL(JIT_MemCpy_0xb0):
+ tbz x2, #3, LOCAL_LABEL(JIT_MemCpy_0xb4) // x2 = remaining - 16, so bits 3..0 still equal the remaining byte count
+ ldr x8, [x1], #8
+ str x8, [x0], #8
LOCAL_LABEL(JIT_MemCpy_0xb4):
- tbz x2,#2,LOCAL_LABEL(JIT_MemCpy_0xc8)
- ldr w8,[x1]
- str w8,[x0]
- add x0,x0,#4
- add x1,x1,#4
+ tbz x2, #2, LOCAL_LABEL(JIT_MemCpy_0xc8)
+ ldr w8, [x1], #4
+ str w8, [x0], #4
LOCAL_LABEL(JIT_MemCpy_0xc8):
- tbz x2,#1,LOCAL_LABEL(JIT_MemCpy_0xdc)
- ldrsh w8,[x1]
- strh w8,[x0]
- add x0,x0,#2
- add x1,x1,#2
+ tbz x2, #1, LOCAL_LABEL(JIT_MemCpy_0xdc)
+ ldrsh w8, [x1], #2
+ strh w8, [x0], #2
LOCAL_LABEL(JIT_MemCpy_0xdc):
- tbz x2,#0,LOCAL_LABEL(JIT_MemCpy_0xe8)
- ldrsb w8,[x1]
- strb w8,[x0]
+ tbz x2, #0, LOCAL_LABEL(JIT_MemCpy_0xe8)
+ ldrsb w8, [x1] // last byte: no pointer update needed
+ strb w8, [x0]
LOCAL_LABEL(JIT_MemCpy_0xe8):
ret lr
LEAF_END_MARKED JIT_MemCpy, _TEXT