[Arm64/Unix] Revise JIT_MemCpy (dotnet/coreclr#11143)
authorSteve MacLean <sdmaclea@qti.qualcomm.com>
Thu, 27 Apr 2017 04:50:37 +0000 (00:50 -0400)
committerJan Kotas <jkotas@microsoft.com>
Thu, 27 Apr 2017 04:50:37 +0000 (21:50 -0700)
* [Arm64/Unix] Revise JIT_MemCpy

Use ldp/stp
Correctly handle short copy lengths
Simplify code & pseudo code
Use uint*_t to make pseudo code more readable

Commit migrated from https://github.com/dotnet/coreclr/commit/9a9f941d154b62eb66fd092e72c4ae0200facc42

src/coreclr/src/vm/arm64/crthelpers.S

index 36eb4ee..51774ae 100644 (file)
@@ -157,60 +157,74 @@ LEAF_END_MARKED JIT_MemSet, _TEXT
 //    // If not aligned then make it 8-byte aligned   
 //    if(((uintptr_t)dst&0x7) != 0)
 //    {
-//        if(((uintptr_t)dst&0x3) == 0)
+//        // Calculate alignment we can do without exceeding count
+//        // Use math to avoid introducing more unpredictable branches
+//        // Due to inherent mod in lsr, ~7 is used instead of ~0 to handle count == 0
+//        // Note logic will fail if count >= (1 << 61).  But this exceeds max physical memory for arm64
+//        uint8_t align = (dst & 0x7) & (~uint64_t(7) >> (countLeadingZeros(count) mod 64))
+//
+//        if(align&0x1)
 //        {
-//            *(UINT*)dst = *(UINT*)src;
-//            dst = (UINT*)dst + 1;
-//            src = (UINT*)src + 1;
-//            count-=4;
+//            *(uint8_t*)dst = *(uint8_t*)src;
+//            dst = (uint8_t*)dst + 1;
+//            src = (uint8_t*)src + 1;
+//            count-=1;
 //        }
-//        else if(((uintptr_t)dst&0x1) == 0)
+//
+//        if(align&0x2)
 //        {
-//            while(count > 0 && ((uintptr_t)dst&0x7) != 0)
-//            {
-//                *(short*)dst = *(short*)src;
-//                dst = (short*)dst + 1;
-//                src = (short*)src + 1;
-//                count-=2;
-//            }
+//            *(uint16_t*)dst = *(uint16_t*)src;
+//            dst = (uint16_t*)dst + 1;
+//            src = (uint16_t*)src + 1;
+//            count-=2;
 //        }
-//        else
+//
+//        if(align&0x4)
 //        {
-//            while(count > 0 && ((uintptr_t)dst&0x7) != 0)
-//            {
-//                *(char*)dst = *(char*)src;
-//                dst = (char*)dst + 1;
-//                src = (char*)src + 1;
-//                count--;
-//            }
+//            *(uint32_t*)dst = *(uint32_t*)src;
+//            dst = (uint32_t*)dst + 1;
+//            src = (uint32_t*)src + 1;
+//            count-=4;
 //        }
 //    }
 //
-//    while(count >= 8)
+//    count-=16;
+//
+//    while(count >= 0)
 //    {
-//        *(uintptr_t*)dst = *(uintptr_t*)src;
-//        dst = (uintptr_t*)dst + 1;
-//        src = (uintptr_t*)src + 1;
-//        count-=8;
+//        *(uint64_t*)dst = *(uint64_t*)src;
+//        dst = (uint64_t*)dst + 1;
+//        src = (uint64_t*)src + 1;
+//        *(uint64_t*)dst = *(uint64_t*)src;
+//        dst = (uint64_t*)dst + 1;
+//        src = (uint64_t*)src + 1;
+//        count-=16;
+//    }
+//
+//    if(count & 8)
+//    {
+//        *(uint64_t*)dst = *(uint64_t*)src;
+//        dst = (uint64_t*)dst + 1;
+//        src = (uint64_t*)src + 1;
 //    }
 //
 //    if(count & 4)
 //    {
-//        *(UINT*)dst = *(UINT*)src;
-//        dst = (UINT*)dst + 1;
-//        src = (UINT*)src + 1;
+//        *(uint32_t*)dst = *(uint32_t*)src;
+//        dst = (uint32_t*)dst + 1;
+//        src = (uint32_t*)src + 1;
 //    }
 //
 //    if(count & 2)
 //    {
-//        *(short*)dst = *(short*)src;
-//        dst = (short*)dst + 1;
-//        src = (short*)src + 1;
+//        *(uint16_t*)dst = *(uint16_t*)src;
+//        dst = (uint16_t*)dst + 1;
+//        src = (uint16_t*)src + 1;
 //    }
 //
 //    if(count & 1)
 //    {
-//        *(char*)dst = *(char*)src;
+//        *(uint8_t*)dst = *(uint8_t*)src;
 //    }
 //
 //
@@ -218,69 +232,49 @@ LEAF_END_MARKED JIT_MemSet, _TEXT
 // Assembly code corresponding to above C++ method.
 // See comments above for JIT_MemSet method
 LEAF_ENTRY JIT_MemCpy, _TEXT
-    and         x8,x0,#7
-    cbz         x8,LOCAL_LABEL(JIT_MemCpy_0x80)
-    and         x8,x0,#3
-    cbnz        x8,LOCAL_LABEL(JIT_MemCpy_0x2c)
-    ldr         w8,[x1]
-    str         w8,[x0]
-    add         x0,x0,#4
-    add         x1,x1,#4
-    mov         x8,#-4
-    add         x2,x2,x8
-    b           LOCAL_LABEL(JIT_MemCpy_0x80)
+    ands        x3, x0, #7
+    movn        x4, #7
+    clz         x5, x2
+    b.eq        LOCAL_LABEL(JIT_MemCpy_0xa8)
+    lsr         x4, x4, x5
+    and         x3, x3, x4
+    tbz         x3, #0, LOCAL_LABEL(JIT_MemCpy_0x2c)
+    ldrsb       w8, [x1], #1
+    strb        w8, [x0], #1
+    sub         x2, x2, #1
 LOCAL_LABEL(JIT_MemCpy_0x2c):
-    cbz         x2,LOCAL_LABEL(JIT_MemCpy_0x80)
-    tbnz        x0,#0,LOCAL_LABEL(JIT_MemCpy_0x5c)
-LOCAL_LABEL(JIT_MemCpy_0x34):
-    and         x8,x0,#7
-    cbz         x8,LOCAL_LABEL(JIT_MemCpy_0x80)
-    ldrsh       w8,[x1]
-    strh        w8,[x0]
-    add         x0,x0,#2
-    add         x1,x1,#2
-    mov         x8,#-2
-    add         x2,x2,x8
-    cbnz        x2,LOCAL_LABEL(JIT_MemCpy_0x34)
-    b           LOCAL_LABEL(JIT_MemCpy_0x80)
+    tbz         x3, #1, LOCAL_LABEL(JIT_MemCpy_0x5c)
+    ldrsh       w8, [x1], #2
+    strh        w8, [x0], #2
+    sub         x2, x2, #2
 LOCAL_LABEL(JIT_MemCpy_0x5c):
-    and         x8,x0,#7
-    cbz         x8,LOCAL_LABEL(JIT_MemCpy_0x80)
-    ldrsb       w8,[x1]
-    strb        w8,[x0]
-    add         x0,x0,#1
-    add         x1,x1,#1
-    mov         x8,#-1
-    add         x2,x2,x8
-    cbnz        x2,LOCAL_LABEL(JIT_MemCpy_0x5c)
-LOCAL_LABEL(JIT_MemCpy_0x80):
-    cmp         x2,#8
-    blo         LOCAL_LABEL(JIT_MemCpy_0xb4)
-    lsr         x9,x2,#3
-    mov         x8,#-8
-    madd        x2,x9,x8,x2
+    tbz         x3, #2, LOCAL_LABEL(JIT_MemCpy_0xa8)
+    ldr         w8, [x1], #4
+    str         w8, [x0], #4
+    sub         x2, x2, #4
+    b           LOCAL_LABEL(JIT_MemCpy_0xa8)
 LOCAL_LABEL(JIT_MemCpy_0xa0):
-    ldr         x8,[x1],#8
-    str         x8,[x0],#8
-    mov         x8,#-1
-    add         x9,x9,x8
-    cbnz        x9,LOCAL_LABEL(JIT_MemCpy_0xa0)
+    ldp         x8, x9, [x1], #16
+    stp         x8, x9, [x0], #16
+LOCAL_LABEL(JIT_MemCpy_0xa8):
+    subs        x2, x2, #16
+    b.ge        LOCAL_LABEL(JIT_MemCpy_0xa0)
+LOCAL_LABEL(JIT_MemCpy_0xb0):
+    tbz         x2, #3, LOCAL_LABEL(JIT_MemCpy_0xb4)
+    ldr         x8, [x1], #8
+    str         x8, [x0], #8
 LOCAL_LABEL(JIT_MemCpy_0xb4):
-    tbz         x2,#2,LOCAL_LABEL(JIT_MemCpy_0xc8)
-    ldr         w8,[x1]
-    str         w8,[x0]
-    add         x0,x0,#4
-    add         x1,x1,#4
+    tbz         x2, #2, LOCAL_LABEL(JIT_MemCpy_0xc8)
+    ldr         w8, [x1], #4
+    str         w8, [x0], #4
 LOCAL_LABEL(JIT_MemCpy_0xc8):
-    tbz         x2,#1,LOCAL_LABEL(JIT_MemCpy_0xdc)
-    ldrsh       w8,[x1]
-    strh        w8,[x0]
-    add         x0,x0,#2
-    add         x1,x1,#2
+    tbz         x2, #1, LOCAL_LABEL(JIT_MemCpy_0xdc)
+    ldrsh       w8, [x1], #2
+    strh        w8, [x0], #2
 LOCAL_LABEL(JIT_MemCpy_0xdc):
-    tbz         x2,#0,LOCAL_LABEL(JIT_MemCpy_0xe8)
-    ldrsb       w8,[x1]
-    strb        w8,[x0]
+    tbz         x2, #0, LOCAL_LABEL(JIT_MemCpy_0xe8)
+    ldrsb       w8, [x1]
+    strb        w8, [x0]
 LOCAL_LABEL(JIT_MemCpy_0xe8):
     ret         lr
 LEAF_END_MARKED JIT_MemCpy, _TEXT