From cb650e00e7e3978e2ee2398aee795a6724e714f0 Mon Sep 17 00:00:00 2001 From: Tanner Gooding Date: Fri, 19 Jul 2019 17:52:14 -0700 Subject: [PATCH] Fixing Buffer::BlockCopy, JIT_MemCpy, and JIT_MemSet to just call the appropriate CRT functions for x64 Windows, as is already done for all other platforms/targets (#25763) * Fixing Buffer::BlockCopy to just call the CRT memmove for x64 Windows, as is already done for all other platforms/targets * Fixing up the x64 CrtHelpers.asm to just forward to the CRT implementations for JIT_MemSet and JIT_MemCpy * Keep unix using memcpy and clarify that Windows uses memmove for full framework compat. --- src/vm/amd64/CrtHelpers.asm | 334 +++++++------------------------------------- src/vm/amd64/crthelpers.S | 52 +++++-- src/vm/comutilnative.cpp | 4 - 3 files changed, 90 insertions(+), 300 deletions(-) diff --git a/src/vm/amd64/CrtHelpers.asm b/src/vm/amd64/CrtHelpers.asm index 9d5b280..2025e3c 100644 --- a/src/vm/amd64/CrtHelpers.asm +++ b/src/vm/amd64/CrtHelpers.asm @@ -14,317 +14,77 @@ include AsmMacros.inc -;char *memset(dst, value, count) - sets "count" bytes at "dst" to "value" +extern memset:proc +extern memmove:proc + +; JIT_MemSet/JIT_MemCpy ; -;Purpose: -; Sets the first "count" bytes of the memory starting -; at "dst" to the character value "value". +; It is IMPORTANT that the exception handling code is able to find these guys +; on the stack, but on windows platforms we can just defer to the platform +; implementation. ; -;Algorithm: -;Set dst based on count as follow -; count [0, 16]: use 1/2/4/8 bytes width registers -; count [16, 128]: use 16 bytes width registers (XMM) without loop -; count [128, 512]: use 16 bytes width registers (XMM) with loops, unrolled 8 times -; count [512, upper]: use rep stosb -;Entry: -; char *dst - pointer to memory to fill with value -; char value - value to put in dst bytes -; int count - number of bytes of dst to fill + +; void JIT_MemSet(void* dest, int c, size_t count) ; -;Exit: -; returns dst, with filled bytes +; Purpose: +; Sets the first "count" bytes of the block of memory pointed byte +; "dest" to the specified value (interpreted as an unsigned char). ; -;Uses: +; Entry: +; RCX: void* dest - Pointer to the block of memory to fill. +; RDX: int c - Value to be set. +; R8: size_t count - Number of bytes to be set to the value. ; -;Exceptions: +; Exit: +; +; Uses: +; +; Exceptions: ; -;******************************************************************************* - LEAF_ENTRY JIT_MemSet, _TEXT + test r8, r8 ; check if count is zero + jz Exit_MemSet ; if zero, no bytes to set - movzx edx, dl ; set fill pattern - mov r9, 0101010101010101h - imul rdx, r9 ; rdx is 8 bytes filler + cmp byte ptr [rcx], 0 ; check dest for null - cmp r8, 16 - jbe mset04 + jmp memset ; forward to the CRT implementation - cmp r8, 512 - jbe mset00 - - ; count > 512 - mov r10, rcx ; save dst address - mov r11, rdi ; save rdi - mov eax, edx ; eax is value - mov rdi, rcx ; rdi is dst - mov rcx, r8 ; rcx is count - rep stosb - mov rdi, r11 ; restore rdi - mov rax, r10 +Exit_MemSet: ret - align 16 -mset00: mov rax, rcx ; save dst address - movd xmm0, rdx - punpcklbw xmm0, xmm0 ; xmm0 is 16 bytes filler - - cmp r8, 128 - jbe mset02 - - ; count > 128 && count <= 512 - mov r9, r8 - shr r9, 7 ; count/128 - - align 16 -mset01: movdqu [rcx], xmm0 - movdqu 16[rcx], xmm0 - movdqu 32[rcx], xmm0 - movdqu 48[rcx], xmm0 - movdqu 64[rcx], xmm0 - movdqu 80[rcx], xmm0 - movdqu 96[rcx], xmm0 - movdqu 112[rcx], xmm0 - add rcx, 128 - dec r9 - jnz mset01 - and r8, 7fh ; and r8 with 0111 1111 - - ; the remainder is from 0 to 127 - cmp r8, 16 - jnbe mset02 - - ; the remainder <= 16 - movdqu -16[rcx + r8], xmm0 - ret - - ; count > 16 && count <= 128 for mset02 - align 16 -mset02: movdqu [rcx], xmm0 - movdqu -16[rcx + r8], xmm0 - cmp r8, 32 - jbe mset03 - - ; count > 32 && count <= 64 - movdqu 16[rcx], xmm0 - movdqu -32[rcx + r8], xmm0 - cmp r8, 64 - jbe mset03 - - ; count > 64 && count <= 128 - movdqu 32[rcx], xmm0 - movdqu 48[rcx], xmm0 - movdqu -48[rcx + r8], xmm0 - movdqu -64[rcx + r8], xmm0 -mset03: ret - - align 16 -mset04: mov rax, rcx ; save dst address - test r8b, 24 ; and r8b with 0001 1000 - jz mset05 - - ; count >= 8 && count <= 16 - mov [rcx], rdx - mov -8[rcx + r8], rdx - ret - - align 16 -mset05: test r8b, 4 ; and r8b with 0100 - jz mset06 - - ; count >= 4 && count < 8 - mov [rcx], edx - mov -4[rcx + r8], edx - ret - - ; count >= 0 && count < 4 - align 16 -mset06: test r8b, 1 ; and r8b with 0001 - jz mset07 - mov [rcx],dl -mset07: test r8b, 2 ; and r8b with 0010 - jz mset08 - mov -2[rcx + r8], dx -mset08: ret - LEAF_END_MARKED JIT_MemSet, _TEXT -;JIT_MemCpy - Copy source buffer to destination buffer +; void JIT_MemCpy(void* dest, const void* src, size_t count) ; -;Purpose: -; JIT_MemCpy() copies a source memory buffer to a destination memory -; buffer. This routine recognize overlapping buffers to avoid propogation. -; For cases where propogation is not a problem, memcpy() can be used. +; Purpose: +; Copies the values of "count" bytes from the location pointed to +; by "src" to the memory block pointed by "dest". ; -;Algorithm: -;Copy to destination based on count as follow -; count [0, 64]: overlap check not needed -; count [0, 16]: use 1/2/4/8 bytes width registers -; count [16, 64]: use 16 bytes width registers (XMM) without loop -; count [64, upper]: check overlap -; non-overlap: -; count [64, 512]: use 16 bytes width registers (XMM) with loops, unrolled 4 times -; count [512, upper]: use rep movsb -; overlap:: -; use 16 bytes width registers (XMM) with loops to copy from end to beginnig +; Entry: +; RCX: void* dest - Pointer to the destination array where content is to be copied. +; RDX: const void* src - Pointer to the source of the data to be copied. +; R8: size_t count - Number of bytes to copy. ; -;Entry: -; void *dst = pointer to destination buffer -; const void *src = pointer to source buffer -; size_t count = number of bytes to copy +; Exit: ; -;Exit: -; Returns a pointer to the destination buffer +; Uses: ; -;Uses: +; Exceptions: ; -;Exceptions: -;******************************************************************************* - LEAF_ENTRY JIT_MemCpy, _TEXT + test r8, r8 ; check if count is zero + jz Exit_MemCpy ; if zero, no bytes to copy - mov rax, rcx ; save dst address - cmp r8, 16 - jbe mcpy02 - - cmp r8, 64 - jnbe mcpy07 + cmp byte ptr [rcx], 0 ; check dest for null + cmp byte ptr [rdx], 0 ; check src for null - ; count > 16 && count <= 64 - align 16 -mcpy00: movdqu xmm0, [rdx] - movdqu xmm1, -16[rdx + r8] ; save 16 to 32 bytes src - cmp r8, 32 - jbe mcpy01 - - movdqu xmm2, 16[rdx] - movdqu xmm3, -32[rdx + r8] ; save 32 to 64 bytes src - - ;count > 32 && count <= 64 - movdqu 16[rcx], xmm2 - movdqu -32[rcx + r8], xmm3 - - ;count > 16 && count <= 32 -mcpy01: movdqu [rcx], xmm0 - movdqu -16[rcx + r8], xmm1 - ret - - ; count <= 16 - align 16 -mcpy02: test r8b, 24 ; test count with 0001 1000 - jz mcpy03 - ; count >= 8 && count <= 16 - mov r9, [rdx] - mov r10, -8[rdx + r8] - mov [rcx], r9 - mov -8[rcx + r8], r10 - ret - - align 16 -mcpy03: test r8b, 4 ; test count with 0100 - jz mcpy04 - ; count >= 4 && count < 8 - mov r9d, [rdx] - mov r10d, -4[rdx + r8] - mov [rcx], r9d - mov -4[rcx + r8], r10d - ret - - ; count >= 0 && count < 4 - align 16 -mcpy04: test r8, r8 - jz mcpy06 ; count == 1/2/3 - mov r9b, [rdx] ; save the first byte - - test r8b, 2 ; test count with 0010 - jz mcpy05 - mov r10w, -2[rdx + r8] - mov -2[rcx + r8], r10w -mcpy05: mov [rcx], r9b -mcpy06: ret - - align 16 - ; count > 64, we need to check overlap -mcpy07: mov r9, rdx ; r9 is src address - sub r9, rcx ; if src - dst < 0 jump to mcpy11 - jb mcpy11 ; if b, destination may overlap - -mcpy08: cmp r8, 512 - jnbe mcpy10 - - ; count > 64 && count <= 512 - mov r9, r8 - shr r9, 6 ; count/64 - - align 16 -mcpy09: movdqu xmm0, [rdx] - movdqu xmm1, 16[rdx] - movdqu xmm2, 32[rdx] - movdqu xmm3, 48[rdx] - movdqu [rcx], xmm0 - movdqu 16[rcx], xmm1 - movdqu 32[rcx], xmm2 - movdqu 48[rcx], xmm3 - add rdx, 64 - add rcx, 64 - dec r9 - jnz mcpy09 - - ; the remainder is from 0 to 63 - and r8, 3fh ; and with 0011 1111 - cmp r8, 16 - jnbe mcpy00 + ; Use memmove to handle overlapping buffers for better + ; compatibility with .NET Framework. Needing to handle + ; overlapping buffers in cpblk is undefined by the spec. + jmp memmove ; forward to the CRT implementation - ; the remainder <= 16 - jmp mcpy02 - ret - - ; count > 512 - align 16 -mcpy10: mov r10, rdi ; save rdi - mov r11, rsi ; save rsi - mov rdi, rcx ; rdi is dst - mov rsi, rdx ; rsi is src - mov rcx, r8 ; rcx is count - rep movsb ; mov from rsi to rdi - mov rsi, r11 ; restore rsi - mov rdi, r10 ; restore rdi +Exit_MemCpy: ret -; The source address is less than the destination address. - - align 16 -mcpy11: add r9, r8 ; src - dst + count - cmp r9, 0 ; src + count < = dst jump to mcpy08 - jle mcpy08 - - lea r9, [rdx + r8] ; r9 is the src + count - lea r10, [rcx + r8] ; r10 is the dst + count - - mov r11, r8 - shr r11, 6 ; count/64 - - ; count > 64 - align 16 -mcpy12: movdqu xmm0, -16[r9] - movdqu xmm1, -32[r9] - movdqu xmm2, -48[r9] - movdqu xmm3, -64[r9] - movdqu -16[r10], xmm0 - movdqu -32[r10], xmm1 - movdqu -48[r10], xmm2 - movdqu -64[r10], xmm3 - sub r9, 64 - sub r10, 64 - dec r11 - jnz mcpy12 - - ; the remainder is from 0 to 63 - and r8, 3fh ; and with 0011 1111 - cmp r8, 16 - jnbe mcpy00 - - ; the remainder <= 16 - jmp mcpy02 - LEAF_END_MARKED JIT_MemCpy, _TEXT - end \ No newline at end of file + end diff --git a/src/vm/amd64/crthelpers.S b/src/vm/amd64/crthelpers.S index a9b54e4..81703e2 100644 --- a/src/vm/amd64/crthelpers.S +++ b/src/vm/amd64/crthelpers.S @@ -13,27 +13,61 @@ // implementation. // +// void JIT_MemSet(void* dest, int c, size_t count) +// +// Purpose: +// Sets the first "count" bytes of the block of memory pointed byte +// "dest" to the specified value (interpreted as an unsigned char). +// +// Entry: +// RDI: void* dest - Pointer to the block of memory to fill. +// RSI: int c - Value to be set. +// RDX: size_t count - Number of bytes to be set to the value. +// +// Exit: +// +// Uses: +// +// Exceptions: +// LEAF_ENTRY JIT_MemSet, _TEXT - test rdx, rdx - jz Exit_MemSet + test rdx, rdx // check if count is zero + jz Exit_MemSet // if zero, no bytes to set - cmp byte ptr [rdi], 0 + cmp byte ptr [rdi], 0 // check dest for null - jmp C_PLTFUNC(memset) + jmp C_PLTFUNC(memset) // forward to the CRT implementation Exit_MemSet: ret LEAF_END_MARKED JIT_MemSet, _TEXT +// void JIT_MemCpy(void* dest, const void* src, size_t count) +// +// Purpose: +// Copies the values of "count" bytes from the location pointed to +// by "src" to the memory block pointed by "dest". +// +// Entry: +// RDI: void* dest - Pointer to the destination array where content is to be copied. +// RSI: const void* src - Pointer to the source of the data to be copied. +// RDX: size_t count - Number of bytes to copy. +// +// Exit: +// +// Uses: +// +// Exceptions: +// LEAF_ENTRY JIT_MemCpy, _TEXT - test rdx, rdx - jz Exit_MemCpy + test rdx, rdx // check if count is zero + jz Exit_MemCpy // if zero, no bytes to set - cmp byte ptr [rdi], 0 - cmp byte ptr [rsi], 0 + cmp byte ptr [rdi], 0 // check dest for null + cmp byte ptr [rsi], 0 // check src for null - jmp C_PLTFUNC(memcpy) + jmp C_PLTFUNC(memcpy) // forward to the CRT implementation Exit_MemCpy: ret diff --git a/src/vm/comutilnative.cpp b/src/vm/comutilnative.cpp index bdb2392..ff1da64 100644 --- a/src/vm/comutilnative.cpp +++ b/src/vm/comutilnative.cpp @@ -762,11 +762,7 @@ FCIMPL5(VOID, Buffer::BlockCopy, ArrayBase *src, int srcOffset, ArrayBase *dst, PTR_BYTE dstPtr = dst->GetDataPtr() + dstOffset; if ((srcPtr != dstPtr) && (count > 0)) { -#if defined(_AMD64_) && !defined(PLATFORM_UNIX) - JIT_MemCpy(dstPtr, srcPtr, count); -#else memmove(dstPtr, srcPtr, count); -#endif } FC_GC_POLL(); -- 2.7.4