1 /* -----------------------------------------------------------------------
2 unix64.S - Copyright (c) 2013 The Written Word, Inc.
3 - Copyright (c) 2008 Red Hat, Inc
4 - Copyright (c) 2002 Bo Thorsen <bo@suse.de>
6 x86-64 Foreign Function Interface
8 Permission is hereby granted, free of charge, to any person obtaining
9 a copy of this software and associated documentation files (the
10 ``Software''), to deal in the Software without restriction, including
11 without limitation the rights to use, copy, modify, merge, publish,
12 distribute, sublicense, and/or sell copies of the Software, and to
13 permit persons to whom the Software is furnished to do so, subject to
14 the following conditions:
16 The above copyright notice and this permission notice shall be included
17 in all copies or substantial portions of the Software.
19 THE SOFTWARE IS PROVIDED ``AS IS'', WITHOUT WARRANTY OF ANY KIND,
20 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
21 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
22 NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
23 HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
24 WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
26 DEALINGS IN THE SOFTWARE.
27 ----------------------------------------------------------------------- */
31 #include <fficonfig.h>
33 #include "internal64.h"
38 /* This macro allows the safe creation of jump tables without an
39 actual table. The entry points into the table are all 8 bytes.
40 The use of ORG asserts that we're at the correct location. */
41 /* ??? The clang assembler doesn't handle .org with symbolic expressions. */
42 #if defined(__clang__) || defined(__APPLE__) || (defined (__sun__) && defined(__svr4__))
43 # define E(BASE, X) .balign 8
/* NOTE(review): the #elif/#else/#endif lines separating the three
   definitions are elided in this excerpt.  The 16-byte-slot variant
   presumably corresponds to CET/IBT builds where each slot carries a
   4-byte ENDBR64 plus padding (see the "slot size" comments further
   down) -- confirm against the full file.  */
46 # define E(BASE, X) .balign 8; .org BASE + X * 16
48 # define E(BASE, X) .balign 8; .org BASE + X * 8
52 /* ffi_call_unix64 (void *args, unsigned long bytes, unsigned flags,
53 void *raddr, void (*fnaddr)(void));
55 Bit o trickiness here -- ARGS+BYTES is the base of the stack frame
56 for this function. This has been allocated by ffi_call. We also
57 deallocate some of the stack that has been alloca'd. */
/* Incoming registers (SysV AMD64), as established by the saves below:
   %rdi = args (register-save area + stack args), %rsi = bytes,
   %rdx = flags, %rcx = raddr, %r8 = fnaddr.
   NOTE(review): this excerpt elides the entry label, any CET endbr,
   the argument-register loads, the indirect call through %r11, and the
   bodies of the store-table slots -- only comments may be added here.  */
60 .globl C(ffi_call_unix64)
61 FFI_HIDDEN(C(ffi_call_unix64))
66 movq (%rsp), %r10 /* Load return address. */
67 leaq (%rdi, %rsi), %rax /* Find local stack base. */
68 movq %rdx, (%rax) /* Save flags. */
69 movq %rcx, 8(%rax) /* Save raddr. */
70 movq %rbp, 16(%rax) /* Save old frame pointer. */
71 movq %r10, 24(%rax) /* Relocate return address. */
72 movq %rax, %rbp /* Finalize local stack frame. */
74 /* New stack frame based off rbp. This is a itty bit of unwind
75 trickery in that the CFA *has* changed. There is no easy way
76 to describe it correctly on entry to the function. Fortunately,
77 it doesn't matter too much since at all points we can correctly
78 unwind back to ffi_call. Note that the location to which we
79 moved the return address is (the new) CFA-8, so from the
80 perspective of the unwind info, it hasn't moved. */
82 /* cfi_def_cfa(%rbp, 32) */
83 /* cfi_rel_offset(%rbp, 16) */
85 movq %rdi, %r10 /* Save a copy of the register area. */
86 movq %r8, %r11 /* Save a copy of the target fn. */
88 /* Load up all argument registers. */
95 movl 0xb0(%r10), %eax /* Set number of SSE registers. */
100 /* Deallocate the reg arg area, except for r10, then load via pop. */
101 leaq 0xb8(%r10), %rsp
104 /* Call the user function. */
107 /* Deallocate stack arg area; local stack frame in redzone. */
110 movq 0(%rbp), %rcx /* Reload flags. */
111 movq 8(%rbp), %rdi /* Reload raddr. */
112 movq 16(%rbp), %rbp /* Reload old frame pointer. */
114 /* cfi_remember_state */
115 /* cfi_def_cfa(%rsp, 8) */
116 /* cfi_restore(%rbp) */
118 /* The first byte of the flags contains the FFI_TYPE. */
119 cmpb $UNIX64_RET_LAST, %cl
/* Dispatch on the return-type code in %cl through the store table
   below; the out-of-range branch to L(sa)/abort is elided here.  */
121 leaq L(store_table)(%rip), %r11
124 /* NB: Originally, each slot is 8 byte. 4 bytes of ENDBR64 +
125 4 bytes NOP padding double slot size to 16 bytes. */
128 leaq (%r11, %r10, 8), %r10
130 /* Prep for the structure cases: scratch area in redzone. */
/* Store table: one fixed-size E() slot per UNIX64_RET_* code; each
   slot stores the raw return value to *raddr.  Slot bodies elided.  */
136 E(L(store_table), UNIX64_RET_VOID)
139 E(L(store_table), UNIX64_RET_UINT8)
144 E(L(store_table), UNIX64_RET_UINT16)
149 E(L(store_table), UNIX64_RET_UINT32)
154 E(L(store_table), UNIX64_RET_SINT8)
159 E(L(store_table), UNIX64_RET_SINT16)
164 E(L(store_table), UNIX64_RET_SINT32)
169 E(L(store_table), UNIX64_RET_INT64)
173 E(L(store_table), UNIX64_RET_XMM32)
177 E(L(store_table), UNIX64_RET_XMM64)
181 E(L(store_table), UNIX64_RET_X87)
185 E(L(store_table), UNIX64_RET_X87_2)
190 E(L(store_table), UNIX64_RET_ST_XMM0_RAX)
194 E(L(store_table), UNIX64_RET_ST_RAX_XMM0)
198 E(L(store_table), UNIX64_RET_ST_XMM0_XMM1)
202 E(L(store_table), UNIX64_RET_ST_RAX_RDX)
/* Struct-in-memory cases: the structure size is encoded in the high
   bits of the flags word and extracted by these shifts.  */
207 shrl $UNIX64_SIZE_SHIFT, %ecx
213 shrl $UNIX64_SIZE_SHIFT, %ecx
217 L(sa): call PLT(C(abort))
219 /* Many times we can avoid loading any SSE registers at all.
220 It's not worth an indirect jump to load the exact set of
221 SSE registers needed; zero or all is a good compromise. */
224 /* cfi_restore_state */
/* All-SSE path: reload xmm0-xmm7 from the saved register area
   (offsets 0x30..0xa0 within the block %r10 points at).  */
226 movdqa 0x30(%r10), %xmm0
227 movdqa 0x40(%r10), %xmm1
228 movdqa 0x50(%r10), %xmm2
229 movdqa 0x60(%r10), %xmm3
230 movdqa 0x70(%r10), %xmm4
231 movdqa 0x80(%r10), %xmm5
232 movdqa 0x90(%r10), %xmm6
233 movdqa 0xa0(%r10), %xmm7
234 jmp L(ret_from_load_sse)
237 ENDF(C(ffi_call_unix64))
239 /* 6 general registers, 8 vector registers,
240 32 bytes of rvalue, 8 bytes of alignment. */
/* Closure frame layout (offsets from %rsp after the subq):
   [OFS_G .. OFS_G+0x2f]   six 8-byte GP argument registers,
   [OFS_V .. OFS_V+0x7f]   eight 16-byte XMM argument registers,
   [OFS_RVALUE .. +0x1f]   32-byte return-value scratch,
   plus 8 pad bytes so the total frame size keeps the expected
   stack alignment at the call to ffi_closure_unix64_inner.  */
241 #define ffi_closure_OFS_G 0
242 #define ffi_closure_OFS_V (6*8)
243 #define ffi_closure_OFS_RVALUE (ffi_closure_OFS_V + 8*16)
244 #define ffi_closure_FS (ffi_closure_OFS_RVALUE + 32 + 8)
246 /* The location of rvalue within the red zone after deallocating the frame. */
/* Negative offset: once the frame is popped, the rvalue scratch lives
   below %rsp in the 128-byte red zone (leaf code only from there on).  */
247 #define ffi_closure_RED_RVALUE (ffi_closure_OFS_RVALUE - ffi_closure_FS)
/* SSE entry for closures: spill xmm0-xmm7 (possible FP/vector
   arguments) into the vector area of the frame, then fall through /
   jump into the common ffi_closure_unix64 path (transfer elided in
   this excerpt).  */
250 .globl C(ffi_closure_unix64_sse)
251 FFI_HIDDEN(C(ffi_closure_unix64_sse))
253 C(ffi_closure_unix64_sse):
256 subq $ffi_closure_FS, %rsp
258 /* cfi_adjust_cfa_offset(ffi_closure_FS) */
260 movdqa %xmm0, ffi_closure_OFS_V+0x00(%rsp)
261 movdqa %xmm1, ffi_closure_OFS_V+0x10(%rsp)
262 movdqa %xmm2, ffi_closure_OFS_V+0x20(%rsp)
263 movdqa %xmm3, ffi_closure_OFS_V+0x30(%rsp)
264 movdqa %xmm4, ffi_closure_OFS_V+0x40(%rsp)
265 movdqa %xmm5, ffi_closure_OFS_V+0x50(%rsp)
266 movdqa %xmm6, ffi_closure_OFS_V+0x60(%rsp)
267 movdqa %xmm7, ffi_closure_OFS_V+0x70(%rsp)
271 ENDF(C(ffi_closure_unix64_sse))
/* Generic closure entry.  %r10 points at the closure/trampoline block
   (set up by the trampoline).  Spills the six GP argument registers,
   extracts (cif, fun, user_data) from the closure, and calls the C
   helper ffi_closure_unix64_inner(cif, fun, user_data, rvalue,
   reg_args, argp).  On return, %al carries the UNIX64_RET_* code used
   to dispatch through the load table below.
   NOTE(review): the entry label's endbr, the #ifdef separating the
   32-bit (x32) and 64-bit closure-field loads, and the load-table slot
   bodies are elided in this excerpt.  */
274 .globl C(ffi_closure_unix64)
275 FFI_HIDDEN(C(ffi_closure_unix64))
277 C(ffi_closure_unix64):
280 subq $ffi_closure_FS, %rsp
282 /* cfi_adjust_cfa_offset(ffi_closure_FS) */
284 movq %rdi, ffi_closure_OFS_G+0x00(%rsp)
285 movq %rsi, ffi_closure_OFS_G+0x08(%rsp)
286 movq %rdx, ffi_closure_OFS_G+0x10(%rsp)
287 movq %rcx, ffi_closure_OFS_G+0x18(%rsp)
288 movq %r8, ffi_closure_OFS_G+0x20(%rsp)
289 movq %r9, ffi_closure_OFS_G+0x28(%rsp)
/* 32-bit-pointer variant (x32 ABI, presumably -- conditional elided).  */
292 movl FFI_TRAMPOLINE_SIZE(%r10), %edi /* Load cif */
293 movl FFI_TRAMPOLINE_SIZE+4(%r10), %esi /* Load fun */
294 movl FFI_TRAMPOLINE_SIZE+8(%r10), %edx /* Load user_data */
/* 64-bit-pointer variant.  */
296 movq FFI_TRAMPOLINE_SIZE(%r10), %rdi /* Load cif */
297 movq FFI_TRAMPOLINE_SIZE+8(%r10), %rsi /* Load fun */
298 movq FFI_TRAMPOLINE_SIZE+16(%r10), %rdx /* Load user_data */
301 leaq ffi_closure_OFS_RVALUE(%rsp), %rcx /* Load rvalue */
302 movq %rsp, %r8 /* Load reg_args */
303 leaq ffi_closure_FS+8(%rsp), %r9 /* Load argp */
304 call PLT(C(ffi_closure_unix64_inner))
306 /* Deallocate stack frame early; return value is now in redzone. */
307 addq $ffi_closure_FS, %rsp
309 /* cfi_adjust_cfa_offset(-ffi_closure_FS) */
311 /* The first byte of the return value contains the FFI_TYPE. */
312 cmpb $UNIX64_RET_LAST, %al
/* Dispatch on %al through the load table (out-of-range branch to
   L(la)/abort is elided here).  */
314 leaq L(load_table)(%rip), %r11
317 /* NB: Originally, each slot is 8 byte. 4 bytes of ENDBR64 +
318 4 bytes NOP padding double slot size to 16 bytes. */
321 leaq (%r11, %r10, 8), %r10
322 leaq ffi_closure_RED_RVALUE(%rsp), %rsi /* rvalue now in red zone */
/* Load table: one E() slot per UNIX64_RET_* code; each slot loads the
   return value from the red-zone rvalue into the ABI return registers
   and returns.  Slot bodies elided.  */
327 E(L(load_table), UNIX64_RET_VOID)
330 E(L(load_table), UNIX64_RET_UINT8)
334 E(L(load_table), UNIX64_RET_UINT16)
338 E(L(load_table), UNIX64_RET_UINT32)
342 E(L(load_table), UNIX64_RET_SINT8)
346 E(L(load_table), UNIX64_RET_SINT16)
350 E(L(load_table), UNIX64_RET_SINT32)
354 E(L(load_table), UNIX64_RET_INT64)
358 E(L(load_table), UNIX64_RET_XMM32)
362 E(L(load_table), UNIX64_RET_XMM64)
366 E(L(load_table), UNIX64_RET_X87)
370 E(L(load_table), UNIX64_RET_X87_2)
375 E(L(load_table), UNIX64_RET_ST_XMM0_RAX)
379 E(L(load_table), UNIX64_RET_ST_RAX_XMM0)
383 E(L(load_table), UNIX64_RET_ST_XMM0_XMM1)
387 E(L(load_table), UNIX64_RET_ST_RAX_RDX)
398 L(la): call PLT(C(abort))
401 ENDF(C(ffi_closure_unix64))
/* Go-closure SSE entry: same xmm spill as ffi_closure_unix64_sse,
   but %r10 points at a Go closure descriptor instead of an ffi
   trampoline block; control then joins the ffi_go_closure_unix64
   path (transfer elided in this excerpt).  */
404 .globl C(ffi_go_closure_unix64_sse)
405 FFI_HIDDEN(C(ffi_go_closure_unix64_sse))
407 C(ffi_go_closure_unix64_sse):
410 subq $ffi_closure_FS, %rsp
412 /* cfi_adjust_cfa_offset(ffi_closure_FS) */
414 movdqa %xmm0, ffi_closure_OFS_V+0x00(%rsp)
415 movdqa %xmm1, ffi_closure_OFS_V+0x10(%rsp)
416 movdqa %xmm2, ffi_closure_OFS_V+0x20(%rsp)
417 movdqa %xmm3, ffi_closure_OFS_V+0x30(%rsp)
418 movdqa %xmm4, ffi_closure_OFS_V+0x40(%rsp)
419 movdqa %xmm5, ffi_closure_OFS_V+0x50(%rsp)
420 movdqa %xmm6, ffi_closure_OFS_V+0x60(%rsp)
421 movdqa %xmm7, ffi_closure_OFS_V+0x70(%rsp)
425 ENDF(C(ffi_go_closure_unix64_sse))
/* Go-closure generic entry.  %r10 = Go closure descriptor; cif and
   fun are loaded from it and the descriptor address itself is passed
   as user_data.  The remaining setup and the call to
   ffi_closure_unix64_inner are elided in this excerpt (shared with
   the ffi_closure_unix64 tail above).  */
428 .globl C(ffi_go_closure_unix64)
429 FFI_HIDDEN(C(ffi_go_closure_unix64))
431 C(ffi_go_closure_unix64):
434 subq $ffi_closure_FS, %rsp
436 /* cfi_adjust_cfa_offset(ffi_closure_FS) */
438 movq %rdi, ffi_closure_OFS_G+0x00(%rsp)
439 movq %rsi, ffi_closure_OFS_G+0x08(%rsp)
440 movq %rdx, ffi_closure_OFS_G+0x10(%rsp)
441 movq %rcx, ffi_closure_OFS_G+0x18(%rsp)
442 movq %r8, ffi_closure_OFS_G+0x20(%rsp)
443 movq %r9, ffi_closure_OFS_G+0x28(%rsp)
/* 32-bit-pointer variant (x32 ABI, presumably -- conditional elided).  */
446 movl 4(%r10), %edi /* Load cif */
447 movl 8(%r10), %esi /* Load fun */
448 movl %r10d, %edx /* Load closure (user_data) */
/* 64-bit-pointer variant.  */
450 movq 8(%r10), %rdi /* Load cif */
451 movq 16(%r10), %rsi /* Load fun */
452 movq %r10, %rdx /* Load closure (user_data) */
457 ENDF(C(ffi_go_closure_unix64))
459 #if defined(FFI_EXEC_STATIC_TRAMP)
/* Alternate entry points reached via the static trampoline table.
   The trampoline pushed 16 bytes: the caller's %r10 at (%rsp) and the
   closure data pointer at 8(%rsp).  These stubs pull the closure into
   %r10, pop both slots, and tail-jump to the normal entry.  The saved
   original %r10 is intentionally not restored -- %r10 is the static
   chain / scratch register in this ABI.  */
461 .globl C(ffi_closure_unix64_sse_alt)
462 FFI_HIDDEN(C(ffi_closure_unix64_sse_alt))
464 C(ffi_closure_unix64_sse_alt):
465 /* See the comments above trampoline_code_table. */
467 movq 8(%rsp), %r10 /* Load closure in r10 */
468 addq $16, %rsp /* Restore the stack */
469 jmp C(ffi_closure_unix64_sse)
470 ENDF(C(ffi_closure_unix64_sse_alt))
473 .globl C(ffi_closure_unix64_alt)
474 FFI_HIDDEN(C(ffi_closure_unix64_alt))
476 C(ffi_closure_unix64_alt):
477 /* See the comments above trampoline_code_table. */
479 movq 8(%rsp), %r10 /* Load closure in r10 */
480 addq $16, %rsp /* Restore the stack */
481 jmp C(ffi_closure_unix64)
482 ENDF(C(ffi_closure_unix64_alt))
485 * Below is the definition of the trampoline code table. Each element in
486 * the code table is a trampoline.
488 * Because we jump to the trampoline, we place a _CET_ENDBR at the
489 * beginning of the trampoline to mark it as a valid branch target. This is
490 * part of the Intel CET (Control Flow Enforcement Technology).
493 * The trampoline uses register r10. It saves the original value of r10 on
496 * The trampoline has two parameters - target code to jump to and data for
497 * the target code. The trampoline extracts the parameters from its parameter
498 * block (see tramp_table_map()). The trampoline saves the data address on
499 * the stack. Finally, it jumps to the target code.
501 * The target code can choose to:
503 * - restore the value of r10
504 * - load the data address in a register
505 * - restore the stack pointer to what it was when the trampoline was invoked.
/* RIP-relative offsets from each trampoline's load instructions to its
   parameter block in the adjacent data page.  The two pairs differ by
   4 bytes -- presumably the ENDBR64-present vs -absent trampoline
   sizes (the #ifdef separating them is elided here; confirm against
   the full file).  */
508 #define X86_DATA_OFFSET 4077
509 #define X86_CODE_OFFSET 4073
511 #define X86_DATA_OFFSET 4081
512 #define X86_CODE_OFFSET 4077
515 .align UNIX64_TRAMP_MAP_SIZE
516 .globl trampoline_code_table
517 FFI_HIDDEN(C(trampoline_code_table))
519 C(trampoline_code_table):
520 .rept UNIX64_TRAMP_MAP_SIZE / UNIX64_TRAMP_SIZE
522 subq $16, %rsp /* Make space on the stack */
523 movq %r10, (%rsp) /* Save %r10 on stack */
524 movq X86_DATA_OFFSET(%rip), %r10 /* Copy data into %r10 */
525 movq %r10, 8(%rsp) /* Save data on stack */
526 movq X86_CODE_OFFSET(%rip), %r10 /* Copy code into %r10 */
527 jmp *%r10 /* Jump to code */
530 ENDF(C(trampoline_code_table))
531 .align UNIX64_TRAMP_MAP_SIZE
532 #endif /* FFI_EXEC_STATIC_TRAMP */
534 /* Sadly, OSX cctools-as doesn't understand .cfi directives at all. */
/* Hand-rolled DWARF unwind info: one CIE plus one FDE per entry point
   above, emitted as raw .byte/.long data because some assemblers lack
   .cfi support.  The L(UWn) code labels and several #if/#else/#endif
   and label lines are elided in this excerpt -- do not re-flow.  */
537 .section __TEXT,__eh_frame,coalesced,no_toc+strip_static_syms+live_support
539 #elif defined(HAVE_AS_X86_64_UNWIND_SECTION_TYPE)
540 .section .eh_frame,"a",@unwind
542 .section .eh_frame,"a",@progbits
545 #ifdef HAVE_AS_X86_PCREL
546 # define PCREL(X) X - .
548 # define PCREL(X) X@rel
551 /* Simplify advancing between labels. Assume DW_CFA_advance_loc1 fits. */
553 /* Use DW_CFA_advance_loc2 when IBT is enabled. */
554 # define ADV(N, P) .byte 3; .2byte L(N)-L(P)
556 # define ADV(N, P) .byte 2, L(N)-L(P)
/* CIE: shared by all FDEs below.  */
561 .set L(set0),L(ECIE)-L(SCIE)
562 .long L(set0) /* CIE Length */
564 .long 0 /* CIE Identifier Tag */
565 .byte 1 /* CIE Version */
566 .ascii "zR\0" /* CIE Augmentation */
567 .byte 1 /* CIE Code Alignment Factor */
568 .byte 0x78 /* CIE Data Alignment Factor */
569 .byte 0x10 /* CIE RA Column */
570 .byte 1 /* Augmentation size */
571 .byte 0x1b /* FDE Encoding (pcrel sdata4) */
572 .byte 0xc, 7, 8 /* DW_CFA_def_cfa, %rsp offset 8 */
573 .byte 0x80+16, 1 /* DW_CFA_offset, %rip offset 1*-8 */
/* FDE 1: ffi_call_unix64 -- CFA moves to %rbp+32 after the frame
   pivot, then back to %rsp+8 for the epilogue (matches the
   cfi_* comments inside the function body above).  */
577 .set L(set1),L(EFDE1)-L(SFDE1)
578 .long L(set1) /* FDE Length */
580 .long L(SFDE1)-L(CIE) /* FDE CIE offset */
581 .long PCREL(L(UW0)) /* Initial location */
582 .long L(UW4)-L(UW0) /* Address range */
583 .byte 0 /* Augmentation size */
585 .byte 0xc, 6, 32 /* DW_CFA_def_cfa, %rbp 32 */
586 .byte 0x80+6, 2 /* DW_CFA_offset, %rbp 2*-8 */
588 .byte 0xa /* DW_CFA_remember_state */
589 .byte 0xc, 7, 8 /* DW_CFA_def_cfa, %rsp 8 */
590 .byte 0xc0+6 /* DW_CFA_restore, %rbp */
592 .byte 0xb /* DW_CFA_restore_state */
/* FDE 2: ffi_closure_unix64_sse.  */
596 .set L(set2),L(EFDE2)-L(SFDE2)
597 .long L(set2) /* FDE Length */
599 .long L(SFDE2)-L(CIE) /* FDE CIE offset */
600 .long PCREL(L(UW5)) /* Initial location */
601 .long L(UW7)-L(UW5) /* Address range */
602 .byte 0 /* Augmentation size */
604 .byte 0xe /* DW_CFA_def_cfa_offset */
605 .byte ffi_closure_FS + 8, 1 /* uleb128, assuming 128 <= FS < 255 */
/* FDE 3: ffi_closure_unix64.  */
609 .set L(set3),L(EFDE3)-L(SFDE3)
610 .long L(set3) /* FDE Length */
612 .long L(SFDE3)-L(CIE) /* FDE CIE offset */
613 .long PCREL(L(UW8)) /* Initial location */
614 .long L(UW11)-L(UW8) /* Address range */
615 .byte 0 /* Augmentation size */
617 .byte 0xe /* DW_CFA_def_cfa_offset */
618 .byte ffi_closure_FS + 8, 1 /* uleb128, assuming 128 <= FS < 255 */
620 .byte 0xe, 8 /* DW_CFA_def_cfa_offset 8 */
/* FDE 4: ffi_go_closure_unix64_sse.  */
623 .set L(set4),L(EFDE4)-L(SFDE4)
624 .long L(set4) /* FDE Length */
626 .long L(SFDE4)-L(CIE) /* FDE CIE offset */
627 .long PCREL(L(UW12)) /* Initial location */
628 .long L(UW14)-L(UW12) /* Address range */
629 .byte 0 /* Augmentation size */
631 .byte 0xe /* DW_CFA_def_cfa_offset */
632 .byte ffi_closure_FS + 8, 1 /* uleb128, assuming 128 <= FS < 255 */
/* FDE 5: ffi_go_closure_unix64.  */
636 .set L(set5),L(EFDE5)-L(SFDE5)
637 .long L(set5) /* FDE Length */
639 .long L(SFDE5)-L(CIE) /* FDE CIE offset */
640 .long PCREL(L(UW15)) /* Initial location */
641 .long L(UW17)-L(UW15) /* Address range */
642 .byte 0 /* Augmentation size */
644 .byte 0xe /* DW_CFA_def_cfa_offset */
645 .byte ffi_closure_FS + 8, 1 /* uleb128, assuming 128 <= FS < 255 */
/* Mach-O compact-unwind stubs: each entry points the OSX unwinder at
   the DWARF FDEs above (encoding 0x04000000 = "use dwarf").  */
649 .subsections_via_symbols
650 .section __LD,__compact_unwind,regular,debug
652 /* compact unwind for ffi_call_unix64 */
653 .quad C(ffi_call_unix64)
654 .set L1,L(UW4)-L(UW0)
656 .long 0x04000000 /* use dwarf unwind info */
660 /* compact unwind for ffi_closure_unix64_sse */
661 .quad C(ffi_closure_unix64_sse)
662 .set L2,L(UW7)-L(UW5)
664 .long 0x04000000 /* use dwarf unwind info */
668 /* compact unwind for ffi_closure_unix64 */
669 .quad C(ffi_closure_unix64)
670 .set L3,L(UW11)-L(UW8)
672 .long 0x04000000 /* use dwarf unwind info */
676 /* compact unwind for ffi_go_closure_unix64_sse */
677 .quad C(ffi_go_closure_unix64_sse)
678 .set L4,L(UW14)-L(UW12)
680 .long 0x04000000 /* use dwarf unwind info */
684 /* compact unwind for ffi_go_closure_unix64 */
685 .quad C(ffi_go_closure_unix64)
686 .set L5,L(UW17)-L(UW15)
688 .long 0x04000000 /* use dwarf unwind info */
693 #endif /* __x86_64__ */
694 #if defined __ELF__ && defined __linux__
/* Mark the stack non-executable on GNU/Linux.  */
695 .section .note.GNU-stack,"",@progbits