2 ; jsimdext.inc - common declarations
4 ; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
5 ; Copyright (C) 2010, 2016, 2018-2019, D. R. Commander.
6 ; Copyright (C) 2018, Matthieu Darbois.
7 ; Copyright (C) 2018, Matthias Räncker.
9 ; Based on the x86 SIMD extension for IJG JPEG library - version 1.02
11 ; Copyright (C) 1999-2006, MIYASAKA Masaru.
13 ; This software is provided 'as-is', without any express or implied
14 ; warranty. In no event will the authors be held liable for any damages
15 ; arising from the use of this software.
17 ; Permission is granted to anyone to use this software for any purpose,
18 ; including commercial applications, and to alter it and redistribute it
19 ; freely, subject to the following restrictions:
21 ; 1. The origin of this software must not be misrepresented; you must not
22 ; claim that you wrote the original software. If you use this software
23 ; in a product, an acknowledgment in the product documentation would be
24 ; appreciated but is not required.
25 ; 2. Altered source versions must be plainly marked as such, and must not be
26 ; misrepresented as being the original software.
27 ; 3. This notice may not be removed or altered from any source distribution.
29 ; ==========================================================================
30 ; System-dependent configurations
32 %ifdef WIN32 ; ----(nasm -fwin32 -DWIN32 ...)--------
33 ; * Microsoft Visual C++
34 ; * MinGW (Minimalist GNU for Windows)
38 ; -- segment definition --
41 %define SEG_TEXT .text align=32
42 %define SEG_CONST .rdata align=32
44 %define SEG_TEXT .text align=32 public use32 class=CODE
45 %define SEG_CONST .rdata align=32 public use32 class=CONST
48 %elifdef WIN64 ; ----(nasm -fwin64 -DWIN64 ...)--------
49 ; * Microsoft Visual C++
51 ; -- segment definition --
54 %define SEG_TEXT .text align=32
55 %define SEG_CONST .rdata align=32
57 %define SEG_TEXT .text align=32 public use64 class=CODE
58 %define SEG_CONST .rdata align=32 public use64 class=CONST
60 %define EXTN(name) name ; foo() -> foo
62 %elifdef OBJ32 ; ----(nasm -fobj -DOBJ32 ...)----------
63 ; * Borland C++ (Win32)
65 ; -- segment definition --
67 %define SEG_TEXT _text align=32 public use32 class=CODE
68 %define SEG_CONST _data align=32 public use32 class=DATA
70 %elifdef ELF ; ----(nasm -felf[64] -DELF ...)------------
72 ; * *BSD family Unix using elf format
73 ; * Unix System V, including Solaris x86, UnixWare and SCO Unix
75 ; mark stack as non-executable
76 section .note.GNU-stack noalloc noexec nowrite progbits
78 ; -- segment definition --
81 %define SEG_TEXT .text progbits align=32
82 %define SEG_CONST .rodata progbits align=32
84 %define SEG_TEXT .text progbits alloc exec nowrite align=32
85 %define SEG_CONST .rodata progbits alloc noexec nowrite align=32
88 ; To make the code position-independent, append -DPIC to the command line
90 %define GOT_SYMBOL _GLOBAL_OFFSET_TABLE_ ; ELF supports PIC
91 %define EXTN(name) name ; foo() -> foo
93 %elifdef AOUT ; ----(nasm -faoutb/aout -DAOUT ...)----
94 ; * Older Linux using a.out format (nasm -f aout -DAOUT ...)
95 ; * *BSD family Unix using a.out format (nasm -f aoutb -DAOUT ...)
97 ; -- segment definition --
99 %define SEG_TEXT .text
100 %define SEG_CONST .data
102 ; To make the code position-independent, append -DPIC to the command line
104 %define GOT_SYMBOL __GLOBAL_OFFSET_TABLE_ ; BSD-style a.out supports PIC
106 %elifdef MACHO ; ----(nasm -fmacho -DMACHO ...)--------
107 ; * NeXTstep/OpenStep/Rhapsody/Darwin/MacOS X (Mach-O format)
109 ; -- segment definition --
111 %define SEG_TEXT .text ;align=32 ; nasm doesn't accept align=32. why?
112 %define SEG_CONST .rodata align=32
114 ; The generation of position-independent code (PIC) is the default on Darwin.
117 %define GOT_SYMBOL _MACHO_PIC_ ; Mach-O style code-relative addressing
119 %else ; ----(Other case)----------------------
121 ; -- segment definition --
123 %define SEG_TEXT .text
124 %define SEG_CONST .data
126 %endif ; ----------------------------------------------
128 ; ==========================================================================
130 ; --------------------------------------------------------------------------
134 %ifnidn __OUTPUT_FORMAT__, elfx32
135 %define POINTER qword ; general pointer type
136 %define SIZEOF_POINTER SIZEOF_QWORD ; sizeof(POINTER)
137 %define POINTER_BIT QWORD_BIT ; sizeof(POINTER)*BYTE_BIT
159 %define POINTER dword ; general pointer type
160 %define SIZEOF_POINTER SIZEOF_DWORD ; sizeof(POINTER)
161 %define POINTER_BIT DWORD_BIT ; sizeof(POINTER)*BYTE_BIT
164 ; x86_64 ILP32 ABI (x32)
; Operand-size aliases.  Each type is described by three macros:
;   <TYPE>         the NASM size keyword to put in front of a memory operand
;   SIZEOF_<TYPE>  its size in bytes
;   <TYPE>_BIT     its width in bits
; INT and FP32 both map to dword (4 bytes / 32 bits); MMWORD maps to qword
; (8 bytes / 64 bits).
183 %define INT dword ; signed integer type
184 %define SIZEOF_INT SIZEOF_DWORD ; sizeof(INT)
185 %define INT_BIT DWORD_BIT ; sizeof(INT)*BYTE_BIT
187 %define FP32 dword ; IEEE754 single
188 %define SIZEOF_FP32 SIZEOF_DWORD ; sizeof(FP32)
189 %define FP32_BIT DWORD_BIT ; sizeof(FP32)*BYTE_BIT
191 %define MMWORD qword ; int64 (MMX register)
192 %define SIZEOF_MMWORD SIZEOF_QWORD ; sizeof(MMWORD)
193 %define MMWORD_BIT QWORD_BIT ; sizeof(MMWORD)*BYTE_BIT
195 ; NASM is buggy and doesn't properly handle operand sizes for SSE
196 ; instructions, so for now we have to define XMMWORD as blank.
; Consequently XMMWORD (and YMMWORD below, which is defined the same way)
; expands to nothing: "movaps XMMWORD [rsp], xmm6" assembles as
; "movaps [rsp], xmm6".  Only the SIZEOF_*/*_BIT companions carry the real
; sizes (16 bytes / 128 bits and 32 bytes / 256 bits respectively).
197 %define XMMWORD ; int128 (SSE register) -- intentionally empty
198 %define SIZEOF_XMMWORD SIZEOF_OWORD ; sizeof(XMMWORD)
199 %define XMMWORD_BIT OWORD_BIT ; sizeof(XMMWORD)*BYTE_BIT
201 %define YMMWORD ; int256 (AVX register) -- intentionally empty
202 %define SIZEOF_YMMWORD SIZEOF_YWORD ; sizeof(YMMWORD)
203 %define YMMWORD_BIT YWORD_BIT ; sizeof(YMMWORD)*BYTE_BIT
205 ; Similar hacks for when we load a dword or MMWORD into an xmm# register
; Sizes in bytes of the NASM operand-size keywords.  The SIZEOF_<TYPE> aliases
; earlier in this file name these macros before they are defined; that is
; fine because NASM expands a %define body where the macro is used, not where
; it is defined.
209 %define SIZEOF_BYTE 1 ; sizeof(byte)
210 %define SIZEOF_WORD 2 ; sizeof(word)
211 %define SIZEOF_DWORD 4 ; sizeof(dword)
212 %define SIZEOF_QWORD 8 ; sizeof(qword)
213 %define SIZEOF_OWORD 16 ; sizeof(oword)
214 %define SIZEOF_YWORD 32 ; sizeof(yword)
; Widths in bits of the same keywords; each value is SIZEOF_<x> * BYTE_BIT.
216 %define BYTE_BIT 8 ; CHAR_BIT in C
217 %define WORD_BIT 16 ; sizeof(word)*BYTE_BIT
218 %define DWORD_BIT 32 ; sizeof(dword)*BYTE_BIT
219 %define QWORD_BIT 64 ; sizeof(qword)*BYTE_BIT
220 %define OWORD_BIT 128 ; sizeof(oword)*BYTE_BIT
221 %define YWORD_BIT 256 ; sizeof(yword)*BYTE_BIT
223 ; --------------------------------------------------------------------------
224 ; External Symbol Name
227 %define EXTN(name) _ %+ name ; foo() -> _foo
230 ; --------------------------------------------------------------------------
233 %ifdef ELF ; ----(nasm -felf[64] -DELF ...)--------
234 %define GLOBAL_FUNCTION(name) global EXTN(name):function hidden
235 %define GLOBAL_DATA(name) global EXTN(name):data hidden
236 %elifdef MACHO ; ----(nasm -fmacho -DMACHO ...)--------
238 %define GLOBAL_FUNCTION(name) global EXTN(name):private_extern
239 %define GLOBAL_DATA(name) global EXTN(name):private_extern
241 %if __NASM_VERSION_ID__ >= 0x020E0000
242 %define GLOBAL_FUNCTION(name) global EXTN(name):private_extern
243 %define GLOBAL_DATA(name) global EXTN(name):private_extern
248 %ifndef GLOBAL_FUNCTION
249 %define GLOBAL_FUNCTION(name) global EXTN(name)
252 %define GLOBAL_DATA(name) global EXTN(name)
255 ; --------------------------------------------------------------------------
256 ; Macros for position-independent code (PIC) support
262 %ifdef PIC ; -------------------------------------------
264 %ifidn GOT_SYMBOL, _MACHO_PIC_ ; --------------------
266 ; At present, nasm doesn't seem to support PIC generation for Mach-O.
267 ; The PIC support code below is a little tricky.
272 %define GOTOFF(got, sym) (got) + (sym) - const_base
275 ; NOTE: this macro destroys the ecx register.
277 add ecx, byte (%%ref - $)
280 mov ecx, POINTER [esp]
284 xor ebp, ebp ; ebp = 0
285 %ifidni %1, ebx ; (%1 == ebx)
286 ; db 0x8D,0x9C + jmp near const_base =
287 ; lea ebx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,9C,E9,(offset32)
288 db 0x8D, 0x9C ; 8D,9C
289 jmp near const_base ; E9,(const_base-%%ref)
292 ; db 0x8D,0x8C + jmp near const_base =
293 ; lea ecx, [ecx+ebp*8+(const_base-%%ref)] ; 8D,8C,E9,(offset32)
294 db 0x8D, 0x8C ; 8D,8C
295 jmp near const_base ; E9,(const_base-%%ref)
302 %else ; GOT_SYMBOL != _MACHO_PIC_ ----------------
304 %define GOTOFF(got, sym) (got) + (sym) wrt ..gotoff
309 add %1, GOT_SYMBOL + $$ - $ wrt ..gotpc
312 mov %1, POINTER [esp]
317 %endif ; GOT_SYMBOL == _MACHO_PIC_ ----------------
319 %imacro pushpic 1.nolist
322 %imacro poppic 1.nolist
325 %imacro movpic 2.nolist
329 %else ; !PIC -----------------------------------------
331 %define GOTOFF(got, sym) (sym)
333 %imacro get_GOT 1.nolist
335 %imacro pushpic 1.nolist
337 %imacro poppic 1.nolist
339 %imacro movpic 2.nolist
342 %endif ; PIC -----------------------------------------
344 ; --------------------------------------------------------------------------
345 ; Align the next instruction on {2,4,8,16,..}-byte boundary.
346 ; ".balign n,,m" in GNU as
; MSKLE(x, y): "x <= y" mask, comparing only the low 16 bits of each operand.
; Evaluates to 0 when x > y (the subtraction goes negative, so its complement
; has no bits at or above bit 16).  Otherwise it evaluates to a non-zero value
; with at least its low 16 bits set, so "MSKLE(x, y) & count" yields count
; when x <= y and 0 when x > y.
348 %define MSKLE(x, y) (~(((y) & 0xFFFF) - ((x) & 0xFFFF)) >> 16)
; FILLB(b, n): number of bytes from address b up to the next n-byte boundary,
; with offsets measured from the start of the current section.  The (n)-1
; mask is only meaningful when n is a power of two.  The result is 0 when b is
; already on a boundary.
349 %define FILLB(b, n) (($$-(b)) & ((n)-1))
351 %imacro alignx 1-2.nolist 0xFFFF
353 times MSKLE(FILLB(%%bs, %1), %2) & MSKLE(16, FILLB($, %1)) & FILLB($, %1) \
355 times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 9 \
356 db 0x8D, 0x9C, 0x23, 0x00, 0x00, 0x00, 0x00 ; lea ebx,[ebx+0x00000000]
357 times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 7 \
358 db 0x8D, 0xAC, 0x25, 0x00, 0x00, 0x00, 0x00 ; lea ebp,[ebp+0x00000000]
359 times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 6 \
360 db 0x8D, 0xAD, 0x00, 0x00, 0x00, 0x00 ; lea ebp,[ebp+0x00000000]
361 times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 4 \
362 db 0x8D, 0x6C, 0x25, 0x00 ; lea ebp,[ebp+0x00]
363 times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 3 \
364 db 0x8D, 0x6D, 0x00 ; lea ebp,[ebp+0x00]
365 times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 2 \
366 db 0x8B, 0xED ; mov ebp,ebp
367 times MSKLE(FILLB(%%bs, %1), %2) & FILLB($, %1) / 1 \
371 ; Align the next data on {2,4,8,16,..}-byte boundary.
373 %imacro alignz 1.nolist
374 align %1, db 0 ; filling zeros
381 %imacro collect_args 1
382 sub rsp, SIZEOF_XMMWORD
383 movaps XMMWORD [rsp], xmm6
384 sub rsp, SIZEOF_XMMWORD
385 movaps XMMWORD [rsp], xmm7
410 %imacro uncollect_args 1
425 movaps xmm7, XMMWORD [rsp]
426 add rsp, SIZEOF_XMMWORD
427 movaps xmm6, XMMWORD [rsp]
428 add rsp, SIZEOF_XMMWORD
432 sub rsp, %1 * SIZEOF_XMMWORD
433 movaps XMMWORD [rsp+0*SIZEOF_XMMWORD], xmm8
435 movaps XMMWORD [rsp+1*SIZEOF_XMMWORD], xmm9
438 movaps XMMWORD [rsp+2*SIZEOF_XMMWORD], xmm10
441 movaps XMMWORD [rsp+3*SIZEOF_XMMWORD], xmm11
446 movaps xmm8, XMMWORD [rsp+0*SIZEOF_XMMWORD]
448 movaps xmm9, XMMWORD [rsp+1*SIZEOF_XMMWORD]
451 movaps xmm10, XMMWORD [rsp+2*SIZEOF_XMMWORD]
454 movaps xmm11, XMMWORD [rsp+3*SIZEOF_XMMWORD]
456 add rsp, %1 * SIZEOF_XMMWORD
461 %imacro collect_args 1
486 %imacro uncollect_args 1
515 ; --------------------------------------------------------------------------
516 ; Defines picked up from the C headers
518 %include "jsimdcfg.inc"
520 ; --------------------------------------------------------------------------