2 * LIBOIL - Library of Optimized Inner Loops
3 * Copyright (c) 2005 David A. Schleef <ds@schleef.org>
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
9 * 1. Redistributions of source code must retain the above copyright
10 * notice, this list of conditions and the following disclaimer.
11 * 2. Redistributions in binary form must reproduce the above copyright
12 * notice, this list of conditions and the following disclaimer in the
13 * documentation and/or other materials provided with the distribution.
15 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
16 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
18 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
19 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
21 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
23 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
24 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
25 * POSSIBILITY OF SUCH DAMAGE.
32 #include <liboil/liboilfunction.h>
33 #include <liboil/liboilclasses.h>
/*
 * swab_u32_i386: inline-assembly implementation registered for the
 * swab_u32 class.
 *
 * dest, src and n are bound to asm operands %0, %1 and %2 as read-write
 * register operands ("+r"), so the assembly may advance the pointers and
 * modify the count in place.
 *
 * NOTE(review): presumably reverses the byte order of n 32-bit words from
 * src into dest; the swapping instruction itself is not among the lines
 * visible here -- confirm against the full asm body.
 */
36 swab_u32_i386 (uint32_t *dest, uint32_t *src, int n)
/* Load one 32-bit word from the current src position into eax. */
40 " movl 0(%1), %%eax\n"
/* Store eax to the current dest position. */
42 " movl %%eax, 0(%0)\n"
/* Operands: %0 = dest, %1 = src, %2 = n (all read-write registers). */
47 : "+r" (dest), "+r" (src), "+r" (n)
/* Declare this function as an implementation of the swab_u32 class. */
51 OIL_DEFINE_IMPL (swab_u32_i386, swab_u32);
/*
 * swab_u32_i386_unroll2: inline-assembly implementation registered for
 * the swab_u32 class, with a step that handles two 32-bit words at once.
 *
 * dest, src and n are bound to asm operands %0, %1 and %2 as read-write
 * register operands. The asm declares eax, ecx and memory as clobbered.
 *
 * NOTE(review): the byte-swapping instructions and the loop control are
 * not among the lines visible here; presumably the single-word step below
 * handles an odd leftover word before the two-word loop -- confirm.
 */
54 swab_u32_i386_unroll2 (uint32_t *dest, uint32_t *src, int n)
/* Single-word step: one 32-bit word through eax. */
59 " movl 0(%1), %%eax\n"
61 " movl %%eax, 0(%0)\n"
/* Two-word step: words at byte offsets 0 and 4 go through eax and ecx. */
67 " movl 0(%1), %%eax\n"
68 " movl 4(%1), %%ecx\n"
/* Store both words back to the same offsets in dest. */
71 " movl %%eax, 0(%0)\n"
72 " movl %%ecx, 4(%0)\n"
/* Operands: %0 = dest, %1 = src, %2 = n (all read-write registers). */
78 : "+r" (dest), "+r" (src), "+r" (n)
/* Clobbers: the two scratch registers used above, plus memory. */
80 : "eax", "ecx", "memory");
/* Declare this function as an implementation of the swab_u32 class. */
82 OIL_DEFINE_IMPL (swab_u32_i386_unroll2, swab_u32);
/*
 * swab_u32_i386_unroll4: inline-assembly implementation registered for
 * the swab_u32 class, with a step that handles four 32-bit words (16
 * bytes) at once, processed as two pairs through eax and ecx.
 *
 * dest, src and n are bound to asm operands %0, %1 and %2 as read-write
 * register operands. The asm declares eax, ecx and memory as clobbered.
 *
 * NOTE(review): the byte-swapping instructions and the loop control are
 * not among the lines visible here; presumably the single-word step below
 * runs until the remaining count is a multiple of four -- confirm.
 */
85 swab_u32_i386_unroll4 (uint32_t *dest, uint32_t *src, int n)
/* Single-word step: one 32-bit word through eax. */
90 " movl 0(%1), %%eax\n"
92 " movl %%eax, 0(%0)\n"
/* Four-word step, first pair: byte offsets 0 and 4. */
102 " movl 0(%1), %%eax\n"
103 " movl 4(%1), %%ecx\n"
106 " movl %%eax, 0(%0)\n"
107 " movl %%ecx, 4(%0)\n"
/* Four-word step, second pair: byte offsets 8 and 12, reusing eax/ecx. */
108 " movl 8(%1), %%eax\n"
109 " movl 12(%1), %%ecx\n"
112 " movl %%eax, 8(%0)\n"
113 " movl %%ecx, 12(%0)\n"
/* Operands: %0 = dest, %1 = src, %2 = n (all read-write registers). */
119 : "+r" (dest), "+r" (src), "+r" (n)
/* Clobbers: the two scratch registers used above, plus memory. */
121 : "eax", "ecx", "memory");
/* Declare this function as an implementation of the swab_u32 class. */
123 OIL_DEFINE_IMPL (swab_u32_i386_unroll4, swab_u32);
128 * This could be improved by using aligned stores
/*
 * swab_u16_mmx: MMX inline-assembly implementation registered for the
 * swab_u16 class.
 *
 * dest, src and n are bound to asm operands %0, %1 and %2 as read-write
 * register operands.
 *
 * The visible code has a scalar step that moves one 16-bit value through
 * ax, and a vector step that loads the same 8 source bytes into both mm0
 * and mm1, ORs them together and stores 8 bytes to dest.
 *
 * NOTE(review): the instructions between the two loads and the por are not
 * visible here; presumably mm0 and mm1 are shifted in opposite directions
 * by 8 bits so that the por yields byte-swapped 16-bit lanes -- confirm.
 * Also confirm that the MMX state is cleared (emms) before returning and
 * that every register the asm touches appears in its clobber list.
 */
131 swab_u16_mmx (uint16_t *dest, uint16_t *src, int n)
/* Scalar step: one 16-bit value through ax. */
136 " movw 0(%1), %%ax\n"
138 " movw %%ax, 0(%0)\n"
/* Vector step: two copies of the same 8 source bytes (four 16-bit values). */
148 " movq 0(%1), %%mm0\n"
149 " movq 0(%1), %%mm1\n"
/* Merge the two copies and write the 8-byte result to dest. */
152 " por %%mm0, %%mm1\n"
153 " movq %%mm1, 0(%0)\n"
/* Operands: %0 = dest, %1 = src, %2 = n (all read-write registers). */
160 : "+r" (dest), "+r" (src), "+r" (n)
/* Declare this as a swab_u16 implementation flagged as requiring MMX. */
164 OIL_DEFINE_IMPL_FULL (swab_u16_mmx, swab_u16, OIL_IMPL_FLAG_MMX);
/*
 * swab_u16_mmx_unroll2: MMX inline-assembly implementation registered for
 * the swab_u16 class, with a vector step that handles 16 bytes (two
 * 8-byte MMX registers' worth) at once.
 *
 * dest, src and n are bound to asm operands %0, %1 and %2 as read-write
 * register operands.
 *
 * The first 8 source bytes are loaded into both mm0 and mm1, the next 8
 * into both mm2 and mm3; each pair is ORed together and stored to the
 * matching offset in dest.
 *
 * NOTE(review): the instructions between the loads and each por are not
 * visible here; presumably each pair is shifted in opposite directions by
 * 8 bits so that the por yields byte-swapped 16-bit lanes -- confirm.
 * Also confirm that the MMX state is cleared (emms) before returning and
 * that every register the asm touches appears in its clobber list.
 */
167 swab_u16_mmx_unroll2 (uint16_t *dest, uint16_t *src, int n)
/* Scalar step: one 16-bit value through ax. */
172 " movw 0(%1), %%ax\n"
174 " movw %%ax, 0(%0)\n"
/* Vector step: two copies of source bytes 0..7 ... */
184 " movq 0(%1), %%mm0\n"
185 " movq 0(%1), %%mm1\n"
/* ... and two copies of source bytes 8..15. */
186 " movq 8(%1), %%mm2\n"
188 " movq 8(%1), %%mm3\n"
/* Merge and store the first 8 bytes. */
191 " por %%mm0, %%mm1\n"
193 " movq %%mm1, 0(%0)\n"
/* Merge and store the second 8 bytes. */
194 " por %%mm2, %%mm3\n"
195 " movq %%mm3, 8(%0)\n"
/* Operands: %0 = dest, %1 = src, %2 = n (all read-write registers). */
202 : "+r" (dest), "+r" (src), "+r" (n)
/* Declare this as a swab_u16 implementation flagged as requiring MMX. */
206 OIL_DEFINE_IMPL_FULL (swab_u16_mmx_unroll2, swab_u16, OIL_IMPL_FLAG_MMX);
208 /* 10 instructions to swab 2 words? not likely */
/*
 * swab_u32_mmx: MMX inline-assembly implementation registered for the
 * swab_u32 class.
 *
 * dest, src and n are bound to asm operands %0, %1 and %2 as read-write
 * register operands.
 *
 * The vector step works on 8 bytes (two 32-bit words) in two stages: a
 * first merge of two copies of the source (por), then a second stage that
 * exchanges the upper and lower 16-bit halves of each 32-bit lane by
 * shifting one copy left and the other right by 16 bits and ORing them.
 *
 * NOTE(review): dest and src are declared uint16_t * although this
 * function is registered for swab_u32, the scalar step uses 32-bit moves,
 * and the plain i386 swab_u32 implementations in this file take
 * uint32_t *. The asm only uses the raw addresses, but the declared type
 * looks like a copy/paste slip -- confirm and align with the class
 * prototype.
 *
 * NOTE(review): the instructions between the two loads and the first por
 * are not visible here; presumably they swap the bytes inside each 16-bit
 * lane -- confirm. Also confirm that the MMX state is cleared (emms)
 * before returning and that the clobber list covers every register used.
 */
210 swab_u32_mmx (uint16_t *dest, uint16_t *src, int n)
/* Scalar step: one 32-bit word through eax. */
215 " movl 0(%1), %%eax\n"
217 " movl %%eax, 0(%0)\n"
/* Vector step: two copies of the same 8 source bytes. */
227 " movq 0(%1), %%mm0\n"
228 " movq 0(%1), %%mm1\n"
/* First merge; result is in mm1. */
231 " por %%mm0, %%mm1\n"
/* Duplicate the intermediate result, then swap the 16-bit halves of each
 * 32-bit lane: low half moved up in mm0, high half moved down in mm1. */
232 " movq %%mm1, %%mm0\n"
233 " pslld $16, %%mm0\n"
234 " psrld $16, %%mm1\n"
235 " por %%mm0, %%mm1\n"
/* Write the 8-byte result to dest. */
236 " movq %%mm1, 0(%0)\n"
/* Operands: %0 = dest, %1 = src, %2 = n (all read-write registers). */
243 : "+r" (dest), "+r" (src), "+r" (n)
/* Declare this as a swab_u32 implementation flagged as requiring MMX. */
247 OIL_DEFINE_IMPL_FULL (swab_u32_mmx, swab_u32, OIL_IMPL_FLAG_MMX);
/*
 * swab_u16_sse2: SSE2 inline-assembly implementation registered for the
 * swab_u16 class.
 *
 * dest, src and n are bound to asm operands %0, %1 and %2 as read-write
 * register operands.
 *
 * The vector step handles 16 bytes (eight 16-bit values) at once: two
 * copies of the source are loaded, one is shifted left and the other
 * right by 8 bits within each 16-bit lane, and the two are ORed, which
 * exchanges the two bytes of every 16-bit lane. Loads and stores use
 * movdqu, so neither pointer has to be 16-byte aligned.
 *
 * NOTE(review): loop control and the clobber list are not among the lines
 * visible here; presumably the scalar step below handles the elements
 * that do not fill a whole 16-byte group -- confirm, and confirm that
 * xmm0/xmm1 are covered by the clobber list.
 */
250 swab_u16_sse2 (uint16_t *dest, uint16_t *src, int n)
/* Scalar step: one 16-bit value through ax. */
255 " movw 0(%1), %%ax\n"
257 " movw %%ax, 0(%0)\n"
/* Vector step: two unaligned loads of the same 16 source bytes. */
267 " movdqu 0(%1), %%xmm0\n"
268 " movdqu 0(%1), %%xmm1\n"
/* Low byte of each 16-bit lane moves up in xmm0, high byte down in xmm1. */
269 " psllw $8, %%xmm0\n"
270 " psrlw $8, %%xmm1\n"
/* Combine into byte-swapped lanes and store (unaligned) to dest. */
271 " por %%xmm0, %%xmm1\n"
272 " movdqu %%xmm1, 0(%0)\n"
/* Operands: %0 = dest, %1 = src, %2 = n (all read-write registers). */
278 : "+r" (dest), "+r" (src), "+r" (n)
/* Declare this as a swab_u16 implementation flagged as requiring SSE2. */
282 OIL_DEFINE_IMPL_FULL (swab_u16_sse2, swab_u16, OIL_IMPL_FLAG_SSE2);
/*
 * swab_u32_sse2: SSE2 inline-assembly implementation registered for the
 * swab_u32 class.
 *
 * dest, src and n are bound to asm operands %0, %1 and %2 as read-write
 * register operands.
 *
 * The vector step handles 16 bytes (four 32-bit words) at once in two
 * stages. Stage one exchanges the two bytes of every 16-bit lane (shift
 * left/right by 8 and OR). Stage two exchanges the two 16-bit halves of
 * every 32-bit lane (shift left/right by 16 and OR). Together they reverse
 * the four bytes of each 32-bit word. Loads and stores use movdqu, so
 * neither pointer has to be 16-byte aligned.
 *
 * NOTE(review): dest and src are declared uint16_t * although this
 * function is registered for swab_u32, the scalar step uses 32-bit moves,
 * and the plain i386 swab_u32 implementations in this file take
 * uint32_t *. The asm only uses the raw addresses, but the declared type
 * looks like a copy/paste slip -- confirm and align with the class
 * prototype.
 *
 * NOTE(review): loop control and the clobber list are not among the lines
 * visible here -- confirm that xmm0/xmm1 are covered by the clobber list.
 */
285 swab_u32_sse2 (uint16_t *dest, uint16_t *src, int n)
/* Scalar step: one 32-bit word through eax. */
290 " movl 0(%1), %%eax\n"
292 " movl %%eax, 0(%0)\n"
/* Vector step: two unaligned loads of the same 16 source bytes. */
302 " movdqu 0(%1), %%xmm0\n"
303 " movdqu 0(%1), %%xmm1\n"
/* Stage one: swap the bytes inside each 16-bit lane; result in xmm1. */
304 " psllw $8, %%xmm0\n"
305 " psrlw $8, %%xmm1\n"
306 " por %%xmm0, %%xmm1\n"
/* Stage two: duplicate the result and swap the 16-bit halves of each
 * 32-bit lane. */
307 " movdqu %%xmm1, %%xmm0\n"
308 " pslld $16, %%xmm0\n"
309 " psrld $16, %%xmm1\n"
310 " por %%xmm0, %%xmm1\n"
/* Store the 16-byte result (unaligned) to dest. */
311 " movdqu %%xmm1, 0(%0)\n"
/* Operands: %0 = dest, %1 = src, %2 = n (all read-write registers). */
317 : "+r" (dest), "+r" (src), "+r" (n)
/* Declare this as a swab_u32 implementation flagged as requiring SSE2. */
321 OIL_DEFINE_IMPL_FULL (swab_u32_sse2, swab_u32, OIL_IMPL_FLAG_SSE2);