#include <stddef.h>
#include <inttypes.h> /* for __WORDSIZE */
+#include "asmalign.h"
+
#ifndef __WORDSIZE
// #warning You have misconfigured system and probably will lose performance!
#define __WORDSIZE MP_WORDSIZE
#define PREFETCHW "prefetcht0"
#define PAVGB "pavgb"
#else
+#ifdef __APPLE__
+#define PREFETCH "#"
+#define PREFETCHW "#"
+#elif
#define PREFETCH "/nop"
#define PREFETCHW "/nop"
#endif
+#endif
#ifdef HAVE_3DNOW
/* On K6 femms is faster of emms. On K7 femms is directly mapped on emms. */
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
+#ifdef __APPLE__
+#define SFENCE "#"
+#elif
#define SFENCE "/nop"
#endif
+#endif
static inline void RENAME(rgb24to32)(const uint8_t *src,uint8_t *dst,long src_size)
{
"movq %3, %%mm5 \n\t"
"movq %4, %%mm6 \n\t"
"movq %5, %%mm7 \n\t"
- ".balign 16 \n\t"
+ ASMALIGN16
"1: \n\t"
PREFETCH" 32(%1) \n\t"
"movd (%1), %%mm0 \n\t"
"movq %3, %%mm5 \n\t"
"movq %4, %%mm6 \n\t"
"movq %5, %%mm7 \n\t"
- ".balign 16 \n\t"
+ ASMALIGN16
"1: \n\t"
PREFETCH" 32(%1) \n\t"
"movd (%1), %%mm0 \n\t"
/* TODO: unroll this loop */
asm volatile (
"xor %%"REG_a", %%"REG_a" \n\t"
- ".balign 16 \n\t"
+ ASMALIGN16
"1: \n\t"
PREFETCH" 32(%0, %%"REG_a") \n\t"
"movq (%0, %%"REG_a"), %%mm0 \n\t"
"movq "MANGLE(mask24r)", %%mm5 \n\t"
"movq "MANGLE(mask24g)", %%mm6 \n\t"
"movq "MANGLE(mask24b)", %%mm7 \n\t"
- ".balign 16 \n\t"
+ ASMALIGN16
"1: \n\t"
PREFETCH" 32(%1, %%"REG_a") \n\t"
"movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
//FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
asm volatile(
"xor %%"REG_a", %%"REG_a" \n\t"
- ".balign 16 \n\t"
+ ASMALIGN16
"1: \n\t"
PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
PREFETCH" 32(%2, %%"REG_a") \n\t"
//FIXME handle 2 lines a once (fewer prefetch, reuse some chrom, but very likely limited by mem anyway)
asm volatile(
"xor %%"REG_a", %%"REG_a" \n\t"
- ".balign 16 \n\t"
+ ASMALIGN16
"1: \n\t"
PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
PREFETCH" 32(%2, %%"REG_a") \n\t"
"xor %%"REG_a", %%"REG_a" \n\t"
"pcmpeqw %%mm7, %%mm7 \n\t"
"psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
- ".balign 16 \n\t"
+ ASMALIGN16
"1: \n\t"
PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
"movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
asm volatile(
"xor %%"REG_a", %%"REG_a" \n\t"
- ".balign 16 \n\t"
+ ASMALIGN16
"1: \n\t"
PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
"movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
"xorl %%eax, %%eax \n\t"
"pcmpeqw %%mm7, %%mm7 \n\t"
"psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
- ".balign 16 \n\t"
+ ASMALIGN16
"1: \n\t"
PREFETCH" 64(%0, %%eax, 4) \n\t"
"movq (%0, %%eax, 4), %%mm0 \n\t" // UYVY UYVY(0)
asm volatile(
"xorl %%eax, %%eax \n\t"
- ".balign 16 \n\t"
+ ASMALIGN16
"1: \n\t"
PREFETCH" 64(%0, %%eax, 4) \n\t"
"movq (%0, %%eax, 4), %%mm0 \n\t" // YUYV YUYV(0)
"movq "MANGLE(w1111)", %%mm5 \n\t"
"pxor %%mm7, %%mm7 \n\t"
"lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
- ".balign 16 \n\t"
+ ASMALIGN16
"1: \n\t"
PREFETCH" 64(%0, %%"REG_b") \n\t"
"movd (%0, %%"REG_b"), %%mm0 \n\t"
"pxor %%mm7, %%mm7 \n\t"
"lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
"add %%"REG_b", %%"REG_b" \n\t"
- ".balign 16 \n\t"
+ ASMALIGN16
"1: \n\t"
PREFETCH" 64(%0, %%"REG_b") \n\t"
PREFETCH" 64(%1, %%"REG_b") \n\t"
Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*/
+#include "asmalign.h"
+
#undef REAL_MOVNTQ
#undef MOVNTQ
#undef PAVGB
"movq %%mm3, %%mm4 \n\t"\
"lea " offset "(%0), %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
- ".balign 16 \n\t" /* FIXME Unroll? */\
+ ASMALIGN16 /* FIXME Unroll? */\
"1: \n\t"\
"movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
"movq " #x "(%%"REG_S", %%"REG_a", 2), %%mm2\n\t" /* srcData */\
#define YSCALEYUV2YV121 \
"mov %2, %%"REG_a" \n\t"\
- ".balign 16 \n\t" /* FIXME Unroll? */\
+ ASMALIGN16 /* FIXME Unroll? */\
"1: \n\t"\
"movq (%0, %%"REG_a", 2), %%mm0 \n\t"\
"movq 8(%0, %%"REG_a", 2), %%mm1\n\t"\
*/
#define YSCALEYUV2PACKEDX \
"xor %%"REG_a", %%"REG_a" \n\t"\
- ".balign 16 \n\t"\
+ ASMALIGN16\
"nop \n\t"\
"1: \n\t"\
"lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"\n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"movq "VROUNDER_OFFSET"(%0), %%mm3\n\t"\
"movq %%mm3, %%mm4 \n\t"\
- ".balign 16 \n\t"\
+ ASMALIGN16\
"2: \n\t"\
"movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
"movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"movq "VROUNDER_OFFSET"(%0), %%mm1\n\t"\
"movq %%mm1, %%mm7 \n\t"\
- ".balign 16 \n\t"\
+ ASMALIGN16\
"2: \n\t"\
"movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
"movq (%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y1srcData */\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"xor %%"REG_a", %%"REG_a" \n\t"\
- ".balign 16 \n\t"\
+ ASMALIGN16\
"1: \n\t"\
"movq (%0, %%"REG_a", 2), %%mm0 \n\t" /*buf0[eax]*/\
"movq (%1, %%"REG_a", 2), %%mm1 \n\t" /*buf1[eax]*/\
"movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c")\n\t"\
"movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c")\n\t"\
"xor "#index", "#index" \n\t"\
- ".balign 16 \n\t"\
+ ASMALIGN16\
"1: \n\t"\
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
#define REAL_YSCALEYUV2RGB(index, c) \
"xor "#index", "#index" \n\t"\
- ".balign 16 \n\t"\
+ ASMALIGN16\
"1: \n\t"\
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
#define REAL_YSCALEYUV2PACKED1(index, c) \
"xor "#index", "#index" \n\t"\
- ".balign 16 \n\t"\
+ ASMALIGN16\
"1: \n\t"\
"movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
"movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
#define REAL_YSCALEYUV2RGB1(index, c) \
"xor "#index", "#index" \n\t"\
- ".balign 16 \n\t"\
+ ASMALIGN16\
"1: \n\t"\
"movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
"movq 4096(%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
#define REAL_YSCALEYUV2PACKED1b(index, c) \
"xor "#index", "#index" \n\t"\
- ".balign 16 \n\t"\
+ ASMALIGN16\
"1: \n\t"\
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
// do vertical chrominance interpolation
#define REAL_YSCALEYUV2RGB1b(index, c) \
"xor "#index", "#index" \n\t"\
- ".balign 16 \n\t"\
+ ASMALIGN16\
"1: \n\t"\
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
"movq "MANGLE(w1111)", %%mm5 \n\t"
"pxor %%mm7, %%mm7 \n\t"
"lea (%%"REG_a", %%"REG_a", 2), %%"REG_b"\n\t"
- ".balign 16 \n\t"
+ ASMALIGN16
"1: \n\t"
PREFETCH" 64(%0, %%"REG_b") \n\t"
"movd (%0, %%"REG_b"), %%mm0 \n\t"
"pxor %%mm7, %%mm7 \n\t"
"lea (%%"REG_a", %%"REG_a", 2), %%"REG_b" \n\t"
"add %%"REG_b", %%"REG_b" \n\t"
- ".balign 16 \n\t"
+ ASMALIGN16
"1: \n\t"
PREFETCH" 64(%0, %%"REG_b") \n\t"
PREFETCH" 64(%1, %%"REG_b") \n\t"
"movq "MANGLE(w02)", %%mm6 \n\t"
"push %%"REG_BP" \n\t" // we use 7 regs here ...
"mov %%"REG_a", %%"REG_BP" \n\t"
- ".balign 16 \n\t"
+ ASMALIGN16
"1: \n\t"
"movzwl (%2, %%"REG_BP"), %%eax \n\t"
"movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
"movq "MANGLE(w02)", %%mm6 \n\t"
"push %%"REG_BP" \n\t" // we use 7 regs here ...
"mov %%"REG_a", %%"REG_BP" \n\t"
- ".balign 16 \n\t"
+ ASMALIGN16
"1: \n\t"
"movzwl (%2, %%"REG_BP"), %%eax \n\t"
"movzwl 2(%2, %%"REG_BP"), %%ebx\n\t"
asm volatile(
"pxor %%mm7, %%mm7 \n\t"
"movq "MANGLE(w02)", %%mm6 \n\t"
- ".balign 16 \n\t"
+ ASMALIGN16
"1: \n\t"
"mov %2, %%"REG_c" \n\t"
"movzwl (%%"REG_c", %0), %%eax \n\t"
"xor %%"REG_a", %%"REG_a" \n\t" // i
"xor %%"REG_b", %%"REG_b" \n\t" // xx
"xorl %%ecx, %%ecx \n\t" // 2*xalpha
- ".balign 16 \n\t"
+ ASMALIGN16
"1: \n\t"
"movzbl (%0, %%"REG_b"), %%edi \n\t" //src[xx]
"movzbl 1(%0, %%"REG_b"), %%esi \n\t" //src[xx+1]
"xor %%"REG_a", %%"REG_a" \n\t" // i
"xor %%"REG_b", %%"REG_b" \n\t" // xx
"xorl %%ecx, %%ecx \n\t" // 2*xalpha
- ".balign 16 \n\t"
+ ASMALIGN16
"1: \n\t"
"mov %0, %%"REG_S" \n\t"
"movzbl (%%"REG_S", %%"REG_b"), %%edi \n\t" //src[xx]