*/
#include <stddef.h>
+#include <stdint.h>
#include "libavutil/attributes.h"
+#include "libavutil/x86/asm.h"
#undef PREFETCH
#undef MOVNTQ
"movq %%mm5, %%mm7 \n\t"
STORE_BGR24_MMX
:: "r"(dest), "r"(s)
+ NAMED_CONSTRAINTS_ADD(mask24l,mask24h)
:"memory");
dest += 24;
s += 32;
:"=m"(*d)
:"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
+ NAMED_CONSTRAINTS_ADD(mul15_mid,mul15_hi)
:"memory");
/* borrowed 32 to 24 */
__asm__ volatile(
STORE_BGR24_MMX
:: "r"(d), "m"(*s)
+ NAMED_CONSTRAINTS_ADD(mask24l,mask24h)
:"memory");
d += 24;
s += 8;
"por %%mm5, %%mm3 \n\t"
:"=m"(*d)
:"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
+ NAMED_CONSTRAINTS_ADD(mul15_mid,mul16_mid,mul15_hi)
:"memory");
/* borrowed 32 to 24 */
__asm__ volatile(
STORE_BGR24_MMX
:: "r"(d), "m"(*s)
+ NAMED_CONSTRAINTS_ADD(mask24l,mask24h)
:"memory");
d += 24;
s += 8;
"pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
PACK_RGB32
::"r"(d),"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r) ,"m"(mul15_mid)
+ NAMED_CONSTRAINTS_ADD(mul15_hi)
:"memory");
d += 16;
s += 4;
"pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
PACK_RGB32
::"r"(d),"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mul15_mid)
+ NAMED_CONSTRAINTS_ADD(mul16_mid,mul15_hi)
:"memory");
d += 16;
s += 4;
"2: \n\t"
: "+a" (mmx_size)
: "r" (src-mmx_size), "r"(dst-mmx_size)
+ NAMED_CONSTRAINTS_ADD(mask24r,mask24g,mask24b)
);
__asm__ volatile(SFENCE:::"memory");
:: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
"r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
"g" (-mmxSize)
+ NAMED_CONSTRAINTS_ADD(mmx_ff)
: "%"REG_a
);
"add $8, %%"REG_a" \n\t"
" js 1b \n\t"
: : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width), "r"(rgb2yuv)
+ NAMED_CONSTRAINTS_ADD(ff_w1111,ff_bgr2YOffset)
: "%"REG_a, "%"REG_d
);
ydst += lumStride;
"add $4, %%"REG_a" \n\t"
" js 1b \n\t"
: : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth), "r"(rgb2yuv)
+ NAMED_CONSTRAINTS_ADD(ff_w1111,ff_bgr2UVOffset)
: "%"REG_a, "%"REG_d
);
#endif /* HAVE_7REGS */
#endif /* !COMPILE_TEMPLATE_SSE2 */
-#if !COMPILE_TEMPLATE_AMD3DNOW
+#if !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX
static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dest,
int width, int height, int src1Stride,
int src2Stride, int dstStride)
::: "memory"
);
}
+#endif /* !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX */
+
+#if !COMPILE_TEMPLATE_AVX || HAVE_AVX_EXTERNAL
+#if !COMPILE_TEMPLATE_AMD3DNOW && (ARCH_X86_32 || COMPILE_TEMPLATE_SSE2) && COMPILE_TEMPLATE_MMXEXT == COMPILE_TEMPLATE_SSE2 && HAVE_YASM
+void RENAME(ff_nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
+ const uint8_t *unused,
+ const uint8_t *src1,
+ const uint8_t *src2,
+ int w,
+ uint32_t *unused2);
+static void RENAME(deinterleaveBytes)(const uint8_t *src, uint8_t *dst1, uint8_t *dst2,
+ int width, int height, int srcStride,
+ int dst1Stride, int dst2Stride)
+{
+ int h;
+
+ for (h = 0; h < height; h++) {
+ RENAME(ff_nv12ToUV)(dst1, dst2, NULL, src, NULL, width, NULL);
+ src += srcStride;
+ dst1 += dst1Stride;
+ dst2 += dst2Stride;
+ }
+ __asm__(
+ EMMS" \n\t"
+ SFENCE" \n\t"
+ ::: "memory"
+ );
+}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
+#endif /* !COMPILE_TEMPLATE_AVX || HAVE_AVX_EXTERNAL */
#if !COMPILE_TEMPLATE_SSE2
#if !COMPILE_TEMPLATE_AMD3DNOW
}
}
+static void RENAME(extract_odd)(const uint8_t *src, uint8_t *dst, x86_reg count)
+{
+ src ++;
+ dst += count;
+ src += 2*count;
+ count= - count;
+
+ if(count < -16) {
+ count += 16;
+ __asm__ volatile(
+ "pcmpeqw %%mm7, %%mm7 \n\t"
+ "psrlw $8, %%mm7 \n\t"
+ "1: \n\t"
+ "movq -32(%1, %0, 2), %%mm0 \n\t"
+ "movq -24(%1, %0, 2), %%mm1 \n\t"
+ "movq -16(%1, %0, 2), %%mm2 \n\t"
+ "movq -8(%1, %0, 2), %%mm3 \n\t"
+ "pand %%mm7, %%mm0 \n\t"
+ "pand %%mm7, %%mm1 \n\t"
+ "pand %%mm7, %%mm2 \n\t"
+ "pand %%mm7, %%mm3 \n\t"
+ "packuswb %%mm1, %%mm0 \n\t"
+ "packuswb %%mm3, %%mm2 \n\t"
+ MOVNTQ" %%mm0,-16(%2, %0) \n\t"
+ MOVNTQ" %%mm2,- 8(%2, %0) \n\t"
+ "add $16, %0 \n\t"
+ " js 1b \n\t"
+ : "+r"(count)
+ : "r"(src), "r"(dst)
+ );
+ count -= 16;
+ }
+ while(count<0) {
+ dst[count]= src[2*count];
+ count++;
+ }
+}
+
#if !COMPILE_TEMPLATE_AMD3DNOW
static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
const int chromWidth = FF_CEIL_RSHIFT(width, 1);
for (y=0; y<height; y++) {
- RENAME(extract_even)(src+1, ydst, width);
+ RENAME(extract_odd)(src, ydst, width);
if(y&1) {
RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth);
udst+= chromStride;
const int chromWidth = FF_CEIL_RSHIFT(width, 1);
for (y=0; y<height; y++) {
- RENAME(extract_even)(src+1, ydst, width);
+ RENAME(extract_odd)(src, ydst, width);
RENAME(extract_even2)(src, udst, vdst, chromWidth);
src += srcStride;
uyvytoyuv420 = RENAME(uyvytoyuv420);
#endif /* !COMPILE_TEMPLATE_SSE2 */
-#if !COMPILE_TEMPLATE_AMD3DNOW
+#if !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX
interleaveBytes = RENAME(interleaveBytes);
-#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
+#endif /* !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX */
+#if !COMPILE_TEMPLATE_AVX || HAVE_AVX_EXTERNAL
+#if !COMPILE_TEMPLATE_AMD3DNOW && (ARCH_X86_32 || COMPILE_TEMPLATE_SSE2) && COMPILE_TEMPLATE_MMXEXT == COMPILE_TEMPLATE_SSE2 && HAVE_YASM
+ deinterleaveBytes = RENAME(deinterleaveBytes);
+#endif
+#endif
}