#include <string.h>
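+/* Blend one 8-bit component: weight the destination by (256 - alpha) and the
+ * source by alpha, dividing by 256 (>> 8) to match the MMX code below */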
+#define BLEND(D,S,alpha) (((D) * (256 - (alpha)) + (S) * (alpha)) >> 8)
+
#ifdef HAVE_GCC_ASM
#if defined(HAVE_CPU_I386) || defined(HAVE_CPU_X86_64)
#define BUILD_X86_ASM
/* Below are the implementations of everything */
-#define BLEND(D,S,alpha) (((D) * (255 - (alpha)) + (S) * (alpha)) >> 8)
-
inline static void
_blend_u8_c (guint8 * dest, const guint8 * src,
gint src_stride, gint dest_stride, gint src_width, gint src_height,
gint i; \
gint dest_stride = GST_ROUND_UP_4 (width * bpp); \
\
- red = CLAMP (1.164 * (colY - 16) + 1.596 * (colV - 128), 0, 255); \
- green = \
- CLAMP (1.164 * (colY - 16) - 0.813 * (colV - 128) - 0.391 * (colU - 128), \
- 0, 255); \
- blue = CLAMP (1.164 * (colY - 16) + 2.018 * (colU - 128), 0, 255); \
+ red = YUV_TO_R (colY, colU, colV); \
+ green = YUV_TO_G (colY, colU, colV); \
+ blue = YUV_TO_B (colY, colU, colV); \
\
for (i = 0; i < height; i++) { \
MEMSET_RGB (dest, red, green, blue, width); \
/* MMX Implementations */
#ifdef BUILD_X86_ASM
+
+#define MEMSET_xRGB_MMX(name, r, g, b) \
+static inline void \
+_memset_##name##_mmx (guint8* dest, gint red, gint green, gint blue, gint width) { \
+ guint32 val = (red << r) | (green << g) | (blue << b); \
+ \
+ _memset_u32_mmx ((guint32 *) dest, val, width); \
+}
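+/* e.g. MEMSET_xRGB_MMX (xrgb, 16, 8, 0) below defines _memset_xrgb_mmx (),
+ * which packs each pixel as 0x00RRGGBB and fills the row with _memset_u32_mmx () */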
+
#define A32
#define NAME_BLEND _blend_loop_argb_mmx
#define NAME_FILL_COLOR _fill_color_loop_argb_mmx
A32_COLOR (argb_mmx, TRUE, _fill_color_loop_argb_mmx);
A32_COLOR (bgra_mmx, TRUE, _fill_color_loop_bgra_mmx);
A32_COLOR (ayuv_mmx, FALSE, _fill_color_loop_argb_mmx);
+
+I420_BLEND (mmx, _memcpy_u8_mmx, _blend_u8_mmx);
+I420_FILL_CHECKER (mmx, _memset_u8_mmx);
+I420_FILL_COLOR (mmx, _memset_u8_mmx);
+
+RGB_BLEND (rgb_mmx, 3, _memcpy_u8_mmx, _blend_u8_mmx);
+
+RGB_BLEND (xrgb_mmx, 4, _memcpy_u8_mmx, _blend_u8_mmx);
+MEMSET_xRGB_MMX (xrgb, 16, 8, 0);
+RGB_FILL_COLOR (xrgb_mmx, 4, _memset_xrgb_mmx);
+
+MEMSET_xRGB_MMX (xbgr, 0, 8, 16);
+RGB_FILL_COLOR (xbgr_mmx, 4, _memset_xbgr_mmx);
+
+MEMSET_xRGB_MMX (rgbx, 24, 16, 8);
+RGB_FILL_COLOR (rgbx_mmx, 4, _memset_rgbx_mmx);
+
+MEMSET_xRGB_MMX (bgrx, 8, 16, 24);
+RGB_FILL_COLOR (bgrx_mmx, 4, _memset_bgrx_mmx);
#endif
/* Init function */
if (cpu_flags & OIL_IMPL_FLAG_MMX) {
gst_video_mixer_blend_argb = blend_argb_mmx;
gst_video_mixer_blend_bgra = blend_bgra_mmx;
+ gst_video_mixer_blend_i420 = blend_i420_mmx;
+ gst_video_mixer_blend_rgb = blend_rgb_mmx;
+ gst_video_mixer_blend_xrgb = blend_xrgb_mmx;
+
+ gst_video_mixer_fill_checker_i420 = fill_checker_i420_mmx;
gst_video_mixer_fill_color_argb = fill_color_argb_mmx;
gst_video_mixer_fill_color_bgra = fill_color_bgra_mmx;
gst_video_mixer_fill_color_ayuv = fill_color_ayuv_mmx;
+ gst_video_mixer_fill_color_i420 = fill_color_i420_mmx;
+ gst_video_mixer_fill_color_xrgb = fill_color_xrgb_mmx;
+ gst_video_mixer_fill_color_xbgr = fill_color_xbgr_mmx;
+ gst_video_mixer_fill_color_rgbx = fill_color_rgbx_mmx;
+ gst_video_mixer_fill_color_bgrx = fill_color_bgrx_mmx;
}
#endif
}
gint dest_add = dest_stride - (4 * src_width);
for (i = 0; i < src_height; i++) {
- gulong old_ebx;
-
/* (P1 * (256 - A) + (P2 * A)) / 256
* => (P1 * 256 - P1 * A + P2 * A) / 256
* => (P1 * 256 + A * (P2 - P1)) / 256
*/
/* *INDENT-OFF* */
__asm__ __volatile__ (
- " movl %%ebx , %6 \n\t"
-
" pcmpeqd %%mm5 , %%mm5 \n\t" /* mm5 = 0xffff... */
#if A_OFF == 0
" psrld $24 , %%mm5 \n\t" /* mm5 = 00 00 00 ff 00 00 00 ff, selector for alpha */
" movd %%eax , %%mm6 \n\t" /* mm6 = s_alpha */
" punpckldq %%mm6 , %%mm6 \n\t" /* mm6 = 00 00 00 aa 00 00 00 aa, alpha scale factor */
- " movl %5 , %%ebx \n\t" /* ebx = src_width */
- " test $1 , %%ebx \n\t" /* check odd pixel */
+ " movl %5 , %%ecx \n\t" /* ecx = src_width */
+ " test $1 , %%ecx \n\t" /* check odd pixel */
" je 1f \n\t"
/* do odd pixel */
" add $4 , %0 \n\t"
"1: \n\t"
- " sar $1 , %%ebx \n\t" /* prepare for 2 pixel per loop */
- " cmp $0 , %%ebx \n\t"
+ " sar $1 , %%ecx \n\t" /* prepare for 2 pixel per loop */
+ " cmp $0 , %%ecx \n\t"
" je 3f \n\t"
"2: \n\t"
" add $8 , %1 \n\t"
" add $8 , %0 \n\t"
- " dec %%ebx \n\t"
+ " dec %%ecx \n\t"
" jne 2b \n\t"
"3: \n\t"
- " movl %6 , %%ebx \n\t"
:"=r" (src), "=r" (dest)
- :"0" (src), "1" (dest), "m" (s_alpha), "m" (src_width), "m" (old_ebx)
- :"%eax", "memory"
+ :"0" (src), "1" (dest), "m" (s_alpha), "m" (src_width)
+ :"%eax", "%ecx", "memory"
#ifdef __MMX__
- , "mm0", "mm1", "mm2", "mm5", "mm6", "mm7"
+ , "mm0", "mm1", "mm2", "mm4", "mm3", "mm5", "mm6", "mm7"
#endif
);
/* *INDENT-ON* */
}
#endif
+#ifdef GENERIC
+static inline void
+_memcpy_u8_mmx (guint8 * dest, const guint8 * src, guint count)
+{
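+  /* Copy single bytes until the remaining count is a multiple of 8,
+   * then copy 8 bytes per iteration through an MMX register */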
+ /* *INDENT-OFF* */
+ __asm__ __volatile__ (
+ "1: \n\t"
+ "test $7, %0 \n\t"
+ "je 3f \n\t"
+ "2: \n\t"
+ "movb (%2), %%ah \n\t"
+ "movb %%ah, (%1) \n\t"
+ "inc %2 \n\t"
+ "inc %1 \n\t"
+ "dec %0 \n\t"
+ "test $7, %0 \n\t"
+ "jne 2b \n\t"
+ "3: \n\t"
+ "sar $3, %0 \n\t"
+ "cmp $0, %0 \n\t"
+ "je 5f \n\t"
+ "4: \n\t"
+ "movq (%2), %%mm0 \n\t"
+ "movq %%mm0, (%1) \n\t"
+ "add $8, %2 \n\t"
+ "add $8, %1 \n\t"
+ "dec %0 \n\t"
+ "jne 4b \n\t"
+ "5: \n\t"
+ "emms \n\t"
+ : "=r" (count), "=r" (dest), "=r" (src)
+ : "0" (count), "1" (dest), "2" (src)
+ : "memory", "ah"
+#ifdef __MMX__
+ , "mm0"
+#endif
+ );
+ /* *INDENT-ON* */
+}
+
+static inline void
+_memset_u8_mmx (guint8 * dest, guint val, guint count)
+{
+ guint8 val8 = val;
+ guint64 val64;
+
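+  /* Replicate the 8-bit value into all 8 bytes of a 64-bit word so the
+   * aligned loop below can store 8 bytes at a time with movq */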
+ val64 = (val << 24) | (val << 16) | (val << 8) | (val);
+ val64 = (val64 << 32) | val64;
+
+ /* *INDENT-OFF* */
+ __asm__ __volatile__ (
+ "1: \n\t"
+ "test $7, %0 \n\t"
+ "je 3f \n\t"
+ "2: \n\t"
+ "movb %4, (%1) \n\t"
+ "inc %1 \n\t"
+ "dec %0 \n\t"
+ "test $7, %0 \n\t"
+ "jne 2b \n\t"
+ "3: \n\t"
+ "sar $3, %0 \n\t"
+ "cmp $0, %0 \n\t"
+ "je 5f \n\t"
+ "movq %5, %%mm0 \n\t"
+ "4: \n\t"
+ "movq %%mm0, (%1) \n\t"
+ "add $8, %1 \n\t"
+ "dec %0 \n\t"
+ "jne 4b \n\t"
+ "5: \n\t"
+ "emms \n\t"
+ : "=r" (count), "=r" (dest)
+ : "0" (count), "1" (dest), "r" (val8), "m" (val64)
+ : "memory"
+#ifdef __MMX__
+ , "mm0"
+#endif
+ );
+ /* *INDENT-ON* */
+}
+
+static inline void
+_memset_u32_mmx (guint32 * dest, guint32 val, guint count)
+{
+ guint64 val64 = val;
+
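+  /* Duplicate the 32-bit value into both halves of a 64-bit word so the
+   * aligned loop below can store two pixels at a time with movq */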
+ val64 |= (val64 << 32);
+
+ /* *INDENT-OFF* */
+ __asm__ __volatile__ (
+ "1: \n\t"
+ "test $1, %0 \n\t"
+ "je 3f \n\t"
+ "2: \n\t"
+ "movl %4, (%1) \n\t"
+ "add $4, %1 \n\t"
+ "dec %0 \n\t"
+ "test $1, %0 \n\t"
+ "jne 2b \n\t"
+ "3: \n\t"
+ "sar $1, %0 \n\t"
+ "cmp $0, %0 \n\t"
+ "je 5f \n\t"
+ "movq %5, %%mm0 \n\t"
+ "4: \n\t"
+ "movq %%mm0, (%1) \n\t"
+ "add $8, %1 \n\t"
+ "dec %0 \n\t"
+ "jne 4b \n\t"
+ "5: \n\t"
+ "emms \n\t"
+ : "=r" (count), "=r" (dest)
+ : "0" (count), "1" (dest), "r" (val), "m" (val64)
+ : "memory"
+#ifdef __MMX__
+ , "mm0"
+#endif
+ );
+ /* *INDENT-ON* */
+}
+
+static inline void
+_blend_u8_mmx (guint8 * dest, const guint8 * src,
+ gint src_stride, gint dest_stride, gint src_width, gint src_height,
+ gint dest_width, gint s_alpha)
+{
+  gint i;
+  gint width;
+  gint src_add = src_stride - src_width;
+  gint dest_add = dest_stride - src_width;
+
+  for (i = 0; i < src_height; i++) {
+    /* Blend up to 3 leading "odd" pixels in C so the MMX loop below only
+     * sees a multiple of 4 bytes. Work on a per-row copy of src_width so
+     * src_add/dest_add stay valid for every row. */
+    width = src_width;
+    while ((width & 0x03)) {
+      *dest = BLEND (*dest, *src, s_alpha);
+      dest++;
+      src++;
+      width--;
+    }
+
+ /* (P1 * (256 - A) + (P2 * A)) / 256
+ * => (P1 * 256 - P1 * A + P2 * A) / 256
+     * => (P1 * 256 + A * (P2 - P1)) / 256
+ * => P1 + (A * (P2 - P1)) / 256
+ */
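+    /* Worked example of the identity above, with P1 = 100, P2 = 200, A = 128:
+     *   (100 * (256 - 128) + 200 * 128) / 256 = 150
+     *   100 + (128 * (200 - 100)) / 256 = 150
+     */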
+ /* *INDENT-OFF* */
+ __asm__ __volatile__ (
+ " mov %4 , %%eax \n\t" /* eax = s_alpha */
+ " movd %%eax , %%mm6 \n\t" /* mm6 = s_alpha */
+ " punpcklwd %%mm6 , %%mm6 \n\t" /* mm6 = 00 00 00 00 00 aa 00 aa, alpha scale factor */
+ " punpckldq %%mm6 , %%mm6 \n\t" /* mm6 = 00 aa 00 aa 00 aa 00 aa */
+
+ " pxor %%mm7 , %%mm7 \n\t" /* mm7 = 00 00 00 00 00 00 00 00 */
+
+ " movl %5 , %%ecx \n\t" /* ecx = src_width */
+
+ "1: \n\t"
+ " test $7 , %%ecx \n\t"
+ " je 2f \n\t"
+
+ /* do first 4 "odd" bytes */
+ " movd (%2) , %%mm2 \n\t" /* mm2 = src, 00 00 00 00 sv su sy sa */
+ " movd (%3) , %%mm1 \n\t" /* mm1 = dest, 00 00 00 00 dv du dy da */
+ " punpcklbw %%mm7 , %%mm2 \n\t"
+ " punpcklbw %%mm7 , %%mm1 \n\t"
+ " psubw %%mm1 , %%mm2 \n\t" /* mm2 = mm2 - mm1 */
+ " pmullw %%mm6 , %%mm2 \n\t" /* mm2 = a * mm2 */
+ " psllw $8 , %%mm1 \n\t" /* scale up */
+ " paddw %%mm1 , %%mm2 \n\t" /* mm2 = mm2 + mm1 */
+ " psrlw $8 , %%mm2 \n\t" /* scale down */
+ " packuswb %%mm2 , %%mm2 \n\t"
+      " movd %%mm2 , (%3) \n\t" /* dest = mm2 */
+ " add $4 , %1 \n\t"
+ " add $4 , %0 \n\t"
+
+ "2: \n\t"
+ " sar $3 , %%ecx \n\t" /* prepare for 8 bytes per loop */
+ " cmp $0 , %%ecx \n\t"
+ " je 4f \n\t"
+
+ "3: \n\t"
+ /* do even pixels */
+ " movq (%2) , %%mm2 \n\t" /* mm2 = src, sv1 su1 sy1 sa1 sv0 su0 sy0 sa0 */
+ " movq (%3) , %%mm1 \n\t" /* mm1 = dest, dv1 du1 dy1 da1 dv0 du0 dy0 da0 */
+ " movq %%mm2 , %%mm4 \n\t"
+ " movq %%mm1 , %%mm3 \n\t"
+ " punpcklbw %%mm7 , %%mm2 \n\t"
+ " punpckhbw %%mm7 , %%mm4 \n\t"
+ " punpcklbw %%mm7 , %%mm1 \n\t"
+ " punpckhbw %%mm7 , %%mm3 \n\t"
+ " psubw %%mm1 , %%mm2 \n\t" /* mm2 = mm2 - mm1 */
+ " psubw %%mm3 , %%mm4 \n\t" /* mm4 = mm4 - mm3 */
+ " pmullw %%mm6 , %%mm2 \n\t" /* mm2 = a * mm2 */
+      " pmullw %%mm6 , %%mm4 \n\t" /* mm4 = a * mm4 */
+ " psllw $8 , %%mm1 \n\t" /* scale up */
+ " psllw $8 , %%mm3 \n\t" /* scale up */
+ " paddw %%mm1 , %%mm2 \n\t" /* mm2 = mm2 + mm1 */
+ " paddw %%mm3 , %%mm4 \n\t" /* mm4 = mm4 + mm3 */
+ " psrlw $8 , %%mm2 \n\t" /* scale down */
+ " psrlw $8 , %%mm4 \n\t" /* scale down */
+ " packuswb %%mm4 , %%mm2 \n\t"
+ " movq %%mm2 , (%3) \n\t"
+ " add $8 , %0 \n\t"
+ " add $8 , %1 \n\t"
+ " dec %%ecx \n\t"
+ " jne 3b \n\t"
+
+ "4: \n\t"
+ :"=r" (src), "=r" (dest)
+      :"0" (src), "1" (dest), "m" (s_alpha), "m" (width)
+ :"%eax", "%ecx", "memory"
+#ifdef __MMX__
+ , "mm1", "mm2", "mm3", "mm4", "mm6", "mm7"
+#endif
+ );
+ /* *INDENT-ON* */
+ src += src_add;
+ dest += dest_add;
+ }
+ __asm__ __volatile__ ("emms");
+}
+#endif