fixed yv12toyuy2
author    Michael Niedermayer <michaelni@gmx.at>
Mon, 5 Nov 2001 18:26:49 +0000 (18:26 +0000)
committer Michael Niedermayer <michaelni@gmx.at>
Mon, 5 Nov 2001 18:26:49 +0000 (18:26 +0000)
Originally committed as revision 2724 to svn://svn.mplayerhq.hu/mplayer/trunk/postproc

postproc/rgb2rgb.c
postproc/rgb2rgb.h
postproc/rgb2rgb_template.c
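
The interface change is the heart of this commit: yv12toyuy2() drops the flat num_pixels parameter in favor of explicit width, height, and per-plane strides, so frames with padded lines can be converted, and the EMMS/SFENCE cleanup moves out of the per-line loop so it runs once per frame. A minimal caller sketch for the new signature, assuming contiguous unpadded YV12 planes; the wrapper name and buffer variables are hypothetical, only the yv12toyuy2 prototype comes from this commit:

    #include <stdint.h>
    #include "rgb2rgb.h"

    /* Hypothetical caller: convert one unpadded YV12 frame to packed YUY2.
     * width is assumed to be a multiple of 16 so the MMX path is valid. */
    void convert_frame(const uint8_t *y, const uint8_t *u, const uint8_t *v,
                       uint8_t *yuy2, int width, int height)
    {
        yv12toyuy2(y, u, v, yuy2, width, height,
                   width,      /* lumStride: 1 luma byte per pixel        */
                   width / 2,  /* chromStride: U and V are subsampled 2x2 */
                   width * 2); /* dstStride: YUY2 packs 2 bytes per pixel */
    }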

postproc/rgb2rgb.c
index ddb805b..5d538ec 100644 (file)
@@ -291,56 +291,71 @@ void palette8torgb15(const uint8_t *src, uint8_t *dst, unsigned num_pixels, cons
 }
 /**
  *
- * num_pixels must be a multiple of 16 for the MMX version
+ * width must be a multiple of 16 for the MMX version
  */
-void yv12toyuy2(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, unsigned num_pixels)
+void yv12toyuy2(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
+       int width, int height, int lumStride, int chromStride, int dstStride)
 {
+       int y;
+       const int chromWidth= width>>1;
+       for(y=0; y<height; y++)
+       {
 #ifdef HAVE_MMX
-       asm volatile(
-               "xorl %%eax, %%eax              \n\t"
-               "1:                             \n\t"
-               PREFETCH" 32(%1, %%eax, 2)      \n\t"
-               PREFETCH" 32(%2, %%eax)         \n\t"
-               PREFETCH" 32(%3, %%eax)         \n\t"
-               "movq (%2, %%eax), %%mm0        \n\t" // U(0)
-               "movq %%mm0, %%mm2              \n\t" // U(0)
-               "movq (%3, %%eax), %%mm1        \n\t" // V(0)
-               "punpcklbw %%mm1, %%mm0         \n\t" // UVUV UVUV(0)
-               "punpckhbw %%mm1, %%mm2         \n\t" // UVUV UVUV(8)
-
-               "movq (%1, %%eax,2), %%mm3      \n\t" // Y(0)
-               "movq 8(%1, %%eax,2), %%mm5     \n\t" // Y(8)
-               "movq %%mm3, %%mm4              \n\t" // Y(0)
-               "movq %%mm5, %%mm6              \n\t" // Y(8)
-               "punpcklbw %%mm0, %%mm3         \n\t" // YUYV YUYV(0)
-               "punpckhbw %%mm0, %%mm4         \n\t" // YUYV YUYV(4)
-               "punpcklbw %%mm2, %%mm5         \n\t" // YUYV YUYV(8)
-               "punpckhbw %%mm2, %%mm6         \n\t" // YUYV YUYV(12)
+//FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely limited by mem anyway)
+               asm volatile(
+                       "xorl %%eax, %%eax              \n\t"
+                       "1:                             \n\t"
+                       PREFETCH" 32(%1, %%eax, 2)      \n\t"
+                       PREFETCH" 32(%2, %%eax)         \n\t"
+                       PREFETCH" 32(%3, %%eax)         \n\t"
+                       "movq (%2, %%eax), %%mm0        \n\t" // U(0)
+                       "movq %%mm0, %%mm2              \n\t" // U(0)
+                       "movq (%3, %%eax), %%mm1        \n\t" // V(0)
+                       "punpcklbw %%mm1, %%mm0         \n\t" // UVUV UVUV(0)
+                       "punpckhbw %%mm1, %%mm2         \n\t" // UVUV UVUV(8)
 
-               MOVNTQ" %%mm3, (%0, %%eax, 4)   \n\t"
-               MOVNTQ" %%mm4, 8(%0, %%eax, 4)  \n\t"
-               MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t"
-               MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"
+                       "movq (%1, %%eax,2), %%mm3      \n\t" // Y(0)
+                       "movq 8(%1, %%eax,2), %%mm5     \n\t" // Y(8)
+                       "movq %%mm3, %%mm4              \n\t" // Y(0)
+                       "movq %%mm5, %%mm6              \n\t" // Y(8)
+                       "punpcklbw %%mm0, %%mm3         \n\t" // YUYV YUYV(0)
+                       "punpckhbw %%mm0, %%mm4         \n\t" // YUYV YUYV(4)
+                       "punpcklbw %%mm2, %%mm5         \n\t" // YUYV YUYV(8)
+                       "punpckhbw %%mm2, %%mm6         \n\t" // YUYV YUYV(12)
 
-               "addl $8, %%eax                 \n\t"
-               "cmpl %4, %%eax                 \n\t"
-               " jb 1b                         \n\t"
-               EMMS" \n\t"
-               SFENCE
-               ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "r" (num_pixels>>1)
-               : "memory", "%eax"
-       );
+                       MOVNTQ" %%mm3, (%0, %%eax, 4)   \n\t"
+                       MOVNTQ" %%mm4, 8(%0, %%eax, 4)  \n\t"
+                       MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t"
+                       MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"
 
+                       "addl $8, %%eax                 \n\t"
+                       "cmpl %4, %%eax                 \n\t"
+                       " jb 1b                         \n\t"
+                       ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "r" (chromWidth)
+                       : "%eax"
+               );
 #else
-       int i;
-       num_pixels>>=1;
-       for(i=0; i<num_pixels; i++)
-       {
-               dst[4*i+0] = ysrc[2*i+0];
-               dst[4*i+1] = usrc[i];
-               dst[4*i+2] = ysrc[2*i+1];
-               dst[4*i+3] = vsrc[i];
+               int i;
+               for(i=0; i<chromWidth; i++)
+               {
+                       dst[4*i+0] = ysrc[2*i+0];
+                       dst[4*i+1] = usrc[i];
+                       dst[4*i+2] = ysrc[2*i+1];
+                       dst[4*i+3] = vsrc[i];
+               }
+#endif
+               if(y&1)
+               {
+                       usrc += chromStride;
+                       vsrc += chromStride;
+               }
+               ysrc += lumStride;
+               dst += dstStride;
        }
+#ifdef HAVE_MMX
+asm(    EMMS" \n\t"
+        SFENCE" \n\t"
+        :::"memory");
 #endif
 }
 
@@ -410,4 +425,4 @@ void yuy2toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                 vdst[i]        = src[4*i+3];
        }
 #endif
-}
\ No newline at end of file
+}
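
Collected out of the #else branch above, the scalar path of the new function reads as follows; this is a reference sketch (the standalone name yv12toyuy2_c is hypothetical), showing how each output line interleaves Y0 U Y1 V and how the chroma pointers advance only after odd lines, because YV12 chroma is subsampled 2:1 vertically:

    static void yv12toyuy2_c(const uint8_t *ysrc, const uint8_t *usrc,
                             const uint8_t *vsrc, uint8_t *dst,
                             int width, int height,
                             int lumStride, int chromStride, int dstStride)
    {
        const int chromWidth = width >> 1;
        int y, i;
        for (y = 0; y < height; y++) {
            for (i = 0; i < chromWidth; i++) {
                dst[4*i+0] = ysrc[2*i+0];  /* Y0 */
                dst[4*i+1] = usrc[i];      /* U  */
                dst[4*i+2] = ysrc[2*i+1];  /* Y1 */
                dst[4*i+3] = vsrc[i];      /* V  */
            }
            if (y & 1) {                   /* advance chroma every 2nd line */
                usrc += chromStride;
                vsrc += chromStride;
            }
            ysrc += lumStride;
            dst  += dstStride;
        }
    }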
postproc/rgb2rgb.h
index bfdda8d..db29be3 100644 (file)
@@ -21,8 +21,8 @@ extern void palette8torgb16(const uint8_t *src, uint8_t *dst, unsigned num_pixel
 extern void palette8torgb15(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette);
 extern void palette8torgb24(const uint8_t *src, uint8_t *dst, unsigned num_pixels, const uint8_t *palette);
 
-extern void yv12toyuy2(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, unsigned num_pixels);
+extern void yv12toyuy2(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
+       int width, int height, int lumStride, int chromStride, int dstStride);
 extern void yuy2toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst, unsigned num_pixels);
 
-
 #endif
postproc/rgb2rgb_template.c
index ddb805b..5d538ec 100644 (file)
@@ -291,56 +291,71 @@ void palette8torgb15(const uint8_t *src, uint8_t *dst, unsigned num_pixels, cons
 }
 /**
  *
- * num_pixels must be a multiple of 16 for the MMX version
+ * width must be a multiple of 16 for the MMX version
  */
-void yv12toyuy2(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst, unsigned num_pixels)
+void yv12toyuy2(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
+       int width, int height, int lumStride, int chromStride, int dstStride)
 {
+       int y;
+       const int chromWidth= width>>1;
+       for(y=0; y<height; y++)
+       {
 #ifdef HAVE_MMX
-       asm volatile(
-               "xorl %%eax, %%eax              \n\t"
-               "1:                             \n\t"
-               PREFETCH" 32(%1, %%eax, 2)      \n\t"
-               PREFETCH" 32(%2, %%eax)         \n\t"
-               PREFETCH" 32(%3, %%eax)         \n\t"
-               "movq (%2, %%eax), %%mm0        \n\t" // U(0)
-               "movq %%mm0, %%mm2              \n\t" // U(0)
-               "movq (%3, %%eax), %%mm1        \n\t" // V(0)
-               "punpcklbw %%mm1, %%mm0         \n\t" // UVUV UVUV(0)
-               "punpckhbw %%mm1, %%mm2         \n\t" // UVUV UVUV(8)
-
-               "movq (%1, %%eax,2), %%mm3      \n\t" // Y(0)
-               "movq 8(%1, %%eax,2), %%mm5     \n\t" // Y(8)
-               "movq %%mm3, %%mm4              \n\t" // Y(0)
-               "movq %%mm5, %%mm6              \n\t" // Y(8)
-               "punpcklbw %%mm0, %%mm3         \n\t" // YUYV YUYV(0)
-               "punpckhbw %%mm0, %%mm4         \n\t" // YUYV YUYV(4)
-               "punpcklbw %%mm2, %%mm5         \n\t" // YUYV YUYV(8)
-               "punpckhbw %%mm2, %%mm6         \n\t" // YUYV YUYV(12)
+//FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely limited by mem anyway)
+               asm volatile(
+                       "xorl %%eax, %%eax              \n\t"
+                       "1:                             \n\t"
+                       PREFETCH" 32(%1, %%eax, 2)      \n\t"
+                       PREFETCH" 32(%2, %%eax)         \n\t"
+                       PREFETCH" 32(%3, %%eax)         \n\t"
+                       "movq (%2, %%eax), %%mm0        \n\t" // U(0)
+                       "movq %%mm0, %%mm2              \n\t" // U(0)
+                       "movq (%3, %%eax), %%mm1        \n\t" // V(0)
+                       "punpcklbw %%mm1, %%mm0         \n\t" // UVUV UVUV(0)
+                       "punpckhbw %%mm1, %%mm2         \n\t" // UVUV UVUV(8)
 
-               MOVNTQ" %%mm3, (%0, %%eax, 4)   \n\t"
-               MOVNTQ" %%mm4, 8(%0, %%eax, 4)  \n\t"
-               MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t"
-               MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"
+                       "movq (%1, %%eax,2), %%mm3      \n\t" // Y(0)
+                       "movq 8(%1, %%eax,2), %%mm5     \n\t" // Y(8)
+                       "movq %%mm3, %%mm4              \n\t" // Y(0)
+                       "movq %%mm5, %%mm6              \n\t" // Y(8)
+                       "punpcklbw %%mm0, %%mm3         \n\t" // YUYV YUYV(0)
+                       "punpckhbw %%mm0, %%mm4         \n\t" // YUYV YUYV(4)
+                       "punpcklbw %%mm2, %%mm5         \n\t" // YUYV YUYV(8)
+                       "punpckhbw %%mm2, %%mm6         \n\t" // YUYV YUYV(12)
 
-               "addl $8, %%eax                 \n\t"
-               "cmpl %4, %%eax                 \n\t"
-               " jb 1b                         \n\t"
-               EMMS" \n\t"
-               SFENCE
-               ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "r" (num_pixels>>1)
-               : "memory", "%eax"
-       );
+                       MOVNTQ" %%mm3, (%0, %%eax, 4)   \n\t"
+                       MOVNTQ" %%mm4, 8(%0, %%eax, 4)  \n\t"
+                       MOVNTQ" %%mm5, 16(%0, %%eax, 4) \n\t"
+                       MOVNTQ" %%mm6, 24(%0, %%eax, 4) \n\t"
 
+                       "addl $8, %%eax                 \n\t"
+                       "cmpl %4, %%eax                 \n\t"
+                       " jb 1b                         \n\t"
+                       ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "r" (chromWidth)
+                       : "%eax"
+               );
 #else
-       int i;
-       num_pixels>>=1;
-       for(i=0; i<num_pixels; i++)
-       {
-               dst[4*i+0] = ysrc[2*i+0];
-               dst[4*i+1] = usrc[i];
-               dst[4*i+2] = ysrc[2*i+1];
-               dst[4*i+3] = vsrc[i];
+               int i;
+               for(i=0; i<chromWidth; i++)
+               {
+                       dst[4*i+0] = ysrc[2*i+0];
+                       dst[4*i+1] = usrc[i];
+                       dst[4*i+2] = ysrc[2*i+1];
+                       dst[4*i+3] = vsrc[i];
+               }
+#endif
+               if(y&1)
+               {
+                       usrc += chromStride;
+                       vsrc += chromStride;
+               }
+               ysrc += lumStride;
+               dst += dstStride;
        }
+#ifdef HAVE_MMX
+asm(    EMMS" \n\t"
+        SFENCE" \n\t"
+        :::"memory");
 #endif
 }
 
@@ -410,4 +425,4 @@ void yuy2toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                 vdst[i]        = src[4*i+3];
        }
 #endif
-}
\ No newline at end of file
+}