byte interleaving for mga
author Michael Niedermayer <michaelni@gmx.at>
Mon, 25 Mar 2002 16:35:24 +0000 (16:35 +0000)
committer Michael Niedermayer <michaelni@gmx.at>
Mon, 25 Mar 2002 16:35:24 +0000 (16:35 +0000)
untested (no g200 mga or whatever i would need ...)
experimental sse2 version (even less tested as no p4 either ...)
sse2 version needs 16-byte aligned src & dst, otherwise it crashes with a segfault (signal 11)
sse2 version is disabled by default

Originally committed as revision 5338 to svn://svn.mplayerhq.hu/mplayer/trunk/postproc

postproc/rgb2rgb.c
postproc/rgb2rgb.h
postproc/rgb2rgb_template.c

index abe1223..f9f3ec9 100644 (file)
@@ -409,3 +409,21 @@ void rgb24toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst
                rgb24toyv12_C(src, ydst, udst, vdst, width,  height, lumStride, chromStride, srcStride);
 #endif
 }
+
+/**
+ * Public entry point for byte interleaving: forwards all arguments unchanged
+ * to exactly one of the interleaveBytes_* variants.
+ */
+void interleaveBytes(uint8_t *src1, uint8_t *src2, uint8_t *dst,
+                    int width, int height, int src1Stride, int src2Stride, int dstStride)
+{
+#ifdef CAN_COMPILE_X86_ASM
+       /* Probe the CPU capability flags in order of preference (fastest
+          variant first) and return as soon as one variant has run. */
+       if(gCpuCaps.hasMMX2){
+               interleaveBytes_MMX2(src1, src2, dst, width, height, src1Stride, src2Stride, dstStride);
+               return;
+       }
+       if(gCpuCaps.has3DNow){
+               interleaveBytes_3DNow(src1, src2, dst, width, height, src1Stride, src2Stride, dstStride);
+               return;
+       }
+       if(gCpuCaps.hasMMX){
+               interleaveBytes_MMX(src1, src2, dst, width, height, src1Stride, src2Stride, dstStride);
+               return;
+       }
+#endif
+       /* Reached when no flag above matched, or when x86 asm is not compiled in. */
+       interleaveBytes_C(src1, src2, dst, width, height, src1Stride, src2Stride, dstStride);
+}
index 5c3b3f8..e5dd3be 100644 (file)
@@ -34,6 +34,10 @@ extern void rgb24toyv12(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_
        unsigned int width, unsigned int height,
        unsigned int lumStride, unsigned int chromStride, unsigned int srcStride);
 
+extern void interleaveBytes(uint8_t *src1, uint8_t *src2, uint8_t *dst,
+                           int width, int height, int src1Stride, int src2Stride, int dstStride); /* per row: dst[2*x]=src1[x], dst[2*x+1]=src2[x] for 0 <= x < width; pointers advance by their strides after each of the height rows */
+       
+
 #define MODE_RGB  0x1
 #define MODE_BGR  0x2
 
index 3a1a332..46f36d8 100644 (file)
@@ -1197,3 +1197,83 @@ static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_
                src  += srcStride;
        }
 }
+
+/**
+ * Interleave two byte planes into one. For each of the height rows this
+ * writes dest[2*x] = src1[x] and dest[2*x+1] = src2[x] for 0 <= x < width,
+ * then advances src1, src2 and dest by src1Stride, src2Stride and dstStride.
+ *
+ * With HAVE_MMX the bulk of a row is processed 16 source bytes per iteration
+ * in inline asm; the remaining width%16 bytes are handled in plain C.
+ * The HAVE_SSE2 variant uses movdqa/movntdq, so the start of every row of
+ * src1, src2 and dest must be 16-byte aligned in that build.
+ *
+ * Rows with width < 16 (including width <= 0) are handled entirely by the
+ * C loop and never enter the asm.
+ */
+void RENAME(interleaveBytes)(uint8_t *src1, uint8_t *src2, uint8_t *dest,
+                           int width, int height, int src1Stride, int src2Stride, int dstStride){
+       int h;
+
+       for(h=0; h < height; h++)
+       {
+               int w;
+
+#ifdef HAVE_MMX
+               /* Start of the scalar tail; stays 0 when the asm is skipped. */
+               w= 0;
+
+               /* The asm loops below are do-while shaped and compare the index
+                  against width-15 with an unsigned jb. Entered with width < 16
+                  they would still process one full 16-byte block, and for
+                  width < 15 the bound wraps to a huge unsigned value, so the
+                  loop would run far past the buffers. Only enter them when at
+                  least one complete block exists. */
+               if(width >= 16)
+               {
+#ifdef HAVE_SSE2
+                       asm(
+                               "xorl %%eax, %%eax              \n\t"
+                               "1:                             \n\t"
+                               PREFETCH" 64(%1, %%eax)         \n\t"
+                               PREFETCH" 64(%2, %%eax)         \n\t"
+                               "movdqa (%1, %%eax), %%xmm0     \n\t"
+                               "movdqa (%1, %%eax), %%xmm1     \n\t"
+                               "movdqa (%2, %%eax), %%xmm2     \n\t"
+                               "punpcklbw %%xmm2, %%xmm0       \n\t"
+                               "punpckhbw %%xmm2, %%xmm1       \n\t"
+                               "movntdq %%xmm0, (%0, %%eax, 2) \n\t"
+                               "movntdq %%xmm1, 16(%0, %%eax, 2)\n\t"
+                               "addl $16, %%eax                        \n\t"
+                               "cmpl %3, %%eax                 \n\t"
+                               " jb 1b                         \n\t"
+                               ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
+                               : "memory", "%eax"
+                       );
+#else
+                       asm(
+                               "xorl %%eax, %%eax              \n\t"
+                               "1:                             \n\t"
+                               PREFETCH" 64(%1, %%eax)         \n\t"
+                               PREFETCH" 64(%2, %%eax)         \n\t"
+                               "movq (%1, %%eax), %%mm0        \n\t"
+                               "movq 8(%1, %%eax), %%mm2       \n\t"
+                               "movq %%mm0, %%mm1              \n\t"
+                               "movq %%mm2, %%mm3              \n\t"
+                               "movq (%2, %%eax), %%mm4        \n\t"
+                               "movq 8(%2, %%eax), %%mm5       \n\t"
+                               "punpcklbw %%mm4, %%mm0         \n\t"
+                               "punpckhbw %%mm4, %%mm1         \n\t"
+                               "punpcklbw %%mm5, %%mm2         \n\t"
+                               "punpckhbw %%mm5, %%mm3         \n\t"
+                               MOVNTQ" %%mm0, (%0, %%eax, 2)   \n\t"
+                               MOVNTQ" %%mm1, 8(%0, %%eax, 2)  \n\t"
+                               MOVNTQ" %%mm2, 16(%0, %%eax, 2) \n\t"
+                               MOVNTQ" %%mm3, 24(%0, %%eax, 2) \n\t"
+                               "addl $16, %%eax                        \n\t"
+                               "cmpl %3, %%eax                 \n\t"
+                               " jb 1b                         \n\t"
+                               ::"r"(dest), "r"(src1), "r"(src2), "r" (width-15)
+                               : "memory", "%eax"
+                       );
+#endif
+                       /* The asm consumed every complete 16-byte block. */
+                       w= width&(~15);
+               }
+               /* Scalar tail: the last width%16 bytes, or the whole row when
+                  the asm was skipped. Does nothing for width <= 0. */
+               for(; w < width; w++)
+               {
+                       dest[2*w+0] = src1[w];
+                       dest[2*w+1] = src2[w];
+               }
+#else
+               for(w=0; w < width; w++)
+               {
+                       dest[2*w+0] = src1[w];
+                       dest[2*w+1] = src2[w];
+               }
+#endif
+               dest += dstStride;
+                src1 += src1Stride;
+                src2 += src2Stride;
+       }
+#ifdef HAVE_MMX
+       /* Issued once after all rows, following the non-temporal stores
+          and MMX register use above. */
+       asm(
+               EMMS" \n\t"
+               SFENCE" \n\t"
+               ::: "memory"
+               );
+#endif
+}