c optimizations
author: Michael Niedermayer <michaelni@gmx.at>
Tue, 30 Oct 2001 22:24:38 +0000 (22:24 +0000)
committer: Michael Niedermayer <michaelni@gmx.at>
Tue, 30 Oct 2001 22:24:38 +0000 (22:24 +0000)
bugfix

Originally committed as revision 2577 to svn://svn.mplayerhq.hu/mplayer/trunk/postproc

postproc/swscale.c
postproc/swscale_template.c

index fd385b4..484f349 100644 (file)
@@ -290,10 +290,10 @@ static int canMMX2BeUsed=0;
                "movq (%3, %%eax), %%mm3        \n\t" /* uvbuf1[eax]*/\
                "movq 4096(%2, %%eax), %%mm5    \n\t" /* uvbuf0[eax+2048]*/\
                "movq 4096(%3, %%eax), %%mm4    \n\t" /* uvbuf1[eax+2048]*/\
-               "paddw %%mm2, %%mm3             \n\t"\
-               "paddw %%mm5, %%mm4             \n\t"\
-               "psraw $5, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
-               "psraw $5, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
+               "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
+               "paddw %%mm5, %%mm4             \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
+               "psrlw $5, %%mm3                \n\t"\
+               "psrlw $5, %%mm4                \n\t"\
                "psubw w400, %%mm3              \n\t" /* (U-128)8*/\
                "psubw w400, %%mm4              \n\t" /* (V-128)8*/\
                "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
@@ -785,7 +785,6 @@ FULL_YSCALEYUV2RGB
                        );
                }
 #else
-//FIXME unroll C loop and dont recalculate UV
                asm volatile ("\n\t"::: "memory");
 
                if(dstbpp==32)
@@ -898,8 +897,9 @@ static inline void yuv2rgb1(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, ui
                yuv2rgbX(buf0, buf1, uvbuf0, uvbuf1, dest, dstw, yalpha, uvalpha, dstbpp);
                return;
        }
-#ifdef HAVE_MMX
        if( yalpha > 2048 ) buf0 = buf1;
+
+#ifdef HAVE_MMX
        if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but its a bit faster
        {
                if(dstbpp == 32)
@@ -1013,48 +1013,99 @@ static inline void yuv2rgb1(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, ui
                }
        }
 #else
-//FIXME unroll C loop and dont recalculate UV
+//FIXME write 2 versions (for even & odd lines)
        asm volatile ("\n\t"::: "memory");
 
-       if(dstbpp==32 || dstbpp==24)
+       if(dstbpp==32)
        {
-               for(i=0;i<dstw;i++){
+               for(i=0; i<dstw-1; i+=2){
                        // vertical linear interpolation && yuv2rgb in a single step:
-                       int Y=yuvtab_2568[buf0[i]>>7];
+                       int Y1=yuvtab_2568[buf0[i]>>7];
+                       int Y2=yuvtab_2568[buf0[i+1]>>7];
                        int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19);
                        int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19);
-                       dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
-                       dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
-                       dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
-                       dest+=dstbpp>>3;
+
+                       int Cb= yuvtab_40cf[U];
+                       int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
+                       int Cr= yuvtab_3343[V];
+
+                       dest[4*i+0]=clip_table[((Y1 + Cb) >>13)];
+                       dest[4*i+1]=clip_table[((Y1 + Cg) >>13)];
+                       dest[4*i+2]=clip_table[((Y1 + Cr) >>13)];
+
+                       dest[4*i+4]=clip_table[((Y2 + Cb) >>13)];
+                       dest[4*i+5]=clip_table[((Y2 + Cg) >>13)];
+                       dest[4*i+6]=clip_table[((Y2 + Cr) >>13)];
+               }
+       }
+       if(dstbpp==24)
+       {
+               for(i=0; i<dstw-1; i+=2){
+                       // vertical linear interpolation && yuv2rgb in a single step:
+                       int Y1=yuvtab_2568[buf0[i]>>7];
+                       int Y2=yuvtab_2568[buf0[i+1]>>7];
+                       int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19);
+                       int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19);
+
+                       int Cb= yuvtab_40cf[U];
+                       int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
+                       int Cr= yuvtab_3343[V];
+
+                       dest[0]=clip_table[((Y1 + Cb) >>13)];
+                       dest[1]=clip_table[((Y1 + Cg) >>13)];
+                       dest[2]=clip_table[((Y1 + Cr) >>13)];
+
+                       dest[3]=clip_table[((Y2 + Cb) >>13)];
+                       dest[4]=clip_table[((Y2 + Cg) >>13)];
+                       dest[5]=clip_table[((Y2 + Cr) >>13)];
+                       dest+=6;
                }
        }
        else if(dstbpp==16)
        {
-               for(i=0;i<dstw;i++){
+               for(i=0; i<dstw-1; i+=2){
                        // vertical linear interpolation && yuv2rgb in a single step:
-                       int Y=yuvtab_2568[buf0[i]>>7];
+                       int Y1=yuvtab_2568[buf0[i]>>7];
+                       int Y2=yuvtab_2568[buf0[i+1]>>7];
                        int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19);
                        int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19);
 
+                       int Cb= yuvtab_40cf[U];
+                       int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
+                       int Cr= yuvtab_3343[V];
+
                        ((uint16_t*)dest)[i] =
-                               (clip_table[(Y + yuvtab_40cf[U]) >>13]>>3) |
-                               ((clip_table[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13]<<3)&0x07E0) |
-                               ((clip_table[(Y + yuvtab_3343[V]) >>13]<<8)&0xF800);
+                               (clip_table[(Y1 + Cb) >>13]>>3) |
+                               ((clip_table[(Y1 + Cg) >>13]<<3)&0x07E0) |
+                               ((clip_table[(Y1 + Cr) >>13]<<8)&0xF800);
+
+                       ((uint16_t*)dest)[i+1] =
+                               (clip_table[(Y2 + Cb) >>13]>>3) |
+                               ((clip_table[(Y2 + Cg) >>13]<<3)&0x07E0) |
+                               ((clip_table[(Y2 + Cr) >>13]<<8)&0xF800);
                }
        }
        else if(dstbpp==15)
        {
-               for(i=0;i<dstw;i++){
+               for(i=0; i<dstw-1; i+=2){
                        // vertical linear interpolation && yuv2rgb in a single step:
-                       int Y=yuvtab_2568[buf0[i]>>7];
+                       int Y1=yuvtab_2568[buf0[i]>>7];
+                       int Y2=yuvtab_2568[buf0[i+1]>>7];
                        int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19);
                        int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19);
 
+                       int Cb= yuvtab_40cf[U];
+                       int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
+                       int Cr= yuvtab_3343[V];
+
                        ((uint16_t*)dest)[i] =
-                               (clip_table[(Y + yuvtab_40cf[U]) >>13]>>3) |
-                               ((clip_table[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13]<<2)&0x03E0) |
-                               ((clip_table[(Y + yuvtab_3343[V]) >>13]<<7)&0x7C00);
+                               (clip_table[(Y1 + Cb) >>13]>>3) |
+                               ((clip_table[(Y1 + Cg) >>13]<<2)&0x03E0) |
+                               ((clip_table[(Y1 + Cr) >>13]<<7)&0x7C00);
+                       ((uint16_t*)dest)[i+1] =
+                               (clip_table[(Y2 + Cb) >>13]>>3) |
+                               ((clip_table[(Y2 + Cg) >>13]<<2)&0x03E0) |
+                               ((clip_table[(Y2 + Cr) >>13]<<7)&0x7C00);
                }
        }
 #endif
index fd385b4..484f349 100644 (file)
@@ -290,10 +290,10 @@ static int canMMX2BeUsed=0;
                "movq (%3, %%eax), %%mm3        \n\t" /* uvbuf1[eax]*/\
                "movq 4096(%2, %%eax), %%mm5    \n\t" /* uvbuf0[eax+2048]*/\
                "movq 4096(%3, %%eax), %%mm4    \n\t" /* uvbuf1[eax+2048]*/\
-               "paddw %%mm2, %%mm3             \n\t"\
-               "paddw %%mm5, %%mm4             \n\t"\
-               "psraw $5, %%mm3                \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
-               "psraw $5, %%mm4                \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
+               "paddw %%mm2, %%mm3             \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
+               "paddw %%mm5, %%mm4             \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
+               "psrlw $5, %%mm3                \n\t"\
+               "psrlw $5, %%mm4                \n\t"\
                "psubw w400, %%mm3              \n\t" /* (U-128)8*/\
                "psubw w400, %%mm4              \n\t" /* (V-128)8*/\
                "movq %%mm3, %%mm2              \n\t" /* (U-128)8*/\
@@ -785,7 +785,6 @@ FULL_YSCALEYUV2RGB
                        );
                }
 #else
-//FIXME unroll C loop and dont recalculate UV
                asm volatile ("\n\t"::: "memory");
 
                if(dstbpp==32)
@@ -898,8 +897,9 @@ static inline void yuv2rgb1(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, ui
                yuv2rgbX(buf0, buf1, uvbuf0, uvbuf1, dest, dstw, yalpha, uvalpha, dstbpp);
                return;
        }
-#ifdef HAVE_MMX
        if( yalpha > 2048 ) buf0 = buf1;
+
+#ifdef HAVE_MMX
        if( uvalpha < 2048 ) // note this is not correct (shifts chrominance by 0.5 pixels) but its a bit faster
        {
                if(dstbpp == 32)
@@ -1013,48 +1013,99 @@ static inline void yuv2rgb1(uint16_t *buf0, uint16_t *buf1, uint16_t *uvbuf0, ui
                }
        }
 #else
-//FIXME unroll C loop and dont recalculate UV
+//FIXME write 2 versions (for even & odd lines)
        asm volatile ("\n\t"::: "memory");
 
-       if(dstbpp==32 || dstbpp==24)
+       if(dstbpp==32)
        {
-               for(i=0;i<dstw;i++){
+               for(i=0; i<dstw-1; i+=2){
                        // vertical linear interpolation && yuv2rgb in a single step:
-                       int Y=yuvtab_2568[buf0[i]>>7];
+                       int Y1=yuvtab_2568[buf0[i]>>7];
+                       int Y2=yuvtab_2568[buf0[i+1]>>7];
                        int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19);
                        int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19);
-                       dest[0]=clip_table[((Y + yuvtab_40cf[U]) >>13)];
-                       dest[1]=clip_table[((Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13)];
-                       dest[2]=clip_table[((Y + yuvtab_3343[V]) >>13)];
-                       dest+=dstbpp>>3;
+
+                       int Cb= yuvtab_40cf[U];
+                       int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
+                       int Cr= yuvtab_3343[V];
+
+                       dest[4*i+0]=clip_table[((Y1 + Cb) >>13)];
+                       dest[4*i+1]=clip_table[((Y1 + Cg) >>13)];
+                       dest[4*i+2]=clip_table[((Y1 + Cr) >>13)];
+
+                       dest[4*i+4]=clip_table[((Y2 + Cb) >>13)];
+                       dest[4*i+5]=clip_table[((Y2 + Cg) >>13)];
+                       dest[4*i+6]=clip_table[((Y2 + Cr) >>13)];
+               }
+       }
+       if(dstbpp==24)
+       {
+               for(i=0; i<dstw-1; i+=2){
+                       // vertical linear interpolation && yuv2rgb in a single step:
+                       int Y1=yuvtab_2568[buf0[i]>>7];
+                       int Y2=yuvtab_2568[buf0[i+1]>>7];
+                       int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19);
+                       int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19);
+
+                       int Cb= yuvtab_40cf[U];
+                       int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
+                       int Cr= yuvtab_3343[V];
+
+                       dest[0]=clip_table[((Y1 + Cb) >>13)];
+                       dest[1]=clip_table[((Y1 + Cg) >>13)];
+                       dest[2]=clip_table[((Y1 + Cr) >>13)];
+
+                       dest[3]=clip_table[((Y2 + Cb) >>13)];
+                       dest[4]=clip_table[((Y2 + Cg) >>13)];
+                       dest[5]=clip_table[((Y2 + Cr) >>13)];
+                       dest+=6;
                }
        }
        else if(dstbpp==16)
        {
-               for(i=0;i<dstw;i++){
+               for(i=0; i<dstw-1; i+=2){
                        // vertical linear interpolation && yuv2rgb in a single step:
-                       int Y=yuvtab_2568[buf0[i]>>7];
+                       int Y1=yuvtab_2568[buf0[i]>>7];
+                       int Y2=yuvtab_2568[buf0[i+1]>>7];
                        int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19);
                        int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19);
 
+                       int Cb= yuvtab_40cf[U];
+                       int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
+                       int Cr= yuvtab_3343[V];
+
                        ((uint16_t*)dest)[i] =
-                               (clip_table[(Y + yuvtab_40cf[U]) >>13]>>3) |
-                               ((clip_table[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13]<<3)&0x07E0) |
-                               ((clip_table[(Y + yuvtab_3343[V]) >>13]<<8)&0xF800);
+                               (clip_table[(Y1 + Cb) >>13]>>3) |
+                               ((clip_table[(Y1 + Cg) >>13]<<3)&0x07E0) |
+                               ((clip_table[(Y1 + Cr) >>13]<<8)&0xF800);
+
+                       ((uint16_t*)dest)[i+1] =
+                               (clip_table[(Y2 + Cb) >>13]>>3) |
+                               ((clip_table[(Y2 + Cg) >>13]<<3)&0x07E0) |
+                               ((clip_table[(Y2 + Cr) >>13]<<8)&0xF800);
                }
        }
        else if(dstbpp==15)
        {
-               for(i=0;i<dstw;i++){
+               for(i=0; i<dstw-1; i+=2){
                        // vertical linear interpolation && yuv2rgb in a single step:
-                       int Y=yuvtab_2568[buf0[i]>>7];
+                       int Y1=yuvtab_2568[buf0[i]>>7];
+                       int Y2=yuvtab_2568[buf0[i+1]>>7];
                        int U=((uvbuf0[i/2]*uvalpha1+uvbuf1[i/2]*uvalpha)>>19);
                        int V=((uvbuf0[i/2+2048]*uvalpha1+uvbuf1[i/2+2048]*uvalpha)>>19);
 
+                       int Cb= yuvtab_40cf[U];
+                       int Cg= yuvtab_1a1e[V] + yuvtab_0c92[U];
+                       int Cr= yuvtab_3343[V];
+
                        ((uint16_t*)dest)[i] =
-                               (clip_table[(Y + yuvtab_40cf[U]) >>13]>>3) |
-                               ((clip_table[(Y + yuvtab_1a1e[V] + yuvtab_0c92[U]) >>13]<<2)&0x03E0) |
-                               ((clip_table[(Y + yuvtab_3343[V]) >>13]<<7)&0x7C00);
+                               (clip_table[(Y1 + Cb) >>13]>>3) |
+                               ((clip_table[(Y1 + Cg) >>13]<<2)&0x03E0) |
+                               ((clip_table[(Y1 + Cr) >>13]<<7)&0x7C00);
+                       ((uint16_t*)dest)[i+1] =
+                               (clip_table[(Y2 + Cb) >>13]>>3) |
+                               ((clip_table[(Y2 + Cg) >>13]<<2)&0x03E0) |
+                               ((clip_table[(Y2 + Cr) >>13]<<7)&0x7C00);
                }
        }
 #endif