From 59fe111e8122acc614ace5618e1a4ba7416e9875 Mon Sep 17 00:00:00 2001 From: Michael Niedermayer Date: Thu, 16 May 2002 23:29:09 +0000 Subject: [PATCH] new hopefully faster MC Originally committed as revision 505 to svn://svn.ffmpeg.org/ffmpeg/trunk --- libavcodec/dsputil.c | 352 +++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 344 insertions(+), 8 deletions(-) diff --git a/libavcodec/dsputil.c b/libavcodec/dsputil.c index 54779bf..b7c95cf 100644 --- a/libavcodec/dsputil.c +++ b/libavcodec/dsputil.c @@ -16,7 +16,7 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. * - * gmc & q-pel support by Michael Niedermayer + * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer */ #include #include @@ -252,6 +252,347 @@ void add_pixels_clamped_c(const DCTELEM *block, UINT8 *pixels, int line_size) } } +//FIXME someone with a alignemtent picky cpu should change these + +#define LD32(a) (*((uint32_t*)(a))) +#define LD64(a) (*((uint64_t*)(a))) + +#if 0 + +#define PIXOP2(OPNAME, OP) \ +void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ +{\ + int i;\ + for(i=0; i>1));\ + pixels+=line_size;\ + block +=line_size;\ + }\ +}\ +\ +void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ +{\ + int i;\ + for(i=0; i>1));\ + pixels+=line_size;\ + block +=line_size;\ + }\ +}\ +\ +void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ +{\ + int i;\ + for(i=0; i>1));\ + pixels+=line_size;\ + block +=line_size;\ + }\ +}\ +\ +void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ +{\ + int i;\ + for(i=0; i>1));\ + pixels+=line_size;\ + block +=line_size;\ + }\ +}\ +\ +void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ +{\ + int i;\ + const uint64_t a= LD64(pixels );\ + const uint64_t b= LD64(pixels+1);\ + uint64_t l0= (a&0x0303030303030303ULL)\ + + (b&0x0303030303030303ULL)\ + + 0x0202020202020202ULL;\ + uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\ + + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\ + uint64_t l1,h1;\ +\ + pixels+=line_size;\ + for(i=0; i>2)\ + + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\ + OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\ + pixels+=line_size;\ + block +=line_size;\ + a= LD64(pixels );\ + b= LD64(pixels+1);\ + l0= (a&0x0303030303030303ULL)\ + + (b&0x0303030303030303ULL)\ + + 0x0202020202020202ULL;\ + h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\ + + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\ + OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\ + pixels+=line_size;\ + block +=line_size;\ + }\ +}\ +\ +void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ +{\ + int i;\ + const uint64_t a= LD64(pixels );\ + const uint64_t b= LD64(pixels+1);\ + uint64_t l0= (a&0x0303030303030303ULL)\ + + (b&0x0303030303030303ULL)\ + + 0x0101010101010101ULL;\ + uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\ + + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\ + uint64_t l1,h1;\ +\ + pixels+=line_size;\ + for(i=0; i>2)\ + + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\ + OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\ + pixels+=line_size;\ + block +=line_size;\ + a= LD64(pixels );\ + b= LD64(pixels+1);\ + l0= (a&0x0303030303030303ULL)\ + + (b&0x0303030303030303ULL)\ + + 0x0101010101010101ULL;\ + h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\ + + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\ + OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\ + pixels+=line_size;\ + block +=line_size;\ + }\ +}\ +\ +void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\ + OPNAME ## _pixels,\ + OPNAME ## _pixels_x2,\ + OPNAME ## _pixels_y2,\ + OPNAME ## _pixels_xy2,\ +};\ +\ +void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\ + OPNAME ## _pixels,\ + OPNAME ## _no_rnd_pixels_x2,\ + OPNAME ## _no_rnd_pixels_y2,\ + OPNAME ## _no_rnd_pixels_xy2,\ +}; + +#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) ) +#else // 64 bit variant + +#define PIXOP2(OPNAME, OP) \ +void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ +{\ + int i;\ + for(i=0; i>1));\ + pixels+=4;\ + block +=4;\ + }\ + pixels+=line_size-8;\ + block +=line_size-8;\ + }\ +}\ +\ +void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ +{\ + int i;\ + for(i=0; i>1));\ + pixels+=4;\ + block +=4;\ + }\ + pixels+=line_size-8;\ + block +=line_size-8;\ + }\ +}\ +\ +void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ +{\ + int i;\ + for(i=0; i>1));\ + pixels+=4;\ + block +=4;\ + }\ + pixels+=line_size-8;\ + block +=line_size-8;\ + }\ +}\ +\ +void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ +{\ + int i;\ + for(i=0; i>1));\ + pixels+=4;\ + block +=4;\ + }\ + pixels+=line_size-8;\ + block +=line_size-8;\ + }\ +}\ +\ +void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ +{\ + int j;\ + for(j=0; j<2; j++){\ + int i;\ + const uint32_t a= LD32(pixels );\ + const uint32_t b= LD32(pixels+1);\ + uint32_t l0= (a&0x03030303UL)\ + + (b&0x03030303UL)\ + + 0x02020202UL;\ + uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\ + + ((b&0xFCFCFCFCUL)>>2);\ + uint32_t l1,h1;\ +\ + pixels+=line_size;\ + for(i=0; i>2)\ + + ((b&0xFCFCFCFCUL)>>2);\ + OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ + pixels+=line_size;\ + block +=line_size;\ + a= LD32(pixels );\ + b= LD32(pixels+1);\ + l0= (a&0x03030303UL)\ + + (b&0x03030303UL)\ + + 0x02020202UL;\ + h0= ((a&0xFCFCFCFCUL)>>2)\ + + ((b&0xFCFCFCFCUL)>>2);\ + OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ + pixels+=line_size;\ + block +=line_size;\ + }\ + pixels+=4-line_size*(h+1);\ + block +=4-line_size*h;\ + }\ +}\ +\ +void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\ +{\ + int j;\ + for(j=0; j<2; j++){\ + int i;\ + const uint32_t a= LD32(pixels );\ + const uint32_t b= LD32(pixels+1);\ + uint32_t l0= (a&0x03030303UL)\ + + (b&0x03030303UL)\ + + 0x01010101UL;\ + uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\ + + ((b&0xFCFCFCFCUL)>>2);\ + uint32_t l1,h1;\ +\ + pixels+=line_size;\ + for(i=0; i>2)\ + + ((b&0xFCFCFCFCUL)>>2);\ + OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ + pixels+=line_size;\ + block +=line_size;\ + a= LD32(pixels );\ + b= LD32(pixels+1);\ + l0= (a&0x03030303UL)\ + + (b&0x03030303UL)\ + + 0x01010101UL;\ + h0= ((a&0xFCFCFCFCUL)>>2)\ + + ((b&0xFCFCFCFCUL)>>2);\ + OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\ + pixels+=line_size;\ + block +=line_size;\ + }\ + pixels+=4-line_size*(h+1);\ + block +=4-line_size*h;\ + }\ +}\ +\ +void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\ + OPNAME ## _pixels,\ + OPNAME ## _pixels_x2,\ + OPNAME ## _pixels_y2,\ + OPNAME ## _pixels_xy2,\ +};\ +\ +void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\ + OPNAME ## _pixels,\ + OPNAME ## _no_rnd_pixels_x2,\ + OPNAME ## _no_rnd_pixels_y2,\ + OPNAME ## _no_rnd_pixels_xy2,\ +}; +#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) ) +#endif + +#define op_put(a, b) a = b + +PIXOP2(avg, op_avg) +PIXOP2(put, op_put) +#undef op_avg +#undef op_put + +/* FIXME this stuff could be removed as its ot really used anymore */ #define PIXOP(BTYPE, OPNAME, OP, INCR) \ \ static void OPNAME ## _pixels(BTYPE *block, const UINT8 *pixels, int line_size, int h) \ @@ -356,13 +697,9 @@ void (*OPNAME ## _pixels_tab[4])(BTYPE *block, const UINT8 *pixels, int line_siz #define avg2(a,b) ((a+b+1)>>1) #define avg4(a,b,c,d) ((a+b+c+d+2)>>2) -#define op_put(a, b) a = b #define op_avg(a, b) a = avg2(a, b) #define op_sub(a, b) a -= b -PIXOP(UINT8, put, op_put, line_size) -PIXOP(UINT8, avg, op_avg, line_size) - PIXOP(DCTELEM, sub, op_sub, 8) /* not rounding primitives */ @@ -371,9 +708,6 @@ PIXOP(DCTELEM, sub, op_sub, 8) #define avg2(a,b) ((a+b)>>1) #define avg4(a,b,c,d) ((a+b+c+d+1)>>2) -PIXOP(UINT8, put_no_rnd, op_put, line_size) -PIXOP(UINT8, avg_no_rnd, op_avg, line_size) - /* motion estimation */ #undef avg2 @@ -381,6 +715,8 @@ PIXOP(UINT8, avg_no_rnd, op_avg, line_size) #define avg2(a,b) ((a+b+1)>>1) #define avg4(a,b,c,d) ((a+b+c+d+2)>>2) +/* end of removeale stuff */ + static void gmc1_c(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder) { const int A=(16-x16)*(16-y16); -- 2.7.4