3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
8 * This file is part of FFmpeg.
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
30 #include "libavutil/imgutils.h"
33 #include "simple_idct.h"
37 #include "mpegvideo.h"
/* Clipping LUT (indexed with an offset of MAX_NEG_CROP) and a table of
 * squares; both are zero here and presumably filled by an init routine
 * not visible in this extract — TODO confirm. */
42 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
43 uint32_t ff_squareTbl[512] = {0, };
46 #include "dsputil_template.c"
50 #include "dsputil_template.c"
54 #include "dsputil_template.c"
56 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
/* ~0UL/255 is 0x0101...01 (one set low bit per byte of unsigned long), so
 * multiplying by 0x7f / 0x80 replicates that byte across the whole word. */
57 #define pb_7f (~0UL/255 * 0x7f)
58 #define pb_80 (~0UL/255 * 0x80)
/* Classic 8x8 zig-zag scan order (JPEG/MPEG progressive order).
 * NOTE(review): the closing "};" is missing from this extract. */
60 const uint8_t ff_zigzag_direct[64] = {
61 0, 1, 8, 16, 9, 2, 3, 10,
62 17, 24, 32, 25, 18, 11, 4, 5,
63 12, 19, 26, 33, 40, 48, 41, 34,
64 27, 20, 13, 6, 7, 14, 21, 28,
65 35, 42, 49, 56, 57, 50, 43, 36,
66 29, 22, 15, 23, 30, 37, 44, 51,
67 58, 59, 52, 45, 38, 31, 39, 46,
68 53, 60, 61, 54, 47, 55, 62, 63
71 /* Specific zigzag scan for 248 idct. NOTE that unlike the
72 specification, we interleave the fields */
/* NOTE(review): the closing "};" is missing from this extract. */
73 const uint8_t ff_zigzag248_direct[64] = {
74 0, 8, 1, 9, 16, 24, 2, 10,
75 17, 25, 32, 40, 48, 56, 33, 41,
76 18, 26, 3, 11, 4, 12, 19, 27,
77 34, 42, 49, 57, 50, 58, 35, 43,
78 20, 28, 5, 13, 6, 14, 21, 29,
79 36, 44, 51, 59, 52, 60, 37, 45,
80 22, 30, 7, 15, 23, 31, 38, 46,
81 53, 61, 54, 62, 39, 47, 55, 63,
84 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* 16-byte aligned; presumably filled at init time — not visible here. */
85 DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
/* Alternate (horizontal-first) coefficient scan order.
 * NOTE(review): the closing "};" is missing from this extract. */
87 const uint8_t ff_alternate_horizontal_scan[64] = {
88 0, 1, 2, 3, 8, 9, 16, 17,
89 10, 11, 4, 5, 6, 7, 15, 14,
90 13, 12, 19, 18, 24, 25, 32, 33,
91 26, 27, 20, 21, 22, 23, 28, 29,
92 30, 31, 34, 35, 40, 41, 48, 49,
93 42, 43, 36, 37, 38, 39, 44, 45,
94 46, 47, 50, 51, 56, 57, 58, 59,
95 52, 53, 54, 55, 60, 61, 62, 63,
/* Alternate (vertical-first) coefficient scan order.
 * NOTE(review): the closing "};" is missing from this extract. */
98 const uint8_t ff_alternate_vertical_scan[64] = {
99 0, 8, 16, 24, 1, 9, 2, 10,
100 17, 25, 32, 40, 48, 56, 57, 49,
101 41, 33, 26, 18, 3, 11, 4, 12,
102 19, 27, 34, 42, 50, 58, 35, 43,
103 51, 59, 20, 28, 5, 13, 6, 14,
104 21, 29, 36, 44, 52, 60, 37, 45,
105 53, 61, 22, 30, 7, 15, 23, 31,
106 38, 46, 54, 62, 39, 47, 55, 63,
109 /* Input permutation for the simple_idct_mmx */
/* NOTE(review): the closing "};" is missing from this extract. */
110 static const uint8_t simple_mmx_permutation[64]={
111 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
112 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
113 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
114 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
115 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
116 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
117 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
118 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/* Row permutation applied before the SSE2 IDCT (interleaves even/odd rows). */
121 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
/* Initialize a ScanTable: store the raw scan order, build the permutated
 * scan (scan index -> permuted coefficient index) and the raster_end table.
 * NOTE(review): the surrounding loop headers and closing braces are missing
 * from this extract; only representative body lines are visible. */
123 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
127 st->scantable= src_scantable;
131 j = src_scantable[i];
132 st->permutated[i] = permutation[j];
141 j = st->permutated[i];
143 st->raster_end[i]= end;
/* Sum of the 256 pixels of a 16x16 block read with the given line_size.
 * NOTE(review): accumulator init/updates and the return statement are
 * missing from this extract. */
147 static int pix_sum_c(uint8_t * pix, int line_size)
152 for (i = 0; i < 16; i++) {
153 for (j = 0; j < 16; j += 8) {
164 pix += line_size - 16;
/* Sum of squares of the 256 pixels of a 16x16 block, using the squares LUT
 * (sq points to the middle of ff_squareTbl so negative offsets index it).
 * The inner loop reads 8 pixels at a time through a native-word load:
 * one 64-bit load on LP64 targets, two 32-bit loads otherwise.
 * NOTE(review): loop boundaries, the low-byte lookups (x & 0xff) and the
 * return statement are missing from this extract; the unaligned/aliasing
 * casts are pre-existing and kept as-is. */
169 static int pix_norm1_c(uint8_t * pix, int line_size)
172 uint32_t *sq = ff_squareTbl + 256;
175 for (i = 0; i < 16; i++) {
176 for (j = 0; j < 16; j += 8) {
187 #if LONG_MAX > 2147483647
188 register uint64_t x=*(uint64_t*)pix;
190 s += sq[(x>>8)&0xff];
191 s += sq[(x>>16)&0xff];
192 s += sq[(x>>24)&0xff];
193 s += sq[(x>>32)&0xff];
194 s += sq[(x>>40)&0xff];
195 s += sq[(x>>48)&0xff];
196 s += sq[(x>>56)&0xff];
198 register uint32_t x=*(uint32_t*)pix;
200 s += sq[(x>>8)&0xff];
201 s += sq[(x>>16)&0xff];
202 s += sq[(x>>24)&0xff];
203 x=*(uint32_t*)(pix+4);
205 s += sq[(x>>8)&0xff];
206 s += sq[(x>>16)&0xff];
207 s += sq[(x>>24)&0xff];
212 pix += line_size - 16;
/* Byte-swap w 32-bit words from src into dst; the main loop is unrolled
 * by 8, the final line belongs to the tail loop handling the remainder.
 * NOTE(review): the tail-loop header and closing braces are missing from
 * this extract. */
217 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
220 for(i=0; i+8<=w; i+=8){
221 dst[i+0]= av_bswap32(src[i+0]);
222 dst[i+1]= av_bswap32(src[i+1]);
223 dst[i+2]= av_bswap32(src[i+2]);
224 dst[i+3]= av_bswap32(src[i+3]);
225 dst[i+4]= av_bswap32(src[i+4]);
226 dst[i+5]= av_bswap32(src[i+5]);
227 dst[i+6]= av_bswap32(src[i+6]);
228 dst[i+7]= av_bswap32(src[i+7]);
231 dst[i+0]= av_bswap32(src[i+0]);
/* Byte-swap len 16-bit values from src into dst.
 * NOTE(review): the loop header is missing from this extract. */
235 static void bswap16_buf(uint16_t *dst, const uint16_t *src, int len)
238 *dst++ = av_bswap16(*src++);
/* Sum of squared errors over a 4-pixel-wide block of height h; sq points
 * to the middle of ff_squareTbl so negative differences index correctly.
 * NOTE(review): accumulator init, the per-row pointer advances and the
 * return statement are missing from this extract. */
241 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
244 uint32_t *sq = ff_squareTbl + 256;
247 for (i = 0; i < h; i++) {
248 s += sq[pix1[0] - pix2[0]];
249 s += sq[pix1[1] - pix2[1]];
250 s += sq[pix1[2] - pix2[2]];
251 s += sq[pix1[3] - pix2[3]];
/* Sum of squared errors over an 8-pixel-wide block of height h.
 * NOTE(review): accumulator init, pointer advances and return are missing
 * from this extract. */
258 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
261 uint32_t *sq = ff_squareTbl + 256;
264 for (i = 0; i < h; i++) {
265 s += sq[pix1[0] - pix2[0]];
266 s += sq[pix1[1] - pix2[1]];
267 s += sq[pix1[2] - pix2[2]];
268 s += sq[pix1[3] - pix2[3]];
269 s += sq[pix1[4] - pix2[4]];
270 s += sq[pix1[5] - pix2[5]];
271 s += sq[pix1[6] - pix2[6]];
272 s += sq[pix1[7] - pix2[7]];
/* Sum of squared errors over a 16-pixel-wide block of height h.
 * NOTE(review): accumulator init, pointer advances and return are missing
 * from this extract. */
279 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
282 uint32_t *sq = ff_squareTbl + 256;
285 for (i = 0; i < h; i++) {
286 s += sq[pix1[ 0] - pix2[ 0]];
287 s += sq[pix1[ 1] - pix2[ 1]];
288 s += sq[pix1[ 2] - pix2[ 2]];
289 s += sq[pix1[ 3] - pix2[ 3]];
290 s += sq[pix1[ 4] - pix2[ 4]];
291 s += sq[pix1[ 5] - pix2[ 5]];
292 s += sq[pix1[ 6] - pix2[ 6]];
293 s += sq[pix1[ 7] - pix2[ 7]];
294 s += sq[pix1[ 8] - pix2[ 8]];
295 s += sq[pix1[ 9] - pix2[ 9]];
296 s += sq[pix1[10] - pix2[10]];
297 s += sq[pix1[11] - pix2[11]];
298 s += sq[pix1[12] - pix2[12]];
299 s += sq[pix1[13] - pix2[13]];
300 s += sq[pix1[14] - pix2[14]];
301 s += sq[pix1[15] - pix2[15]];
/* Copy an 8x8 pixel block into a DCTELEM block (widening 8 -> 16 bit).
 * NOTE(review): the row loop and pointer advances are missing from this
 * extract. */
309 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
313 /* read the pixels */
315 block[0] = pixels[0];
316 block[1] = pixels[1];
317 block[2] = pixels[2];
318 block[3] = pixels[3];
319 block[4] = pixels[4];
320 block[5] = pixels[5];
321 block[6] = pixels[6];
322 block[7] = pixels[7];
/* Store the per-pixel difference s1 - s2 of two 8x8 blocks into a DCTELEM
 * block.  NOTE(review): the row loop and pointer advances are missing from
 * this extract. */
328 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
329 const uint8_t *s2, int stride){
332 /* read the pixels */
334 block[0] = s1[0] - s2[0];
335 block[1] = s1[1] - s2[1];
336 block[2] = s1[2] - s2[2];
337 block[3] = s1[3] - s2[3];
338 block[4] = s1[4] - s2[4];
339 block[5] = s1[5] - s2[5];
340 block[6] = s1[6] - s2[6];
341 block[7] = s1[7] - s2[7];
/* Write an 8x8 DCTELEM block to pixels, clamping each value to 0..255 via
 * the ff_cropTbl lookup (cm is offset so negative values index validly).
 * NOTE(review): the row loop and pointer advances are missing from this
 * extract. */
349 void ff_put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
353 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
355 /* read the pixels */
357 pixels[0] = cm[block[0]];
358 pixels[1] = cm[block[1]];
359 pixels[2] = cm[block[2]];
360 pixels[3] = cm[block[3]];
361 pixels[4] = cm[block[4]];
362 pixels[5] = cm[block[5]];
363 pixels[6] = cm[block[6]];
364 pixels[7] = cm[block[7]];
/* 4-wide variant of ff_put_pixels_clamped_c (clamp via ff_cropTbl).
 * NOTE(review): the row loop and pointer advances are missing from this
 * extract. */
371 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
375 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
377 /* read the pixels */
379 pixels[0] = cm[block[0]];
380 pixels[1] = cm[block[1]];
381 pixels[2] = cm[block[2]];
382 pixels[3] = cm[block[3]];
/* 2-wide variant of ff_put_pixels_clamped_c (clamp via ff_cropTbl).
 * NOTE(review): the row loop and pointer advances are missing from this
 * extract. */
389 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
393 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
395 /* read the pixels */
397 pixels[0] = cm[block[0]];
398 pixels[1] = cm[block[1]];
/* Write an 8x8 block of signed DCTELEM values to pixels: values are offset
 * by +128 and clamped (values > 127 hit the visible else-if; the low clamp
 * branch is not visible in this extract).
 * NOTE(review): several branch bodies and increments are missing here. */
405 void ff_put_signed_pixels_clamped_c(const DCTELEM *block,
406 uint8_t *restrict pixels,
411 for (i = 0; i < 8; i++) {
412 for (j = 0; j < 8; j++) {
415 else if (*block > 127)
418 *pixels = (uint8_t)(*block + 128);
422 pixels += (line_size - 8);
/* Write an 8x8 DCTELEM block to pixels with a plain (truncating) store,
 * no clamping.  NOTE(review): the row loop and pointer advances are
 * missing from this extract. */
426 static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
431 /* read the pixels */
433 pixels[0] = block[0];
434 pixels[1] = block[1];
435 pixels[2] = block[2];
436 pixels[3] = block[3];
437 pixels[4] = block[4];
438 pixels[5] = block[5];
439 pixels[6] = block[6];
440 pixels[7] = block[7];
/* Add an 8x8 DCTELEM block to existing pixels, clamping the sums to 0..255
 * via the ff_cropTbl lookup.  NOTE(review): the row loop and pointer
 * advances are missing from this extract. */
447 void ff_add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
451 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
453 /* read the pixels */
455 pixels[0] = cm[pixels[0] + block[0]];
456 pixels[1] = cm[pixels[1] + block[1]];
457 pixels[2] = cm[pixels[2] + block[2]];
458 pixels[3] = cm[pixels[3] + block[3]];
459 pixels[4] = cm[pixels[4] + block[4]];
460 pixels[5] = cm[pixels[5] + block[5]];
461 pixels[6] = cm[pixels[6] + block[6]];
462 pixels[7] = cm[pixels[7] + block[7]];
/* 4-wide variant of ff_add_pixels_clamped_c.
 * NOTE(review): the row loop and pointer advances are missing from this
 * extract. */
468 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
472 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
474 /* read the pixels */
476 pixels[0] = cm[pixels[0] + block[0]];
477 pixels[1] = cm[pixels[1] + block[1]];
478 pixels[2] = cm[pixels[2] + block[2]];
479 pixels[3] = cm[pixels[3] + block[3]];
/* 2-wide variant of ff_add_pixels_clamped_c.
 * NOTE(review): the row loop and pointer advances are missing from this
 * extract. */
485 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
489 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
491 /* read the pixels */
493 pixels[0] = cm[pixels[0] + block[0]];
494 pixels[1] = cm[pixels[1] + block[1]];
/* Sum of absolute values of the 64 coefficients of a DCT block.
 * NOTE(review): the loop header and return are missing from this extract. */
500 static int sum_abs_dctelem_c(DCTELEM *block)
504 sum+= FFABS(block[i]);
/* Fill a 16-wide block of height h with a constant byte value.
 * NOTE(review): the per-row pointer advance is missing from this extract. */
508 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
512 for (i = 0; i < h; i++) {
513 memset(block, value, 16);
/* Fill an 8-wide block of height h with a constant byte value.
 * NOTE(review): the per-row pointer advance is missing from this extract. */
518 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
522 for (i = 0; i < h; i++) {
523 memset(block, value, 8);
/* Upscale an 8x8 block 2x in both directions: each source byte is doubled
 * horizontally via the *0x0101 16-bit store and written to two adjacent
 * destination rows (dst1/dst2).  NOTE(review): the per-iteration pointer
 * advances and closing braces are missing from this extract. */
528 static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
531 uint16_t *dst1 = (uint16_t *) dst;
532 uint16_t *dst2 = (uint16_t *)(dst + linesize);
534 for (j = 0; j < 8; j++) {
535 for (i = 0; i < 8; i++) {
536 dst1[i] = dst2[i] = src[i] * 0x0101;
/* Rounded averages of 2 and 4 pixel values.
 * Every argument is parenthesized so that expression arguments
 * (e.g. avg2(a & b, c)) expand correctly; the original form expanded
 * arguments unparenthesized, which breaks under operator precedence. */
#define avg2(a, b)       (((a) + (b) + 1) >> 1)
#define avg4(a, b, c, d) (((a) + (b) + (c) + (d) + 2) >> 2)
/* 8-wide bilinear interpolation with 1/16-pel weights (GMC with one motion
 * vector): A..D are the four bilinear weights summing to 256, hence the
 * >>8 after adding the rounder.  NOTE(review): the row loop, dst/src
 * advances and closing braces are missing from this extract. */
547 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
549 const int A=(16-x16)*(16-y16);
550 const int B=( x16)*(16-y16);
551 const int C=(16-x16)*( y16);
552 const int D=( x16)*( y16);
557 dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
558 dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
559 dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
560 dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
561 dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
562 dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
563 dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
564 dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
/* General global-motion compensation: for each destination pixel an affine
 * source position is computed (via dxx/dxy/dyx/dyy, not visible here) and
 * the sample is bilinearly interpolated with sub-pel fractions frac_x/frac_y.
 * The (unsigned)src_x < width trick rejects negative coordinates too; the
 * three fallback branches clamp coordinates at the picture edges.
 * NOTE(review): the outer y loop, the src_x/src_y/frac computations, the
 * rounding terms and closing braces are missing from this extract. */
570 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
571 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
574 const int s= 1<<shift;
584 for(x=0; x<8; x++){ //XXX FIXME optimize
585 int src_x, src_y, frac_x, frac_y, index;
594 if((unsigned)src_x < width){
595 if((unsigned)src_y < height){
596 index= src_x + src_y*stride;
597 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
598 + src[index +1]* frac_x )*(s-frac_y)
599 + ( src[index+stride ]*(s-frac_x)
600 + src[index+stride+1]* frac_x )* frac_y
603 index= src_x + av_clip(src_y, 0, height)*stride;
604 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
605 + src[index +1]* frac_x )*s
609 if((unsigned)src_y < height){
610 index= av_clip(src_x, 0, width) + src_y*stride;
611 dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
612 + src[index+stride ]* frac_y )*s
615 index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
616 dst[y*stride + x]= src[index ];
/* Third-pel (tpel) motion compensation helpers, as used by e.g. SVQ3.
 * The mcXY suffix encodes the sub-pel phase (X horizontal, Y vertical, in
 * thirds); 683 ~= 2^11/3 and 2731 ~= 2^15/12 are fixed-point reciprocals
 * used for the rounded weighted averages.  The mc00 cases dispatch on
 * width to the plain put/avg copy routines; the avg_* variants average the
 * filtered result with the existing dst.  NOTE(review): switch statements,
 * per-row pointer advances and closing braces are missing from this
 * extract; only the filter kernels are visible. */
628 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
630 case 2: put_pixels2_8_c (dst, src, stride, height); break;
631 case 4: put_pixels4_8_c (dst, src, stride, height); break;
632 case 8: put_pixels8_8_c (dst, src, stride, height); break;
633 case 16:put_pixels16_8_c(dst, src, stride, height); break;
637 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
639 for (i=0; i < height; i++) {
640 for (j=0; j < width; j++) {
641 dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
648 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
650 for (i=0; i < height; i++) {
651 for (j=0; j < width; j++) {
652 dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
659 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
661 for (i=0; i < height; i++) {
662 for (j=0; j < width; j++) {
663 dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
670 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
672 for (i=0; i < height; i++) {
673 for (j=0; j < width; j++) {
674 dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
681 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
683 for (i=0; i < height; i++) {
684 for (j=0; j < width; j++) {
685 dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
692 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
694 for (i=0; i < height; i++) {
695 for (j=0; j < width; j++) {
696 dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
703 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
705 for (i=0; i < height; i++) {
706 for (j=0; j < width; j++) {
707 dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
714 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
716 for (i=0; i < height; i++) {
717 for (j=0; j < width; j++) {
718 dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
725 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
727 case 2: avg_pixels2_8_c (dst, src, stride, height); break;
728 case 4: avg_pixels4_8_c (dst, src, stride, height); break;
729 case 8: avg_pixels8_8_c (dst, src, stride, height); break;
730 case 16:avg_pixels16_8_c(dst, src, stride, height); break;
734 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
736 for (i=0; i < height; i++) {
737 for (j=0; j < width; j++) {
738 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
745 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
747 for (i=0; i < height; i++) {
748 for (j=0; j < width; j++) {
749 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
756 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
758 for (i=0; i < height; i++) {
759 for (j=0; j < width; j++) {
760 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
767 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
769 for (i=0; i < height; i++) {
770 for (j=0; j < width; j++) {
771 dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
778 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
780 for (i=0; i < height; i++) {
781 for (j=0; j < width; j++) {
782 dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
789 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
791 for (i=0; i < height; i++) {
792 for (j=0; j < width; j++) {
793 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
800 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
802 for (i=0; i < height; i++) {
803 for (j=0; j < width; j++) {
804 dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
811 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
813 for (i=0; i < height; i++) {
814 for (j=0; j < width; j++) {
815 dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* Generate width-specific wrappers around the generic tpel MC helpers.
 * NOTE(review): the original macro bodies began with a stray "void",
 * which made each body a (K&R-style) function declaration instead of a
 * call, so every generated wrapper was a no-op; the "void" is removed
 * here so the wrappers actually forward to the generic helpers. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
843 #define QPEL_MC(r, OPNAME, RND, OP) \
844 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
845 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
849 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
850 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
851 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
852 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
853 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
854 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
855 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
856 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
862 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
864 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
868 const int src0= src[0*srcStride];\
869 const int src1= src[1*srcStride];\
870 const int src2= src[2*srcStride];\
871 const int src3= src[3*srcStride];\
872 const int src4= src[4*srcStride];\
873 const int src5= src[5*srcStride];\
874 const int src6= src[6*srcStride];\
875 const int src7= src[7*srcStride];\
876 const int src8= src[8*srcStride];\
877 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
878 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
879 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
880 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
881 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
882 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
883 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
884 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
890 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
891 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
896 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
897 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
898 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
899 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
900 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
901 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
902 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
903 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
904 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
905 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
906 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
907 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
908 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
909 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
910 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
911 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
917 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
918 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
923 const int src0= src[0*srcStride];\
924 const int src1= src[1*srcStride];\
925 const int src2= src[2*srcStride];\
926 const int src3= src[3*srcStride];\
927 const int src4= src[4*srcStride];\
928 const int src5= src[5*srcStride];\
929 const int src6= src[6*srcStride];\
930 const int src7= src[7*srcStride];\
931 const int src8= src[8*srcStride];\
932 const int src9= src[9*srcStride];\
933 const int src10= src[10*srcStride];\
934 const int src11= src[11*srcStride];\
935 const int src12= src[12*srcStride];\
936 const int src13= src[13*srcStride];\
937 const int src14= src[14*srcStride];\
938 const int src15= src[15*srcStride];\
939 const int src16= src[16*srcStride];\
940 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
941 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
942 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
943 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
944 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
945 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
946 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
947 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
948 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
949 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
950 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
951 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
952 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
953 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
954 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
955 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
961 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
963 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
964 OPNAME ## pixels8_l2_8(dst, src, half, stride, stride, 8, 8);\
967 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
968 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
971 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
973 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
974 OPNAME ## pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);\
977 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
980 copy_block9(full, src, 16, stride, 9);\
981 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
982 OPNAME ## pixels8_l2_8(dst, full, half, stride, 16, 8, 8);\
985 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
987 copy_block9(full, src, 16, stride, 9);\
988 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
991 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
994 copy_block9(full, src, 16, stride, 9);\
995 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
996 OPNAME ## pixels8_l2_8(dst, full+16, half, stride, 16, 8, 8);\
998 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1002 uint8_t halfHV[64];\
1003 copy_block9(full, src, 16, stride, 9);\
1004 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1005 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1006 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1007 OPNAME ## pixels8_l4_8(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1009 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1010 uint8_t full[16*9];\
1012 uint8_t halfHV[64];\
1013 copy_block9(full, src, 16, stride, 9);\
1014 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1015 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1016 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1017 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1019 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1020 uint8_t full[16*9];\
1023 uint8_t halfHV[64];\
1024 copy_block9(full, src, 16, stride, 9);\
1025 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1026 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1027 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1028 OPNAME ## pixels8_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1030 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1031 uint8_t full[16*9];\
1033 uint8_t halfHV[64];\
1034 copy_block9(full, src, 16, stride, 9);\
1035 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1036 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1037 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1038 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1040 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1041 uint8_t full[16*9];\
1044 uint8_t halfHV[64];\
1045 copy_block9(full, src, 16, stride, 9);\
1046 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1047 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1048 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1049 OPNAME ## pixels8_l4_8(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1051 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1052 uint8_t full[16*9];\
1054 uint8_t halfHV[64];\
1055 copy_block9(full, src, 16, stride, 9);\
1056 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1057 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1058 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1059 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1061 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1062 uint8_t full[16*9];\
1065 uint8_t halfHV[64];\
1066 copy_block9(full, src, 16, stride, 9);\
1067 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1068 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1069 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1070 OPNAME ## pixels8_l4_8(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1072 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1073 uint8_t full[16*9];\
1075 uint8_t halfHV[64];\
1076 copy_block9(full, src, 16, stride, 9);\
1077 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1078 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1079 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1080 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1082 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1084 uint8_t halfHV[64];\
1085 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1086 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1087 OPNAME ## pixels8_l2_8(dst, halfH, halfHV, stride, 8, 8, 8);\
1089 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1091 uint8_t halfHV[64];\
1092 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1093 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1094 OPNAME ## pixels8_l2_8(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1096 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1097 uint8_t full[16*9];\
1100 uint8_t halfHV[64];\
1101 copy_block9(full, src, 16, stride, 9);\
1102 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1103 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1104 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1105 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1107 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1108 uint8_t full[16*9];\
1110 copy_block9(full, src, 16, stride, 9);\
1111 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1112 put ## RND ## pixels8_l2_8(halfH, halfH, full, 8, 8, 16, 9);\
1113 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1115 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1116 uint8_t full[16*9];\
1119 uint8_t halfHV[64];\
1120 copy_block9(full, src, 16, stride, 9);\
1121 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1122 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1123 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1124 OPNAME ## pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);\
1126 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1127 uint8_t full[16*9];\
1129 copy_block9(full, src, 16, stride, 9);\
1130 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1131 put ## RND ## pixels8_l2_8(halfH, halfH, full+1, 8, 8, 16, 9);\
1132 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1134 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1136 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1137 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1140 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1142 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1143 OPNAME ## pixels16_l2_8(dst, src, half, stride, stride, 16, 16);\
1146 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1147 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
1150 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1152 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
1153 OPNAME ## pixels16_l2_8(dst, src+1, half, stride, stride, 16, 16);\
1156 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1157 uint8_t full[24*17];\
1159 copy_block17(full, src, 24, stride, 17);\
1160 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1161 OPNAME ## pixels16_l2_8(dst, full, half, stride, 24, 16, 16);\
1164 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1165 uint8_t full[24*17];\
1166 copy_block17(full, src, 24, stride, 17);\
1167 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
1170 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1171 uint8_t full[24*17];\
1173 copy_block17(full, src, 24, stride, 17);\
1174 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
1175 OPNAME ## pixels16_l2_8(dst, full+24, half, stride, 24, 16, 16);\
1177 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1178 uint8_t full[24*17];\
1179 uint8_t halfH[272];\
1180 uint8_t halfV[256];\
1181 uint8_t halfHV[256];\
1182 copy_block17(full, src, 24, stride, 17);\
1183 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1184 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1185 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1186 OPNAME ## pixels16_l4_8(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1188 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1189 uint8_t full[24*17];\
1190 uint8_t halfH[272];\
1191 uint8_t halfHV[256];\
1192 copy_block17(full, src, 24, stride, 17);\
1193 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1194 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1195 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1196 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1198 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1199 uint8_t full[24*17];\
1200 uint8_t halfH[272];\
1201 uint8_t halfV[256];\
1202 uint8_t halfHV[256];\
1203 copy_block17(full, src, 24, stride, 17);\
1204 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1205 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1206 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1207 OPNAME ## pixels16_l4_8(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1209 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1210 uint8_t full[24*17];\
1211 uint8_t halfH[272];\
1212 uint8_t halfHV[256];\
1213 copy_block17(full, src, 24, stride, 17);\
1214 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1215 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1216 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1217 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1219 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1220 uint8_t full[24*17];\
1221 uint8_t halfH[272];\
1222 uint8_t halfV[256];\
1223 uint8_t halfHV[256];\
1224 copy_block17(full, src, 24, stride, 17);\
1225 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1226 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1227 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1228 OPNAME ## pixels16_l4_8(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1230 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1231 uint8_t full[24*17];\
1232 uint8_t halfH[272];\
1233 uint8_t halfHV[256];\
1234 copy_block17(full, src, 24, stride, 17);\
1235 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1236 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1237 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1238 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1240 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1241 uint8_t full[24*17];\
1242 uint8_t halfH[272];\
1243 uint8_t halfV[256];\
1244 uint8_t halfHV[256];\
1245 copy_block17(full, src, 24, stride, 17);\
1246 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
1247 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1248 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1249 OPNAME ## pixels16_l4_8(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
1251 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1252 uint8_t full[24*17];\
1253 uint8_t halfH[272];\
1254 uint8_t halfHV[256];\
1255 copy_block17(full, src, 24, stride, 17);\
1256 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1257 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1258 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1259 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1261 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1262 uint8_t halfH[272];\
1263 uint8_t halfHV[256];\
1264 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1265 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1266 OPNAME ## pixels16_l2_8(dst, halfH, halfHV, stride, 16, 16, 16);\
1268 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1269 uint8_t halfH[272];\
1270 uint8_t halfHV[256];\
1271 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1272 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1273 OPNAME ## pixels16_l2_8(dst, halfH+16, halfHV, stride, 16, 16, 16);\
1275 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1276 uint8_t full[24*17];\
1277 uint8_t halfH[272];\
1278 uint8_t halfV[256];\
1279 uint8_t halfHV[256];\
1280 copy_block17(full, src, 24, stride, 17);\
1281 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1282 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
1283 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1284 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1286 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1287 uint8_t full[24*17];\
1288 uint8_t halfH[272];\
1289 copy_block17(full, src, 24, stride, 17);\
1290 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1291 put ## RND ## pixels16_l2_8(halfH, halfH, full, 16, 16, 24, 17);\
1292 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1294 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1295 uint8_t full[24*17];\
1296 uint8_t halfH[272];\
1297 uint8_t halfV[256];\
1298 uint8_t halfHV[256];\
1299 copy_block17(full, src, 24, stride, 17);\
1300 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1301 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
1302 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
1303 OPNAME ## pixels16_l2_8(dst, halfV, halfHV, stride, 16, 16, 16);\
1305 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1306 uint8_t full[24*17];\
1307 uint8_t halfH[272];\
1308 copy_block17(full, src, 24, stride, 17);\
1309 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
1310 put ## RND ## pixels16_l2_8(halfH, halfH, full+1, 16, 16, 24, 17);\
1311 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1313 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1314 uint8_t halfH[272];\
1315 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
1316 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
1319 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
/* Pixel-store ops for the QPEL_MC() template above: 'cm' is the clip table,
 * "+16>>5" rounds to nearest, "+15>>5" is the MPEG-4 no-rounding variant. */
1320 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
1321 #define op_put(a, b) a = cm[((b) + 16)>>5]
1322 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* Instantiate the quarter-pel MC families: put, put_no_rnd and avg. */
1324 QPEL_MC(0, put_ , _ , op_put)
1325 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
1326 QPEL_MC(0, avg_ , _ , op_avg)
1327 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
1329 #undef op_avg_no_rnd
1331 #undef op_put_no_rnd
/* The (0,0) motion cases need no filtering: alias them to plain copies/averages. */
1333 #define put_qpel8_mc00_c ff_put_pixels8x8_c
1334 #define avg_qpel8_mc00_c ff_avg_pixels8x8_c
1335 #define put_qpel16_mc00_c ff_put_pixels16x16_c
1336 #define avg_qpel16_mc00_c ff_avg_pixels16x16_c
1337 #define put_no_rnd_qpel8_mc00_c ff_put_pixels8x8_c
/* NOTE(review): this alias uses the _8_-suffixed helper unlike the lines
 * above — confirm against upstream whether the asymmetry is intended. */
1338 #define put_no_rnd_qpel16_mc00_c ff_put_pixels16x16_8_c
/* WMV2 mspel horizontal lowpass: 4-tap (-1, 9, 9, -1)/16 half-pel filter
 * over an 8-pixel row, clipped to 0..255 via the crop table. Reads src[-1]
 * through src[9], i.e. one pixel of left context and two of right context.
 * NOTE(review): the per-row loop over h and the dst/src pointer advance are
 * not visible in this truncated listing — confirm against upstream. */
1340 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
1341 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1345 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
1346 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
1347 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
1348 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
1349 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
1350 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
1351 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
1352 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
1358 #if CONFIG_RV40_DECODER
/* RV40 (3,3) quarter-pel positions degrade to plain half-pel xy2 averaging;
 * these thin wrappers adapt the generic xy2 helpers to the qpel signature. */
1359 static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1360 put_pixels16_xy2_8_c(dst, src, stride, 16);
1362 static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1363 avg_pixels16_xy2_8_c(dst, src, stride, 16);
1365 static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1366 put_pixels8_xy2_8_c(dst, src, stride, 8);
1368 static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
1369 avg_pixels8_xy2_8_c(dst, src, stride, 8);
1371 #endif /* CONFIG_RV40_DECODER */
/* WMV2 mspel vertical lowpass: same 4-tap (-1, 9, 9, -1)/16 filter applied
 * down a column. Reads one row of context above (src_1) and two below
 * (src8, src9). NOTE(review): the per-column loop over w and the pointer
 * advance are not visible in this truncated listing. */
1373 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
1374 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
1378 const int src_1= src[ -srcStride];
1379 const int src0 = src[0 ];
1380 const int src1 = src[ srcStride];
1381 const int src2 = src[2*srcStride];
1382 const int src3 = src[3*srcStride];
1383 const int src4 = src[4*srcStride];
1384 const int src5 = src[5*srcStride];
1385 const int src6 = src[6*srcStride];
1386 const int src7 = src[7*srcStride];
1387 const int src8 = src[8*srcStride];
1388 const int src9 = src[9*srcStride];
1389 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
1390 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
1391 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
1392 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
1393 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
1394 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
1395 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
1396 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/* WMV2 mspel 8x8 half-pel MC positions, built from the two lowpass filters
 * above. mcXY: X = horizontal phase, Y = vertical phase. Temporary 'half*'
 * buffer declarations are not visible in this truncated listing. */
1402 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
1404 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1405 put_pixels8_l2_8(dst, src, half, stride, stride, 8, 8);
/* (2,0): pure horizontal filter straight into dst. */
1408 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
1409 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
/* (3,0): like (1,0) but averaged with the right neighbour src+1. */
1412 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
1414 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
1415 put_pixels8_l2_8(dst, src+1, half, stride, stride, 8, 8);
/* (0,2): pure vertical filter straight into dst. */
1418 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
1419 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
/* (1,2): average of vertical-filtered and H-then-V-filtered planes; the
 * horizontal pass covers 11 rows (src-stride) for vertical context. */
1422 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
1426 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1427 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
1428 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1429 put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
/* (3,2): as (1,2) but the vertical-only plane uses src+1. */
1431 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
1435 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1436 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
1437 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
1438 put_pixels8_l2_8(dst, halfV, halfHV, stride, 8, 8, 8);
/* (2,2): horizontal pass then vertical pass straight into dst. */
1440 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
1442 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
1443 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
/* H.263 deblocking filter (Annex J style), vertical edge direction: filters
 * across a horizontal block boundary. p0,p1 are above the edge, p2,p3 below.
 * Filter strength is looked up from the quantizer. NOTE(review): the loop
 * over x, the d1/ad1 declarations and some clipping lines are not visible in
 * this truncated listing. */
1446 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
1447 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1449 const int strength= ff_h263_loop_filter_strength[qscale];
1453 int p0= src[x-2*stride];
1454 int p1= src[x-1*stride];
1455 int p2= src[x+0*stride];
1456 int p3= src[x+1*stride];
/* Gradient across the edge; the piecewise ramp below soft-clips it to d1. */
1457 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1459 if (d<-2*strength) d1= 0;
1460 else if(d<- strength) d1=-2*strength - d;
1461 else if(d< strength) d1= d;
1462 else if(d< 2*strength) d1= 2*strength - d;
/* Branchless clamp of p1/p2 to 0..255 after adding/subtracting d1. */
1467 if(p1&256) p1= ~(p1>>31);
1468 if(p2&256) p2= ~(p2>>31);
1470 src[x-1*stride] = p1;
1471 src[x+0*stride] = p2;
/* Secondary, weaker correction of the outer pixels p0/p3. */
1475 d2= av_clip((p0-p3)/4, -ad1, ad1);
1477 src[x-2*stride] = p0 - d2;
1478 src[x+ stride] = p3 + d2;
/* H.263 deblocking filter, horizontal edge direction: same algorithm as
 * h263_v_loop_filter_c but filtering across a vertical block boundary
 * (pixels at offsets -2..+1 within each row). NOTE(review): loop over y and
 * the d1/ad1 declarations are not visible in this truncated listing. */
1483 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
1484 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1486 const int strength= ff_h263_loop_filter_strength[qscale];
1490 int p0= src[y*stride-2];
1491 int p1= src[y*stride-1];
1492 int p2= src[y*stride+0];
1493 int p3= src[y*stride+1];
/* Soft-clip the cross-edge gradient to d1 using the strength ramp. */
1494 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
1496 if (d<-2*strength) d1= 0;
1497 else if(d<- strength) d1=-2*strength - d;
1498 else if(d< strength) d1= d;
1499 else if(d< 2*strength) d1= 2*strength - d;
/* Branchless clamp of p1/p2 to 0..255. */
1504 if(p1&256) p1= ~(p1>>31);
1505 if(p2&256) p2= ~(p2>>31);
1507 src[y*stride-1] = p1;
1508 src[y*stride+0] = p2;
/* Weaker correction of the outer pixels. */
1512 d2= av_clip((p0-p3)/4, -ad1, ad1);
1514 src[y*stride-2] = p0 - d2;
1515 src[y*stride+1] = p3 + d2;
/* H.261 loop filter: separable (1,2,1)/4 smoothing of an 8x8 block via a
 * temp[8*8] buffer; border rows are scaled by 4 so the final >>2 / >>4
 * renormalizations are uniform. NOTE(review): the temp declaration, the
 * x/y loop headers and yz computation are not visible in this listing. */
1520 static void h261_loop_filter_c(uint8_t *src, int stride){
/* Vertical pass: top/bottom rows pass through (x4), interior rows get 1-2-1. */
1525 temp[x ] = 4*src[x ];
1526 temp[x + 7*8] = 4*src[x + 7*stride];
1530 xy = y * stride + x;
1532 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
/* Horizontal pass: edge columns renormalize by >>2, interior by 1-2-1 and >>4. */
1537 src[ y*stride] = (temp[ y*8] + 2)>>2;
1538 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
1540 xy = y * stride + x;
1542 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
/* SAD of two 16-wide pixel rows (motion-estimation compare function).
 * NOTE(review): the accumulator declaration, the loop over h and the
 * pix1/pix2 pointer advance are not visible in this truncated listing. */
1547 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1553 s += abs(pix1[0] - pix2[0]);
1554 s += abs(pix1[1] - pix2[1]);
1555 s += abs(pix1[2] - pix2[2]);
1556 s += abs(pix1[3] - pix2[3]);
1557 s += abs(pix1[4] - pix2[4]);
1558 s += abs(pix1[5] - pix2[5]);
1559 s += abs(pix1[6] - pix2[6]);
1560 s += abs(pix1[7] - pix2[7]);
1561 s += abs(pix1[8] - pix2[8]);
1562 s += abs(pix1[9] - pix2[9]);
1563 s += abs(pix1[10] - pix2[10]);
1564 s += abs(pix1[11] - pix2[11]);
1565 s += abs(pix1[12] - pix2[12]);
1566 s += abs(pix1[13] - pix2[13]);
1567 s += abs(pix1[14] - pix2[14]);
1568 s += abs(pix1[15] - pix2[15]);
/* SAD against the horizontal half-pel interpolation of pix2 (average of
 * each pixel with its right neighbour). Reads pix2[0..16]. */
1575 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1581 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1582 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1583 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1584 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1585 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1586 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1587 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1588 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1589 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1590 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1591 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1592 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1593 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1594 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1595 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1596 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/* SAD against the vertical half-pel interpolation of pix2 (average of each
 * pixel with the one on the next line, via pix3 = pix2 + line_size). */
1603 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1606 uint8_t *pix3 = pix2 + line_size;
1610 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1611 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1612 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1613 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1614 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1615 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1616 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1617 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1618 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1619 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1620 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1621 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1622 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1623 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1624 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1625 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/* SAD against the diagonal (x+y) half-pel interpolation of pix2: 4-point
 * average of each pixel, its right neighbour and the two below them. */
1633 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1636 uint8_t *pix3 = pix2 + line_size;
1640 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1641 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1642 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1643 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1644 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1645 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1646 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1647 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1648 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1649 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1650 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1651 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1652 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1653 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1654 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1655 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/* 8-wide variant of pix_abs16_c: SAD of two 8-pixel rows. */
1663 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1669 s += abs(pix1[0] - pix2[0]);
1670 s += abs(pix1[1] - pix2[1]);
1671 s += abs(pix1[2] - pix2[2]);
1672 s += abs(pix1[3] - pix2[3]);
1673 s += abs(pix1[4] - pix2[4]);
1674 s += abs(pix1[5] - pix2[5]);
1675 s += abs(pix1[6] - pix2[6]);
1676 s += abs(pix1[7] - pix2[7]);
/* 8-wide SAD against horizontal half-pel interpolation; reads pix2[0..8]. */
1683 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1689 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1690 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1691 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1692 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1693 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1694 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1695 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1696 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
/* 8-wide SAD against vertical half-pel interpolation (pix2 vs next line). */
1703 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1706 uint8_t *pix3 = pix2 + line_size;
1710 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1711 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1712 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1713 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1714 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1715 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1716 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1717 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
/* 8-wide SAD against diagonal half-pel interpolation (4-point average). */
1725 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
1728 uint8_t *pix3 = pix2 + line_size;
1732 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1733 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1734 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1735 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1736 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1737 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1738 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1739 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
/* Noise-preserving SSE, 16-wide: score1 is plain SSE, score2 measures the
 * difference in local 2x2 gradient ("noise") between the two blocks, weighted
 * by avctx->nsse_weight (8 if no context is available). NOTE(review): the
 * score1/score2 declarations, loop over h and pointer advance are not visible
 * in this truncated listing. */
1747 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1748 MpegEncContext *c = v;
1754 for(x=0; x<16; x++){
1755 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1758 for(x=0; x<15; x++){
1759 score2+= FFABS( s1[x ] - s1[x +stride]
1760 - s1[x+1] + s1[x+1+stride])
1761 -FFABS( s2[x ] - s2[x +stride]
1762 - s2[x+1] + s2[x+1+stride]);
1769 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1770 else return score1 + FFABS(score2)*8;
/* 8-wide variant of nsse16_c; see the comment there. NOTE(review): the
 * x-loop bounds and accumulator declarations are not visible in this
 * truncated listing. */
1773 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
1774 MpegEncContext *c = v;
1781 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
1785 score2+= FFABS( s1[x ] - s1[x +stride]
1786 - s1[x+1] + s1[x+1+stride])
1787 -FFABS( s2[x ] - s2[x +stride]
1788 - s2[x+1] + s2[x+1+stride]);
1795 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
1796 else return score1 + FFABS(score2)*8;
/* Trellis helper: returns the weighted squared error that would result from
 * adding 'basis' scaled by 'scale' to the residual 'rem' (see add_8x8basis_c).
 * NOTE(review): the sum declaration, the 'w' weight load and the return are
 * not visible in this truncated listing. */
1799 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
1803 for(i=0; i<8*8; i++){
/* Rounded fixed-point scale-down from BASIS_SHIFT to RECON_SHIFT precision. */
1804 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
1807 assert(-512<b && b<512);
1809 sum += (w*b)*(w*b)>>4;
/* Accumulates 'basis' scaled by 'scale' into the residual 'rem', using the
 * same rounded fixed-point scaling as try_8x8basis_c. */
1814 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
1817 for(i=0; i<8*8; i++){
1818 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
1823 * permutes an 8x8 block.
1824 * @param block the block which will be permuted according to the given permutation vector
1825 * @param permutation the permutation vector
1826 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
1827 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
1828 * (inverse) permuted to scantable order!
1830 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
1836 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
/* Pass 1: walk coefficients 0..last in scantable order.
 * NOTE(review): the temp[] declaration and the gather/clear statements of
 * this pass are not visible in this truncated listing. */
1838 for(i=0; i<=last; i++){
1839 const int j= scantable[i];
/* Pass 2: scatter each saved coefficient back through the permutation vector. */
1844 for(i=0; i<=last; i++){
1845 const int j= scantable[i];
1846 const int perm_j= permutation[j];
1847 block[perm_j]= temp[j];
/* Dummy compare function: always scores zero (used for FF_CMP_ZERO). */
1851 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
/* Fills the cmp[] function-pointer array for the requested comparison type
 * (SAD/SATD/DCT/NSSE/...), picking per-size implementations from DSPContext.
 * NOTE(review): the loop over i, the switch on 'type' and most case labels
 * are not visible in this truncated listing; also confirm upstream whether
 * the memset size should be sizeof(me_cmp_func) rather than sizeof(void*). */
1855 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
1858 memset(cmp, 0, sizeof(void*)*6);
1866 cmp[i]= c->hadamard8_diff[i];
1872 cmp[i]= c->dct_sad[i];
1875 cmp[i]= c->dct264_sad[i];
1878 cmp[i]= c->dct_max[i];
1881 cmp[i]= c->quant_psnr[i];
1910 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
/* Byte-wise dst[i] += src[i] over w bytes, vectorized SWAR-style one machine
 * word at a time: pb_7f masks the low 7 bits for a carry-free add, pb_80
 * restores the top bit via XOR. The tail loop handles the remaining bytes.
 * NOTE(review): the declaration of i and the tail-loop header are not visible;
 * also note w-sizeof(long) promotes to unsigned — confirm upstream guards
 * against w < sizeof(long). */
1915 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
1917 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1918 long a = *(long*)(src+i);
1919 long b = *(long*)(dst+i);
1920 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
1923 dst[i+0] += src[i+0];
/* Byte-wise dst[i] = src1[i] - src2[i] over w bytes. On targets without fast
 * unaligned loads, a scalar 8-at-a-time path is taken when src2 is
 * misaligned; otherwise a SWAR word-at-a-time borrow-free subtraction is used
 * (mirror of the trick in add_bytes_c). NOTE(review): the declaration of i,
 * the 'else' joining the two paths and the tail-loop header are not visible
 * in this truncated listing. */
1926 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
1928 #if !HAVE_FAST_UNALIGNED
1929 if((long)src2 & (sizeof(long)-1)){
1930 for(i=0; i+7<w; i+=8){
1931 dst[i+0] = src1[i+0]-src2[i+0];
1932 dst[i+1] = src1[i+1]-src2[i+1];
1933 dst[i+2] = src1[i+2]-src2[i+2];
1934 dst[i+3] = src1[i+3]-src2[i+3];
1935 dst[i+4] = src1[i+4]-src2[i+4];
1936 dst[i+5] = src1[i+5]-src2[i+5];
1937 dst[i+6] = src1[i+6]-src2[i+6];
1938 dst[i+7] = src1[i+7]-src2[i+7];
1942 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
1943 long a = *(long*)(src1+i);
1944 long b = *(long*)(src2+i);
/* Per-byte subtraction without cross-byte borrow: pb_80 pre-biases each byte. */
1945 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
1948 dst[i+0] = src1[i+0]-src2[i+0];
/* HuffYUV prediction helpers. Median prediction: predictor is
 * mid_pred(left, above, left+above-above_left); add_* reconstructs pixels
 * from the residual 'diff', sub_* produces residuals. 'left'/'left_top'
 * carry predictor state across calls. NOTE(review): the loop headers,
 * l/lt initialization and the state write-back are not visible in this
 * truncated listing. */
1951 static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
1959 l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
/* Inverse of the above: emit residual dst[i] = src2[i] - pred. */
1968 static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
1976 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
/* Left prediction: running byte accumulator 'acc' added to each sample;
 * returns the final accumulator for the next row. */
1986 static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
1989 for(i=0; i<w-1; i++){
/* BGR32 variant of left prediction with separate per-channel accumulators. */
2016 static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
/* Hadamard butterfly helpers used by the SATD functions below.
 * BUTTERFLY2 writes sum/difference of i1,i2 into o1,o2; BUTTERFLY1 does the
 * same in place on x,y; BUTTERFLYA returns |x+y| + |x-y| (final stage).
 * NOTE(review): the continuation bodies of BUTTERFLY2/BUTTERFLY1 are on
 * lines missing from this truncated listing — do not edit between them. */
2046 #define BUTTERFLY2(o1,o2,i1,i2) \
2050 #define BUTTERFLY1(x,y) \
2059 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/* SATD of the 8x8 difference src-dst: 2D 8-point Hadamard transform of the
 * residual (row pass, then column pass), summing |coefficients| via
 * BUTTERFLYA. NOTE(review): the temp[64] declaration, loop headers over i
 * and the sum accumulation/return are not visible in this truncated listing. */
2061 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
2069 //FIXME try pointer walks
/* Row pass: stage 1 on the pixel differences... */
2070 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
2071 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
2072 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
2073 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
/* ...stage 2 (distance-2 pairs)... */
2075 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2076 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2077 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2078 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
/* ...stage 3 (distance-4 pairs). */
2080 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2081 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2082 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2083 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
/* Column pass: same three stages down each column. */
2087 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2088 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2089 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2090 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2092 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2093 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2094 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2095 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
/* Final stage folds the last butterfly into the absolute-value sum. */
2098 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2099 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2100 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2101 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
/* Intra SATD: Hadamard transform of the source pixels themselves (no
 * reference), with the DC term subtracted at the end so the score measures
 * AC energy only. Structure mirrors hadamard8_diff8x8_c. NOTE(review): the
 * temp declaration, loop headers and sum/return are not visible here. */
2106 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
2114 //FIXME try pointer walks
/* Row pass, stages 1-3. */
2115 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
2116 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
2117 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
2118 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
2120 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
2121 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
2122 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
2123 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
2125 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
2126 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
2127 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
2128 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
/* Column pass, stages 1-2, then the absolute-sum final stage. */
2132 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
2133 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
2134 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
2135 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
2137 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
2138 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
2139 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
2140 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
2143 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
2144 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
2145 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
2146 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
2149 sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
/* DCT-domain SAD: takes the pixel difference of the two 8x8 blocks, then
 * sums the absolute values of its transform coefficients.
 * NOTE(review): the forward-DCT call between diff_pixels and
 * sum_abs_dctelem is on a line missing from this truncated listing —
 * confirm against upstream. */
2154 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2155 MpegEncContext * const s= (MpegEncContext *)c;
2156 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2160 s->dsp.diff_pixels(temp, src1, src2, stride);
2162 return s->dsp.sum_abs_dctelem(temp);
2167 const int s07 = SRC(0) + SRC(7);\
2168 const int s16 = SRC(1) + SRC(6);\
2169 const int s25 = SRC(2) + SRC(5);\
2170 const int s34 = SRC(3) + SRC(4);\
2171 const int a0 = s07 + s34;\
2172 const int a1 = s16 + s25;\
2173 const int a2 = s07 - s34;\
2174 const int a3 = s16 - s25;\
2175 const int d07 = SRC(0) - SRC(7);\
2176 const int d16 = SRC(1) - SRC(6);\
2177 const int d25 = SRC(2) - SRC(5);\
2178 const int d34 = SRC(3) - SRC(4);\
2179 const int a4 = d16 + d25 + (d07 + (d07>>1));\
2180 const int a5 = d07 - d34 - (d25 + (d25>>1));\
2181 const int a6 = d07 + d34 - (d16 + (d16>>1));\
2182 const int a7 = d16 - d25 + (d34 + (d34>>1));\
2184 DST(1, a4 + (a7>>2)) ;\
2185 DST(2, a2 + (a3>>1)) ;\
2186 DST(3, a5 + (a6>>2)) ;\
2188 DST(5, a6 - (a5>>2)) ;\
2189 DST(6, (a2>>1) - a3 ) ;\
2190 DST(7, (a4>>2) - a7 ) ;\
2193 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
/* H.264-style integer-DCT SAD: applies the DCT8_1D macro defined above to
 * the rows of the 8x8 pixel difference, then to its columns while the DST
 * macro folds |coefficients| into 'sum'. NOTE(review): the dct[][]
 * declaration, #undef lines and return are not visible in this listing. */
2194 MpegEncContext * const s= (MpegEncContext *)c;
2199 s->dsp.diff_pixels(dct[0], src1, src2, stride);
/* Row pass: transform in place. */
2201 #define SRC(x) dct[i][x]
2202 #define DST(x,v) dct[i][x]= v
2203 for( i = 0; i < 8; i++ )
/* Column pass: DST redefined to accumulate absolute values instead. */
2208 #define SRC(x) dct[x][i]
2209 #define DST(x,v) sum += FFABS(v)
2210 for( i = 0; i < 8; i++ )
/* DCT-max metric: forward-transforms the 8x8 pixel difference and returns
 * the largest |coefficient|. NOTE(review): the fdct call, sum initialization,
 * the loop header over i and the return are not visible in this listing. */
2218 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2219 MpegEncContext * const s= (MpegEncContext *)c;
2220 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2225 s->dsp.diff_pixels(temp, src1, src2, stride);
2229 sum= FFMAX(sum, FFABS(temp[i]));
/* Quantization-PSNR metric: round-trips the residual through the encoder's
 * quantize / dequantize / IDCT chain and returns the squared error against
 * the saved original coefficients ('bak'). NOTE(review): the fdct call, the
 * loop header for the error sum and the return are not visible here. */
2234 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2235 MpegEncContext * const s= (MpegEncContext *)c;
2236 LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
2237 DCTELEM * const bak = temp+64;
2243 s->dsp.diff_pixels(temp, src1, src2, stride);
2245 memcpy(bak, temp, 64*sizeof(DCTELEM));
2247 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
2248 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2249 ff_simple_idct(temp); //FIXME
2252 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/* Rate-distortion metric for one 8x8 block: quantizes the residual, counts
 * the VLC bits needed to code the run/level pairs (esc_length for values
 * outside the short-code range), reconstructs the block and returns
 * distortion + lambda-weighted rate. NOTE(review): several lines (bits/run
 * init, intra/inter branch headers, escape accounting, loop closers) are
 * missing from this truncated listing. */
2257 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2258 MpegEncContext * const s= (MpegEncContext *)c;
2259 const uint8_t *scantable= s->intra_scantable.permutated;
2260 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2261 LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
2262 LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
2263 int i, last, run, bits, level, distortion, start_i;
2264 const int esc_length= s->ac_esc_length;
2266 uint8_t * last_length;
/* Work on local aligned copies so idct_add below can reconstruct in place. */
2270 copy_block8(lsrc1, src1, 8, stride, 8);
2271 copy_block8(lsrc2, src2, 8, stride, 8);
2273 s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
2275 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* Intra blocks: DC coded separately with its own VLC table. */
2281 length = s->intra_ac_vlc_length;
2282 last_length= s->intra_ac_vlc_last_length;
2283 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
2286 length = s->inter_ac_vlc_length;
2287 last_length= s->inter_ac_vlc_last_length;
/* Rate: walk coefficients in scan order counting run/level code lengths. */
2292 for(i=start_i; i<last; i++){
2293 int j= scantable[i];
/* level+64 biases into the table's index range; |level|>127 needs escape. */
2298 if((level&(~127)) == 0){
2299 bits+= length[UNI_AC_ENC_INDEX(run, level)];
/* The final (last) coefficient uses the dedicated last-run/level table. */
2308 level= temp[i] + 64;
2312 if((level&(~127)) == 0){
2313 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* Distortion: dequantize, inverse transform onto the prediction copy, SSE. */
2321 s->dct_unquantize_intra(s, temp, 0, s->qscale);
2323 s->dct_unquantize_inter(s, temp, 0, s->qscale);
2326 s->dsp.idct_add(lsrc2, 8, temp);
2328 distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
/* 109/128 ~ lambda scaling of bits by qscale^2 (rate-distortion tradeoff). */
2330 return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/* NOTE(review): sampled numbered-listing excerpt — the "23xx" tokens are
 * paste garbage; bits=0, start_i selection, the run/level bookkeeping and
 * the final return are missing from view. Code left byte-identical. */
/* bit8x8_c: rate-only comparison metric — same VLC bit counting as rd8x8_c
 * but without reconstructing the block or measuring distortion. */
2333 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
2334 MpegEncContext * const s= (MpegEncContext *)c;
2335 const uint8_t *scantable= s->intra_scantable.permutated;
2336 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
2337 int i, last, run, bits, level, start_i;
2338 const int esc_length= s->ac_esc_length;
2340 uint8_t * last_length;
2344 s->dsp.diff_pixels(temp, src1, src2, stride);
2346 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* intra: intra AC table plus luma DC cost (FIXME chroma) */
2352 length = s->intra_ac_vlc_length;
2353 last_length= s->intra_ac_vlc_last_length;
2354 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
/* inter: inter AC table, no DC term */
2357 length = s->inter_ac_vlc_length;
2358 last_length= s->inter_ac_vlc_last_length;
/* accumulate VLC bits per (run,level) pair in scan order */
2363 for(i=start_i; i<last; i++){
2364 int j= scantable[i];
2369 if((level&(~127)) == 0){
2370 bits+= length[UNI_AC_ENC_INDEX(run, level)];
2379 level= temp[i] + 64;
2383 if((level&(~127)) == 0){
2384 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* FFABS normally comes from libavutil/common.h via this file's includes;
 * the guarded fallback is byte-for-byte the same definition and only takes
 * effect when the block is compiled in isolation. */
#ifndef FFABS
#define FFABS(a) ((a) >= 0 ? (a) : (-(a)))
#endif

/**
 * Intra vertical-SAD metric: sum of |s[x] - s[x+stride]| over a size-wide,
 * h-row block, i.e. the total vertical activity of a single source block.
 * The second pixel argument (dummy) is unused; rows start at y=1 because
 * each pixel is compared with the row above it.
 * (Restores the statements and closing braces dropped by the sampled paste.)
 */
#define VSAD_INTRA(size) \
static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0;                                                                      \
    int x,y;                                                                          \
                                                                                      \
    for(y=1; y<h; y++){                                                               \
        for(x=0; x<size; x+=4){                                                       \
            score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride])     \
                   +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);    \
        }                                                                             \
        s+= stride;                                                                   \
    }                                                                                 \
                                                                                      \
    return score;                                                                     \
}
VSAD_INTRA(8)
VSAD_INTRA(16)
/* identical fallback for libavutil's FFABS when built in isolation */
#ifndef FFABS
#define FFABS(a) ((a) >= 0 ? (a) : (-(a)))
#endif

/**
 * Inter vertical-SAD metric over a 16-pixel-wide block: accumulates the
 * absolute vertical gradient of the difference (s1 - s2), i.e. how much the
 * two blocks' vertical detail disagrees. Starts at y=1 since each pixel is
 * paired with the one a row above.
 * (Restores the declarations, row-advance statements and braces dropped by
 * the sampled paste.)
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= FFABS(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}
/* squared-difference helper shared by the vsse metrics below */
#define SQ(a) ((a)*(a))

/**
 * Intra vertical-SSE metric: sum of squared differences between each pixel
 * and the pixel directly below it over a size-wide, h-row block. dummy is
 * unused; rows start at y=1 because each pixel needs the row above.
 * (Restores the statements and closing braces dropped by the sampled paste.)
 */
#define VSSE_INTRA(size) \
static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
    int score=0;                                                                      \
    int x,y;                                                                          \
                                                                                      \
    for(y=1; y<h; y++){                                                               \
        for(x=0; x<size; x+=4){                                                       \
            score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])           \
                   +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);          \
        }                                                                             \
        s+= stride;                                                                   \
    }                                                                                 \
                                                                                      \
    return score;                                                                     \
}
VSSE_INTRA(8)
VSSE_INTRA(16)
/* identical fallback for the SQ helper defined earlier in this file */
#ifndef SQ
#define SQ(a) ((a)*(a))
#endif

/**
 * Inter vertical-SSE metric over a 16-pixel-wide block: accumulates the
 * squared vertical gradient of the difference (s1 - s2).
 * (Restores the declarations, row-advance statements and braces dropped by
 * the sampled paste.)
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score=0;
    int x,y;

    for(y=1; y<h; y++){
        for(x=0; x<16; x++){
            score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
        }
        s1+= stride;
        s2+= stride;
    }

    return score;
}
/**
 * Sum of squared differences between an int8 vector and an int16 vector.
 * Used as the DSPContext.ssd_int8_vs_int16 reference implementation.
 * (Restores the size parameter, score declaration and return dropped by the
 * sampled paste.)
 *
 * @param pix1 first operand, size elements
 * @param pix2 second operand, size elements
 * @param size element count
 * @return sum over i of (pix1[i]-pix2[i])^2
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size)
{
    int score=0;
    int i;

    for(i=0; i<size; i++)
        score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);

    return score;
}
2468 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
2469 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
2470 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
2472 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
2474 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
2475 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
2476 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
2477 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
/**
 * Element-wise product: dst[i] = src0[i] * src1[i] for i in [0, len).
 * Scalar C reference for DSPContext.vector_fmul.
 * (Restores the loop-index declaration and closing brace dropped by the
 * sampled paste.)
 */
static void vector_fmul_c(float *dst, const float *src0, const float *src1, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[i];
}
/**
 * dst[i] = src0[i] * src1[len-1-i]: multiply by the second operand read
 * backwards. src1 is advanced to its last element so the negative index
 * src1[-i] walks it in reverse.
 * (Restores the `src1 += len-1;` bias and `int i;` dropped by the sampled
 * paste — without the bias the visible src1[-i] would read before the
 * buffer.)
 */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int i;
    src1 += len-1;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[-i];
}
/**
 * Fused multiply-add: dst[i] = src0[i] * src1[i] + src2[i].
 * (Restores the loop-index declaration and closing brace dropped by the
 * sampled paste.)
 */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int i;
    for(i=0; i<len; i++)
        dst[i] = src0[i] * src1[i] + src2[i];
}
/**
 * Overlap-add windowing: produces 2*len output samples in dst from len
 * samples of src0 and len samples of src1, weighted by the 2*len-tap
 * window win:
 *   dst[len+i] = s0*wj - s1*wi
 *   dst[len+j] = s0*wi + s1*wj     (i = -len..-1, j = len-1..0)
 * dst/win/src0 are biased by len so the single loop can index both halves
 * symmetrically.
 * (Restores the pointer-bias statements, local declarations and braces
 * dropped by the sampled paste — the visible loop header requires them.)
 */
static void vector_fmul_window_c(float *dst, const float *src0,
                                 const float *src1, const float *win, int len)
{
    int i,j;
    dst += len;
    win += len;
    src0+= len;
    for(i=-len, j=len-1; i<0; i++, j--) {
        float s0 = src0[i];
        float s1 = src1[j];
        float wi = win[i];
        float wj = win[j];
        dst[i] = s0*wj - s1*wi;
        dst[j] = s0*wi + s1*wj;
    }
}
/**
 * Scale a vector by a scalar: dst[i] = src[i] * mul.
 * (Restores the dropped `int len)` parameter line, index declaration and
 * closing brace from the sampled paste.)
 */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int i;
    for (i = 0; i < len; i++)
        dst[i] = src[i] * mul;
}
/**
 * Multiply src by a scalar and by short vectors of length 2: each pair of
 * outputs uses the next pointer from sv.
 *   dst[i]   = src[i]   * sv[i/2][0] * mul
 *   dst[i+1] = src[i+1] * sv[i/2][1] * mul
 * len is assumed to be a multiple of 2 (loop steps by 2).
 * (Restores the index declaration and closing braces dropped by the paste.)
 */
static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int i;
    for (i = 0; i < len; i += 2, sv++) {
        dst[i  ] = src[i  ] * sv[0][0] * mul;
        dst[i+1] = src[i+1] * sv[0][1] * mul;
    }
}
/**
 * Same as vector_fmul_sv_scalar_2_c but with length-4 short vectors: each
 * group of four outputs uses the next pointer from sv. len is assumed to be
 * a multiple of 4.
 * (Restores the index declaration and closing braces dropped by the paste.)
 */
static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int i;
    for (i = 0; i < len; i += 4, sv++) {
        dst[i  ] = src[i  ] * sv[0][0] * mul;
        dst[i+1] = src[i+1] * sv[0][1] * mul;
        dst[i+2] = src[i+2] * sv[0][2] * mul;
        dst[i+3] = src[i+3] * sv[0][3] * mul;
    }
}
/**
 * Expand length-2 short vectors scaled by mul: dst pairs are filled from
 * successive sv pointers, with no src operand. len is assumed to be a
 * multiple of 2.
 * (Restores the dropped `int len)` parameter line, index declaration and
 * closing braces from the sampled paste.)
 */
static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
                               int len)
{
    int i;
    for (i = 0; i < len; i += 2, sv++) {
        dst[i  ] = sv[0][0] * mul;
        dst[i+1] = sv[0][1] * mul;
    }
}
/**
 * Same as sv_fmul_scalar_2_c but with length-4 short vectors. len is
 * assumed to be a multiple of 4.
 * (Restores the dropped `int len)` parameter line, index declaration and
 * closing braces from the sampled paste.)
 */
static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
                               int len)
{
    int i;
    for (i = 0; i < len; i += 4, sv++) {
        dst[i  ] = sv[0][0] * mul;
        dst[i+1] = sv[0][1] * mul;
        dst[i+2] = sv[0][2] * mul;
        dst[i+3] = sv[0][3] * mul;
    }
}
/**
 * In-place butterfly: v1[i] becomes the sum, v2[i] the difference, of the
 * original pair. restrict asserts the caller passes non-overlapping arrays.
 * (Restores the dropped `int len)` parameter line, the two update
 * statements implied by the visible temporary, and closing braces.)
 */
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int i;
    for (i = 0; i < len; i++) {
        float t = v1[i] - v2[i];
        v1[i] += v2[i];
        v2[i] = t;
    }
}
/**
 * Dot product of two float vectors: sum of v1[i]*v2[i] over [0, len).
 * (Restores the accumulator declaration and return dropped by the sampled
 * paste.)
 */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float p = 0.0;
    int i;

    for (i = 0; i < len; i++)
        p += v1[i] * v2[i];

    return p;
}
/**
 * Clip one IEEE-754 float, operating on its raw 32-bit pattern.
 * Intended for the min<0<max fast path (see the caller below): mini is the
 * bit pattern of the negative bound, maxi of the positive bound, and
 * maxisign is maxi with the sign bit flipped. An unsigned compare against
 * mini catches values below min (negative floats order reversed as unsigned
 * ints); the sign-flipped compare catches values above max.
 * (Restores the final `else return a;` dropped by the sampled paste.)
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                                   uint32_t maxi, uint32_t maxisign)
{
    if(a > mini) return mini;
    else if((a^(1U<<31)) > maxisign) return maxi;
    else return a;
}
/**
 * Clip a float vector to [*min, *max] for the case *min < 0 < *max, working
 * on raw bit patterns via clipf_c_one. len is assumed to be a multiple of 8
 * (the loop is unrolled by 8).
 * NOTE(review): the float->uint32_t pointer casts type-pun through
 * incompatible types (strict-aliasing hazard); kept as-is since this is the
 * established upstream behavior — consider memcpy-based punning if this is
 * ever revisited.
 * (Restores the index declaration and closing braces dropped by the paste.)
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
    int i;
    uint32_t mini = *(uint32_t*)min;
    uint32_t maxi = *(uint32_t*)max;
    uint32_t maxisign = maxi ^ (1U<<31);
    uint32_t *dsti = (uint32_t*)dst;
    const uint32_t *srci = (const uint32_t*)src;
    for(i=0; i<len; i+=8) {
        dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
        dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
        dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
        dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
        dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
        dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
        dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
        dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
    }
}
/**
 * Clip each element of src into [min, max]. When the bounds straddle zero
 * the bit-pattern fast path is used; otherwise a plain av_clipf loop,
 * unrolled by 8 (len assumed to be a multiple of 8).
 * (Restores the `} else {` branch structure and closing braces dropped by
 * the sampled paste — without the else, the scalar loop would wrongly run
 * after the fast path.)
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
    int i;
    if(min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
    } else {
        for(i=0; i < len; i+=8) {
            dst[i    ] = av_clipf(src[i    ], min, max);
            dst[i + 1] = av_clipf(src[i + 1], min, max);
            dst[i + 2] = av_clipf(src[i + 2], min, max);
            dst[i + 3] = av_clipf(src[i + 3], min, max);
            dst[i + 4] = av_clipf(src[i + 4], min, max);
            dst[i + 5] = av_clipf(src[i + 5], min, max);
            dst[i + 6] = av_clipf(src[i + 6], min, max);
            dst[i + 7] = av_clipf(src[i + 7], min, max);
        }
    }
}
/**
 * Dot product of two int16 vectors with each partial product arithmetically
 * shifted right by shift before accumulation.
 * (Restores the accumulator, loop construct and return dropped by the
 * sampled paste.)
 */
static int32_t scalarproduct_int16_c(const int16_t * v1, const int16_t * v2, int order, int shift)
{
    int res = 0;

    while (order--)
        res += (*v1++ * *v2++) >> shift;

    return res;
}
/**
 * Combined operation: returns the dot product of v1 and v2 (using v1's
 * values *before* modification) while updating v1 in place with
 * v1[i] += mul * v3[i].
 * (Restores the accumulator, the dot-product statement paired with the
 * visible madd, and the return dropped by the sampled paste.)
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul)
{
    int res = 0;

    while (order--) {
        res   += *v1 * *v2++;
        *v1++ += mul * *v3++;
    }

    return res;
}
/* MUL16 normally comes from mathops.h via this file's includes; the guarded
 * fallback is the generic definition and only applies in isolation. */
#ifndef MUL16
#define MUL16(ra, rb) ((ra) * (rb))
#endif

/**
 * Apply a symmetric Q15 half-window to a len-sample buffer: window[i]
 * weights both input[i] and its mirror input[len-1-i], with round-to-nearest
 * (+1<<14) and >>15 rescaling. len is assumed to be even.
 * (Restores the index declaration and closing braces dropped by the paste.)
 */
static void apply_window_int16_c(int16_t *output, const int16_t *input,
                                 const int16_t *window, unsigned int len)
{
    int i;
    int len2 = len >> 1;

    for (i = 0; i < len2; i++) {
        int16_t w = window[i];
        output[i] = (MUL16(input[i], w) + (1 << 14)) >> 15;
        output[len-i-1] = (MUL16(input[len-i-1], w) + (1 << 14)) >> 15;
    }
}
/* WMV2 IDCT coefficients: Wk = round(2048*sqrt(2)*cos(k*pi/16)).
 * W0 (the plain 2048 DC scale) is referenced by wmv2_idct_row/col below but
 * was lost from this listing — restored here. */
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7  565 /* 2048*sqrt (2)*cos (7*pi/16) */
2676 static void wmv2_idct_row(short * b)
2679 int a0,a1,a2,a3,a4,a5,a6,a7;
2681 a1 = W1*b[1]+W7*b[7];
2682 a7 = W7*b[1]-W1*b[7];
2683 a5 = W5*b[5]+W3*b[3];
2684 a3 = W3*b[5]-W5*b[3];
2685 a2 = W2*b[2]+W6*b[6];
2686 a6 = W6*b[2]-W2*b[6];
2687 a0 = W0*b[0]+W0*b[4];
2688 a4 = W0*b[0]-W0*b[4];
2690 s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
2691 s2 = (181*(a1-a5-a7+a3)+128)>>8;
2693 b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
2694 b[1] = (a4+a6 +s1 + (1<<7))>>8;
2695 b[2] = (a4-a6 +s2 + (1<<7))>>8;
2696 b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
2697 b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
2698 b[5] = (a4-a6 -s2 + (1<<7))>>8;
2699 b[6] = (a4+a6 -s1 + (1<<7))>>8;
2700 b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
2702 static void wmv2_idct_col(short * b)
2705 int a0,a1,a2,a3,a4,a5,a6,a7;
2706 /*step 1, with extended precision*/
2707 a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
2708 a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
2709 a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
2710 a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
2711 a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
2712 a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
2713 a0 = (W0*b[8*0]+W0*b[8*4] )>>3;
2714 a4 = (W0*b[8*0]-W0*b[8*4] )>>3;
2716 s1 = (181*(a1-a5+a7-a3)+128)>>8;
2717 s2 = (181*(a1-a5-a7+a3)+128)>>8;
2719 b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
2720 b[8*1] = (a4+a6 +s1 + (1<<13))>>14;
2721 b[8*2] = (a4-a6 +s2 + (1<<13))>>14;
2722 b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
2724 b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
2725 b[8*5] = (a4-a6 -s2 + (1<<13))>>14;
2726 b[8*6] = (a4+a6 -s1 + (1<<13))>>14;
2727 b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
/**
 * Full 8x8 WMV2 inverse DCT: all 8 rows, then all 8 columns, in place.
 * (Restores the loop headers and braces dropped by the sampled paste — the
 * visible calls fix the strides: rows advance by 8, columns by 1.)
 */
void ff_wmv2_idct_c(short * block){
    int i;

    for(i=0;i<64;i+=8){
        wmv2_idct_row(block+i);
    }
    for(i=0;i<8;i++){
        wmv2_idct_col(block+i);
    }
}
2739 /* XXX: those functions should be suppressed ASAP when all IDCTs are
2741 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
2743 ff_wmv2_idct_c(block);
2744 ff_put_pixels_clamped_c(block, dest, line_size);
2746 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
2748 ff_wmv2_idct_c(block);
2749 ff_add_pixels_clamped_c(block, dest, line_size);
2751 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
2754 ff_put_pixels_clamped_c(block, dest, line_size);
2756 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
2759 ff_add_pixels_clamped_c(block, dest, line_size);
2762 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
2765 put_pixels_clamped4_c(block, dest, line_size);
2767 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
2770 add_pixels_clamped4_c(block, dest, line_size);
2773 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
2776 put_pixels_clamped2_c(block, dest, line_size);
2778 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
2781 add_pixels_clamped2_c(block, dest, line_size);
2784 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
2786 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2788 dest[0] = cm[(block[0] + 4)>>3];
2790 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
2792 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2794 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
2797 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
2799 /* init static data */
2800 av_cold void dsputil_static_init(void)
2804 for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
2805 for(i=0;i<MAX_NEG_CROP;i++) {
2807 ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
2810 for(i=0;i<512;i++) {
2811 ff_squareTbl[i] = (i - 256) * (i - 256);
2814 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
2817 int ff_check_alignment(void){
2818 static int did_fail=0;
2819 DECLARE_ALIGNED(16, int, aligned);
2821 if((intptr_t)&aligned & 15){
2823 #if HAVE_MMX || HAVE_ALTIVEC
2824 av_log(NULL, AV_LOG_ERROR,
2825 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
2826 "and may be very slow or crash. This is not a bug in libavcodec,\n"
2827 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
2828 "Do not report crashes to FFmpeg developers.\n");
2837 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
2841 ff_check_alignment();
2844 if(avctx->dct_algo==FF_DCT_FASTINT) {
2845 c->fdct = fdct_ifast;
2846 c->fdct248 = fdct_ifast248;
2848 else if(avctx->dct_algo==FF_DCT_FAAN) {
2849 c->fdct = ff_faandct;
2850 c->fdct248 = ff_faandct248;
2853 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
2854 c->fdct248 = ff_fdct248_islow;
2856 #endif //CONFIG_ENCODERS
2858 if(avctx->lowres==1){
2859 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
2860 c->idct_put= ff_jref_idct4_put;
2861 c->idct_add= ff_jref_idct4_add;
2863 if (avctx->codec_id != CODEC_ID_H264) {
2864 c->idct_put= ff_h264_lowres_idct_put_8_c;
2865 c->idct_add= ff_h264_lowres_idct_add_8_c;
2867 switch (avctx->bits_per_raw_sample) {
2869 c->idct_put= ff_h264_lowres_idct_put_9_c;
2870 c->idct_add= ff_h264_lowres_idct_add_9_c;
2873 c->idct_put= ff_h264_lowres_idct_put_10_c;
2874 c->idct_add= ff_h264_lowres_idct_add_10_c;
2877 c->idct_put= ff_h264_lowres_idct_put_8_c;
2878 c->idct_add= ff_h264_lowres_idct_add_8_c;
2882 c->idct = j_rev_dct4;
2883 c->idct_permutation_type= FF_NO_IDCT_PERM;
2884 }else if(avctx->lowres==2){
2885 c->idct_put= ff_jref_idct2_put;
2886 c->idct_add= ff_jref_idct2_add;
2887 c->idct = j_rev_dct2;
2888 c->idct_permutation_type= FF_NO_IDCT_PERM;
2889 }else if(avctx->lowres==3){
2890 c->idct_put= ff_jref_idct1_put;
2891 c->idct_add= ff_jref_idct1_add;
2892 c->idct = j_rev_dct1;
2893 c->idct_permutation_type= FF_NO_IDCT_PERM;
2895 if(avctx->idct_algo==FF_IDCT_INT){
2896 c->idct_put= ff_jref_idct_put;
2897 c->idct_add= ff_jref_idct_add;
2898 c->idct = j_rev_dct;
2899 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2900 }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
2901 avctx->idct_algo==FF_IDCT_VP3){
2902 c->idct_put= ff_vp3_idct_put_c;
2903 c->idct_add= ff_vp3_idct_add_c;
2904 c->idct = ff_vp3_idct_c;
2905 c->idct_permutation_type= FF_NO_IDCT_PERM;
2906 }else if(avctx->idct_algo==FF_IDCT_WMV2){
2907 c->idct_put= ff_wmv2_idct_put_c;
2908 c->idct_add= ff_wmv2_idct_add_c;
2909 c->idct = ff_wmv2_idct_c;
2910 c->idct_permutation_type= FF_NO_IDCT_PERM;
2911 }else if(avctx->idct_algo==FF_IDCT_FAAN){
2912 c->idct_put= ff_faanidct_put;
2913 c->idct_add= ff_faanidct_add;
2914 c->idct = ff_faanidct;
2915 c->idct_permutation_type= FF_NO_IDCT_PERM;
2916 }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
2917 c->idct_put= ff_ea_idct_put_c;
2918 c->idct_permutation_type= FF_NO_IDCT_PERM;
2919 }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
2920 c->idct = ff_bink_idct_c;
2921 c->idct_add = ff_bink_idct_add_c;
2922 c->idct_put = ff_bink_idct_put_c;
2923 c->idct_permutation_type = FF_NO_IDCT_PERM;
2924 }else{ //accurate/default
2925 c->idct_put= ff_simple_idct_put;
2926 c->idct_add= ff_simple_idct_add;
2927 c->idct = ff_simple_idct;
2928 c->idct_permutation_type= FF_NO_IDCT_PERM;
2932 c->get_pixels = get_pixels_c;
2933 c->diff_pixels = diff_pixels_c;
2934 c->put_pixels_clamped = ff_put_pixels_clamped_c;
2935 c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_c;
2936 c->put_pixels_nonclamped = put_pixels_nonclamped_c;
2937 c->add_pixels_clamped = ff_add_pixels_clamped_c;
2938 c->sum_abs_dctelem = sum_abs_dctelem_c;
2941 c->pix_sum = pix_sum_c;
2942 c->pix_norm1 = pix_norm1_c;
2944 c->fill_block_tab[0] = fill_block16_c;
2945 c->fill_block_tab[1] = fill_block8_c;
2946 c->scale_block = scale_block_c;
2948 /* TODO [0] 16 [1] 8 */
2949 c->pix_abs[0][0] = pix_abs16_c;
2950 c->pix_abs[0][1] = pix_abs16_x2_c;
2951 c->pix_abs[0][2] = pix_abs16_y2_c;
2952 c->pix_abs[0][3] = pix_abs16_xy2_c;
2953 c->pix_abs[1][0] = pix_abs8_c;
2954 c->pix_abs[1][1] = pix_abs8_x2_c;
2955 c->pix_abs[1][2] = pix_abs8_y2_c;
2956 c->pix_abs[1][3] = pix_abs8_xy2_c;
2958 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
2959 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
2960 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
2961 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
2962 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
2963 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
2964 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
2965 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
2966 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
2968 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
2969 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
2970 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
2971 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
2972 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
2973 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
2974 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
2975 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
2976 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
2978 #define dspfunc(PFX, IDX, NUM) \
2979 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
2980 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
2981 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
2982 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
2983 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
2984 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
2985 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
2986 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
2987 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
2988 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
2989 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
2990 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
2991 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
2992 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
2993 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
2994 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
2996 dspfunc(put_qpel, 0, 16);
2997 dspfunc(put_no_rnd_qpel, 0, 16);
2999 dspfunc(avg_qpel, 0, 16);
3000 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
3002 dspfunc(put_qpel, 1, 8);
3003 dspfunc(put_no_rnd_qpel, 1, 8);
3005 dspfunc(avg_qpel, 1, 8);
3006 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
3010 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
3011 ff_mlp_init(c, avctx);
3013 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
3014 ff_intrax8dsp_init(c,avctx);
3016 #if CONFIG_RV30_DECODER
3017 ff_rv30dsp_init(c,avctx);
3019 #if CONFIG_RV40_DECODER
3020 ff_rv40dsp_init(c,avctx);
3021 c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
3022 c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
3023 c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
3024 c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
3027 c->put_mspel_pixels_tab[0]= ff_put_pixels8x8_c;
3028 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
3029 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
3030 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
3031 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
3032 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
3033 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
3034 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
3036 #define SET_CMP_FUNC(name) \
3037 c->name[0]= name ## 16_c;\
3038 c->name[1]= name ## 8x8_c;
3040 SET_CMP_FUNC(hadamard8_diff)
3041 c->hadamard8_diff[4]= hadamard8_intra16_c;
3042 c->hadamard8_diff[5]= hadamard8_intra8x8_c;
3043 SET_CMP_FUNC(dct_sad)
3044 SET_CMP_FUNC(dct_max)
3046 SET_CMP_FUNC(dct264_sad)
3048 c->sad[0]= pix_abs16_c;
3049 c->sad[1]= pix_abs8_c;
3053 SET_CMP_FUNC(quant_psnr)
3056 c->vsad[0]= vsad16_c;
3057 c->vsad[4]= vsad_intra16_c;
3058 c->vsad[5]= vsad_intra8_c;
3059 c->vsse[0]= vsse16_c;
3060 c->vsse[4]= vsse_intra16_c;
3061 c->vsse[5]= vsse_intra8_c;
3062 c->nsse[0]= nsse16_c;
3063 c->nsse[1]= nsse8_c;
3065 ff_dsputil_init_dwt(c);
3068 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
3070 c->add_bytes= add_bytes_c;
3071 c->diff_bytes= diff_bytes_c;
3072 c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
3073 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
3074 c->add_hfyu_left_prediction = add_hfyu_left_prediction_c;
3075 c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
3076 c->bswap_buf= bswap_buf;
3077 c->bswap16_buf = bswap16_buf;
3079 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
3080 c->h263_h_loop_filter= h263_h_loop_filter_c;
3081 c->h263_v_loop_filter= h263_v_loop_filter_c;
3084 if (CONFIG_VP3_DECODER) {
3085 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
3086 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
3087 c->vp3_idct_dc_add= ff_vp3_idct_dc_add_c;
3090 c->h261_loop_filter= h261_loop_filter_c;
3092 c->try_8x8basis= try_8x8basis_c;
3093 c->add_8x8basis= add_8x8basis_c;
3095 #if CONFIG_VORBIS_DECODER
3096 c->vorbis_inverse_coupling = vorbis_inverse_coupling;
3098 #if CONFIG_AC3_DECODER
3099 c->ac3_downmix = ff_ac3_downmix_c;
3101 c->vector_fmul = vector_fmul_c;
3102 c->vector_fmul_reverse = vector_fmul_reverse_c;
3103 c->vector_fmul_add = vector_fmul_add_c;
3104 c->vector_fmul_window = vector_fmul_window_c;
3105 c->vector_clipf = vector_clipf_c;
3106 c->scalarproduct_int16 = scalarproduct_int16_c;
3107 c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
3108 c->apply_window_int16 = apply_window_int16_c;
3109 c->scalarproduct_float = scalarproduct_float_c;
3110 c->butterflies_float = butterflies_float_c;
3111 c->vector_fmul_scalar = vector_fmul_scalar_c;
3113 c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c;
3114 c->vector_fmul_sv_scalar[1] = vector_fmul_sv_scalar_4_c;
3116 c->sv_fmul_scalar[0] = sv_fmul_scalar_2_c;
3117 c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c;
3119 c->shrink[0]= av_image_copy_plane;
3120 c->shrink[1]= ff_shrink22;
3121 c->shrink[2]= ff_shrink44;
3122 c->shrink[3]= ff_shrink88;
3124 c->prefetch= just_return;
3126 memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
3127 memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
3131 #define FUNC(f, depth) f ## _ ## depth
3132 #define FUNCC(f, depth) f ## _ ## depth ## _c
3134 #define dspfunc1(PFX, IDX, NUM, depth)\
3135 c->PFX ## _pixels_tab[IDX][0] = FUNCC(PFX ## _pixels ## NUM , depth);\
3136 c->PFX ## _pixels_tab[IDX][1] = FUNCC(PFX ## _pixels ## NUM ## _x2 , depth);\
3137 c->PFX ## _pixels_tab[IDX][2] = FUNCC(PFX ## _pixels ## NUM ## _y2 , depth);\
3138 c->PFX ## _pixels_tab[IDX][3] = FUNCC(PFX ## _pixels ## NUM ## _xy2, depth)
3140 #define dspfunc2(PFX, IDX, NUM, depth)\
3141 c->PFX ## _pixels_tab[IDX][ 0] = FUNCC(PFX ## NUM ## _mc00, depth);\
3142 c->PFX ## _pixels_tab[IDX][ 1] = FUNCC(PFX ## NUM ## _mc10, depth);\
3143 c->PFX ## _pixels_tab[IDX][ 2] = FUNCC(PFX ## NUM ## _mc20, depth);\
3144 c->PFX ## _pixels_tab[IDX][ 3] = FUNCC(PFX ## NUM ## _mc30, depth);\
3145 c->PFX ## _pixels_tab[IDX][ 4] = FUNCC(PFX ## NUM ## _mc01, depth);\
3146 c->PFX ## _pixels_tab[IDX][ 5] = FUNCC(PFX ## NUM ## _mc11, depth);\
3147 c->PFX ## _pixels_tab[IDX][ 6] = FUNCC(PFX ## NUM ## _mc21, depth);\
3148 c->PFX ## _pixels_tab[IDX][ 7] = FUNCC(PFX ## NUM ## _mc31, depth);\
3149 c->PFX ## _pixels_tab[IDX][ 8] = FUNCC(PFX ## NUM ## _mc02, depth);\
3150 c->PFX ## _pixels_tab[IDX][ 9] = FUNCC(PFX ## NUM ## _mc12, depth);\
3151 c->PFX ## _pixels_tab[IDX][10] = FUNCC(PFX ## NUM ## _mc22, depth);\
3152 c->PFX ## _pixels_tab[IDX][11] = FUNCC(PFX ## NUM ## _mc32, depth);\
3153 c->PFX ## _pixels_tab[IDX][12] = FUNCC(PFX ## NUM ## _mc03, depth);\
3154 c->PFX ## _pixels_tab[IDX][13] = FUNCC(PFX ## NUM ## _mc13, depth);\
3155 c->PFX ## _pixels_tab[IDX][14] = FUNCC(PFX ## NUM ## _mc23, depth);\
3156 c->PFX ## _pixels_tab[IDX][15] = FUNCC(PFX ## NUM ## _mc33, depth)
3159 #define BIT_DEPTH_FUNCS(depth)\
3160 c->draw_edges = FUNCC(draw_edges , depth);\
3161 c->emulated_edge_mc = FUNC (ff_emulated_edge_mc , depth);\
3162 c->clear_block = FUNCC(clear_block , depth);\
3163 c->clear_blocks = FUNCC(clear_blocks , depth);\
3164 c->add_pixels8 = FUNCC(add_pixels8 , depth);\
3165 c->add_pixels4 = FUNCC(add_pixels4 , depth);\
3166 c->put_no_rnd_pixels_l2[0] = FUNCC(put_no_rnd_pixels16_l2, depth);\
3167 c->put_no_rnd_pixels_l2[1] = FUNCC(put_no_rnd_pixels8_l2 , depth);\
3169 c->put_h264_chroma_pixels_tab[0] = FUNCC(put_h264_chroma_mc8 , depth);\
3170 c->put_h264_chroma_pixels_tab[1] = FUNCC(put_h264_chroma_mc4 , depth);\
3171 c->put_h264_chroma_pixels_tab[2] = FUNCC(put_h264_chroma_mc2 , depth);\
3172 c->avg_h264_chroma_pixels_tab[0] = FUNCC(avg_h264_chroma_mc8 , depth);\
3173 c->avg_h264_chroma_pixels_tab[1] = FUNCC(avg_h264_chroma_mc4 , depth);\
3174 c->avg_h264_chroma_pixels_tab[2] = FUNCC(avg_h264_chroma_mc2 , depth);\
3176 dspfunc1(put , 0, 16, depth);\
3177 dspfunc1(put , 1, 8, depth);\
3178 dspfunc1(put , 2, 4, depth);\
3179 dspfunc1(put , 3, 2, depth);\
3180 dspfunc1(put_no_rnd, 0, 16, depth);\
3181 dspfunc1(put_no_rnd, 1, 8, depth);\
3182 dspfunc1(avg , 0, 16, depth);\
3183 dspfunc1(avg , 1, 8, depth);\
3184 dspfunc1(avg , 2, 4, depth);\
3185 dspfunc1(avg , 3, 2, depth);\
3186 dspfunc1(avg_no_rnd, 0, 16, depth);\
3187 dspfunc1(avg_no_rnd, 1, 8, depth);\
3189 dspfunc2(put_h264_qpel, 0, 16, depth);\
3190 dspfunc2(put_h264_qpel, 1, 8, depth);\
3191 dspfunc2(put_h264_qpel, 2, 4, depth);\
3192 dspfunc2(put_h264_qpel, 3, 2, depth);\
3193 dspfunc2(avg_h264_qpel, 0, 16, depth);\
3194 dspfunc2(avg_h264_qpel, 1, 8, depth);\
3195 dspfunc2(avg_h264_qpel, 2, 4, depth);
3197 if (avctx->codec_id != CODEC_ID_H264 || avctx->bits_per_raw_sample == 8) {
3200 switch (avctx->bits_per_raw_sample) {
3208 av_log(avctx, AV_LOG_DEBUG, "Unsupported bit depth: %d\n", avctx->bits_per_raw_sample);
3215 if (HAVE_MMX) dsputil_init_mmx (c, avctx);
3216 if (ARCH_ARM) dsputil_init_arm (c, avctx);
3217 if (CONFIG_MLIB) dsputil_init_mlib (c, avctx);
3218 if (HAVE_VIS) dsputil_init_vis (c, avctx);
3219 if (ARCH_ALPHA) dsputil_init_alpha (c, avctx);
3220 if (ARCH_PPC) dsputil_init_ppc (c, avctx);
3221 if (HAVE_MMI) dsputil_init_mmi (c, avctx);
3222 if (ARCH_SH4) dsputil_init_sh4 (c, avctx);
3223 if (ARCH_BFIN) dsputil_init_bfin (c, avctx);
3225 for(i=0; i<64; i++){
3226 if(!c->put_2tap_qpel_pixels_tab[0][i])
3227 c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
3228 if(!c->avg_2tap_qpel_pixels_tab[0][i])
3229 c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
3232 c->put_rv30_tpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
3233 c->put_rv30_tpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
3234 c->avg_rv30_tpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
3235 c->avg_rv30_tpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
3237 c->put_rv40_qpel_pixels_tab[0][0] = c->put_h264_qpel_pixels_tab[0][0];
3238 c->put_rv40_qpel_pixels_tab[1][0] = c->put_h264_qpel_pixels_tab[1][0];
3239 c->avg_rv40_qpel_pixels_tab[0][0] = c->avg_h264_qpel_pixels_tab[0][0];
3240 c->avg_rv40_qpel_pixels_tab[1][0] = c->avg_h264_qpel_pixels_tab[1][0];
3242 switch(c->idct_permutation_type){
3243 case FF_NO_IDCT_PERM:
3245 c->idct_permutation[i]= i;
3247 case FF_LIBMPEG2_IDCT_PERM:
3249 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
3251 case FF_SIMPLE_IDCT_PERM:
3253 c->idct_permutation[i]= simple_mmx_permutation[i];
3255 case FF_TRANSPOSE_IDCT_PERM:
3257 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
3259 case FF_PARTTRANS_IDCT_PERM:
3261 c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
3263 case FF_SSE2_IDCT_PERM:
3265 c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
3268 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");