/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
#include "vp8/encoder/variance.h"
#include "vp8/common/pragmas.h"
#include "vpx_ports/mem.h"
16 extern void filter_block1d_h6_mmx
18 const unsigned char *src_ptr,
19 unsigned short *output_ptr,
20 unsigned int src_pixels_per_line,
21 unsigned int pixel_step,
22 unsigned int output_height,
23 unsigned int output_width,
26 extern void filter_block1d_v6_mmx
29 unsigned char *output_ptr,
30 unsigned int pixels_per_line,
31 unsigned int pixel_step,
32 unsigned int output_height,
33 unsigned int output_width,
37 extern unsigned int vp8_get_mb_ss_mmx(short *src_ptr);
38 extern unsigned int vp8_get8x8var_mmx
40 const unsigned char *src_ptr,
42 const unsigned char *ref_ptr,
47 extern unsigned int vp8_get4x4var_mmx
49 const unsigned char *src_ptr,
51 const unsigned char *ref_ptr,
56 extern unsigned int vp8_get4x4sse_cs_mmx
58 const unsigned char *src_ptr,
60 const unsigned char *ref_ptr,
63 extern void vp8_filter_block2d_bil4x4_var_mmx
65 const unsigned char *ref_ptr,
66 int ref_pixels_per_line,
67 const unsigned char *src_ptr,
68 int src_pixels_per_line,
72 unsigned int *sumsquared
74 extern void vp8_filter_block2d_bil_var_mmx
76 const unsigned char *ref_ptr,
77 int ref_pixels_per_line,
78 const unsigned char *src_ptr,
79 int src_pixels_per_line,
84 unsigned int *sumsquared
86 extern unsigned int vp8_get16x16pred_error_mmx
88 unsigned char *src_ptr,
90 unsigned char *ref_ptr,
95 void vp8_test_get_mb_ss(void)
99 -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
100 -2, -2, -2, -2, 2, 2, 2, 2, -2, -2, -2, -2, 2, 2, 2, 2,
101 -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
102 -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
103 -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
104 -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
105 -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
106 -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
107 -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
108 -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
109 -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
110 -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
111 -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
112 -3, -3, -3, -3, 3, 3, 3, 3, -3, -3, -3, -3, 3, 3, 3, 3,
113 -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
114 -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
116 int s = 0, x = vp8_get_mb_ss_mmx(zz);
120 for (y = 0; y < 256; y++)
121 s += (zz[y] * zz[y]);
128 unsigned int vp8_get16x16var_mmx(
129 const unsigned char *src_ptr,
131 const unsigned char *ref_ptr,
137 unsigned int sse0, sse1, sse2, sse3, var;
138 int sum0, sum1, sum2, sum3, avg;
141 vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
142 vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
143 vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ;
144 vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
146 var = sse0 + sse1 + sse2 + sse3;
147 avg = sum0 + sum1 + sum2 + sum3;
151 return (var - ((avg * avg) >> 8));
159 unsigned int vp8_variance4x4_mmx(
160 const unsigned char *src_ptr,
162 const unsigned char *ref_ptr,
169 vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
171 return (var - ((avg * avg) >> 4));
175 unsigned int vp8_variance8x8_mmx(
176 const unsigned char *src_ptr,
178 const unsigned char *ref_ptr,
185 vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
188 return (var - ((avg * avg) >> 6));
192 unsigned int vp8_mse16x16_mmx(
193 const unsigned char *src_ptr,
195 const unsigned char *ref_ptr,
199 unsigned int sse0, sse1, sse2, sse3, var;
200 int sum0, sum1, sum2, sum3;
203 vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
204 vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
205 vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ;
206 vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
208 var = sse0 + sse1 + sse2 + sse3;
214 unsigned int vp8_variance16x16_mmx(
215 const unsigned char *src_ptr,
217 const unsigned char *ref_ptr,
221 unsigned int sse0, sse1, sse2, sse3, var;
222 int sum0, sum1, sum2, sum3, avg;
225 vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
226 vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
227 vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ;
228 vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
230 var = sse0 + sse1 + sse2 + sse3;
231 avg = sum0 + sum1 + sum2 + sum3;
233 return (var - ((avg * avg) >> 8));
236 unsigned int vp8_variance16x8_mmx(
237 const unsigned char *src_ptr,
239 const unsigned char *ref_ptr,
243 unsigned int sse0, sse1, var;
246 vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
247 vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
252 return (var - ((avg * avg) >> 7));
257 unsigned int vp8_variance8x16_mmx(
258 const unsigned char *src_ptr,
260 const unsigned char *ref_ptr,
264 unsigned int sse0, sse1, var;
267 vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
268 vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1) ;
274 return (var - ((avg * avg) >> 7));
281 ///////////////////////////////////////////////////////////////////////////
282 // the mmx function that does the bilinear filtering and var calculation //
284 ///////////////////////////////////////////////////////////////////////////
285 DECLARE_ALIGNED(16, const short, vp8_vp7_bilinear_filters_mmx[8][8]) =
287 { 128, 128, 128, 128, 0, 0, 0, 0 },
288 { 112, 112, 112, 112, 16, 16, 16, 16 },
289 { 96, 96, 96, 96, 32, 32, 32, 32 },
290 { 80, 80, 80, 80, 48, 48, 48, 48 },
291 { 64, 64, 64, 64, 64, 64, 64, 64 },
292 { 48, 48, 48, 48, 80, 80, 80, 80 },
293 { 32, 32, 32, 32, 96, 96, 96, 96 },
294 { 16, 16, 16, 16, 112, 112, 112, 112 }
297 unsigned int vp8_sub_pixel_variance4x4_mmx
299 const unsigned char *src_ptr,
300 int src_pixels_per_line,
303 const unsigned char *dst_ptr,
304 int dst_pixels_per_line,
310 vp8_filter_block2d_bil4x4_var_mmx(
311 src_ptr, src_pixels_per_line,
312 dst_ptr, dst_pixels_per_line,
313 vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
317 return (xxsum - ((xsum * xsum) >> 4));
321 unsigned int vp8_sub_pixel_variance8x8_mmx
323 const unsigned char *src_ptr,
324 int src_pixels_per_line,
327 const unsigned char *dst_ptr,
328 int dst_pixels_per_line,
335 vp8_filter_block2d_bil_var_mmx(
336 src_ptr, src_pixels_per_line,
337 dst_ptr, dst_pixels_per_line, 8,
338 vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
342 return (xxsum - ((xsum * xsum) >> 6));
345 unsigned int vp8_sub_pixel_variance16x16_mmx
347 const unsigned char *src_ptr,
348 int src_pixels_per_line,
351 const unsigned char *dst_ptr,
352 int dst_pixels_per_line,
358 unsigned int xxsum0, xxsum1;
361 vp8_filter_block2d_bil_var_mmx(
362 src_ptr, src_pixels_per_line,
363 dst_ptr, dst_pixels_per_line, 16,
364 vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
369 vp8_filter_block2d_bil_var_mmx(
370 src_ptr + 8, src_pixels_per_line,
371 dst_ptr + 8, dst_pixels_per_line, 16,
372 vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
380 return (xxsum0 - ((xsum0 * xsum0) >> 8));
385 unsigned int vp8_sub_pixel_mse16x16_mmx(
386 const unsigned char *src_ptr,
387 int src_pixels_per_line,
390 const unsigned char *dst_ptr,
391 int dst_pixels_per_line,
395 vp8_sub_pixel_variance16x16_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
399 unsigned int vp8_sub_pixel_variance16x8_mmx
401 const unsigned char *src_ptr,
402 int src_pixels_per_line,
405 const unsigned char *dst_ptr,
406 int dst_pixels_per_line,
411 unsigned int xxsum0, xxsum1;
414 vp8_filter_block2d_bil_var_mmx(
415 src_ptr, src_pixels_per_line,
416 dst_ptr, dst_pixels_per_line, 8,
417 vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
422 vp8_filter_block2d_bil_var_mmx(
423 src_ptr + 8, src_pixels_per_line,
424 dst_ptr + 8, dst_pixels_per_line, 8,
425 vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
433 return (xxsum0 - ((xsum0 * xsum0) >> 7));
436 unsigned int vp8_sub_pixel_variance8x16_mmx
438 const unsigned char *src_ptr,
439 int src_pixels_per_line,
442 const unsigned char *dst_ptr,
443 int dst_pixels_per_line,
449 vp8_filter_block2d_bil_var_mmx(
450 src_ptr, src_pixels_per_line,
451 dst_ptr, dst_pixels_per_line, 16,
452 vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
456 return (xxsum - ((xsum * xsum) >> 7));
459 unsigned int vp8_i_variance16x16_mmx(
460 const unsigned char *src_ptr,
462 const unsigned char *ref_ptr,
466 unsigned int sse0, sse1, sse2, sse3, var;
467 int sum0, sum1, sum2, sum3, avg;
470 vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
471 vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
472 vp8_get8x8var_mmx(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse2, &sum2) ;
473 vp8_get8x8var_mmx(src_ptr + (source_stride >> 1) + 8, source_stride, ref_ptr + (recon_stride >> 1) + 8, recon_stride, &sse3, &sum3);
475 var = sse0 + sse1 + sse2 + sse3;
476 avg = sum0 + sum1 + sum2 + sum3;
478 return (var - ((avg * avg) >> 8));
482 unsigned int vp8_i_variance8x16_mmx(
483 const unsigned char *src_ptr,
485 const unsigned char *ref_ptr,
489 unsigned int sse0, sse1, var;
491 vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
492 vp8_get8x8var_mmx(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse1, &sum1) ;
498 return (var - ((avg * avg) >> 7));
502 unsigned int vp8_i_sub_pixel_variance16x16_mmx
504 const unsigned char *src_ptr,
505 int src_pixels_per_line,
508 const unsigned char *dst_ptr,
509 int dst_pixels_per_line,
514 unsigned int xxsum0, xxsum1;
515 int f2soffset = (src_pixels_per_line >> 1);
516 int f2doffset = (dst_pixels_per_line >> 1);
519 vp8_filter_block2d_bil_var_mmx(
520 src_ptr, src_pixels_per_line,
521 dst_ptr, dst_pixels_per_line, 8,
522 vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
527 vp8_filter_block2d_bil_var_mmx(
528 src_ptr + 8, src_pixels_per_line,
529 dst_ptr + 8, dst_pixels_per_line, 8,
530 vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
537 vp8_filter_block2d_bil_var_mmx(
538 src_ptr + f2soffset, src_pixels_per_line,
539 dst_ptr + f2doffset, dst_pixels_per_line, 8,
540 vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
547 vp8_filter_block2d_bil_var_mmx(
548 src_ptr + f2soffset + 8, src_pixels_per_line,
549 dst_ptr + f2doffset + 8, dst_pixels_per_line, 8,
550 vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
557 return (xxsum0 - ((xsum0 * xsum0) >> 8));
561 unsigned int vp8_i_sub_pixel_variance8x16_mmx
563 const unsigned char *src_ptr,
564 int src_pixels_per_line,
567 const unsigned char *dst_ptr,
568 int dst_pixels_per_line,
573 unsigned int xxsum0, xxsum1;
574 int f2soffset = (src_pixels_per_line >> 1);
575 int f2doffset = (dst_pixels_per_line >> 1);
578 vp8_filter_block2d_bil_var_mmx(
579 src_ptr, src_pixels_per_line,
580 dst_ptr, dst_pixels_per_line, 8,
581 vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
586 vp8_filter_block2d_bil_var_mmx(
587 src_ptr + f2soffset, src_pixels_per_line,
588 dst_ptr + f2doffset, dst_pixels_per_line, 8,
589 vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
596 return (xxsum0 - ((xsum0 * xsum0) >> 7));
600 unsigned int vp8_variance_halfpixvar16x16_h_mmx(
601 const unsigned char *src_ptr,
603 const unsigned char *ref_ptr,
607 return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 4, 0,
608 ref_ptr, recon_stride, sse);
612 unsigned int vp8_variance_halfpixvar16x16_v_mmx(
613 const unsigned char *src_ptr,
615 const unsigned char *ref_ptr,
619 return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 0, 4,
620 ref_ptr, recon_stride, sse);
624 unsigned int vp8_variance_halfpixvar16x16_hv_mmx(
625 const unsigned char *src_ptr,
627 const unsigned char *ref_ptr,
631 return vp8_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 4, 4,
632 ref_ptr, recon_stride, sse);