/*
 *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license and patent
 *  grant that can be found in the LICENSE file in the root of the source
 *  tree. All contributing project authors may be found in the AUTHORS
 *  file in the root of the source tree.
 */
13 #include "vpx_ports/mem.h"
15 extern void filter_block1d_h6_mmx
17 unsigned char *src_ptr,
18 unsigned short *output_ptr,
19 unsigned int src_pixels_per_line,
20 unsigned int pixel_step,
21 unsigned int output_height,
22 unsigned int output_width,
25 extern void filter_block1d_v6_mmx
28 unsigned char *output_ptr,
29 unsigned int pixels_per_line,
30 unsigned int pixel_step,
31 unsigned int output_height,
32 unsigned int output_width,
36 extern unsigned int vp8_get_mb_ss_mmx(short *src_ptr);
37 extern unsigned int vp8_get8x8var_mmx
39 unsigned char *src_ptr,
41 unsigned char *ref_ptr,
46 extern unsigned int vp8_get4x4var_mmx
48 unsigned char *src_ptr,
50 unsigned char *ref_ptr,
55 extern unsigned int vp8_get4x4sse_cs_mmx
57 unsigned char *src_ptr,
59 unsigned char *ref_ptr,
62 extern void vp8_filter_block2d_bil4x4_var_mmx
64 unsigned char *ref_ptr,
65 int ref_pixels_per_line,
66 unsigned char *src_ptr,
67 int src_pixels_per_line,
71 unsigned int *sumsquared
73 extern void vp8_filter_block2d_bil_var_mmx
75 unsigned char *ref_ptr,
76 int ref_pixels_per_line,
77 unsigned char *src_ptr,
78 int src_pixels_per_line,
83 unsigned int *sumsquared
85 extern unsigned int vp8_get16x16pred_error_mmx
87 unsigned char *src_ptr,
89 unsigned char *ref_ptr,
94 void vp8_test_get_mb_ss(void)
98 -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
99 -2, -2, -2, -2, 2, 2, 2, 2, -2, -2, -2, -2, 2, 2, 2, 2,
100 -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
101 -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
102 -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
103 -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
104 -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
105 -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
106 -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
107 -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
108 -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
109 -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
110 -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
111 -3, -3, -3, -3, 3, 3, 3, 3, -3, -3, -3, -3, 3, 3, 3, 3,
112 -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
113 -4, -4, -4, -4, 4, 4, 4, 4, -4, -4, -4, -4, 4, 4, 4, 4,
115 int s = 0, x = vp8_get_mb_ss_mmx(zz);
119 for (y = 0; y < 256; y++)
120 s += (zz[y] * zz[y]);
127 unsigned int vp8_get16x16var_mmx(
128 unsigned char *src_ptr,
130 unsigned char *ref_ptr,
136 unsigned int sse0, sse1, sse2, sse3, var;
137 int sum0, sum1, sum2, sum3, avg;
140 vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
141 vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
142 vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ;
143 vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
145 var = sse0 + sse1 + sse2 + sse3;
146 avg = sum0 + sum1 + sum2 + sum3;
150 return (var - ((avg * avg) >> 8));
158 unsigned int vp8_variance4x4_mmx(
159 unsigned char *src_ptr,
161 unsigned char *ref_ptr,
168 vp8_get4x4var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
170 return (var - ((avg * avg) >> 4));
174 unsigned int vp8_variance8x8_mmx(
175 unsigned char *src_ptr,
177 unsigned char *ref_ptr,
184 vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &var, &avg) ;
187 return (var - ((avg * avg) >> 6));
191 unsigned int vp8_mse16x16_mmx(
192 unsigned char *src_ptr,
194 unsigned char *ref_ptr,
198 unsigned int sse0, sse1, sse2, sse3, var;
199 int sum0, sum1, sum2, sum3;
202 vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
203 vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
204 vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ;
205 vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
207 var = sse0 + sse1 + sse2 + sse3;
213 unsigned int vp8_variance16x16_mmx(
214 unsigned char *src_ptr,
216 unsigned char *ref_ptr,
220 unsigned int sse0, sse1, sse2, sse3, var;
221 int sum0, sum1, sum2, sum3, avg;
224 vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
225 vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
226 vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ;
227 vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
229 var = sse0 + sse1 + sse2 + sse3;
230 avg = sum0 + sum1 + sum2 + sum3;
232 return (var - ((avg * avg) >> 8));
235 unsigned int vp8_variance16x8_mmx(
236 unsigned char *src_ptr,
238 unsigned char *ref_ptr,
242 unsigned int sse0, sse1, var;
245 vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
246 vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
251 return (var - ((avg * avg) >> 7));
256 unsigned int vp8_variance8x16_mmx(
257 unsigned char *src_ptr,
259 unsigned char *ref_ptr,
263 unsigned int sse0, sse1, var;
266 vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
267 vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse1, &sum1) ;
273 return (var - ((avg * avg) >> 7));
280 ///////////////////////////////////////////////////////////////////////////
281 // the mmx function that does the bilinear filtering and var calculation //
283 ///////////////////////////////////////////////////////////////////////////
284 DECLARE_ALIGNED(16, const short, vp8_vp7_bilinear_filters_mmx[8][8]) =
286 { 128, 128, 128, 128, 0, 0, 0, 0 },
287 { 112, 112, 112, 112, 16, 16, 16, 16 },
288 { 96, 96, 96, 96, 32, 32, 32, 32 },
289 { 80, 80, 80, 80, 48, 48, 48, 48 },
290 { 64, 64, 64, 64, 64, 64, 64, 64 },
291 { 48, 48, 48, 48, 80, 80, 80, 80 },
292 { 32, 32, 32, 32, 96, 96, 96, 96 },
293 { 16, 16, 16, 16, 112, 112, 112, 112 }
296 unsigned int vp8_sub_pixel_variance4x4_mmx
298 unsigned char *src_ptr,
299 int src_pixels_per_line,
302 unsigned char *dst_ptr,
303 int dst_pixels_per_line,
309 vp8_filter_block2d_bil4x4_var_mmx(
310 src_ptr, src_pixels_per_line,
311 dst_ptr, dst_pixels_per_line,
312 vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
316 return (xxsum - ((xsum * xsum) >> 4));
320 unsigned int vp8_sub_pixel_variance8x8_mmx
322 unsigned char *src_ptr,
323 int src_pixels_per_line,
326 unsigned char *dst_ptr,
327 int dst_pixels_per_line,
334 vp8_filter_block2d_bil_var_mmx(
335 src_ptr, src_pixels_per_line,
336 dst_ptr, dst_pixels_per_line, 8,
337 vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
341 return (xxsum - ((xsum * xsum) >> 6));
344 unsigned int vp8_sub_pixel_variance16x16_mmx
346 unsigned char *src_ptr,
347 int src_pixels_per_line,
350 unsigned char *dst_ptr,
351 int dst_pixels_per_line,
357 unsigned int xxsum0, xxsum1;
360 vp8_filter_block2d_bil_var_mmx(
361 src_ptr, src_pixels_per_line,
362 dst_ptr, dst_pixels_per_line, 16,
363 vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
368 vp8_filter_block2d_bil_var_mmx(
369 src_ptr + 8, src_pixels_per_line,
370 dst_ptr + 8, dst_pixels_per_line, 16,
371 vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
379 return (xxsum0 - ((xsum0 * xsum0) >> 8));
384 unsigned int vp8_sub_pixel_mse16x16_mmx(
385 unsigned char *src_ptr,
386 int src_pixels_per_line,
389 unsigned char *dst_ptr,
390 int dst_pixels_per_line,
394 vp8_sub_pixel_variance16x16_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
398 unsigned int vp8_sub_pixel_variance16x8_mmx
400 unsigned char *src_ptr,
401 int src_pixels_per_line,
404 unsigned char *dst_ptr,
405 int dst_pixels_per_line,
410 unsigned int xxsum0, xxsum1;
413 vp8_filter_block2d_bil_var_mmx(
414 src_ptr, src_pixels_per_line,
415 dst_ptr, dst_pixels_per_line, 8,
416 vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
421 vp8_filter_block2d_bil_var_mmx(
422 src_ptr + 8, src_pixels_per_line,
423 dst_ptr + 8, dst_pixels_per_line, 8,
424 vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
432 return (xxsum0 - ((xsum0 * xsum0) >> 7));
435 unsigned int vp8_sub_pixel_variance8x16_mmx
437 unsigned char *src_ptr,
438 int src_pixels_per_line,
441 unsigned char *dst_ptr,
442 int dst_pixels_per_line,
448 vp8_filter_block2d_bil_var_mmx(
449 src_ptr, src_pixels_per_line,
450 dst_ptr, dst_pixels_per_line, 16,
451 vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
455 return (xxsum - ((xsum * xsum) >> 7));
458 unsigned int vp8_i_variance16x16_mmx(
459 unsigned char *src_ptr,
461 unsigned char *ref_ptr,
465 unsigned int sse0, sse1, sse2, sse3, var;
466 int sum0, sum1, sum2, sum3, avg;
469 vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
470 vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
471 vp8_get8x8var_mmx(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse2, &sum2) ;
472 vp8_get8x8var_mmx(src_ptr + (source_stride >> 1) + 8, source_stride, ref_ptr + (recon_stride >> 1) + 8, recon_stride, &sse3, &sum3);
474 var = sse0 + sse1 + sse2 + sse3;
475 avg = sum0 + sum1 + sum2 + sum3;
477 return (var - ((avg * avg) >> 8));
481 unsigned int vp8_i_variance8x16_mmx(
482 unsigned char *src_ptr,
484 unsigned char *ref_ptr,
488 unsigned int sse0, sse1, var;
490 vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
491 vp8_get8x8var_mmx(src_ptr + (source_stride >> 1), source_stride, ref_ptr + (recon_stride >> 1), recon_stride, &sse1, &sum1) ;
497 return (var - ((avg * avg) >> 7));
501 unsigned int vp8_i_sub_pixel_variance16x16_mmx
503 unsigned char *src_ptr,
504 int src_pixels_per_line,
507 unsigned char *dst_ptr,
508 int dst_pixels_per_line,
513 unsigned int xxsum0, xxsum1;
514 int f2soffset = (src_pixels_per_line >> 1);
515 int f2doffset = (dst_pixels_per_line >> 1);
518 vp8_filter_block2d_bil_var_mmx(
519 src_ptr, src_pixels_per_line,
520 dst_ptr, dst_pixels_per_line, 8,
521 vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
526 vp8_filter_block2d_bil_var_mmx(
527 src_ptr + 8, src_pixels_per_line,
528 dst_ptr + 8, dst_pixels_per_line, 8,
529 vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
536 vp8_filter_block2d_bil_var_mmx(
537 src_ptr + f2soffset, src_pixels_per_line,
538 dst_ptr + f2doffset, dst_pixels_per_line, 8,
539 vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
546 vp8_filter_block2d_bil_var_mmx(
547 src_ptr + f2soffset + 8, src_pixels_per_line,
548 dst_ptr + f2doffset + 8, dst_pixels_per_line, 8,
549 vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
556 return (xxsum0 - ((xsum0 * xsum0) >> 8));
560 unsigned int vp8_i_sub_pixel_variance8x16_mmx
562 unsigned char *src_ptr,
563 int src_pixels_per_line,
566 unsigned char *dst_ptr,
567 int dst_pixels_per_line,
572 unsigned int xxsum0, xxsum1;
573 int f2soffset = (src_pixels_per_line >> 1);
574 int f2doffset = (dst_pixels_per_line >> 1);
577 vp8_filter_block2d_bil_var_mmx(
578 src_ptr, src_pixels_per_line,
579 dst_ptr, dst_pixels_per_line, 8,
580 vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
585 vp8_filter_block2d_bil_var_mmx(
586 src_ptr + f2soffset, src_pixels_per_line,
587 dst_ptr + f2doffset, dst_pixels_per_line, 8,
588 vp8_vp7_bilinear_filters_mmx[xoffset], vp8_vp7_bilinear_filters_mmx[yoffset],
595 return (xxsum0 - ((xsum0 * xsum0) >> 7));