/*
 *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS. All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include "./vpx_config.h"

#include "vp9/encoder/vp9_variance.h"
#include "vp9/common/vp9_pragmas.h"
#include "vpx_ports/mem.h"

typedef void (*get_var_avx2)(const unsigned char *src_ptr, int source_stride,
                             const unsigned char *ref_ptr, int recon_stride,
                             unsigned int *sse, int *sum);

void vp9_get16x16var_avx2(const unsigned char *src_ptr, int source_stride,
                          const unsigned char *ref_ptr, int recon_stride,
                          unsigned int *sse, int *sum);

void vp9_get32x32var_avx2(const unsigned char *src_ptr, int source_stride,
                          const unsigned char *ref_ptr, int recon_stride,
                          unsigned int *sse, int *sum);

unsigned int vp9_sub_pixel_variance32xh_avx2(
    const uint8_t *src, int src_stride, int x_offset, int y_offset,
    const uint8_t *dst, int dst_stride, int height, unsigned int *sse);

unsigned int vp9_sub_pixel_avg_variance32xh_avx2(
    const uint8_t *src, int src_stride, int x_offset, int y_offset,
    const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride,
    int height, unsigned int *sse);

static void variance_avx2(const unsigned char *src_ptr, int source_stride,
                          const unsigned char *ref_ptr, int recon_stride,
                          int w, int h, unsigned int *sse, int *sum,
                          get_var_avx2 var_fn, int block_size) {
  unsigned int sse0;
  int sum0, i, j;

  *sse = 0;
  *sum = 0;
  for (i = 0; i < h; i += 16) {
    for (j = 0; j < w; j += block_size) {
      // processing 16 rows horizontally each call
      var_fn(src_ptr + source_stride * i + j, source_stride,
             ref_ptr + recon_stride * i + j, recon_stride, &sse0, &sum0);
      *sse += sse0;
      *sum += sum0;
    }
  }
}

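/*
 * Illustrative note, added for clarity (not part of the original libvpx
 * source): variance_avx2() walks the w x h block in 16-row strips of
 * `block_size` columns and lets the AVX2 kernels accumulate the sum of
 * squared differences (sse) and the sum of differences (sum). The callers
 * below then form
 *
 *   variance = sse - sum^2 / N,  N = w * h,
 *
 * with the division done as a right shift: >> 8 for 16x16 (N = 256),
 * >> 10 for 32x32 (N = 1024), >> 12 for 64x64 (N = 4096), and so on.
 * A minimal scalar sketch of what each var_fn call accumulates (the helper
 * name get_var_c is hypothetical, for illustration only):
 *
 *   static void get_var_c(const uint8_t *src, int src_stride,
 *                         const uint8_t *ref, int ref_stride,
 *                         int w, int h, unsigned int *sse, int *sum) {
 *     int i, j;
 *     *sse = 0;
 *     *sum = 0;
 *     for (i = 0; i < h; ++i) {
 *       for (j = 0; j < w; ++j) {
 *         const int diff = src[i * src_stride + j] - ref[i * ref_stride + j];
 *         *sum += diff;
 *         *sse += diff * diff;
 *       }
 *     }
 *   }
 */
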
unsigned int vp9_variance16x16_avx2(const unsigned char *src_ptr,
                                    int source_stride,
                                    const unsigned char *ref_ptr,
                                    int recon_stride, unsigned int *sse) {
  unsigned int var;
  int avg;

  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16,
                &var, &avg, vp9_get16x16var_avx2, 16);
  *sse = var;
  return (var - (((unsigned int)avg * avg) >> 8));
}

unsigned int vp9_mse16x16_avx2(const unsigned char *src_ptr, int source_stride,
                               const unsigned char *ref_ptr, int recon_stride,
                               unsigned int *sse) {
  unsigned int sse0;
  int sum0;

  vp9_get16x16var_avx2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
                       &sum0);
  *sse = sse0;
  return sse0;
}

unsigned int vp9_variance32x32_avx2(const uint8_t *src_ptr, int source_stride,
                                    const uint8_t *ref_ptr, int recon_stride,
                                    unsigned int *sse) {
  unsigned int var;
  int avg;

  // processing 32 elements vertically in parallel
  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32,
                &var, &avg, vp9_get32x32var_avx2, 32);
  *sse = var;
  return (var - (((int64_t)avg * avg) >> 10));
}

unsigned int vp9_variance32x16_avx2(const uint8_t *src_ptr, int source_stride,
                                    const uint8_t *ref_ptr, int recon_stride,
                                    unsigned int *sse) {
  unsigned int var;
  int avg;

  // processing 32 elements vertically in parallel
  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 16,
                &var, &avg, vp9_get32x32var_avx2, 32);
  *sse = var;
  return (var - (((int64_t)avg * avg) >> 9));
}

unsigned int vp9_variance64x64_avx2(const uint8_t *src_ptr, int source_stride,
                                    const uint8_t *ref_ptr, int recon_stride,
                                    unsigned int *sse) {
  unsigned int var;
  int avg;

  // processing 32 elements vertically in parallel
  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 64,
                &var, &avg, vp9_get32x32var_avx2, 32);
  *sse = var;
  return (var - (((int64_t)avg * avg) >> 12));
}

unsigned int vp9_variance64x32_avx2(const uint8_t *src_ptr, int source_stride,
                                    const uint8_t *ref_ptr, int recon_stride,
                                    unsigned int *sse) {
  unsigned int var;
  int avg;

  // processing 32 elements vertically in parallel
  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 32,
                &var, &avg, vp9_get32x32var_avx2, 32);
  *sse = var;
  return (var - (((int64_t)avg * avg) >> 11));
}

unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src,
                                              int src_stride, int x_offset,
                                              int y_offset, const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse_ptr) {
  unsigned int sse, sse2;
  // processing 32 elements in parallel
  int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
                                           y_offset, dst, dst_stride,
                                           64, &sse);
  // processing the next 32 elements in parallel
  int se2 = vp9_sub_pixel_variance32xh_avx2(src + 32, src_stride,
                                            x_offset, y_offset,
                                            dst + 32, dst_stride,
                                            64, &sse2);
  se += se2;
  sse += sse2;
  *sse_ptr = sse;
  return sse - (((int64_t)se * se) >> 12);
}

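/*
 * Note, added for clarity (not in the original source): the 64-wide sub-pixel
 * case above filters the left and right 32-column halves separately and then
 * combines the partial results, i.e. with N = 64 * 64 = 4096
 *
 *   variance = (sse + sse2) - ((se + se2)^2 >> 12).
 *
 * The 32x32 variant below needs only a single 32xh pass and shifts by 10
 * (N = 1024).
 */
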
unsigned int vp9_sub_pixel_variance32x32_avx2(const uint8_t *src,
                                              int src_stride, int x_offset,
                                              int y_offset, const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse_ptr) {
  unsigned int sse;
  // processing 32 elements in parallel
  int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
                                           y_offset, dst, dst_stride,
                                           32, &sse);
  *sse_ptr = sse;
  return sse - (((int64_t)se * se) >> 10);
}

unsigned int vp9_sub_pixel_avg_variance64x64_avx2(const uint8_t *src,
                                                  int src_stride, int x_offset,
                                                  int y_offset,
                                                  const uint8_t *dst,
                                                  int dst_stride,
                                                  unsigned int *sseptr,
                                                  const uint8_t *sec) {
  unsigned int sse, sse2;
  // processing 32 elements in parallel
  int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
                                               y_offset, dst, dst_stride,
                                               sec, 64, 64, &sse);
  // processing the next 32 elements in parallel
  int se2 = vp9_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride, x_offset,
                                                y_offset, dst + 32, dst_stride,
                                                sec + 32, 64, 64, &sse2);
  se += se2;
  sse += sse2;
  *sseptr = sse;
  return sse - (((int64_t)se * se) >> 12);
}

unsigned int vp9_sub_pixel_avg_variance32x32_avx2(const uint8_t *src,
                                                  int src_stride, int x_offset,
                                                  int y_offset,
                                                  const uint8_t *dst,
                                                  int dst_stride,
                                                  unsigned int *sseptr,
                                                  const uint8_t *sec) {
  unsigned int sse;
  // processing 32 elements in parallel
  int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
                                               y_offset, dst, dst_stride,
                                               sec, 32, 32, &sse);
  *sseptr = sse;
  return sse - (((int64_t)se * se) >> 10);
}