 * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 *
 * Use of this source code is governed by a BSD-style license
 * that can be found in the LICENSE file in the root of the source
 * tree. An additional intellectual property rights grant can be found
 * in the file PATENTS. All contributing project authors may
 * be found in the AUTHORS file in the root of the source tree.
11 #include <immintrin.h> // AVX2
12 #include "vpx_ports/mem.h"
13 #include "vp9/encoder/vp9_variance.h"
15 DECLARE_ALIGNED(32, static const uint8_t, bilinear_filters_avx2[512]) = {
16 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
17 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0, 16, 0,
18 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1,
19 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1,
20 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
21 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2,
22 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3,
23 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3,
24 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
25 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4,
26 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5,
27 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5,
28 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
29 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6,
30 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7,
31 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7,
32 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
33 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
34 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9,
35 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9, 7, 9,
36 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
37 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10, 6, 10,
38 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11,
39 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11, 5, 11,
40 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
41 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12, 4, 12,
42 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13,
43 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13, 3, 13,
44 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
45 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14, 2, 14,
46 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15,
47 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15, 1, 15
// Apply the bilinear tap pairs in `filter` to the interleaved 8-bit data
// in exp_src_lo/exp_src_hi (as produced by MERGE_WITH_SRC /
// MERGE_NEXT_SRC), then round to 8-bit range: result = (a*t0 + b*t1 + 8) >> 4.
// Requires pw8 (= _mm256_set1_epi16(8)) in scope at the use site.
#define FILTER_SRC(filter) \
  /* filter the source */ \
  exp_src_lo = _mm256_maddubs_epi16(exp_src_lo, filter); \
  exp_src_hi = _mm256_maddubs_epi16(exp_src_hi, filter); \
  \
  /* add 8 to source */ \
  exp_src_lo = _mm256_add_epi16(exp_src_lo, pw8); \
  exp_src_hi = _mm256_add_epi16(exp_src_hi, pw8); \
  \
  /* divide source by 16 */ \
  exp_src_lo = _mm256_srai_epi16(exp_src_lo, 4); \
  exp_src_hi = _mm256_srai_epi16(exp_src_hi, 4);
// Interleave the bytes of src_reg with those of reg into
// exp_src_lo/exp_src_hi.  With reg == zero_reg this widens each byte to
// a 16-bit word; with a second pixel row it prepares maddubs input pairs.
#define MERGE_WITH_SRC(src_reg, reg) \
  exp_src_lo = _mm256_unpacklo_epi8(src_reg, reg); \
  exp_src_hi = _mm256_unpackhi_epi8(src_reg, reg);
// Load 32 source and 32 destination pixels from the current `src`/`dst`
// row pointers (unaligned loads).
#define LOAD_SRC_DST \
  /* load source and destination */ \
  src_reg = _mm256_loadu_si256((__m256i const *) (src)); \
  dst_reg = _mm256_loadu_si256((__m256i const *) (dst));
// Replace src_reg with the rounded byte-wise average of itself and the
// 32 pixels `size_stride` bytes further on — the fast half-pel path
// (size_stride = 1 for horizontal, src_stride for vertical).
// Requires a __m256i src_next_reg declared at the use site.
#define AVG_NEXT_SRC(src_reg, size_stride) \
  src_next_reg = _mm256_loadu_si256((__m256i const *) \
                 (src + size_stride)); \
  /* average between current and next stride source */ \
  src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
// Interleave src_reg with the pixels `size_stride` bytes further on,
// producing the (pixel, next-pixel) byte pairs FILTER_SRC consumes.
// Requires a __m256i src_next_reg declared at the use site.
#define MERGE_NEXT_SRC(src_reg, size_stride) \
  src_next_reg = _mm256_loadu_si256((__m256i const *) \
                 (src + size_stride)); \
  MERGE_WITH_SRC(src_reg, src_next_reg)
// Per-row accumulation: widen dst_reg to 16-bit, form the 16-bit
// differences src - dst in exp_src_lo/hi, accumulate them into sum_reg
// (16-bit lanes) and their squares into sse_reg (32-bit lanes via madd).
// Note: exp_src_lo/hi are clobbered (replaced by squared values).
#define CALC_SUM_SSE_INSIDE_LOOP \
  /* expand each byte to 2 bytes */ \
  exp_dst_lo = _mm256_unpacklo_epi8(dst_reg, zero_reg); \
  exp_dst_hi = _mm256_unpackhi_epi8(dst_reg, zero_reg); \
  \
  /* source - dest */ \
  exp_src_lo = _mm256_sub_epi16(exp_src_lo, exp_dst_lo); \
  exp_src_hi = _mm256_sub_epi16(exp_src_hi, exp_dst_hi); \
  \
  /* caculate sum */ \
  sum_reg = _mm256_add_epi16(sum_reg, exp_src_lo); \
  exp_src_lo = _mm256_madd_epi16(exp_src_lo, exp_src_lo); \
  sum_reg = _mm256_add_epi16(sum_reg, exp_src_hi); \
  exp_src_hi = _mm256_madd_epi16(exp_src_hi, exp_src_hi); \
  \
  /* calculate sse */ \
  sse_reg = _mm256_add_epi32(sse_reg, exp_src_lo); \
  sse_reg = _mm256_add_epi32(sse_reg, exp_src_hi);
// Final horizontal reduction: sign-extend the 16-bit sum lanes to 32-bit
// (res_cmp supplies the sign words via cmpgt-with-zero), then fold both
// sum_reg and sse_reg across lanes; stores the total SSE through `sse`
// and leaves the signed difference total in the local `sum`.
#define CALC_SUM_AND_SSE \
  res_cmp = _mm256_cmpgt_epi16(zero_reg, sum_reg); \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 8); \
  sum_reg_lo = _mm256_unpacklo_epi16(sum_reg, res_cmp); \
  sum_reg_hi = _mm256_unpackhi_epi16(sum_reg, res_cmp); \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
  sum_reg = _mm256_add_epi32(sum_reg_lo, sum_reg_hi); \
  \
  sse_reg_hi = _mm256_srli_si256(sse_reg, 4); \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 8); \
  \
  sse_reg = _mm256_add_epi32(sse_reg, sse_reg_hi); \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
  *((int*)sse)= _mm_cvtsi128_si32(_mm256_castsi256_si128(sse_reg)) + \
                _mm_cvtsi128_si32(_mm256_extractf128_si256(sse_reg, 1)); \
  sum_reg_hi = _mm256_srli_si256(sum_reg, 4); \
  sum_reg = _mm256_add_epi32(sum_reg, sum_reg_hi); \
  sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) + \
        _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
121 unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src,
129 __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
130 __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
133 sum_reg = _mm256_set1_epi16(0);
134 sse_reg = _mm256_set1_epi16(0);
135 zero_reg = _mm256_set1_epi16(0);
137 // x_offset = 0 and y_offset = 0
140 for (i = 0; i < height ; i++) {
142 // expend each byte to 2 bytes
143 MERGE_WITH_SRC(src_reg, zero_reg)
144 CALC_SUM_SSE_INSIDE_LOOP
148 // x_offset = 0 and y_offset = 8
149 } else if (y_offset == 8) {
150 __m256i src_next_reg;
151 for (i = 0; i < height ; i++) {
153 AVG_NEXT_SRC(src_reg, src_stride)
154 // expend each byte to 2 bytes
155 MERGE_WITH_SRC(src_reg, zero_reg)
156 CALC_SUM_SSE_INSIDE_LOOP
160 // x_offset = 0 and y_offset = bilin interpolation
162 __m256i filter, pw8, src_next_reg;
165 filter = _mm256_load_si256((__m256i const *)
166 (bilinear_filters_avx2 + y_offset));
167 pw8 = _mm256_set1_epi16(8);
168 for (i = 0; i < height ; i++) {
170 MERGE_NEXT_SRC(src_reg, src_stride)
172 CALC_SUM_SSE_INSIDE_LOOP
177 // x_offset = 8 and y_offset = 0
178 } else if (x_offset == 8) {
180 __m256i src_next_reg;
181 for (i = 0; i < height ; i++) {
183 AVG_NEXT_SRC(src_reg, 1)
184 // expand each byte to 2 bytes
185 MERGE_WITH_SRC(src_reg, zero_reg)
186 CALC_SUM_SSE_INSIDE_LOOP
190 // x_offset = 8 and y_offset = 8
191 } else if (y_offset == 8) {
192 __m256i src_next_reg, src_avg;
193 // load source and another source starting from the next
195 src_reg = _mm256_loadu_si256((__m256i const *) (src));
196 AVG_NEXT_SRC(src_reg, 1)
197 for (i = 0; i < height ; i++) {
201 AVG_NEXT_SRC(src_reg, 1)
202 // average between previous average to current average
203 src_avg = _mm256_avg_epu8(src_avg, src_reg);
204 // expand each byte to 2 bytes
205 MERGE_WITH_SRC(src_avg, zero_reg)
206 // save current source average
207 CALC_SUM_SSE_INSIDE_LOOP
210 // x_offset = 8 and y_offset = bilin interpolation
212 __m256i filter, pw8, src_next_reg, src_avg;
214 filter = _mm256_load_si256((__m256i const *)
215 (bilinear_filters_avx2 + y_offset));
216 pw8 = _mm256_set1_epi16(8);
217 // load source and another source starting from the next
219 src_reg = _mm256_loadu_si256((__m256i const *) (src));
220 AVG_NEXT_SRC(src_reg, 1)
221 for (i = 0; i < height ; i++) {
222 // save current source average
226 AVG_NEXT_SRC(src_reg, 1)
227 MERGE_WITH_SRC(src_avg, src_reg)
229 CALC_SUM_SSE_INSIDE_LOOP
233 // x_offset = bilin interpolation and y_offset = 0
236 __m256i filter, pw8, src_next_reg;
238 filter = _mm256_load_si256((__m256i const *)
239 (bilinear_filters_avx2 + x_offset));
240 pw8 = _mm256_set1_epi16(8);
241 for (i = 0; i < height ; i++) {
243 MERGE_NEXT_SRC(src_reg, 1)
245 CALC_SUM_SSE_INSIDE_LOOP
249 // x_offset = bilin interpolation and y_offset = 8
250 } else if (y_offset == 8) {
251 __m256i filter, pw8, src_next_reg, src_pack;
253 filter = _mm256_load_si256((__m256i const *)
254 (bilinear_filters_avx2 + x_offset));
255 pw8 = _mm256_set1_epi16(8);
256 src_reg = _mm256_loadu_si256((__m256i const *) (src));
257 MERGE_NEXT_SRC(src_reg, 1)
259 // convert each 16 bit to 8 bit to each low and high lane source
260 src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
261 for (i = 0; i < height ; i++) {
264 MERGE_NEXT_SRC(src_reg, 1)
266 src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
267 // average between previous pack to the current
268 src_pack = _mm256_avg_epu8(src_pack, src_reg);
269 MERGE_WITH_SRC(src_pack, zero_reg)
270 CALC_SUM_SSE_INSIDE_LOOP
274 // x_offset = bilin interpolation and y_offset = bilin interpolation
276 __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
278 xfilter = _mm256_load_si256((__m256i const *)
279 (bilinear_filters_avx2 + x_offset));
281 yfilter = _mm256_load_si256((__m256i const *)
282 (bilinear_filters_avx2 + y_offset));
283 pw8 = _mm256_set1_epi16(8);
284 // load source and another source starting from the next
286 src_reg = _mm256_loadu_si256((__m256i const *) (src));
287 MERGE_NEXT_SRC(src_reg, 1)
290 // convert each 16 bit to 8 bit to each low and high lane source
291 src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
292 for (i = 0; i < height ; i++) {
295 MERGE_NEXT_SRC(src_reg, 1)
297 src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
298 // merge previous pack to current pack source
299 MERGE_WITH_SRC(src_pack, src_reg)
303 CALC_SUM_SSE_INSIDE_LOOP
312 unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
323 __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
324 __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
327 sum_reg = _mm256_set1_epi16(0);
328 sse_reg = _mm256_set1_epi16(0);
329 zero_reg = _mm256_set1_epi16(0);
331 // x_offset = 0 and y_offset = 0
334 for (i = 0; i < height ; i++) {
336 sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
337 src_reg = _mm256_avg_epu8(src_reg, sec_reg);
339 // expend each byte to 2 bytes
340 MERGE_WITH_SRC(src_reg, zero_reg)
341 CALC_SUM_SSE_INSIDE_LOOP
345 } else if (y_offset == 8) {
346 __m256i src_next_reg;
347 for (i = 0; i < height ; i++) {
349 AVG_NEXT_SRC(src_reg, src_stride)
350 sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
351 src_reg = _mm256_avg_epu8(src_reg, sec_reg);
353 // expend each byte to 2 bytes
354 MERGE_WITH_SRC(src_reg, zero_reg)
355 CALC_SUM_SSE_INSIDE_LOOP
359 // x_offset = 0 and y_offset = bilin interpolation
361 __m256i filter, pw8, src_next_reg;
364 filter = _mm256_load_si256((__m256i const *)
365 (bilinear_filters_avx2 + y_offset));
366 pw8 = _mm256_set1_epi16(8);
367 for (i = 0; i < height ; i++) {
369 MERGE_NEXT_SRC(src_reg, src_stride)
371 src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
372 sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
373 src_reg = _mm256_avg_epu8(src_reg, sec_reg);
375 MERGE_WITH_SRC(src_reg, zero_reg)
376 CALC_SUM_SSE_INSIDE_LOOP
381 // x_offset = 8 and y_offset = 0
382 } else if (x_offset == 8) {
384 __m256i src_next_reg;
385 for (i = 0; i < height ; i++) {
387 AVG_NEXT_SRC(src_reg, 1)
388 sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
389 src_reg = _mm256_avg_epu8(src_reg, sec_reg);
391 // expand each byte to 2 bytes
392 MERGE_WITH_SRC(src_reg, zero_reg)
393 CALC_SUM_SSE_INSIDE_LOOP
397 // x_offset = 8 and y_offset = 8
398 } else if (y_offset == 8) {
399 __m256i src_next_reg, src_avg;
400 // load source and another source starting from the next
402 src_reg = _mm256_loadu_si256((__m256i const *) (src));
403 AVG_NEXT_SRC(src_reg, 1)
404 for (i = 0; i < height ; i++) {
405 // save current source average
409 AVG_NEXT_SRC(src_reg, 1)
410 // average between previous average to current average
411 src_avg = _mm256_avg_epu8(src_avg, src_reg);
412 sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
413 src_avg = _mm256_avg_epu8(src_avg, sec_reg);
415 // expand each byte to 2 bytes
416 MERGE_WITH_SRC(src_avg, zero_reg)
417 CALC_SUM_SSE_INSIDE_LOOP
420 // x_offset = 8 and y_offset = bilin interpolation
422 __m256i filter, pw8, src_next_reg, src_avg;
424 filter = _mm256_load_si256((__m256i const *)
425 (bilinear_filters_avx2 + y_offset));
426 pw8 = _mm256_set1_epi16(8);
427 // load source and another source starting from the next
429 src_reg = _mm256_loadu_si256((__m256i const *) (src));
430 AVG_NEXT_SRC(src_reg, 1)
431 for (i = 0; i < height ; i++) {
432 // save current source average
436 AVG_NEXT_SRC(src_reg, 1)
437 MERGE_WITH_SRC(src_avg, src_reg)
439 src_avg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
440 sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
441 src_avg = _mm256_avg_epu8(src_avg, sec_reg);
442 // expand each byte to 2 bytes
443 MERGE_WITH_SRC(src_avg, zero_reg)
445 CALC_SUM_SSE_INSIDE_LOOP
449 // x_offset = bilin interpolation and y_offset = 0
452 __m256i filter, pw8, src_next_reg;
454 filter = _mm256_load_si256((__m256i const *)
455 (bilinear_filters_avx2 + x_offset));
456 pw8 = _mm256_set1_epi16(8);
457 for (i = 0; i < height ; i++) {
459 MERGE_NEXT_SRC(src_reg, 1)
461 src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
462 sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
463 src_reg = _mm256_avg_epu8(src_reg, sec_reg);
464 MERGE_WITH_SRC(src_reg, zero_reg)
466 CALC_SUM_SSE_INSIDE_LOOP
470 // x_offset = bilin interpolation and y_offset = 8
471 } else if (y_offset == 8) {
472 __m256i filter, pw8, src_next_reg, src_pack;
474 filter = _mm256_load_si256((__m256i const *)
475 (bilinear_filters_avx2 + x_offset));
476 pw8 = _mm256_set1_epi16(8);
477 src_reg = _mm256_loadu_si256((__m256i const *) (src));
478 MERGE_NEXT_SRC(src_reg, 1)
480 // convert each 16 bit to 8 bit to each low and high lane source
481 src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
482 for (i = 0; i < height ; i++) {
485 MERGE_NEXT_SRC(src_reg, 1)
487 src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
488 // average between previous pack to the current
489 src_pack = _mm256_avg_epu8(src_pack, src_reg);
490 sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
491 src_pack = _mm256_avg_epu8(src_pack, sec_reg);
493 MERGE_WITH_SRC(src_pack, zero_reg)
495 CALC_SUM_SSE_INSIDE_LOOP
498 // x_offset = bilin interpolation and y_offset = bilin interpolation
500 __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
502 xfilter = _mm256_load_si256((__m256i const *)
503 (bilinear_filters_avx2 + x_offset));
505 yfilter = _mm256_load_si256((__m256i const *)
506 (bilinear_filters_avx2 + y_offset));
507 pw8 = _mm256_set1_epi16(8);
508 // load source and another source starting from the next
510 src_reg = _mm256_loadu_si256((__m256i const *) (src));
511 MERGE_NEXT_SRC(src_reg, 1)
514 // convert each 16 bit to 8 bit to each low and high lane source
515 src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
516 for (i = 0; i < height ; i++) {
519 MERGE_NEXT_SRC(src_reg, 1)
521 src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
522 // merge previous pack to current pack source
523 MERGE_WITH_SRC(src_pack, src_reg)
526 src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
527 sec_reg = _mm256_loadu_si256((__m256i const *) (sec));
528 src_pack = _mm256_avg_epu8(src_pack, sec_reg);
529 MERGE_WITH_SRC(src_pack, zero_reg)
532 CALC_SUM_SSE_INSIDE_LOOP