From 64f728caef5d9f019222c6989a9c6df17464dd69 Mon Sep 17 00:00:00 2001
From: Yunqing Wang
Date: Tue, 12 Nov 2013 16:51:15 -0800
Subject: [PATCH] Do horizontal loopfiltering in parallel

This patch followed the "Rewrite filter_selectively_horiz for parallel
loopfiltering" commit and added an x86 SSE2 optimization to do 16-pixel
filtering in parallel. It also corrected the declaration of the aligned
arrays. For the 8-pixel-in-parallel case, the calculation of the masks
and filters was improved, and the threshold loading was updated since
the thresholds were already duplicated. The neon C functions were
updated to call the neon loopfilters twice.

Tests using the tulip clip showed a ~1.5% decoder speed gain.

Change-Id: Id02638626ac27a4b0e0b09d71792a24c0499bd35
---
 vp9/common/arm/neon/vp9_loopfilter_16_neon.c |  33 ++
 vp9/common/vp9_loopfilter.c                  |  39 +-
 vp9/common/vp9_loopfilter_filters.c          |  59 +++
 vp9/common/vp9_rtcd_defs.sh                  |   6 +
 vp9/common/x86/vp9_loopfilter_intrin_sse2.c  | 690 +++++++++++++++++++++------
 vp9/vp9_common.mk                            |   1 +
 6 files changed, 648 insertions(+), 180 deletions(-)
 create mode 100644 vp9/common/arm/neon/vp9_loopfilter_16_neon.c

diff --git a/vp9/common/arm/neon/vp9_loopfilter_16_neon.c b/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
new file mode 100644
index 0000000..2f022dc
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+
+void vp9_loop_filter_horizontal_edge_16_neon(uint8_t *s, int p /* pitch */,
+                                             const uint8_t *blimit0,
+                                             const uint8_t *limit0,
+                                             const uint8_t *thresh0,
+                                             const uint8_t *blimit1,
+                                             const uint8_t *limit1,
+                                             const uint8_t *thresh1) {
+  vp9_loop_filter_horizontal_edge(s, p, blimit0, limit0, thresh0, 1);
+  vp9_loop_filter_horizontal_edge(s + 8, p, blimit1, limit1, thresh1, 1);
+}
+
+void vp9_mbloop_filter_horizontal_edge_16_neon(uint8_t *s, int p /* pitch */,
+                                               const uint8_t *blimit0,
+                                               const uint8_t *limit0,
+                                               const uint8_t *thresh0,
+                                               const uint8_t *blimit1,
+                                               const uint8_t *limit1,
+                                               const uint8_t *thresh1) {
+  vp9_mbloop_filter_horizontal_edge(s, p, blimit0, limit0, thresh0, 1);
+  vp9_mbloop_filter_horizontal_edge(s + 8, p, blimit1, limit1, thresh1, 1);
+}
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index ff504a1..7867b41 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -411,19 +411,16 @@ static void filter_selectively_horiz(uint8_t *s, int pitch,
         // Next block's thresholds
         const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);

-        // TODO(yunqingwang): Combine next 2 calls as 1 wide filtering.
-        vp9_mbloop_filter_horizontal_edge(s, pitch, lfi->mblim, lfi->lim,
-                                          lfi->hev_thr, 1);
-        vp9_mbloop_filter_horizontal_edge(s + 8, pitch, lfin->mblim,
-                                          lfin->lim, lfin->hev_thr, 1);
+        vp9_mbloop_filter_horizontal_edge_16(s, pitch, lfi->mblim,
+                                             lfi->lim, lfi->hev_thr,
+                                             lfin->mblim, lfin->lim,
+                                             lfin->hev_thr);

         if ((mask_4x4_int & 3) == 3) {
-          // TODO(yunqingwang): Combine next 2 calls as 1 wide filtering.
- vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim, - lfi->lim, lfi->hev_thr, 1); - vp9_loop_filter_horizontal_edge(s + 8 + 4 * pitch, pitch, - lfin->mblim, lfin->lim, - lfin->hev_thr, 1); + vp9_loop_filter_horizontal_edge_16(s + 4 * pitch, pitch, lfi->mblim, + lfi->lim, lfi->hev_thr, + lfin->mblim, lfin->lim, + lfin->hev_thr); } else { if (mask_4x4_int & 1) vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim, @@ -449,19 +446,15 @@ static void filter_selectively_horiz(uint8_t *s, int pitch, // Next block's thresholds const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1); - // TODO(yunqingwang): Combine next 2 calls as 1 wide filtering. - vp9_loop_filter_horizontal_edge(s, pitch, lfi->mblim, lfi->lim, - lfi->hev_thr, 1); - vp9_loop_filter_horizontal_edge(s + 8, pitch, lfin->mblim, lfin->lim, - lfin->hev_thr, 1); - + vp9_loop_filter_horizontal_edge_16(s, pitch, lfi->mblim, + lfi->lim, lfi->hev_thr, + lfin->mblim, lfin->lim, + lfin->hev_thr); if ((mask_4x4_int & 3) == 3) { - // TODO(yunqingwang): Combine next 2 calls as 1 wide filtering. - vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim, - lfi->lim, lfi->hev_thr, 1); - vp9_loop_filter_horizontal_edge(s + 8 + 4 * pitch, pitch, - lfin->mblim, lfin->lim, - lfin->hev_thr, 1); + vp9_loop_filter_horizontal_edge_16(s + 4 * pitch, pitch, lfi->mblim, + lfi->lim, lfi->hev_thr, + lfin->mblim, lfin->lim, + lfin->hev_thr); } else { if (mask_4x4_int & 1) vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim, diff --git a/vp9/common/vp9_loopfilter_filters.c b/vp9/common/vp9_loopfilter_filters.c index 2c4bf6c..9edf870 100644 --- a/vp9/common/vp9_loopfilter_filters.c +++ b/vp9/common/vp9_loopfilter_filters.c @@ -121,6 +121,34 @@ void vp9_loop_filter_horizontal_edge_c(uint8_t *s, int p /* pitch */, } } +void vp9_loop_filter_horizontal_edge_16_c(uint8_t *s, int p /* pitch */, + const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + int i, j; + const uint8_t *blimit = blimit0; + const uint8_t *limit = limit0; + const uint8_t *thresh = thresh0; + + for (i = 0; i < 2; ++i) { + for (j = 0; j < 8; ++j) { + const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; + const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; + const int8_t mask = filter_mask(*limit, *blimit, + p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1); + filter4(mask, hev, s - 2 * p, s - 1 * p, s, s + 1 * p); + ++s; + } + blimit = blimit1; + limit = limit1; + thresh = thresh1; + } +} + void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, @@ -185,6 +213,37 @@ void vp9_mbloop_filter_horizontal_edge_c(uint8_t *s, int p, } } +void vp9_mbloop_filter_horizontal_edge_16_c(uint8_t *s, int p /* pitch */, + const uint8_t *blimit0, + const uint8_t *limit0, + const uint8_t *thresh0, + const uint8_t *blimit1, + const uint8_t *limit1, + const uint8_t *thresh1) { + int i, j; + const uint8_t *blimit = blimit0; + const uint8_t *limit = limit0; + const uint8_t *thresh = thresh0; + + for (i = 0; i < 2; ++i) { + for (j = 0; j < 8; ++j) { + const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p]; + const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p]; + + const int8_t mask = filter_mask(*limit, *blimit, + p3, p2, p1, p0, q0, q1, q2, q3); + const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1); + 
const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3); + filter8(mask, hev, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p, + s, s + 1 * p, s + 2 * p, s + 3 * p); + ++s; + } + blimit = blimit1; + limit = limit1; + thresh = thresh1; + } +} + void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index 2c0864e..66eaa81 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -205,9 +205,15 @@ specialize vp9_mb_lpf_horizontal_edge_w sse2 avx2 neon dspr2 prototype void vp9_mbloop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" specialize vp9_mbloop_filter_horizontal_edge sse2 neon dspr2 +prototype void vp9_mbloop_filter_horizontal_edge_16 "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1" +specialize vp9_mbloop_filter_horizontal_edge_16 sse2 neon + prototype void vp9_loop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count" specialize vp9_loop_filter_horizontal_edge mmx neon dspr2 +prototype void vp9_loop_filter_horizontal_edge_16 "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1" +specialize vp9_loop_filter_horizontal_edge_16 sse2 neon + # # post proc # diff --git a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c index fa4dd9b..925f74d 100644 --- a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c +++ b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c @@ -17,20 +17,14 @@ static void mb_lpf_horizontal_edge_w_sse2_8(unsigned char *s, const unsigned char *_blimit, const unsigned char *_limit, const unsigned char *_thresh) { - __m128i mask, hev, flat, flat2; const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi8(1); + const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); + const __m128i limit = _mm_load_si128((const __m128i *)_limit); + const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + __m128i mask, hev, flat, flat2; __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1; __m128i abs_p1p0; - const unsigned int extended_thresh = _thresh[0] * 0x01010101u; - const unsigned int extended_limit = _limit[0] * 0x01010101u; - const unsigned int extended_blimit = _blimit[0] * 0x01010101u; - const __m128i thresh = - _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_thresh), 0); - const __m128i limit = - _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_limit), 0); - const __m128i blimit = - _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0); q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p)); q4p4 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q4p4), @@ -375,32 +369,25 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, const unsigned char *_blimit, const unsigned char *_limit, const unsigned char *_thresh) { - DECLARE_ALIGNED(16, unsigned char, flat2_op[7][16]); - DECLARE_ALIGNED(16, unsigned char, flat2_oq[7][16]); - - DECLARE_ALIGNED(16, unsigned char, flat_op[3][16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq[3][16]); + DECLARE_ALIGNED_ARRAY(16, unsigned char, flat2_op, 7 * 16); + DECLARE_ALIGNED_ARRAY(16, unsigned char, flat2_oq, 7 * 16); - DECLARE_ALIGNED(16, 
unsigned char, ap[8][16]); - DECLARE_ALIGNED(16, unsigned char, aq[8][16]); + DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op, 3 * 16); + DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq, 3 * 16); + DECLARE_ALIGNED_ARRAY(16, unsigned char, ap, 8 * 16); + DECLARE_ALIGNED_ARRAY(16, unsigned char, aq, 8 * 16); - __m128i mask, hev, flat, flat2; const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi8(1); + const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); + const __m128i limit = _mm_load_si128((const __m128i *)_limit); + const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + __m128i mask, hev, flat, flat2; __m128i p7, p6, p5; __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4; __m128i q5, q6, q7; int i = 0; - const unsigned int extended_thresh = _thresh[0] * 0x01010101u; - const unsigned int extended_limit = _limit[0] * 0x01010101u; - const unsigned int extended_blimit = _blimit[0] * 0x01010101u; - const __m128i thresh = - _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_thresh), 0); - const __m128i limit = - _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_limit), 0); - const __m128i blimit = - _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0); p4 = _mm_loadu_si128((__m128i *)(s - 5 * p)); p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); @@ -413,16 +400,16 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); q4 = _mm_loadu_si128((__m128i *)(s + 4 * p)); - _mm_store_si128((__m128i *)ap[4], p4); - _mm_store_si128((__m128i *)ap[3], p3); - _mm_store_si128((__m128i *)ap[2], p2); - _mm_store_si128((__m128i *)ap[1], p1); - _mm_store_si128((__m128i *)ap[0], p0); - _mm_store_si128((__m128i *)aq[4], q4); - _mm_store_si128((__m128i *)aq[3], q3); - _mm_store_si128((__m128i *)aq[2], q2); - _mm_store_si128((__m128i *)aq[1], q1); - _mm_store_si128((__m128i *)aq[0], q0); + _mm_store_si128((__m128i *)&ap[4 * 16], p4); + _mm_store_si128((__m128i *)&ap[3 * 16], p3); + _mm_store_si128((__m128i *)&ap[2 * 16], p2); + _mm_store_si128((__m128i *)&ap[1 * 16], p1); + _mm_store_si128((__m128i *)&ap[0 * 16], p0); + _mm_store_si128((__m128i *)&aq[4 * 16], q4); + _mm_store_si128((__m128i *)&aq[3 * 16], q3); + _mm_store_si128((__m128i *)&aq[2 * 16], q2); + _mm_store_si128((__m128i *)&aq[1 * 16], q1); + _mm_store_si128((__m128i *)&aq[0 * 16], q0); { @@ -546,8 +533,8 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, _mm_subs_epu8(p0, p5)), _mm_or_si128(_mm_subs_epu8(q5, q0), _mm_subs_epu8(q0, q5))); - _mm_store_si128((__m128i *)ap[5], p5); - _mm_store_si128((__m128i *)aq[5], q5); + _mm_store_si128((__m128i *)&ap[5 * 16], p5); + _mm_store_si128((__m128i *)&aq[5 * 16], q5); flat2 = _mm_max_epu8(work, flat2); p6 = _mm_loadu_si128((__m128i *)(s - 7 * p)); q6 = _mm_loadu_si128((__m128i *)(s + 6 * p)); @@ -555,8 +542,8 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, _mm_subs_epu8(p0, p6)), _mm_or_si128(_mm_subs_epu8(q6, q0), _mm_subs_epu8(q0, q6))); - _mm_store_si128((__m128i *)ap[6], p6); - _mm_store_si128((__m128i *)aq[6], q6); + _mm_store_si128((__m128i *)&ap[6 * 16], p6); + _mm_store_si128((__m128i *)&aq[6 * 16], q6); flat2 = _mm_max_epu8(work, flat2); p7 = _mm_loadu_si128((__m128i *)(s - 8 * p)); @@ -565,8 +552,8 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, _mm_subs_epu8(p0, p7)), _mm_or_si128(_mm_subs_epu8(q7, q0), _mm_subs_epu8(q0, q7))); - _mm_store_si128((__m128i *)ap[7], p7); - _mm_store_si128((__m128i *)aq[7], q7); + _mm_store_si128((__m128i *)&ap[7 * 16], 
p7); + _mm_store_si128((__m128i *)&aq[7 * 16], q7); flat2 = _mm_max_epu8(work, flat2); flat2 = _mm_subs_epu8(flat2, one); flat2 = _mm_cmpeq_epi8(flat2, zero); @@ -586,22 +573,38 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, __m128i a, b, c; unsigned int off = i * 8; - p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[7] + off)), zero); - p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[6] + off)), zero); - p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[5] + off)), zero); - p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[4] + off)), zero); - p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[3] + off)), zero); - p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[2] + off)), zero); - p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[1] + off)), zero); - p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[0] + off)), zero); - q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[0] + off)), zero); - q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[1] + off)), zero); - q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[2] + off)), zero); - q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[3] + off)), zero); - q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[4] + off)), zero); - q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[5] + off)), zero); - q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[6] + off)), zero); - q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[7] + off)), zero); + p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[7 * 16] + off)), + zero); + p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[6 * 16] + off)), + zero); + p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[5 * 16] + off)), + zero); + p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[4 * 16] + off)), + zero); + p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[3 * 16] + off)), + zero); + p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[2 * 16] + off)), + zero); + p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[1 * 16] + off)), + zero); + p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[0 * 16] + off)), + zero); + q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[0 * 16] + off)), + zero); + q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[1 * 16] + off)), + zero); + q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[2 * 16] + off)), + zero); + q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[3 * 16] + off)), + zero); + q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[4 * 16] + off)), + zero); + q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[5 * 16] + off)), + zero); + q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[6 * 16] + off)), + zero); + q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[7 * 16] + off)), + zero); c = _mm_sub_epi16(_mm_slli_epi16(p7, 3), p7); // p7 * 7 c = _mm_add_epi16(_mm_slli_epi16(p6, 1), _mm_add_epi16(p4, c)); @@ -610,117 +613,117 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, a = _mm_add_epi16(p3, _mm_add_epi16(p2, p1)); a = _mm_add_epi16(_mm_add_epi16(p0, q0), a); - _mm_storel_epi64((__m128i *)&flat_op[2][i*8], + _mm_storel_epi64((__m128i *)&flat_op[2 * 16 + i * 8], _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) , b)); c = _mm_add_epi16(_mm_add_epi16(p5, eight), c); workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_op[6][i*8], + _mm_storel_epi64((__m128i *)&flat2_op[6 * 16 + i * 8], _mm_packus_epi16(workp_shft, workp_shft)); a = _mm_add_epi16(q1, 
a); b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p2)), p1); - _mm_storel_epi64((__m128i *)&flat_op[1][i*8], + _mm_storel_epi64((__m128i *)&flat_op[1 * 16 + i * 8], _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) , b)); c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p6)), p5); workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_op[5][i*8], + _mm_storel_epi64((__m128i *)&flat2_op[5 * 16 + i * 8], _mm_packus_epi16(workp_shft, workp_shft)); a = _mm_add_epi16(q2, a); b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p1)), p0); - _mm_storel_epi64((__m128i *)&flat_op[0][i*8], + _mm_storel_epi64((__m128i *)&flat_op[i * 8], _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) , b)); c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p5)), p4); workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_op[4][i*8], + _mm_storel_epi64((__m128i *)&flat2_op[4 * 16 + i * 8], _mm_packus_epi16(workp_shft, workp_shft)); a = _mm_add_epi16(q3, a); b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p0)), q0); - _mm_storel_epi64((__m128i *)&flat_oq[0][i*8], + _mm_storel_epi64((__m128i *)&flat_oq[i * 8], _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) , b)); c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p4)), p3); workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_op[3][i*8], + _mm_storel_epi64((__m128i *)&flat2_op[3 * 16 + i * 8], _mm_packus_epi16(workp_shft, workp_shft)); b = _mm_add_epi16(q3, b); b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p2, q0)), q1); - _mm_storel_epi64((__m128i *)&flat_oq[1][i*8], + _mm_storel_epi64((__m128i *)&flat_oq[16 + i * 8], _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) , b)); c = _mm_add_epi16(q4, c); c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p3)), p2); workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_op[2][i*8], + _mm_storel_epi64((__m128i *)&flat2_op[2 * 16 + i * 8], _mm_packus_epi16(workp_shft, workp_shft)); b = _mm_add_epi16(q3, b); b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p1, q1)), q2); - _mm_storel_epi64((__m128i *)&flat_oq[2][i*8], + _mm_storel_epi64((__m128i *)&flat_oq[2 * 16 + i * 8], _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3) , b)); a = _mm_add_epi16(q5, a); c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p2)), p1); workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_op[1][i*8], + _mm_storel_epi64((__m128i *)&flat2_op[16 + i * 8], _mm_packus_epi16(workp_shft, workp_shft)); a = _mm_add_epi16(q6, a); c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p1)), p0); workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_op[0][i*8], + _mm_storel_epi64((__m128i *)&flat2_op[i * 8], _mm_packus_epi16(workp_shft, workp_shft)); a = _mm_add_epi16(q7, a); c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p0)), q0); workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_oq[0][i*8], + _mm_storel_epi64((__m128i *)&flat2_oq[i * 8], _mm_packus_epi16(workp_shft, workp_shft)); a = _mm_add_epi16(q7, a); c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p6, q0)), q1); workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_oq[1][i*8], + _mm_storel_epi64((__m128i *)&flat2_oq[16 + i * 8], _mm_packus_epi16(workp_shft, workp_shft)); a = _mm_add_epi16(q7, a); c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p5, 
q1)), q2); workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_oq[2][i*8], + _mm_storel_epi64((__m128i *)&flat2_oq[2 * 16 + i * 8], _mm_packus_epi16(workp_shft, workp_shft)); a = _mm_add_epi16(q7, a); c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p4, q2)), q3); workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_oq[3][i*8], + _mm_storel_epi64((__m128i *)&flat2_oq[3 * 16 + i * 8], _mm_packus_epi16(workp_shft, workp_shft)); a = _mm_add_epi16(q7, a); c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p3, q3)), q4); workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_oq[4][i*8], + _mm_storel_epi64((__m128i *)&flat2_oq[4 * 16 + i * 8], _mm_packus_epi16(workp_shft, workp_shft)); a = _mm_add_epi16(q7, a); c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p2, q4)), q5); workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_oq[5][i*8], + _mm_storel_epi64((__m128i *)&flat2_oq[5 * 16 + i * 8], _mm_packus_epi16(workp_shft, workp_shft)); a = _mm_add_epi16(q7, a); c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p1, q5)), q6); workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4); - _mm_storel_epi64((__m128i *)&flat2_oq[6][i*8], + _mm_storel_epi64((__m128i *)&flat2_oq[6 * 16 + i * 8], _mm_packus_epi16(workp_shft, workp_shft)); temp_flat2 = _mm_srli_si128(temp_flat2, 8); @@ -730,51 +733,51 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, // wide flat // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - work_a = _mm_load_si128((__m128i *)ap[2]); - p2 = _mm_load_si128((__m128i *)flat_op[2]); + work_a = _mm_load_si128((__m128i *)&ap[2 * 16]); + p2 = _mm_load_si128((__m128i *)&flat_op[2 * 16]); work_a = _mm_andnot_si128(flat, work_a); p2 = _mm_and_si128(flat, p2); p2 = _mm_or_si128(work_a, p2); - _mm_store_si128((__m128i *)flat_op[2], p2); + _mm_store_si128((__m128i *)&flat_op[2 * 16], p2); - p1 = _mm_load_si128((__m128i *)flat_op[1]); + p1 = _mm_load_si128((__m128i *)&flat_op[1 * 16]); work_a = _mm_andnot_si128(flat, ps1); p1 = _mm_and_si128(flat, p1); p1 = _mm_or_si128(work_a, p1); - _mm_store_si128((__m128i *)flat_op[1], p1); + _mm_store_si128((__m128i *)&flat_op[1 * 16], p1); - p0 = _mm_load_si128((__m128i *)flat_op[0]); + p0 = _mm_load_si128((__m128i *)&flat_op[0]); work_a = _mm_andnot_si128(flat, ps0); p0 = _mm_and_si128(flat, p0); p0 = _mm_or_si128(work_a, p0); - _mm_store_si128((__m128i *)flat_op[0], p0); + _mm_store_si128((__m128i *)&flat_op[0], p0); - q0 = _mm_load_si128((__m128i *)flat_oq[0]); + q0 = _mm_load_si128((__m128i *)&flat_oq[0]); work_a = _mm_andnot_si128(flat, qs0); q0 = _mm_and_si128(flat, q0); q0 = _mm_or_si128(work_a, q0); - _mm_store_si128((__m128i *)flat_oq[0], q0); + _mm_store_si128((__m128i *)&flat_oq[0], q0); - q1 = _mm_load_si128((__m128i *)flat_oq[1]); + q1 = _mm_load_si128((__m128i *)&flat_oq[1 * 16]); work_a = _mm_andnot_si128(flat, qs1); q1 = _mm_and_si128(flat, q1); q1 = _mm_or_si128(work_a, q1); - _mm_store_si128((__m128i *)flat_oq[1], q1); + _mm_store_si128((__m128i *)&flat_oq[1 * 16], q1); - work_a = _mm_load_si128((__m128i *)aq[2]); - q2 = _mm_load_si128((__m128i *)flat_oq[2]); + work_a = _mm_load_si128((__m128i *)&aq[2 * 16]); + q2 = _mm_load_si128((__m128i *)&flat_oq[2 * 16]); work_a = _mm_andnot_si128(flat, work_a); q2 = _mm_and_si128(flat, q2); q2 = _mm_or_si128(work_a, q2); - _mm_store_si128((__m128i *)flat_oq[2], q2); + _mm_store_si128((__m128i *)&flat_oq[2 * 16], q2); // write out op6 - op3 { 
unsigned char *dst = (s - 7 * p); for (i = 6; i > 2; i--) { __m128i flat2_output; - work_a = _mm_load_si128((__m128i *)ap[i]); - flat2_output = _mm_load_si128((__m128i *)flat2_op[i]); + work_a = _mm_load_si128((__m128i *)&ap[i * 16]); + flat2_output = _mm_load_si128((__m128i *)&flat2_op[i * 16]); work_a = _mm_andnot_si128(flat2, work_a); flat2_output = _mm_and_si128(flat2, flat2_output); work_a = _mm_or_si128(work_a, flat2_output); @@ -783,43 +786,43 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, } } - work_a = _mm_load_si128((__m128i *)flat_op[2]); - p2 = _mm_load_si128((__m128i *)flat2_op[2]); + work_a = _mm_load_si128((__m128i *)&flat_op[2 * 16]); + p2 = _mm_load_si128((__m128i *)&flat2_op[2 * 16]); work_a = _mm_andnot_si128(flat2, work_a); p2 = _mm_and_si128(flat2, p2); p2 = _mm_or_si128(work_a, p2); _mm_storeu_si128((__m128i *)(s - 3 * p), p2); - work_a = _mm_load_si128((__m128i *)flat_op[1]); - p1 = _mm_load_si128((__m128i *)flat2_op[1]); + work_a = _mm_load_si128((__m128i *)&flat_op[1 * 16]); + p1 = _mm_load_si128((__m128i *)&flat2_op[1 * 16]); work_a = _mm_andnot_si128(flat2, work_a); p1 = _mm_and_si128(flat2, p1); p1 = _mm_or_si128(work_a, p1); _mm_storeu_si128((__m128i *)(s - 2 * p), p1); - work_a = _mm_load_si128((__m128i *)flat_op[0]); - p0 = _mm_load_si128((__m128i *)flat2_op[0]); + work_a = _mm_load_si128((__m128i *)&flat_op[0]); + p0 = _mm_load_si128((__m128i *)&flat2_op[0]); work_a = _mm_andnot_si128(flat2, work_a); p0 = _mm_and_si128(flat2, p0); p0 = _mm_or_si128(work_a, p0); _mm_storeu_si128((__m128i *)(s - 1 * p), p0); - work_a = _mm_load_si128((__m128i *)flat_oq[0]); - q0 = _mm_load_si128((__m128i *)flat2_oq[0]); + work_a = _mm_load_si128((__m128i *)&flat_oq[0]); + q0 = _mm_load_si128((__m128i *)&flat2_oq[0]); work_a = _mm_andnot_si128(flat2, work_a); q0 = _mm_and_si128(flat2, q0); q0 = _mm_or_si128(work_a, q0); _mm_storeu_si128((__m128i *)(s - 0 * p), q0); - work_a = _mm_load_si128((__m128i *)flat_oq[1]); - q1 = _mm_load_si128((__m128i *)flat2_oq[1]); + work_a = _mm_load_si128((__m128i *)&flat_oq[1 * 16]); + q1 = _mm_load_si128((__m128i *)&flat2_oq[16]); work_a = _mm_andnot_si128(flat2, work_a); q1 = _mm_and_si128(flat2, q1); q1 = _mm_or_si128(work_a, q1); _mm_storeu_si128((__m128i *)(s + 1 * p), q1); - work_a = _mm_load_si128((__m128i *)flat_oq[2]); - q2 = _mm_load_si128((__m128i *)flat2_oq[2]); + work_a = _mm_load_si128((__m128i *)&flat_oq[2 * 16]); + q2 = _mm_load_si128((__m128i *)&flat2_oq[2 * 16]); work_a = _mm_andnot_si128(flat2, work_a); q2 = _mm_and_si128(flat2, q2); q2 = _mm_or_si128(work_a, q2); @@ -830,8 +833,8 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, unsigned char *dst = (s + 3 * p); for (i = 3; i < 7; i++) { __m128i flat2_output; - work_a = _mm_load_si128((__m128i *)aq[i]); - flat2_output = _mm_load_si128((__m128i *)flat2_oq[i]); + work_a = _mm_load_si128((__m128i *)&aq[i * 16]); + flat2_output = _mm_load_si128((__m128i *)&flat2_oq[i * 16]); work_a = _mm_andnot_si128(flat2, work_a); flat2_output = _mm_and_si128(flat2, flat2_output); work_a = _mm_or_si128(work_a, flat2_output); @@ -842,6 +845,7 @@ static void mb_lpf_horizontal_edge_w_sse2_16(unsigned char *s, } } +// TODO(yunqingwang): remove count and call these 2 functions(8 or 16) directly. 
void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s, int p, const unsigned char *_blimit, @@ -860,34 +864,260 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s, const unsigned char *_limit, const unsigned char *_thresh, int count) { - DECLARE_ALIGNED(16, unsigned char, flat_op2[16]); - DECLARE_ALIGNED(16, unsigned char, flat_op1[16]); - DECLARE_ALIGNED(16, unsigned char, flat_op0[16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]); - DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]); - __m128i mask, hev, flat; + DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op2, 16); + DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op1, 16); + DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op0, 16); + DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq2, 16); + DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq1, 16); + DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq0, 16); const __m128i zero = _mm_set1_epi16(0); + const __m128i blimit = _mm_load_si128((const __m128i *)_blimit); + const __m128i limit = _mm_load_si128((const __m128i *)_limit); + const __m128i thresh = _mm_load_si128((const __m128i *)_thresh); + __m128i mask, hev, flat; __m128i p3, p2, p1, p0, q0, q1, q2, q3; - const unsigned int extended_thresh = _thresh[0] * 0x01010101u; - const unsigned int extended_limit = _limit[0] * 0x01010101u; - const unsigned int extended_blimit = _blimit[0] * 0x01010101u; - const __m128i thresh = - _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_thresh), 0); - const __m128i limit = - _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_limit), 0); - const __m128i blimit = - _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0); + __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0; (void)count; - p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); - p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); - p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); - p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); - q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p)); - q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p)); - q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p)); - q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p)); + + q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)), + _mm_loadl_epi64((__m128i *)(s + 3 * p))); + q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)), + _mm_loadl_epi64((__m128i *)(s + 2 * p))); + q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)), + _mm_loadl_epi64((__m128i *)(s + 1 * p))); + q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)), + _mm_loadl_epi64((__m128i *)(s - 0 * p))); + p1q1 = _mm_shuffle_epi32(q1p1, 78); + p0q0 = _mm_shuffle_epi32(q0p0, 78); + + { + // filter_mask and hev_mask + const __m128i one = _mm_set1_epi8(1); + const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i ff = _mm_cmpeq_epi8(fe, fe); + __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work; + abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), + _mm_subs_epu8(q0p0, q1p1)); + abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); + + abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), + _mm_subs_epu8(p0q0, q0p0)); + abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), + _mm_subs_epu8(p1q1, q1p1)); + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), 
ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(abs_p1p0, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q1p1), + _mm_subs_epu8(q1p1, q2p2)), + _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), + _mm_subs_epu8(q2p2, q3p3))); + mask = _mm_max_epu8(work, mask); + mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + + // flat_mask4 + + flat = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q0p0), + _mm_subs_epu8(q0p0, q2p2)), + _mm_or_si128(_mm_subs_epu8(q3p3, q0p0), + _mm_subs_epu8(q0p0, q3p3))); + flat = _mm_max_epu8(abs_p1p0, flat); + flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); + flat = _mm_subs_epu8(flat, one); + flat = _mm_cmpeq_epi8(flat, zero); + flat = _mm_and_si128(flat, mask); + } + + { + const __m128i four = _mm_set1_epi16(4); + unsigned char *src = s; + { + __m128i workp_a, workp_b, workp_shft; + p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero); + p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero); + p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero); + p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero); + q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero); + q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero); + q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero); + q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero); + + workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1)); + workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0); + workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_op2[0], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_op1[0], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_op0[0], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_oq0[0], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_oq1[0], + _mm_packus_epi16(workp_shft, workp_shft)); + + workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3); + workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2); + workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); + _mm_storel_epi64((__m128i *)&flat_oq2[0], + _mm_packus_epi16(workp_shft, workp_shft)); + } + } + // lp filter + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i t1 = _mm_set1_epi8(0x1); + const __m128i ps1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * 
p)), + t80); + const __m128i ps0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)), + t80); + const __m128i qs0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)), + t80); + const __m128i qs1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)), + t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + + filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + /* (vp9_filter + 3 * (qs0 - ps0)) & mask */ + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + /* Filter1 >> 3 */ + filter1 = _mm_unpacklo_epi8(zero, filter1); + filter1 = _mm_srai_epi16(filter1, 11); + filter1 = _mm_packs_epi16(filter1, filter1); + + /* Filter2 >> 3 */ + filter2 = _mm_unpacklo_epi8(zero, filter2); + filter2 = _mm_srai_epi16(filter2, 11); + filter2 = _mm_packs_epi16(filter2, zero); + + /* filt >> 1 */ + filt = _mm_adds_epi8(filter1, t1); + filt = _mm_unpacklo_epi8(zero, filt); + filt = _mm_srai_epi16(filt, 9); + filt = _mm_packs_epi16(filt, zero); + + filt = _mm_andnot_si128(hev, filt); + + work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); + q0 = _mm_loadl_epi64((__m128i *)flat_oq0); + work_a = _mm_andnot_si128(flat, work_a); + q0 = _mm_and_si128(flat, q0); + q0 = _mm_or_si128(work_a, q0); + + work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); + q1 = _mm_loadl_epi64((__m128i *)flat_oq1); + work_a = _mm_andnot_si128(flat, work_a); + q1 = _mm_and_si128(flat, q1); + q1 = _mm_or_si128(work_a, q1); + + work_a = _mm_loadu_si128((__m128i *)(s + 2 * p)); + q2 = _mm_loadl_epi64((__m128i *)flat_oq2); + work_a = _mm_andnot_si128(flat, work_a); + q2 = _mm_and_si128(flat, q2); + q2 = _mm_or_si128(work_a, q2); + + work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); + p0 = _mm_loadl_epi64((__m128i *)flat_op0); + work_a = _mm_andnot_si128(flat, work_a); + p0 = _mm_and_si128(flat, p0); + p0 = _mm_or_si128(work_a, p0); + + work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); + p1 = _mm_loadl_epi64((__m128i *)flat_op1); + work_a = _mm_andnot_si128(flat, work_a); + p1 = _mm_and_si128(flat, p1); + p1 = _mm_or_si128(work_a, p1); + + work_a = _mm_loadu_si128((__m128i *)(s - 3 * p)); + p2 = _mm_loadl_epi64((__m128i *)flat_op2); + work_a = _mm_andnot_si128(flat, work_a); + p2 = _mm_and_si128(flat, p2); + p2 = _mm_or_si128(work_a, p2); + + _mm_storel_epi64((__m128i *)(s - 3 * p), p2); + _mm_storel_epi64((__m128i *)(s - 2 * p), p1); + _mm_storel_epi64((__m128i *)(s - 1 * p), p0); + _mm_storel_epi64((__m128i *)(s + 0 * p), q0); + _mm_storel_epi64((__m128i *)(s + 1 * p), q1); + _mm_storel_epi64((__m128i *)(s + 2 * p), q2); + } +} + +void vp9_mbloop_filter_horizontal_edge_16_sse2(uint8_t *s, int p /* pitch */, + const uint8_t *_blimit0, + const uint8_t *_limit0, + const uint8_t *_thresh0, + const uint8_t *_blimit1, + const uint8_t *_limit1, + const uint8_t *_thresh1) { + DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op2, 16); + DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op1, 16); + DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op0, 16); + DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq2, 16); + DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq1, 16); + DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq0, 16); + const __m128i zero = _mm_set1_epi16(0); + const __m128i blimit = + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0), + _mm_load_si128((const __m128i *)_blimit1)); + const 
__m128i limit = + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0), + _mm_load_si128((const __m128i *)_limit1)); + const __m128i thresh = + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0), + _mm_load_si128((const __m128i *)_thresh1)); + + __m128i mask, hev, flat; + __m128i p3, p2, p1, p0, q0, q1, q2, q3; + + p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); + p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); + p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); + p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); + q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); + q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); + q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); { const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), _mm_subs_epu8(p0, p1)); @@ -901,6 +1131,8 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s, __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), _mm_subs_epu8(q1, p1)); __m128i work; + + // filter_mask and hev_mask flat = _mm_max_epu8(abs_p1p0, abs_q1q0); hev = _mm_subs_epu8(flat, thresh); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); @@ -926,6 +1158,7 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s, mask = _mm_subs_epu8(mask, limit); mask = _mm_cmpeq_epi8(mask, zero); + // flat_mask4 work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0), _mm_subs_epu8(p0, p2)), _mm_or_si128(_mm_subs_epu8(q2, q0), @@ -943,7 +1176,9 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s, { const __m128i four = _mm_set1_epi16(4); unsigned char *src = s; - { + int i = 0; + + do { __m128i workp_a, workp_b, workp_shft; p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero); p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero); @@ -958,38 +1193,40 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s, workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0); workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3); workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_op2[0], + _mm_storel_epi64((__m128i *)&flat_op2[i * 8], _mm_packus_epi16(workp_shft, workp_shft)); workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1); workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_op1[0], + _mm_storel_epi64((__m128i *)&flat_op1[i * 8], _mm_packus_epi16(workp_shft, workp_shft)); workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2); workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0); workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_op0[0], + _mm_storel_epi64((__m128i *)&flat_op0[i * 8], _mm_packus_epi16(workp_shft, workp_shft)); workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3); workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0); workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_oq0[0], + _mm_storel_epi64((__m128i *)&flat_oq0[i * 8], _mm_packus_epi16(workp_shft, workp_shft)); workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3); workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1); workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - _mm_storel_epi64((__m128i *)&flat_oq1[0], + _mm_storel_epi64((__m128i *)&flat_oq1[i * 8], _mm_packus_epi16(workp_shft, workp_shft)); workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3); workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2); workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3); - 
_mm_storel_epi64((__m128i *)&flat_oq2[0], + _mm_storel_epi64((__m128i *)&flat_oq2[i * 8], _mm_packus_epi16(workp_shft, workp_shft)); - } + + src += 8; + } while (++i < 2); } // lp filter { @@ -1001,13 +1238,13 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s, const __m128i t1 = _mm_set1_epi8(0x1); const __m128i t7f = _mm_set1_epi8(0x7f); - const __m128i ps1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)), + const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), t80); - const __m128i ps0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)), + const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), t80); - const __m128i qs0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 0 * p)), + const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), t80); - const __m128i qs1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s + 1 * p)), + const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), t80); __m128i filt; __m128i work_a; @@ -1049,47 +1286,186 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s, filt = _mm_andnot_si128(hev, filt); work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); - q0 = _mm_loadl_epi64((__m128i *)flat_oq0); + q0 = _mm_load_si128((__m128i *)flat_oq0); work_a = _mm_andnot_si128(flat, work_a); q0 = _mm_and_si128(flat, q0); q0 = _mm_or_si128(work_a, q0); work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); - q1 = _mm_loadl_epi64((__m128i *)flat_oq1); + q1 = _mm_load_si128((__m128i *)flat_oq1); work_a = _mm_andnot_si128(flat, work_a); q1 = _mm_and_si128(flat, q1); q1 = _mm_or_si128(work_a, q1); work_a = _mm_loadu_si128((__m128i *)(s + 2 * p)); - q2 = _mm_loadl_epi64((__m128i *)flat_oq2); + q2 = _mm_load_si128((__m128i *)flat_oq2); work_a = _mm_andnot_si128(flat, work_a); q2 = _mm_and_si128(flat, q2); q2 = _mm_or_si128(work_a, q2); work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); - p0 = _mm_loadl_epi64((__m128i *)flat_op0); + p0 = _mm_load_si128((__m128i *)flat_op0); work_a = _mm_andnot_si128(flat, work_a); p0 = _mm_and_si128(flat, p0); p0 = _mm_or_si128(work_a, p0); work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); - p1 = _mm_loadl_epi64((__m128i *)flat_op1); + p1 = _mm_load_si128((__m128i *)flat_op1); work_a = _mm_andnot_si128(flat, work_a); p1 = _mm_and_si128(flat, p1); p1 = _mm_or_si128(work_a, p1); work_a = _mm_loadu_si128((__m128i *)(s - 3 * p)); - p2 = _mm_loadl_epi64((__m128i *)flat_op2); + p2 = _mm_load_si128((__m128i *)flat_op2); work_a = _mm_andnot_si128(flat, work_a); p2 = _mm_and_si128(flat, p2); p2 = _mm_or_si128(work_a, p2); - _mm_storel_epi64((__m128i *)(s - 3 * p), p2); - _mm_storel_epi64((__m128i *)(s - 2 * p), p1); - _mm_storel_epi64((__m128i *)(s - 1 * p), p0); - _mm_storel_epi64((__m128i *)(s + 0 * p), q0); - _mm_storel_epi64((__m128i *)(s + 1 * p), q1); - _mm_storel_epi64((__m128i *)(s + 2 * p), q2); + _mm_storeu_si128((__m128i *)(s - 3 * p), p2); + _mm_storeu_si128((__m128i *)(s - 2 * p), p1); + _mm_storeu_si128((__m128i *)(s - 1 * p), p0); + _mm_storeu_si128((__m128i *)(s + 0 * p), q0); + _mm_storeu_si128((__m128i *)(s + 1 * p), q1); + _mm_storeu_si128((__m128i *)(s + 2 * p), q2); + } +} + +void vp9_loop_filter_horizontal_edge_16_sse2(unsigned char *s, + int p, + const unsigned char *_blimit0, + const unsigned char *_limit0, + const unsigned char *_thresh0, + const unsigned char *_blimit1, + const unsigned char *_limit1, + const unsigned char *_thresh1) { + const __m128i blimit = + _mm_unpacklo_epi64(_mm_load_si128((const __m128i 
*)_blimit0), + _mm_load_si128((const __m128i *)_blimit1)); + const __m128i limit = + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0), + _mm_load_si128((const __m128i *)_limit1)); + const __m128i thresh = + _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0), + _mm_load_si128((const __m128i *)_thresh1)); + const __m128i zero = _mm_set1_epi16(0); + __m128i p3, p2, p1, p0, q0, q1, q2, q3; + __m128i mask, hev, flat; + + p3 = _mm_loadu_si128((__m128i *)(s - 4 * p)); + p2 = _mm_loadu_si128((__m128i *)(s - 3 * p)); + p1 = _mm_loadu_si128((__m128i *)(s - 2 * p)); + p0 = _mm_loadu_si128((__m128i *)(s - 1 * p)); + q0 = _mm_loadu_si128((__m128i *)(s - 0 * p)); + q1 = _mm_loadu_si128((__m128i *)(s + 1 * p)); + q2 = _mm_loadu_si128((__m128i *)(s + 2 * p)); + q3 = _mm_loadu_si128((__m128i *)(s + 3 * p)); + + // filter_mask and hev_mask + { + const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0), + _mm_subs_epu8(p0, p1)); + const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0), + _mm_subs_epu8(q0, q1)); + const __m128i fe = _mm_set1_epi8(0xfe); + const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); + __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0), + _mm_subs_epu8(q0, p0)); + __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1), + _mm_subs_epu8(q1, p1)); + __m128i work; + + flat = _mm_max_epu8(abs_p1p0, abs_q1q0); + hev = _mm_subs_epu8(flat, thresh); + hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); + + abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0); + abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); + mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); + mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); + // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; + mask = _mm_max_epu8(flat, mask); + // mask |= (abs(p1 - p0) > limit) * -1; + // mask |= (abs(q1 - q0) > limit) * -1; + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1), + _mm_subs_epu8(p1, p2)), + _mm_or_si128(_mm_subs_epu8(p3, p2), + _mm_subs_epu8(p2, p3))); + mask = _mm_max_epu8(work, mask); + work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1), + _mm_subs_epu8(q1, q2)), + _mm_or_si128(_mm_subs_epu8(q3, q2), + _mm_subs_epu8(q2, q3))); + mask = _mm_max_epu8(work, mask); + mask = _mm_subs_epu8(mask, limit); + mask = _mm_cmpeq_epi8(mask, zero); + } + + // filter4 + { + const __m128i t4 = _mm_set1_epi8(4); + const __m128i t3 = _mm_set1_epi8(3); + const __m128i t80 = _mm_set1_epi8(0x80); + const __m128i te0 = _mm_set1_epi8(0xe0); + const __m128i t1f = _mm_set1_epi8(0x1f); + const __m128i t1 = _mm_set1_epi8(0x1); + const __m128i t7f = _mm_set1_epi8(0x7f); + + const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)), + t80); + const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)), + t80); + const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)), + t80); + const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)), + t80); + __m128i filt; + __m128i work_a; + __m128i filter1, filter2; + + filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev); + work_a = _mm_subs_epi8(qs0, ps0); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + filt = _mm_adds_epi8(filt, work_a); + /* (vp9_filter + 3 * (qs0 - ps0)) & mask */ + filt = _mm_and_si128(filt, mask); + + filter1 = _mm_adds_epi8(filt, t4); + filter2 = _mm_adds_epi8(filt, t3); + + /* Filter1 >> 3 */ + work_a = _mm_cmpgt_epi8(zero, filter1); + filter1 = _mm_srli_epi16(filter1, 3); + work_a = _mm_and_si128(work_a, te0); + filter1 = 
_mm_and_si128(filter1, t1f); + filter1 = _mm_or_si128(filter1, work_a); + + /* Filter2 >> 3 */ + work_a = _mm_cmpgt_epi8(zero, filter2); + filter2 = _mm_srli_epi16(filter2, 3); + work_a = _mm_and_si128(work_a, te0); + filter2 = _mm_and_si128(filter2, t1f); + filter2 = _mm_or_si128(filter2, work_a); + + /* filt >> 1 */ + filt = _mm_adds_epi8(filter1, t1); + work_a = _mm_cmpgt_epi8(zero, filt); + filt = _mm_srli_epi16(filt, 1); + work_a = _mm_and_si128(work_a, t80); + filt = _mm_and_si128(filt, t7f); + filt = _mm_or_si128(filt, work_a); + + filt = _mm_andnot_si128(hev, filt); + + q0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80); + q1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80); + p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80); + p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80); + + _mm_storeu_si128((__m128i *)(s - 2 * p), p1); + _mm_storeu_si128((__m128i *)(s - 1 * p), p0); + _mm_storeu_si128((__m128i *)(s + 0 * p), q0); + _mm_storeu_si128((__m128i *)(s + 1 * p), q1); } } diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index 2dd2bf0..1b38aa1 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -123,6 +123,7 @@ VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve_neon.c VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct16x16_neon.c +VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_16_neon.c VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_avg_neon$(ASM) VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_neon$(ASM) -- 2.7.4
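For reference, a minimal standalone sketch (not part of the diff above) of the threshold handling mentioned in the commit message: because each per-block blimit/limit/thresh value is already stored duplicated across a 16-byte aligned vector, the new 16-wide SSE2 paths can take the low 8 bytes of each block's vector and join them into one 128-bit register with _mm_unpacklo_epi64, instead of broadcasting a scalar as the old code did. The helper name below is hypothetical.

#include <emmintrin.h>
#include <stdint.h>

/* Hypothetical helper (not from the patch): pack the thresholds of two
 * adjacent 8-pixel blocks into one 128-bit register. Each input is assumed
 * to be 16-byte aligned with its single threshold byte already duplicated
 * across the vector (as the loop_filter_thresh tables are), so taking the
 * low 8 bytes of each and joining them with _mm_unpacklo_epi64 yields
 * [block0 x8 | block1 x8] -- the same pattern the new *_16 SSE2 filters
 * use for blimit, limit and thresh. */
static __m128i pack_dual_thresh(const uint8_t *thresh0,
                                const uint8_t *thresh1) {
  const __m128i t0 = _mm_load_si128((const __m128i *)thresh0);
  const __m128i t1 = _mm_load_si128((const __m128i *)thresh1);
  return _mm_unpacklo_epi64(t0, t1);
}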