From: Scott LaVarnway Date: Wed, 6 Sep 2017 17:08:03 +0000 (-0700) Subject: vpxdsp: [x86] add highbd_d207_predictor functions X-Git-Tag: v1.7.0~177^2 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=d6c9bbc2b6f2b15495c3e1fcc86feba23b27dc08;p=platform%2Fupstream%2Flibvpx.git vpxdsp: [x86] add highbd_d207_predictor functions C vs SSE2 speed gains: _4x4 : ~2.31x C vs SSSE3 speed gains: _8x8 : ~4.73x _16x16 : ~10.88x _32x32 : ~4.80x BUG=webm:1411 Change-Id: I0bac29db261079181ddabc6814bd62c463109caf --- diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc index cbc1a8c..b92fc2b 100644 --- a/test/test_intra_pred_speed.cc +++ b/test/test_intra_pred_speed.cc @@ -480,14 +480,12 @@ HIGHBD_INTRA_PRED_TEST( vpx_highbd_d63_predictor_32x32_c, vpx_highbd_tm_predictor_32x32_c) #if HAVE_SSE2 -HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred4, - vpx_highbd_dc_predictor_4x4_sse2, - vpx_highbd_dc_left_predictor_4x4_sse2, - vpx_highbd_dc_top_predictor_4x4_sse2, - vpx_highbd_dc_128_predictor_4x4_sse2, - vpx_highbd_v_predictor_4x4_sse2, - vpx_highbd_h_predictor_4x4_sse2, NULL, NULL, NULL, NULL, - NULL, NULL, vpx_highbd_tm_predictor_4x4_c) +HIGHBD_INTRA_PRED_TEST( + SSE2, TestHighbdIntraPred4, vpx_highbd_dc_predictor_4x4_sse2, + vpx_highbd_dc_left_predictor_4x4_sse2, vpx_highbd_dc_top_predictor_4x4_sse2, + vpx_highbd_dc_128_predictor_4x4_sse2, vpx_highbd_v_predictor_4x4_sse2, + vpx_highbd_h_predictor_4x4_sse2, NULL, NULL, NULL, NULL, + vpx_highbd_d207_predictor_4x4_sse2, NULL, vpx_highbd_tm_predictor_4x4_c) HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred8, vpx_highbd_dc_predictor_8x8_sse2, @@ -517,6 +515,20 @@ HIGHBD_INTRA_PRED_TEST(SSE2, TestHighbdIntraPred32, NULL, NULL, NULL, vpx_highbd_tm_predictor_32x32_sse2) #endif // HAVE_SSE2 +#if HAVE_SSSE3 +HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred4, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL) +HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred8, NULL, NULL, NULL, 
NULL, + NULL, NULL, NULL, NULL, NULL, NULL, + vpx_highbd_d207_predictor_8x8_ssse3, NULL, NULL) +HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred16, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, + vpx_highbd_d207_predictor_16x16_ssse3, NULL, NULL) +HIGHBD_INTRA_PRED_TEST(SSSE3, TestHighbdIntraPred32, NULL, NULL, NULL, NULL, + NULL, NULL, NULL, NULL, NULL, NULL, + vpx_highbd_d207_predictor_32x32_ssse3, NULL, NULL) +#endif // HAVE_SSSE3 + #if HAVE_NEON HIGHBD_INTRA_PRED_TEST( NEON, TestHighbdIntraPred4, vpx_highbd_dc_predictor_4x4_neon, diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc index 96985bd..649f501 100644 --- a/test/vp9_intrapred_test.cc +++ b/test/vp9_intrapred_test.cc @@ -467,6 +467,38 @@ TEST_P(VP9HighbdIntraPredTest, HighbdIntraPredTests) { RunTest(left_col, above_data, dst, ref_dst); } +#if HAVE_SSSE3 +INSTANTIATE_TEST_CASE_P( + SSSE3_TO_C_8, VP9HighbdIntraPredTest, + ::testing::Values( + HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_ssse3, + &vpx_highbd_d207_predictor_8x8_c, 8, 8), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_ssse3, + &vpx_highbd_d207_predictor_16x16_c, 16, 8), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_ssse3, + &vpx_highbd_d207_predictor_32x32_c, 32, 8))); + +INSTANTIATE_TEST_CASE_P( + SSSE3_TO_C_10, VP9HighbdIntraPredTest, + ::testing::Values( + HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_ssse3, + &vpx_highbd_d207_predictor_8x8_c, 8, 10), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_ssse3, + &vpx_highbd_d207_predictor_16x16_c, 16, 10), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_ssse3, + &vpx_highbd_d207_predictor_32x32_c, 32, 10))); + +INSTANTIATE_TEST_CASE_P( + SSSE3_TO_C_12, VP9HighbdIntraPredTest, + ::testing::Values( + HighbdIntraPredParam(&vpx_highbd_d207_predictor_8x8_ssse3, + &vpx_highbd_d207_predictor_8x8_c, 8, 12), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_16x16_ssse3, + &vpx_highbd_d207_predictor_16x16_c, 16, 12), + 
HighbdIntraPredParam(&vpx_highbd_d207_predictor_32x32_ssse3, + &vpx_highbd_d207_predictor_32x32_c, 32, 12))); +#endif // HAVE_SSSE3 + #if HAVE_SSE2 INSTANTIATE_TEST_CASE_P( SSE2_TO_C_8, VP9HighbdIntraPredTest, @@ -479,6 +511,8 @@ INSTANTIATE_TEST_CASE_P( &vpx_highbd_dc_128_predictor_16x16_c, 16, 8), HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_32x32_sse2, &vpx_highbd_dc_128_predictor_32x32_c, 32, 8), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_sse2, + &vpx_highbd_d207_predictor_4x4_c, 4, 8), HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_4x4_sse2, &vpx_highbd_dc_left_predictor_4x4_c, 4, 8), HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_8x8_sse2, @@ -539,6 +573,8 @@ INSTANTIATE_TEST_CASE_P( &vpx_highbd_dc_128_predictor_16x16_c, 16, 10), HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_32x32_sse2, &vpx_highbd_dc_128_predictor_32x32_c, 32, 10), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_sse2, + &vpx_highbd_d207_predictor_4x4_c, 4, 10), HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_4x4_sse2, &vpx_highbd_dc_left_predictor_4x4_c, 4, 10), HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_8x8_sse2, @@ -599,6 +635,8 @@ INSTANTIATE_TEST_CASE_P( &vpx_highbd_dc_128_predictor_16x16_c, 16, 12), HighbdIntraPredParam(&vpx_highbd_dc_128_predictor_32x32_sse2, &vpx_highbd_dc_128_predictor_32x32_c, 32, 12), + HighbdIntraPredParam(&vpx_highbd_d207_predictor_4x4_sse2, + &vpx_highbd_d207_predictor_4x4_c, 4, 12), HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_4x4_sse2, &vpx_highbd_dc_left_predictor_4x4_c, 4, 12), HighbdIntraPredParam(&vpx_highbd_dc_left_predictor_8x8_sse2, diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 74f1135..275a367 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -57,6 +57,7 @@ ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE) += x86/highbd_intrapred_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_sse2.asm DSP_SRCS-$(HAVE_SSE2) += x86/highbd_intrapred_intrin_sse2.c 
+DSP_SRCS-$(HAVE_SSSE3) += x86/highbd_intrapred_intrin_ssse3.c DSP_SRCS-$(HAVE_NEON) += arm/highbd_intrapred_neon.c endif # CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index acf7f32..8606e89 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -189,6 +189,7 @@ specialize qw/vpx_dc_128_predictor_32x32 msa neon sse2 vsx/; # High bitdepth functions if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_d207_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d207_predictor_4x4 sse2/; add_proto qw/void vpx_highbd_d45_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d45_predictor_4x4 neon/; @@ -224,6 +225,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_highbd_dc_128_predictor_4x4 neon sse2/; add_proto qw/void vpx_highbd_d207_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d207_predictor_8x8 ssse3/; add_proto qw/void vpx_highbd_d45_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d45_predictor_8x8 neon/; @@ -259,6 +261,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vpx_highbd_dc_128_predictor_8x8 neon sse2/; add_proto qw/void vpx_highbd_d207_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d207_predictor_16x16 ssse3/; add_proto qw/void vpx_highbd_d45_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d45_predictor_16x16 neon/; @@ -294,6 +297,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize 
qw/vpx_highbd_dc_128_predictor_16x16 neon sse2/; add_proto qw/void vpx_highbd_d207_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; + specialize qw/vpx_highbd_d207_predictor_32x32 ssse3/; add_proto qw/void vpx_highbd_d45_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; specialize qw/vpx_highbd_d45_predictor_32x32 neon/; diff --git a/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c b/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c index 83113a2..06f188d 100644 --- a/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c +++ b/vpx_dsp/x86/highbd_intrapred_intrin_sse2.c @@ -371,3 +371,49 @@ void vpx_highbd_dc_128_predictor_32x32_sse2(uint16_t *dst, ptrdiff_t stride, (void)left; dc_store_32x32(dst, stride, &dc_dup); } + +// ----------------------------------------------------------------------------- +/* +; ------------------------------------------ +; input: x, y, z, result +; +; trick from pascal +; (x+2y+z+2)>>2 can be calculated as: +; result = avg(x,z) +; result -= xor(x,z) & 1 +; result = avg(result,y) +; ------------------------------------------ +*/ +static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y, + const __m128i *z) { + const __m128i one = _mm_set1_epi16(1); + const __m128i a = _mm_avg_epu16(*x, *z); + const __m128i b = + _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one)); + return _mm_avg_epu16(b, *y); +} + +void vpx_highbd_d207_predictor_4x4_sse2(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i IJKL0000 = _mm_load_si128((const __m128i *)left); + const __m128i LLLL0000 = _mm_shufflelo_epi16(IJKL0000, 0xff); + const __m128i IJKLLLLL = _mm_unpacklo_epi64(IJKL0000, LLLL0000); + const __m128i JKLLLLL0 = _mm_srli_si128(IJKLLLLL, 2); + const __m128i KLLLLL00 = _mm_srli_si128(IJKLLLLL, 4); + const __m128i avg3 = avg3_epu16(&IJKLLLLL, &JKLLLLL0, &KLLLLL00); + const __m128i avg2 = 
_mm_avg_epu16(IJKLLLLL, JKLLLLL0); + const __m128i row0 = _mm_unpacklo_epi16(avg2, avg3); + const __m128i row1 = _mm_srli_si128(row0, 4); + const __m128i row2 = _mm_srli_si128(row0, 8); + const __m128i row3 = LLLL0000; + (void)above; + (void)bd; + _mm_storel_epi64((__m128i *)dst, row0); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row1); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row2); + dst += stride; + _mm_storel_epi64((__m128i *)dst, row3); +} diff --git a/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c b/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c new file mode 100644 index 0000000..539af34 --- /dev/null +++ b/vpx_dsp/x86/highbd_intrapred_intrin_ssse3.c @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <tmmintrin.h> + +#include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" +#include "vpx/vpx_integer.h" + +// ----------------------------------------------------------------------------- +/* +; ------------------------------------------ +; input: x, y, z, result +; +; trick from pascal +; (x+2y+z+2)>>2 can be calculated as: +; result = avg(x,z) +; result -= xor(x,z) & 1 +; result = avg(result,y) +; ------------------------------------------ +*/ +static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y, + const __m128i *z) { + const __m128i one = _mm_set1_epi16(1); + const __m128i a = _mm_avg_epu16(*x, *z); + const __m128i b = + _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one)); + return _mm_avg_epu16(b, *y); +} + +static INLINE void d207_store_4x8(uint16_t **dst, const ptrdiff_t stride, + const __m128i *a, const __m128i *b) { + _mm_store_si128((__m128i *)*dst, *a); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 4)); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 8)); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 12)); + *dst += stride; +} + +void vpx_highbd_d207_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)left); + const __m128i ABCDHHHH = _mm_shufflehi_epi16(ABCDEFGH, 0xff); + const __m128i HHHHHHHH = _mm_unpackhi_epi64(ABCDHHHH, ABCDHHHH); + const __m128i BCDEFGHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 2); + const __m128i CDEFGHHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 4); + const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGHH, &CDEFGHHH); + const __m128i avg2 = _mm_avg_epu16(ABCDEFGH, BCDEFGHH); + const __m128i out_a = _mm_unpacklo_epi16(avg2, avg3); + const __m128i out_b = _mm_unpackhi_epi16(avg2, avg3); + (void)above; + (void)bd; + d207_store_4x8(&dst, stride, &out_a, &out_b); + d207_store_4x8(&dst, stride, &out_b, 
&HHHHHHHH); +} + +static INLINE void d207_store_4x16(uint16_t **dst, const ptrdiff_t stride, + const __m128i *a, const __m128i *b, + const __m128i *c) { + _mm_store_si128((__m128i *)*dst, *a); + _mm_store_si128((__m128i *)(*dst + 8), *b); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 4)); + _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 4)); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 8)); + _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 8)); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 12)); + _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 12)); + *dst += stride; +} + +void vpx_highbd_d207_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_load_si128((const __m128i *)left); + const __m128i A1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i LR0 = _mm_shufflehi_epi16(A1, 0xff); + const __m128i LR = _mm_unpackhi_epi64(LR0, LR0); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_alignr_epi8(LR, A1, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const __m128i C1 = _mm_alignr_epi8(LR, A1, 4); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i avg2_0 = _mm_avg_epu16(A0, B0); + const __m128i avg2_1 = _mm_avg_epu16(A1, B1); + const __m128i out_a = _mm_unpacklo_epi16(avg2_0, avg3_0); + const __m128i out_b = _mm_unpackhi_epi16(avg2_0, avg3_0); + const __m128i out_c = _mm_unpacklo_epi16(avg2_1, avg3_1); + const __m128i out_d = _mm_unpackhi_epi16(avg2_1, avg3_1); + (void)above; + (void)bd; + d207_store_4x16(&dst, stride, &out_a, &out_b, &out_c); + d207_store_4x16(&dst, stride, &out_b, &out_c, &out_d); + d207_store_4x16(&dst, stride, &out_c, &out_d, &LR); + d207_store_4x16(&dst, stride, &out_d, &LR, &LR); +} + +static INLINE void 
d207_store_4x32(uint16_t **dst, const ptrdiff_t stride, + const __m128i *a, const __m128i *b, + const __m128i *c, const __m128i *d, + const __m128i *e) { + _mm_store_si128((__m128i *)*dst, *a); + _mm_store_si128((__m128i *)(*dst + 8), *b); + _mm_store_si128((__m128i *)(*dst + 16), *c); + _mm_store_si128((__m128i *)(*dst + 24), *d); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 4)); + _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 4)); + _mm_store_si128((__m128i *)(*dst + 16), _mm_alignr_epi8(*d, *c, 4)); + _mm_store_si128((__m128i *)(*dst + 24), _mm_alignr_epi8(*e, *d, 4)); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 8)); + _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 8)); + _mm_store_si128((__m128i *)(*dst + 16), _mm_alignr_epi8(*d, *c, 8)); + _mm_store_si128((__m128i *)(*dst + 24), _mm_alignr_epi8(*e, *d, 8)); + *dst += stride; + _mm_store_si128((__m128i *)*dst, _mm_alignr_epi8(*b, *a, 12)); + _mm_store_si128((__m128i *)(*dst + 8), _mm_alignr_epi8(*c, *b, 12)); + _mm_store_si128((__m128i *)(*dst + 16), _mm_alignr_epi8(*d, *c, 12)); + _mm_store_si128((__m128i *)(*dst + 24), _mm_alignr_epi8(*e, *d, 12)); + *dst += stride; +} + +void vpx_highbd_d207_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, + const uint16_t *left, int bd) { + const __m128i A0 = _mm_load_si128((const __m128i *)left); + const __m128i A1 = _mm_load_si128((const __m128i *)(left + 8)); + const __m128i A2 = _mm_load_si128((const __m128i *)(left + 16)); + const __m128i A3 = _mm_load_si128((const __m128i *)(left + 24)); + const __m128i LR0 = _mm_shufflehi_epi16(A3, 0xff); + const __m128i LR = _mm_unpackhi_epi64(LR0, LR0); + const __m128i B0 = _mm_alignr_epi8(A1, A0, 2); + const __m128i B1 = _mm_alignr_epi8(A2, A1, 2); + const __m128i B2 = _mm_alignr_epi8(A3, A2, 2); + const __m128i B3 = _mm_alignr_epi8(LR, A3, 2); + const __m128i C0 = _mm_alignr_epi8(A1, A0, 4); + const 
__m128i C1 = _mm_alignr_epi8(A2, A1, 4); + const __m128i C2 = _mm_alignr_epi8(A3, A2, 4); + const __m128i C3 = _mm_alignr_epi8(LR, A3, 4); + const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); + const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); + const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2); + const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3); + const __m128i avg2_0 = _mm_avg_epu16(A0, B0); + const __m128i avg2_1 = _mm_avg_epu16(A1, B1); + const __m128i avg2_2 = _mm_avg_epu16(A2, B2); + const __m128i avg2_3 = _mm_avg_epu16(A3, B3); + const __m128i out_a = _mm_unpacklo_epi16(avg2_0, avg3_0); + const __m128i out_b = _mm_unpackhi_epi16(avg2_0, avg3_0); + const __m128i out_c = _mm_unpacklo_epi16(avg2_1, avg3_1); + const __m128i out_d = _mm_unpackhi_epi16(avg2_1, avg3_1); + const __m128i out_e = _mm_unpacklo_epi16(avg2_2, avg3_2); + const __m128i out_f = _mm_unpackhi_epi16(avg2_2, avg3_2); + const __m128i out_g = _mm_unpacklo_epi16(avg2_3, avg3_3); + const __m128i out_h = _mm_unpackhi_epi16(avg2_3, avg3_3); + (void)above; + (void)bd; + d207_store_4x32(&dst, stride, &out_a, &out_b, &out_c, &out_d, &out_e); + d207_store_4x32(&dst, stride, &out_b, &out_c, &out_d, &out_e, &out_f); + d207_store_4x32(&dst, stride, &out_c, &out_d, &out_e, &out_f, &out_g); + d207_store_4x32(&dst, stride, &out_d, &out_e, &out_f, &out_g, &out_h); + d207_store_4x32(&dst, stride, &out_e, &out_f, &out_g, &out_h, &LR); + d207_store_4x32(&dst, stride, &out_f, &out_g, &out_h, &LR, &LR); + d207_store_4x32(&dst, stride, &out_g, &out_h, &LR, &LR, &LR); + d207_store_4x32(&dst, stride, &out_h, &LR, &LR, &LR, &LR); +}