From c167345ffb4ff1ad0d999b235eecc35a16dc1a70 Mon Sep 17 00:00:00 2001 From: Linfeng Zhang Date: Wed, 17 May 2017 12:37:23 -0700 Subject: [PATCH] Add vpx_highbd_idct{4x4,8x8,16x16}_1_add_sse2 BUG=webm:1412 Change-Id: Ia338a6057d36f9ed7eaa9cbd4dfbf0c3cbdc6468 --- test/partial_idct_test.cc | 21 +++++++++++++++++++ vpx_dsp/vpx_dsp_rtcd_defs.pl | 6 +++--- vpx_dsp/x86/highbd_idct16x16_add_sse2.c | 5 +++++ vpx_dsp/x86/highbd_idct32x32_add_sse2.c | 25 ++-------------------- vpx_dsp/x86/highbd_idct4x4_add_sse2.c | 23 ++++++++++++++++++++ vpx_dsp/x86/highbd_idct8x8_add_sse2.c | 5 +++++ vpx_dsp/x86/highbd_inv_txfm_sse2.h | 37 +++++++++++++++++++++++++++++++++ 7 files changed, 96 insertions(+), 26 deletions(-) diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc index da93167..740d7e2 100644 --- a/test/partial_idct_test.cc +++ b/test/partial_idct_test.cc @@ -650,6 +650,15 @@ const PartialInvTxfmParam sse2_partial_idct_tests[] = { make_tuple( &vpx_highbd_fdct16x16_c, &highbd_wrapper, &highbd_wrapper, TX_16X16, 10, 12, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 1, 8, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 1, 10, 2), + make_tuple( + &vpx_highbd_fdct16x16_c, &highbd_wrapper, + &highbd_wrapper, TX_16X16, 1, 12, 2), make_tuple(&vpx_highbd_fdct8x8_c, &highbd_wrapper, &highbd_wrapper, TX_8X8, 64, 8, 2), @@ -668,6 +677,12 @@ const PartialInvTxfmParam sse2_partial_idct_tests[] = { make_tuple( &vpx_highbd_fdct8x8_c, &highbd_wrapper, &highbd_wrapper, TX_8X8, 12, 12, 2), + make_tuple(&vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 1, 8, 2), + make_tuple(&vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 1, 10, 2), + make_tuple(&vpx_highbd_fdct8x8_c, &highbd_wrapper, + &highbd_wrapper, TX_8X8, 1, 12, 2), make_tuple(&vpx_highbd_fdct4x4_c, &highbd_wrapper, &highbd_wrapper, TX_4X4, 16, 8, 2), @@ -677,6 +692,12 @@ const PartialInvTxfmParam sse2_partial_idct_tests[] = { make_tuple( &vpx_highbd_fdct4x4_c, &highbd_wrapper, &highbd_wrapper, TX_4X4, 16, 12, 2), + make_tuple(&vpx_highbd_fdct4x4_c, &highbd_wrapper, + &highbd_wrapper, TX_4X4, 1, 8, 2), + make_tuple(&vpx_highbd_fdct4x4_c, &highbd_wrapper, + &highbd_wrapper, TX_4X4, 1, 10, 2), + make_tuple(&vpx_highbd_fdct4x4_c, &highbd_wrapper, + &highbd_wrapper, TX_4X4, 1, 12, 2), #endif // CONFIG_VP9_HIGHBITDEPTH make_tuple(&vpx_fdct32x32_c, &wrapper, &wrapper, TX_32X32, 1024, 8, 1), diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 1e11bdb..c674836 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -629,18 +629,18 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vpx_highbd_idct4x4_16_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; add_proto qw/void vpx_highbd_idct4x4_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; - specialize qw/vpx_highbd_idct4x4_1_add neon/; + specialize qw/vpx_highbd_idct4x4_1_add neon sse2/; add_proto qw/void vpx_highbd_idct8x8_64_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; add_proto qw/void vpx_highbd_idct8x8_12_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; add_proto qw/void vpx_highbd_idct8x8_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; - specialize qw/vpx_highbd_idct8x8_1_add neon/; + specialize qw/vpx_highbd_idct8x8_1_add neon sse2/; add_proto qw/void vpx_highbd_idct16x16_256_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; add_proto qw/void vpx_highbd_idct16x16_38_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; add_proto qw/void vpx_highbd_idct16x16_10_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; add_proto qw/void vpx_highbd_idct16x16_1_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; - specialize qw/vpx_highbd_idct16x16_1_add neon/; + specialize qw/vpx_highbd_idct16x16_1_add neon sse2/; add_proto qw/void vpx_highbd_idct32x32_1024_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; add_proto qw/void vpx_highbd_idct32x32_135_add/, "const tran_low_t *input, uint16_t *dest, int stride, int bd"; diff --git a/vpx_dsp/x86/highbd_idct16x16_add_sse2.c b/vpx_dsp/x86/highbd_idct16x16_add_sse2.c index f16e4d0..a2412d1 100644 --- a/vpx_dsp/x86/highbd_idct16x16_add_sse2.c +++ b/vpx_dsp/x86/highbd_idct16x16_add_sse2.c @@ -242,3 +242,8 @@ void vpx_highbd_idct16x16_10_add_sse2(const tran_low_t *input, uint16_t *dest, } } } + +void vpx_highbd_idct16x16_1_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + highbd_idct_1_add_kernel(input, dest, stride, bd, 16); +} diff --git a/vpx_dsp/x86/highbd_idct32x32_add_sse2.c b/vpx_dsp/x86/highbd_idct32x32_add_sse2.c index bc9debf..06f2659 100644 --- a/vpx_dsp/x86/highbd_idct32x32_add_sse2.c +++ b/vpx_dsp/x86/highbd_idct32x32_add_sse2.c @@ -9,33 +9,12 @@ */ #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/highbd_inv_txfm_sse2.h" #include "vpx_dsp/x86/inv_txfm_sse2.h" #include "vpx_dsp/x86/transpose_sse2.h" #include "vpx_dsp/x86/txfm_common_sse2.h" void vpx_highbd_idct32x32_1_add_sse2(const tran_low_t *input, uint16_t *dest, int stride, int bd) { - __m128i dc_value, d; - const __m128i zero = _mm_setzero_si128(); - const __m128i one = _mm_set1_epi16(1); - const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); - int a, i, j; - tran_low_t out; - - out = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); - out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); - a = ROUND_POWER_OF_TWO(out, 6); - - d = _mm_set1_epi32(a); - dc_value = _mm_packs_epi32(d, d); - for (i = 0; i < 32; ++i) { - for (j = 0; j < 4; ++j) { - d = _mm_loadu_si128((const __m128i *)(&dest[j * 8])); - d = _mm_adds_epi16(d, dc_value); - d = _mm_max_epi16(d, zero); - d = _mm_min_epi16(d, max); - _mm_storeu_si128((__m128i *)(&dest[j * 8]), d); - } - dest += stride; - } + highbd_idct_1_add_kernel(input, dest, stride, bd, 32); } diff --git a/vpx_dsp/x86/highbd_idct4x4_add_sse2.c b/vpx_dsp/x86/highbd_idct4x4_add_sse2.c index 3949ce9..89a2584 100644 --- a/vpx_dsp/x86/highbd_idct4x4_add_sse2.c +++ b/vpx_dsp/x86/highbd_idct4x4_add_sse2.c @@ -127,3 +127,26 @@ void vpx_highbd_idct4x4_16_add_sse2(const tran_low_t *input, uint16_t *dest, } } } + +void vpx_highbd_idct4x4_1_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + const __m128i zero = _mm_setzero_si128(); + // Faster than _mm_set1_epi16((1 << bd) - 1). + const __m128i one = _mm_set1_epi16(1); + const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); + int a1, i; + tran_low_t out; + __m128i dc, d; + + out = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); + out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); + a1 = ROUND_POWER_OF_TWO(out, 4); + dc = _mm_set1_epi16(a1); + + for (i = 0; i < 4; ++i) { + d = _mm_loadl_epi64((const __m128i *)dest); + d = add_dc_clamp(&zero, &max, &dc, &d); + _mm_storel_epi64((__m128i *)dest, d); + dest += stride; + } +} diff --git a/vpx_dsp/x86/highbd_idct8x8_add_sse2.c b/vpx_dsp/x86/highbd_idct8x8_add_sse2.c index 6a2e180..29cc1d3 100644 --- a/vpx_dsp/x86/highbd_idct8x8_add_sse2.c +++ b/vpx_dsp/x86/highbd_idct8x8_add_sse2.c @@ -214,3 +214,8 @@ void vpx_highbd_idct8x8_12_add_sse2(const tran_low_t *input, uint16_t *dest, } } } + +void vpx_highbd_idct8x8_1_add_sse2(const tran_low_t *input, uint16_t *dest, + int stride, int bd) { + highbd_idct_1_add_kernel(input, dest, stride, bd, 8); +} diff --git a/vpx_dsp/x86/highbd_inv_txfm_sse2.h b/vpx_dsp/x86/highbd_inv_txfm_sse2.h index 774cce1..ea100c6 100644 --- a/vpx_dsp/x86/highbd_inv_txfm_sse2.h +++ b/vpx_dsp/x86/highbd_inv_txfm_sse2.h @@ -17,6 +17,43 @@ #include "vpx_dsp/inv_txfm.h" #include "vpx_dsp/x86/txfm_common_sse2.h" +static INLINE __m128i add_dc_clamp(const __m128i *const min, + const __m128i *const max, + const __m128i *const dc, + const __m128i *const in) { + __m128i out; + out = _mm_adds_epi16(*in, *dc); + out = _mm_max_epi16(out, *min); + out = _mm_min_epi16(out, *max); + return out; +} + +static INLINE void highbd_idct_1_add_kernel(const tran_low_t *input, + uint16_t *dest, int stride, int bd, + const int size) { + const __m128i zero = _mm_setzero_si128(); + // Faster than _mm_set1_epi16((1 << bd) - 1). + const __m128i one = _mm_set1_epi16(1); + const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one); + int a1, i, j; + tran_low_t out; + __m128i dc, d; + + out = HIGHBD_WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64), bd); + out = HIGHBD_WRAPLOW(dct_const_round_shift(out * cospi_16_64), bd); + a1 = ROUND_POWER_OF_TWO(out, (size == 8) ? 5 : 6); + dc = _mm_set1_epi16(a1); + + for (i = 0; i < size; ++i) { + for (j = 0; j < (size >> 3); ++j) { + d = _mm_load_si128((const __m128i *)(&dest[j * 8])); + d = add_dc_clamp(&zero, &max, &dc, &d); + _mm_store_si128((__m128i *)(&dest[j * 8]), d); + } + dest += stride; + } +} + static INLINE __m128i clamp_high_sse2(__m128i value, int bd) { __m128i ubounded, retval; const __m128i zero = _mm_set1_epi16(0); -- 2.7.4