From 0084e61d5f935e763c29a49094377d4ab64577e5 Mon Sep 17 00:00:00 2001 From: Jingning Han Date: Fri, 21 Jun 2013 15:56:24 -0700 Subject: [PATCH] Tune the rounding operations in 8x8 ADST/DCT sse2 Improve the round-trip precision to meet the unit test settings. Change-Id: I303febae56b4b990ea3798b8ebed94c0510ecf79 --- test/fdct8x8_test.cc | 6 ++++++ vp9/encoder/x86/vp9_dct_sse2.c | 18 ++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc index 8b9cff5..03301a3 100644 --- a/test/fdct8x8_test.cc +++ b/test/fdct8x8_test.cc @@ -33,7 +33,13 @@ void idct8x8_add(int16_t *in, int16_t *out, uint8_t *dst, vp9_short_idct8x8_add_c(out, dst, stride >> 1); } void fht8x8(int16_t *in, int16_t *out, uint8_t *dst, int stride, int tx_type) { + // TODO(jingning): need to refactor this to test both _c and _sse2 functions, + // when we have all inverse dct functions done sse2. +#if HAVE_SSE2 + vp9_short_fht8x8_sse2(in, out, stride >> 1, tx_type); +#else vp9_short_fht8x8_c(in, out, stride >> 1, tx_type); +#endif } void iht8x8_add(int16_t *in, int16_t *out, uint8_t *dst, int stride, int tx_type) { diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_sse2.c index f11200f..484afce 100644 --- a/vp9/encoder/x86/vp9_dct_sse2.c +++ b/vp9/encoder/x86/vp9_dct_sse2.c @@ -397,6 +397,24 @@ static INLINE void load_buffer_8x8(int16_t *input, __m128i in[8], int stride) { // write 8x8 array static INLINE void write_buffer_8x8(int16_t *output, __m128i res[8]) { + __m128i sign0 = _mm_srai_epi16(res[0], 15); + __m128i sign1 = _mm_srai_epi16(res[1], 15); + __m128i sign2 = _mm_srai_epi16(res[2], 15); + __m128i sign3 = _mm_srai_epi16(res[3], 15); + __m128i sign4 = _mm_srai_epi16(res[4], 15); + __m128i sign5 = _mm_srai_epi16(res[5], 15); + __m128i sign6 = _mm_srai_epi16(res[6], 15); + __m128i sign7 = _mm_srai_epi16(res[7], 15); + + res[0] = _mm_sub_epi16(res[0], sign0); + res[1] = _mm_sub_epi16(res[1], sign1); + res[2] = 
_mm_sub_epi16(res[2], sign2); + res[3] = _mm_sub_epi16(res[3], sign3); + res[4] = _mm_sub_epi16(res[4], sign4); + res[5] = _mm_sub_epi16(res[5], sign5); + res[6] = _mm_sub_epi16(res[6], sign6); + res[7] = _mm_sub_epi16(res[7], sign7); + res[0] = _mm_srai_epi16(res[0], 1); res[1] = _mm_srai_epi16(res[1], 1); res[2] = _mm_srai_epi16(res[2], 1); -- 2.7.4