From 5a51d961f2432e13ac2dc97ab75f5e56cab6c6ae Mon Sep 17 00:00:00 2001 From: chiyotsai Date: Mon, 29 Oct 2018 16:12:05 -0700 Subject: [PATCH] Add SSE2 support for hbd 4-tap interpolation filter. Unit test performance on bitdepth 10: | 4X4 | 8X8 |16X16|64X64| 2D |1.582|1.461|1.425|1.572| HORZ|1.643|1.247|1.346|1.345| VERT|1.378|1.695|2.020|1.763| Unit test performance on bitdepth 12: | 4X4 | 8X8 |16X16|64X64| 2D |1.578|1.409|1.426|1.497| HORZ|1.625|1.153|1.323|1.259| VERT|1.392|1.707|2.030|1.787| Change-Id: I6df85330ac33fcb17d46e4302b41415dda1219f5 --- vpx_dsp/x86/convolve_sse2.h | 43 ++- vpx_dsp/x86/vpx_asm_stubs.c | 12 +- vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c | 511 +++++++++++++++++++++++++---- vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c | 4 +- vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c | 24 +- 5 files changed, 506 insertions(+), 88 deletions(-) diff --git a/vpx_dsp/x86/convolve_sse2.h b/vpx_dsp/x86/convolve_sse2.h index 81fae29..8443546 100644 --- a/vpx_dsp/x86/convolve_sse2.h +++ b/vpx_dsp/x86/convolve_sse2.h @@ -32,10 +32,10 @@ static INLINE __m128i extract_quarter_3_epi16_sse2(const __m128i *const reg) { // Interprets src as 8-bit words, zero extends to form 16-bit words, then // multiplies with ker and add the adjacent results to form 32-bit words. // Finally adds the result from 1 and 2 together. -static INLINE __m128i multiply_add_epi8_sse2(const __m128i *const src_1, - const __m128i *const src_2, - const __m128i *const ker_1, - const __m128i *const ker_2) { +static INLINE __m128i mm_madd_add_epi8_sse2(const __m128i *const src_1, + const __m128i *const src_2, + const __m128i *const ker_1, + const __m128i *const ker_2) { const __m128i src_1_half = _mm_unpacklo_epi8(*src_1, _mm_setzero_si128()); const __m128i src_2_half = _mm_unpacklo_epi8(*src_2, _mm_setzero_si128()); const __m128i madd_1 = _mm_madd_epi16(src_1_half, *ker_1); @@ -43,25 +43,44 @@ static INLINE __m128i multiply_add_epi8_sse2(const __m128i *const src_1, return _mm_add_epi32(madd_1, madd_2); } -static INLINE __m128i multiply_add_packs_epi16_sse2(const __m128i *const src_0, - const __m128i *const src_1, - const __m128i *const ker) { +// Interprets src as 16-bit words, then multiplies with ker and add the +// adjacent results to form 32-bit words. Finally adds the result from 1 and 2 +// together. +static INLINE __m128i mm_madd_add_epi16_sse2(const __m128i *const src_1, + const __m128i *const src_2, + const __m128i *const ker_1, + const __m128i *const ker_2) { + const __m128i madd_1 = _mm_madd_epi16(*src_1, *ker_1); + const __m128i madd_2 = _mm_madd_epi16(*src_2, *ker_2); + return _mm_add_epi32(madd_1, madd_2); +} + +static INLINE __m128i mm_madd_packs_epi16_sse2(const __m128i *const src_0, + const __m128i *const src_1, + const __m128i *const ker) { const __m128i madd_1 = _mm_madd_epi16(*src_0, *ker); const __m128i madd_2 = _mm_madd_epi16(*src_1, *ker); return _mm_packs_epi32(madd_1, madd_2); } // Interleaves src_1 and src_2 -static INLINE __m128i combine_epi32_sse2(const __m128i *const src_1, - const __m128i *const src_2) { +static INLINE __m128i mm_zip_epi32_sse2(const __m128i *const src_1, + const __m128i *const src_2) { const __m128i tmp_1 = _mm_unpacklo_epi32(*src_1, *src_2); const __m128i tmp_2 = _mm_unpackhi_epi32(*src_1, *src_2); return _mm_packs_epi32(tmp_1, tmp_2); } -static INLINE __m128i round_epi16_sse2(const __m128i *const src, - const __m128i *const half_depth, - const int depth) { +static INLINE __m128i mm_round_epi32_sse2(const __m128i *const src, + const __m128i *const half_depth, + const int depth) { + const __m128i nearest_src = _mm_add_epi32(*src, *half_depth); + return _mm_srai_epi32(nearest_src, depth); +} + +static INLINE __m128i mm_round_epi16_sse2(const __m128i *const src, + const __m128i *const half_depth, + const int depth) { const __m128i nearest_src = _mm_adds_epi16(*src, *half_depth); return _mm_srai_epi16(nearest_src, depth); } diff --git a/vpx_dsp/x86/vpx_asm_stubs.c b/vpx_dsp/x86/vpx_asm_stubs.c index 12194a6..9d6f837 100644 --- a/vpx_dsp/x86/vpx_asm_stubs.c +++ b/vpx_dsp/x86/vpx_asm_stubs.c @@ -104,12 +104,12 @@ highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_avg_sse2; highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2; highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2; -#define vpx_highbd_filter_block1d16_v4_sse2 vpx_highbd_filter_block1d16_v8_sse2 -#define vpx_highbd_filter_block1d16_h4_sse2 vpx_highbd_filter_block1d16_h8_sse2 -#define vpx_highbd_filter_block1d8_v4_sse2 vpx_highbd_filter_block1d8_v8_sse2 -#define vpx_highbd_filter_block1d8_h4_sse2 vpx_highbd_filter_block1d8_h8_sse2 -#define vpx_highbd_filter_block1d4_v4_sse2 vpx_highbd_filter_block1d4_v8_sse2 -#define vpx_highbd_filter_block1d4_h4_sse2 vpx_highbd_filter_block1d4_h8_sse2 +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v4_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h4_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v4_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h4_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v4_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h4_sse2; #define vpx_highbd_filter_block1d16_v4_avg_sse2 \ vpx_highbd_filter_block1d16_v8_avg_sse2 #define vpx_highbd_filter_block1d16_h4_avg_sse2 \ diff --git a/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c b/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c index fa223ae..0be2c0f 100644 --- a/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c +++ b/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c @@ -16,6 +16,9 @@ #include "vpx_dsp/x86/convolve_sse2.h" #include "vpx_ports/mem.h" +#define CONV8_ROUNDING_BITS (7) +#define CONV8_ROUNDING_NUM (1 << (CONV8_ROUNDING_BITS - 1)) + void vpx_filter_block1d16_h4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride, uint8_t *dst_ptr, ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel) { @@ -54,15 +57,15 @@ void vpx_filter_block1d16_h4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride, src_reg_shift_3 = _mm_srli_si128(src_reg, 3); // Output 6 4 2 0 - even = multiply_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23, - &kernel_reg_45); + even = mm_madd_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23, + &kernel_reg_45); // Output 7 5 3 1 - odd = multiply_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3, - &kernel_reg_23, &kernel_reg_45); + odd = mm_madd_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3, + &kernel_reg_23, &kernel_reg_45); // Combine to get the first half of the dst - dst_first = combine_epi32_sse2(&even, &odd); + dst_first = mm_zip_epi32_sse2(&even, &odd); // Do again to get the second half of dst src_reg = _mm_loadu_si128((const __m128i *)(src_ptr + 8)); @@ -71,19 +74,19 @@ void vpx_filter_block1d16_h4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride, src_reg_shift_3 = _mm_srli_si128(src_reg, 3); // Output 14 12 10 8 - even = multiply_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23, - &kernel_reg_45); + even = mm_madd_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23, + &kernel_reg_45); // Output 15 13 11 9 - odd = multiply_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3, - &kernel_reg_23, &kernel_reg_45); + odd = mm_madd_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3, + &kernel_reg_23, &kernel_reg_45); // Combine to get the second half of the dst - dst_second = combine_epi32_sse2(&even, &odd); + dst_second = mm_zip_epi32_sse2(&even, &odd); // Round each result - dst_first = round_epi16_sse2(&dst_first, ®_32, 6); - dst_second = round_epi16_sse2(&dst_second, ®_32, 6); + dst_first = mm_round_epi16_sse2(&dst_first, ®_32, 6); + dst_second = mm_round_epi16_sse2(&dst_second, ®_32, 6); // Finally combine to get the final dst dst_first = _mm_packus_epi16(dst_first, dst_second); @@ -181,21 +184,21 @@ void vpx_filter_block1d16_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride, src_reg_23_hi = _mm_unpackhi_epi8(src_reg_2, src_reg_3); // Partial output from first half - res_reg_m10_lo = multiply_add_packs_epi16_sse2( + res_reg_m10_lo = mm_madd_packs_epi16_sse2( &src_reg_m10_lo_1, &src_reg_m10_lo_2, &kernel_reg_23); - res_reg_01_lo = multiply_add_packs_epi16_sse2( - &src_reg_01_lo_1, &src_reg_01_lo_2, &kernel_reg_23); + res_reg_01_lo = mm_madd_packs_epi16_sse2(&src_reg_01_lo_1, &src_reg_01_lo_2, + &kernel_reg_23); src_reg_12_lo_1 = _mm_unpacklo_epi8(src_reg_12_lo, _mm_setzero_si128()); src_reg_12_lo_2 = _mm_unpackhi_epi8(src_reg_12_lo, _mm_setzero_si128()); - res_reg_12_lo = multiply_add_packs_epi16_sse2( - &src_reg_12_lo_1, &src_reg_12_lo_2, &kernel_reg_45); + res_reg_12_lo = mm_madd_packs_epi16_sse2(&src_reg_12_lo_1, &src_reg_12_lo_2, + &kernel_reg_45); src_reg_23_lo_1 = _mm_unpacklo_epi8(src_reg_23_lo, _mm_setzero_si128()); src_reg_23_lo_2 = _mm_unpackhi_epi8(src_reg_23_lo, _mm_setzero_si128()); - res_reg_23_lo = multiply_add_packs_epi16_sse2( - &src_reg_23_lo_1, &src_reg_23_lo_2, &kernel_reg_45); + res_reg_23_lo = mm_madd_packs_epi16_sse2(&src_reg_23_lo_1, &src_reg_23_lo_2, + &kernel_reg_45); // Add to get first half of the results res_reg_m1012_lo = _mm_adds_epi16(res_reg_m10_lo, res_reg_12_lo); @@ -203,31 +206,31 @@ void vpx_filter_block1d16_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride, // Now repeat everything again for the second half // Partial output for second half - res_reg_m10_hi = multiply_add_packs_epi16_sse2( + res_reg_m10_hi = mm_madd_packs_epi16_sse2( &src_reg_m10_hi_1, &src_reg_m10_hi_2, &kernel_reg_23); - res_reg_01_hi = multiply_add_packs_epi16_sse2( - &src_reg_01_hi_1, &src_reg_01_hi_2, &kernel_reg_23); + res_reg_01_hi = mm_madd_packs_epi16_sse2(&src_reg_01_hi_1, &src_reg_01_hi_2, + &kernel_reg_23); src_reg_12_hi_1 = _mm_unpacklo_epi8(src_reg_12_hi, _mm_setzero_si128()); src_reg_12_hi_2 = _mm_unpackhi_epi8(src_reg_12_hi, _mm_setzero_si128()); - res_reg_12_hi = multiply_add_packs_epi16_sse2( - &src_reg_12_hi_1, &src_reg_12_hi_2, &kernel_reg_45); + res_reg_12_hi = mm_madd_packs_epi16_sse2(&src_reg_12_hi_1, &src_reg_12_hi_2, + &kernel_reg_45); src_reg_23_hi_1 = _mm_unpacklo_epi8(src_reg_23_hi, _mm_setzero_si128()); src_reg_23_hi_2 = _mm_unpackhi_epi8(src_reg_23_hi, _mm_setzero_si128()); - res_reg_23_hi = multiply_add_packs_epi16_sse2( - &src_reg_23_hi_1, &src_reg_23_hi_2, &kernel_reg_45); + res_reg_23_hi = mm_madd_packs_epi16_sse2(&src_reg_23_hi_1, &src_reg_23_hi_2, + &kernel_reg_45); // Second half of the results res_reg_m1012_hi = _mm_adds_epi16(res_reg_m10_hi, res_reg_12_hi); res_reg_0123_hi = _mm_adds_epi16(res_reg_01_hi, res_reg_23_hi); // Round the words - res_reg_m1012_lo = round_epi16_sse2(&res_reg_m1012_lo, ®_32, 6); - res_reg_0123_lo = round_epi16_sse2(&res_reg_0123_lo, ®_32, 6); - res_reg_m1012_hi = round_epi16_sse2(&res_reg_m1012_hi, ®_32, 6); - res_reg_0123_hi = round_epi16_sse2(&res_reg_0123_hi, ®_32, 6); + res_reg_m1012_lo = mm_round_epi16_sse2(&res_reg_m1012_lo, ®_32, 6); + res_reg_0123_lo = mm_round_epi16_sse2(&res_reg_0123_lo, ®_32, 6); + res_reg_m1012_hi = mm_round_epi16_sse2(&res_reg_m1012_hi, ®_32, 6); + res_reg_0123_hi = mm_round_epi16_sse2(&res_reg_0123_hi, ®_32, 6); // Combine to get the result res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, res_reg_m1012_hi); @@ -288,16 +291,16 @@ void vpx_filter_block1d8_h4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride, src_reg_shift_3 = _mm_srli_si128(src_reg, 3); // Output 6 4 2 0 - even = multiply_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23, - &kernel_reg_45); + even = mm_madd_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23, + &kernel_reg_45); // Output 7 5 3 1 - odd = multiply_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3, - &kernel_reg_23, &kernel_reg_45); + odd = mm_madd_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3, + &kernel_reg_23, &kernel_reg_45); // Combine to get the first half of the dst - dst_first = combine_epi32_sse2(&even, &odd); - dst_first = round_epi16_sse2(&dst_first, ®_32, 6); + dst_first = mm_zip_epi32_sse2(&even, &odd); + dst_first = mm_round_epi16_sse2(&dst_first, ®_32, 6); // Saturate and convert to 8-bit words dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128()); @@ -383,29 +386,29 @@ void vpx_filter_block1d8_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride, src_reg_23_lo = _mm_unpacklo_epi8(src_reg_2, src_reg_3); // Partial output - res_reg_m10_lo = multiply_add_packs_epi16_sse2( + res_reg_m10_lo = mm_madd_packs_epi16_sse2( &src_reg_m10_lo_1, &src_reg_m10_lo_2, &kernel_reg_23); - res_reg_01_lo = multiply_add_packs_epi16_sse2( - &src_reg_01_lo_1, &src_reg_01_lo_2, &kernel_reg_23); + res_reg_01_lo = mm_madd_packs_epi16_sse2(&src_reg_01_lo_1, &src_reg_01_lo_2, + &kernel_reg_23); src_reg_12_lo_1 = _mm_unpacklo_epi8(src_reg_12_lo, _mm_setzero_si128()); src_reg_12_lo_2 = _mm_unpackhi_epi8(src_reg_12_lo, _mm_setzero_si128()); - res_reg_12_lo = multiply_add_packs_epi16_sse2( - &src_reg_12_lo_1, &src_reg_12_lo_2, &kernel_reg_45); + res_reg_12_lo = mm_madd_packs_epi16_sse2(&src_reg_12_lo_1, &src_reg_12_lo_2, + &kernel_reg_45); src_reg_23_lo_1 = _mm_unpacklo_epi8(src_reg_23_lo, _mm_setzero_si128()); src_reg_23_lo_2 = _mm_unpackhi_epi8(src_reg_23_lo, _mm_setzero_si128()); - res_reg_23_lo = multiply_add_packs_epi16_sse2( - &src_reg_23_lo_1, &src_reg_23_lo_2, &kernel_reg_45); + res_reg_23_lo = mm_madd_packs_epi16_sse2(&src_reg_23_lo_1, &src_reg_23_lo_2, + &kernel_reg_45); // Add to get results res_reg_m1012_lo = _mm_adds_epi16(res_reg_m10_lo, res_reg_12_lo); res_reg_0123_lo = _mm_adds_epi16(res_reg_01_lo, res_reg_23_lo); // Round the words - res_reg_m1012_lo = round_epi16_sse2(&res_reg_m1012_lo, ®_32, 6); - res_reg_0123_lo = round_epi16_sse2(&res_reg_0123_lo, ®_32, 6); + res_reg_m1012_lo = mm_round_epi16_sse2(&res_reg_m1012_lo, ®_32, 6); + res_reg_0123_lo = mm_round_epi16_sse2(&res_reg_0123_lo, ®_32, 6); // Convert to 8-bit words res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, _mm_setzero_si128()); @@ -480,7 +483,7 @@ void vpx_filter_block1d4_h4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride, dst_first = _mm_add_epi32(tmp_0, tmp_1); dst_first = _mm_packs_epi32(dst_first, _mm_setzero_si128()); - dst_first = round_epi16_sse2(&dst_first, ®_32, 6); + dst_first = mm_round_epi16_sse2(&dst_first, ®_32, 6); // Saturate and convert to 8-bit words dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128()); @@ -565,27 +568,27 @@ void vpx_filter_block1d4_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride, src_reg_23_lo = _mm_unpacklo_epi8(src_reg_2, src_reg_3); // Partial output - res_reg_m10_lo = multiply_add_packs_epi16_sse2(&src_reg_m10_lo_1, ®_zero, - &kernel_reg_23); + res_reg_m10_lo = + mm_madd_packs_epi16_sse2(&src_reg_m10_lo_1, ®_zero, &kernel_reg_23); - res_reg_01_lo = multiply_add_packs_epi16_sse2(&src_reg_01_lo_1, ®_zero, - &kernel_reg_23); + res_reg_01_lo = + mm_madd_packs_epi16_sse2(&src_reg_01_lo_1, ®_zero, &kernel_reg_23); src_reg_12_lo_1 = _mm_unpacklo_epi8(src_reg_12_lo, _mm_setzero_si128()); - res_reg_12_lo = multiply_add_packs_epi16_sse2(&src_reg_12_lo_1, ®_zero, - &kernel_reg_45); + res_reg_12_lo = + mm_madd_packs_epi16_sse2(&src_reg_12_lo_1, ®_zero, &kernel_reg_45); src_reg_23_lo_1 = _mm_unpacklo_epi8(src_reg_23_lo, _mm_setzero_si128()); - res_reg_23_lo = multiply_add_packs_epi16_sse2(&src_reg_23_lo_1, ®_zero, - &kernel_reg_45); + res_reg_23_lo = + mm_madd_packs_epi16_sse2(&src_reg_23_lo_1, ®_zero, &kernel_reg_45); // Add to get results res_reg_m1012_lo = _mm_adds_epi16(res_reg_m10_lo, res_reg_12_lo); res_reg_0123_lo = _mm_adds_epi16(res_reg_01_lo, res_reg_23_lo); // Round the words - res_reg_m1012_lo = round_epi16_sse2(&res_reg_m1012_lo, ®_32, 6); - res_reg_0123_lo = round_epi16_sse2(&res_reg_0123_lo, ®_32, 6); + res_reg_m1012_lo = mm_round_epi16_sse2(&res_reg_m1012_lo, ®_32, 6); + res_reg_0123_lo = mm_round_epi16_sse2(&res_reg_0123_lo, ®_32, 6); // Convert to 8-bit words res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, reg_zero); @@ -604,3 +607,399 @@ void vpx_filter_block1d4_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride, src_reg_1 = src_reg_3; } } + +void vpx_highbd_filter_block1d4_h4_sse2(const uint16_t *src_ptr, + ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel, int bd) { + // We will load multiple shifted versions of the row and shuffle them into + // 16-bit words of the form + // ... s[2] s[1] s[0] s[-1] + // ... s[4] s[3] s[2] s[1] + // Then we call multiply and add to get partial results + // s[2]k[3]+s[1]k[2] s[0]k[3]s[-1]k[2] + // s[4]k[5]+s[3]k[4] s[2]k[5]s[1]k[4] + // The two results are then added together to get the even output + + __m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3; + __m128i res_reg; + __m128i even, odd; + + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + const __m128i reg_round = + _mm_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding + const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1); + const __m128i reg_zero = _mm_setzero_si128(); + int h; + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg); + kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg); + + for (h = height; h > 0; --h) { + src_reg = _mm_loadu_si128((const __m128i *)src_ptr); + src_reg_shift_1 = _mm_srli_si128(src_reg, 2); + src_reg_shift_2 = _mm_srli_si128(src_reg, 4); + src_reg_shift_3 = _mm_srli_si128(src_reg, 6); + + // Output 2 0 + even = mm_madd_add_epi16_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23, + &kernel_reg_45); + + // Output 3 1 + odd = mm_madd_add_epi16_sse2(&src_reg_shift_1, &src_reg_shift_3, + &kernel_reg_23, &kernel_reg_45); + + // Combine to get the first half of the dst + res_reg = _mm_unpacklo_epi32(even, odd); + res_reg = mm_round_epi32_sse2(&res_reg, ®_round, CONV8_ROUNDING_BITS); + res_reg = _mm_packs_epi32(res_reg, reg_zero); + + // Saturate the result and save + res_reg = _mm_min_epi16(res_reg, reg_max); + res_reg = _mm_max_epi16(res_reg, reg_zero); + _mm_storel_epi64((__m128i *)dst_ptr, res_reg); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +void vpx_highbd_filter_block1d4_v4_sse2(const uint16_t *src_ptr, + ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel, int bd) { + // We will load two rows of pixels as 16-bit words, and shuffle them into the + // form + // ... s[0,1] s[-1,1] s[0,0] s[-1,0] + // ... s[0,7] s[-1,7] s[0,6] s[-1,6] + // ... s[0,9] s[-1,9] s[0,8] s[-1,8] + // ... s[0,13] s[-1,13] s[0,12] s[-1,12] + // so that we can call multiply and add with the kernel to get 32-bit words of + // the form + // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2] + // Finally, we can add multiple rows together to get the desired output. + + // Register for source s[-1:3, :] + __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. lo is first half, hi second + __m128i src_reg_m10, src_reg_01; + __m128i src_reg_12, src_reg_23; + + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + + // Result after multiply and add + __m128i res_reg_m10, res_reg_01, res_reg_12, res_reg_23; + __m128i res_reg_m1012, res_reg_0123; + + const __m128i reg_round = + _mm_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding + const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1); + const __m128i reg_zero = _mm_setzero_si128(); + + // We will compute the result two rows at a time + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // We only need to go num_taps/2 - 1 row above the source, so we move + // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down + src_ptr += src_stride_unrolled; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg); + kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg); + + // First shuffle the data + src_reg_m1 = _mm_loadl_epi64((const __m128i *)src_ptr); + src_reg_0 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride)); + src_reg_m10 = _mm_unpacklo_epi16(src_reg_m1, src_reg_0); + + // More shuffling + src_reg_1 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 2)); + src_reg_01 = _mm_unpacklo_epi16(src_reg_0, src_reg_1); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3)); + + src_reg_12 = _mm_unpacklo_epi16(src_reg_1, src_reg_2); + + src_reg_3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4)); + + src_reg_23 = _mm_unpacklo_epi16(src_reg_2, src_reg_3); + + // Partial output + res_reg_m10 = _mm_madd_epi16(src_reg_m10, kernel_reg_23); + res_reg_01 = _mm_madd_epi16(src_reg_01, kernel_reg_23); + res_reg_12 = _mm_madd_epi16(src_reg_12, kernel_reg_45); + res_reg_23 = _mm_madd_epi16(src_reg_23, kernel_reg_45); + + // Add to get results + res_reg_m1012 = _mm_add_epi32(res_reg_m10, res_reg_12); + res_reg_0123 = _mm_add_epi32(res_reg_01, res_reg_23); + + // Round the words + res_reg_m1012 = + mm_round_epi32_sse2(&res_reg_m1012, ®_round, CONV8_ROUNDING_BITS); + res_reg_0123 = + mm_round_epi32_sse2(&res_reg_0123, ®_round, CONV8_ROUNDING_BITS); + + res_reg_m1012 = _mm_packs_epi32(res_reg_m1012, reg_zero); + res_reg_0123 = _mm_packs_epi32(res_reg_0123, reg_zero); + + // Saturate according to bit depth + res_reg_m1012 = _mm_min_epi16(res_reg_m1012, reg_max); + res_reg_0123 = _mm_min_epi16(res_reg_0123, reg_max); + res_reg_m1012 = _mm_max_epi16(res_reg_m1012, reg_zero); + res_reg_0123 = _mm_max_epi16(res_reg_0123, reg_zero); + + // Save only half of the register (8 words) + _mm_storel_epi64((__m128i *)dst_ptr, res_reg_m1012); + _mm_storel_epi64((__m128i *)(dst_ptr + dst_stride), res_reg_0123); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m10 = src_reg_12; + src_reg_01 = src_reg_23; + src_reg_1 = src_reg_3; + } +} + +void vpx_highbd_filter_block1d8_h4_sse2(const uint16_t *src_ptr, + ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel, int bd) { + // We will load multiple shifted versions of the row and shuffle them into + // 16-bit words of the form + // ... s[2] s[1] s[0] s[-1] + // ... s[4] s[3] s[2] s[1] + // Then we call multiply and add to get partial results + // s[2]k[3]+s[1]k[2] s[0]k[3]s[-1]k[2] + // s[4]k[5]+s[3]k[4] s[2]k[5]s[1]k[4] + // The two results are then added together for the first half of even + // output. + // Repeat multiple times to get the whole outoput + + __m128i src_reg, src_reg_next, src_reg_shift_1, src_reg_shift_2, + src_reg_shift_3; + __m128i res_reg; + __m128i even, odd; + __m128i tmp_0, tmp_1; + + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + const __m128i reg_round = + _mm_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding + const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1); + const __m128i reg_zero = _mm_setzero_si128(); + int h; + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg); + kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg); + + for (h = height; h > 0; --h) { + // We will put first half in the first half of the reg, and second half in + // second half + src_reg = _mm_loadu_si128((const __m128i *)src_ptr); + src_reg_next = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); + + // Output 6 4 2 0 + tmp_0 = _mm_srli_si128(src_reg, 4); + tmp_1 = _mm_srli_si128(src_reg_next, 2); + src_reg_shift_2 = _mm_unpacklo_epi64(tmp_0, tmp_1); + even = mm_madd_add_epi16_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23, + &kernel_reg_45); + + // Output 7 5 3 1 + tmp_0 = _mm_srli_si128(src_reg, 2); + tmp_1 = src_reg_next; + src_reg_shift_1 = _mm_unpacklo_epi64(tmp_0, tmp_1); + + tmp_0 = _mm_srli_si128(src_reg, 6); + tmp_1 = _mm_srli_si128(src_reg_next, 4); + src_reg_shift_3 = _mm_unpacklo_epi64(tmp_0, tmp_1); + + odd = mm_madd_add_epi16_sse2(&src_reg_shift_1, &src_reg_shift_3, + &kernel_reg_23, &kernel_reg_45); + + // Combine to get the first half of the dst + even = mm_round_epi32_sse2(&even, ®_round, CONV8_ROUNDING_BITS); + odd = mm_round_epi32_sse2(&odd, ®_round, CONV8_ROUNDING_BITS); + res_reg = mm_zip_epi32_sse2(&even, &odd); + + // Saturate the result and save + res_reg = _mm_min_epi16(res_reg, reg_max); + res_reg = _mm_max_epi16(res_reg, reg_zero); + + _mm_store_si128((__m128i *)dst_ptr, res_reg); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +void vpx_highbd_filter_block1d8_v4_sse2(const uint16_t *src_ptr, + ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel, int bd) { + // We will load two rows of pixels as 16-bit words, and shuffle them into the + // form + // ... s[0,1] s[-1,1] s[0,0] s[-1,0] + // ... s[0,7] s[-1,7] s[0,6] s[-1,6] + // ... s[0,9] s[-1,9] s[0,8] s[-1,8] + // ... s[0,13] s[-1,13] s[0,12] s[-1,12] + // so that we can call multiply and add with the kernel to get 32-bit words of + // the form + // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2] + // Finally, we can add multiple rows together to get the desired output. + + // Register for source s[-1:3, :] + __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. lo is first half, hi second + __m128i src_reg_m10_lo, src_reg_01_lo, src_reg_m10_hi, src_reg_01_hi; + __m128i src_reg_12_lo, src_reg_23_lo, src_reg_12_hi, src_reg_23_hi; + + // Result after multiply and add + __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo; + __m128i res_reg_m10_hi, res_reg_01_hi, res_reg_12_hi, res_reg_23_hi; + __m128i res_reg_m1012, res_reg_0123; + __m128i res_reg_m1012_lo, res_reg_0123_lo; + __m128i res_reg_m1012_hi, res_reg_0123_hi; + + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + + const __m128i reg_round = + _mm_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding + const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1); + const __m128i reg_zero = _mm_setzero_si128(); + + // We will compute the result two rows at a time + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // We only need to go num_taps/2 - 1 row above the source, so we move + // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down + src_ptr += src_stride_unrolled; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg); + kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg); + + // First shuffle the data + src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr); + src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride)); + src_reg_m10_lo = _mm_unpacklo_epi16(src_reg_m1, src_reg_0); + src_reg_m10_hi = _mm_unpackhi_epi16(src_reg_m1, src_reg_0); + + // More shuffling + src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)); + src_reg_01_lo = _mm_unpacklo_epi16(src_reg_0, src_reg_1); + src_reg_01_hi = _mm_unpackhi_epi16(src_reg_0, src_reg_1); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3)); + + src_reg_12_lo = _mm_unpacklo_epi16(src_reg_1, src_reg_2); + src_reg_12_hi = _mm_unpackhi_epi16(src_reg_1, src_reg_2); + + src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4)); + + src_reg_23_lo = _mm_unpacklo_epi16(src_reg_2, src_reg_3); + src_reg_23_hi = _mm_unpackhi_epi16(src_reg_2, src_reg_3); + + // Partial output for first half + res_reg_m10_lo = _mm_madd_epi16(src_reg_m10_lo, kernel_reg_23); + res_reg_01_lo = _mm_madd_epi16(src_reg_01_lo, kernel_reg_23); + res_reg_12_lo = _mm_madd_epi16(src_reg_12_lo, kernel_reg_45); + res_reg_23_lo = _mm_madd_epi16(src_reg_23_lo, kernel_reg_45); + + // Add to get results + res_reg_m1012_lo = _mm_add_epi32(res_reg_m10_lo, res_reg_12_lo); + res_reg_0123_lo = _mm_add_epi32(res_reg_01_lo, res_reg_23_lo); + + // Round the words + res_reg_m1012_lo = + mm_round_epi32_sse2(&res_reg_m1012_lo, ®_round, CONV8_ROUNDING_BITS); + res_reg_0123_lo = + mm_round_epi32_sse2(&res_reg_0123_lo, ®_round, CONV8_ROUNDING_BITS); + + // Partial output for first half + res_reg_m10_hi = _mm_madd_epi16(src_reg_m10_hi, kernel_reg_23); + res_reg_01_hi = _mm_madd_epi16(src_reg_01_hi, kernel_reg_23); + res_reg_12_hi = _mm_madd_epi16(src_reg_12_hi, kernel_reg_45); + res_reg_23_hi = _mm_madd_epi16(src_reg_23_hi, kernel_reg_45); + + // Add to get results + res_reg_m1012_hi = _mm_add_epi32(res_reg_m10_hi, res_reg_12_hi); + res_reg_0123_hi = _mm_add_epi32(res_reg_01_hi, res_reg_23_hi); + + // Round the words + res_reg_m1012_hi = + mm_round_epi32_sse2(&res_reg_m1012_hi, ®_round, CONV8_ROUNDING_BITS); + res_reg_0123_hi = + mm_round_epi32_sse2(&res_reg_0123_hi, ®_round, CONV8_ROUNDING_BITS); + + // Combine the two halfs + res_reg_m1012 = _mm_packs_epi32(res_reg_m1012_lo, res_reg_m1012_hi); + res_reg_0123 = _mm_packs_epi32(res_reg_0123_lo, res_reg_0123_hi); + + // Saturate according to bit depth + res_reg_m1012 = _mm_min_epi16(res_reg_m1012, reg_max); + res_reg_0123 = _mm_min_epi16(res_reg_0123, reg_max); + res_reg_m1012 = _mm_max_epi16(res_reg_m1012, reg_zero); + res_reg_0123 = _mm_max_epi16(res_reg_0123, reg_zero); + + // Save only half of the register (8 words) + _mm_store_si128((__m128i *)dst_ptr, res_reg_m1012); + _mm_store_si128((__m128i *)(dst_ptr + dst_stride), res_reg_0123); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m10_lo = src_reg_12_lo; + src_reg_m10_hi = src_reg_12_hi; + src_reg_01_lo = src_reg_23_lo; + src_reg_01_hi = src_reg_23_hi; + src_reg_1 = src_reg_3; + } +} + +void vpx_highbd_filter_block1d16_h4_sse2(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel, int bd) { + vpx_highbd_filter_block1d8_h4_sse2(src_ptr, src_stride, dst_ptr, dst_stride, + height, kernel, bd); + vpx_highbd_filter_block1d8_h4_sse2(src_ptr + 8, src_stride, dst_ptr + 8, + dst_stride, height, kernel, bd); +} + +void vpx_highbd_filter_block1d16_v4_sse2(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel, int bd) { + vpx_highbd_filter_block1d8_v4_sse2(src_ptr, src_stride, dst_ptr, dst_stride, + height, kernel, bd); + vpx_highbd_filter_block1d8_v4_sse2(src_ptr + 8, src_stride, dst_ptr + 8, + dst_stride, height, kernel, bd); +} diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c index 0ccf896..b55b7e5 100644 --- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c +++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c @@ -623,7 +623,7 @@ void vpx_filter_block1d8_h4_avx2(const uint8_t *src_ptr, ptrdiff_t src_stride, _mm256_castsi256_si128(kernel_reg_45)); dst_reg = _mm_adds_epi16(tmp_0, tmp_1); - dst_reg = round_epi16_sse2(&dst_reg, ®_32, 6); + dst_reg = mm_round_epi16_sse2(&dst_reg, ®_32, 6); dst_reg = _mm_packus_epi16(dst_reg, _mm_setzero_si128()); @@ -797,7 +797,7 @@ void vpx_filter_block1d4_h4_avx2(const uint8_t *src_ptr, ptrdiff_t src_stride, dst = _mm_hadds_epi16(dst, _mm_setzero_si128()); // Round result - dst = round_epi16_sse2(&dst, ®_32, 6); + dst = mm_round_epi16_sse2(&dst, ®_32, 6); // Pack to 8-bits dst = _mm_packus_epi16(dst, _mm_setzero_si128()); diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c index 9e5b730..b5f6ca5 100644 --- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c +++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c @@ -246,8 +246,8 @@ void vpx_filter_block1d16_h4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_stride, dst_second = _mm_adds_epi16(tmp_0, tmp_1); // Round each result - dst_first = round_epi16_sse2(&dst_first, ®_32, 6); - dst_second = round_epi16_sse2(&dst_second, ®_32, 6); + dst_first = mm_round_epi16_sse2(&dst_first, ®_32, 6); + dst_second = mm_round_epi16_sse2(&dst_second, ®_32, 6); // Finally combine to get the final dst dst_first = _mm_packus_epi16(dst_first, dst_second); @@ -348,10 +348,10 @@ void vpx_filter_block1d16_v4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_stride, res_reg_0123_hi = _mm_adds_epi16(res_reg_01_hi, res_reg_23_hi); // Round the words - res_reg_m1012_lo = round_epi16_sse2(&res_reg_m1012_lo, ®_32, 6); - res_reg_0123_lo = round_epi16_sse2(&res_reg_0123_lo, ®_32, 6); - res_reg_m1012_hi = round_epi16_sse2(&res_reg_m1012_hi, ®_32, 6); - res_reg_0123_hi = round_epi16_sse2(&res_reg_0123_hi, ®_32, 6); + res_reg_m1012_lo = mm_round_epi16_sse2(&res_reg_m1012_lo, ®_32, 6); + res_reg_0123_lo = mm_round_epi16_sse2(&res_reg_0123_lo, ®_32, 6); + res_reg_m1012_hi = mm_round_epi16_sse2(&res_reg_m1012_hi, ®_32, 6); + res_reg_0123_hi = mm_round_epi16_sse2(&res_reg_0123_hi, ®_32, 6); // Combine to get the result res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, res_reg_m1012_hi); @@ -421,7 +421,7 @@ void vpx_filter_block1d8_h4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_stride, dst_first = _mm_adds_epi16(tmp_0, tmp_1); // Round round result - dst_first = round_epi16_sse2(&dst_first, ®_32, 6); + dst_first = mm_round_epi16_sse2(&dst_first, ®_32, 6); // Pack to 8-bits dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128()); @@ -504,8 +504,8 @@ void vpx_filter_block1d8_v4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_stride, res_reg_0123 = _mm_adds_epi16(res_reg_01, res_reg_23); // Round the words - res_reg_m1012 = round_epi16_sse2(&res_reg_m1012, ®_32, 6); - res_reg_0123 = round_epi16_sse2(&res_reg_0123, ®_32, 6); + res_reg_m1012 = mm_round_epi16_sse2(&res_reg_m1012, ®_32, 6); + res_reg_0123 = mm_round_epi16_sse2(&res_reg_0123, ®_32, 6); // Pack from 16-bit to 8-bit res_reg_m1012 = _mm_packus_epi16(res_reg_m1012, _mm_setzero_si128()); @@ -563,7 +563,7 @@ void vpx_filter_block1d4_h4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_stride, dst_first = _mm_hadds_epi16(dst_first, _mm_setzero_si128()); // Round result - dst_first = round_epi16_sse2(&dst_first, ®_32, 6); + dst_first = mm_round_epi16_sse2(&dst_first, ®_32, 6); // Pack to 8-bits dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128()); @@ -648,8 +648,8 @@ void vpx_filter_block1d4_v4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_stride, reg_1 = _mm_hadds_epi16(reg_1, _mm_setzero_si128()); // Round the words - reg_0 = round_epi16_sse2(®_0, ®_32, 6); - reg_1 = round_epi16_sse2(®_1, ®_32, 6); + reg_0 = mm_round_epi16_sse2(®_0, ®_32, 6); + reg_1 = mm_round_epi16_sse2(®_1, ®_32, 6); // Pack from 16-bit to 8-bit and put them in the right order reg_0 = _mm_packus_epi16(reg_0, reg_0); -- 2.7.4