From 85fbf6569c3c34d9c091d2c09d7374c66e7b3b59 Mon Sep 17 00:00:00 2001 From: chiyotsai Date: Mon, 4 Mar 2019 10:40:14 -0800 Subject: [PATCH] Optimize SSE4_1 lowbd temporal filter implementation - Change some unaligned loads to aligned loads - Preload filter weights BUG=webm:1591 Change-Id: I4e5e755e1fa5613d1c14191265bf80b0bfd0b75c --- vp9/encoder/x86/temporal_filter_sse4.c | 184 ++++++++++----------------------- 1 file changed, 57 insertions(+), 127 deletions(-) diff --git a/vp9/encoder/x86/temporal_filter_sse4.c b/vp9/encoder/x86/temporal_filter_sse4.c index 3a1142f..437f49f 100644 --- a/vp9/encoder/x86/temporal_filter_sse4.c +++ b/vp9/encoder/x86/temporal_filter_sse4.c @@ -75,11 +75,11 @@ static INLINE void read_dist_16(const uint16_t *dist, __m128i *reg_first, // by weight. static INLINE __m128i average_8(__m128i sum, const __m128i *mul_constants, const int strength, const int rounding, - const int weight) { + const __m128i *weight) { // _mm_srl_epi16 uses the lower 64 bit value for the shift. const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength); const __m128i rounding_u16 = _mm_set1_epi16(rounding); - const __m128i weight_u16 = _mm_set1_epi16(weight); + const __m128i weight_u16 = *weight; const __m128i sixteen = _mm_set1_epi16(16); // modifier * 3 / index; @@ -98,62 +98,6 @@ static INLINE __m128i average_8(__m128i sum, const __m128i *mul_constants, return _mm_mullo_epi16(sum, weight_u16); } -static __m128i average_4_4(__m128i sum, const __m128i *mul_constants, - const int strength, const int rounding, - const int weight_0, const int weight_1) { - // _mm_srl_epi16 uses the lower 64 bit value for the shift. - const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength); - const __m128i rounding_u16 = _mm_set1_epi16(rounding); - const __m128i weight_u16 = - _mm_setr_epi16(weight_0, weight_0, weight_0, weight_0, weight_1, weight_1, - weight_1, weight_1); - const __m128i sixteen = _mm_set1_epi16(16); - - // modifier * 3 / index; - sum = _mm_mulhi_epu16(sum, *mul_constants); - - sum = _mm_adds_epu16(sum, rounding_u16); - sum = _mm_srl_epi16(sum, strength_u128); - - // The maximum input to this comparison is UINT16_MAX * NEIGHBOR_CONSTANT_4 - // >> 16 (also NEIGHBOR_CONSTANT_4 -1) which is 49151 / 0xbfff / -16385 - // So this needs to use the epu16 version which did not come until SSE4. - sum = _mm_min_epu16(sum, sixteen); - - sum = _mm_sub_epi16(sixteen, sum); - - return _mm_mullo_epi16(sum, weight_u16); -} - -static INLINE void average_16(__m128i *sum_0_u16, __m128i *sum_1_u16, - const __m128i *mul_constants_0, - const __m128i *mul_constants_1, - const int strength, const int rounding, - const int weight) { - const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength); - const __m128i rounding_u16 = _mm_set1_epi16(rounding); - const __m128i weight_u16 = _mm_set1_epi16(weight); - const __m128i sixteen = _mm_set1_epi16(16); - __m128i input_0, input_1; - - input_0 = _mm_mulhi_epu16(*sum_0_u16, *mul_constants_0); - input_0 = _mm_adds_epu16(input_0, rounding_u16); - - input_1 = _mm_mulhi_epu16(*sum_1_u16, *mul_constants_1); - input_1 = _mm_adds_epu16(input_1, rounding_u16); - - input_0 = _mm_srl_epi16(input_0, strength_u128); - input_1 = _mm_srl_epi16(input_1, strength_u128); - - input_0 = _mm_min_epu16(input_0, sixteen); - input_1 = _mm_min_epu16(input_1, sixteen); - input_0 = _mm_sub_epi16(sixteen, input_0); - input_1 = _mm_sub_epi16(sixteen, input_1); - - *sum_0_u16 = _mm_mullo_epi16(input_0, weight_u16); - *sum_1_u16 = _mm_mullo_epi16(input_1, weight_u16); -} - // Add 'sum_u16' to 'count'. Multiply by 'pred' and add to 'accumulator.' static void accumulate_and_store_8(const __m128i sum_u16, const uint8_t *pred, uint16_t *count, uint32_t *accumulator) { @@ -336,7 +280,7 @@ static void vp9_apply_temporal_filter_luma_16( const int16_t *const *neighbors_second, int top_weight, int bottom_weight, const int *blk_fw) { const int rounding = (1 << strength) >> 1; - int weight = top_weight; + __m128i weight_first, weight_second; __m128i mul_first, mul_second; @@ -360,9 +304,18 @@ static void vp9_apply_temporal_filter_luma_16( (void)block_width; + // Initialize the weights + if (blk_fw) { + weight_first = _mm_set1_epi16(blk_fw[0]); + weight_second = _mm_set1_epi16(blk_fw[1]); + } else { + weight_first = _mm_set1_epi16(top_weight); + weight_second = weight_first; + } + // First row - mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[0]); - mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[0]); + mul_first = _mm_load_si128((const __m128i *)neighbors_first[0]); + mul_second = _mm_load_si128((const __m128i *)neighbors_second[0]); // Add luma values get_sum_16(y_dist, &sum_row_2_first, &sum_row_2_second); @@ -382,15 +335,10 @@ static void vp9_apply_temporal_filter_luma_16( sum_row_second = _mm_adds_epu16(sum_row_second, v_second); // Get modifier and store result - if (blk_fw) { - sum_row_first = - average_8(sum_row_first, &mul_first, strength, rounding, blk_fw[0]); - sum_row_second = - average_8(sum_row_second, &mul_second, strength, rounding, blk_fw[1]); - } else { - average_16(&sum_row_first, &sum_row_second, &mul_first, &mul_second, - strength, rounding, weight); - } + sum_row_first = + average_8(sum_row_first, &mul_first, strength, rounding, &weight_first); + sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding, + &weight_second); accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, y_accum); @@ -408,16 +356,18 @@ static void vp9_apply_temporal_filter_luma_16( v_dist += DIST_STRIDE; // Then all the rows except the last one - mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[1]); - mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[1]); + mul_first = _mm_load_si128((const __m128i *)neighbors_first[1]); + mul_second = _mm_load_si128((const __m128i *)neighbors_second[1]); for (h = 1; h < block_height - 1; ++h) { // Move the weight to bottom half if (!use_whole_blk && h == block_height / 2) { if (blk_fw) { - blk_fw += 2; + weight_first = _mm_set1_epi16(blk_fw[2]); + weight_second = _mm_set1_epi16(blk_fw[3]); } else { - weight = bottom_weight; + weight_first = _mm_set1_epi16(bottom_weight); + weight_second = weight_first; } } // Shift the rows up @@ -456,15 +406,10 @@ static void vp9_apply_temporal_filter_luma_16( sum_row_second = _mm_adds_epu16(sum_row_second, v_second); // Get modifier and store result - if (blk_fw) { - sum_row_first = - average_8(sum_row_first, &mul_first, strength, rounding, blk_fw[0]); - sum_row_second = - average_8(sum_row_second, &mul_second, strength, rounding, blk_fw[1]); - } else { - average_16(&sum_row_first, &sum_row_second, &mul_first, &mul_second, - strength, rounding, weight); - } + sum_row_first = + average_8(sum_row_first, &mul_first, strength, rounding, &weight_first); + sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding, + &weight_second); accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, y_accum); @@ -476,8 +421,8 @@ static void vp9_apply_temporal_filter_luma_16( } // The last row - mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[0]); - mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[0]); + mul_first = _mm_load_si128((const __m128i *)neighbors_first[0]); + mul_second = _mm_load_si128((const __m128i *)neighbors_second[0]); // Shift the rows up sum_row_1_first = sum_row_2_first; @@ -503,15 +448,10 @@ static void vp9_apply_temporal_filter_luma_16( sum_row_second = _mm_adds_epu16(sum_row_second, v_second); // Get modifier and store result - if (blk_fw) { - sum_row_first = - average_8(sum_row_first, &mul_first, strength, rounding, blk_fw[0]); - sum_row_second = - average_8(sum_row_second, &mul_second, strength, rounding, blk_fw[1]); - } else { - average_16(&sum_row_first, &sum_row_second, &mul_first, &mul_second, - strength, rounding, weight); - } + sum_row_first = + average_8(sum_row_first, &mul_first, strength, rounding, &weight_first); + sum_row_second = average_8(sum_row_second, &mul_second, strength, rounding, + &weight_second); accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count, y_accum); } @@ -634,7 +574,8 @@ static void vp9_apply_temporal_filter_chroma_8( const int16_t *const *neighbors, int top_weight, int bottom_weight, const int *blk_fw) { const int rounding = (1 << strength) >> 1; - int weight = top_weight; + + __m128i weight; __m128i mul; @@ -648,8 +589,16 @@ static void vp9_apply_temporal_filter_chroma_8( (void)uv_block_width; + // Initilize weight + if (blk_fw) { + weight = _mm_setr_epi16(blk_fw[0], blk_fw[0], blk_fw[0], blk_fw[0], + blk_fw[1], blk_fw[1], blk_fw[1], blk_fw[1]); + } else { + weight = _mm_set1_epi16(top_weight); + } + // First row - mul = _mm_loadu_si128((const __m128i *)neighbors[0]); + mul = _mm_load_si128((const __m128i *)neighbors[0]); // Add chroma values get_sum_8(u_dist, &u_sum_row_2); @@ -666,15 +615,9 @@ static void vp9_apply_temporal_filter_chroma_8( add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row); // Get modifier and store result - if (blk_fw) { - u_sum_row = - average_4_4(u_sum_row, &mul, strength, rounding, blk_fw[0], blk_fw[1]); - v_sum_row = - average_4_4(v_sum_row, &mul, strength, rounding, blk_fw[0], blk_fw[1]); - } else { - u_sum_row = average_8(u_sum_row, &mul, strength, rounding, weight); - v_sum_row = average_8(v_sum_row, &mul, strength, rounding, weight); - } + u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight); + v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight); + accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); @@ -694,15 +637,16 @@ static void vp9_apply_temporal_filter_chroma_8( y_dist += DIST_STRIDE * (1 + ss_y); // Then all the rows except the last one - mul = _mm_loadu_si128((const __m128i *)neighbors[1]); + mul = _mm_load_si128((const __m128i *)neighbors[1]); for (h = 1; h < uv_block_height - 1; ++h) { // Move the weight pointer to the bottom half of the blocks if (h == uv_block_height / 2) { if (blk_fw) { - blk_fw += 2; + weight = _mm_setr_epi16(blk_fw[2], blk_fw[2], blk_fw[2], blk_fw[2], + blk_fw[3], blk_fw[3], blk_fw[3], blk_fw[3]); } else { - weight = bottom_weight; + weight = _mm_set1_epi16(bottom_weight); } } @@ -726,15 +670,8 @@ static void vp9_apply_temporal_filter_chroma_8( add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row); // Get modifier and store result - if (blk_fw) { - u_sum_row = average_4_4(u_sum_row, &mul, strength, rounding, blk_fw[0], - blk_fw[1]); - v_sum_row = average_4_4(v_sum_row, &mul, strength, rounding, blk_fw[0], - blk_fw[1]); - } else { - u_sum_row = average_8(u_sum_row, &mul, strength, rounding, weight); - v_sum_row = average_8(v_sum_row, &mul, strength, rounding, weight); - } + u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight); + v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight); accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); @@ -756,7 +693,7 @@ static void vp9_apply_temporal_filter_chroma_8( } // The last row - mul = _mm_loadu_si128((const __m128i *)neighbors[0]); + mul = _mm_load_si128((const __m128i *)neighbors[0]); // Shift the rows up u_sum_row_1 = u_sum_row_2; @@ -773,15 +710,8 @@ static void vp9_apply_temporal_filter_chroma_8( add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row); // Get modifier and store result - if (blk_fw) { - u_sum_row = - average_4_4(u_sum_row, &mul, strength, rounding, blk_fw[0], blk_fw[1]); - v_sum_row = - average_4_4(v_sum_row, &mul, strength, rounding, blk_fw[0], blk_fw[1]); - } else { - u_sum_row = average_8(u_sum_row, &mul, strength, rounding, weight); - v_sum_row = average_8(v_sum_row, &mul, strength, rounding, weight); - } + u_sum_row = average_8(u_sum_row, &mul, strength, rounding, &weight); + v_sum_row = average_8(v_sum_row, &mul, strength, rounding, &weight); accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum); accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum); -- 2.7.4