From dd4f95335031db9e5b63bebb426b45f92c7db9b0 Mon Sep 17 00:00:00 2001 From: Johann Date: Fri, 18 Sep 2015 15:59:12 -0700 Subject: [PATCH] Remove vpx_filter_block1d16_v8_intrin_ssse3 This was rewritten and moved to vpx_dsp/x86/vpx_subpixel_8t_ssse3.asm in 195883023bb39b5ee5c6811a316ab96d9225034d Change-Id: I117ce983dae12006e302679ba7f175573dd9e874 --- vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c | 117 ----------------------------- 1 file changed, 117 deletions(-) diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c index 8d5c7c2..6fd5208 100644 --- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c +++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c @@ -291,123 +291,6 @@ void vpx_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr, } } -#if ARCH_X86_64 -static void vpx_filter_block1d16_v8_intrin_ssse3(const uint8_t *src_ptr, - ptrdiff_t src_pitch, - uint8_t *output_ptr, - ptrdiff_t out_pitch, - uint32_t output_height, - const int16_t *filter) { - __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt3; - __m128i firstFilters, secondFilters, thirdFilters, forthFilters; - __m128i srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8; - __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7; - __m128i srcReg8; - unsigned int i; - - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((const __m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the same data - // in both lanes of 128 bit register. - filtersReg =_mm_packs_epi16(filtersReg, filtersReg); - - // duplicate only the first 16 bits in the filter - firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); - // duplicate only the second 16 bits in the filter - secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); - // duplicate only the third 16 bits in the filter - thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); - // duplicate only the forth 16 bits in the filter - forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); - - // load the first 7 rows of 16 bytes - srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr)); - srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)); - srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); - srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); - srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); - srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); - srcReg7 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); - - for (i = 0; i < output_height; i++) { - // load the last 16 bytes - srcReg8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)); - - // merge the result together - srcRegFilt5 = _mm_unpacklo_epi8(srcReg1, srcReg2); - srcRegFilt6 = _mm_unpacklo_epi8(srcReg7, srcReg8); - srcRegFilt1 = _mm_unpackhi_epi8(srcReg1, srcReg2); - srcRegFilt3 = _mm_unpackhi_epi8(srcReg7, srcReg8); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters); - srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, forthFilters); - srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); - srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters); - - // add and saturate the results together - srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt6); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3); - - // merge the result together - srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4); - srcRegFilt6 = _mm_unpackhi_epi8(srcReg3, srcReg4); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); - srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, secondFilters); - - // merge the result together - srcRegFilt7 = _mm_unpacklo_epi8(srcReg5, srcReg6); - srcRegFilt8 = _mm_unpackhi_epi8(srcReg5, srcReg6); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, thirdFilters); - srcRegFilt8 = _mm_maddubs_epi16(srcRegFilt8, thirdFilters); - - // add and saturate the results together - srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, - _mm_min_epi16(srcRegFilt3, srcRegFilt7)); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, - _mm_min_epi16(srcRegFilt6, srcRegFilt8)); - - // add and saturate the results together - srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, - _mm_max_epi16(srcRegFilt3, srcRegFilt7)); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, - _mm_max_epi16(srcRegFilt6, srcRegFilt8)); - srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, addFilterReg64); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); - - // shift by 7 bit each 16 bit - srcRegFilt5 = _mm_srai_epi16(srcRegFilt5, 7); - srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - srcRegFilt1 = _mm_packus_epi16(srcRegFilt5, srcRegFilt1); - - src_ptr+=src_pitch; - - // shift down a row - srcReg1 = srcReg2; - srcReg2 = srcReg3; - srcReg3 = srcReg4; - srcReg4 = srcReg5; - srcReg5 = srcReg6; - srcReg6 = srcReg7; - srcReg7 = srcReg8; - - // save 16 bytes convolve result - _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); - - output_ptr+=out_pitch; - } -} -#endif // ARCH_X86_64 - filter8_1dfunction vpx_filter_block1d16_v8_ssse3; filter8_1dfunction vpx_filter_block1d16_h8_ssse3; filter8_1dfunction vpx_filter_block1d8_v8_ssse3; -- 2.7.4