From 22b6dc9fdfe6504da3a9a6569c4beef9669a0c7b Mon Sep 17 00:00:00 2001 From: Linfeng Zhang Date: Fri, 4 Aug 2017 15:29:19 -0700 Subject: [PATCH] Update for loop increment of idct x86 functions Change-Id: Ided7895eaf41d5bc9d64fe536a17f5a078da68d4 --- vpx_dsp/x86/highbd_inv_txfm_sse2.h | 6 +++--- vpx_dsp/x86/inv_txfm_sse2.c | 35 +++++++++++++++++------------------ vpx_dsp/x86/inv_txfm_ssse3.c | 4 ++-- 3 files changed, 22 insertions(+), 23 deletions(-) diff --git a/vpx_dsp/x86/highbd_inv_txfm_sse2.h b/vpx_dsp/x86/highbd_inv_txfm_sse2.h index 16f03cd..ca771b5 100644 --- a/vpx_dsp/x86/highbd_inv_txfm_sse2.h +++ b/vpx_dsp/x86/highbd_inv_txfm_sse2.h @@ -233,10 +233,10 @@ static INLINE void highbd_idct_1_add_kernel(const tran_low_t *input, dc = _mm_set1_epi16(a1); for (i = 0; i < size; ++i) { - for (j = 0; j < (size >> 3); ++j) { - d = _mm_load_si128((const __m128i *)(&dest[j * 8])); + for (j = 0; j < size; j += 8) { + d = _mm_load_si128((const __m128i *)(&dest[j])); d = add_clamp(d, dc, bd); - _mm_store_si128((__m128i *)(&dest[j * 8]), d); + _mm_store_si128((__m128i *)(&dest[j]), d); } dest += stride; } diff --git a/vpx_dsp/x86/inv_txfm_sse2.c b/vpx_dsp/x86/inv_txfm_sse2.c index f730030..1df1c9d 100644 --- a/vpx_dsp/x86/inv_txfm_sse2.c +++ b/vpx_dsp/x86/inv_txfm_sse2.c @@ -458,10 +458,10 @@ void vpx_idct16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest, input += 128; } - for (i = 0; i < 2; i++) { + for (i = 0; i < 16; i += 8) { int j; - transpose_16bit_8x8(l + i * 8, out); - transpose_16bit_8x8(r + i * 8, out + 8); + transpose_16bit_8x8(l + i, out); + transpose_16bit_8x8(r + i, out + 8); idct16_8col(out); for (j = 0; j < 16; ++j) { @@ -489,9 +489,9 @@ void vpx_idct16x16_38_add_sse2(const tran_low_t *input, uint8_t *dest, in[15] = _mm_setzero_si128(); idct16_8col(in); - for (i = 0; i < 2; i++) { + for (i = 0; i < 16; i += 8) { int j; - transpose_16bit_8x8(in + i * 8, out); + transpose_16bit_8x8(in + i, out); out[8] = _mm_setzero_si128(); out[9] = _mm_setzero_si128(); out[10] = _mm_setzero_si128(); @@ -525,9 +525,9 @@ void vpx_idct16x16_10_add_sse2(const tran_low_t *input, uint8_t *dest, idct16x16_10_pass1(in, l); // Second 1-D inverse transform, performed per 8x16 block - for (i = 0; i < 2; i++) { + for (i = 0; i < 16; i += 8) { int j; - idct16x16_10_pass2(l + 8 * i, in); + idct16x16_10_pass2(l + i, in); for (j = 0; j < 16; ++j) { write_buffer_8x1(dest + j * stride, in[j]); @@ -1268,10 +1268,10 @@ void vpx_idct32x32_34_add_sse2(const tran_low_t *input, uint8_t *dest, col[29] = _mm_sub_epi16(stp1_2, stp1_29); col[30] = _mm_sub_epi16(stp1_1, stp1_30); col[31] = _mm_sub_epi16(stp1_0, stp1_31); - for (i = 0; i < 4; i++) { + for (i = 0; i < 32; i += 8) { int j; // Transpose 32x8 block to 8x32 block - transpose_16bit_8x8(col + i * 8, in); + transpose_16bit_8x8(col + i, in); IDCT32_34 // 2_D: Calculate the results and store them to destination. @@ -1588,10 +1588,10 @@ static void load_buffer_8x32(const tran_low_t *input, __m128i *in) { void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, int stride) { __m128i col[128], in[32]; - int i, j; + int i; // rows - for (i = 0; i < 4; ++i) { + for (i = 0; i < 4 * 32; i += 32) { load_buffer_8x32(input, in); input += 32 << 3; @@ -1601,17 +1601,16 @@ void vpx_idct32x32_1024_add_sse2(const tran_low_t *input, uint8_t *dest, transpose_16bit_8x8(in + 16, in + 16); transpose_16bit_8x8(in + 24, in + 24); - idct32_full_8x32(in, col + (i << 5)); + idct32_full_8x32(in, col + i); } // columns - for (i = 0; i < 4; ++i) { - j = i << 3; + for (i = 0; i < 32; i += 8) { // Transpose 32x8 block to 8x32 block - transpose_16bit_8x8(col + j, in); - transpose_16bit_8x8(col + j + 32, in + 8); - transpose_16bit_8x8(col + j + 64, in + 16); - transpose_16bit_8x8(col + j + 96, in + 24); + transpose_16bit_8x8(col + i, in); + transpose_16bit_8x8(col + i + 32, in + 8); + transpose_16bit_8x8(col + i + 64, in + 16); + transpose_16bit_8x8(col + i + 96, in + 24); idct32_full_8x32(in, in); store_buffer_8x32(in, dest, stride); diff --git a/vpx_dsp/x86/inv_txfm_ssse3.c b/vpx_dsp/x86/inv_txfm_ssse3.c index 0d8d60f..f9b37fe 100644 --- a/vpx_dsp/x86/inv_txfm_ssse3.c +++ b/vpx_dsp/x86/inv_txfm_ssse3.c @@ -236,10 +236,10 @@ void vpx_idct32x32_34_add_ssse3(const tran_low_t *input, uint8_t *dest, // 1_D: Store 32 intermediate results for each 8x32 block. add_sub_butterfly(stp1, col, 32); - for (i = 0; i < 4; i++) { + for (i = 0; i < 32; i += 8) { int j; // Transpose 32x8 block to 8x32 block - transpose_16bit_8x8(col + i * 8, in); + transpose_16bit_8x8(col + i, in); idct32_34_first_half(in, stp1); idct32_34_second_half(in, stp1); -- 2.7.4