From: Linfeng Zhang Date: Mon, 28 Aug 2017 17:35:43 +0000 (-0700) Subject: Remove get_filter_base() and get_filter_offset() in convolve X-Git-Tag: v1.7.0~188^2 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=d331e7a1c0c59d4055a3bfacd051268ec0832b48;p=platform%2Fupstream%2Flibvpx.git Remove get_filter_base() and get_filter_offset() in convolve so that the convolve functions are independent of table alignment. Change-Id: Ieab132a30d72c6e75bbe9473544fbe2cf51541ee --- diff --git a/test/convolve_test.cc b/test/convolve_test.cc index 535b9b0..f1a24af 100644 --- a/test/convolve_test.cc +++ b/test/convolve_test.cc @@ -33,9 +33,9 @@ static const unsigned int kMaxDimension = 64; typedef void (*ConvolveFunc)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, - int w, int h); + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h); typedef void (*WrapperFilterBlock2d8Func)( const uint8_t *src_ptr, const unsigned int src_stride, @@ -550,7 +550,7 @@ TEST_P(ConvolveTest, DISABLED_Copy_Speed) { vpx_usec_timer_start(&timer); for (int n = 0; n < kNumTests; ++n) { - UUT_->copy_[0](in, kInputStride, out, kOutputStride, NULL, 0, NULL, 0, + UUT_->copy_[0](in, kInputStride, out, kOutputStride, NULL, 0, 0, 0, 0, width, height); } vpx_usec_timer_mark(&timer); @@ -570,7 +570,7 @@ TEST_P(ConvolveTest, DISABLED_Avg_Speed) { vpx_usec_timer_start(&timer); for (int n = 0; n < kNumTests; ++n) { - UUT_->copy_[1](in, kInputStride, out, kOutputStride, NULL, 0, NULL, 0, + UUT_->copy_[1](in, kInputStride, out, kOutputStride, NULL, 0, 0, 0, 0, width, height); } vpx_usec_timer_mark(&timer); @@ -585,7 +585,7 @@ TEST_P(ConvolveTest, Copy) { uint8_t *const out = output(); ASM_REGISTER_STATE_CHECK(UUT_->copy_[0](in, kInputStride, out, kOutputStride, - NULL, 0, NULL, 0, Width(), Height())); + NULL, 0, 0, 0, 0, Width(), Height())); 
CheckGuardBlocks(); @@ -604,7 +604,7 @@ TEST_P(ConvolveTest, Avg) { CopyOutputToRef(); ASM_REGISTER_STATE_CHECK(UUT_->copy_[1](in, kInputStride, out, kOutputStride, - NULL, 0, NULL, 0, Width(), Height())); + NULL, 0, 0, 0, 0, Width(), Height())); CheckGuardBlocks(); @@ -621,12 +621,10 @@ TEST_P(ConvolveTest, Avg) { TEST_P(ConvolveTest, CopyHoriz) { uint8_t *const in = input(); uint8_t *const out = output(); - DECLARE_ALIGNED(256, const int16_t, - filter8[8]) = { 0, 0, 0, 128, 0, 0, 0, 0 }; ASM_REGISTER_STATE_CHECK(UUT_->sh8_[0](in, kInputStride, out, kOutputStride, - filter8, 16, filter8, 16, Width(), - Height())); + vp9_filter_kernels[0], 0, 16, 0, 16, + Width(), Height())); CheckGuardBlocks(); @@ -641,12 +639,10 @@ TEST_P(ConvolveTest, CopyHoriz) { TEST_P(ConvolveTest, CopyVert) { uint8_t *const in = input(); uint8_t *const out = output(); - DECLARE_ALIGNED(256, const int16_t, - filter8[8]) = { 0, 0, 0, 128, 0, 0, 0, 0 }; ASM_REGISTER_STATE_CHECK(UUT_->sv8_[0](in, kInputStride, out, kOutputStride, - filter8, 16, filter8, 16, Width(), - Height())); + vp9_filter_kernels[0], 0, 16, 0, 16, + Width(), Height())); CheckGuardBlocks(); @@ -661,12 +657,10 @@ TEST_P(ConvolveTest, CopyVert) { TEST_P(ConvolveTest, Copy2D) { uint8_t *const in = input(); uint8_t *const out = output(); - DECLARE_ALIGNED(256, const int16_t, - filter8[8]) = { 0, 0, 0, 128, 0, 0, 0, 0 }; ASM_REGISTER_STATE_CHECK(UUT_->shv8_[0](in, kInputStride, out, kOutputStride, - filter8, 16, filter8, 16, Width(), - Height())); + vp9_filter_kernels[0], 0, 16, 0, 16, + Width(), Height())); CheckGuardBlocks(); @@ -702,7 +696,6 @@ TEST(ConvolveTest, FiltersWontSaturateWhenAddedPairwise) { } } -const int16_t kInvalidFilter[8] = { 0 }; const WrapperFilterBlock2d8Func wrapper_filter_block2d_8[2] = { wrapper_filter_block2d_8_c, wrapper_filter_average_block2d_8_c }; @@ -755,21 +748,21 @@ TEST_P(ConvolveTest, MatchesReferenceSubpixelFilter) { Width(), Height(), UUT_->use_highbd_); if (filter_x && filter_y) - 
ASM_REGISTER_STATE_CHECK(UUT_->hv8_[i]( - in, kInputStride, out, kOutputStride, filters[filter_x], 16, - filters[filter_y], 16, Width(), Height())); + ASM_REGISTER_STATE_CHECK( + UUT_->hv8_[i](in, kInputStride, out, kOutputStride, filters, + filter_x, 16, filter_y, 16, Width(), Height())); else if (filter_y) - ASM_REGISTER_STATE_CHECK(UUT_->v8_[i]( - in, kInputStride, out, kOutputStride, kInvalidFilter, 16, - filters[filter_y], 16, Width(), Height())); + ASM_REGISTER_STATE_CHECK( + UUT_->v8_[i](in, kInputStride, out, kOutputStride, filters, 0, + 16, filter_y, 16, Width(), Height())); else if (filter_x) - ASM_REGISTER_STATE_CHECK(UUT_->h8_[i]( - in, kInputStride, out, kOutputStride, filters[filter_x], 16, - kInvalidFilter, 16, Width(), Height())); + ASM_REGISTER_STATE_CHECK( + UUT_->h8_[i](in, kInputStride, out, kOutputStride, filters, + filter_x, 16, 0, 16, Width(), Height())); else - ASM_REGISTER_STATE_CHECK(UUT_->copy_[i]( - in, kInputStride, out, kOutputStride, kInvalidFilter, 0, - kInvalidFilter, 0, Width(), Height())); + ASM_REGISTER_STATE_CHECK(UUT_->copy_[i](in, kInputStride, out, + kOutputStride, NULL, 0, 0, + 0, 0, Width(), Height())); CheckGuardBlocks(); @@ -853,21 +846,21 @@ TEST_P(ConvolveTest, FilterExtremes) { filters[filter_y], ref, kOutputStride, Width(), Height(), UUT_->use_highbd_); if (filter_x && filter_y) - ASM_REGISTER_STATE_CHECK(UUT_->hv8_[0]( - in, kInputStride, out, kOutputStride, filters[filter_x], 16, - filters[filter_y], 16, Width(), Height())); + ASM_REGISTER_STATE_CHECK( + UUT_->hv8_[0](in, kInputStride, out, kOutputStride, filters, + filter_x, 16, filter_y, 16, Width(), Height())); else if (filter_y) - ASM_REGISTER_STATE_CHECK(UUT_->v8_[0]( - in, kInputStride, out, kOutputStride, kInvalidFilter, 16, - filters[filter_y], 16, Width(), Height())); + ASM_REGISTER_STATE_CHECK( + UUT_->v8_[0](in, kInputStride, out, kOutputStride, filters, 0, + 16, filter_y, 16, Width(), Height())); else if (filter_x) - 
ASM_REGISTER_STATE_CHECK(UUT_->h8_[0]( - in, kInputStride, out, kOutputStride, filters[filter_x], 16, - kInvalidFilter, 16, Width(), Height())); + ASM_REGISTER_STATE_CHECK( + UUT_->h8_[0](in, kInputStride, out, kOutputStride, filters, + filter_x, 16, 0, 16, Width(), Height())); else - ASM_REGISTER_STATE_CHECK(UUT_->copy_[0]( - in, kInputStride, out, kOutputStride, kInvalidFilter, 0, - kInvalidFilter, 0, Width(), Height())); + ASM_REGISTER_STATE_CHECK(UUT_->copy_[0](in, kInputStride, out, + kOutputStride, NULL, 0, 0, + 0, 0, Width(), Height())); for (int y = 0; y < Height(); ++y) { for (int x = 0; x < Width(); ++x) @@ -897,8 +890,8 @@ TEST_P(ConvolveTest, CheckScalingFiltering) { for (int step = 1; step <= 32; ++step) { /* Test the horizontal and vertical filters in combination. */ ASM_REGISTER_STATE_CHECK( - UUT_->shv8_[0](in, kInputStride, out, kOutputStride, eighttap[frac], - step, eighttap[frac], step, Width(), Height())); + UUT_->shv8_[0](in, kInputStride, out, kOutputStride, eighttap, frac, + step, frac, step, Width(), Height())); CheckGuardBlocks(); @@ -917,14 +910,14 @@ TEST_P(ConvolveTest, CheckScalingFiltering) { using std::tr1::make_tuple; #if CONFIG_VP9_HIGHBITDEPTH -#define WRAP(func, bd) \ - void wrap_##func##_##bd( \ - const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ - ptrdiff_t dst_stride, const int16_t *filter_x, int filter_x_stride, \ - const int16_t *filter_y, int filter_y_stride, int w, int h) { \ - vpx_highbd_##func(reinterpret_cast<const uint16_t *>(src), src_stride, \ - reinterpret_cast<uint16_t *>(dst), dst_stride, filter_x, \ - filter_x_stride, filter_y, filter_y_stride, w, h, bd); \ +#define WRAP(func, bd) \ + void wrap_##func##_##bd( \ + const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, \ + ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \ + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { \ + vpx_highbd_##func(reinterpret_cast<const uint16_t *>(src), src_stride, \ + reinterpret_cast<uint16_t *>(dst), dst_stride, filter, \ + x0_q4, x_step_q4, y0_q4, y_step_q4, w, 
h, bd); \ } #if HAVE_SSE2 && ARCH_X86_64 diff --git a/vp9/common/vp9_reconinter.h b/vp9/common/vp9_reconinter.h index 1b09b38..bb9291a 100644 --- a/vp9/common/vp9_reconinter.h +++ b/vp9/common/vp9_reconinter.h @@ -26,9 +26,9 @@ static INLINE void inter_predictor(const uint8_t *src, int src_stride, const struct scale_factors *sf, int w, int h, int ref, const InterpKernel *kernel, int xs, int ys) { - sf->predict[subpel_x != 0][subpel_y != 0][ref]( - src, src_stride, dst, dst_stride, kernel[subpel_x], xs, kernel[subpel_y], - ys, w, h); + sf->predict[subpel_x != 0][subpel_y != 0][ref](src, src_stride, dst, + dst_stride, kernel, subpel_x, + xs, subpel_y, ys, w, h); } #if CONFIG_VP9_HIGHBITDEPTH @@ -37,8 +37,8 @@ static INLINE void highbd_inter_predictor( const int subpel_x, const int subpel_y, const struct scale_factors *sf, int w, int h, int ref, const InterpKernel *kernel, int xs, int ys, int bd) { sf->highbd_predict[subpel_x != 0][subpel_y != 0][ref]( - src, src_stride, dst, dst_stride, kernel[subpel_x], xs, kernel[subpel_y], - ys, w, h, bd); + src, src_stride, dst, dst_stride, kernel, subpel_x, xs, subpel_y, ys, w, + h, bd); } #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c index cea5f42..70ddc7c 100644 --- a/vp9/encoder/vp9_denoiser.c +++ b/vp9/encoder/vp9_denoiser.c @@ -390,12 +390,12 @@ void vp9_denoiser_denoise(VP9_COMP *cpi, MACROBLOCK *mb, int mi_row, int mi_col, } if (decision == FILTER_BLOCK) { - vpx_convolve_copy(avg_start, avg.y_stride, src.buf, src.stride, NULL, 0, - NULL, 0, num_4x4_blocks_wide_lookup[bs] << 2, + vpx_convolve_copy(avg_start, avg.y_stride, src.buf, src.stride, NULL, 0, 0, + 0, 0, num_4x4_blocks_wide_lookup[bs] << 2, num_4x4_blocks_high_lookup[bs] << 2); } else { // COPY_BLOCK - vpx_convolve_copy(src.buf, src.stride, avg_start, avg.y_stride, NULL, 0, - NULL, 0, num_4x4_blocks_wide_lookup[bs] << 2, + vpx_convolve_copy(src.buf, src.stride, avg_start, avg.y_stride, NULL, 0, 0, + 0, 0, 
num_4x4_blocks_wide_lookup[bs] << 2, num_4x4_blocks_high_lookup[bs] << 2); } *denoiser_decision = decision; diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 870ba9f..29f2366 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -2645,15 +2645,14 @@ static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src, if (src->flags & YV12_FLAG_HIGHBITDEPTH) { vpx_highbd_convolve8(CONVERT_TO_SHORTPTR(src_ptr), src_stride, - CONVERT_TO_SHORTPTR(dst_ptr), dst_stride, - kernel[x_q4 & 0xf], 16 * src_w / dst_w, - kernel[y_q4 & 0xf], 16 * src_h / dst_h, - 16 / factor, 16 / factor, bd); + CONVERT_TO_SHORTPTR(dst_ptr), dst_stride, kernel, + x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf, + 16 * src_h / dst_h, 16 / factor, 16 / factor, + bd); } else { - vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, - kernel[x_q4 & 0xf], 16 * src_w / dst_w, - kernel[y_q4 & 0xf], 16 * src_h / dst_h, 16 / factor, - 16 / factor); + vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, kernel, + x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf, + 16 * src_h / dst_h, 16 / factor, 16 / factor); } } } diff --git a/vp9/encoder/vp9_frame_scale.c b/vp9/encoder/vp9_frame_scale.c index e586283..f49c443 100644 --- a/vp9/encoder/vp9_frame_scale.c +++ b/vp9/encoder/vp9_frame_scale.c @@ -43,10 +43,9 @@ void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src, (x / factor) * src_w / dst_w; uint8_t *dst_ptr = dsts[i] + (y / factor) * dst_stride + (x / factor); - vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, - kernel[x_q4 & 0xf], 16 * src_w / dst_w, - kernel[y_q4 & 0xf], 16 * src_h / dst_h, 16 / factor, - 16 / factor); + vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride, kernel, + x_q4 & 0xf, 16 * src_w / dst_w, y_q4 & 0xf, + 16 * src_h / dst_h, 16 / factor, 16 / factor); } } } diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index 6d9d0f0..82ec4b8 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ 
-2162,15 +2162,15 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, vpx_highbd_convolve_copy( CONVERT_TO_SHORTPTR(best_pred->data), best_pred->stride, CONVERT_TO_SHORTPTR(this_mode_pred->data), this_mode_pred->stride, - NULL, 0, NULL, 0, bw, bh, xd->bd); + NULL, 0, 0, 0, 0, bw, bh, xd->bd); else vpx_convolve_copy(best_pred->data, best_pred->stride, this_mode_pred->data, this_mode_pred->stride, NULL, - 0, NULL, 0, bw, bh); + 0, 0, 0, 0, bw, bh); #else vpx_convolve_copy(best_pred->data, best_pred->stride, this_mode_pred->data, this_mode_pred->stride, NULL, 0, - NULL, 0, bw, bh); + 0, 0, 0, bw, bh); #endif // CONFIG_VP9_HIGHBITDEPTH best_pred = this_mode_pred; } @@ -2264,14 +2264,14 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (cm->use_highbitdepth) vpx_highbd_convolve_copy( CONVERT_TO_SHORTPTR(best_pred->data), best_pred->stride, - CONVERT_TO_SHORTPTR(pd->dst.buf), pd->dst.stride, NULL, 0, NULL, 0, + CONVERT_TO_SHORTPTR(pd->dst.buf), pd->dst.stride, NULL, 0, 0, 0, 0, bw, bh, xd->bd); else vpx_convolve_copy(best_pred->data, best_pred->stride, pd->dst.buf, - pd->dst.stride, NULL, 0, NULL, 0, bw, bh); + pd->dst.stride, NULL, 0, 0, 0, 0, bw, bh); #else vpx_convolve_copy(best_pred->data, best_pred->stride, pd->dst.buf, - pd->dst.stride, NULL, 0, NULL, 0, bw, bh); + pd->dst.stride, NULL, 0, 0, 0, 0, bw, bh); #endif // CONFIG_VP9_HIGHBITDEPTH } } diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index e5b4e6a..b101f35 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -600,7 +600,7 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { vpx_highbd_convolve_copy(CONVERT_TO_SHORTPTR(dst), dst_stride, recon16, - 32, NULL, 0, NULL, 0, bs, bs, xd->bd); + 32, NULL, 0, 0, 0, 0, bs, bs, xd->bd); if (xd->lossless) { vp9_highbd_iwht4x4_add(dqcoeff, recon16, 32, *eob, xd->bd); } else { @@ 
-623,7 +623,7 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, recon = CONVERT_TO_BYTEPTR(recon16); } else { #endif // CONFIG_VP9_HIGHBITDEPTH - vpx_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, NULL, 0, bs, bs); + vpx_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, 0, 0, 0, bs, bs); switch (tx_size) { case TX_32X32: vp9_idct32x32_add(dqcoeff, recon, 32, *eob); break; case TX_16X16: vp9_idct16x16_add(dqcoeff, recon, 32, *eob); break; diff --git a/vpx_dsp/arm/highbd_vpx_convolve8_neon.c b/vpx_dsp/arm/highbd_vpx_convolve8_neon.c index 74345e1..a12e803 100644 --- a/vpx_dsp/arm/highbd_vpx_convolve8_neon.c +++ b/vpx_dsp/arm/highbd_vpx_convolve8_neon.c @@ -137,15 +137,14 @@ static INLINE uint16x8_t convolve8_8(const int16x8_t s0, const int16x8_t s1, void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, // unused - int y_step_q4, // unused + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { if (x_step_q4 != 16) { - vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h, bd); + vpx_highbd_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); } else { - const int16x8_t filters = vld1q_s16(filter_x); + const int16x8_t filters = vld1q_s16(filter[x0_q4]); const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); uint16x8_t t0, t1, t2, t3; @@ -337,15 +336,15 @@ void vpx_highbd_convolve8_horiz_neon(const uint16_t *src, ptrdiff_t src_stride, void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, // unused - int y_step_q4, // unused - int w, int h, int bd) { + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, + 
int y_step_q4, int w, int h, int bd) { if (x_step_q4 != 16) { - vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h, bd); + vpx_highbd_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, + bd); } else { - const int16x8_t filters = vld1q_s16(filter_x); + const int16x8_t filters = vld1q_s16(filter[x0_q4]); const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); uint16x8_t t0, t1, t2, t3; @@ -566,15 +565,14 @@ void vpx_highbd_convolve8_avg_horiz_neon(const uint16_t *src, void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, // unused - int x_step_q4, // unused - const int16_t *filter_y, int y_step_q4, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { if (y_step_q4 != 16) { - vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h, bd); + vpx_highbd_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h, bd); } else { - const int16x8_t filters = vld1q_s16(filter_y); + const int16x8_t filters = vld1q_s16(filter[y0_q4]); const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); assert(!((intptr_t)dst & 3)); @@ -732,15 +730,15 @@ void vpx_highbd_convolve8_vert_neon(const uint16_t *src, ptrdiff_t src_stride, void vpx_highbd_convolve8_avg_vert_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, // unused - int x_step_q4, // unused - const int16_t *filter_y, int y_step_q4, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { if (y_step_q4 != 16) { - vpx_highbd_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h, bd); + vpx_highbd_convolve8_avg_vert_c(src, 
src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, + bd); } else { - const int16x8_t filters = vld1q_s16(filter_y); + const int16x8_t filters = vld1q_s16(filter[y0_q4]); const uint16x8_t max = vdupq_n_u16((1 << bd) - 1); assert(!((intptr_t)dst & 3)); diff --git a/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c b/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c index 4ff3dea..765a054 100644 --- a/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c +++ b/vpx_dsp/arm/highbd_vpx_convolve_avg_neon.c @@ -15,13 +15,14 @@ void vpx_highbd_convolve_avg_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; (void)bd; if (w < 8) { // avg4 diff --git a/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c b/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c index 61712d4..9d2752e 100644 --- a/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c +++ b/vpx_dsp/arm/highbd_vpx_convolve_copy_neon.c @@ -15,13 +15,14 @@ void vpx_highbd_convolve_copy_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; (void)bd; if (w < 8) { // copy4 diff --git a/vpx_dsp/arm/highbd_vpx_convolve_neon.c b/vpx_dsp/arm/highbd_vpx_convolve_neon.c index 1f2631b..414ade3 100644 --- a/vpx_dsp/arm/highbd_vpx_convolve_neon.c +++ 
b/vpx_dsp/arm/highbd_vpx_convolve_neon.c @@ -15,10 +15,9 @@ void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { - const int y0_q4 = get_filter_offset(filter_y, get_filter_base(filter_y)); // + 1 to make it divisible by 4 uint16_t temp[64 * 136]; const int intermediate_height = @@ -29,20 +28,19 @@ void vpx_highbd_convolve8_neon(const uint16_t *src, ptrdiff_t src_stride, * buffer which has lots of extra room and is subsequently discarded this is * safe if somewhat less than ideal. */ vpx_highbd_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, - filter_x, x_step_q4, filter_y, y_step_q4, w, + filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, intermediate_height, bd); /* Step into the temp buffer 3 lines to get the actual frame data */ - vpx_highbd_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h, bd); + vpx_highbd_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, bd); } void vpx_highbd_convolve8_avg_neon(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { - const int y0_q4 = get_filter_offset(filter_y, get_filter_base(filter_y)); // + 1 to make it divisible by 4 uint16_t temp[64 * 136]; const int intermediate_height = @@ -52,8 +50,9 @@ void vpx_highbd_convolve8_avg_neon(const uint16_t *src, ptrdiff_t src_stride, * to average the values after both passes. 
*/ vpx_highbd_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, - filter_x, x_step_q4, filter_y, y_step_q4, w, + filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, intermediate_height, bd); - vpx_highbd_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h, bd); + vpx_highbd_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, + bd); } diff --git a/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm b/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm index e279d57..1c2ee50 100644 --- a/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm +++ b/vpx_dsp/arm/vpx_convolve8_avg_neon_asm.asm @@ -42,10 +42,11 @@ ; r1 int src_stride ; r2 uint8_t *dst ; r3 int dst_stride -; sp[]const int16_t *filter_x -; sp[]int x_step_q4 -; sp[]const int16_t *filter_y ; unused -; sp[]int y_step_q4 ; unused +; sp[]const int16_t *filter +; sp[]int x0_q4 +; sp[]int x_step_q4 ; unused +; sp[]int y0_q4 +; sp[]int y_step_q4 ; unused ; sp[]int w ; sp[]int h @@ -54,11 +55,11 @@ sub r0, r0, #3 ; adjust for taps - ldr r5, [sp, #32] ; filter_x - ldr r6, [sp, #48] ; w - ldr r7, [sp, #52] ; h + ldrd r4, r5, [sp, #32] ; filter, x0_q4 + add r4, r5, lsl #4 + ldrd r6, r7, [sp, #52] ; w, h - vld1.s16 {q0}, [r5] ; filter_x + vld1.s16 {q0}, [r4] ; filter sub r8, r1, r1, lsl #2 ; -src_stride * 3 add r8, r8, #4 ; -src_stride * 3 + 4 @@ -127,7 +128,7 @@ vpx_convolve8_avg_loop_horiz sub r2, r2, r3, lsl #2 ; reset for store - ; src[] * filter_x + ; src[] * filter MULTIPLY_BY_Q0 q1, d16, d17, d20, d22, d18, d19, d23, d24 MULTIPLY_BY_Q0 q2, d17, d20, d22, d18, d19, d23, d24, d26 MULTIPLY_BY_Q0 q14, d20, d22, d18, d19, d23, d24, d26, d27 @@ -184,11 +185,13 @@ vpx_convolve8_avg_loop_horiz sub r0, r0, r1 sub r0, r0, r1, lsl #1 - ldr r4, [sp, #32] ; filter_y - ldr r6, [sp, #40] ; w - ldr lr, [sp, #44] ; h + ldr r4, [sp, #24] ; filter + ldr r5, [sp, #36] ; y0_q4 + add r4, r5, lsl #4 + ldr r6, [sp, #44] ; w + ldr lr, [sp, #48] ; 
h - vld1.s16 {q0}, [r4] ; filter_y + vld1.s16 {q0}, [r4] ; filter lsl r1, r1, #1 lsl r3, r3, #1 @@ -232,7 +235,7 @@ vpx_convolve8_avg_loop_vert pld [r7] pld [r4] - ; src[] * filter_y + ; src[] * filter MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d24 pld [r7, r1] diff --git a/vpx_dsp/arm/vpx_convolve8_neon.c b/vpx_dsp/arm/vpx_convolve8_neon.c index 1386838..3aaa9e3 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon.c +++ b/vpx_dsp/arm/vpx_convolve8_neon.c @@ -125,11 +125,10 @@ static INLINE int16x8_t convolve8_8(int16x8_t s0, int16x8_t s1, int16x8_t s2, void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, // unused - int y_step_q4, // unused - int w, int h) { - const int16x8_t filters = vld1q_s16(filter_x); + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + const int16x8_t filters = vld1q_s16(filter[x0_q4]); uint8x8_t t0, t1, t2, t3; assert(!((intptr_t)dst & 3)); @@ -137,8 +136,8 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, assert(x_step_q4 == 16); (void)x_step_q4; + (void)y0_q4; (void)y_step_q4; - (void)filter_y; src -= 3; @@ -390,11 +389,10 @@ void vpx_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, // unused - int y_step_q4, // unused + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - const int16x8_t filters = vld1q_s16(filter_x); + const int16x8_t filters = vld1q_s16(filter[x0_q4]); uint8x8_t t0, t1, t2, t3; assert(!((intptr_t)dst & 3)); @@ -402,8 +400,8 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, assert(x_step_q4 == 16); (void)x_step_q4; + (void)y0_q4; (void)y_step_q4; - 
(void)filter_y; src -= 3; @@ -692,19 +690,18 @@ void vpx_convolve8_avg_horiz_neon(const uint8_t *src, ptrdiff_t src_stride, void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, // unused - int x_step_q4, // unused - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - const int16x8_t filters = vld1q_s16(filter_y); + const int16x8_t filters = vld1q_s16(filter[y0_q4]); assert(!((intptr_t)dst & 3)); assert(!(dst_stride & 3)); assert(y_step_q4 == 16); + (void)x0_q4; (void)x_step_q4; (void)y_step_q4; - (void)filter_x; src -= 3 * src_stride; @@ -864,19 +861,18 @@ void vpx_convolve8_vert_neon(const uint8_t *src, ptrdiff_t src_stride, void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, // unused - int x_step_q4, // unused - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - const int16x8_t filters = vld1q_s16(filter_y); + const int16x8_t filters = vld1q_s16(filter[y0_q4]); assert(!((intptr_t)dst & 3)); assert(!(dst_stride & 3)); assert(y_step_q4 == 16); + (void)x0_q4; (void)x_step_q4; (void)y_step_q4; - (void)filter_x; src -= 3 * src_stride; diff --git a/vpx_dsp/arm/vpx_convolve8_neon_asm.asm b/vpx_dsp/arm/vpx_convolve8_neon_asm.asm index 2d0f2ae..5eee156 100644 --- a/vpx_dsp/arm/vpx_convolve8_neon_asm.asm +++ b/vpx_dsp/arm/vpx_convolve8_neon_asm.asm @@ -42,10 +42,11 @@ ; r1 int src_stride ; r2 uint8_t *dst ; r3 int dst_stride -; sp[]const int16_t *filter_x -; sp[]int x_step_q4 -; sp[]const int16_t *filter_y ; unused -; sp[]int y_step_q4 ; unused +; sp[]const int16_t *filter +; sp[]int x0_q4 +; sp[]int x_step_q4 ; unused +; sp[]int y0_q4 +; sp[]int y_step_q4 ; unused ; sp[]int w ; sp[]int h @@ -54,11 +55,11 @@ sub r0, 
r0, #3 ; adjust for taps - ldr r5, [sp, #32] ; filter_x - ldr r6, [sp, #48] ; w - ldr r7, [sp, #52] ; h + ldrd r4, r5, [sp, #32] ; filter, x0_q4 + add r4, r5, lsl #4 + ldrd r6, r7, [sp, #52] ; w, h - vld1.s16 {q0}, [r5] ; filter_x + vld1.s16 {q0}, [r4] ; filter sub r8, r1, r1, lsl #2 ; -src_stride * 3 add r8, r8, #4 ; -src_stride * 3 + 4 @@ -119,7 +120,7 @@ vpx_convolve8_loop_horiz pld [r5, r1, lsl #1] - ; src[] * filter_x + ; src[] * filter MULTIPLY_BY_Q0 q1, d16, d17, d20, d22, d18, d19, d23, d24 MULTIPLY_BY_Q0 q2, d17, d20, d22, d18, d19, d23, d24, d26 MULTIPLY_BY_Q0 q14, d20, d22, d18, d19, d23, d24, d26, d27 @@ -173,11 +174,13 @@ vpx_convolve8_loop_horiz sub r0, r0, r1 sub r0, r0, r1, lsl #1 - ldr r4, [sp, #32] ; filter_y - ldr r6, [sp, #40] ; w - ldr lr, [sp, #44] ; h + ldr r4, [sp, #24] ; filter + ldr r5, [sp, #36] ; y0_q4 + add r4, r5, lsl #4 + ldr r6, [sp, #44] ; w + ldr lr, [sp, #48] ; h - vld1.s16 {q0}, [r4] ; filter_y + vld1.s16 {q0}, [r4] ; filter lsl r1, r1, #1 lsl r3, r3, #1 @@ -216,7 +219,7 @@ vpx_convolve8_loop_vert pld [r5] pld [r8] - ; src[] * filter_y + ; src[] * filter MULTIPLY_BY_Q0 q1, d16, d17, d18, d19, d20, d21, d22, d24 pld [r5, r3] diff --git a/vpx_dsp/arm/vpx_convolve_avg_neon.c b/vpx_dsp/arm/vpx_convolve_avg_neon.c index 04cb835..07349d0 100644 --- a/vpx_dsp/arm/vpx_convolve_avg_neon.c +++ b/vpx_dsp/arm/vpx_convolve_avg_neon.c @@ -15,13 +15,13 @@ void vpx_convolve_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, int w, - int h) { - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; if (w < 8) { // avg4 uint8x8_t s0, s1; diff --git a/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm 
b/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm index 97e6189..efd6574 100644 --- a/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm +++ b/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm @@ -17,7 +17,7 @@ |vpx_convolve_avg_neon| PROC push {r4-r6, lr} - ldrd r4, r5, [sp, #32] + ldrd r4, r5, [sp, #36] mov r6, r2 cmp r4, #32 diff --git a/vpx_dsp/arm/vpx_convolve_copy_neon.c b/vpx_dsp/arm/vpx_convolve_copy_neon.c index a8f690a..7abed67 100644 --- a/vpx_dsp/arm/vpx_convolve_copy_neon.c +++ b/vpx_dsp/arm/vpx_convolve_copy_neon.c @@ -15,13 +15,14 @@ void vpx_convolve_copy_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; if (w < 8) { // copy4 do { diff --git a/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm b/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm index 89164ad..7a66e3c 100644 --- a/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm +++ b/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm @@ -17,7 +17,7 @@ |vpx_convolve_copy_neon| PROC push {r4-r5, lr} - ldrd r4, r5, [sp, #28] + ldrd r4, r5, [sp, #32] cmp r4, #32 bgt copy64 diff --git a/vpx_dsp/arm/vpx_convolve_neon.c b/vpx_dsp/arm/vpx_convolve_neon.c index bdaaff16..2bf2d89 100644 --- a/vpx_dsp/arm/vpx_convolve_neon.c +++ b/vpx_dsp/arm/vpx_convolve_neon.c @@ -15,8 +15,8 @@ #include "vpx_ports/mem.h" void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { /* Given our constraints: w <= 64, h <= 64, taps == 8 
we can reduce the * maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4). @@ -33,19 +33,19 @@ void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, * height and filter a multiple of 4 lines. Since this goes in to the temp * buffer which has lots of extra room and is subsequently discarded this is * safe if somewhat less than ideal. */ - vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter_x, - x_step_q4, filter_y, y_step_q4, w, + vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, intermediate_height); /* Step into the temp buffer 3 lines to get the actual frame data */ - vpx_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); + vpx_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { uint8_t temp[64 * 72]; const int intermediate_height = h + 7; @@ -56,9 +56,9 @@ void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride, /* This implementation has the same issues as above. In addition, we only want * to average the values after both passes. 
*/ - vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter_x, - x_step_q4, filter_y, y_step_q4, w, + vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, intermediate_height); - vpx_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } diff --git a/vpx_dsp/mips/convolve2_avg_dspr2.c b/vpx_dsp/mips/convolve2_avg_dspr2.c index ae88edd..18e7d53 100644 --- a/vpx_dsp/mips/convolve2_avg_dspr2.c +++ b/vpx_dsp/mips/convolve2_avg_dspr2.c @@ -219,9 +219,10 @@ static void convolve_bi_avg_vert_64_dspr2(const uint8_t *src, void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int16_t *const filter_y = filter[y0_q4]; uint32_t pos = 38; assert(y_step_q4 == 16); @@ -247,8 +248,8 @@ void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, h); break; default: - vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c b/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c index e944207..7dcb662 100644 --- a/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c +++ b/vpx_dsp/mips/convolve2_avg_horiz_dspr2.c @@ -751,9 +751,10 @@ static void convolve_bi_avg_horiz_64_dspr2(const uint8_t *src_ptr, void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int 
x_step_q4, - const int16_t *filter_y, int y_step_q4, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; uint32_t pos = 38; assert(x_step_q4 == 16); @@ -793,8 +794,8 @@ void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, h); break; default: - vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/vpx_dsp/mips/convolve2_horiz_dspr2.c b/vpx_dsp/mips/convolve2_horiz_dspr2.c index 5cc06b5..9e65a8f 100644 --- a/vpx_dsp/mips/convolve2_horiz_dspr2.c +++ b/vpx_dsp/mips/convolve2_horiz_dspr2.c @@ -628,9 +628,10 @@ static void convolve_bi_horiz_64_dspr2(const uint8_t *src_ptr, void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; uint32_t pos = 38; assert(x_step_q4 == 16); @@ -672,8 +673,8 @@ void vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, (int32_t)dst_stride, filter_x, (int32_t)h); break; default: - vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/vpx_dsp/mips/convolve2_vert_dspr2.c b/vpx_dsp/mips/convolve2_vert_dspr2.c index eb1975e..a3e967b 100644 --- a/vpx_dsp/mips/convolve2_vert_dspr2.c +++ b/vpx_dsp/mips/convolve2_vert_dspr2.c @@ -201,9 +201,10 @@ static void convolve_bi_vert_64_dspr2(const uint8_t *src, int32_t src_stride, void 
vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int16_t *const filter_y = filter[y0_q4]; uint32_t pos = 38; assert(y_step_q4 == 16); @@ -228,8 +229,8 @@ void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, convolve_bi_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h); break; default: - vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/vpx_dsp/mips/convolve8_avg_dspr2.c b/vpx_dsp/mips/convolve8_avg_dspr2.c index b4ed6ee..d9c2bef 100644 --- a/vpx_dsp/mips/convolve8_avg_dspr2.c +++ b/vpx_dsp/mips/convolve8_avg_dspr2.c @@ -334,15 +334,16 @@ static void convolve_avg_vert_64_dspr2(const uint8_t *src, int32_t src_stride, void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + const int16_t *const filter_y = filter[y0_q4]; assert(y_step_q4 == 16); assert(((const int32_t *)filter_y)[1] != 0x800000); if (((const int32_t *)filter_y)[0] == 0) { - vpx_convolve2_avg_vert_dspr2(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve2_avg_vert_dspr2(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); } else { uint32_t pos = 38; @@ -367,8 +368,8 @@ void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, h); break; default: - 
vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); break; } } @@ -376,8 +377,8 @@ void vpx_convolve8_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, void vpx_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, int w, int h) { /* Fixed size intermediate buffer places limits on parameters. */ DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]); @@ -390,24 +391,26 @@ void vpx_convolve8_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, if (intermediate_height < h) intermediate_height = h; - vpx_convolve8_horiz(src - (src_stride * 3), src_stride, temp, 64, filter_x, - x_step_q4, filter_y, y_step_q4, w, intermediate_height); + vpx_convolve8_horiz(src - (src_stride * 3), src_stride, temp, 64, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, + intermediate_height); - vpx_convolve8_avg_vert(temp + 64 * 3, 64, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_vert(temp + 64 * 3, 64, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } void vpx_convolve_avg_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, int w, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, int w, int h) { int x, y; uint32_t tp1, tp2, tn1, tp3, tp4, tn2; - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; /* prefetch data to cache memory */ prefetch_load(src); diff --git 
a/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c b/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c index 9a9bab2..fb68ad8 100644 --- a/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c +++ b/vpx_dsp/mips/convolve8_avg_horiz_dspr2.c @@ -938,15 +938,16 @@ static void convolve_avg_horiz_64_dspr2(const uint8_t *src_ptr, void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; assert(x_step_q4 == 16); assert(((const int32_t *)filter_x)[1] != 0x800000); if (((const int32_t *)filter_x)[0] == 0) { - vpx_convolve2_avg_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve2_avg_horiz_dspr2(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); } else { uint32_t pos = 38; @@ -987,9 +988,8 @@ void vpx_convolve8_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, h); break; default: - vpx_convolve8_avg_horiz_c(src + 3, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, w, - h); + vpx_convolve8_avg_horiz_c(src + 3, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/vpx_dsp/mips/convolve8_dspr2.c b/vpx_dsp/mips/convolve8_dspr2.c index 8d35b63..89f0f41 100644 --- a/vpx_dsp/mips/convolve8_dspr2.c +++ b/vpx_dsp/mips/convolve8_dspr2.c @@ -1296,9 +1296,11 @@ void copy_horiz_transposed(const uint8_t *src, ptrdiff_t src_stride, } void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int32_t x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t 
*const filter_x = filter[x0_q4]; + const int16_t *const filter_y = filter[y0_q4]; DECLARE_ALIGNED(32, uint8_t, temp[64 * 135]); int32_t intermediate_height = ((h * y_step_q4) >> 4) + 7; uint32_t pos = 38; @@ -1395,14 +1397,15 @@ void vpx_convolve8_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, - int w, int h) { + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { int x, y; - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; /* prefetch data to cache memory */ prefetch_load(src); diff --git a/vpx_dsp/mips/convolve8_horiz_dspr2.c b/vpx_dsp/mips/convolve8_horiz_dspr2.c index 196a0a2..77e95c8 100644 --- a/vpx_dsp/mips/convolve8_horiz_dspr2.c +++ b/vpx_dsp/mips/convolve8_horiz_dspr2.c @@ -818,15 +818,16 @@ static void convolve_horiz_64_dspr2(const uint8_t *src_ptr, int32_t src_stride, void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; assert(x_step_q4 == 16); assert(((const int32_t *)filter_x)[1] != 0x800000); if (((const int32_t *)filter_x)[0] == 0) { - vpx_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve2_horiz_dspr2(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } else { uint32_t pos = 38; @@ -868,8 +869,8 @@ void vpx_convolve8_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, 
(int32_t)dst_stride, filter_x, (int32_t)h); break; default: - vpx_convolve8_horiz_c(src + 3, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_horiz_c(src + 3, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/vpx_dsp/mips/convolve8_vert_dspr2.c b/vpx_dsp/mips/convolve8_vert_dspr2.c index ad107d5..c329f71 100644 --- a/vpx_dsp/mips/convolve8_vert_dspr2.c +++ b/vpx_dsp/mips/convolve8_vert_dspr2.c @@ -318,15 +318,16 @@ static void convolve_vert_64_dspr2(const uint8_t *src, int32_t src_stride, void vpx_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_y = filter[y0_q4]; assert(y_step_q4 == 16); assert(((const int32_t *)filter_y)[1] != 0x800000); if (((const int32_t *)filter_y)[0] == 0) { - vpx_convolve2_vert_dspr2(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve2_vert_dspr2(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } else { uint32_t pos = 38; @@ -349,8 +350,8 @@ void vpx_convolve8_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, convolve_vert_64_dspr2(src, src_stride, dst, dst_stride, filter_y, h); break; default: - vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/vpx_dsp/mips/convolve_common_dspr2.h b/vpx_dsp/mips/convolve_common_dspr2.h index 4eee3bd..48e440d 100644 --- a/vpx_dsp/mips/convolve_common_dspr2.h +++ b/vpx_dsp/mips/convolve_common_dspr2.h @@ -24,21 +24,21 @@ extern "C" { #if HAVE_DSPR2 void 
vpx_convolve2_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h); + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h); void vpx_convolve2_avg_horiz_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, int w, int h); void vpx_convolve2_avg_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h); + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h); void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter, int w, @@ -46,9 +46,9 @@ void vpx_convolve2_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, void vpx_convolve2_vert_dspr2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h); + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h); #endif // #if HAVE_DSPR2 #ifdef __cplusplus diff --git a/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c b/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c index ad2af28..a88eee6 100644 --- a/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c +++ b/vpx_dsp/mips/vpx_convolve8_avg_horiz_msa.c @@ -633,9 +633,10 @@ static void common_hz_2t_and_aver_dst_64w_msa(const uint8_t *src, void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const 
int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; int8_t cnt, filt_hor[8]; assert(x_step_q4 == 16); @@ -668,8 +669,8 @@ void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, (int32_t)dst_stride, &filt_hor[3], h); break; default: - vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); break; } } else { @@ -695,8 +696,8 @@ void vpx_convolve8_avg_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, (int32_t)dst_stride, filt_hor, h); break; default: - vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/vpx_dsp/mips/vpx_convolve8_avg_msa.c b/vpx_dsp/mips/vpx_convolve8_avg_msa.c index 1cfa632..aebc80d 100644 --- a/vpx_dsp/mips/vpx_convolve8_avg_msa.c +++ b/vpx_dsp/mips/vpx_convolve8_avg_msa.c @@ -516,9 +516,10 @@ static void common_hv_2ht_2vt_and_aver_dst_64w_msa( void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; + const int16_t *const filter_y = filter[y0_q4]; int8_t cnt, filt_hor[8], filt_ver[8]; assert(x_step_q4 == 16); @@ -560,14 +561,14 @@ void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, &filt_hor[3], &filt_ver[3], h); break; default: - vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, 
y_step_q4, w, h); + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } else if (((const int32_t *)filter_x)[0] == 0 || ((const int32_t *)filter_y)[0] == 0) { - vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } else { switch (w) { case 4: @@ -596,8 +597,8 @@ void vpx_convolve8_avg_msa(const uint8_t *src, ptrdiff_t src_stride, filt_ver, h); break; default: - vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c b/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c index 146ce3b..da7d2e6 100644 --- a/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c +++ b/vpx_dsp/mips/vpx_convolve8_avg_vert_msa.c @@ -605,9 +605,10 @@ static void common_vt_2t_and_aver_dst_64w_msa(const uint8_t *src, void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_y = filter[y0_q4]; int8_t cnt, filt_ver[8]; assert(y_step_q4 == 16); @@ -640,8 +641,8 @@ void vpx_convolve8_avg_vert_msa(const uint8_t *src, ptrdiff_t src_stride, (int32_t)dst_stride, &filt_ver[3], h); break; default: - vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); break; } } else { @@ -668,8 +669,8 @@ void vpx_convolve8_avg_vert_msa(const uint8_t *src, 
ptrdiff_t src_stride, (int32_t)dst_stride, filt_ver, h); break; default: - vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/vpx_dsp/mips/vpx_convolve8_horiz_msa.c b/vpx_dsp/mips/vpx_convolve8_horiz_msa.c index 9e8bf7b..152dc26 100644 --- a/vpx_dsp/mips/vpx_convolve8_horiz_msa.c +++ b/vpx_dsp/mips/vpx_convolve8_horiz_msa.c @@ -621,9 +621,10 @@ static void common_hz_2t_64w_msa(const uint8_t *src, int32_t src_stride, void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_x = filter[x0_q4]; int8_t cnt, filt_hor[8]; assert(x_step_q4 == 16); @@ -656,8 +657,8 @@ void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, &filt_hor[3], h); break; default: - vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } else { @@ -683,8 +684,8 @@ void vpx_convolve8_horiz_msa(const uint8_t *src, ptrdiff_t src_stride, filt_hor, h); break; default: - vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/vpx_dsp/mips/vpx_convolve8_msa.c b/vpx_dsp/mips/vpx_convolve8_msa.c index b16ec57..8c4d19f 100644 --- a/vpx_dsp/mips/vpx_convolve8_msa.c +++ b/vpx_dsp/mips/vpx_convolve8_msa.c @@ -541,9 +541,11 @@ static void common_hv_2ht_2vt_64w_msa(const uint8_t *src, 
int32_t src_stride, } void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int32_t x_step_q4, const int16_t *filter_y, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int32_t x_step_q4, int y0_q4, int32_t y_step_q4, int32_t w, int32_t h) { + const int16_t *const filter_x = filter[x0_q4]; + const int16_t *const filter_y = filter[y0_q4]; int8_t cnt, filt_hor[8], filt_ver[8]; assert(x_step_q4 == 16); @@ -585,14 +587,14 @@ void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, &filt_ver[3], (int32_t)h); break; default: - vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } else if (((const int32_t *)filter_x)[0] == 0 || ((const int32_t *)filter_y)[0] == 0) { - vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, + y0_q4, y_step_q4, w, h); } else { switch (w) { case 4: @@ -621,8 +623,8 @@ void vpx_convolve8_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, (int32_t)h); break; default: - vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/vpx_dsp/mips/vpx_convolve8_vert_msa.c b/vpx_dsp/mips/vpx_convolve8_vert_msa.c index 4106822..13fce00 100644 --- a/vpx_dsp/mips/vpx_convolve8_vert_msa.c +++ b/vpx_dsp/mips/vpx_convolve8_vert_msa.c @@ -628,9 +628,10 @@ static void common_vt_2t_64w_msa(const uint8_t *src, int32_t src_stride, void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t 
*filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, int w, int h) { + const int16_t *const filter_y = filter[y0_q4]; int8_t cnt, filt_ver[8]; assert(y_step_q4 == 16); @@ -663,8 +664,8 @@ void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, &filt_ver[3], h); break; default: - vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } else { @@ -690,8 +691,8 @@ void vpx_convolve8_vert_msa(const uint8_t *src, ptrdiff_t src_stride, filt_ver, h); break; default: - vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } diff --git a/vpx_dsp/mips/vpx_convolve_avg_msa.c b/vpx_dsp/mips/vpx_convolve_avg_msa.c index f9f7181..ce64993 100644 --- a/vpx_dsp/mips/vpx_convolve_avg_msa.c +++ b/vpx_dsp/mips/vpx_convolve_avg_msa.c @@ -189,13 +189,14 @@ static void avg_width64_msa(const uint8_t *src, int32_t src_stride, void vpx_convolve_avg_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int32_t filter_x_stride, - const int16_t *filter_y, int32_t filter_y_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int32_t y_step_q4, int32_t w, int32_t h) { - (void)filter_x; - (void)filter_y; - (void)filter_x_stride; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; switch (w) { case 4: { diff --git a/vpx_dsp/mips/vpx_convolve_copy_msa.c b/vpx_dsp/mips/vpx_convolve_copy_msa.c index 241257c..c2ab33a 100644 --- a/vpx_dsp/mips/vpx_convolve_copy_msa.c +++ b/vpx_dsp/mips/vpx_convolve_copy_msa.c @@ -199,13 +199,14 @@ static void 
copy_width64_msa(const uint8_t *src, int32_t src_stride, void vpx_convolve_copy_msa(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int32_t filter_x_stride, - const int16_t *filter_y, int32_t filter_y_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int32_t y_step_q4, int32_t w, int32_t h) { - (void)filter_x; - (void)filter_y; - (void)filter_x_stride; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; switch (w) { case 4: { diff --git a/vpx_dsp/ppc/vpx_convolve_vsx.c b/vpx_dsp/ppc/vpx_convolve_vsx.c index 55dcdc2..5c3ba45 100644 --- a/vpx_dsp/ppc/vpx_convolve_vsx.c +++ b/vpx_dsp/ppc/vpx_convolve_vsx.c @@ -53,13 +53,13 @@ static inline void copy_w64(const uint8_t *src, ptrdiff_t src_stride, void vpx_convolve_copy_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int32_t filter_x_stride, - const int16_t *filter_y, int32_t filter_y_stride, - int32_t w, int32_t h) { - (void)filter_x; - (void)filter_y; - (void)filter_x_stride; - (void)filter_y_stride; + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int32_t y_step_q4, int32_t w, int32_t h) { + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; switch (w) { case 16: { @@ -132,14 +132,8 @@ static inline void avg_w64(const uint8_t *src, ptrdiff_t src_stride, void vpx_convolve_avg_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int32_t filter_x_stride, - const int16_t *filter_y, int32_t filter_y_stride, - int32_t w, int32_t h) { - (void)filter_x; - (void)filter_y; - (void)filter_x_stride; - (void)filter_y_stride; - + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int32_t y_step_q4, int32_t w, int32_t h) { switch (w) { case 16: { avg_w16(src, src_stride, dst, dst_stride, h); @@ -154,8 +148,8 
@@ void vpx_convolve_avg_vsx(const uint8_t *src, ptrdiff_t src_stride, break; } default: { - vpx_convolve_avg_c(src, src_stride, dst, dst_stride, filter_x, - filter_x_stride, filter_y, filter_y_stride, w, h); + vpx_convolve_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); break; } } @@ -299,9 +293,9 @@ static inline void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride, static inline void convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *const x_filters, int x0_q4, - int x_step_q4, const InterpKernel *const y_filters, - int y0_q4, int y_step_q4, int w, int h) { + const InterpKernel *const filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { // Note: Fixed size intermediate buffer, temp, places limits on parameters. // 2d filtering proceeds in 2 steps: // (1) Interpolate horizontally into an intermediate buffer, temp. @@ -324,95 +318,77 @@ static inline void convolve(const uint8_t *src, ptrdiff_t src_stride, assert(x_step_q4 <= 32); convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64, - x_filters, x0_q4, x_step_q4, w, intermediate_height); - convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, - y_filters, y0_q4, y_step_q4, w, h); + filter, x0_q4, x_step_q4, w, intermediate_height); + convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, filter, + y0_q4, y_step_q4, w, h); } void vpx_convolve8_horiz_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - (void)filter_y; + (void)y0_q4; (void)y_step_q4; - convolve_horiz(src, 
src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, - w, h); + convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, w, + h); } void vpx_convolve8_avg_horiz_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - (void)filter_y; + (void)y0_q4; (void)y_step_q4; - convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, - x_step_q4, w, h); + convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, + w, h); } void vpx_convolve8_vert_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - (void)filter_x; + (void)x0_q4; (void)x_step_q4; - convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4, - w, h); + convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w, + h); } void vpx_convolve8_avg_vert_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - (void)filter_x; + (void)x0_q4; (void)x_step_q4; - convolve_avg_vert(src, src_stride, dst, 
dst_stride, filters_y, y0_q4, - y_step_q4, w, h); + convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, + w, h); } void vpx_convolve8_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, - filters_y, y0_q4, y_step_q4, w, h); + convolve(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4, + y_step_q4, w, h); } void vpx_convolve8_avg_vsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { // Fixed size intermediate buffer places limits on parameters. 
DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]); assert(w <= 64); assert(h <= 64); - vpx_convolve8_vsx(src, src_stride, temp, 64, filter_x, x_step_q4, filter_y, + vpx_convolve8_vsx(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); - vpx_convolve_avg_vsx(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h); + vpx_convolve_avg_vsx(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h); } diff --git a/vpx_dsp/vpx_convolve.c b/vpx_dsp/vpx_convolve.c index 02c5a95..76b5e47 100644 --- a/vpx_dsp/vpx_convolve.c +++ b/vpx_dsp/vpx_convolve.c @@ -114,10 +114,9 @@ static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride, } static void convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const InterpKernel *const x_filters, - int x0_q4, int x_step_q4, - const InterpKernel *const y_filters, int y0_q4, - int y_step_q4, int w, int h) { + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { // Note: Fixed size intermediate buffer, temp, places limits on parameters. // 2d filtering proceeds in 2 steps: // (1) Interpolate horizontally into an intermediate buffer, temp. 
@@ -140,108 +139,86 @@ static void convolve(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, assert(x_step_q4 <= 32); convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64, - x_filters, x0_q4, x_step_q4, w, intermediate_height); - convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, - y_filters, y0_q4, y_step_q4, w, h); + filter, x0_q4, x_step_q4, w, intermediate_height); + convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, filter, + y0_q4, y_step_q4, w, h); } void vpx_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - (void)filter_y; + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { + (void)y0_q4; (void)y_step_q4; - - convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, - w, h); + convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, w, + h); } void vpx_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - (void)filter_y; + (void)y0_q4; (void)y_step_q4; - - convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, - x_step_q4, w, h); + convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, + w, h); } void vpx_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int 
x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - (void)filter_x; + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { + (void)x0_q4; (void)x_step_q4; - - convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, y_step_q4, - w, h); + convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w, + h); } void vpx_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - (void)filter_x; + (void)x0_q4; (void)x_step_q4; - - convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, - y_step_q4, w, h); + convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, + w, h); } void vpx_convolve8_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, - int w, int h) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, - filters_y, y0_q4, y_step_q4, w, h); + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + convolve(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, y0_q4, + y_step_q4, w, h); } void 
vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { // Fixed size intermediate buffer places limits on parameters. DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]); assert(w <= 64); assert(h <= 64); - vpx_convolve8_c(src, src_stride, temp, 64, filter_x, x_step_q4, filter_y, + vpx_convolve8_c(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); - vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h); + vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h); } void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int filter_x_stride, const int16_t *filter_y, - int filter_y_stride, int w, int h) { + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { int r; - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; for (r = h; r > 0; --r) { memcpy(dst, src, w); @@ -251,15 +228,16 @@ void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, } void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int filter_x_stride, const int16_t *filter_y, - int filter_y_stride, int w, int h) { + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { int x, y; - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; for (y = 0; y < h; ++y) { for (x = 0; x < 
w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1); @@ -269,53 +247,52 @@ void vpx_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, } void vpx_scaled_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); + vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } void vpx_scaled_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); + vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } void vpx_scaled_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, - int w, int h) { - vpx_convolve8_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { + vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, + y0_q4, y_step_q4, w, h); } void vpx_scaled_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int 
x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } void vpx_scaled_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h) { - vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter_x, - x_step_q4, filter_y, y_step_q4, w, h); + const InterpKernel *filter, int x0_q4, int x_step_q4, + int y0_q4, int y_step_q4, int w, int h) { + vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } void vpx_scaled_avg_2d_c(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, const int16_t *filter_y, int y_step_q4, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4, - filter_y, y_step_q4, w, h); + vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, y0_q4, y_step_q4, w, h); } #if CONFIG_VP9_HIGHBITDEPTH @@ -417,9 +394,9 @@ static void highbd_convolve_avg_vert(const uint16_t *src, ptrdiff_t src_stride, static void highbd_convolve(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const InterpKernel *const x_filters, int x0_q4, - int x_step_q4, const InterpKernel *const y_filters, - int y0_q4, int y_step_q4, int w, int h, int bd) { + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h, int bd) { // Note: Fixed size intermediate buffer, temp, places limits on parameters. 
// 2d filtering proceeds in 2 steps: // (1) Interpolate horizontally into an intermediate buffer, temp. @@ -442,113 +419,97 @@ static void highbd_convolve(const uint16_t *src, ptrdiff_t src_stride, assert(x_step_q4 <= 32); highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, - temp, 64, x_filters, x0_q4, x_step_q4, w, + temp, 64, filter, x0_q4, x_step_q4, w, intermediate_height, bd); highbd_convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride, - y_filters, y0_q4, y_step_q4, w, h, bd); + filter, y0_q4, y_step_q4, w, h, bd); } void vpx_highbd_convolve8_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, - int h, int bd) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - (void)filter_y; + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, + int w, int h, int bd) { + (void)y0_q4; (void)y_step_q4; - highbd_convolve_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, + highbd_convolve_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, w, h, bd); } void vpx_highbd_convolve8_avg_horiz_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - (void)filter_y; + (void)y0_q4; (void)y_step_q4; - highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4, + highbd_convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, w, h, bd); } void vpx_highbd_convolve8_vert_c(const uint16_t *src, ptrdiff_t 
src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - (void)filter_x; + (void)x0_q4; (void)x_step_q4; - highbd_convolve_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, + highbd_convolve_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w, h, bd); } void vpx_highbd_convolve8_avg_vert_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - (void)filter_x; + (void)x0_q4; (void)x_step_q4; - highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4, + highbd_convolve_avg_vert(src, src_stride, dst, dst_stride, filter, y0_q4, y_step_q4, w, h, bd); } void vpx_highbd_convolve8_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - highbd_convolve(src, src_stride, dst, dst_stride, filters_x, x0_q4, x_step_q4, - filters_y, y0_q4, y_step_q4, w, h, bd); + highbd_convolve(src, src_stride, dst, 
dst_stride, filter, x0_q4, x_step_q4, + y0_q4, y_step_q4, w, h, bd); } void vpx_highbd_convolve8_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { // Fixed size intermediate buffer places limits on parameters. DECLARE_ALIGNED(16, uint16_t, temp[64 * 64]); assert(w <= 64); assert(h <= 64); - vpx_highbd_convolve8_c(src, src_stride, temp, 64, filter_x, x_step_q4, - filter_y, y_step_q4, w, h, bd); - vpx_highbd_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h, + vpx_highbd_convolve8_c(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, + y0_q4, y_step_q4, w, h, bd); + vpx_highbd_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h, bd); } void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, - int w, int h, int bd) { + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h, int bd) { int r; - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; (void)bd; for (r = h; r > 0; --r) { @@ -560,15 +521,16 @@ void vpx_highbd_convolve_copy_c(const uint16_t *src, ptrdiff_t src_stride, void vpx_highbd_convolve_avg_c(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, - int w, int h, int bd) { + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h, int bd) { int x, y; - (void)filter_x; - (void)filter_x_stride; - (void)filter_y; - (void)filter_y_stride; 
+ (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; (void)bd; for (y = 0; y < h; ++y) { diff --git a/vpx_dsp/vpx_convolve.h b/vpx_dsp/vpx_convolve.h index 1aedd32..7979268 100644 --- a/vpx_dsp/vpx_convolve.h +++ b/vpx_dsp/vpx_convolve.h @@ -19,15 +19,15 @@ extern "C" { typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, int w, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h); #if CONFIG_VP9_HIGHBITDEPTH typedef void (*highbd_convolve_fn_t)(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd); #endif diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 3826b13..bfc9550 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -6,6 +6,7 @@ print <= 16) { \ - vpx_highbd_filter_block1d16_##dir##8_##avg##opt( \ - src_start, src_stride, dst, dst_stride, h, filter, bd); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - while (w >= 8) { \ - vpx_highbd_filter_block1d8_##dir##8_##avg##opt( \ - src_start, src_stride, dst, dst_stride, h, filter, bd); \ - src += 8; \ - dst += 8; \ - w -= 8; \ - } \ - while (w >= 4) { \ - vpx_highbd_filter_block1d4_##dir##8_##avg##opt( \ - src_start, src_stride, dst, dst_stride, h, filter, bd); \ - src += 4; \ - dst += 4; \ - w -= 4; \ - } \ - } else { \ - while (w >= 16) { \ - vpx_highbd_filter_block1d16_##dir##2_##avg##opt( \ - src, src_stride, dst, dst_stride, h, filter, bd); \ - src += 16; \ - dst += 16; \ - w -= 16; \ - } \ - while (w >= 8) { \ - vpx_highbd_filter_block1d8_##dir##2_##avg##opt( \ - src, src_stride, dst, dst_stride, h, filter, bd); \ - src += 8; 
\ - dst += 8; \ - w -= 8; \ - } \ - while (w >= 4) { \ - vpx_highbd_filter_block1d4_##dir##2_##avg##opt( \ - src, src_stride, dst, dst_stride, h, filter, bd); \ - src += 4; \ - dst += 4; \ - w -= 4; \ - } \ - } \ - } \ - if (w) { \ - vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, \ - y_step_q4, w, h, bd); \ - } \ - } - -#define HIGH_FUN_CONV_2D(avg, opt) \ - void vpx_highbd_convolve8_##avg##opt( \ +#define HIGH_FUN_CONV_1D(name, offset, step_q4, dir, src_start, avg, opt) \ + void vpx_highbd_convolve8_##name##_##opt( \ const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \ - ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, \ - const int16_t *filter_y, int y_step_q4, int w, int h, int bd) { \ - assert(w <= 64); \ - assert(h <= 64); \ - if (x_step_q4 == 16 && y_step_q4 == 16) { \ - if ((filter_x[0] | filter_x[1] | filter_x[2]) || filter_x[3] == 128) { \ - DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \ - vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \ - fdata2, 64, filter_x, x_step_q4, \ - filter_y, y_step_q4, w, h + 7, bd); \ - vpx_highbd_convolve8_##avg##vert_##opt( \ - fdata2 + 192, 64, dst, dst_stride, filter_x, x_step_q4, filter_y, \ - y_step_q4, w, h, bd); \ + ptrdiff_t dst_stride, const InterpKernel *filter_kernel, int x0_q4, \ + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { \ + const int16_t *filter = filter_kernel[offset]; \ + if (step_q4 == 16 && filter[3] != 128) { \ + if (filter[0] | filter[1] | filter[2]) { \ + while (w >= 16) { \ + vpx_highbd_filter_block1d16_##dir##8_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vpx_highbd_filter_block1d8_##dir##8_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vpx_highbd_filter_block1d4_##dir##8_##avg##opt( \ + 
src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ } else { \ - DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \ - vpx_highbd_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \ - filter_x, x_step_q4, filter_y, \ - y_step_q4, w, h + 1, bd); \ - vpx_highbd_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, \ - y_step_q4, w, h, bd); \ + while (w >= 16) { \ + vpx_highbd_filter_block1d16_##dir##2_##avg##opt( \ + src, src_stride, dst, dst_stride, h, filter, bd); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vpx_highbd_filter_block1d8_##dir##2_##avg##opt( \ + src, src_stride, dst, dst_stride, h, filter, bd); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vpx_highbd_filter_block1d4_##dir##2_##avg##opt( \ + src, src_stride, dst, dst_stride, h, filter, bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ } \ - } else { \ - vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \ - filter_x, x_step_q4, filter_y, y_step_q4, \ - w, h, bd); \ } \ + if (w) { \ + vpx_highbd_convolve8_##name##_c(src, src_stride, dst, dst_stride, \ + filter_kernel, x0_q4, x_step_q4, y0_q4, \ + y_step_q4, w, h, bd); \ + } \ + } + +#define HIGH_FUN_CONV_2D(avg, opt) \ + void vpx_highbd_convolve8_##avg##opt( \ + const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, \ + ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \ + int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { \ + const int16_t *filter_x = filter[x0_q4]; \ + assert(w <= 64); \ + assert(h <= 64); \ + if (x_step_q4 == 16 && y_step_q4 == 16) { \ + if ((filter_x[0] | filter_x[1] | filter_x[2]) || filter_x[3] == 128) { \ + DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \ + vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \ + fdata2, 64, filter, x0_q4, x_step_q4, \ + y0_q4, y_step_q4, w, h + 7, bd); \ + vpx_highbd_convolve8_##avg##vert_##opt( \ 
+ fdata2 + 192, 64, dst, dst_stride, filter, x0_q4, x_step_q4, \ + y0_q4, y_step_q4, w, h, bd); \ + } else { \ + DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \ + vpx_highbd_convolve8_horiz_##opt(src, src_stride, fdata2, 64, filter, \ + x0_q4, x_step_q4, y0_q4, y_step_q4, \ + w, h + 1, bd); \ + vpx_highbd_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \ + filter, x0_q4, x_step_q4, \ + y0_q4, y_step_q4, w, h, bd); \ + } \ + } else { \ + vpx_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, filter, \ + x0_q4, x_step_q4, y0_q4, y_step_q4, w, h, \ + bd); \ + } \ } #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vpx_dsp/x86/highbd_convolve_avx2.c b/vpx_dsp/x86/highbd_convolve_avx2.c index 2fc7b74..7e75d5d 100644 --- a/vpx_dsp/x86/highbd_convolve_avx2.c +++ b/vpx_dsp/x86/highbd_convolve_avx2.c @@ -18,13 +18,14 @@ void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int width, int h, int bd) { - (void)filter_x; - (void)filter_y; - (void)filter_x_stride; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; (void)bd; assert(width % 4 == 0); @@ -99,13 +100,14 @@ void vpx_highbd_convolve_copy_avx2(const uint16_t *src, ptrdiff_t src_stride, void vpx_highbd_convolve_avg_avx2(const uint16_t *src, ptrdiff_t src_stride, uint16_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int filter_x_stride, - const int16_t *filter_y, int filter_y_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int width, int h, int bd) { - (void)filter_x; - (void)filter_y; - (void)filter_x_stride; - (void)filter_y_stride; + (void)filter; + (void)x0_q4; + (void)x_step_q4; + (void)y0_q4; + (void)y_step_q4; (void)bd; assert(width % 4 == 0); @@ 
-1073,8 +1075,8 @@ void vpx_highbd_filter_block1d4_v2_sse2(const uint16_t *, ptrdiff_t, uint16_t *, #define vpx_highbd_filter_block1d4_v8_avx2 vpx_highbd_filter_block1d4_v8_sse2 #define vpx_highbd_filter_block1d4_v2_avx2 vpx_highbd_filter_block1d4_v2_sse2 -HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); -HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); +HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2); +HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , avx2); HIGH_FUN_CONV_2D(, avx2); void vpx_highbd_filter_block1d4_h8_avg_sse2(const uint16_t *, ptrdiff_t, @@ -1098,8 +1100,8 @@ void vpx_highbd_filter_block1d4_v2_avg_sse2(const uint16_t *, ptrdiff_t, #define vpx_highbd_filter_block1d4_v2_avg_avx2 \ vpx_highbd_filter_block1d4_v2_avg_sse2 -HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, avx2); -HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, +HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2); +HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, avx2); HIGH_FUN_CONV_2D(avg_, avx2); diff --git a/vpx_dsp/x86/vpx_asm_stubs.c b/vpx_dsp/x86/vpx_asm_stubs.c index 727d9d1..4f164af 100644 --- a/vpx_dsp/x86/vpx_asm_stubs.c +++ b/vpx_dsp/x86/vpx_asm_stubs.c @@ -41,38 +41,38 @@ filter8_1dfunction vpx_filter_block1d4_h2_avg_sse2; // void vpx_convolve8_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); // void vpx_convolve8_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // 
int w, int h); // void vpx_convolve8_avg_horiz_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, +// int y_step_q4, int w, int h); // void vpx_convolve8_avg_vert_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); -FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); -FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2); -FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2); +FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2); +FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , sse2); +FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2); +FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, sse2); // void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); // void vpx_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); FUN_CONV_2D(, sse2); FUN_CONV_2D(avg_, sse2); @@ -140,22 +140,22 @@ highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2; // const int16_t *filter_y, // int 
y_step_q4, // int w, int h, int bd); -HIGH_FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , sse2); -HIGH_FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , sse2); -HIGH_FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, sse2); -HIGH_FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, +HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2); +HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , sse2); +HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2); +HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, sse2); // void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h, int bd); // void vpx_highbd_convolve8_avg_sse2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h, int bd); +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, +// int y_step_q4, int w, int h, int bd); HIGH_FUN_CONV_2D(, sse2); HIGH_FUN_CONV_2D(avg_, sse2); #endif // CONFIG_VP9_HIGHBITDEPTH && ARCH_X86_64 diff --git a/vpx_dsp/x86/vpx_convolve_copy_sse2.asm b/vpx_dsp/x86/vpx_convolve_copy_sse2.asm index 389a692..3f444e2 100644 --- a/vpx_dsp/x86/vpx_convolve_copy_sse2.asm +++ b/vpx_dsp/x86/vpx_convolve_copy_sse2.asm @@ -20,14 +20,14 @@ SECTION .text %endif %ifidn %2, highbd %define pavg pavgw -cglobal %2_convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \ +cglobal %2_convolve_%1, 4, 8, 4+AUX_XMM_REGS, src, src_stride, \ dst, dst_stride, \ - fx, fxs, fy, fys, w, h, bd + f, fxo, fxs, fyo, fys, w, h, bd %else %define pavg pavgb -cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \ +cglobal 
convolve_%1, 4, 8, 4+AUX_XMM_REGS, src, src_stride, \ dst, dst_stride, \ - fx, fxs, fy, fys, w, h + f, fxo, fxs, fyo, fys, w, h %endif mov r4d, dword wm %ifidn %2, highbd diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c index 7c1ecc0..6eafe9a 100644 --- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c +++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c @@ -554,21 +554,21 @@ filter8_1dfunction vpx_filter_block1d4_h2_ssse3; #define vpx_filter_block1d4_h2_avx2 vpx_filter_block1d4_h2_ssse3 // void vpx_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); // void vpx_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); -FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2); -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2); +FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2); +FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , avx2); // void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); FUN_CONV_2D(, avx2); #endif // HAVE_AX2 && HAVE_SSSE3 diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c index 09c75d4..c1b81f2 100644 --- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c +++ 
b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c @@ -306,29 +306,28 @@ filter8_1dfunction vpx_filter_block1d4_h2_avg_ssse3; // void vpx_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); // void vpx_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); // void vpx_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, +// int y_step_q4, int w, int h); // void vpx_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, -// int w, int h); -FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3); -FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3); -FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3); -FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, - ssse3); +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, +// int y_step_q4, int w, int h); +FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , ssse3); +FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , ssse3); +FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, ssse3); +FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v, src - src_stride * 3, avg_, ssse3); #define 
TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \ out2, out3, out4, out5, out6, out7) \ @@ -813,9 +812,9 @@ static void scaledconvolve_vert_w16(const uint8_t *src, ptrdiff_t src_stride, static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, - const InterpKernel *const x_filters, int x0_q4, - int x_step_q4, const InterpKernel *const y_filters, - int y0_q4, int y_step_q4, int w, int h) { + const InterpKernel *const filter, int x0_q4, + int x_step_q4, int y0_q4, int y_step_q4, int w, + int h) { // Note: Fixed size intermediate buffer, temp, places limits on parameters. // 2d filtering proceeds in 2 steps: // (1) Interpolate horizontally into an intermediate buffer, temp. @@ -840,49 +839,43 @@ static void scaledconvolve2d(const uint8_t *src, ptrdiff_t src_stride, if (w >= 8) { scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1), - src_stride, temp, 64, x_filters, x0_q4, x_step_q4, - w, intermediate_height); + src_stride, temp, 64, filter, x0_q4, x_step_q4, w, + intermediate_height); } else { scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1), - src_stride, temp, 64, x_filters, x0_q4, x_step_q4, - w, intermediate_height); + src_stride, temp, 64, filter, x0_q4, x_step_q4, w, + intermediate_height); } if (w >= 16) { scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, - dst_stride, y_filters, y0_q4, y_step_q4, w, h); + dst_stride, filter, y0_q4, y_step_q4, w, h); } else if (w == 8) { scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, - dst_stride, y_filters, y0_q4, y_step_q4, w, h); + dst_stride, filter, y0_q4, y_step_q4, w, h); } else { scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, - dst_stride, y_filters, y0_q4, y_step_q4, w, h); + dst_stride, filter, y0_q4, y_step_q4, w, h); } } void vpx_scaled_2d_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, - ptrdiff_t dst_stride, const int16_t *filter_x, - int x_step_q4, 
const int16_t *filter_y, int y_step_q4, + ptrdiff_t dst_stride, const InterpKernel *filter, + int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - const InterpKernel *const filters_x = get_filter_base(filter_x); - const int x0_q4 = get_filter_offset(filter_x, filters_x); - - const InterpKernel *const filters_y = get_filter_base(filter_y); - const int y0_q4 = get_filter_offset(filter_y, filters_y); - - scaledconvolve2d(src, src_stride, dst, dst_stride, filters_x, x0_q4, - x_step_q4, filters_y, y0_q4, y_step_q4, w, h); + scaledconvolve2d(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4, + y0_q4, y_step_q4, w, h); } // void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); // void vpx_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride, // uint8_t *dst, ptrdiff_t dst_stride, -// const int16_t *filter_x, int x_step_q4, -// const int16_t *filter_y, int y_step_q4, +// const InterpKernel *filter, int x0_q4, +// int32_t x_step_q4, int y0_q4, int y_step_q4, // int w, int h); FUN_CONV_2D(, ssse3); FUN_CONV_2D(avg_, ssse3);