From d8424d2890ddce0d92778e055eee145655cf034e Mon Sep 17 00:00:00 2001 From: Linfeng Zhang Date: Mon, 5 Mar 2018 16:25:08 -0800 Subject: [PATCH] Fix a bug in vp9_highbd_iht8x8_64_add_neon This bug was introduced in 29b6a30c. BUG=webm:1403 Change-Id: I9e0bf2c7a01d8ff1c714c12236f7985b772b0540 --- test/dct_test.cc | 4 - vp9/common/arm/neon/vp9_highbd_iht8x8_add_neon.c | 191 +++++------------------ vp9/common/vp9_rtcd_defs.pl | 2 +- 3 files changed, 37 insertions(+), 160 deletions(-) diff --git a/test/dct_test.cc b/test/dct_test.cc index bfa05e5..66b2add 100644 --- a/test/dct_test.cc +++ b/test/dct_test.cc @@ -631,13 +631,9 @@ static const FuncInfo ht_neon_func_info[] = { #if CONFIG_VP9_HIGHBITDEPTH { &vp9_highbd_fht4x4_c, &highbd_iht_wrapper, 4, 2 }, -// TODO(linfengz): reenable these functions once test vector failures are -// addressed. -#if 0 { &vp9_highbd_fht8x8_c, &highbd_iht_wrapper, 8, 2 }, #endif -#endif { &vp9_fht4x4_c, &iht_wrapper, 4, 1 }, { &vp9_fht8x8_c, &iht_wrapper, 8, 1 }, { &vp9_fht16x16_c, &iht_wrapper, 16, 1 } diff --git a/vp9/common/arm/neon/vp9_highbd_iht8x8_add_neon.c b/vp9/common/arm/neon/vp9_highbd_iht8x8_add_neon.c index 498f035..9b74d27 100644 --- a/vp9/common/arm/neon/vp9_highbd_iht8x8_add_neon.c +++ b/vp9/common/arm/neon/vp9_highbd_iht8x8_add_neon.c @@ -132,84 +132,10 @@ static INLINE int32x4_t sub_dct_const_round_shift_low_8_bd12( return vcombine_s32(out_lo, out_hi); } -static INLINE void iadst8_bd10(int32x4_t *const io0, int32x4_t *const io1, - int32x4_t *const io2, int32x4_t *const io3, - int32x4_t *const io4, int32x4_t *const io5, - int32x4_t *const io6, int32x4_t *const io7) { - const int32x4_t c0 = - create_s32x4_neon(cospi_2_64, cospi_30_64, cospi_10_64, cospi_22_64); - const int32x4_t c1 = - create_s32x4_neon(cospi_18_64, cospi_14_64, cospi_26_64, cospi_6_64); - const int32x4_t c2 = - create_s32x4_neon(cospi_16_64, 0, cospi_8_64, cospi_24_64); - int32x4_t x[8], t[4]; - int32x4_t s[8]; - - x[0] = *io7; - x[1] = *io0; - x[2] = *io5; - x[3] = *io2; - x[4] = *io3; - x[5] = *io4; - x[6] = *io1; - x[7] = *io6; - - // stage 1 - iadst_butterfly_lane_0_1_bd10_neon(x[0], x[1], vget_low_s32(c0), &s[0], - &s[1]); - iadst_butterfly_lane_0_1_bd10_neon(x[2], x[3], vget_high_s32(c0), &s[2], - &s[3]); - iadst_butterfly_lane_0_1_bd10_neon(x[4], x[5], vget_low_s32(c1), &s[4], - &s[5]); - iadst_butterfly_lane_0_1_bd10_neon(x[6], x[7], vget_high_s32(c1), &s[6], - &s[7]); - - x[0] = add_dct_const_round_shift_low_8_bd10(s[0], s[4]); - x[1] = add_dct_const_round_shift_low_8_bd10(s[1], s[5]); - x[2] = add_dct_const_round_shift_low_8_bd10(s[2], s[6]); - x[3] = add_dct_const_round_shift_low_8_bd10(s[3], s[7]); - x[4] = sub_dct_const_round_shift_low_8_bd10(s[0], s[4]); - x[5] = sub_dct_const_round_shift_low_8_bd10(s[1], s[5]); - x[6] = sub_dct_const_round_shift_low_8_bd10(s[2], s[6]); - x[7] = sub_dct_const_round_shift_low_8_bd10(s[3], s[7]); - - // stage 2 - t[0] = x[0]; - t[1] = x[1]; - t[2] = x[2]; - t[3] = x[3]; - iadst_butterfly_lane_0_1_bd10_neon(x[4], x[5], vget_high_s32(c2), &s[4], - &s[5]); - iadst_butterfly_lane_1_0_bd10_neon(x[7], x[6], vget_high_s32(c2), &s[7], - &s[6]); - - x[0] = vaddq_s32(t[0], t[2]); - x[1] = vaddq_s32(t[1], t[3]); - x[2] = vsubq_s32(t[0], t[2]); - x[3] = vsubq_s32(t[1], t[3]); - x[4] = add_dct_const_round_shift_low_8_bd10(s[4], s[6]); - x[5] = add_dct_const_round_shift_low_8_bd10(s[5], s[7]); - x[6] = sub_dct_const_round_shift_low_8_bd10(s[4], s[6]); - x[7] = sub_dct_const_round_shift_low_8_bd10(s[5], s[7]); - - // stage 3 - iadst_half_butterfly_bd10_neon(x + 2, vget_low_s32(c2)); - iadst_half_butterfly_bd10_neon(x + 6, vget_low_s32(c2)); - - *io0 = x[0]; - *io1 = vnegq_s32(x[4]); - *io2 = x[6]; - *io3 = vnegq_s32(x[2]); - *io4 = x[3]; - *io5 = vnegq_s32(x[7]); - *io6 = x[5]; - *io7 = vnegq_s32(x[1]); -} - -static INLINE void iadst8_bd12(int32x4_t *const io0, int32x4_t *const io1, - int32x4_t *const io2, int32x4_t *const io3, - int32x4_t *const io4, int32x4_t *const io5, - int32x4_t *const io6, int32x4_t *const io7) { +static INLINE void highbd_iadst8(int32x4_t *const io0, int32x4_t *const io1, + int32x4_t *const io2, int32x4_t *const io3, + int32x4_t *const io4, int32x4_t *const io5, + int32x4_t *const io6, int32x4_t *const io7) { const int32x4_t c0 = create_s32x4_neon(cospi_2_64, cospi_30_64, cospi_10_64, cospi_22_64); const int32x4_t c1 = @@ -394,31 +320,17 @@ void vp9_highbd_iht8x8_64_add_neon(const tran_low_t *input, uint16_t *dest, const int32x4_t cospis1 = vld1q_s32(kCospi32 + 4); // cospi 4, 12, 20, 28 - if (bd == 10) { - idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], - &a[4], &a[5], &a[6], &a[7]); - idct8x8_64_half1d_bd10(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11], - &a[12], &a[13], &a[14], &a[15]); - transpose_s32_8x4(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], - &a[11]); - iadst8_bd10(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]); - transpose_s32_8x4(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], - &a[15]); - iadst8_bd10(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], + &a[4], &a[5], &a[6], &a[7]); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11], + &a[12], &a[13], &a[14], &a[15]); + transpose_s32_8x4(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], + &a[11]); + highbd_iadst8(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]); + transpose_s32_8x4(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], + &a[15]); + highbd_iadst8(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], &a[15]); - } else { - idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], - &a[4], &a[5], &a[6], &a[7]); - idct8x8_64_half1d_bd12(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11], - &a[12], &a[13], &a[14], &a[15]); - transpose_s32_8x4(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], - &a[11]); - iadst8_bd12(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]); - transpose_s32_8x4(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], - &a[15]); - iadst8_bd12(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], - &a[15]); - } break; } @@ -427,67 +339,36 @@ void vp9_highbd_iht8x8_64_add_neon(const tran_low_t *input, uint16_t *dest, const int32x4_t cospis1 = vld1q_s32(kCospi32 + 4); // cospi 4, 12, 20, 28 - if (bd == 10) { - transpose_s32_8x4(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], - &a[7]); - iadst8_bd10(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); - transpose_s32_8x4(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], - &a[14], &a[15]); - iadst8_bd10(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], + transpose_s32_8x4(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], + &a[7]); + highbd_iadst8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); + transpose_s32_8x4(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], + &a[15]); + highbd_iadst8(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], &a[15]); - idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9], - &a[2], &a[10], &a[3], &a[11]); - idct8x8_64_half1d_bd10(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13], - &a[6], &a[14], &a[7], &a[15]); - } else { - transpose_s32_8x4(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], - &a[7]); - iadst8_bd12(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); - transpose_s32_8x4(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], - &a[14], &a[15]); - iadst8_bd12(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], - &a[15]); - idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9], - &a[2], &a[10], &a[3], &a[11]); - idct8x8_64_half1d_bd12(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13], - &a[6], &a[14], &a[7], &a[15]); - } + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9], + &a[2], &a[10], &a[3], &a[11]); + idct8x8_64_half1d_bd12(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13], + &a[6], &a[14], &a[7], &a[15]); break; } default: { assert(tx_type == ADST_ADST); - if (bd == 10) { - transpose_s32_8x4(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], - &a[7]); - iadst8_bd10(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); - transpose_s32_8x4(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], - &a[14], &a[15]); - iadst8_bd10(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], + transpose_s32_8x4(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], + &a[7]); + highbd_iadst8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); + transpose_s32_8x4(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], + &a[15]); + highbd_iadst8(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], &a[15]); - transpose_s32_8x4(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], - &a[11]); - iadst8_bd10(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]); - transpose_s32_8x4(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], - &a[15]); - iadst8_bd10(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], + transpose_s32_8x4(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], + &a[11]); + highbd_iadst8(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]); + transpose_s32_8x4(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], + &a[15]); + highbd_iadst8(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], &a[15]); - } else { - transpose_s32_8x4(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], - &a[7]); - iadst8_bd12(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); - transpose_s32_8x4(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], - &a[14], &a[15]); - iadst8_bd12(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], - &a[15]); - transpose_s32_8x4(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], - &a[11]); - iadst8_bd12(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]); - transpose_s32_8x4(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], - &a[15]); - iadst8_bd12(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], - &a[15]); - } break; } } diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index fcd8f7e..c705c5e 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -104,7 +104,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") { specialize qw/vp9_highbd_iht4x4_16_add neon sse4_1/; - specialize qw/vp9_highbd_iht8x8_64_add sse4_1/; + specialize qw/vp9_highbd_iht8x8_64_add neon sse4_1/; specialize qw/vp9_highbd_iht16x16_256_add sse4_1/; } } -- 2.7.4