}
void RunComparison() {
- ASM_REGISTER_STATE_CHECK(c_func_(hbuf_c_, source_data_, 0, height_));
- ASM_REGISTER_STATE_CHECK(asm_func_(hbuf_asm_, source_data_, 0, height_));
+ ASM_REGISTER_STATE_CHECK(c_func_(hbuf_c_, source_data_, width_, height_));
+ ASM_REGISTER_STATE_CHECK(
+ asm_func_(hbuf_asm_, source_data_, width_, height_));
EXPECT_EQ(0, memcmp(hbuf_c_, hbuf_asm_, sizeof(*hbuf_c_) * 16))
<< "Output mismatch";
}
make_tuple(16, 16, 5, 4, &vpx_avg_4x4_neon),
make_tuple(32, 32, 15, 4, &vpx_avg_4x4_neon)));
-// Disabled neon optimization since it caused mismatch. See details in:
-// https://bugs.chromium.org/p/webm/issues/detail?id=1809
-// INSTANTIATE_TEST_SUITE_P(
-// NEON, IntProRowTest,
-// ::testing::Values(make_tuple(16, &vpx_int_pro_row_neon,
-// &vpx_int_pro_row_c),
-// make_tuple(32, &vpx_int_pro_row_neon,
-// &vpx_int_pro_row_c), make_tuple(64,
-// &vpx_int_pro_row_neon,
-// &vpx_int_pro_row_c)));
-//
-// INSTANTIATE_TEST_SUITE_P(
-// NEON, IntProColTest,
-// ::testing::Values(make_tuple(16, &vpx_int_pro_col_neon,
-// &vpx_int_pro_col_c),
-// make_tuple(32, &vpx_int_pro_col_neon,
-// &vpx_int_pro_col_c), make_tuple(64,
-// &vpx_int_pro_col_neon,
-// &vpx_int_pro_col_c)));
+INSTANTIATE_TEST_SUITE_P(
+ NEON, IntProRowTest,
+ ::testing::Values(make_tuple(16, &vpx_int_pro_row_neon, &vpx_int_pro_row_c),
+ make_tuple(32, &vpx_int_pro_row_neon, &vpx_int_pro_row_c),
+ make_tuple(64, &vpx_int_pro_row_neon,
+ &vpx_int_pro_row_c)));
+
+INSTANTIATE_TEST_SUITE_P(
+ NEON, IntProColTest,
+ ::testing::Values(make_tuple(16, &vpx_int_pro_col_neon, &vpx_int_pro_col_c),
+ make_tuple(32, &vpx_int_pro_col_neon, &vpx_int_pro_col_c),
+ make_tuple(64, &vpx_int_pro_col_neon,
+ &vpx_int_pro_col_c)));
INSTANTIATE_TEST_SUITE_P(NEON, SatdLowbdTest,
::testing::Values(make_tuple(16, &vpx_satd_neon),
return horizontal_add_int32x4(vaddq_s32(sum_s32[0], sum_s32[1]));
}
-// void vpx_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref,
-// const int ref_stride, const int height) {
-// int i;
-// uint8x16_t r0, r1, r2, r3;
-// uint16x8_t sum_lo[2], sum_hi[2];
-// uint16x8_t tmp_lo[2], tmp_hi[2];
-// int16x8_t avg_lo, avg_hi;
-//
-// const int norm_factor = (height >> 5) + 3;
-// const int16x8_t neg_norm_factor = vdupq_n_s16(-norm_factor);
-//
-// assert(height >= 4 && height % 4 == 0);
-//
-// r0 = vld1q_u8(ref + 0 * ref_stride);
-// r1 = vld1q_u8(ref + 1 * ref_stride);
-// r2 = vld1q_u8(ref + 2 * ref_stride);
-// r3 = vld1q_u8(ref + 3 * ref_stride);
-//
-// sum_lo[0] = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1));
-// sum_hi[0] = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1));
-// sum_lo[1] = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3));
-// sum_hi[1] = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3));
-//
-// for (i = 4; i < height; i += 4) {
-// r0 = vld1q_u8(ref + 0 * ref_stride);
-// r1 = vld1q_u8(ref + 1 * ref_stride);
-// r2 = vld1q_u8(ref + 2 * ref_stride);
-// r3 = vld1q_u8(ref + 3 * ref_stride);
-//
-// tmp_lo[0] = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1));
-// tmp_hi[0] = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1));
-// tmp_lo[1] = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3));
-// tmp_hi[1] = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3));
-//
-// sum_lo[0] = vaddq_u16(sum_lo[0], tmp_lo[0]);
-// sum_hi[0] = vaddq_u16(sum_hi[0], tmp_hi[0]);
-// sum_lo[1] = vaddq_u16(sum_lo[1], tmp_lo[1]);
-// sum_hi[1] = vaddq_u16(sum_hi[1], tmp_hi[1]);
-//
-// ref += 4 * ref_stride;
-// }
-//
-// sum_lo[0] = vaddq_u16(sum_lo[0], sum_lo[1]);
-// sum_hi[0] = vaddq_u16(sum_hi[0], sum_hi[1]);
-//
-// avg_lo = vshlq_s16(vreinterpretq_s16_u16(sum_lo[0]), neg_norm_factor);
-// avg_hi = vshlq_s16(vreinterpretq_s16_u16(sum_hi[0]), neg_norm_factor);
-//
-// vst1q_s16(hbuf, avg_lo);
-// vst1q_s16(hbuf + 8, avg_hi);
-// }
-
-// int16_t vpx_int_pro_col_neon(uint8_t const *ref, const int width) {
-// uint16x8_t sum;
-// int i;
-//
-// assert(width >= 16 && width % 16 == 0);
-//
-// sum = vpaddlq_u8(vld1q_u8(ref));
-// for (i = 16; i < width; i += 16) {
-// sum = vpadalq_u8(sum, vld1q_u8(ref + i));
-// }
-//
-// return (int16_t)horizontal_add_uint16x8(sum);
-// }
+void vpx_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref,
+ const int ref_stride, const int height) {
+ int i;
+ uint8x16_t r0, r1, r2, r3;
+ uint16x8_t sum_lo[2], sum_hi[2];
+ uint16x8_t tmp_lo[2], tmp_hi[2];
+ int16x8_t avg_lo, avg_hi;
+
+ const int norm_factor = (height >> 5) + 3;
+ const int16x8_t neg_norm_factor = vdupq_n_s16(-norm_factor);
+
+ assert(height >= 4 && height % 4 == 0);
+
+ r0 = vld1q_u8(ref + 0 * ref_stride);
+ r1 = vld1q_u8(ref + 1 * ref_stride);
+ r2 = vld1q_u8(ref + 2 * ref_stride);
+ r3 = vld1q_u8(ref + 3 * ref_stride);
+
+ sum_lo[0] = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1));
+ sum_hi[0] = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1));
+ sum_lo[1] = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3));
+ sum_hi[1] = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3));
+
+ ref += 4 * ref_stride;
+
+ for (i = 4; i < height; i += 4) {
+ r0 = vld1q_u8(ref + 0 * ref_stride);
+ r1 = vld1q_u8(ref + 1 * ref_stride);
+ r2 = vld1q_u8(ref + 2 * ref_stride);
+ r3 = vld1q_u8(ref + 3 * ref_stride);
+
+ tmp_lo[0] = vaddl_u8(vget_low_u8(r0), vget_low_u8(r1));
+ tmp_hi[0] = vaddl_u8(vget_high_u8(r0), vget_high_u8(r1));
+ tmp_lo[1] = vaddl_u8(vget_low_u8(r2), vget_low_u8(r3));
+ tmp_hi[1] = vaddl_u8(vget_high_u8(r2), vget_high_u8(r3));
+
+ sum_lo[0] = vaddq_u16(sum_lo[0], tmp_lo[0]);
+ sum_hi[0] = vaddq_u16(sum_hi[0], tmp_hi[0]);
+ sum_lo[1] = vaddq_u16(sum_lo[1], tmp_lo[1]);
+ sum_hi[1] = vaddq_u16(sum_hi[1], tmp_hi[1]);
+
+ ref += 4 * ref_stride;
+ }
+
+ sum_lo[0] = vaddq_u16(sum_lo[0], sum_lo[1]);
+ sum_hi[0] = vaddq_u16(sum_hi[0], sum_hi[1]);
+
+ avg_lo = vshlq_s16(vreinterpretq_s16_u16(sum_lo[0]), neg_norm_factor);
+ avg_hi = vshlq_s16(vreinterpretq_s16_u16(sum_hi[0]), neg_norm_factor);
+
+ vst1q_s16(hbuf, avg_lo);
+ vst1q_s16(hbuf + 8, avg_hi);
+}
+
+int16_t vpx_int_pro_col_neon(uint8_t const *ref, const int width) {
+ uint16x8_t sum;
+ int i;
+
+ assert(width >= 16 && width % 16 == 0);
+
+ sum = vpaddlq_u8(vld1q_u8(ref));
+ for (i = 16; i < width; i += 16) {
+ sum = vpadalq_u8(sum, vld1q_u8(ref + i));
+ }
+
+ return (int16_t)horizontal_add_uint16x8(sum);
+}
// ref, src = [0, 510] - max diff = 16-bits
// bwl = {2, 3, 4}, width = {16, 32, 64}