From 911bb980b1d2501ee9fa053ff3f6faffbba0b0a2 Mon Sep 17 00:00:00 2001 From: Linfeng Zhang Date: Wed, 28 Dec 2016 13:51:44 -0800 Subject: [PATCH] Clean DC only idct NEON intrinsics BUG=webm:1301 Change-Id: Iffc83854218460b3f687f3774e71d45b552382a5 --- vpx_dsp/arm/idct16x16_1_add_neon.c | 95 ++++++++++++++---------- vpx_dsp/arm/idct32x32_1_add_neon.c | 143 +++++++++---------------------------- vpx_dsp/arm/idct4x4_1_add_neon.c | 34 +++++---- vpx_dsp/arm/idct8x8_1_add_neon.c | 80 +++++++++++---------- vpx_dsp/arm/idct_neon.h | 6 ++ 5 files changed, 156 insertions(+), 202 deletions(-) diff --git a/vpx_dsp/arm/idct16x16_1_add_neon.c b/vpx_dsp/arm/idct16x16_1_add_neon.c index 266f976..968bc5c 100644 --- a/vpx_dsp/arm/idct16x16_1_add_neon.c +++ b/vpx_dsp/arm/idct16x16_1_add_neon.c @@ -11,49 +11,66 @@ #include #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" #include "vpx_dsp/inv_txfm.h" -#include "vpx_ports/mem.h" -void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, - int stride) { - uint8x8_t d2u8, d3u8, d30u8, d31u8; - uint64x1_t d2u64, d3u64, d4u64, d5u64; - uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16; - int16x8_t q0s16; - uint8_t *d1, *d2; - int16_t i, j, a1; - int16_t out = dct_const_round_shift(input[0] * cospi_16_64); - out = dct_const_round_shift(out * cospi_16_64); - a1 = ROUND_POWER_OF_TWO(out, 6); - - q0s16 = vdupq_n_s16(a1); - q0u16 = vreinterpretq_u16_s16(q0s16); - - for (d1 = d2 = dest, i = 0; i < 4; i++) { - for (j = 0; j < 2; j++) { - d2u64 = vld1_u64((const uint64_t *)d1); - d3u64 = vld1_u64((const uint64_t *)(d1 + 8)); - d1 += stride; - d4u64 = vld1_u64((const uint64_t *)d1); - d5u64 = vld1_u64((const uint64_t *)(d1 + 8)); - d1 += stride; +static INLINE void idct16x16_1_add_pos_kernel(uint8_t **dest, const int stride, + const uint8x16_t res) { + const uint8x16_t a = vld1q_u8(*dest); + const uint8x16_t b = vqaddq_u8(a, res); + vst1q_u8(*dest, b); + *dest += stride; +} - q9u16 = vaddw_u8(q0u16, 
vreinterpret_u8_u64(d2u64)); - q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64)); - q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64)); - q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64)); +static INLINE void idct16x16_1_add_neg_kernel(uint8_t **dest, const int stride, + const uint8x16_t res) { + const uint8x16_t a = vld1q_u8(*dest); + const uint8x16_t b = vqsubq_u8(a, res); + vst1q_u8(*dest, b); + *dest += stride; +} - d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); - d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); - d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); - d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); +void vpx_idct16x16_1_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + const int16_t out0 = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64)); + const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6); - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); - vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d3u8)); - d2 += stride; - vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8)); - vst1_u64((uint64_t *)(d2 + 8), vreinterpret_u64_u8(d31u8)); - d2 += stride; - } + if (a1 >= 0) { + const uint8x16_t dc = create_dcq(a1); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + idct16x16_1_add_pos_kernel(&dest, stride, dc); + 
idct16x16_1_add_pos_kernel(&dest, stride, dc); + } else { + const uint8x16_t dc = create_dcq(-a1); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); + idct16x16_1_add_neg_kernel(&dest, stride, dc); } } diff --git a/vpx_dsp/arm/idct32x32_1_add_neon.c b/vpx_dsp/arm/idct32x32_1_add_neon.c index 165178d..604d82a 100644 --- a/vpx_dsp/arm/idct32x32_1_add_neon.c +++ b/vpx_dsp/arm/idct32x32_1_add_neon.c @@ -10,127 +10,48 @@ #include -#include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/idct_neon.h" #include "vpx_dsp/inv_txfm.h" -#include "vpx_ports/mem.h" -static INLINE void LD_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8, - uint8x16_t *q9u8, uint8x16_t *q10u8, - uint8x16_t *q11u8, uint8x16_t *q12u8, - uint8x16_t *q13u8, uint8x16_t *q14u8, - uint8x16_t *q15u8) { - *q8u8 = vld1q_u8(d); - d += d_stride; - *q9u8 = vld1q_u8(d); - d += d_stride; - *q10u8 = vld1q_u8(d); - d += d_stride; - *q11u8 = vld1q_u8(d); - d += d_stride; - *q12u8 = vld1q_u8(d); - d += d_stride; - *q13u8 = vld1q_u8(d); - d += d_stride; - *q14u8 = vld1q_u8(d); - d += d_stride; - *q15u8 = vld1q_u8(d); +static INLINE void idct32x32_1_add_pos_kernel(uint8_t **dest, const int stride, + const uint8x16_t res) { + const uint8x16_t a0 = vld1q_u8(*dest); + const uint8x16_t a1 = vld1q_u8(*dest + 16); + const uint8x16_t 
b0 = vqaddq_u8(a0, res); + const uint8x16_t b1 = vqaddq_u8(a1, res); + vst1q_u8(*dest, b0); + vst1q_u8(*dest + 16, b1); + *dest += stride; } -static INLINE void ADD_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8, - uint8x16_t *q9u8, uint8x16_t *q10u8, - uint8x16_t *q11u8, uint8x16_t *q12u8, - uint8x16_t *q13u8, uint8x16_t *q14u8, - uint8x16_t *q15u8) { - *q8u8 = vqaddq_u8(*q8u8, qdiffu8); - *q9u8 = vqaddq_u8(*q9u8, qdiffu8); - *q10u8 = vqaddq_u8(*q10u8, qdiffu8); - *q11u8 = vqaddq_u8(*q11u8, qdiffu8); - *q12u8 = vqaddq_u8(*q12u8, qdiffu8); - *q13u8 = vqaddq_u8(*q13u8, qdiffu8); - *q14u8 = vqaddq_u8(*q14u8, qdiffu8); - *q15u8 = vqaddq_u8(*q15u8, qdiffu8); -} - -static INLINE void SUB_DIFF_16x8(uint8x16_t qdiffu8, uint8x16_t *q8u8, - uint8x16_t *q9u8, uint8x16_t *q10u8, - uint8x16_t *q11u8, uint8x16_t *q12u8, - uint8x16_t *q13u8, uint8x16_t *q14u8, - uint8x16_t *q15u8) { - *q8u8 = vqsubq_u8(*q8u8, qdiffu8); - *q9u8 = vqsubq_u8(*q9u8, qdiffu8); - *q10u8 = vqsubq_u8(*q10u8, qdiffu8); - *q11u8 = vqsubq_u8(*q11u8, qdiffu8); - *q12u8 = vqsubq_u8(*q12u8, qdiffu8); - *q13u8 = vqsubq_u8(*q13u8, qdiffu8); - *q14u8 = vqsubq_u8(*q14u8, qdiffu8); - *q15u8 = vqsubq_u8(*q15u8, qdiffu8); -} - -static INLINE void ST_16x8(uint8_t *d, int d_stride, uint8x16_t *q8u8, - uint8x16_t *q9u8, uint8x16_t *q10u8, - uint8x16_t *q11u8, uint8x16_t *q12u8, - uint8x16_t *q13u8, uint8x16_t *q14u8, - uint8x16_t *q15u8) { - vst1q_u8(d, *q8u8); - d += d_stride; - vst1q_u8(d, *q9u8); - d += d_stride; - vst1q_u8(d, *q10u8); - d += d_stride; - vst1q_u8(d, *q11u8); - d += d_stride; - vst1q_u8(d, *q12u8); - d += d_stride; - vst1q_u8(d, *q13u8); - d += d_stride; - vst1q_u8(d, *q14u8); - d += d_stride; - vst1q_u8(d, *q15u8); +static INLINE void idct32x32_1_add_neg_kernel(uint8_t **dest, const int stride, + const uint8x16_t res) { + const uint8x16_t a0 = vld1q_u8(*dest); + const uint8x16_t a1 = vld1q_u8(*dest + 16); + const uint8x16_t b0 = vqsubq_u8(a0, res); + const uint8x16_t b1 = vqsubq_u8(a1, res); + 
vst1q_u8(*dest, b0); + vst1q_u8(*dest + 16, b1); + *dest += stride; } void vpx_idct32x32_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride) { - uint8x16_t q0u8, q8u8, q9u8, q10u8, q11u8, q12u8, q13u8, q14u8, q15u8; - int i, j, dest_stride8; - uint8_t *d; - int16_t a1; - int16_t out = dct_const_round_shift(input[0] * cospi_16_64); - - out = dct_const_round_shift(out * cospi_16_64); - a1 = ROUND_POWER_OF_TWO(out, 6); - - dest_stride8 = stride * 8; - if (a1 >= 0) { // diff_positive_32_32 - a1 = a1 < 0 ? 0 : a1 > 255 ? 255 : a1; - q0u8 = vdupq_n_u8((uint8_t)a1); - for (i = 0; i < 2; i++, dest += 16) { // diff_positive_32_32_loop - d = dest; - for (j = 0; j < 4; j++) { - LD_16x8(d, stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, &q14u8, - &q15u8); - ADD_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, - &q14u8, &q15u8); - ST_16x8(d, stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, &q14u8, - &q15u8); - d += dest_stride8; - } + int i; + const int16_t out0 = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64)); + const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6); + + if (a1 >= 0) { + const uint8x16_t dc = create_dcq(a1); + for (i = 0; i < 32; i++) { + idct32x32_1_add_pos_kernel(&dest, stride, dc); } - } else { // diff_negative_32_32 - a1 = -a1; - a1 = a1 < 0 ? 0 : a1 > 255 ? 
255 : a1; - q0u8 = vdupq_n_u8((uint8_t)a1); - for (i = 0; i < 2; i++, dest += 16) { // diff_negative_32_32_loop - d = dest; - for (j = 0; j < 4; j++) { - LD_16x8(d, stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, &q14u8, - &q15u8); - SUB_DIFF_16x8(q0u8, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, - &q14u8, &q15u8); - ST_16x8(d, stride, &q8u8, &q9u8, &q10u8, &q11u8, &q12u8, &q13u8, &q14u8, - &q15u8); - d += dest_stride8; - } + } else { + const uint8x16_t dc = create_dcq(-a1); + for (i = 0; i < 32; i++) { + idct32x32_1_add_neg_kernel(&dest, stride, dc); } } } diff --git a/vpx_dsp/arm/idct4x4_1_add_neon.c b/vpx_dsp/arm/idct4x4_1_add_neon.c index 7fd1f2d..d1eae24 100644 --- a/vpx_dsp/arm/idct4x4_1_add_neon.c +++ b/vpx_dsp/arm/idct4x4_1_add_neon.c @@ -14,28 +14,32 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/inv_txfm.h" +static INLINE void idct4x4_1_add_kernel(uint8_t **dest, const int stride, + const int16x8_t res, + uint32x2_t *const d) { + uint16x8_t a; + uint8x8_t b; + *d = vld1_lane_u32((const uint32_t *)*dest, *d, 0); + *d = vld1_lane_u32((const uint32_t *)(*dest + stride), *d, 1); + a = vaddw_u8(vreinterpretq_u16_s16(res), vreinterpret_u8_u32(*d)); + b = vqmovun_s16(vreinterpretq_s16_u16(a)); + vst1_lane_u32((uint32_t *)*dest, vreinterpret_u32_u8(b), 0); + *dest += stride; + vst1_lane_u32((uint32_t *)*dest, vreinterpret_u32_u8(b), 1); + *dest += stride; +} + void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride) { - int i; - const int16_t out0 = dct_const_round_shift((int16_t)input[0] * cospi_16_64); - const int16_t out1 = dct_const_round_shift(out0 * cospi_16_64); + const int16_t out0 = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64)); const int16_t a1 = ROUND_POWER_OF_TWO(out1, 4); const int16x8_t dc = vdupq_n_s16(a1); uint32x2_t d = vdup_n_u32(0); - uint16x8_t a; - uint8x8_t b; assert(!((intptr_t)dest % sizeof(uint32_t))); assert(!(stride % 
sizeof(uint32_t))); - for (i = 0; i < 2; i++) { - d = vld1_lane_u32((const uint32_t *)dest, d, 0); - d = vld1_lane_u32((const uint32_t *)(dest + stride), d, 1); - a = vaddw_u8(vreinterpretq_u16_s16(dc), vreinterpret_u8_u32(d)); - b = vqmovun_s16(vreinterpretq_s16_u16(a)); - vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(b), 0); - dest += stride; - vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(b), 1); - dest += stride; - } + idct4x4_1_add_kernel(&dest, stride, dc, &d); + idct4x4_1_add_kernel(&dest, stride, dc, &d); } diff --git a/vpx_dsp/arm/idct8x8_1_add_neon.c b/vpx_dsp/arm/idct8x8_1_add_neon.c index 0a6af4f..7bcce91 100644 --- a/vpx_dsp/arm/idct8x8_1_add_neon.c +++ b/vpx_dsp/arm/idct8x8_1_add_neon.c @@ -12,47 +12,53 @@ #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/inv_txfm.h" -#include "vpx_ports/mem.h" -void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, - int stride) { - int i; - const int16_t out0 = dct_const_round_shift(input[0] * cospi_16_64); - const int16_t out1 = dct_const_round_shift(out0 * cospi_16_64); - const int16_t out2 = ROUND_POWER_OF_TWO(out1, 5); - const int16x8_t dc = vdupq_n_s16(out2); - const uint16x8_t dc_u16 = vreinterpretq_u16_s16(dc); - const uint8_t *dst = dest; - uint8x8_t d0, d1, d2, d3; - uint16x8_t d0_u16, d1_u16, d2_u16, d3_u16; +static INLINE uint8x8_t create_dcd(const int16_t dc) { + int16x8_t t = vdupq_n_s16(dc); + return vqmovun_s16(t); +} - for (i = 0; i < 2; i++) { - d0 = vld1_u8(dst); - dst += stride; - d1 = vld1_u8(dst); - dst += stride; - d2 = vld1_u8(dst); - dst += stride; - d3 = vld1_u8(dst); - dst += stride; +static INLINE void idct8x8_1_add_pos_kernel(uint8_t **dest, const int stride, + const uint8x8_t res) { + const uint8x8_t a = vld1_u8(*dest); + const uint8x8_t b = vqadd_u8(a, res); + vst1_u8(*dest, b); + *dest += stride; +} - d0_u16 = vaddw_u8(dc_u16, d0); - d1_u16 = vaddw_u8(dc_u16, d1); - d2_u16 = vaddw_u8(dc_u16, d2); - d3_u16 = vaddw_u8(dc_u16, d3); +static INLINE void 
idct8x8_1_add_neg_kernel(uint8_t **dest, const int stride, + const uint8x8_t res) { + const uint8x8_t a = vld1_u8(*dest); + const uint8x8_t b = vqsub_u8(a, res); + vst1_u8(*dest, b); + *dest += stride; +} - d0 = vqmovun_s16(vreinterpretq_s16_u16(d0_u16)); - d1 = vqmovun_s16(vreinterpretq_s16_u16(d1_u16)); - d2 = vqmovun_s16(vreinterpretq_s16_u16(d2_u16)); - d3 = vqmovun_s16(vreinterpretq_s16_u16(d3_u16)); +void vpx_idct8x8_1_add_neon(const tran_low_t *input, uint8_t *dest, + int stride) { + const int16_t out0 = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); + const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64)); + const int16_t a1 = ROUND_POWER_OF_TWO(out1, 5); - vst1_u8(dest, d0); - dest += stride; - vst1_u8(dest, d1); - dest += stride; - vst1_u8(dest, d2); - dest += stride; - vst1_u8(dest, d3); - dest += stride; + if (a1 >= 0) { + const uint8x8_t dc = create_dcd(a1); + idct8x8_1_add_pos_kernel(&dest, stride, dc); + idct8x8_1_add_pos_kernel(&dest, stride, dc); + idct8x8_1_add_pos_kernel(&dest, stride, dc); + idct8x8_1_add_pos_kernel(&dest, stride, dc); + idct8x8_1_add_pos_kernel(&dest, stride, dc); + idct8x8_1_add_pos_kernel(&dest, stride, dc); + idct8x8_1_add_pos_kernel(&dest, stride, dc); + idct8x8_1_add_pos_kernel(&dest, stride, dc); + } else { + const uint8x8_t dc = create_dcd(-a1); + idct8x8_1_add_neg_kernel(&dest, stride, dc); + idct8x8_1_add_neg_kernel(&dest, stride, dc); + idct8x8_1_add_neg_kernel(&dest, stride, dc); + idct8x8_1_add_neg_kernel(&dest, stride, dc); + idct8x8_1_add_neg_kernel(&dest, stride, dc); + idct8x8_1_add_neg_kernel(&dest, stride, dc); + idct8x8_1_add_neg_kernel(&dest, stride, dc); + idct8x8_1_add_neg_kernel(&dest, stride, dc); } } diff --git a/vpx_dsp/arm/idct_neon.h b/vpx_dsp/arm/idct_neon.h index d9a676c..04352c4 100644 --- a/vpx_dsp/arm/idct_neon.h +++ b/vpx_dsp/arm/idct_neon.h @@ -181,6 +181,12 @@ static INLINE void add_and_store_u8_s16(const int16x8_t a0, const int16x8_t a1, vst1_u8(b, b7); } +static 
INLINE uint8x16_t create_dcq(const int16_t dc) { + // Clip to the [0, 255] range on both sides; gcc may compile this to a 'usat' instruction. + const int16_t t = (dc < 0) ? 0 : ((dc > 255) ? 255 : dc); + return vdupq_n_u8((uint8_t)t); +} + static INLINE void idct4x4_16_kernel_bd8(const int16x4_t cospis, int16x8_t *const a0, int16x8_t *const a1) { -- 2.7.4