From 57b9afa58f849a8165ce3132c21087ae451d862c Mon Sep 17 00:00:00 2001
From: James Zern
Date: Tue, 2 May 2023 18:37:59 -0700
Subject: [PATCH] s/__aarch64__/VPX_ARCH_AARCH64/

This allows AArch64 to be correctly detected when building with Visual
Studio (cl.exe) and fixes a crash in vp9_diamond_search_sad_neon.c.

There are still test failures, however. Microsoft's compiler doesn't
define __ARM_FEATURE_*. To use those paths we may need to rely on
_M_ARM64_EXTENSION.

Bug: webm:1788
Bug: b/277255076
Change-Id: I4d26f5f84dbd0cbcd1cdf0d7d932ebcf109febe5
---
 vp8/encoder/arm/neon/fastquantizeb_neon.c          |  8 ++---
 vp9/encoder/arm/neon/vp9_denoiser_neon.c           |  2 +-
 vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c | 14 ++++----
 vp9/encoder/arm/neon/vp9_quantize_neon.c           |  6 ++--
 vpx_dsp/arm/avg_neon.c                             |  2 +-
 vpx_dsp/arm/highbd_avg_neon.c                      |  2 +-
 vpx_dsp/arm/highbd_quantize_neon.c                 |  8 ++---
 vpx_dsp/arm/quantize_neon.c                        |  8 ++---
 vpx_dsp/arm/sum_neon.h                             | 34 +++++++++++-----------
 vpx_dsp/arm/transpose_neon.h                       | 10 +++----
 vpx_dsp/arm/vpx_convolve8_neon.c                   |  6 ++--
 vpx_dsp/arm/vpx_convolve8_neon.h                   |  8 ++---
 12 files changed, 54 insertions(+), 54 deletions(-)

diff --git a/vp8/encoder/arm/neon/fastquantizeb_neon.c b/vp8/encoder/arm/neon/fastquantizeb_neon.c
index 6fc6080..950c943 100644
--- a/vp8/encoder/arm/neon/fastquantizeb_neon.c
+++ b/vp8/encoder/arm/neon/fastquantizeb_neon.c
@@ -28,11 +28,11 @@ void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) {
                    zig_zag1 = vld1q_u16(inv_zig_zag + 8);
   int16x8_t x0, x1, sz0, sz1, y0, y1;
   uint16x8_t eob0, eob1;
-#ifndef __aarch64__
+#if !VPX_ARCH_AARCH64
   uint16x4_t eob_d16;
   uint32x2_t eob_d32;
   uint32x4_t eob_q32;
-#endif  // __arch64__
+#endif  // !VPX_ARCH_AARCH64
 
   /* sign of z: z >> 15 */
   sz0 = vshrq_n_s16(z0, 15);
@@ -70,7 +70,7 @@ void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) {
   /* select the largest value */
   eob0 = vmaxq_u16(eob0, eob1);
 
-#ifdef __aarch64__
+#if VPX_ARCH_AARCH64
   *d->eob = (int8_t)vmaxvq_u16(eob0);
 #else
   eob_d16 = vmax_u16(vget_low_u16(eob0), vget_high_u16(eob0));
@@ -79,7 +79,7 @@ void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) {
   eob_d32 = vpmax_u32(eob_d32, eob_d32);
 
   vst1_lane_s8((int8_t *)d->eob, vreinterpret_s8_u32(eob_d32), 0);
-#endif  // __aarch64__
+#endif  // VPX_ARCH_AARCH64
 
   /* qcoeff = x */
   vst1q_s16(d->qcoeff, x0);
diff --git a/vp9/encoder/arm/neon/vp9_denoiser_neon.c b/vp9/encoder/arm/neon/vp9_denoiser_neon.c
index 53e8c7e..d631cd4 100644
--- a/vp9/encoder/arm/neon/vp9_denoiser_neon.c
+++ b/vp9/encoder/arm/neon/vp9_denoiser_neon.c
@@ -21,7 +21,7 @@
 
 // Compute the sum of all pixel differences of this MB.
 static INLINE int horizontal_add_s8x16(const int8x16_t v_sum_diff_total) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   return vaddlvq_s8(v_sum_diff_total);
 #else
   const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff_total);
diff --git a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c
index 255e6fb..b82b3f9 100644
--- a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c
+++ b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c
@@ -94,7 +94,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x,
   // Work out the start point for the search
   const uint8_t *best_address = in_what;
   const uint8_t *new_best_address = best_address;
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   int64x2_t v_ba_q = vdupq_n_s64((intptr_t)best_address);
 #else
   int32x4_t v_ba_d = vdupq_n_s32((intptr_t)best_address);
@@ -117,7 +117,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x,
     int8x16_t v_inside_d;
     uint32x4_t v_outside_d;
     int32x4_t v_cost_d, v_sad_d;
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
     int64x2_t v_blocka[2];
 #else
     int32x4_t v_blocka[1];
@@ -138,7 +138,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x,
                                        vreinterpretq_s32_s16(v_these_mv_w)));
 
     // If none of them are inside, then move on
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
     horiz_max = vmaxvq_u32(vreinterpretq_u32_s8(v_inside_d));
 #else
     horiz_max_0 = vmax_u32(vget_low_u32(vreinterpretq_u32_s8(v_inside_d)),
@@ -167,7 +167,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x,
 
     // Compute the SIMD pointer offsets.
     {
-#if defined(__aarch64__)  // sizeof(intptr_t) == 8
+#if VPX_ARCH_AARCH64  // sizeof(intptr_t) == 8
       // Load the offsets
      int64x2_t v_bo10_q = vld1q_s64((const int64_t *)&ss_os[i + 0]);
      int64x2_t v_bo32_q = vld1q_s64((const int64_t *)&ss_os[i + 2]);
@@ -234,7 +234,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x,
     // Find the minimum value and index horizontally in v_sad_d
     {
       uint32_t local_best_sad;
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
       local_best_sad = vminvq_u32(vreinterpretq_u32_s32(v_sad_d));
 #else
       uint32x2_t horiz_min_0 =
@@ -256,7 +256,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x,
       uint32x4_t v_mask_d = vandq_u32(v_sel_d, v_idx_d);
       v_mask_d = vbslq_u32(v_sel_d, v_mask_d, vdupq_n_u32(0xffffffff));
 
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
       local_best_idx = vminvq_u32(v_mask_d);
 #else
       horiz_min_0 =
@@ -280,7 +280,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x,
     best_address = new_best_address;
 
     v_bmv_w = vreinterpretq_s16_s32(vdupq_n_s32(bmv.as_int));
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
     v_ba_q = vdupq_n_s64((intptr_t)best_address);
 #else
     v_ba_d = vdupq_n_s32((intptr_t)best_address);
diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c
index c2b55fc..97ab136 100644
--- a/vp9/encoder/arm/neon/vp9_quantize_neon.c
+++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c
@@ -50,7 +50,7 @@ static VPX_FORCE_INLINE int16x8_t get_max_lane_eob(const int16_t *iscan_ptr,
 }
 
 static VPX_FORCE_INLINE uint16_t get_max_eob(int16x8_t v_eobmax) {
-#ifdef __aarch64__
+#if VPX_ARCH_AARCH64
   return (uint16_t)vmaxvq_s16(v_eobmax);
 #else
   const int16x4_t v_eobmax_3210 =
@@ -65,7 +65,7 @@ static VPX_FORCE_INLINE uint16_t get_max_eob(int16x8_t v_eobmax) {
       vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
 
   return (uint16_t)vget_lane_s16(v_eobmax_final, 0);
-#endif  // __aarch64__
+#endif  // VPX_ARCH_AARCH64
 }
 
 static VPX_FORCE_INLINE void load_fp_values(const int16_t *round_ptr,
@@ -81,7 +81,7 @@ static VPX_FORCE_INLINE void load_fp_values(const int16_t *round_ptr,
 static VPX_FORCE_INLINE void update_fp_values(int16x8_t *v_round,
                                               int16x8_t *v_quant,
                                               int16x8_t *v_dequant) {
-#ifdef __aarch64__
+#if VPX_ARCH_AARCH64
   *v_round = vdupq_laneq_s16(*v_round, 1);
   *v_quant = vdupq_laneq_s16(*v_quant, 1);
   *v_dequant = vdupq_laneq_s16(*v_dequant, 1);
diff --git a/vpx_dsp/arm/avg_neon.c b/vpx_dsp/arm/avg_neon.c
index d48115d..8c61fc2 100644
--- a/vpx_dsp/arm/avg_neon.c
+++ b/vpx_dsp/arm/avg_neon.c
@@ -210,7 +210,7 @@ void vpx_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b,
   const uint8x16_t ab07_max = vmaxq_u8(ab0123_max, ab4567_max);
   const uint8x16_t ab07_min = vminq_u8(ab0123_min, ab4567_min);
 
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   *min = *max = 0;  // Clear high bits
   *((uint8_t *)max) = vmaxvq_u8(ab07_max);
   *((uint8_t *)min) = vminvq_u8(ab07_min);
diff --git a/vpx_dsp/arm/highbd_avg_neon.c b/vpx_dsp/arm/highbd_avg_neon.c
index fc10197..8939ee1 100644
--- a/vpx_dsp/arm/highbd_avg_neon.c
+++ b/vpx_dsp/arm/highbd_avg_neon.c
@@ -114,7 +114,7 @@ void vpx_highbd_minmax_8x8_neon(const uint8_t *a, int a_stride,
   const uint16x8_t min4567 = vminq_u16(min45, min67);
   const uint16x8_t min07 = vminq_u16(min0123, min4567);
 
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   *min = *max = 0;  // Clear high bits
   *((uint16_t *)max) = vmaxvq_u16(max07);
   *((uint16_t *)min) = vminvq_u16(min07);
diff --git a/vpx_dsp/arm/highbd_quantize_neon.c b/vpx_dsp/arm/highbd_quantize_neon.c
index 526447a..d2a7add 100644
--- a/vpx_dsp/arm/highbd_quantize_neon.c
+++ b/vpx_dsp/arm/highbd_quantize_neon.c
@@ -166,7 +166,7 @@ void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
     } while (n_coeffs > 0);
   }
 
-#ifdef __aarch64__
+#if VPX_ARCH_AARCH64
   *eob_ptr = vmaxvq_u16(eob_max);
 #else
   {
@@ -176,7 +176,7 @@ void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
     const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
     vst1_lane_u16(eob_ptr, eob_max_2, 0);
   }
-#endif  // __aarch64__
+#endif  // VPX_ARCH_AARCH64
   // Need these here, else the compiler complains about mixing declarations and
   // code in C90
   (void)n_coeffs;
@@ -291,7 +291,7 @@ void vpx_highbd_quantize_b_32x32_neon(
     }
   }
 
-#ifdef __aarch64__
+#if VPX_ARCH_AARCH64
   *eob_ptr = vmaxvq_u16(eob_max);
 #else
   {
@@ -301,5 +301,5 @@ void vpx_highbd_quantize_b_32x32_neon(
     const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
     vst1_lane_u16(eob_ptr, eob_max_2, 0);
   }
-#endif  // __aarch64__
+#endif  // VPX_ARCH_AARCH64
 }
diff --git a/vpx_dsp/arm/quantize_neon.c b/vpx_dsp/arm/quantize_neon.c
index cc8f623..35c67f6 100644
--- a/vpx_dsp/arm/quantize_neon.c
+++ b/vpx_dsp/arm/quantize_neon.c
@@ -134,7 +134,7 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
     } while (n_coeffs > 0);
   }
 
-#ifdef __aarch64__
+#if VPX_ARCH_AARCH64
   *eob_ptr = vmaxvq_u16(eob_max);
 #else
   {
@@ -144,7 +144,7 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
     const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
     vst1_lane_u16(eob_ptr, eob_max_2, 0);
   }
-#endif  // __aarch64__
+#endif  // VPX_ARCH_AARCH64
   // Need these here, else the compiler complains about mixing declarations and
   // code in C90
   (void)scan;
@@ -276,7 +276,7 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr,
     }
   }
 
-#ifdef __aarch64__
+#if VPX_ARCH_AARCH64
   *eob_ptr = vmaxvq_u16(eob_max);
 #else
   {
@@ -286,5 +286,5 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr,
     const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
     vst1_lane_u16(eob_ptr, eob_max_2, 0);
   }
-#endif  // __aarch64__
+#endif  // VPX_ARCH_AARCH64
 }
diff --git a/vpx_dsp/arm/sum_neon.h b/vpx_dsp/arm/sum_neon.h
index a0c72f9..48a2fc0 100644
--- a/vpx_dsp/arm/sum_neon.h
+++ b/vpx_dsp/arm/sum_neon.h
@@ -17,7 +17,7 @@
 #include "vpx/vpx_integer.h"
 
 static INLINE uint16_t horizontal_add_uint8x4(const uint8x8_t a) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   return vaddlv_u8(a);
 #else
   const uint16x4_t b = vpaddl_u8(a);
@@ -27,7 +27,7 @@ static INLINE uint16_t horizontal_add_uint8x4(const uint8x8_t a) {
 }
 
 static INLINE uint16_t horizontal_add_uint8x8(const uint8x8_t a) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   return vaddlv_u8(a);
 #else
   const uint16x4_t b = vpaddl_u8(a);
@@ -38,7 +38,7 @@ static INLINE uint16_t horizontal_add_uint8x8(const uint8x8_t a) {
 }
 
 static INLINE uint16_t horizontal_add_uint8x16(const uint8x16_t a) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   return vaddlvq_u8(a);
 #else
   const uint16x8_t b = vpaddlq_u8(a);
@@ -50,7 +50,7 @@ static INLINE uint16_t horizontal_add_uint8x16(const uint8x16_t a) {
 }
 
 static INLINE uint16_t horizontal_add_uint16x4(const uint16x4_t a) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   return vaddv_u16(a);
 #else
   const uint16x4_t b = vpadd_u16(a, a);
@@ -60,7 +60,7 @@ static INLINE uint16_t horizontal_add_uint16x4(const uint16x4_t a) {
 }
 
 static INLINE int32_t horizontal_add_int16x8(const int16x8_t a) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   return vaddlvq_s16(a);
 #else
   const int32x4_t b = vpaddlq_s16(a);
@@ -72,7 +72,7 @@ static INLINE int32_t horizontal_add_int16x8(const int16x8_t a) {
 }
 
 static INLINE uint32_t horizontal_add_uint16x8(const uint16x8_t a) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   return vaddlvq_u16(a);
 #else
   const uint32x4_t b = vpaddlq_u16(a);
@@ -84,7 +84,7 @@ static INLINE uint32_t horizontal_add_uint16x8(const uint16x8_t a) {
 }
 
 static INLINE uint32x4_t horizontal_add_4d_uint16x8(const uint16x8_t sum[4]) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]);
   const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]);
   const uint16x8_t b0 = vpaddq_u16(a0, a1);
@@ -102,7 +102,7 @@ static INLINE uint32x4_t horizontal_add_4d_uint16x8(const uint16x8_t sum[4]) {
 
 static INLINE uint32_t horizontal_long_add_uint16x8(const uint16x8_t vec_lo,
                                                     const uint16x8_t vec_hi) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   return vaddlvq_u16(vec_lo) + vaddlvq_u16(vec_hi);
 #else
   const uint32x4_t vec_l_lo =
@@ -127,7 +127,7 @@ static INLINE uint32x4_t horizontal_long_add_4d_uint16x8(
   const uint32x4_t b1 = vpadalq_u16(a1, sum_hi[1]);
   const uint32x4_t b2 = vpadalq_u16(a2, sum_hi[2]);
   const uint32x4_t b3 = vpadalq_u16(a3, sum_hi[3]);
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   const uint32x4_t c0 = vpaddq_u32(b0, b1);
   const uint32x4_t c1 = vpaddq_u32(b2, b3);
   return vpaddq_u32(c0, c1);
@@ -143,7 +143,7 @@ static INLINE uint32x4_t horizontal_long_add_4d_uint16x8(
 }
 
 static INLINE int32_t horizontal_add_int32x2(const int32x2_t a) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   return vaddv_s32(a);
 #else
   return vget_lane_s32(a, 0) + vget_lane_s32(a, 1);
@@ -151,7 +151,7 @@ static INLINE int32_t horizontal_add_int32x2(const int32x2_t a) {
 }
 
 static INLINE uint32_t horizontal_add_uint32x2(const uint32x2_t a) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   return vaddv_u32(a);
 #else
   return vget_lane_u32(a, 0) + vget_lane_u32(a, 1);
@@ -159,7 +159,7 @@ static INLINE uint32_t horizontal_add_uint32x2(const uint32x2_t a) {
 }
 
 static INLINE int32_t horizontal_add_int32x4(const int32x4_t a) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   return vaddvq_s32(a);
 #else
   const int64x2_t b = vpaddlq_s32(a);
@@ -170,7 +170,7 @@ static INLINE int32_t horizontal_add_int32x4(const int32x4_t a) {
 }
 
 static INLINE uint32_t horizontal_add_uint32x4(const uint32x4_t a) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   return vaddvq_u32(a);
 #else
   const uint64x2_t b = vpaddlq_u32(a);
@@ -181,7 +181,7 @@ static INLINE uint32_t horizontal_add_uint32x4(const uint32x4_t a) {
 }
 
 static INLINE uint32x4_t horizontal_add_4d_uint32x4(const uint32x4_t sum[4]) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   uint32x4_t res01 = vpaddq_u32(sum[0], sum[1]);
   uint32x4_t res23 = vpaddq_u32(sum[2], sum[3]);
   return vpaddq_u32(res01, res23);
@@ -196,7 +196,7 @@ static INLINE uint32x4_t horizontal_add_4d_uint32x4(const uint32x4_t sum[4]) {
 }
 
 static INLINE uint64_t horizontal_long_add_uint32x4(const uint32x4_t a) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   return vaddlvq_u32(a);
 #else
   const uint64x2_t b = vpaddlq_u32(a);
@@ -205,7 +205,7 @@ static INLINE uint64_t horizontal_long_add_uint32x4(const uint32x4_t a) {
 }
 
 static INLINE int64_t horizontal_add_int64x2(const int64x2_t a) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   return vaddvq_s64(a);
 #else
   return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1);
@@ -213,7 +213,7 @@ static INLINE int64_t horizontal_add_int64x2(const int64x2_t a) {
 }
 
 static INLINE uint64_t horizontal_add_uint64x2(const uint64x2_t a) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   return vaddvq_u64(a);
 #else
   return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1);
diff --git a/vpx_dsp/arm/transpose_neon.h b/vpx_dsp/arm/transpose_neon.h
index 518278f..74f85a6 100644
--- a/vpx_dsp/arm/transpose_neon.h
+++ b/vpx_dsp/arm/transpose_neon.h
@@ -23,7 +23,7 @@
 // b0.val[1]: 04 05 06 07 20 21 22 23
 static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
   int16x8x2_t b0;
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   b0.val[0] = vreinterpretq_s16_s64(
       vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
   b0.val[1] = vreinterpretq_s16_s64(
@@ -39,7 +39,7 @@ static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
 
 static INLINE int32x4x2_t vpx_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) {
   int32x4x2_t b0;
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   b0.val[0] = vreinterpretq_s32_s64(
       vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
   b0.val[1] = vreinterpretq_s32_s64(
@@ -53,7 +53,7 @@ static INLINE int32x4x2_t vpx_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) {
 
 static INLINE int64x2x2_t vpx_vtrnq_s64(int32x4_t a0, int32x4_t a1) {
   int64x2x2_t b0;
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   b0.val[0] = vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1));
   b0.val[1] = vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1));
 #else
@@ -67,7 +67,7 @@ static INLINE int64x2x2_t vpx_vtrnq_s64(int32x4_t a0, int32x4_t a1) {
 
 static INLINE uint8x16x2_t vpx_vtrnq_u64_to_u8(uint32x4_t a0, uint32x4_t a1) {
   uint8x16x2_t b0;
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   b0.val[0] = vreinterpretq_u8_u64(
       vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
   b0.val[1] = vreinterpretq_u8_u64(
@@ -83,7 +83,7 @@ static INLINE uint8x16x2_t vpx_vtrnq_u64_to_u8(uint32x4_t a0, uint32x4_t a1) {
 
 static INLINE uint16x8x2_t vpx_vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) {
   uint16x8x2_t b0;
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   b0.val[0] = vreinterpretq_u16_u64(
       vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
   b0.val[1] = vreinterpretq_u16_u64(
diff --git a/vpx_dsp/arm/vpx_convolve8_neon.c b/vpx_dsp/arm/vpx_convolve8_neon.c
index b4cdd58..b312cc7 100644
--- a/vpx_dsp/arm/vpx_convolve8_neon.c
+++ b/vpx_dsp/arm/vpx_convolve8_neon.c
@@ -31,7 +31,7 @@
 // instructions. This optimization is much faster in speed unit test, but slowed
 // down the whole decoder by 5%.
 
-#if defined(__aarch64__) && \
+#if VPX_ARCH_AARCH64 && \
     (defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8))
 
 DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
@@ -1261,7 +1261,7 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
 
 #endif  // defined(__ARM_FEATURE_MATMUL_INT8)
 
-#else  // !(defined(__aarch64__) &&
+#else  // !(VPX_ARCH_AARCH64 &&
        //   (defined(__ARM_FEATURE_DOTPROD) ||
        //    defined(__ARM_FEATURE_MATMUL_INT8)))
 
@@ -2105,6 +2105,6 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
     }
   }
 
-#endif  // #if defined(__aarch64__) &&
+#endif  // #if VPX_ARCH_AARCH64 &&
         // (defined(__ARM_FEATURE_DOTPROD) ||
         // defined(__ARM_FEATURE_MATMUL_INT8))
diff --git a/vpx_dsp/arm/vpx_convolve8_neon.h b/vpx_dsp/arm/vpx_convolve8_neon.h
index ed7f180..07cf824 100644
--- a/vpx_dsp/arm/vpx_convolve8_neon.h
+++ b/vpx_dsp/arm/vpx_convolve8_neon.h
@@ -16,7 +16,7 @@
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 
-#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+#if VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
 
 static INLINE int32x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo,
                                                  const int8x16_t samples_hi,
@@ -114,9 +114,9 @@ static INLINE uint8x8_t convolve8_8_sdot(uint8x16_t samples,
   return vqrshrun_n_s16(sum, 7);
 }
 
-#endif  // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+#endif  // VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
 
-#if defined(__aarch64__) && defined(__ARM_FEATURE_MATMUL_INT8)
+#if VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
 
 static INLINE int32x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo,
                                                   const uint8x16_t samples_hi,
@@ -199,7 +199,7 @@ static INLINE uint8x8_t convolve8_8_usdot(uint8x16_t samples,
   return vqrshrun_n_s16(sum, 7);
 }
 
-#endif  // defined(__aarch64__) && defined(__ARM_FEATURE_MATMUL_INT8)
+#endif  // VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
 
 static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
                                     const int16x4_t s2, const int16x4_t s3,
-- 
2.7.4
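
Note (reviewer sketch, not part of the patch): the reason a generated 0/1
VPX_ARCH_AARCH64 flag works where __aarch64__ does not is that cl.exe
targeting ARM64 defines _M_ARM64 rather than __aarch64__. In libvpx the flag
is written into vpx_config.h by the configure step; the hand-written fallback
below is only an illustration of that idea, showing why the #if / #if !
checks introduced by this patch behave the same under GCC, Clang, and Visual
Studio.

/*
 * Illustrative sketch only: libvpx emits VPX_ARCH_AARCH64 into vpx_config.h
 * at configure time rather than deriving it inline like this.
 */
#ifndef VPX_ARCH_AARCH64
#if defined(__aarch64__) || defined(_M_ARM64) /* GCC/Clang or MSVC (cl.exe) */
#define VPX_ARCH_AARCH64 1
#else
#define VPX_ARCH_AARCH64 0
#endif
#endif

/* Usage mirrors the pattern applied throughout the patch: because the macro
 * is always defined as 0 or 1, it can be tested with plain #if or #if !,
 * independent of which compiler-specific identification macros exist. */
#if VPX_ARCH_AARCH64
/* AArch64-only intrinsics (vaddvq_u32, vmaxvq_u16, ...) are available here. */
#else
/* 32-bit NEON fallback built from pairwise reductions. */
#endif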