From 57b9afa58f849a8165ce3132c21087ae451d862c Mon Sep 17 00:00:00 2001
From: James Zern
Date: Tue, 2 May 2023 18:37:59 -0700
Subject: [PATCH] s/__aarch64__/VPX_ARCH_AARCH64/

This allows AArch64 to be correctly detected when building with Visual
Studio (cl.exe) and fixes a crash in vp9_diamond_search_sad_neon.c.

There are still test failures, however. Microsoft's compiler doesn't
define __ARM_FEATURE_*. To use those paths we may need to rely on
_M_ARM64_EXTENSION.

Bug: webm:1788
Bug: b/277255076
Change-Id: I4d26f5f84dbd0cbcd1cdf0d7d932ebcf109febe5
---
 vp8/encoder/arm/neon/fastquantizeb_neon.c          |  8 ++---
 vp9/encoder/arm/neon/vp9_denoiser_neon.c           |  2 +-
 vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c | 14 ++++----
 vp9/encoder/arm/neon/vp9_quantize_neon.c           |  6 ++--
 vpx_dsp/arm/avg_neon.c                             |  2 +-
 vpx_dsp/arm/highbd_avg_neon.c                      |  2 +-
 vpx_dsp/arm/highbd_quantize_neon.c                 |  8 ++---
 vpx_dsp/arm/quantize_neon.c                        |  8 ++---
 vpx_dsp/arm/sum_neon.h                             | 34 +++++++++++-----------
 vpx_dsp/arm/transpose_neon.h                       | 10 +++----
 vpx_dsp/arm/vpx_convolve8_neon.c                   |  6 ++--
 vpx_dsp/arm/vpx_convolve8_neon.h                   |  8 ++---
 12 files changed, 54 insertions(+), 54 deletions(-)

diff --git a/vp8/encoder/arm/neon/fastquantizeb_neon.c b/vp8/encoder/arm/neon/fastquantizeb_neon.c
index 6fc6080..950c943 100644
--- a/vp8/encoder/arm/neon/fastquantizeb_neon.c
+++ b/vp8/encoder/arm/neon/fastquantizeb_neon.c
@@ -28,11 +28,11 @@ void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) {
                    zig_zag1 = vld1q_u16(inv_zig_zag + 8);
   int16x8_t x0, x1, sz0, sz1, y0, y1;
   uint16x8_t eob0, eob1;
-#ifndef __aarch64__
+#if !VPX_ARCH_AARCH64
   uint16x4_t eob_d16;
   uint32x2_t eob_d32;
   uint32x4_t eob_q32;
-#endif  // __arch64__
+#endif  // !VPX_ARCH_AARCH64
 
   /* sign of z: z >> 15 */
   sz0 = vshrq_n_s16(z0, 15);
@@ -70,7 +70,7 @@ void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) {
   /* select the largest value */
   eob0 = vmaxq_u16(eob0, eob1);
 
-#ifdef __aarch64__
+#if VPX_ARCH_AARCH64
   *d->eob = (int8_t)vmaxvq_u16(eob0);
 #else
   eob_d16 = vmax_u16(vget_low_u16(eob0), vget_high_u16(eob0));
@@ -79,7 +79,7 @@ void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) {
   eob_d32 = vpmax_u32(eob_d32, eob_d32);
 
   vst1_lane_s8((int8_t *)d->eob, vreinterpret_s8_u32(eob_d32), 0);
-#endif  // __aarch64__
+#endif  // VPX_ARCH_AARCH64
 
   /* qcoeff = x */
   vst1q_s16(d->qcoeff, x0);
diff --git a/vp9/encoder/arm/neon/vp9_denoiser_neon.c b/vp9/encoder/arm/neon/vp9_denoiser_neon.c
index 53e8c7e..d631cd4 100644
--- a/vp9/encoder/arm/neon/vp9_denoiser_neon.c
+++ b/vp9/encoder/arm/neon/vp9_denoiser_neon.c
@@ -21,7 +21,7 @@
 
 // Compute the sum of all pixel differences of this MB.
 static INLINE int horizontal_add_s8x16(const int8x16_t v_sum_diff_total) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   return vaddlvq_s8(v_sum_diff_total);
 #else
   const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff_total);
diff --git a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c
index 255e6fb..b82b3f9 100644
--- a/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c
+++ b/vp9/encoder/arm/neon/vp9_diamond_search_sad_neon.c
@@ -94,7 +94,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x,
   // Work out the start point for the search
   const uint8_t *best_address = in_what;
   const uint8_t *new_best_address = best_address;
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   int64x2_t v_ba_q = vdupq_n_s64((intptr_t)best_address);
 #else
   int32x4_t v_ba_d = vdupq_n_s32((intptr_t)best_address);
@@ -117,7 +117,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x,
     int8x16_t v_inside_d;
     uint32x4_t v_outside_d;
     int32x4_t v_cost_d, v_sad_d;
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
     int64x2_t v_blocka[2];
 #else
     int32x4_t v_blocka[1];
@@ -138,7 +138,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x,
                                        vreinterpretq_s32_s16(v_these_mv_w)));
 
     // If none of them are inside, then move on
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
     horiz_max = vmaxvq_u32(vreinterpretq_u32_s8(v_inside_d));
 #else
     horiz_max_0 = vmax_u32(vget_low_u32(vreinterpretq_u32_s8(v_inside_d)),
@@ -167,7 +167,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x,
 
     // Compute the SIMD pointer offsets.
     {
-#if defined(__aarch64__)  // sizeof(intptr_t) == 8
+#if VPX_ARCH_AARCH64  // sizeof(intptr_t) == 8
       // Load the offsets
      int64x2_t v_bo10_q = vld1q_s64((const int64_t *)&ss_os[i + 0]);
      int64x2_t v_bo32_q = vld1q_s64((const int64_t *)&ss_os[i + 2]);
@@ -234,7 +234,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x,
     // Find the minimum value and index horizontally in v_sad_d
     {
       uint32_t local_best_sad;
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
       local_best_sad = vminvq_u32(vreinterpretq_u32_s32(v_sad_d));
 #else
       uint32x2_t horiz_min_0 =
@@ -256,7 +256,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x,
       uint32x4_t v_mask_d = vandq_u32(v_sel_d, v_idx_d);
       v_mask_d = vbslq_u32(v_sel_d, v_mask_d, vdupq_n_u32(0xffffffff));
 
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
       local_best_idx = vminvq_u32(v_mask_d);
 #else
       horiz_min_0 =
@@ -280,7 +280,7 @@ int vp9_diamond_search_sad_neon(const MACROBLOCK *x,
     best_address = new_best_address;
 
     v_bmv_w = vreinterpretq_s16_s32(vdupq_n_s32(bmv.as_int));
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
     v_ba_q = vdupq_n_s64((intptr_t)best_address);
 #else
     v_ba_d = vdupq_n_s32((intptr_t)best_address);
diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c
index c2b55fc..97ab136 100644
--- a/vp9/encoder/arm/neon/vp9_quantize_neon.c
+++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c
@@ -50,7 +50,7 @@ static VPX_FORCE_INLINE int16x8_t get_max_lane_eob(const int16_t *iscan_ptr,
 }
 
 static VPX_FORCE_INLINE uint16_t get_max_eob(int16x8_t v_eobmax) {
-#ifdef __aarch64__
+#if VPX_ARCH_AARCH64
   return (uint16_t)vmaxvq_s16(v_eobmax);
 #else
   const int16x4_t v_eobmax_3210 =
@@ -65,7 +65,7 @@ static VPX_FORCE_INLINE uint16_t get_max_eob(int16x8_t v_eobmax) {
       vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
 
   return (uint16_t)vget_lane_s16(v_eobmax_final, 0);
-#endif  // __aarch64__
+#endif  // VPX_ARCH_AARCH64
 }
 
 static VPX_FORCE_INLINE void load_fp_values(const int16_t *round_ptr,
@@ -81,7 +81,7 @@ static VPX_FORCE_INLINE void load_fp_values(const int16_t *round_ptr,
 static VPX_FORCE_INLINE void update_fp_values(int16x8_t *v_round,
                                               int16x8_t *v_quant,
                                               int16x8_t *v_dequant) {
-#ifdef __aarch64__
+#if VPX_ARCH_AARCH64
   *v_round = vdupq_laneq_s16(*v_round, 1);
   *v_quant = vdupq_laneq_s16(*v_quant, 1);
   *v_dequant = vdupq_laneq_s16(*v_dequant, 1);
diff --git a/vpx_dsp/arm/avg_neon.c b/vpx_dsp/arm/avg_neon.c
index d48115d..8c61fc2 100644
--- a/vpx_dsp/arm/avg_neon.c
+++ b/vpx_dsp/arm/avg_neon.c
@@ -210,7 +210,7 @@ void vpx_minmax_8x8_neon(const uint8_t *a, int a_stride, const uint8_t *b,
   const uint8x16_t ab07_max = vmaxq_u8(ab0123_max, ab4567_max);
   const uint8x16_t ab07_min = vminq_u8(ab0123_min, ab4567_min);
 
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   *min = *max = 0;  // Clear high bits
   *((uint8_t *)max) = vmaxvq_u8(ab07_max);
   *((uint8_t *)min) = vminvq_u8(ab07_min);
diff --git a/vpx_dsp/arm/highbd_avg_neon.c b/vpx_dsp/arm/highbd_avg_neon.c
index fc10197..8939ee1 100644
--- a/vpx_dsp/arm/highbd_avg_neon.c
+++ b/vpx_dsp/arm/highbd_avg_neon.c
@@ -114,7 +114,7 @@ void vpx_highbd_minmax_8x8_neon(const uint8_t *a, int a_stride,
   const uint16x8_t min4567 = vminq_u16(min45, min67);
   const uint16x8_t min07 = vminq_u16(min0123, min4567);
 
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   *min = *max = 0;  // Clear high bits
   *((uint16_t *)max) = vmaxvq_u16(max07);
   *((uint16_t *)min) = vminvq_u16(min07);
diff --git a/vpx_dsp/arm/highbd_quantize_neon.c b/vpx_dsp/arm/highbd_quantize_neon.c
index 526447a..d2a7add 100644
--- a/vpx_dsp/arm/highbd_quantize_neon.c
+++ b/vpx_dsp/arm/highbd_quantize_neon.c
@@ -166,7 +166,7 @@ void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
     } while (n_coeffs > 0);
   }
 
-#ifdef __aarch64__
+#if VPX_ARCH_AARCH64
   *eob_ptr = vmaxvq_u16(eob_max);
 #else
   {
@@ -176,7 +176,7 @@ void vpx_highbd_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
     const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
     vst1_lane_u16(eob_ptr, eob_max_2, 0);
   }
-#endif  // __aarch64__
+#endif  // VPX_ARCH_AARCH64
   // Need these here, else the compiler complains about mixing declarations and
   // code in C90
   (void)n_coeffs;
@@ -291,7 +291,7 @@ void vpx_highbd_quantize_b_32x32_neon(
     }
   }
 
-#ifdef __aarch64__
+#if VPX_ARCH_AARCH64
   *eob_ptr = vmaxvq_u16(eob_max);
 #else
   {
@@ -301,5 +301,5 @@ void vpx_highbd_quantize_b_32x32_neon(
     const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
     vst1_lane_u16(eob_ptr, eob_max_2, 0);
   }
-#endif  // __aarch64__
+#endif  // VPX_ARCH_AARCH64
 }
diff --git a/vpx_dsp/arm/quantize_neon.c b/vpx_dsp/arm/quantize_neon.c
index cc8f623..35c67f6 100644
--- a/vpx_dsp/arm/quantize_neon.c
+++ b/vpx_dsp/arm/quantize_neon.c
@@ -134,7 +134,7 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
     } while (n_coeffs > 0);
   }
 
-#ifdef __aarch64__
+#if VPX_ARCH_AARCH64
   *eob_ptr = vmaxvq_u16(eob_max);
 #else
   {
@@ -144,7 +144,7 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
     const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
     vst1_lane_u16(eob_ptr, eob_max_2, 0);
   }
-#endif  // __aarch64__
+#endif  // VPX_ARCH_AARCH64
   // Need these here, else the compiler complains about mixing declarations and
   // code in C90
   (void)scan;
@@ -276,7 +276,7 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr,
     }
   }
 
-#ifdef __aarch64__
+#if VPX_ARCH_AARCH64
   *eob_ptr = vmaxvq_u16(eob_max);
 #else
   {
@@ -286,5 +286,5 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr,
     const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
     vst1_lane_u16(eob_ptr, eob_max_2, 0);
   }
-#endif  // __aarch64__
+#endif  // VPX_ARCH_AARCH64
 }
diff --git a/vpx_dsp/arm/sum_neon.h b/vpx_dsp/arm/sum_neon.h
index a0c72f9..48a2fc0 100644
--- a/vpx_dsp/arm/sum_neon.h
+++ b/vpx_dsp/arm/sum_neon.h
@@ -17,7 +17,7 @@
 #include "vpx/vpx_integer.h"
 
 static INLINE uint16_t horizontal_add_uint8x4(const uint8x8_t a) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   return vaddlv_u8(a);
 #else
   const uint16x4_t b = vpaddl_u8(a);
@@ -27,7 +27,7 @@ static INLINE uint16_t horizontal_add_uint8x4(const uint8x8_t a) {
 }
 
 static INLINE uint16_t horizontal_add_uint8x8(const uint8x8_t a) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   return vaddlv_u8(a);
 #else
   const uint16x4_t b = vpaddl_u8(a);
@@ -38,7 +38,7 @@ static INLINE uint16_t horizontal_add_uint8x8(const uint8x8_t a) {
 }
 
 static INLINE uint16_t horizontal_add_uint8x16(const uint8x16_t a) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   return vaddlvq_u8(a);
 #else
   const uint16x8_t b = vpaddlq_u8(a);
@@ -50,7 +50,7 @@ static INLINE uint16_t horizontal_add_uint8x16(const uint8x16_t a) {
 }
 
 static INLINE uint16_t horizontal_add_uint16x4(const uint16x4_t a) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   return vaddv_u16(a);
 #else
   const uint16x4_t b = vpadd_u16(a, a);
@@ -60,7 +60,7 @@ static INLINE uint16_t horizontal_add_uint16x4(const uint16x4_t a) {
 }
 
 static INLINE int32_t horizontal_add_int16x8(const int16x8_t a) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   return vaddlvq_s16(a);
 #else
   const int32x4_t b = vpaddlq_s16(a);
@@ -72,7 +72,7 @@ static INLINE int32_t horizontal_add_int16x8(const int16x8_t a) {
 }
 
 static INLINE uint32_t horizontal_add_uint16x8(const uint16x8_t a) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   return vaddlvq_u16(a);
 #else
   const uint32x4_t b = vpaddlq_u16(a);
@@ -84,7 +84,7 @@ static INLINE uint32_t horizontal_add_uint16x8(const uint16x8_t a) {
 }
 
 static INLINE uint32x4_t horizontal_add_4d_uint16x8(const uint16x8_t sum[4]) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]);
   const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]);
   const uint16x8_t b0 = vpaddq_u16(a0, a1);
@@ -102,7 +102,7 @@ static INLINE uint32x4_t horizontal_add_4d_uint16x8(const uint16x8_t sum[4]) {
 
 static INLINE uint32_t horizontal_long_add_uint16x8(const uint16x8_t vec_lo,
                                                     const uint16x8_t vec_hi) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   return vaddlvq_u16(vec_lo) + vaddlvq_u16(vec_hi);
 #else
   const uint32x4_t vec_l_lo =
@@ -127,7 +127,7 @@ static INLINE uint32x4_t horizontal_long_add_4d_uint16x8(
   const uint32x4_t b1 = vpadalq_u16(a1, sum_hi[1]);
   const uint32x4_t b2 = vpadalq_u16(a2, sum_hi[2]);
   const uint32x4_t b3 = vpadalq_u16(a3, sum_hi[3]);
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   const uint32x4_t c0 = vpaddq_u32(b0, b1);
   const uint32x4_t c1 = vpaddq_u32(b2, b3);
   return vpaddq_u32(c0, c1);
@@ -143,7 +143,7 @@ static INLINE uint32x4_t horizontal_long_add_4d_uint16x8(
 }
 
 static INLINE int32_t horizontal_add_int32x2(const int32x2_t a) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   return vaddv_s32(a);
 #else
   return vget_lane_s32(a, 0) + vget_lane_s32(a, 1);
@@ -151,7 +151,7 @@ static INLINE int32_t horizontal_add_int32x2(const int32x2_t a) {
 }
 
 static INLINE uint32_t horizontal_add_uint32x2(const uint32x2_t a) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   return vaddv_u32(a);
 #else
   return vget_lane_u32(a, 0) + vget_lane_u32(a, 1);
@@ -159,7 +159,7 @@ static INLINE uint32_t horizontal_add_uint32x2(const uint32x2_t a) {
 }
 
 static INLINE int32_t horizontal_add_int32x4(const int32x4_t a) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   return vaddvq_s32(a);
 #else
   const int64x2_t b = vpaddlq_s32(a);
@@ -170,7 +170,7 @@ static INLINE int32_t horizontal_add_int32x4(const int32x4_t a) {
 }
 
 static INLINE uint32_t horizontal_add_uint32x4(const uint32x4_t a) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   return vaddvq_u32(a);
 #else
   const uint64x2_t b = vpaddlq_u32(a);
@@ -181,7 +181,7 @@ static INLINE uint32_t horizontal_add_uint32x4(const uint32x4_t a) {
 }
 
 static INLINE uint32x4_t horizontal_add_4d_uint32x4(const uint32x4_t sum[4]) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   uint32x4_t res01 = vpaddq_u32(sum[0], sum[1]);
   uint32x4_t res23 = vpaddq_u32(sum[2], sum[3]);
   return vpaddq_u32(res01, res23);
@@ -196,7 +196,7 @@ static INLINE uint32x4_t horizontal_add_4d_uint32x4(const uint32x4_t sum[4]) {
 }
 
 static INLINE uint64_t horizontal_long_add_uint32x4(const uint32x4_t a) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   return vaddlvq_u32(a);
 #else
   const uint64x2_t b = vpaddlq_u32(a);
@@ -205,7 +205,7 @@ static INLINE uint64_t horizontal_long_add_uint32x4(const uint32x4_t a) {
 }
 
 static INLINE int64_t horizontal_add_int64x2(const int64x2_t a) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   return vaddvq_s64(a);
 #else
   return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1);
@@ -213,7 +213,7 @@ static INLINE int64_t horizontal_add_int64x2(const int64x2_t a) {
 }
 
 static INLINE uint64_t horizontal_add_uint64x2(const uint64x2_t a) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   return vaddvq_u64(a);
 #else
   return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1);
diff --git a/vpx_dsp/arm/transpose_neon.h b/vpx_dsp/arm/transpose_neon.h
index 518278f..74f85a6 100644
--- a/vpx_dsp/arm/transpose_neon.h
+++ b/vpx_dsp/arm/transpose_neon.h
@@ -23,7 +23,7 @@
 // b0.val[1]: 04 05 06 07 20 21 22 23
 static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
   int16x8x2_t b0;
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   b0.val[0] = vreinterpretq_s16_s64(
       vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
   b0.val[1] = vreinterpretq_s16_s64(
@@ -39,7 +39,7 @@ static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
 
 static INLINE int32x4x2_t vpx_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) {
   int32x4x2_t b0;
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   b0.val[0] = vreinterpretq_s32_s64(
       vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
   b0.val[1] = vreinterpretq_s32_s64(
@@ -53,7 +53,7 @@ static INLINE int32x4x2_t vpx_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) {
 
 static INLINE int64x2x2_t vpx_vtrnq_s64(int32x4_t a0, int32x4_t a1) {
   int64x2x2_t b0;
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   b0.val[0] = vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1));
   b0.val[1] = vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1));
 #else
@@ -67,7 +67,7 @@ static INLINE int64x2x2_t vpx_vtrnq_s64(int32x4_t a0, int32x4_t a1) {
 
 static INLINE uint8x16x2_t vpx_vtrnq_u64_to_u8(uint32x4_t a0, uint32x4_t a1) {
   uint8x16x2_t b0;
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   b0.val[0] = vreinterpretq_u8_u64(
       vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
   b0.val[1] = vreinterpretq_u8_u64(
@@ -83,7 +83,7 @@ static INLINE uint8x16x2_t vpx_vtrnq_u64_to_u8(uint32x4_t a0, uint32x4_t a1) {
 
 static INLINE uint16x8x2_t vpx_vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) {
   uint16x8x2_t b0;
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
   b0.val[0] = vreinterpretq_u16_u64(
       vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
   b0.val[1] = vreinterpretq_u16_u64(
diff --git a/vpx_dsp/arm/vpx_convolve8_neon.c b/vpx_dsp/arm/vpx_convolve8_neon.c
index b4cdd58..b312cc7 100644
--- a/vpx_dsp/arm/vpx_convolve8_neon.c
+++ b/vpx_dsp/arm/vpx_convolve8_neon.c
@@ -31,7 +31,7 @@
 // instructions. This optimization is much faster in speed unit test, but slowed
 // down the whole decoder by 5%.
 
-#if defined(__aarch64__) && \
+#if VPX_ARCH_AARCH64 && \
     (defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8))
 
 DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
@@ -1261,7 +1261,7 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
 
 #endif  // defined(__ARM_FEATURE_MATMUL_INT8)
 
-#else  // !(defined(__aarch64__) &&
+#else  // !(VPX_ARCH_AARCH64 &&
        //   (defined(__ARM_FEATURE_DOTPROD) ||
        //    defined(__ARM_FEATURE_MATMUL_INT8)))
 
@@ -2105,6 +2105,6 @@ void vpx_convolve8_avg_vert_neon(const uint8_t *src, ptrdiff_t src_stride,
     }
   }
 
-#endif  // #if defined(__aarch64__) &&
+#endif  // #if VPX_ARCH_AARCH64 &&
         // (defined(__ARM_FEATURE_DOTPROD) ||
         // defined(__ARM_FEATURE_MATMUL_INT8))
diff --git a/vpx_dsp/arm/vpx_convolve8_neon.h b/vpx_dsp/arm/vpx_convolve8_neon.h
index ed7f180..07cf824 100644
--- a/vpx_dsp/arm/vpx_convolve8_neon.h
+++ b/vpx_dsp/arm/vpx_convolve8_neon.h
@@ -16,7 +16,7 @@
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 
-#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+#if VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
 
 static INLINE int32x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo,
                                                  const int8x16_t samples_hi,
@@ -114,9 +114,9 @@ static INLINE uint8x8_t convolve8_8_sdot(uint8x16_t samples,
   return vqrshrun_n_s16(sum, 7);
 }
 
-#endif  // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+#endif  // VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
 
-#if defined(__aarch64__) && defined(__ARM_FEATURE_MATMUL_INT8)
+#if VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
 
 static INLINE int32x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo,
                                                   const uint8x16_t samples_hi,
@@ -199,7 +199,7 @@ static INLINE uint8x8_t convolve8_8_usdot(uint8x16_t samples,
   return vqrshrun_n_s16(sum, 7);
 }
 
-#endif  // defined(__aarch64__) && defined(__ARM_FEATURE_MATMUL_INT8)
+#endif  // VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
 
 static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
                                     const int16x4_t s2, const int16x4_t s3,
-- 
2.7.4
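
Note (reviewer sketch, not part of the patch): the reason a generated 0/1
VPX_ARCH_AARCH64 flag works where __aarch64__ does not is that cl.exe
targeting ARM64 defines _M_ARM64 rather than __aarch64__. In libvpx the flag
is written into vpx_config.h by the configure step; the hand-written fallback
below is only an illustration of that idea, showing why the #if / #if !
checks introduced by this patch behave the same under GCC, Clang, and Visual
Studio.

/*
 * Illustrative sketch only: libvpx emits VPX_ARCH_AARCH64 into vpx_config.h
 * at configure time rather than deriving it inline like this.
 */
#ifndef VPX_ARCH_AARCH64
#if defined(__aarch64__) || defined(_M_ARM64) /* GCC/Clang or MSVC (cl.exe) */
#define VPX_ARCH_AARCH64 1
#else
#define VPX_ARCH_AARCH64 0
#endif
#endif

/* Usage mirrors the pattern applied throughout the patch: because the macro
 * is always defined as 0 or 1, it can be tested with plain #if or #if !,
 * independent of which compiler-specific identification macros exist. */
#if VPX_ARCH_AARCH64
/* AArch64-only intrinsics (vaddvq_u32, vmaxvq_u16, ...) are available here. */
#else
/* 32-bit NEON fallback built from pairwise reductions. */
#endif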