zig_zag1 = vld1q_u16(inv_zig_zag + 8);
int16x8_t x0, x1, sz0, sz1, y0, y1;
uint16x8_t eob0, eob1;
-#ifndef __aarch64__
+#if !VPX_ARCH_AARCH64
uint16x4_t eob_d16;
uint32x2_t eob_d32;
uint32x4_t eob_q32;
-#endif // __arch64__
+#endif // !VPX_ARCH_AARCH64
/* sign of z: z >> 15 */
sz0 = vshrq_n_s16(z0, 15);
/* select the largest value */
eob0 = vmaxq_u16(eob0, eob1);
-#ifdef __aarch64__
+#if VPX_ARCH_AARCH64
*d->eob = (int8_t)vmaxvq_u16(eob0);
#else
eob_d16 = vmax_u16(vget_low_u16(eob0), vget_high_u16(eob0));
eob_d32 = vpmax_u32(eob_d32, eob_d32);
vst1_lane_s8((int8_t *)d->eob, vreinterpret_s8_u32(eob_d32), 0);
-#endif // __aarch64__
+#endif // VPX_ARCH_AARCH64
/* qcoeff = x */
vst1q_s16(d->qcoeff, x0);
// Compute the sum of all pixel differences of this MB.
static INLINE int horizontal_add_s8x16(const int8x16_t v_sum_diff_total) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
return vaddlvq_s8(v_sum_diff_total);
#else
const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff_total);
// Work out the start point for the search
const uint8_t *best_address = in_what;
const uint8_t *new_best_address = best_address;
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
int64x2_t v_ba_q = vdupq_n_s64((intptr_t)best_address);
#else
int32x4_t v_ba_d = vdupq_n_s32((intptr_t)best_address);
int8x16_t v_inside_d;
uint32x4_t v_outside_d;
int32x4_t v_cost_d, v_sad_d;
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
int64x2_t v_blocka[2];
#else
int32x4_t v_blocka[1];
vreinterpretq_s32_s16(v_these_mv_w)));
// If none of them are inside, then move on
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
horiz_max = vmaxvq_u32(vreinterpretq_u32_s8(v_inside_d));
#else
horiz_max_0 = vmax_u32(vget_low_u32(vreinterpretq_u32_s8(v_inside_d)),
// Compute the SIMD pointer offsets.
{
-#if defined(__aarch64__) // sizeof(intptr_t) == 8
+#if VPX_ARCH_AARCH64 // sizeof(intptr_t) == 8
// Load the offsets
int64x2_t v_bo10_q = vld1q_s64((const int64_t *)&ss_os[i + 0]);
int64x2_t v_bo32_q = vld1q_s64((const int64_t *)&ss_os[i + 2]);
// Find the minimum value and index horizontally in v_sad_d
{
uint32_t local_best_sad;
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
local_best_sad = vminvq_u32(vreinterpretq_u32_s32(v_sad_d));
#else
uint32x2_t horiz_min_0 =
uint32x4_t v_mask_d = vandq_u32(v_sel_d, v_idx_d);
v_mask_d = vbslq_u32(v_sel_d, v_mask_d, vdupq_n_u32(0xffffffff));
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
local_best_idx = vminvq_u32(v_mask_d);
#else
horiz_min_0 =
best_address = new_best_address;
v_bmv_w = vreinterpretq_s16_s32(vdupq_n_s32(bmv.as_int));
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
v_ba_q = vdupq_n_s64((intptr_t)best_address);
#else
v_ba_d = vdupq_n_s32((intptr_t)best_address);
}
static VPX_FORCE_INLINE uint16_t get_max_eob(int16x8_t v_eobmax) {
-#ifdef __aarch64__
+#if VPX_ARCH_AARCH64
return (uint16_t)vmaxvq_s16(v_eobmax);
#else
const int16x4_t v_eobmax_3210 =
vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
return (uint16_t)vget_lane_s16(v_eobmax_final, 0);
-#endif // __aarch64__
+#endif // VPX_ARCH_AARCH64
}
static VPX_FORCE_INLINE void load_fp_values(const int16_t *round_ptr,
static VPX_FORCE_INLINE void update_fp_values(int16x8_t *v_round,
int16x8_t *v_quant,
int16x8_t *v_dequant) {
-#ifdef __aarch64__
+#if VPX_ARCH_AARCH64
*v_round = vdupq_laneq_s16(*v_round, 1);
*v_quant = vdupq_laneq_s16(*v_quant, 1);
*v_dequant = vdupq_laneq_s16(*v_dequant, 1);
const uint8x16_t ab07_max = vmaxq_u8(ab0123_max, ab4567_max);
const uint8x16_t ab07_min = vminq_u8(ab0123_min, ab4567_min);
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
*min = *max = 0; // Clear high bits
*((uint8_t *)max) = vmaxvq_u8(ab07_max);
*((uint8_t *)min) = vminvq_u8(ab07_min);
const uint16x8_t min4567 = vminq_u16(min45, min67);
const uint16x8_t min07 = vminq_u16(min0123, min4567);
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
*min = *max = 0; // Clear high bits
*((uint16_t *)max) = vmaxvq_u16(max07);
*((uint16_t *)min) = vminvq_u16(min07);
} while (n_coeffs > 0);
}
-#ifdef __aarch64__
+#if VPX_ARCH_AARCH64
*eob_ptr = vmaxvq_u16(eob_max);
#else
{
const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
vst1_lane_u16(eob_ptr, eob_max_2, 0);
}
-#endif // __aarch64__
+#endif // VPX_ARCH_AARCH64
// Need these here, else the compiler complains about mixing declarations and
// code in C90
(void)n_coeffs;
}
}
-#ifdef __aarch64__
+#if VPX_ARCH_AARCH64
*eob_ptr = vmaxvq_u16(eob_max);
#else
{
const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
vst1_lane_u16(eob_ptr, eob_max_2, 0);
}
-#endif // __aarch64__
+#endif // VPX_ARCH_AARCH64
}
} while (n_coeffs > 0);
}
-#ifdef __aarch64__
+#if VPX_ARCH_AARCH64
*eob_ptr = vmaxvq_u16(eob_max);
#else
{
const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
vst1_lane_u16(eob_ptr, eob_max_2, 0);
}
-#endif // __aarch64__
+#endif // VPX_ARCH_AARCH64
// Need these here, else the compiler complains about mixing declarations and
// code in C90
(void)scan;
}
}
-#ifdef __aarch64__
+#if VPX_ARCH_AARCH64
*eob_ptr = vmaxvq_u16(eob_max);
#else
{
const uint16x4_t eob_max_2 = vpmax_u16(eob_max_1, eob_max_1);
vst1_lane_u16(eob_ptr, eob_max_2, 0);
}
-#endif // __aarch64__
+#endif // VPX_ARCH_AARCH64
}
#include "vpx/vpx_integer.h"
static INLINE uint16_t horizontal_add_uint8x4(const uint8x8_t a) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
return vaddlv_u8(a);
#else
const uint16x4_t b = vpaddl_u8(a);
}
static INLINE uint16_t horizontal_add_uint8x8(const uint8x8_t a) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
return vaddlv_u8(a);
#else
const uint16x4_t b = vpaddl_u8(a);
}
static INLINE uint16_t horizontal_add_uint8x16(const uint8x16_t a) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
return vaddlvq_u8(a);
#else
const uint16x8_t b = vpaddlq_u8(a);
}
static INLINE uint16_t horizontal_add_uint16x4(const uint16x4_t a) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
return vaddv_u16(a);
#else
const uint16x4_t b = vpadd_u16(a, a);
}
static INLINE int32_t horizontal_add_int16x8(const int16x8_t a) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
return vaddlvq_s16(a);
#else
const int32x4_t b = vpaddlq_s16(a);
}
static INLINE uint32_t horizontal_add_uint16x8(const uint16x8_t a) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
return vaddlvq_u16(a);
#else
const uint32x4_t b = vpaddlq_u16(a);
}
static INLINE uint32x4_t horizontal_add_4d_uint16x8(const uint16x8_t sum[4]) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]);
const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]);
const uint16x8_t b0 = vpaddq_u16(a0, a1);
static INLINE uint32_t horizontal_long_add_uint16x8(const uint16x8_t vec_lo,
const uint16x8_t vec_hi) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
return vaddlvq_u16(vec_lo) + vaddlvq_u16(vec_hi);
#else
const uint32x4_t vec_l_lo =
const uint32x4_t b1 = vpadalq_u16(a1, sum_hi[1]);
const uint32x4_t b2 = vpadalq_u16(a2, sum_hi[2]);
const uint32x4_t b3 = vpadalq_u16(a3, sum_hi[3]);
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
const uint32x4_t c0 = vpaddq_u32(b0, b1);
const uint32x4_t c1 = vpaddq_u32(b2, b3);
return vpaddq_u32(c0, c1);
}
static INLINE int32_t horizontal_add_int32x2(const int32x2_t a) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
return vaddv_s32(a);
#else
return vget_lane_s32(a, 0) + vget_lane_s32(a, 1);
}
static INLINE uint32_t horizontal_add_uint32x2(const uint32x2_t a) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
return vaddv_u32(a);
#else
return vget_lane_u32(a, 0) + vget_lane_u32(a, 1);
}
static INLINE int32_t horizontal_add_int32x4(const int32x4_t a) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
return vaddvq_s32(a);
#else
const int64x2_t b = vpaddlq_s32(a);
}
static INLINE uint32_t horizontal_add_uint32x4(const uint32x4_t a) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
return vaddvq_u32(a);
#else
const uint64x2_t b = vpaddlq_u32(a);
}
static INLINE uint32x4_t horizontal_add_4d_uint32x4(const uint32x4_t sum[4]) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
uint32x4_t res01 = vpaddq_u32(sum[0], sum[1]);
uint32x4_t res23 = vpaddq_u32(sum[2], sum[3]);
return vpaddq_u32(res01, res23);
}
static INLINE uint64_t horizontal_long_add_uint32x4(const uint32x4_t a) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
return vaddlvq_u32(a);
#else
const uint64x2_t b = vpaddlq_u32(a);
}
static INLINE int64_t horizontal_add_int64x2(const int64x2_t a) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
return vaddvq_s64(a);
#else
return vgetq_lane_s64(a, 0) + vgetq_lane_s64(a, 1);
}
static INLINE uint64_t horizontal_add_uint64x2(const uint64x2_t a) {
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
return vaddvq_u64(a);
#else
return vgetq_lane_u64(a, 0) + vgetq_lane_u64(a, 1);
// b0.val[1]: 04 05 06 07 20 21 22 23
static INLINE int16x8x2_t vpx_vtrnq_s64_to_s16(int32x4_t a0, int32x4_t a1) {
int16x8x2_t b0;
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
b0.val[0] = vreinterpretq_s16_s64(
vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
b0.val[1] = vreinterpretq_s16_s64(
static INLINE int32x4x2_t vpx_vtrnq_s64_to_s32(int32x4_t a0, int32x4_t a1) {
int32x4x2_t b0;
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
b0.val[0] = vreinterpretq_s32_s64(
vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1)));
b0.val[1] = vreinterpretq_s32_s64(
static INLINE int64x2x2_t vpx_vtrnq_s64(int32x4_t a0, int32x4_t a1) {
int64x2x2_t b0;
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
b0.val[0] = vtrn1q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1));
b0.val[1] = vtrn2q_s64(vreinterpretq_s64_s32(a0), vreinterpretq_s64_s32(a1));
#else
static INLINE uint8x16x2_t vpx_vtrnq_u64_to_u8(uint32x4_t a0, uint32x4_t a1) {
uint8x16x2_t b0;
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
b0.val[0] = vreinterpretq_u8_u64(
vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
b0.val[1] = vreinterpretq_u8_u64(
static INLINE uint16x8x2_t vpx_vtrnq_u64_to_u16(uint32x4_t a0, uint32x4_t a1) {
uint16x8x2_t b0;
-#if defined(__aarch64__)
+#if VPX_ARCH_AARCH64
b0.val[0] = vreinterpretq_u16_u64(
vtrn1q_u64(vreinterpretq_u64_u32(a0), vreinterpretq_u64_u32(a1)));
b0.val[1] = vreinterpretq_u16_u64(
// instructions. This optimization is much faster in the speed unit test, but
// slowed down the whole decoder by 5%.
-#if defined(__aarch64__) && \
+#if VPX_ARCH_AARCH64 && \
(defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8))
DECLARE_ALIGNED(16, static const uint8_t, dot_prod_permute_tbl[48]) = {
#endif // defined(__ARM_FEATURE_MATMUL_INT8)
-#else // !(defined(__aarch64__) &&
+#else // !(VPX_ARCH_AARCH64 &&
// (defined(__ARM_FEATURE_DOTPROD) ||
// defined(__ARM_FEATURE_MATMUL_INT8)))
}
}
-#endif // #if defined(__aarch64__) &&
+#endif // #if VPX_ARCH_AARCH64 &&
// (defined(__ARM_FEATURE_DOTPROD) ||
// defined(__ARM_FEATURE_MATMUL_INT8))
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
-#if defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+#if VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
static INLINE int32x4_t convolve8_4_sdot_partial(const int8x16_t samples_lo,
const int8x16_t samples_hi,
return vqrshrun_n_s16(sum, 7);
}
-#endif // defined(__aarch64__) && defined(__ARM_FEATURE_DOTPROD)
+#endif // VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_DOTPROD)
-#if defined(__aarch64__) && defined(__ARM_FEATURE_MATMUL_INT8)
+#if VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
static INLINE int32x4_t convolve8_4_usdot_partial(const uint8x16_t samples_lo,
const uint8x16_t samples_hi,
return vqrshrun_n_s16(sum, 7);
}
-#endif // defined(__aarch64__) && defined(__ARM_FEATURE_MATMUL_INT8)
+#endif // VPX_ARCH_AARCH64 && defined(__ARM_FEATURE_MATMUL_INT8)
static INLINE int16x4_t convolve8_4(const int16x4_t s0, const int16x4_t s1,
const int16x4_t s2, const int16x4_t s3,