[NEON] Improve vpx_quantize_b* functions
author    Konstantinos Margaritis <konma@vectorcamp.gr>
Sat, 20 Aug 2022 19:02:15 +0000 (19:02 +0000)
committer Konstantinos Margaritis <konma@vectorcamp.gr>
Tue, 23 Aug 2022 10:29:01 +0000 (10:29 +0000)
Slight optimization: prefetching gives a 1% improvement in the first pass.
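
A minimal sketch of the prefetch pattern added to both quantize loops
(illustrative only: tran_low_t is shown as int16_t, matching the low-bitdepth
build, and the NEON quantization body is elided):

    #include <stdint.h>

    typedef int16_t tran_low_t; /* 32-bit when CONFIG_VP9_HIGHBITDEPTH */

    static void quantize_loop_sketch(const tran_low_t *coeff_ptr,
                                     intptr_t n_coeffs) {
      do {
        /* ... quantize/dequantize 8 coefficients starting at coeff_ptr ... */

        /* Hint the data for upcoming iterations into cache: 64 elements is
         * 128 bytes (two 64-byte cache lines) ahead in this layout. */
        __builtin_prefetch(coeff_ptr + 64);
        coeff_ptr += 8;
        n_coeffs -= 8;
      } while (n_coeffs > 0);
    }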

Change-Id: Iba4664964664234666406ab53893e02d481fbe61

vpx_dsp/arm/quantize_neon.c

index bd7818a..dcdf588 100644
 
 static INLINE void calculate_dqcoeff_and_store(const int16x8_t qcoeff,
                                                const int16x8_t dequant,
-                                               tran_low_t *dqcoeff) {
+                                               tran_low_t *dqcoeff_ptr) {
+#if CONFIG_VP9_HIGHBITDEPTH
   const int32x4_t dqcoeff_0 =
       vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
   const int32x4_t dqcoeff_1 =
       vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant));
 
-#if CONFIG_VP9_HIGHBITDEPTH
-  vst1q_s32(dqcoeff, dqcoeff_0);
-  vst1q_s32(dqcoeff + 4, dqcoeff_1);
+  vst1q_s32(dqcoeff_ptr, dqcoeff_0);
+  vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1);
 #else
-  vst1q_s16(dqcoeff, vcombine_s16(vmovn_s32(dqcoeff_0), vmovn_s32(dqcoeff_1)));
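+  // A plain 16-bit multiply keeps only the low 16 bits of each product, the
+  // same value the old widen-then-narrow (vmull_s16 + vmovn_s32) path stored.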
+  vst1q_s16(dqcoeff_ptr, vmulq_s16(qcoeff, dequant));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 }
 
+static INLINE int16x8_t
+quantize_b_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+                tran_low_t *dqcoeff_ptr, const int16x8_t zbin,
+                const int16x8_t round, const int16x8_t quant,
+                const int16x8_t quant_shift, const int16x8_t dequant) {
+  // Load coeffs as 8 x 16-bit ints, take sign and abs values
+  const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
+  const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
+  const int16x8_t coeff_abs = vabsq_s16(coeff);
+
+  // Calculate mask of elements outside the zero bin
+  const int16x8_t zbin_mask = vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin));
+
+  // Get the rounded values
+  const int16x8_t rounded = vqaddq_s16(coeff_abs, round);
+
+  // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
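+  // (vqdmulhq_s16 computes the saturating doubling multiply-high, i.e.
+  //  (a * b * 2) >> 16, hence the extra >> 1 to undo the doubling.)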
+  int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
+
+  qcoeff = vaddq_s16(qcoeff, rounded);
+
+  // (qcoeff * quant_shift * 2) >> 16 >> 1 == (qcoeff * quant_shift) >> 16
+  qcoeff = vshrq_n_s16(vqdmulhq_s16(qcoeff, quant_shift), 1);
+
+  // Restore the sign bit.
+  qcoeff = veorq_s16(qcoeff, coeff_sign);
+  qcoeff = vsubq_s16(qcoeff, coeff_sign);
+
+  // Only keep the coeffs outside the zero bin
+  qcoeff = vandq_s16(qcoeff, zbin_mask);
+  store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
+
+  calculate_dqcoeff_and_store(qcoeff, dequant, dqcoeff_ptr);
+
+  return qcoeff;
+}
+
 void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                          const int16_t *zbin_ptr, const int16_t *round_ptr,
                          const int16_t *quant_ptr,
@@ -41,106 +78,61 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
   const int16x8_t one = vdupq_n_s16(1);
   const int16x8_t neg_one = vdupq_n_s16(-1);
   uint16x8_t eob_max;
-  (void)scan;
+
+  // Only the first element of each vector is DC.
+  int16x8_t zbin = vld1q_s16(zbin_ptr);
+  int16x8_t round = vld1q_s16(round_ptr);
+  int16x8_t quant = vld1q_s16(quant_ptr);
+  int16x8_t quant_shift = vld1q_s16(quant_shift_ptr);
+  int16x8_t dequant = vld1q_s16(dequant_ptr);
 
   // Process first 8 values which include a dc component.
   {
-    // Only the first element of each vector is DC.
-    const int16x8_t zbin = vld1q_s16(zbin_ptr);
-    const int16x8_t round = vld1q_s16(round_ptr);
-    const int16x8_t quant = vld1q_s16(quant_ptr);
-    const int16x8_t quant_shift = vld1q_s16(quant_shift_ptr);
-    const int16x8_t dequant = vld1q_s16(dequant_ptr);
     // Add one because the eob does not index from 0.
     const uint16x8_t v_iscan =
         vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));
 
-    const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
-    const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
-    const int16x8_t coeff_abs = vabsq_s16(coeff);
-
-    const int16x8_t zbin_mask =
-        vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin));
-
-    const int16x8_t rounded = vqaddq_s16(coeff_abs, round);
-
-    // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
-    int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
-
-    qcoeff = vaddq_s16(qcoeff, rounded);
-
-    // (qcoeff * quant_shift * 2) >> 16 >> 1 == (qcoeff * quant_shift) >> 16
-    qcoeff = vshrq_n_s16(vqdmulhq_s16(qcoeff, quant_shift), 1);
-
-    // Restore the sign bit.
-    qcoeff = veorq_s16(qcoeff, coeff_sign);
-    qcoeff = vsubq_s16(qcoeff, coeff_sign);
-
-    qcoeff = vandq_s16(qcoeff, zbin_mask);
+    const int16x8_t qcoeff =
+        quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round, quant,
+                        quant_shift, dequant);
 
     // Set non-zero elements to -1 and use that to extract values for eob.
     eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan);
 
+    __builtin_prefetch(coeff_ptr + 64);
     coeff_ptr += 8;
     iscan += 8;
-
-    store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
     qcoeff_ptr += 8;
-
-    calculate_dqcoeff_and_store(qcoeff, dequant, dqcoeff_ptr);
     dqcoeff_ptr += 8;
   }
 
   n_coeffs -= 8;
 
   {
-    const int16x8_t zbin = vdupq_n_s16(zbin_ptr[1]);
-    const int16x8_t round = vdupq_n_s16(round_ptr[1]);
-    const int16x8_t quant = vdupq_n_s16(quant_ptr[1]);
-    const int16x8_t quant_shift = vdupq_n_s16(quant_shift_ptr[1]);
-    const int16x8_t dequant = vdupq_n_s16(dequant_ptr[1]);
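+    // Broadcast the AC (lane 1) values from the vectors already loaded above
+    // instead of reloading the scalars from memory.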
+    zbin = vdupq_lane_s16(vget_low_s16(zbin), 1);
+    round = vdupq_lane_s16(vget_low_s16(round), 1);
+    quant = vdupq_lane_s16(vget_low_s16(quant), 1);
+    quant_shift = vdupq_lane_s16(vget_low_s16(quant_shift), 1);
+    dequant = vdupq_lane_s16(vget_low_s16(dequant), 1);
 
     do {
       // Add one because the eob is not its index.
       const uint16x8_t v_iscan =
           vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));
 
-      const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
-      const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
-      const int16x8_t coeff_abs = vabsq_s16(coeff);
-
-      const int16x8_t zbin_mask =
-          vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin));
-
-      const int16x8_t rounded = vqaddq_s16(coeff_abs, round);
-
-      // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
-      int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
-
-      qcoeff = vaddq_s16(qcoeff, rounded);
-
-      // (qcoeff * quant_shift * 2) >> 16 >> 1 == (qcoeff * quant_shift) >> 16
-      qcoeff = vshrq_n_s16(vqdmulhq_s16(qcoeff, quant_shift), 1);
-
-      // Restore the sign bit.
-      qcoeff = veorq_s16(qcoeff, coeff_sign);
-      qcoeff = vsubq_s16(qcoeff, coeff_sign);
-
-      qcoeff = vandq_s16(qcoeff, zbin_mask);
+      const int16x8_t qcoeff =
+          quantize_b_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round,
+                          quant, quant_shift, dequant);
 
       // Set non-zero elements to -1 and use that to extract values for eob.
       eob_max =
           vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan));
 
+      __builtin_prefetch(coeff_ptr + 64);
       coeff_ptr += 8;
       iscan += 8;
-
-      store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
       qcoeff_ptr += 8;
-
-      calculate_dqcoeff_and_store(qcoeff, dequant, dqcoeff_ptr);
       dqcoeff_ptr += 8;
-
       n_coeffs -= 8;
     } while (n_coeffs > 0);
   }
@@ -156,6 +148,9 @@ void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
     vst1_lane_u16(eob_ptr, eob_max_2, 0);
   }
 #endif  // __aarch64__
+  // Need this here, otherwise the compiler complains about mixing
+  // declarations and code in C90
+  (void)scan;
 }
 
 static INLINE int32x4_t extract_sign_bit(int32x4_t a) {
@@ -164,7 +159,7 @@ static INLINE int32x4_t extract_sign_bit(int32x4_t a) {
 
 static INLINE void calculate_dqcoeff_and_store_32x32(const int16x8_t qcoeff,
                                                      const int16x8_t dequant,
-                                                     tran_low_t *dqcoeff) {
+                                                     tran_low_t *dqcoeff_ptr) {
   int32x4_t dqcoeff_0 = vmull_s16(vget_low_s16(qcoeff), vget_low_s16(dequant));
   int32x4_t dqcoeff_1 =
       vmull_s16(vget_high_s16(qcoeff), vget_high_s16(dequant));
@@ -176,14 +171,51 @@ static INLINE void calculate_dqcoeff_and_store_32x32(const int16x8_t qcoeff,
 #if CONFIG_VP9_HIGHBITDEPTH
   dqcoeff_0 = vshrq_n_s32(dqcoeff_0, 1);
   dqcoeff_1 = vshrq_n_s32(dqcoeff_1, 1);
-  vst1q_s32(dqcoeff, dqcoeff_0);
-  vst1q_s32(dqcoeff + 4, dqcoeff_1);
+  vst1q_s32(dqcoeff_ptr, dqcoeff_0);
+  vst1q_s32(dqcoeff_ptr + 4, dqcoeff_1);
 #else
-  vst1q_s16(dqcoeff,
+  vst1q_s16(dqcoeff_ptr,
             vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 }
 
+static INLINE int16x8_t
+quantize_b_32x32_neon(const tran_low_t *coeff_ptr, tran_low_t *qcoeff_ptr,
+                      tran_low_t *dqcoeff_ptr, const int16x8_t zbin,
+                      const int16x8_t round, const int16x8_t quant,
+                      const int16x8_t quant_shift, const int16x8_t dequant) {
+  // Load coeffs as 8 x 16-bit ints, take sign and abs values
+  const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
+  const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
+  const int16x8_t coeff_abs = vabsq_s16(coeff);
+
+  // Calculate mask of elements outside the zero bin
+  const int16x8_t zbin_mask = vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin));
+
+  // Get the rounded values
+  const int16x8_t rounded = vqaddq_s16(coeff_abs, round);
+
+  // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
+  int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
+
+  qcoeff = vaddq_s16(qcoeff, rounded);
+
+  // (qcoeff * quant_shift * 2) >> 16 == (qcoeff * quant_shift) >> 15
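+  // (Unlike the regular path, there is no extra >> 1 here: the 32x32 variant
+  //  shifts by 15 rather than 16, so the doubling from vqdmulhq_s16 is
+  //  exactly what is wanted.)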
+  qcoeff = vqdmulhq_s16(qcoeff, quant_shift);
+
+  // Restore the sign bit.
+  qcoeff = veorq_s16(qcoeff, coeff_sign);
+  qcoeff = vsubq_s16(qcoeff, coeff_sign);
+
+  // Only keep the coeffs outside the zero bin
+  qcoeff = vandq_s16(qcoeff, zbin_mask);
+  store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
+
+  calculate_dqcoeff_and_store_32x32(qcoeff, dequant, dqcoeff_ptr);
+
+  return qcoeff;
+}
+
 // Main difference is that zbin values are halved before comparison and dqcoeff
 // values are divided by 2. zbin is rounded but dqcoeff is not.
 void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
@@ -198,103 +230,58 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
   const int16x8_t neg_one = vdupq_n_s16(-1);
   uint16x8_t eob_max;
   int i;
-  (void)scan;
-  (void)n_coeffs;  // Because we will always calculate 32*32.
+
+  // Only the first element of each vector is DC.
+  int16x8_t zbin = vrshrq_n_s16(vld1q_s16(zbin_ptr), 1);
+  int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1);
+  int16x8_t quant = vld1q_s16(quant_ptr);
+  int16x8_t quant_shift = vld1q_s16(quant_shift_ptr);
+  int16x8_t dequant = vld1q_s16(dequant_ptr);
 
   // Process first 8 values which include a dc component.
   {
-    // Only the first element of each vector is DC.
-    const int16x8_t zbin = vrshrq_n_s16(vld1q_s16(zbin_ptr), 1);
-    const int16x8_t round = vrshrq_n_s16(vld1q_s16(round_ptr), 1);
-    const int16x8_t quant = vld1q_s16(quant_ptr);
-    const int16x8_t quant_shift = vld1q_s16(quant_shift_ptr);
-    const int16x8_t dequant = vld1q_s16(dequant_ptr);
     // Add one because the eob does not index from 0.
     const uint16x8_t v_iscan =
         vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));
 
-    const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
-    const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
-    const int16x8_t coeff_abs = vabsq_s16(coeff);
-
-    const int16x8_t zbin_mask =
-        vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin));
-
-    const int16x8_t rounded = vqaddq_s16(coeff_abs, round);
-
-    // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
-    int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
-
-    qcoeff = vaddq_s16(qcoeff, rounded);
-
-    // (qcoeff * quant_shift * 2) >> 16 == (qcoeff * quant_shift) >> 15
-    qcoeff = vqdmulhq_s16(qcoeff, quant_shift);
-
-    // Restore the sign bit.
-    qcoeff = veorq_s16(qcoeff, coeff_sign);
-    qcoeff = vsubq_s16(qcoeff, coeff_sign);
-
-    qcoeff = vandq_s16(qcoeff, zbin_mask);
+    const int16x8_t qcoeff =
+        quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round,
+                              quant, quant_shift, dequant);
 
     // Set non-zero elements to -1 and use that to extract values for eob.
     eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan);
 
+    __builtin_prefetch(coeff_ptr + 64);
     coeff_ptr += 8;
     iscan += 8;
-
-    store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
     qcoeff_ptr += 8;
-
-    calculate_dqcoeff_and_store_32x32(qcoeff, dequant, dqcoeff_ptr);
     dqcoeff_ptr += 8;
   }
 
   {
-    const int16x8_t zbin = vrshrq_n_s16(vdupq_n_s16(zbin_ptr[1]), 1);
-    const int16x8_t round = vrshrq_n_s16(vdupq_n_s16(round_ptr[1]), 1);
-    const int16x8_t quant = vdupq_n_s16(quant_ptr[1]);
-    const int16x8_t quant_shift = vdupq_n_s16(quant_shift_ptr[1]);
-    const int16x8_t dequant = vdupq_n_s16(dequant_ptr[1]);
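+    // As above, broadcast the AC (lane 1) values from the already-loaded
+    // (and, for zbin/round, already-halved) vectors instead of reloading.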
+    zbin = vdupq_lane_s16(vget_low_s16(zbin), 1);
+    round = vdupq_lane_s16(vget_low_s16(round), 1);
+    quant = vdupq_lane_s16(vget_low_s16(quant), 1);
+    quant_shift = vdupq_lane_s16(vget_low_s16(quant_shift), 1);
+    dequant = vdupq_lane_s16(vget_low_s16(dequant), 1);
 
     for (i = 1; i < 32 * 32 / 8; ++i) {
       // Add one because the eob is not its index.
       const uint16x8_t v_iscan =
           vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one));
 
-      const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr);
-      const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15);
-      const int16x8_t coeff_abs = vabsq_s16(coeff);
-
-      const int16x8_t zbin_mask =
-          vreinterpretq_s16_u16(vcgeq_s16(coeff_abs, zbin));
-
-      const int16x8_t rounded = vqaddq_s16(coeff_abs, round);
-
-      // (round * quant * 2) >> 16 >> 1 == (round * quant) >> 16
-      int16x8_t qcoeff = vshrq_n_s16(vqdmulhq_s16(rounded, quant), 1);
-
-      qcoeff = vaddq_s16(qcoeff, rounded);
-
-      // (qcoeff * quant_shift * 2) >> 16 == (qcoeff * quant_shift) >> 15
-      qcoeff = vqdmulhq_s16(qcoeff, quant_shift);
-
-      // Restore the sign bit.
-      qcoeff = veorq_s16(qcoeff, coeff_sign);
-      qcoeff = vsubq_s16(qcoeff, coeff_sign);
-
-      qcoeff = vandq_s16(qcoeff, zbin_mask);
+      const int16x8_t qcoeff =
+          quantize_b_32x32_neon(coeff_ptr, qcoeff_ptr, dqcoeff_ptr, zbin, round,
+                                quant, quant_shift, dequant);
 
       // Set non-zero elements to -1 and use that to extract values for eob.
       eob_max =
           vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan));
 
+      __builtin_prefetch(coeff_ptr + 64);
       coeff_ptr += 8;
       iscan += 8;
-
-      store_s16q_to_tran_low(qcoeff_ptr, qcoeff);
       qcoeff_ptr += 8;
-
-      calculate_dqcoeff_and_store_32x32(qcoeff, dequant, dqcoeff_ptr);
       dqcoeff_ptr += 8;
     }
   }
@@ -310,4 +297,8 @@ void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
     vst1_lane_u16(eob_ptr, eob_max_2, 0);
   }
 #endif  // __aarch64__
+  // Need these here, else the compiler complains about mixing declarations and
+  // code in C90
+  (void)n_coeffs;
+  (void)scan;
 }