quantize: standardize vp9_quantize_fp_sse2
author Johann <johann.koenig@gmail.com>
Sat, 24 Sep 2022 01:53:05 +0000 (10:53 +0900)
committer Johann Koenig <johannkoenig@google.com>
Mon, 26 Sep 2022 22:10:35 +0000 (22:10 +0000)
Match style for vpx_quantize_b_sse2 and prepare to rewrite
ssse3 version in intrinsics.

Need to evaluate the value of threshold breakout before
going further.

Change-Id: I9cfceb1bb0dc237cd6b73fc8d41d78bba444a15b

vp9/encoder/x86/vp9_quantize_sse2.c
vpx_dsp/x86/quantize_sse2.h

index 0fd0dcc..da4cd9e 100644 (file)
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
+#include "vpx_dsp/x86/quantize_sse2.h"
 
 void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                           const int16_t *round_ptr, const int16_t *quant_ptr,
                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                           const int16_t *dequant_ptr, uint16_t *eob_ptr,
                           const int16_t *scan, const int16_t *iscan) {
-  __m128i zero;
+  const __m128i zero = _mm_setzero_si128();
   __m128i thr;
   int nzflag;
   __m128i eob;
@@ -35,159 +36,105 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
   qcoeff_ptr += n_coeffs;
   dqcoeff_ptr += n_coeffs;
   n_coeffs = -n_coeffs;
-  zero = _mm_setzero_si128();
+
+  // Setup global values.
+  load_fp_values(round_ptr, &round, quant_ptr, &quant, dequant_ptr, &dequant);
 
   {
     __m128i coeff0, coeff1;
+    __m128i coeff0_sign, coeff1_sign;
+    __m128i qcoeff0, qcoeff1;
+    // Do DC and first 15 AC.
+    coeff0 = load_tran_low(coeff_ptr + n_coeffs);
+    coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);
 
-    // Setup global values
-    {
-      round = _mm_load_si128((const __m128i *)round_ptr);
-      quant = _mm_load_si128((const __m128i *)quant_ptr);
-      dequant = _mm_load_si128((const __m128i *)dequant_ptr);
-    }
+    // Poor man's abs().
+    coeff0_sign = _mm_srai_epi16(coeff0, 15);
+    coeff1_sign = _mm_srai_epi16(coeff1, 15);
+    qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+    qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
+
+    qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+    qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant);
+
+    round = _mm_unpackhi_epi64(round, round);
+    quant = _mm_unpackhi_epi64(quant, quant);
+
+    qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+    qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant);
+
+    // Reinsert signs.
+    qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+    qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
+
+    store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
+    store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);
+
+    qcoeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+    dequant = _mm_unpackhi_epi64(dequant, dequant);
+    qcoeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+    store_tran_low(qcoeff0, dqcoeff_ptr + n_coeffs);
+    store_tran_low(qcoeff1, dqcoeff_ptr + n_coeffs + 8);
+
+    eob = scan_for_eob(&qcoeff0, &qcoeff1, iscan + n_coeffs, 0, zero);
+
+    n_coeffs += 8 * 2;
+  }
+
+  thr = _mm_srai_epi16(dequant, 1);
+
+  // AC only loop.
+  while (n_coeffs < 0) {
+    __m128i coeff0, coeff1;
+    __m128i coeff0_sign, coeff1_sign;
+    __m128i qcoeff0, qcoeff1;
+
+    coeff0 = load_tran_low(coeff_ptr + n_coeffs);
+    coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);
+
+    // Poor man's abs().
+    coeff0_sign = _mm_srai_epi16(coeff0, 15);
+    coeff1_sign = _mm_srai_epi16(coeff1, 15);
+    qcoeff0 = invert_sign_sse2(coeff0, coeff0_sign);
+    qcoeff1 = invert_sign_sse2(coeff1, coeff1_sign);
 
-    {
-      __m128i coeff0_sign, coeff1_sign;
-      __m128i qcoeff0, qcoeff1;
-      __m128i qtmp0, qtmp1;
-      // Do DC and first 15 AC
-      coeff0 = load_tran_low(coeff_ptr + n_coeffs);
-      coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);
-
-      // Poor man's sign extract
-      coeff0_sign = _mm_srai_epi16(coeff0, 15);
-      coeff1_sign = _mm_srai_epi16(coeff1, 15);
-      qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
-      qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
-      qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-      qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+    nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
+             _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
 
+    if (nzflag) {
       qcoeff0 = _mm_adds_epi16(qcoeff0, round);
-      round = _mm_unpackhi_epi64(round, round);
       qcoeff1 = _mm_adds_epi16(qcoeff1, round);
-      qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
-      quant = _mm_unpackhi_epi64(quant, quant);
-      qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+      qcoeff0 = _mm_mulhi_epi16(qcoeff0, quant);
+      qcoeff1 = _mm_mulhi_epi16(qcoeff1, quant);
 
-      // Reinsert signs
-      qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
-      qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
-      qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-      qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+      // Reinsert signs.
+      qcoeff0 = invert_sign_sse2(qcoeff0, coeff0_sign);
+      qcoeff1 = invert_sign_sse2(qcoeff1, coeff1_sign);
 
       store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
       store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);
 
       coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
-      dequant = _mm_unpackhi_epi64(dequant, dequant);
       coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
 
       store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
       store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
-    }
-
-    {
-      // Scan for eob
-      __m128i zero_coeff0, zero_coeff1;
-      __m128i nzero_coeff0, nzero_coeff1;
-      __m128i iscan0, iscan1;
-      __m128i eob1;
-      zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
-      zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
-      nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
-      nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
-      iscan0 = _mm_load_si128((const __m128i *)(iscan + n_coeffs));
-      iscan1 = _mm_load_si128((const __m128i *)(iscan + n_coeffs) + 1);
-      eob = _mm_and_si128(iscan0, nzero_coeff0);
-      eob1 = _mm_and_si128(iscan1, nzero_coeff1);
-      eob = _mm_max_epi16(eob, eob1);
-    }
-    n_coeffs += 8 * 2;
-  }
-
-  thr = _mm_srai_epi16(dequant, 1);
+    } else {
+      store_zero_tran_low(qcoeff_ptr + n_coeffs);
+      store_zero_tran_low(qcoeff_ptr + n_coeffs + 8);
 
-  // AC only loop
-  while (n_coeffs < 0) {
-    __m128i coeff0, coeff1;
-    {
-      __m128i coeff0_sign, coeff1_sign;
-      __m128i qcoeff0, qcoeff1;
-      __m128i qtmp0, qtmp1;
-
-      coeff0 = load_tran_low(coeff_ptr + n_coeffs);
-      coeff1 = load_tran_low(coeff_ptr + n_coeffs + 8);
-
-      // Poor man's sign extract
-      coeff0_sign = _mm_srai_epi16(coeff0, 15);
-      coeff1_sign = _mm_srai_epi16(coeff1, 15);
-      qcoeff0 = _mm_xor_si128(coeff0, coeff0_sign);
-      qcoeff1 = _mm_xor_si128(coeff1, coeff1_sign);
-      qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-      qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
-      nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
-               _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
-
-      if (nzflag) {
-        qcoeff0 = _mm_adds_epi16(qcoeff0, round);
-        qcoeff1 = _mm_adds_epi16(qcoeff1, round);
-        qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
-        qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
-
-        // Reinsert signs
-        qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
-        qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
-        qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
-        qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
-
-        store_tran_low(qcoeff0, qcoeff_ptr + n_coeffs);
-        store_tran_low(qcoeff1, qcoeff_ptr + n_coeffs + 8);
-
-        coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
-        coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
-
-        store_tran_low(coeff0, dqcoeff_ptr + n_coeffs);
-        store_tran_low(coeff1, dqcoeff_ptr + n_coeffs + 8);
-      } else {
-        store_zero_tran_low(qcoeff_ptr + n_coeffs);
-        store_zero_tran_low(qcoeff_ptr + n_coeffs + 8);
-
-        store_zero_tran_low(dqcoeff_ptr + n_coeffs);
-        store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8);
-      }
+      store_zero_tran_low(dqcoeff_ptr + n_coeffs);
+      store_zero_tran_low(dqcoeff_ptr + n_coeffs + 8);
     }
 
     if (nzflag) {
-      // Scan for eob
-      __m128i zero_coeff0, zero_coeff1;
-      __m128i nzero_coeff0, nzero_coeff1;
-      __m128i iscan0, iscan1;
-      __m128i eob0, eob1;
-      zero_coeff0 = _mm_cmpeq_epi16(coeff0, zero);
-      zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero);
-      nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero);
-      nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero);
-      iscan0 = _mm_load_si128((const __m128i *)(iscan + n_coeffs));
-      iscan1 = _mm_load_si128((const __m128i *)(iscan + n_coeffs) + 1);
-      eob0 = _mm_and_si128(iscan0, nzero_coeff0);
-      eob1 = _mm_and_si128(iscan1, nzero_coeff1);
-      eob0 = _mm_max_epi16(eob0, eob1);
+      const __m128i eob0 =
+          scan_for_eob(&coeff0, &coeff1, iscan + n_coeffs, 0, zero);
       eob = _mm_max_epi16(eob, eob0);
     }
     n_coeffs += 8 * 2;
   }
 
-  // Accumulate EOB
-  {
-    __m128i eob_shuffled;
-    eob_shuffled = _mm_shuffle_epi32(eob, 0xe);
-    eob = _mm_max_epi16(eob, eob_shuffled);
-    eob_shuffled = _mm_shufflelo_epi16(eob, 0xe);
-    eob = _mm_max_epi16(eob, eob_shuffled);
-    eob_shuffled = _mm_shufflelo_epi16(eob, 0x1);
-    eob = _mm_max_epi16(eob, eob_shuffled);
-    *eob_ptr = _mm_extract_epi16(eob, 1);
-  }
+  *eob_ptr = accumulate_eob(eob);
 }
index 580dd88..27bfb4e 100644 (file)
@@ -29,6 +29,15 @@ static INLINE void load_b_values(const int16_t *zbin_ptr, __m128i *zbin,
   *shift = _mm_load_si128((const __m128i *)shift_ptr);
 }
 
+static INLINE void load_fp_values(const int16_t *round_ptr, __m128i *round,
+                                  const int16_t *quant_ptr, __m128i *quant,
+                                  const int16_t *dequant_ptr,
+                                  __m128i *dequant) {
+  *round = _mm_load_si128((const __m128i *)round_ptr);
+  *quant = _mm_load_si128((const __m128i *)quant_ptr);
+  *dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+}
+
 // With ssse3 and later abs() and sign() are preferred.
 static INLINE __m128i invert_sign_sse2(__m128i a, __m128i sign) {
   a = _mm_xor_si128(a, sign);