From 2200039d33c49a9f7a5c438656df143755b022c4 Mon Sep 17 00:00:00 2001
From: Johann <johannkoenig@google.com>
Date: Wed, 30 Mar 2022 14:57:46 +0900
Subject: [PATCH] quantize: replace highbd versions

The optimized quantize functions were already built to handle
highbd values. The only difference is the clamping. All highbd
functions expand to 32 bits when running in highbd mode.

Removes vpx_highbd_quantize_b_32x32_sse2 as it is slower than the
C version in the worst case.

Bug: webm:1586
Change-Id: I49bf8a6a2041f78450bf43a4f655c67656b0f8d9
---
 test/vp9_quantize_test.cc                 |  42 +++++----
 vp9/encoder/vp9_encodemb.c                |  48 +++++-----
 vp9/encoder/vp9_quantize.c                |   8 --
 vpx_dsp/quantize.h                        |  16 ++++
 vpx_dsp/vpx_dsp.mk                        |   3 -
 vpx_dsp/vpx_dsp_rtcd_defs.pl              |   8 --
 vpx_dsp/x86/highbd_quantize_intrin_sse2.c | 152 ------------------------------
 7 files changed, 66 insertions(+), 211 deletions(-)
 delete mode 100644 vpx_dsp/x86/highbd_quantize_intrin_sse2.c

diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc
index d54f1bc..5773cd9 100644
--- a/test/vp9_quantize_test.cc
+++ b/test/vp9_quantize_test.cc
@@ -30,6 +30,7 @@
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/msvc.h"
 #include "vpx_ports/vpx_timer.h"
+#include "vpx_dsp/quantize.h"
 
 using libvpx_test::ACMRandom;
 using libvpx_test::Buffer;
@@ -464,22 +465,12 @@ using std::make_tuple;
 #if CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_SUITE_P(
     SSE2, VP9QuantizeTest,
-    ::testing::Values(
-        make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c, VPX_BITS_8, 16,
-                   false),
-        make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c,
-                   VPX_BITS_8, 16, false),
-        make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c,
-                   VPX_BITS_10, 16, false),
-        make_tuple(&vpx_highbd_quantize_b_sse2, &vpx_highbd_quantize_b_c,
-                   VPX_BITS_12, 16, false),
-        make_tuple(&vpx_highbd_quantize_b_32x32_sse2,
-                   &vpx_highbd_quantize_b_32x32_c, VPX_BITS_8, 32, false),
-        make_tuple(&vpx_highbd_quantize_b_32x32_sse2,
-                   &vpx_highbd_quantize_b_32x32_c, VPX_BITS_10, 32, false),
-        make_tuple(&vpx_highbd_quantize_b_32x32_sse2,
-                   &vpx_highbd_quantize_b_32x32_c, VPX_BITS_12, 32, false)));
-
+    ::testing::Values(make_tuple(&vpx_quantize_b_sse2, &vpx_quantize_b_c,
+                                 VPX_BITS_8, 16, false),
+                      make_tuple(&vpx_quantize_b_sse2, &vpx_highbd_quantize_b_c,
+                                 VPX_BITS_10, 16, false),
+                      make_tuple(&vpx_quantize_b_sse2, &vpx_highbd_quantize_b_c,
+                                 VPX_BITS_12, 16, false)));
 #else
 INSTANTIATE_TEST_SUITE_P(
     SSE2, VP9QuantizeTest,
@@ -519,6 +510,24 @@ INSTANTIATE_TEST_SUITE_P(
 #endif  // HAVE_SSSE3
 
 #if HAVE_AVX
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(
+    AVX, VP9QuantizeTest,
+    ::testing::Values(
+        make_tuple(&vpx_quantize_b_avx, &vpx_quantize_b_c, VPX_BITS_8, 16,
+                   false),
+        make_tuple(&vpx_quantize_b_avx, &vpx_highbd_quantize_b_c, VPX_BITS_10,
+                   16, false),
+        make_tuple(&vpx_quantize_b_avx, &vpx_highbd_quantize_b_c, VPX_BITS_12,
+                   16, false),
+        make_tuple(&vpx_quantize_b_32x32_avx, &vpx_highbd_quantize_b_32x32_c,
+                   VPX_BITS_8, 32, false),
+        make_tuple(&vpx_quantize_b_32x32_avx, &vpx_highbd_quantize_b_32x32_c,
+                   VPX_BITS_10, 32, false),
+        make_tuple(&vpx_quantize_b_32x32_avx, &vpx_highbd_quantize_b_32x32_c,
+                   VPX_BITS_12, 32, false)));
+
+#else
 INSTANTIATE_TEST_SUITE_P(AVX, VP9QuantizeTest,
                          ::testing::Values(make_tuple(&vpx_quantize_b_avx,
                                                       &vpx_quantize_b_c,
@@ -526,6 +535,7 @@ INSTANTIATE_TEST_SUITE_P(AVX, VP9QuantizeTest,
                                            make_tuple(&vpx_quantize_b_32x32_avx,
                                                       &vpx_quantize_b_32x32_c,
                                                       VPX_BITS_8, 32, false)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 #endif  // HAVE_AVX
 
 #if VPX_ARCH_X86_64 && HAVE_AVX2
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index fa222f9..e708555 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -511,28 +511,28 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col,
     switch (tx_size) {
       case TX_32X32:
         highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
-        vpx_highbd_quantize_b_32x32(
-            coeff, 1024, p->zbin, p->round, p->quant, p->quant_shift, qcoeff,
-            dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan);
+        vpx_quantize_b_32x32(coeff, 1024, p->zbin, p->round, p->quant,
+                             p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
+                             scan_order->scan, scan_order->iscan);
         break;
       case TX_16X16:
         vpx_highbd_fdct16x16(src_diff, coeff, diff_stride);
-        vpx_highbd_quantize_b(coeff, 256, p->zbin, p->round, p->quant,
-                              p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
-                              scan_order->scan, scan_order->iscan);
+        vpx_quantize_b(coeff, 256, p->zbin, p->round, p->quant, p->quant_shift,
+                       qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan,
+                       scan_order->iscan);
         break;
       case TX_8X8:
         vpx_highbd_fdct8x8(src_diff, coeff, diff_stride);
-        vpx_highbd_quantize_b(coeff, 64, p->zbin, p->round, p->quant,
-                              p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
-                              scan_order->scan, scan_order->iscan);
+        vpx_quantize_b(coeff, 64, p->zbin, p->round, p->quant, p->quant_shift,
+                       qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan,
+                       scan_order->iscan);
         break;
       default:
         assert(tx_size == TX_4X4);
         x->fwd_txfm4x4(src_diff, coeff, diff_stride);
-        vpx_highbd_quantize_b(coeff, 16, p->zbin, p->round, p->quant,
-                              p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
-                              scan_order->scan, scan_order->iscan);
+        vpx_quantize_b(coeff, 16, p->zbin, p->round, p->quant, p->quant_shift,
+                       qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan,
+                       scan_order->iscan);
         break;
     }
     return;
@@ -857,9 +857,9 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
           vpx_highbd_subtract_block(32, 32, src_diff, diff_stride, src,
                                     src_stride, dst, dst_stride, xd->bd);
           highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
-          vpx_highbd_quantize_b_32x32(
-              coeff, 1024, p->zbin, p->round, p->quant, p->quant_shift, qcoeff,
-              dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan);
+          vpx_quantize_b_32x32(coeff, 1024, p->zbin, p->round, p->quant,
+                               p->quant_shift, qcoeff, dqcoeff, pd->dequant,
+                               eob, scan_order->scan, scan_order->iscan);
         }
         if (args->enable_coeff_opt && !x->skip_recode) {
           *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
@@ -876,9 +876,9 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
             vpx_highbd_fdct16x16(src_diff, coeff, diff_stride);
           else
             vp9_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type);
-          vpx_highbd_quantize_b(coeff, 256, p->zbin, p->round, p->quant,
-                                p->quant_shift, qcoeff, dqcoeff, pd->dequant,
-                                eob, scan_order->scan, scan_order->iscan);
+          vpx_quantize_b(coeff, 256, p->zbin, p->round, p->quant,
+                         p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
+                         scan_order->scan, scan_order->iscan);
         }
         if (args->enable_coeff_opt && !x->skip_recode) {
           *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
@@ -896,9 +896,9 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
             vpx_highbd_fdct8x8(src_diff, coeff, diff_stride);
           else
             vp9_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type);
-          vpx_highbd_quantize_b(coeff, 64, p->zbin, p->round, p->quant,
-                                p->quant_shift, qcoeff, dqcoeff, pd->dequant,
-                                eob, scan_order->scan, scan_order->iscan);
+          vpx_quantize_b(coeff, 64, p->zbin, p->round, p->quant, p->quant_shift,
+                         qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan,
+                         scan_order->iscan);
         }
         if (args->enable_coeff_opt && !x->skip_recode) {
           *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
@@ -917,9 +917,9 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
             vp9_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type);
           else
             x->fwd_txfm4x4(src_diff, coeff, diff_stride);
-          vpx_highbd_quantize_b(coeff, 16, p->zbin, p->round, p->quant,
-                                p->quant_shift, qcoeff, dqcoeff, pd->dequant,
-                                eob, scan_order->scan, scan_order->iscan);
+          vpx_quantize_b(coeff, 16, p->zbin, p->round, p->quant, p->quant_shift,
+                         qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan,
+                         scan_order->iscan);
         }
         if (args->enable_coeff_opt && !x->skip_recode) {
           *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index 9058997..1c401e9 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -164,14 +164,6 @@ void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
     return;
   }
 
-#if CONFIG_VP9_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    vpx_highbd_quantize_b(BLOCK_OFFSET(p->coeff, block), n_coeffs, p->zbin,
-                          p->round, p->quant, p->quant_shift, qcoeff, dqcoeff,
-                          pd->dequant, &p->eobs[block], scan, iscan);
-    return;
-  }
-#endif
   vpx_quantize_b(BLOCK_OFFSET(p->coeff, block), n_coeffs, p->zbin, p->round,
                  p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant,
                  &p->eobs[block], scan, iscan);
diff --git a/vpx_dsp/quantize.h b/vpx_dsp/quantize.h
index 8e13844..0fcd779 100644
--- a/vpx_dsp/quantize.h
+++ b/vpx_dsp/quantize.h
@@ -37,6 +37,22 @@ void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr,
                                   tran_low_t *qcoeff_ptr,
                                   tran_low_t *dqcoeff_ptr,
                                   const int16_t dequant, uint16_t *eob_ptr);
+
+// Only used for reference. The optimized versions can handle HBD.
+void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                             const int16_t *zbin_ptr, const int16_t *round_ptr,
+                             const int16_t *quant_ptr,
+                             const int16_t *quant_shift_ptr,
+                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                             const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                             const int16_t *scan, const int16_t *iscan);
+
+void vpx_highbd_quantize_b_32x32_c(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
+    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
+    const int16_t *scan, const int16_t *iscan);
 #endif
 
 #ifdef __cplusplus
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index a880e1d..b930fbd 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -318,9 +318,6 @@ DSP_SRCS-$(HAVE_SSSE3)  += x86/quantize_ssse3.h
 DSP_SRCS-$(HAVE_AVX)    += x86/quantize_avx.c
 DSP_SRCS-$(HAVE_NEON)   += arm/quantize_neon.c
 DSP_SRCS-$(HAVE_VSX)    += ppc/quantize_vsx.c
-ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
-DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_quantize_intrin_sse2.c
-endif
 
 # avg
 DSP_SRCS-yes           += avg.c
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 372903a..63097b0 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -714,14 +714,6 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
 
   add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
   specialize qw/vpx_quantize_b_32x32 neon ssse3 avx vsx/;
-
-  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
-    add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-    specialize qw/vpx_highbd_quantize_b sse2/;
-
-    add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-    specialize qw/vpx_highbd_quantize_b_32x32 sse2/;
-  }  # CONFIG_VP9_HIGHBITDEPTH
 }  # CONFIG_VP9_ENCODER
 
 if (vpx_config("CONFIG_ENCODERS") eq "yes") {
diff --git a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
deleted file mode 100644
index 4535a0f..0000000
--- a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
+++ /dev/null
@@ -1,152 +0,0 @@
-/*
- *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <assert.h>
-#include <emmintrin.h>
-
-#include "./vpx_dsp_rtcd.h"
-#include "vpx_dsp/vpx_dsp_common.h"
-#include "vpx_mem/vpx_mem.h"
-#include "vpx_ports/mem.h"
-
-#if CONFIG_VP9_HIGHBITDEPTH
-void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
-                                const int16_t *zbin_ptr,
-                                const int16_t *round_ptr,
-                                const int16_t *quant_ptr,
-                                const int16_t *quant_shift_ptr,
-                                tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
-                                const int16_t *scan, const int16_t *iscan) {
-  int i, j, non_zero_regs = (int)count / 4, eob_i = -1;
-  __m128i zbins[2];
-  __m128i nzbins[2];
-
-  zbins[0] = _mm_set_epi32((int)zbin_ptr[1], (int)zbin_ptr[1], (int)zbin_ptr[1],
-                           (int)zbin_ptr[0]);
-  zbins[1] = _mm_set1_epi32((int)zbin_ptr[1]);
-
-  nzbins[0] = _mm_setzero_si128();
-  nzbins[1] = _mm_setzero_si128();
-  nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
-  nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
-
-  (void)scan;
-
-  memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
-
-  // Pre-scan pass
-  for (i = ((int)count / 4) - 1; i >= 0; i--) {
-    __m128i coeffs, cmp1, cmp2;
-    int test;
-    coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
-    cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
-    cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
-    cmp1 = _mm_and_si128(cmp1, cmp2);
-    test = _mm_movemask_epi8(cmp1);
-    if (test == 0xffff)
-      non_zero_regs--;
-    else
-      break;
-  }
-
-  // Quantization pass:
-  for (i = 0; i < non_zero_regs; i++) {
-    __m128i coeffs, coeffs_sign, tmp1, tmp2;
-    int test;
-    int abs_coeff[4];
-    int coeff_sign[4];
-
-    coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
-    coeffs_sign = _mm_srai_epi32(coeffs, 31);
-    coeffs = _mm_sub_epi32(_mm_xor_si128(coeffs, coeffs_sign), coeffs_sign);
-    tmp1 = _mm_cmpgt_epi32(coeffs, zbins[i != 0]);
-    tmp2 = _mm_cmpeq_epi32(coeffs, zbins[i != 0]);
-    tmp1 = _mm_or_si128(tmp1, tmp2);
-    test = _mm_movemask_epi8(tmp1);
-    _mm_storeu_si128((__m128i *)abs_coeff, coeffs);
-    _mm_storeu_si128((__m128i *)coeff_sign, coeffs_sign);
-
-    for (j = 0; j < 4; j++) {
-      if (test & (1 << (4 * j))) {
-        int k = 4 * i + j;
-        const int64_t tmp3 = abs_coeff[j] + round_ptr[k != 0];
-        const int64_t tmp4 = ((tmp3 * quant_ptr[k != 0]) >> 16) + tmp3;
-        const uint32_t abs_qcoeff =
-            (uint32_t)((tmp4 * quant_shift_ptr[k != 0]) >> 16);
-        qcoeff_ptr[k] = (int)(abs_qcoeff ^ coeff_sign[j]) - coeff_sign[j];
-        dqcoeff_ptr[k] = qcoeff_ptr[k] * dequant_ptr[k != 0];
-        if (abs_qcoeff) eob_i = iscan[k] > eob_i ? iscan[k] : eob_i;
-      }
-    }
-  }
-  *eob_ptr = eob_i + 1;
-}
-
-void vpx_highbd_quantize_b_32x32_sse2(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
-    const int16_t *round_ptr, const int16_t *quant_ptr,
-    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
-    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
-    const int16_t *scan, const int16_t *iscan) {
-  __m128i zbins[2];
-  __m128i nzbins[2];
-  int idx = 0;
-  int idx_arr[1024];
-  int i, eob = -1;
-  const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1);
-  const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1);
-  (void)scan;
-
-  zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp);
-  zbins[1] = _mm_set1_epi32(zbin1_tmp);
-
-  nzbins[0] = _mm_setzero_si128();
-  nzbins[1] = _mm_setzero_si128();
-  nzbins[0] = _mm_sub_epi32(nzbins[0], zbins[0]);
-  nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
-
-  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
-  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
-
-  // Pre-scan pass
-  for (i = 0; i < n_coeffs / 4; i++) {
-    __m128i coeffs, cmp1, cmp2;
-    int test;
-    coeffs = _mm_load_si128((const __m128i *)(coeff_ptr + i * 4));
-    cmp1 = _mm_cmplt_epi32(coeffs, zbins[i != 0]);
-    cmp2 = _mm_cmpgt_epi32(coeffs, nzbins[i != 0]);
-    cmp1 = _mm_and_si128(cmp1, cmp2);
-    test = _mm_movemask_epi8(cmp1);
-    if (!(test & 0xf)) idx_arr[idx++] = i * 4;
-    if (!(test & 0xf0)) idx_arr[idx++] = i * 4 + 1;
-    if (!(test & 0xf00)) idx_arr[idx++] = i * 4 + 2;
-    if (!(test & 0xf000)) idx_arr[idx++] = i * 4 + 3;
-  }
-
-  // Quantization pass: only process the coefficients selected in
-  // pre-scan pass. Note: idx can be zero.
-  for (i = 0; i < idx; i++) {
-    const int rc = idx_arr[i];
-    const int coeff = coeff_ptr[rc];
-    const int coeff_sign = (coeff >> 31);
-    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
-    const int64_t tmp1 = abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
-    const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
-    const uint32_t abs_qcoeff =
-        (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> 15);
-    qcoeff_ptr[rc] = (int)(abs_qcoeff ^ coeff_sign) - coeff_sign;
-    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
-    if (abs_qcoeff) eob = iscan[idx_arr[i]] > eob ? iscan[idx_arr[i]] : eob;
-  }
-  *eob_ptr = eob + 1;
-}
-#endif
-- 
2.7.4