From 2134eb2f054baaec9a796784aeeafb0b669ff601 Mon Sep 17 00:00:00 2001
From: Johann <johannkoenig@google.com>
Date: Fri, 31 Oct 2014 13:42:55 -0700
Subject: [PATCH] Remove pair quantization

The NEON intrinsics version of the pair quantizer is slower than
quantizing each block individually, so remove the pair variants and
always use the single-block quantizer.

Change-Id: I7b4ea8599d4aab04be0a5a0c59b8b29a7fc283f4
---
 vp8/common/rtcd_defs.pl                   |  15 ----
 vp8/encoder/arm/neon/fastquantizeb_neon.c | 124 +-----------------------------
 vp8/encoder/arm/quantize_arm.c            |  64 ---------------
 vp8/encoder/block.h                       |   1 -
 vp8/encoder/encodeframe.c                 |   2 -
 vp8/encoder/ethreading.c                  |   1 -
 vp8/encoder/onyx_if.c                     |   2 -
 vp8/encoder/quantize.c                    |  23 +-----
 vp8/encoder/quantize.h                    |   3 +
 vp8/vp8cx_arm.mk                          |   1 -
 10 files changed, 10 insertions(+), 226 deletions(-)
 delete mode 100644 vp8/encoder/arm/quantize_arm.c

diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl
index 88a5b5b..6756008 100644
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -457,21 +457,6 @@ add_proto qw/void vp8_fast_quantize_b/, "struct block *, struct blockd *";
 specialize qw/vp8_fast_quantize_b sse2 ssse3 media neon/;
 $vp8_fast_quantize_b_media=vp8_fast_quantize_b_armv6;
 
-add_proto qw/void vp8_regular_quantize_b_pair/, "struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2";
-# no asm yet
-
-add_proto qw/void vp8_fast_quantize_b_pair/, "struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2";
-specialize qw/vp8_fast_quantize_b_pair neon/;
-
-add_proto qw/void vp8_quantize_mb/, "struct macroblock *";
-specialize qw/vp8_quantize_mb neon/;
-
-add_proto qw/void vp8_quantize_mby/, "struct macroblock *";
-specialize qw/vp8_quantize_mby neon/;
-
-add_proto qw/void vp8_quantize_mbuv/, "struct macroblock *";
-specialize qw/vp8_quantize_mbuv neon/;
-
 #
 # Block subtraction
 #
diff --git a/vp8/encoder/arm/neon/fastquantizeb_neon.c b/vp8/encoder/arm/neon/fastquantizeb_neon.c
index 4876428..caa7637 100644
--- a/vp8/encoder/arm/neon/fastquantizeb_neon.c
+++ b/vp8/encoder/arm/neon/fastquantizeb_neon.c
@@ -10,13 +10,12 @@
 
 #include <arm_neon.h>
 #include "vp8/encoder/block.h"
-#include "vpx_mem/vpx_mem.h"
 
 static const uint16_t inv_zig_zag[16] = {
-    0x0001, 0x0002, 0x0006, 0x0007,
-    0x0003, 0x0005, 0x0008, 0x000d,
-    0x0004, 0x0009, 0x000c, 0x000e,
-    0x000a, 0x000b, 0x000f, 0x0010
+    1,  2,  6,   7,
+    3,  5,  8,  13,
+    4,  9,  12, 14,
+    10, 11, 15, 16
 };
 
 void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) {
@@ -88,118 +87,3 @@ void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) {
 
     vst1_lane_s8((int8_t *)d->eob, vreinterpret_s8_u32(eob_d32), 0);
 }
-
-void vp8_fast_quantize_b_pair_neon(BLOCK *b0, BLOCK *b1,
-                                   BLOCKD *d0, BLOCKD *d1) {
-    const int16x8_t one_q = vdupq_n_s16(0xff),
-                    b0_z0 = vld1q_s16(b0->coeff),
-                    b0_z1 = vld1q_s16(b0->coeff + 8),
-                    b0_round0 = vld1q_s16(b0->round),
-                    b0_round1 = vld1q_s16(b0->round + 8),
-                    b0_quant0 = vld1q_s16(b0->quant_fast),
-                    b0_quant1 = vld1q_s16(b0->quant_fast + 8),
-                    d0_dequant0 = vld1q_s16(d0->dequant),
-                    d0_dequant1 = vld1q_s16(d0->dequant + 8),
-                    b1_z0 = vld1q_s16(b1->coeff),
-                    b1_z1 = vld1q_s16(b1->coeff + 8),
-                    b1_round0 = vld1q_s16(b1->round),
-                    b1_round1 = vld1q_s16(b1->round + 8),
-                    b1_quant0 = vld1q_s16(b1->quant_fast),
-                    b1_quant1 = vld1q_s16(b1->quant_fast + 8),
-                    d1_dequant0 = vld1q_s16(d1->dequant),
-                    d1_dequant1 = vld1q_s16(d1->dequant + 8);
-    const uint16x8_t zig_zag0 = vld1q_u16(inv_zig_zag),
-                     zig_zag1 = vld1q_u16(inv_zig_zag + 8);
-    int16x8_t b0_x0, b0_x1, b0_sz0, b0_sz1, b0_y0, b0_y1,
-              b1_x0, b1_x1, b1_sz0, b1_sz1, b1_y0, b1_y1;
-    uint16x8_t b0_eob0, b0_eob1,
-               b1_eob0, b1_eob1;
-    uint16x4_t b0_eob_d16, b1_eob_d16;
-    uint32x2_t b0_eob_d32, b1_eob_d32;
-    uint32x4_t b0_eob_q32, b1_eob_q32;
-
-    /* sign of z: z >> 15 */
-    b0_sz0 = vshrq_n_s16(b0_z0, 15);
-    b0_sz1 = vshrq_n_s16(b0_z1, 15);
-    b1_sz0 = vshrq_n_s16(b1_z0, 15);
-    b1_sz1 = vshrq_n_s16(b1_z1, 15);
-
-    /* x = abs(z) */
-    b0_x0 = vabsq_s16(b0_z0);
-    b0_x1 = vabsq_s16(b0_z1);
-    b1_x0 = vabsq_s16(b1_z0);
-    b1_x1 = vabsq_s16(b1_z1);
-
-    /* x += round */
-    b0_x0 = vaddq_s16(b0_x0, b0_round0);
-    b0_x1 = vaddq_s16(b0_x1, b0_round1);
-    b1_x0 = vaddq_s16(b1_x0, b1_round0);
-    b1_x1 = vaddq_s16(b1_x1, b1_round1);
-
-    /* y = 2 * (x * quant) >> 16 */
-    b0_y0 = vqdmulhq_s16(b0_x0, b0_quant0);
-    b0_y1 = vqdmulhq_s16(b0_x1, b0_quant1);
-    b1_y0 = vqdmulhq_s16(b1_x0, b1_quant0);
-    b1_y1 = vqdmulhq_s16(b1_x1, b1_quant1);
-
-    /* Compensate for doubling in vqdmulhq */
-    b0_y0 = vshrq_n_s16(b0_y0, 1);
-    b0_y1 = vshrq_n_s16(b0_y1, 1);
-    b1_y0 = vshrq_n_s16(b1_y0, 1);
-    b1_y1 = vshrq_n_s16(b1_y1, 1);
-
-    /* Restore sign bit */
-    b0_y0 = veorq_s16(b0_y0, b0_sz0);
-    b0_y1 = veorq_s16(b0_y1, b0_sz1);
-    b0_x0 = vsubq_s16(b0_y0, b0_sz0);
-    b0_x1 = vsubq_s16(b0_y1, b0_sz1);
-    b1_y0 = veorq_s16(b1_y0, b1_sz0);
-    b1_y1 = veorq_s16(b1_y1, b1_sz1);
-    b1_x0 = vsubq_s16(b1_y0, b1_sz0);
-    b1_x1 = vsubq_s16(b1_y1, b1_sz1);
-
-    /* find non-zero elements */
-    b0_eob0 = vtstq_s16(b0_x0, one_q);
-    b0_eob1 = vtstq_s16(b0_x1, one_q);
-    b1_eob0 = vtstq_s16(b1_x0, one_q);
-    b1_eob1 = vtstq_s16(b1_x1, one_q);
-
-    /* mask zig zag */
-    b0_eob0 = vandq_u16(b0_eob0, zig_zag0);
-    b0_eob1 = vandq_u16(b0_eob1, zig_zag1);
-    b1_eob0 = vandq_u16(b1_eob0, zig_zag0);
-    b1_eob1 = vandq_u16(b1_eob1, zig_zag1);
-
-    /* select the largest value */
-    b0_eob0 = vmaxq_u16(b0_eob0, b0_eob1);
-    b0_eob_d16 = vmax_u16(vget_low_u16(b0_eob0),
-                          vget_high_u16(b0_eob0));
-    b0_eob_q32 = vmovl_u16(b0_eob_d16);
-    b0_eob_d32 = vmax_u32(vget_low_u32(b0_eob_q32),
-                          vget_high_u32(b0_eob_q32));
-    b0_eob_d32 = vpmax_u32(b0_eob_d32, b0_eob_d32);
-
-    b1_eob0 = vmaxq_u16(b1_eob0, b1_eob1);
-    b1_eob_d16 = vmax_u16(vget_low_u16(b1_eob0),
-                          vget_high_u16(b1_eob0));
-    b1_eob_q32 = vmovl_u16(b1_eob_d16);
-    b1_eob_d32 = vmax_u32(vget_low_u32(b1_eob_q32),
-                          vget_high_u32(b1_eob_q32));
-    b1_eob_d32 = vpmax_u32(b1_eob_d32, b1_eob_d32);
-
-    /* qcoeff = x */
-    vst1q_s16(d0->qcoeff, b0_x0);
-    vst1q_s16(d0->qcoeff + 8, b0_x1);
-    vst1q_s16(d1->qcoeff, b1_x0);
-    vst1q_s16(d1->qcoeff + 8, b1_x1);
-
-    /* dqcoeff = x * dequant */
-    vst1q_s16(d0->dqcoeff, vmulq_s16(d0_dequant0, b0_x0));
-    vst1q_s16(d0->dqcoeff + 8, vmulq_s16(d0_dequant1, b0_x1));
-    vst1q_s16(d1->dqcoeff, vmulq_s16(d1_dequant0, b1_x0));
-    vst1q_s16(d1->dqcoeff + 8, vmulq_s16(d1_dequant1, b1_x1));
-
-    vst1_lane_s8((int8_t *)d0->eob, vreinterpret_s8_u32(b0_eob_d32), 0);
-    vst1_lane_s8((int8_t *)d1->eob, vreinterpret_s8_u32(b1_eob_d32), 0);
-    return;
-}
diff --git a/vp8/encoder/arm/quantize_arm.c b/vp8/encoder/arm/quantize_arm.c
deleted file mode 100644
index 80d9ad0..0000000
--- a/vp8/encoder/arm/quantize_arm.c
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vpx_config.h"
-#include "vp8_rtcd.h"
-#include "vp8/encoder/block.h"
-#include <math.h>
-#include "vpx_mem/vpx_mem.h"
-#include "vp8/encoder/quantize.h"
-#include "vp8/common/entropy.h"
-
-
-#if HAVE_NEON
-
-/* vp8_quantize_mbX functions here differs from corresponding ones in
- * quantize.c only by using quantize_b_pair function pointer instead of
- * the regular quantize_b function pointer */
-void vp8_quantize_mby_neon(MACROBLOCK *x)
-{
-    int i;
-    int has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
-        && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
-
-    for (i = 0; i < 16; i+=2)
-        x->quantize_b_pair(&x->block[i], &x->block[i+1],
-                           &x->e_mbd.block[i], &x->e_mbd.block[i+1]);
-
-    if(has_2nd_order)
-        x->quantize_b(&x->block[24], &x->e_mbd.block[24]);
-}
-
-void vp8_quantize_mb_neon(MACROBLOCK *x)
-{
-    int i;
-    int has_2nd_order=(x->e_mbd.mode_info_context->mbmi.mode != B_PRED
-        && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV);
-
-    for (i = 0; i < 24; i+=2)
-        x->quantize_b_pair(&x->block[i], &x->block[i+1],
-                           &x->e_mbd.block[i], &x->e_mbd.block[i+1]);
-
-    if (has_2nd_order)
-        x->quantize_b(&x->block[24], &x->e_mbd.block[24]);
-}
-
-
-void vp8_quantize_mbuv_neon(MACROBLOCK *x)
-{
-    int i;
-
-    for (i = 16; i < 24; i+=2)
-        x->quantize_b_pair(&x->block[i], &x->block[i+1],
-                           &x->e_mbd.block[i], &x->e_mbd.block[i+1]);
-}
-
-#endif /* HAVE_NEON */
diff --git a/vp8/encoder/block.h b/vp8/encoder/block.h
index 1f212ca..dbdcab9 100644
--- a/vp8/encoder/block.h
+++ b/vp8/encoder/block.h
@@ -160,7 +160,6 @@ typedef struct macroblock
     void (*short_fdct8x4)(short *input, short *output, int pitch);
     void (*short_walsh4x4)(short *input, short *output, int pitch);
     void (*quantize_b)(BLOCK *b, BLOCKD *d);
-    void (*quantize_b_pair)(BLOCK *b1, BLOCK *b2, BLOCKD *d0, BLOCKD *d1);
 
 } MACROBLOCK;
 
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index aec6b98..85813b6 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -1252,7 +1252,6 @@ int vp8cx_encode_inter_macroblock
         if(cpi->sf.use_fastquant_for_pick)
         {
             x->quantize_b      = vp8_fast_quantize_b;
-            x->quantize_b_pair = vp8_fast_quantize_b_pair;
 
             /* the fast quantizer does not use zbin_extra, so
              * do not recalculate */
@@ -1265,7 +1264,6 @@ int vp8cx_encode_inter_macroblock
         if (cpi->sf.improved_quant)
         {
             x->quantize_b      = vp8_regular_quantize_b;
-            x->quantize_b_pair = vp8_regular_quantize_b_pair;
         }
 
         /* restore cpi->zbin_mode_boost_enabled */
diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c
index 7b8b51f..7814679 100644
--- a/vp8/encoder/ethreading.c
+++ b/vp8/encoder/ethreading.c
@@ -346,7 +346,6 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
     z->short_fdct8x4     = x->short_fdct8x4;
     z->short_walsh4x4    = x->short_walsh4x4;
     z->quantize_b        = x->quantize_b;
-    z->quantize_b_pair   = x->quantize_b_pair;
     z->optimize          = x->optimize;
 
     /*
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 45b6b43..6ab5df9 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -1083,12 +1083,10 @@ void vp8_set_speed_features(VP8_COMP *cpi)
     if (cpi->sf.improved_quant)
     {
         cpi->mb.quantize_b      = vp8_regular_quantize_b;
-        cpi->mb.quantize_b_pair = vp8_regular_quantize_b_pair;
     }
     else
     {
         cpi->mb.quantize_b      = vp8_fast_quantize_b;
-        cpi->mb.quantize_b_pair = vp8_fast_quantize_b_pair;
     }
     if (cpi->sf.improved_quant != last_improved_quant)
         vp8cx_init_quantizer(cpi);
diff --git a/vp8/encoder/quantize.c b/vp8/encoder/quantize.c
index 9953bd6..2feb316 100644
--- a/vp8/encoder/quantize.c
+++ b/vp8/encoder/quantize.c
@@ -101,7 +101,7 @@ void vp8_regular_quantize_b_c(BLOCK *b, BLOCKD *d)
     *d->eob = (char)(eob + 1);
 }
 
-void vp8_quantize_mby_c(MACROBLOCK *x)
+void vp8_quantize_mby(MACROBLOCK *x)
 {
     int i;
     int has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED
@@ -114,7 +114,7 @@ void vp8_quantize_mby_c(MACROBLOCK *x)
         x->quantize_b(&x->block[24], &x->e_mbd.block[24]);
 }
 
-void vp8_quantize_mb_c(MACROBLOCK *x)
+void vp8_quantize_mb(MACROBLOCK *x)
 {
     int i;
     int has_2nd_order=(x->e_mbd.mode_info_context->mbmi.mode != B_PRED
@@ -125,7 +125,7 @@ void vp8_quantize_mb_c(MACROBLOCK *x)
 }
 
 
-void vp8_quantize_mbuv_c(MACROBLOCK *x)
+void vp8_quantize_mbuv(MACROBLOCK *x)
 {
     int i;
 
@@ -133,23 +133,6 @@ void vp8_quantize_mbuv_c(MACROBLOCK *x)
         x->quantize_b(&x->block[i], &x->e_mbd.block[i]);
 }
 
-/* quantize_b_pair function pointer in MACROBLOCK structure is set to one of
- * these two C functions if corresponding optimized routine is not available.
- * NEON optimized version implements currently the fast quantization for pair
- * of blocks. */
-void vp8_regular_quantize_b_pair(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2)
-{
-    vp8_regular_quantize_b(b1, d1);
-    vp8_regular_quantize_b(b2, d2);
-}
-
-void vp8_fast_quantize_b_pair_c(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2)
-{
-    vp8_fast_quantize_b_c(b1, d1);
-    vp8_fast_quantize_b_c(b2, d2);
-}
-
-
 static const int qrounding_factors[129] =
 {
     48, 48, 48, 48, 48, 48, 48, 48,
diff --git a/vp8/encoder/quantize.h b/vp8/encoder/quantize.h
index c739b26..7d36c2b 100644
--- a/vp8/encoder/quantize.h
+++ b/vp8/encoder/quantize.h
@@ -18,6 +18,9 @@ extern "C" {
 
 struct VP8_COMP;
 struct macroblock;
+extern void vp8_quantize_mb(struct macroblock *x);
+extern void vp8_quantize_mby(struct macroblock *x);
+extern void vp8_quantize_mbuv(struct macroblock *x);
 extern void vp8_set_quantizer(struct VP8_COMP *cpi, int Q);
 extern void vp8cx_frame_init_quantizer(struct VP8_COMP *cpi);
 extern void vp8_update_zbin_extra(struct VP8_COMP *cpi, struct macroblock *x);
diff --git a/vp8/vp8cx_arm.mk b/vp8/vp8cx_arm.mk
index 553b1c4..2c2b871 100644
--- a/vp8/vp8cx_arm.mk
+++ b/vp8/vp8cx_arm.mk
@@ -14,7 +14,6 @@ VP8_CX_SRCS-$(ARCH_ARM)  += vp8cx_arm.mk
 #File list for arm
 # encoder
 VP8_CX_SRCS-$(ARCH_ARM)  += encoder/arm/dct_arm.c
-VP8_CX_SRCS-$(ARCH_ARM)  += encoder/arm/quantize_arm.c
 
 #File list for edsp
 # encoder
-- 
2.7.4