From 2134eb2f054baaec9a796784aeeafb0b669ff601 Mon Sep 17 00:00:00 2001 From: Johann Date: Fri, 31 Oct 2014 13:42:55 -0700 Subject: [PATCH] Remove pair quantization The intrinsics version of the pair quant is slower than running it individually. Change-Id: I7b4ea8599d4aab04be0a5a0c59b8b29a7fc283f4 --- vp8/common/rtcd_defs.pl | 15 ---- vp8/encoder/arm/neon/fastquantizeb_neon.c | 124 +----------------------------- vp8/encoder/arm/quantize_arm.c | 64 --------------- vp8/encoder/block.h | 1 - vp8/encoder/encodeframe.c | 2 - vp8/encoder/ethreading.c | 1 - vp8/encoder/onyx_if.c | 2 - vp8/encoder/quantize.c | 23 +----- vp8/encoder/quantize.h | 3 + vp8/vp8cx_arm.mk | 1 - 10 files changed, 10 insertions(+), 226 deletions(-) delete mode 100644 vp8/encoder/arm/quantize_arm.c diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl index 88a5b5b..6756008 100644 --- a/vp8/common/rtcd_defs.pl +++ b/vp8/common/rtcd_defs.pl @@ -457,21 +457,6 @@ add_proto qw/void vp8_fast_quantize_b/, "struct block *, struct blockd *"; specialize qw/vp8_fast_quantize_b sse2 ssse3 media neon/; $vp8_fast_quantize_b_media=vp8_fast_quantize_b_armv6; -add_proto qw/void vp8_regular_quantize_b_pair/, "struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2"; -# no asm yet - -add_proto qw/void vp8_fast_quantize_b_pair/, "struct block *b1, struct block *b2, struct blockd *d1, struct blockd *d2"; -specialize qw/vp8_fast_quantize_b_pair neon/; - -add_proto qw/void vp8_quantize_mb/, "struct macroblock *"; -specialize qw/vp8_quantize_mb neon/; - -add_proto qw/void vp8_quantize_mby/, "struct macroblock *"; -specialize qw/vp8_quantize_mby neon/; - -add_proto qw/void vp8_quantize_mbuv/, "struct macroblock *"; -specialize qw/vp8_quantize_mbuv neon/; - # # Block subtraction # diff --git a/vp8/encoder/arm/neon/fastquantizeb_neon.c b/vp8/encoder/arm/neon/fastquantizeb_neon.c index 4876428..caa7637 100644 --- a/vp8/encoder/arm/neon/fastquantizeb_neon.c +++ b/vp8/encoder/arm/neon/fastquantizeb_neon.c @@ -10,13 +10,12 @@ #include #include "vp8/encoder/block.h" -#include "vpx_mem/vpx_mem.h" static const uint16_t inv_zig_zag[16] = { - 0x0001, 0x0002, 0x0006, 0x0007, - 0x0003, 0x0005, 0x0008, 0x000d, - 0x0004, 0x0009, 0x000c, 0x000e, - 0x000a, 0x000b, 0x000f, 0x0010 + 1, 2, 6, 7, + 3, 5, 8, 13, + 4, 9, 12, 14, + 10, 11, 15, 16 }; void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) { @@ -88,118 +87,3 @@ void vp8_fast_quantize_b_neon(BLOCK *b, BLOCKD *d) { vst1_lane_s8((int8_t *)d->eob, vreinterpret_s8_u32(eob_d32), 0); } - -void vp8_fast_quantize_b_pair_neon(BLOCK *b0, BLOCK *b1, - BLOCKD *d0, BLOCKD *d1) { - const int16x8_t one_q = vdupq_n_s16(0xff), - b0_z0 = vld1q_s16(b0->coeff), - b0_z1 = vld1q_s16(b0->coeff + 8), - b0_round0 = vld1q_s16(b0->round), - b0_round1 = vld1q_s16(b0->round + 8), - b0_quant0 = vld1q_s16(b0->quant_fast), - b0_quant1 = vld1q_s16(b0->quant_fast + 8), - d0_dequant0 = vld1q_s16(d0->dequant), - d0_dequant1 = vld1q_s16(d0->dequant + 8), - b1_z0 = vld1q_s16(b1->coeff), - b1_z1 = vld1q_s16(b1->coeff + 8), - b1_round0 = vld1q_s16(b1->round), - b1_round1 = vld1q_s16(b1->round + 8), - b1_quant0 = vld1q_s16(b1->quant_fast), - b1_quant1 = vld1q_s16(b1->quant_fast + 8), - d1_dequant0 = vld1q_s16(d1->dequant), - d1_dequant1 = vld1q_s16(d1->dequant + 8); - const uint16x8_t zig_zag0 = vld1q_u16(inv_zig_zag), - zig_zag1 = vld1q_u16(inv_zig_zag + 8); - int16x8_t b0_x0, b0_x1, b0_sz0, b0_sz1, b0_y0, b0_y1, - b1_x0, b1_x1, b1_sz0, b1_sz1, b1_y0, b1_y1; - uint16x8_t b0_eob0, b0_eob1, - b1_eob0, b1_eob1; - uint16x4_t b0_eob_d16, b1_eob_d16; - uint32x2_t b0_eob_d32, b1_eob_d32; - uint32x4_t b0_eob_q32, b1_eob_q32; - - /* sign of z: z >> 15 */ - b0_sz0 = vshrq_n_s16(b0_z0, 15); - b0_sz1 = vshrq_n_s16(b0_z1, 15); - b1_sz0 = vshrq_n_s16(b1_z0, 15); - b1_sz1 = vshrq_n_s16(b1_z1, 15); - - /* x = abs(z) */ - b0_x0 = vabsq_s16(b0_z0); - b0_x1 = vabsq_s16(b0_z1); - b1_x0 = vabsq_s16(b1_z0); - b1_x1 = vabsq_s16(b1_z1); - - /* x += round */ - b0_x0 = vaddq_s16(b0_x0, b0_round0); - b0_x1 = vaddq_s16(b0_x1, b0_round1); - b1_x0 = vaddq_s16(b1_x0, b1_round0); - b1_x1 = vaddq_s16(b1_x1, b1_round1); - - /* y = 2 * (x * quant) >> 16 */ - b0_y0 = vqdmulhq_s16(b0_x0, b0_quant0); - b0_y1 = vqdmulhq_s16(b0_x1, b0_quant1); - b1_y0 = vqdmulhq_s16(b1_x0, b1_quant0); - b1_y1 = vqdmulhq_s16(b1_x1, b1_quant1); - - /* Compensate for doubling in vqdmulhq */ - b0_y0 = vshrq_n_s16(b0_y0, 1); - b0_y1 = vshrq_n_s16(b0_y1, 1); - b1_y0 = vshrq_n_s16(b1_y0, 1); - b1_y1 = vshrq_n_s16(b1_y1, 1); - - /* Restore sign bit */ - b0_y0 = veorq_s16(b0_y0, b0_sz0); - b0_y1 = veorq_s16(b0_y1, b0_sz1); - b0_x0 = vsubq_s16(b0_y0, b0_sz0); - b0_x1 = vsubq_s16(b0_y1, b0_sz1); - b1_y0 = veorq_s16(b1_y0, b1_sz0); - b1_y1 = veorq_s16(b1_y1, b1_sz1); - b1_x0 = vsubq_s16(b1_y0, b1_sz0); - b1_x1 = vsubq_s16(b1_y1, b1_sz1); - - /* find non-zero elements */ - b0_eob0 = vtstq_s16(b0_x0, one_q); - b0_eob1 = vtstq_s16(b0_x1, one_q); - b1_eob0 = vtstq_s16(b1_x0, one_q); - b1_eob1 = vtstq_s16(b1_x1, one_q); - - /* mask zig zag */ - b0_eob0 = vandq_u16(b0_eob0, zig_zag0); - b0_eob1 = vandq_u16(b0_eob1, zig_zag1); - b1_eob0 = vandq_u16(b1_eob0, zig_zag0); - b1_eob1 = vandq_u16(b1_eob1, zig_zag1); - - /* select the largest value */ - b0_eob0 = vmaxq_u16(b0_eob0, b0_eob1); - b0_eob_d16 = vmax_u16(vget_low_u16(b0_eob0), - vget_high_u16(b0_eob0)); - b0_eob_q32 = vmovl_u16(b0_eob_d16); - b0_eob_d32 = vmax_u32(vget_low_u32(b0_eob_q32), - vget_high_u32(b0_eob_q32)); - b0_eob_d32 = vpmax_u32(b0_eob_d32, b0_eob_d32); - - b1_eob0 = vmaxq_u16(b1_eob0, b1_eob1); - b1_eob_d16 = vmax_u16(vget_low_u16(b1_eob0), - vget_high_u16(b1_eob0)); - b1_eob_q32 = vmovl_u16(b1_eob_d16); - b1_eob_d32 = vmax_u32(vget_low_u32(b1_eob_q32), - vget_high_u32(b1_eob_q32)); - b1_eob_d32 = vpmax_u32(b1_eob_d32, b1_eob_d32); - - /* qcoeff = x */ - vst1q_s16(d0->qcoeff, b0_x0); - vst1q_s16(d0->qcoeff + 8, b0_x1); - vst1q_s16(d1->qcoeff, b1_x0); - vst1q_s16(d1->qcoeff + 8, b1_x1); - - /* dqcoeff = x * dequant */ - vst1q_s16(d0->dqcoeff, vmulq_s16(d0_dequant0, b0_x0)); - vst1q_s16(d0->dqcoeff + 8, vmulq_s16(d0_dequant1, b0_x1)); - vst1q_s16(d1->dqcoeff, vmulq_s16(d1_dequant0, b1_x0)); - vst1q_s16(d1->dqcoeff + 8, vmulq_s16(d1_dequant1, b1_x1)); - - vst1_lane_s8((int8_t *)d0->eob, vreinterpret_s8_u32(b0_eob_d32), 0); - vst1_lane_s8((int8_t *)d1->eob, vreinterpret_s8_u32(b1_eob_d32), 0); - return; -} diff --git a/vp8/encoder/arm/quantize_arm.c b/vp8/encoder/arm/quantize_arm.c deleted file mode 100644 index 80d9ad0..0000000 --- a/vp8/encoder/arm/quantize_arm.c +++ /dev/null @@ -1,64 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "vpx_config.h" -#include "vp8_rtcd.h" -#include "vp8/encoder/block.h" -#include -#include "vpx_mem/vpx_mem.h" -#include "vp8/encoder/quantize.h" -#include "vp8/common/entropy.h" - - -#if HAVE_NEON - -/* vp8_quantize_mbX functions here differs from corresponding ones in - * quantize.c only by using quantize_b_pair function pointer instead of - * the regular quantize_b function pointer */ -void vp8_quantize_mby_neon(MACROBLOCK *x) -{ - int i; - int has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED - && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV); - - for (i = 0; i < 16; i+=2) - x->quantize_b_pair(&x->block[i], &x->block[i+1], - &x->e_mbd.block[i], &x->e_mbd.block[i+1]); - - if(has_2nd_order) - x->quantize_b(&x->block[24], &x->e_mbd.block[24]); -} - -void vp8_quantize_mb_neon(MACROBLOCK *x) -{ - int i; - int has_2nd_order=(x->e_mbd.mode_info_context->mbmi.mode != B_PRED - && x->e_mbd.mode_info_context->mbmi.mode != SPLITMV); - - for (i = 0; i < 24; i+=2) - x->quantize_b_pair(&x->block[i], &x->block[i+1], - &x->e_mbd.block[i], &x->e_mbd.block[i+1]); - - if (has_2nd_order) - x->quantize_b(&x->block[24], &x->e_mbd.block[24]); -} - - -void vp8_quantize_mbuv_neon(MACROBLOCK *x) -{ - int i; - - for (i = 16; i < 24; i+=2) - x->quantize_b_pair(&x->block[i], &x->block[i+1], - &x->e_mbd.block[i], &x->e_mbd.block[i+1]); -} - -#endif /* HAVE_NEON */ diff --git a/vp8/encoder/block.h b/vp8/encoder/block.h index 1f212ca..dbdcab9 100644 --- a/vp8/encoder/block.h +++ b/vp8/encoder/block.h @@ -160,7 +160,6 @@ typedef struct macroblock void (*short_fdct8x4)(short *input, short *output, int pitch); void (*short_walsh4x4)(short *input, short *output, int pitch); void (*quantize_b)(BLOCK *b, BLOCKD *d); - void (*quantize_b_pair)(BLOCK *b1, BLOCK *b2, BLOCKD *d0, BLOCKD *d1); } MACROBLOCK; diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c index aec6b98..85813b6 100644 --- a/vp8/encoder/encodeframe.c +++ b/vp8/encoder/encodeframe.c @@ -1252,7 +1252,6 @@ int vp8cx_encode_inter_macroblock if(cpi->sf.use_fastquant_for_pick) { x->quantize_b = vp8_fast_quantize_b; - x->quantize_b_pair = vp8_fast_quantize_b_pair; /* the fast quantizer does not use zbin_extra, so * do not recalculate */ @@ -1265,7 +1264,6 @@ int vp8cx_encode_inter_macroblock if (cpi->sf.improved_quant) { x->quantize_b = vp8_regular_quantize_b; - x->quantize_b_pair = vp8_regular_quantize_b_pair; } /* restore cpi->zbin_mode_boost_enabled */ diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c index 7b8b51f..7814679 100644 --- a/vp8/encoder/ethreading.c +++ b/vp8/encoder/ethreading.c @@ -346,7 +346,6 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc) z->short_fdct8x4 = x->short_fdct8x4; z->short_walsh4x4 = x->short_walsh4x4; z->quantize_b = x->quantize_b; - z->quantize_b_pair = x->quantize_b_pair; z->optimize = x->optimize; /* diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 45b6b43..6ab5df9 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -1083,12 +1083,10 @@ void vp8_set_speed_features(VP8_COMP *cpi) if (cpi->sf.improved_quant) { cpi->mb.quantize_b = vp8_regular_quantize_b; - cpi->mb.quantize_b_pair = vp8_regular_quantize_b_pair; } else { cpi->mb.quantize_b = vp8_fast_quantize_b; - cpi->mb.quantize_b_pair = vp8_fast_quantize_b_pair; } if (cpi->sf.improved_quant != last_improved_quant) vp8cx_init_quantizer(cpi); diff --git a/vp8/encoder/quantize.c b/vp8/encoder/quantize.c index 9953bd6..2feb316 100644 --- a/vp8/encoder/quantize.c +++ b/vp8/encoder/quantize.c @@ -101,7 +101,7 @@ void vp8_regular_quantize_b_c(BLOCK *b, BLOCKD *d) *d->eob = (char)(eob + 1); } -void vp8_quantize_mby_c(MACROBLOCK *x) +void vp8_quantize_mby(MACROBLOCK *x) { int i; int has_2nd_order = (x->e_mbd.mode_info_context->mbmi.mode != B_PRED @@ -114,7 +114,7 @@ void vp8_quantize_mby_c(MACROBLOCK *x) x->quantize_b(&x->block[24], &x->e_mbd.block[24]); } -void vp8_quantize_mb_c(MACROBLOCK *x) +void vp8_quantize_mb(MACROBLOCK *x) { int i; int has_2nd_order=(x->e_mbd.mode_info_context->mbmi.mode != B_PRED @@ -125,7 +125,7 @@ void vp8_quantize_mb_c(MACROBLOCK *x) } -void vp8_quantize_mbuv_c(MACROBLOCK *x) +void vp8_quantize_mbuv(MACROBLOCK *x) { int i; @@ -133,23 +133,6 @@ void vp8_quantize_mbuv_c(MACROBLOCK *x) x->quantize_b(&x->block[i], &x->e_mbd.block[i]); } -/* quantize_b_pair function pointer in MACROBLOCK structure is set to one of - * these two C functions if corresponding optimized routine is not available. - * NEON optimized version implements currently the fast quantization for pair - * of blocks. */ -void vp8_regular_quantize_b_pair(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2) -{ - vp8_regular_quantize_b(b1, d1); - vp8_regular_quantize_b(b2, d2); -} - -void vp8_fast_quantize_b_pair_c(BLOCK *b1, BLOCK *b2, BLOCKD *d1, BLOCKD *d2) -{ - vp8_fast_quantize_b_c(b1, d1); - vp8_fast_quantize_b_c(b2, d2); -} - - static const int qrounding_factors[129] = { 48, 48, 48, 48, 48, 48, 48, 48, diff --git a/vp8/encoder/quantize.h b/vp8/encoder/quantize.h index c739b26..7d36c2b 100644 --- a/vp8/encoder/quantize.h +++ b/vp8/encoder/quantize.h @@ -18,6 +18,9 @@ extern "C" { struct VP8_COMP; struct macroblock; +extern void vp8_quantize_mb(struct macroblock *x); +extern void vp8_quantize_mby(struct macroblock *x); +extern void vp8_quantize_mbuv(struct macroblock *x); extern void vp8_set_quantizer(struct VP8_COMP *cpi, int Q); extern void vp8cx_frame_init_quantizer(struct VP8_COMP *cpi); extern void vp8_update_zbin_extra(struct VP8_COMP *cpi, struct macroblock *x); diff --git a/vp8/vp8cx_arm.mk b/vp8/vp8cx_arm.mk index 553b1c4..2c2b871 100644 --- a/vp8/vp8cx_arm.mk +++ b/vp8/vp8cx_arm.mk @@ -14,7 +14,6 @@ VP8_CX_SRCS-$(ARCH_ARM) += vp8cx_arm.mk #File list for arm # encoder VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/dct_arm.c -VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/quantize_arm.c #File list for edsp # encoder -- 2.7.4