From 0549f5aae91b7d2f1265cea7e2edf34fda8f0433 Mon Sep 17 00:00:00 2001
From: Gabriel Marin
Date: Tue, 13 Dec 2016 16:22:48 -0800
Subject: [PATCH] Simplify address arithmetic in vp9_optimize_b

Simplify address arithmetic on token_costs to reduce the number of
generated instructions that are used for address arithmetic inside
routine vp9_optimize_b. It also helps improve instruction scheduling
depending on compiler and optimization level.

Measured a 9.3% reduction in retired instructions and 5.3% reduction
in execution time for this routine with GCC v4.8.4 and optimization
flags -O3, and a reduction of up to 11.6% in execution time with other
compilers.

No change in behavior.

TEST=Verified that encoded files match bit for bit, with and without
this change.
BUG=b/33678225

Change-Id: I6098650fb5cd2aa04e014fe6e68ca20761f3a21f
---
 vp9/encoder/vp9_encodemb.c | 24 ++++++++++--------------
 1 file changed, 10 insertions(+), 14 deletions(-)

diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 20ebe68..c88efa5 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -109,6 +109,8 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
   int64_t error0, error1;
   int16_t t0, t1;
   EXTRABIT e0;
+  unsigned int(*const token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
+      mb->token_costs[tx_size][type][ref];
   int best, band, pt, i, final_eob;
 #if CONFIG_VP9_HIGHBITDEPTH
   const int *cat6_high_cost = vp9_get_high_cost_table(xd->bd);
@@ -148,10 +150,8 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
       if (next < default_eob) {
         band = band_translate[i + 1];
         pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
-        rate0 += mb->token_costs[tx_size][type][ref][band][0][pt]
-                                [tokens[next][0].token];
-        rate1 += mb->token_costs[tx_size][type][ref][band][0][pt]
-                                [tokens[next][1].token];
+        rate0 += token_costs[band][0][pt][tokens[next][0].token];
+        rate1 += token_costs[band][0][pt][tokens[next][1].token];
       }
       UPDATE_RD_COST();
       /* And pick the best. */
@@ -208,13 +208,11 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
         band = band_translate[i + 1];
         if (t0 != EOB_TOKEN) {
           pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
-          rate0 += mb->token_costs[tx_size][type][ref][band][!x][pt]
-                                  [tokens[next][0].token];
+          rate0 += token_costs[band][!x][pt][tokens[next][0].token];
         }
         if (t1 != EOB_TOKEN) {
           pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache);
-          rate1 += mb->token_costs[tx_size][type][ref][band][!x][pt]
-                                  [tokens[next][1].token];
+          rate1 += token_costs[band][!x][pt][tokens[next][1].token];
         }
       }

@@ -270,13 +268,11 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
       t1 = tokens[next][1].token;
       /* Update the cost of each path if we're past the EOB token. */
       if (t0 != EOB_TOKEN) {
-        tokens[next][0].rate +=
-            mb->token_costs[tx_size][type][ref][band][1][pt][t0];
+        tokens[next][0].rate += token_costs[band][1][pt][t0];
         tokens[next][0].token = ZERO_TOKEN;
       }
       if (t1 != EOB_TOKEN) {
-        tokens[next][1].rate +=
-            mb->token_costs[tx_size][type][ref][band][1][pt][t1];
+        tokens[next][1].rate += token_costs[band][1][pt][t1];
         tokens[next][1].token = ZERO_TOKEN;
       }
       tokens[i][0].best_index = tokens[i][1].best_index = 0;
@@ -292,8 +288,8 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
   error1 = tokens[next][1].error;
   t0 = tokens[next][0].token;
   t1 = tokens[next][1].token;
-  rate0 += mb->token_costs[tx_size][type][ref][band][0][ctx][t0];
-  rate1 += mb->token_costs[tx_size][type][ref][band][0][ctx][t1];
+  rate0 += token_costs[band][0][ctx][t0];
+  rate1 += token_costs[band][0][ctx][t1];
   UPDATE_RD_COST();
   best = rd_cost1 < rd_cost0;
   final_eob = -1;
-- 
2.7.4