From 26b6318de83761dd268a589f0b1324153e9d0923 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Mon, 1 Jul 2013 10:40:00 -0700 Subject: [PATCH] Make get_coef_context() branchless. This should significantly speed up cost_coeffs(). Basically, the patch pads the neighbour arrays by one item to avoid an eob check in get_coef_context(), then it populates each col/row scan and left/top edge coefficient with the same neighbour twice - this avoids a single/double context branch in get_coef_context(). Lastly, it populates neighbour arrays in pixel order (rather than scan order), so we don't have to dereference the scantable to get the correct neighbours. Total encoding time of first 50 frames of bus (speed 0) at 1500kbps goes from 2min10.1 to 2min5.3, i.e. a 2.6% overall speed increase. Change-Id: I42bcd2210fd7bec03767ef0e2945a665b851df56 --- vp9/common/vp9_entropy.c | 107 ++++++++++++++++++++----------------------- vp9/common/vp9_entropy.h | 26 +++-------- vp9/decoder/vp9_detokenize.c | 14 ++---- vp9/encoder/vp9_encodemb.c | 18 +++----- vp9/encoder/vp9_rdopt.c | 11 ++--- vp9/encoder/vp9_tokenize.c | 7 ++- 6 files changed, 75 insertions(+), 108 deletions(-) diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c index ca8b879..f5d5c1a 100644 --- a/vp9/common/vp9_entropy.c +++ b/vp9/common/vp9_entropy.c @@ -461,25 +461,25 @@ void vp9_default_coef_probs(VP9_COMMON *pc) { // for each position in raster scan order. // -1 indicates the neighbor does not exist. 
DECLARE_ALIGNED(16, int16_t, - vp9_default_scan_4x4_neighbors[16 * MAX_NEIGHBORS]); + vp9_default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]); DECLARE_ALIGNED(16, int16_t, - vp9_col_scan_4x4_neighbors[16 * MAX_NEIGHBORS]); + vp9_col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]); DECLARE_ALIGNED(16, int16_t, - vp9_row_scan_4x4_neighbors[16 * MAX_NEIGHBORS]); + vp9_row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]); DECLARE_ALIGNED(16, int16_t, - vp9_col_scan_8x8_neighbors[64 * MAX_NEIGHBORS]); + vp9_col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]); DECLARE_ALIGNED(16, int16_t, - vp9_row_scan_8x8_neighbors[64 * MAX_NEIGHBORS]); + vp9_row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]); DECLARE_ALIGNED(16, int16_t, - vp9_default_scan_8x8_neighbors[64 * MAX_NEIGHBORS]); + vp9_default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]); DECLARE_ALIGNED(16, int16_t, - vp9_col_scan_16x16_neighbors[256 * MAX_NEIGHBORS]); + vp9_col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]); DECLARE_ALIGNED(16, int16_t, - vp9_row_scan_16x16_neighbors[256 * MAX_NEIGHBORS]); + vp9_row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]); DECLARE_ALIGNED(16, int16_t, - vp9_default_scan_16x16_neighbors[256 * MAX_NEIGHBORS]); + vp9_default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]); DECLARE_ALIGNED(16, int16_t, - vp9_default_scan_32x32_neighbors[1024 * MAX_NEIGHBORS]); + vp9_default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]); DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_4x4[16]); DECLARE_ALIGNED(16, int16_t, vp9_col_iscan_4x4[16]); @@ -504,15 +504,17 @@ static int find_in_scan(const int16_t *scan, int l, int idx) { } static void init_scan_neighbors(const int16_t *scan, int16_t *iscan, - int l, int16_t *neighbors, - int max_neighbors) { + int l, int16_t *neighbors) { int l2 = l * l; int n, i, j; - for (n = 0; n < l2; n++) { + // dc doesn't use this type of prediction + neighbors[MAX_NEIGHBORS * 0 + 0] = 0; + neighbors[MAX_NEIGHBORS * 0 + 1] = 0; + iscan[0] = find_in_scan(scan, l, 0); + for (n = 1; n < l2; n++) { int rc = scan[n]; iscan[n] = 
find_in_scan(scan, l, n); - assert(max_neighbors == MAX_NEIGHBORS); i = rc / l; j = rc % l; if (i > 0 && j > 0) { @@ -524,93 +526,84 @@ static void init_scan_neighbors(const int16_t *scan, // Therefore, if we use ADST/DCT, prefer the DCT neighbor coeff // as a context. If ADST or DCT is used in both directions, we // use the combination of the two as a context. - int a = find_in_scan(scan, l, (i - 1) * l + j); - int b = find_in_scan(scan, l, i * l + j - 1); + int a = (i - 1) * l + j; + int b = i * l + j - 1; if (scan == vp9_col_scan_4x4 || scan == vp9_col_scan_8x8 || scan == vp9_col_scan_16x16) { - neighbors[max_neighbors * n + 0] = a; - neighbors[max_neighbors * n + 1] = -1; + // in the col/row scan cases (as well as left/top edge cases), we set + // both contexts to the same value, so we can branchlessly do a+b+1>>1 + // which automatically becomes a if a == b + neighbors[MAX_NEIGHBORS * n + 0] = + neighbors[MAX_NEIGHBORS * n + 1] = a; } else if (scan == vp9_row_scan_4x4 || scan == vp9_row_scan_8x8 || scan == vp9_row_scan_16x16) { - neighbors[max_neighbors * n + 0] = b; - neighbors[max_neighbors * n + 1] = -1; + neighbors[MAX_NEIGHBORS * n + 0] = + neighbors[MAX_NEIGHBORS * n + 1] = b; } else { - neighbors[max_neighbors * n + 0] = a; - neighbors[max_neighbors * n + 1] = b; + neighbors[MAX_NEIGHBORS * n + 0] = a; + neighbors[MAX_NEIGHBORS * n + 1] = b; } } else if (i > 0) { - neighbors[max_neighbors * n + 0] = find_in_scan(scan, l, (i - 1) * l + j); - neighbors[max_neighbors * n + 1] = -1; - } else if (j > 0) { - neighbors[max_neighbors * n + 0] = - find_in_scan(scan, l, i * l + j - 1); - neighbors[max_neighbors * n + 1] = -1; + neighbors[MAX_NEIGHBORS * n + 0] = + neighbors[MAX_NEIGHBORS * n + 1] = (i - 1) * l + j; } else { - assert(n == 0); - // dc predictor doesn't use previous tokens - neighbors[max_neighbors * n + 0] = -1; + assert(j > 0); + neighbors[MAX_NEIGHBORS * n + 0] = + neighbors[MAX_NEIGHBORS * n + 1] = i * l + j - 1; } - 
assert(neighbors[max_neighbors * n + 0] < n); + assert(iscan[neighbors[MAX_NEIGHBORS * n + 0]] < n); } + // one padding item so we don't have to add branches in code to handle + // calls to get_coef_context() for the token after the final dc token + neighbors[MAX_NEIGHBORS * l2 + 0] = 0; + neighbors[MAX_NEIGHBORS * l2 + 1] = 0; } void vp9_init_neighbors() { init_scan_neighbors(vp9_default_scan_4x4, vp9_default_iscan_4x4, 4, - vp9_default_scan_4x4_neighbors, MAX_NEIGHBORS); + vp9_default_scan_4x4_neighbors); init_scan_neighbors(vp9_row_scan_4x4, vp9_row_iscan_4x4, 4, - vp9_row_scan_4x4_neighbors, MAX_NEIGHBORS); + vp9_row_scan_4x4_neighbors); init_scan_neighbors(vp9_col_scan_4x4, vp9_col_iscan_4x4, 4, - vp9_col_scan_4x4_neighbors, MAX_NEIGHBORS); + vp9_col_scan_4x4_neighbors); init_scan_neighbors(vp9_default_scan_8x8, vp9_default_iscan_8x8, 8, - vp9_default_scan_8x8_neighbors, MAX_NEIGHBORS); + vp9_default_scan_8x8_neighbors); init_scan_neighbors(vp9_row_scan_8x8, vp9_row_iscan_8x8, 8, - vp9_row_scan_8x8_neighbors, MAX_NEIGHBORS); + vp9_row_scan_8x8_neighbors); init_scan_neighbors(vp9_col_scan_8x8, vp9_col_iscan_8x8, 8, - vp9_col_scan_8x8_neighbors, MAX_NEIGHBORS); + vp9_col_scan_8x8_neighbors); init_scan_neighbors(vp9_default_scan_16x16, vp9_default_iscan_16x16, 16, - vp9_default_scan_16x16_neighbors, MAX_NEIGHBORS); + vp9_default_scan_16x16_neighbors); init_scan_neighbors(vp9_row_scan_16x16, vp9_row_iscan_16x16, 16, - vp9_row_scan_16x16_neighbors, MAX_NEIGHBORS); + vp9_row_scan_16x16_neighbors); init_scan_neighbors(vp9_col_scan_16x16, vp9_col_iscan_16x16, 16, - vp9_col_scan_16x16_neighbors, MAX_NEIGHBORS); + vp9_col_scan_16x16_neighbors); init_scan_neighbors(vp9_default_scan_32x32, vp9_default_iscan_32x32, 32, - vp9_default_scan_32x32_neighbors, MAX_NEIGHBORS); + vp9_default_scan_32x32_neighbors); } -const int16_t *vp9_get_coef_neighbors_handle(const int16_t *scan, int *pad) { +const int16_t *vp9_get_coef_neighbors_handle(const int16_t *scan) { if (scan == 
vp9_default_scan_4x4) { - *pad = MAX_NEIGHBORS; return vp9_default_scan_4x4_neighbors; } else if (scan == vp9_row_scan_4x4) { - *pad = MAX_NEIGHBORS; return vp9_row_scan_4x4_neighbors; } else if (scan == vp9_col_scan_4x4) { - *pad = MAX_NEIGHBORS; return vp9_col_scan_4x4_neighbors; } else if (scan == vp9_default_scan_8x8) { - *pad = MAX_NEIGHBORS; return vp9_default_scan_8x8_neighbors; } else if (scan == vp9_row_scan_8x8) { - *pad = 2; return vp9_row_scan_8x8_neighbors; } else if (scan == vp9_col_scan_8x8) { - *pad = 2; return vp9_col_scan_8x8_neighbors; } else if (scan == vp9_default_scan_16x16) { - *pad = MAX_NEIGHBORS; return vp9_default_scan_16x16_neighbors; } else if (scan == vp9_row_scan_16x16) { - *pad = 2; return vp9_row_scan_16x16_neighbors; } else if (scan == vp9_col_scan_16x16) { - *pad = 2; return vp9_col_scan_16x16_neighbors; - } else if (scan == vp9_default_scan_32x32) { - *pad = MAX_NEIGHBORS; - return vp9_default_scan_32x32_neighbors; } else { - assert(0); - return NULL; + assert(scan == vp9_default_scan_32x32); + return vp9_default_scan_32x32_neighbors; } } diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h index 5937efa..68c36ea 100644 --- a/vp9/common/vp9_entropy.h +++ b/vp9/common/vp9_entropy.h @@ -166,28 +166,14 @@ static int get_coef_band(const uint8_t * band_translate, int coef_index) { } #define MAX_NEIGHBORS 2 -static INLINE int get_coef_context(const int16_t *scan, - const int16_t *neighbors, - int nb_pad, uint8_t *token_cache, - int c, int l) { - int eob = l; - assert(nb_pad == MAX_NEIGHBORS); - if (c == eob) { - return 0; - } else { - int ctx; - assert(neighbors[MAX_NEIGHBORS * c + 0] >= 0); - if (neighbors[MAX_NEIGHBORS * c + 1] >= 0) { - ctx = (1 + token_cache[scan[neighbors[MAX_NEIGHBORS * c + 0]]] + - token_cache[scan[neighbors[MAX_NEIGHBORS * c + 1]]]) >> 1; - } else { - ctx = token_cache[scan[neighbors[MAX_NEIGHBORS * c + 0]]]; - } - return ctx; - } +static INLINE int get_coef_context(const int16_t *neighbors, + 
uint8_t *token_cache, + int c) { + return (1 + token_cache[neighbors[MAX_NEIGHBORS * c + 0]] + + token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1; } -const int16_t *vp9_get_coef_neighbors_handle(const int16_t *scan, int *pad); +const int16_t *vp9_get_coef_neighbors_handle(const int16_t *scan); // 128 lists of probabilities are stored for the following ONE node probs: diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c index 81403a4..76889c4 100644 --- a/vp9/decoder/vp9_detokenize.c +++ b/vp9/decoder/vp9_detokenize.c @@ -97,7 +97,7 @@ static int decode_coefs(FRAME_CONTEXT *fc, const MACROBLOCKD *xd, TX_SIZE txfm_size, const int16_t *dq, ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L) { ENTROPY_CONTEXT above_ec, left_ec; - int pt, c = 0, pad, default_eob; + int pt, c = 0; int band; vp9_prob (*coef_probs)[PREV_COEF_CONTEXTS][UNCONSTRAINED_NODES]; vp9_prob coef_probs_full[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES]; @@ -130,7 +130,6 @@ static int decode_coefs(FRAME_CONTEXT *fc, const MACROBLOCKD *xd, scan = get_scan_4x4(tx_type); above_ec = A[0] != 0; left_ec = L[0] != 0; - default_eob = 16; band_translate = vp9_coefband_trans_4x4; break; } @@ -140,7 +139,6 @@ static int decode_coefs(FRAME_CONTEXT *fc, const MACROBLOCKD *xd, scan = get_scan_8x8(tx_type); above_ec = (A[0] + A[1]) != 0; left_ec = (L[0] + L[1]) != 0; - default_eob = 64; band_translate = vp9_coefband_trans_8x8plus; break; } @@ -150,7 +148,6 @@ static int decode_coefs(FRAME_CONTEXT *fc, const MACROBLOCKD *xd, scan = get_scan_16x16(tx_type); above_ec = (A[0] + A[1] + A[2] + A[3]) != 0; left_ec = (L[0] + L[1] + L[2] + L[3]) != 0; - default_eob = 256; band_translate = vp9_coefband_trans_8x8plus; break; } @@ -158,13 +155,12 @@ static int decode_coefs(FRAME_CONTEXT *fc, const MACROBLOCKD *xd, scan = vp9_default_scan_32x32; above_ec = (A[0] + A[1] + A[2] + A[3] + A[4] + A[5] + A[6] + A[7]) != 0; left_ec = (L[0] + L[1] + L[2] + L[3] + L[4] + L[5] + L[6] + L[7]) != 0; - default_eob = 1024; 
band_translate = vp9_coefband_trans_8x8plus; break; } pt = combine_entropy_contexts(above_ec, left_ec); - nb = vp9_get_coef_neighbors_handle(scan, &pad); + nb = vp9_get_coef_neighbors_handle(scan); while (1) { int val; @@ -172,8 +168,7 @@ static int decode_coefs(FRAME_CONTEXT *fc, const MACROBLOCKD *xd, if (c >= seg_eob) break; if (c) - pt = get_coef_context(scan, nb, pad, token_cache, - c, default_eob); + pt = get_coef_context(nb, token_cache, c); band = get_coef_band(band_translate, c); prob = coef_probs[band][pt]; #if !CONFIG_BALANCED_COEFTREE @@ -186,8 +181,7 @@ SKIP_START: if (c >= seg_eob) break; if (c) - pt = get_coef_context(scan, nb, pad, token_cache, - c, default_eob); + pt = get_coef_context(nb, token_cache, c); band = get_coef_band(band_translate, c); prob = coef_probs[band][pt]; diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 253f1ae..dabe3a4 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -112,11 +112,10 @@ static const int plane_rd_mult[4] = { static int trellis_get_coeff_context(const int16_t *scan, const int16_t *nb, int idx, int token, - uint8_t *token_cache, - int pad, int l) { + uint8_t *token_cache) { int bak = token_cache[scan[idx]], pt; token_cache[scan[idx]] = vp9_pt_energy_class[token]; - pt = get_coef_context(scan, nb, pad, token_cache, idx + 1, l); + pt = get_coef_context(nb, token_cache, idx + 1); token_cache[scan[idx]] = bak; return pt; } @@ -141,7 +140,7 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb, int best, band, pt; PLANE_TYPE type = xd->plane[plane].plane_type; int err_mult = plane_rd_mult[type]; - int default_eob, pad; + int default_eob; const int16_t *scan, *nb; const int mul = 1 + (tx_size == TX_32X32); uint8_t token_cache[1024]; @@ -201,7 +200,7 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb, for (i = 0; i < eob; i++) token_cache[scan[i]] = vp9_pt_energy_class[vp9_dct_value_tokens_ptr[ qcoeff_ptr[scan[i]]].token]; - nb = 
vp9_get_coef_neighbors_handle(scan, &pad); + nb = vp9_get_coef_neighbors_handle(scan); for (i = eob; i-- > i0;) { int base_bits, d2, dx; @@ -220,8 +219,7 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb, /* Consider both possible successor states. */ if (next < default_eob) { band = get_coef_band(band_translate, i + 1); - pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache, - pad, default_eob); + pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache); rate0 += mb->token_costs[tx_size][type][ref][0][band][pt] [tokens[next][0].token]; @@ -273,14 +271,12 @@ static void optimize_b(VP9_COMMON *const cm, MACROBLOCK *mb, if (next < default_eob) { band = get_coef_band(band_translate, i + 1); if (t0 != DCT_EOB_TOKEN) { - pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache, - pad, default_eob); + pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache); rate0 += mb->token_costs[tx_size][type][ref][!x][band][pt] [tokens[next][0].token]; } if (t1 != DCT_EOB_TOKEN) { - pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache, - pad, default_eob); + pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache); rate1 += mb->token_costs[tx_size][type][ref][!x][band][pt] [tokens[next][1].token]; } diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 0cf4130..a34857d 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -304,7 +304,7 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb, MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; int pt; int c = 0; - int cost = 0, pad; + int cost = 0; const int16_t *scan, *nb; const int eob = xd->plane[plane].eobs[block]; const int16_t *qcoeff_ptr = BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16); @@ -314,7 +314,7 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb, ENTROPY_CONTEXT above_ec, left_ec; TX_TYPE tx_type = DCT_DCT; const int segment_id = xd->mode_info_context->mbmi.segment_id; - int seg_eob, default_eob; + int seg_eob; 
uint8_t token_cache[1024]; const uint8_t * band_translate; @@ -372,8 +372,7 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb, assert(eob <= seg_eob); pt = combine_entropy_contexts(above_ec, left_ec); - nb = vp9_get_coef_neighbors_handle(scan, &pad); - default_eob = seg_eob; + nb = vp9_get_coef_neighbors_handle(scan); if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) seg_eob = 0; @@ -402,7 +401,7 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb, v = qcoeff_ptr[rc]; t = vp9_dct_value_tokens_ptr[v].token; - pt = get_coef_context(scan, nb, pad, token_cache, c, default_eob); + pt = get_coef_context(nb, token_cache, c); cost += token_costs[!prev_t][band][pt][t] + vp9_dct_value_cost_ptr[v]; token_cache[rc] = vp9_pt_energy_class[t]; prev_t = t; @@ -410,7 +409,7 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb, // eob token if (c < seg_eob) { - pt = get_coef_context(scan, nb, pad, token_cache, c, default_eob); + pt = get_coef_context(nb, token_cache, c); cost += token_costs[0][get_coef_band(band_translate, c)][pt] [DCT_EOB_TOKEN]; } diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c index 91c2a14..ee129a0 100644 --- a/vp9/encoder/vp9_tokenize.c +++ b/vp9/encoder/vp9_tokenize.c @@ -123,7 +123,7 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize, const int loff = (off >> mod) << tx_size; ENTROPY_CONTEXT *A = xd->plane[plane].above_context + aoff; ENTROPY_CONTEXT *L = xd->plane[plane].left_context + loff; - int seg_eob, default_eob, pad; + int seg_eob; const int segment_id = mbmi->segment_id; const int16_t *scan, *nb; vp9_coeff_count *counts; @@ -178,8 +178,7 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize, } pt = combine_entropy_contexts(above_ec, left_ec); - nb = vp9_get_coef_neighbors_handle(scan, &pad); - default_eob = seg_eob; + nb = vp9_get_coef_neighbors_handle(scan); if (vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)) seg_eob = 0; @@ 
-191,7 +190,7 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE_TYPE bsize, int v = 0; rc = scan[c]; if (c) - pt = get_coef_context(scan, nb, pad, token_cache, c, default_eob); + pt = get_coef_context(nb, token_cache, c); if (c < eob) { v = qcoeff_ptr[rc]; assert(-DCT_MAX_VALUE <= v && v < DCT_MAX_VALUE); -- 2.7.4