From eba9c762a1710e29967207a1df092ae0eca40313 Mon Sep 17 00:00:00 2001 From: Yunqing Wang Date: Tue, 2 Dec 2014 15:47:41 -0800 Subject: [PATCH] vp9_ethread: the tile-based multi-threaded encoder Currently, VP9 supports column-tile encoding, which allows a frame to be encoded in multiple column tiles independently. The number of column tiles are set by encoder option "--tile-columns". This provides a way to encode a frame in parallel. Based on previous set of patches, this patch implemented the tile- based multi-threaded encoder. Each thread processes one or more tiles. Usage: For HD clips: --tile-columns=2 --threads=1/2/3/4 While using 4 threads, tests showed that the encoder achieved 2.3X - 2.5X speedup at good-quality speed 3, and 2X speedup at realtime speed 5. Change-Id: Ied987f8f2618b1283a8643ad255e88341733c9d4 --- vp9/encoder/vp9_encodeframe.c | 59 +++++---- vp9/encoder/vp9_encodeframe.h | 5 + vp9/encoder/vp9_encoder.c | 20 ++++ vp9/encoder/vp9_encoder.h | 7 ++ vp9/encoder/vp9_ethread.c | 272 ++++++++++++++++++++++++++++++++++++++++++ vp9/encoder/vp9_ethread.h | 25 ++++ vp9/vp9_cx_iface.c | 1 + vp9/vp9cx.mk | 2 + 8 files changed, 366 insertions(+), 25 deletions(-) create mode 100644 vp9/encoder/vp9_ethread.c create mode 100644 vp9/encoder/vp9_ethread.h diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index a5adcbb..7317c63 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -36,6 +36,7 @@ #include "vp9/encoder/vp9_encodeframe.h" #include "vp9/encoder/vp9_encodemb.h" #include "vp9/encoder/vp9_encodemv.h" +#include "vp9/encoder/vp9_ethread.h" #include "vp9/encoder/vp9_extend.h" #include "vp9/encoder/vp9_pickmode.h" #include "vp9/encoder/vp9_rd.h" @@ -3422,7 +3423,7 @@ static int get_skip_encode_frame(const VP9_COMMON *cm, ThreadData *const td) { cm->show_frame; } -static void init_tile_data(VP9_COMP *cpi) { +void vp9_init_tile_data(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; const int tile_cols = 1 << cm->log2_tile_cols; const int tile_rows = 1 << cm->log2_tile_rows; @@ -3460,36 +3461,40 @@ static void init_tile_data(VP9_COMP *cpi) { } } +void vp9_encode_tile(VP9_COMP *cpi, ThreadData *td, + int tile_row, int tile_col) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + TileDataEnc *this_tile = + &cpi->tile_data[tile_row * tile_cols + tile_col]; + const TileInfo * const tile_info = &this_tile->tile_info; + TOKENEXTRA *tok = cpi->tile_tok[tile_row][tile_col]; + int mi_row; + + for (mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end; + mi_row += MI_BLOCK_SIZE) { + if (cpi->sf.use_nonrd_pick_mode) + encode_nonrd_sb_row(cpi, td, this_tile, mi_row, &tok); + else + encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok); + } + cpi->tok_count[tile_row][tile_col] = + (unsigned int)(tok - cpi->tile_tok[tile_row][tile_col]); + assert(tok - cpi->tile_tok[tile_row][tile_col] <= + allocated_tokens(*tile_info)); +} + static void encode_tiles(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; const int tile_cols = 1 << cm->log2_tile_cols; const int tile_rows = 1 << cm->log2_tile_rows; int tile_col, tile_row; - init_tile_data(cpi); + vp9_init_tile_data(cpi); - for (tile_row = 0; tile_row < tile_rows; ++tile_row) { - for (tile_col = 0; tile_col < tile_cols; ++tile_col) { - const TileInfo * const tile_info = - &cpi->tile_data[tile_row * tile_cols + tile_col].tile_info; - TOKENEXTRA *tok = cpi->tile_tok[tile_row][tile_col]; - int mi_row; - TileDataEnc *this_tile = - &cpi->tile_data[tile_row * tile_cols + tile_col]; - - for (mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end; - mi_row += MI_BLOCK_SIZE) { - if (cpi->sf.use_nonrd_pick_mode) - encode_nonrd_sb_row(cpi, &cpi->td, this_tile, mi_row, &tok); - else - encode_rd_sb_row(cpi, &cpi->td, this_tile, mi_row, &tok); - } - cpi->tok_count[tile_row][tile_col] = - (unsigned int)(tok - cpi->tile_tok[tile_row][tile_col]); - assert(tok - cpi->tile_tok[tile_row][tile_col] <= - allocated_tokens(*tile_info)); - } - } + for (tile_row = 0; tile_row < tile_rows; ++tile_row) + for (tile_col = 0; tile_col < tile_cols; ++tile_col) + vp9_encode_tile(cpi, &cpi->td, tile_row, tile_col); } #if CONFIG_FP_MB_STATS @@ -3596,7 +3601,11 @@ static void encode_frame_internal(VP9_COMP *cpi) { } #endif - encode_tiles(cpi); + // If allowed, encoding tiles in parallel with one thread handling one tile. + if (MIN(cpi->oxcf.max_threads, 1 << cm->log2_tile_cols) > 1) + vp9_encode_tiles_mt(cpi); + else + encode_tiles(cpi); vpx_usec_timer_mark(&emr_timer); cpi->time_encode_sb_row += vpx_usec_timer_elapsed(&emr_timer); diff --git a/vp9/encoder/vp9_encodeframe.h b/vp9/encoder/vp9_encodeframe.h index fd1c9aa..556f3a5 100644 --- a/vp9/encoder/vp9_encodeframe.h +++ b/vp9/encoder/vp9_encodeframe.h @@ -19,6 +19,7 @@ extern "C" { struct macroblock; struct yv12_buffer_config; struct VP9_COMP; +struct ThreadData; // Constants used in SOURCE_VAR_BASED_PARTITION #define VAR_HIST_MAX_BG_VAR 1000 @@ -33,6 +34,10 @@ void vp9_setup_src_planes(struct macroblock *x, void vp9_encode_frame(struct VP9_COMP *cpi); +void vp9_init_tile_data(struct VP9_COMP *cpi); +void vp9_encode_tile(struct VP9_COMP *cpi, struct ThreadData *td, + int tile_row, int tile_col); + #ifdef __cplusplus } // extern "C" #endif diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 28350ef..2dfa6b5 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -35,6 +35,7 @@ #include "vp9/encoder/vp9_context_tree.h" #include "vp9/encoder/vp9_encodeframe.h" #include "vp9/encoder/vp9_encodemv.h" +#include "vp9/encoder/vp9_ethread.h" #include "vp9/encoder/vp9_firstpass.h" #include "vp9/encoder/vp9_mbgraph.h" #include "vp9/encoder/vp9_encoder.h" @@ -1728,6 +1729,7 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf) { void vp9_remove_compressor(VP9_COMP *cpi) { VP9_COMMON *const cm = &cpi->common; unsigned int i; + int t; if (!cpi) return; @@ -1800,6 +1802,24 @@ void vp9_remove_compressor(VP9_COMP *cpi) { } #endif + for (t = 0; t < cpi->num_workers; ++t) { + VP9Worker *const worker = &cpi->workers[t]; + EncWorkerData *const thread_data = (EncWorkerData*)worker->data1; + + // Deallocate allocated threads. + vp9_get_worker_interface()->end(worker); + + // Deallocate allocated thread data. + if (t < cpi->num_workers - 1) { + vpx_free(thread_data->td->counts); + vp9_free_pc_tree(thread_data->td); + vpx_free(thread_data->td); + } + + vpx_free(worker->data1); + } + vpx_free(cpi->workers); + dealloc_compressor_data(cpi); for (i = 0; i < sizeof(cpi->mbgraph_stats) / diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index b75f491..7342f74 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -20,6 +20,7 @@ #include "vp9/common/vp9_ppflags.h" #include "vp9/common/vp9_entropymode.h" #include "vp9/common/vp9_onyxc_int.h" +#include "vp9/common/vp9_thread.h" #include "vp9/encoder/vp9_aq_cyclicrefresh.h" #include "vp9/encoder/vp9_context_tree.h" @@ -216,6 +217,8 @@ typedef struct VP9EncoderConfig { int tile_columns; int tile_rows; + int max_threads; + vpx_fixed_buf_t two_pass_stats_in; struct vpx_codec_pkt_list *output_pkt_list; @@ -442,6 +445,10 @@ typedef struct VP9_COMP { #if CONFIG_VP9_TEMPORAL_DENOISING VP9_DENOISER denoiser; #endif + + // Multi-threading + int num_workers; + VP9Worker *workers; } VP9_COMP; void vp9_initialize_enc(); diff --git a/vp9/encoder/vp9_ethread.c b/vp9/encoder/vp9_ethread.c new file mode 100644 index 0000000..daf3da4 --- /dev/null +++ b/vp9/encoder/vp9_ethread.c @@ -0,0 +1,272 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vp9/encoder/vp9_encodeframe.h" +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_ethread.h" + +static void accumulate_frame_counts(VP9_COMMON *cm, ThreadData *td) { + int i, j, k, l, m; + + for (i = 0; i < BLOCK_SIZE_GROUPS; i++) + for (j = 0; j < INTRA_MODES; j++) + cm->counts.y_mode[i][j] += td->counts->y_mode[i][j]; + + for (i = 0; i < INTRA_MODES; i++) + for (j = 0; j < INTRA_MODES; j++) + cm->counts.uv_mode[i][j] += td->counts->uv_mode[i][j]; + + for (i = 0; i < PARTITION_CONTEXTS; i++) + for (j = 0; j < PARTITION_TYPES; j++) + cm->counts.partition[i][j] += td->counts->partition[i][j]; + + for (i = 0; i < TX_SIZES; i++) + for (j = 0; j < PLANE_TYPES; j++) + for (k = 0; k < REF_TYPES; k++) + for (l = 0; l < COEF_BANDS; l++) + for (m = 0; m < COEFF_CONTEXTS; m++) + cm->counts.eob_branch[i][j][k][l][m] += + td->counts->eob_branch[i][j][k][l][m]; + // cm->counts.coef is only updated at frame level, so not need + // to accumulate it here. + // for (n = 0; n < UNCONSTRAINED_NODES + 1; n++) + // cm->counts.coef[i][j][k][l][m][n] += + // td->counts->coef[i][j][k][l][m][n]; + + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) + for (j = 0; j < SWITCHABLE_FILTERS; j++) + cm->counts.switchable_interp[i][j] += td->counts->switchable_interp[i][j]; + + for (i = 0; i < INTER_MODE_CONTEXTS; i++) + for (j = 0; j < INTER_MODES; j++) + cm->counts.inter_mode[i][j] += td->counts->inter_mode[i][j]; + + for (i = 0; i < INTRA_INTER_CONTEXTS; i++) + for (j = 0; j < 2; j++) + cm->counts.intra_inter[i][j] += td->counts->intra_inter[i][j]; + + for (i = 0; i < COMP_INTER_CONTEXTS; i++) + for (j = 0; j < 2; j++) + cm->counts.comp_inter[i][j] += td->counts->comp_inter[i][j]; + + for (i = 0; i < REF_CONTEXTS; i++) + for (j = 0; j < 2; j++) + for (k = 0; k < 2; k++) + cm->counts.single_ref[i][j][k] += td->counts->single_ref[i][j][k]; + + for (i = 0; i < REF_CONTEXTS; i++) + for (j = 0; j < 2; j++) + cm->counts.comp_ref[i][j] += td->counts->comp_ref[i][j]; + + for (i = 0; i < TX_SIZE_CONTEXTS; i++) { + for (j = 0; j < TX_SIZES; j++) + cm->counts.tx.p32x32[i][j] += td->counts->tx.p32x32[i][j]; + + for (j = 0; j < TX_SIZES - 1; j++) + cm->counts.tx.p16x16[i][j] += td->counts->tx.p16x16[i][j]; + + for (j = 0; j < TX_SIZES - 2; j++) + cm->counts.tx.p8x8[i][j] += td->counts->tx.p8x8[i][j]; + } + + for (i = 0; i < SKIP_CONTEXTS; i++) + for (j = 0; j < 2; j++) + cm->counts.skip[i][j] += td->counts->skip[i][j]; + + for (i = 0; i < MV_JOINTS; i++) + cm->counts.mv.joints[i] += td->counts->mv.joints[i]; + + for (k = 0; k < 2; k++) { + nmv_component_counts *comps = &cm->counts.mv.comps[k]; + nmv_component_counts *comps_t = &td->counts->mv.comps[k]; + + for (i = 0; i < 2; i++) { + comps->sign[i] += comps_t->sign[i]; + comps->class0_hp[i] += comps_t->class0_hp[i]; + comps->hp[i] += comps_t->hp[i]; + } + + for (i = 0; i < MV_CLASSES; i++) + comps->classes[i] += comps_t->classes[i]; + + for (i = 0; i < CLASS0_SIZE; i++) { + comps->class0[i] += comps_t->class0[i]; + for (j = 0; j < MV_FP_SIZE; j++) + comps->class0_fp[i][j] += comps_t->class0_fp[i][j]; + } + + for (i = 0; i < MV_OFFSET_BITS; i++) + for (j = 0; j < 2; j++) + comps->bits[i][j] += comps_t->bits[i][j]; + + for (i = 0; i < MV_FP_SIZE; i++) + comps->fp[i] += comps_t->fp[i]; + } +} + +static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) { + int i, j, k, l, m, n; + + for (i = 0; i < REFERENCE_MODES; i++) + td->rd_counts.comp_pred_diff[i] += td_t->rd_counts.comp_pred_diff[i]; + + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) + td->rd_counts.filter_diff[i] += td_t->rd_counts.filter_diff[i]; + + for (i = 0; i < TX_MODES; i++) + td->rd_counts.tx_select_diff[i] += td_t->rd_counts.tx_select_diff[i]; + + for (i = 0; i < TX_SIZES; i++) + for (j = 0; j < PLANE_TYPES; j++) + for (k = 0; k < REF_TYPES; k++) + for (l = 0; l < COEF_BANDS; l++) + for (m = 0; m < COEFF_CONTEXTS; m++) + for (n = 0; n < ENTROPY_TOKENS; n++) + td->rd_counts.coef_counts[i][j][k][l][m][n] += + td_t->rd_counts.coef_counts[i][j][k][l][m][n]; +} + +static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) { + VP9_COMP *const cpi = thread_data->cpi; + const VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + int t; + + (void) unused; + + for (t = thread_data->start; t < tile_rows * tile_cols; + t += cpi->num_workers) { + int tile_row = t / tile_cols; + int tile_col = t % tile_cols; + + vp9_encode_tile(cpi, thread_data->td, tile_row, tile_col); + } + + return 0; +} + +void vp9_encode_tiles_mt(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + const VP9WorkerInterface *const winterface = vp9_get_worker_interface(); + const int num_workers = MIN(cpi->oxcf.max_threads, tile_cols); + int i; + + vp9_init_tile_data(cpi); + + // Only run once to create threads and allocate thread data. + if (cpi->num_workers == 0) { + CHECK_MEM_ERROR(cm, cpi->workers, + vpx_malloc(num_workers * sizeof(*cpi->workers))); + + for (i = 0; i < num_workers; i++) { + VP9Worker *const worker = &cpi->workers[i]; + EncWorkerData *thread_data; + + ++cpi->num_workers; + + winterface->init(worker); + CHECK_MEM_ERROR(cm, worker->data1, + (EncWorkerData*)vpx_calloc(1, sizeof(EncWorkerData))); + thread_data = (EncWorkerData*)worker->data1; + + if (i < num_workers - 1) { + thread_data->cpi = cpi; + + // Allocate thread data. + CHECK_MEM_ERROR(cm, thread_data->td, + vpx_calloc(1, sizeof(*thread_data->td))); + // Set up pc_tree. + thread_data->td->leaf_tree = NULL; + thread_data->td->pc_tree = NULL; + vp9_setup_pc_tree(cm, thread_data->td); + + // Allocate frame counters in thread data. + CHECK_MEM_ERROR(cm, thread_data->td->counts, + vpx_calloc(1, sizeof(*thread_data->td->counts))); + + // Create threads + if (!winterface->reset(worker)) + vpx_internal_error(&cm->error, VPX_CODEC_ERROR, + "Tile encoder thread creation failed"); + } else { + // Main thread acts as a worker and uses the thread data in cpi. + thread_data->cpi = cpi; + thread_data->td = &cpi->td; + } + + // data2 is unused. + worker->data2 = NULL; + + winterface->sync(worker); + worker->hook = (VP9WorkerHook)enc_worker_hook; + } + } + + for (i = 0; i < num_workers; i++) { + VP9Worker *const worker = &cpi->workers[i]; + EncWorkerData *const thread_data = (EncWorkerData*)worker->data1; + + // Before encoding a frame, copy the thread data from cpi. + thread_data->td->mb = cpi->td.mb; + thread_data->td->rd_counts = cpi->td.rd_counts; + vpx_memcpy(thread_data->td->counts, &cpi->common.counts, + sizeof(cpi->common.counts)); + + // Handle use_nonrd_pick_mode case. + if (cpi->sf.use_nonrd_pick_mode) { + MACROBLOCK *const x = &thread_data->td->mb; + MACROBLOCKD *const xd = &x->e_mbd; + struct macroblock_plane *const p = x->plane; + struct macroblockd_plane *const pd = xd->plane; + PICK_MODE_CONTEXT *ctx = &thread_data->td->pc_root->none; + int j; + + for (j = 0; j < MAX_MB_PLANE; ++j) { + p[j].coeff = ctx->coeff_pbuf[j][0]; + p[j].qcoeff = ctx->qcoeff_pbuf[j][0]; + pd[j].dqcoeff = ctx->dqcoeff_pbuf[j][0]; + p[j].eobs = ctx->eobs_pbuf[j][0]; + } + } + } + + // Encode a frame + for (i = 0; i < num_workers; i++) { + VP9Worker *const worker = &cpi->workers[i]; + EncWorkerData *const thread_data = (EncWorkerData*)worker->data1; + + // Set the starting tile for each thread. + thread_data->start = i; + + if (i == num_workers - 1) + winterface->execute(worker); + else + winterface->launch(worker); + } + + // Encoding ends. + for (i = 0; i < num_workers; i++) { + VP9Worker *const worker = &cpi->workers[i]; + winterface->sync(worker); + } + + for (i = 0; i < num_workers; i++) { + VP9Worker *const worker = &cpi->workers[i]; + EncWorkerData *const thread_data = (EncWorkerData*)worker->data1; + + // Accumulate counters. + if (i < num_workers - 1) { + accumulate_frame_counts(&cpi->common, thread_data->td); + accumulate_rd_opt(&cpi->td, thread_data->td); + } + } +} diff --git a/vp9/encoder/vp9_ethread.h b/vp9/encoder/vp9_ethread.h new file mode 100644 index 0000000..e87c50b --- /dev/null +++ b/vp9/encoder/vp9_ethread.h @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_ENCODER_VP9_ETHREAD_H_ +#define VP9_ENCODER_VP9_ETHREAD_H_ + +struct VP9_COMP; +struct ThreadData; + +typedef struct EncWorkerData { + struct VP9_COMP *cpi; + struct ThreadData *td; + int start; +} EncWorkerData; + +void vp9_encode_tiles_mt(struct VP9_COMP *cpi); + +#endif // VP9_ENCODER_VP9_ETHREAD_H_ diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index e86df22..b9fb814 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -355,6 +355,7 @@ static vpx_codec_err_t set_encoder_config( const struct vp9_extracfg *extra_cfg) { const int is_vbr = cfg->rc_end_usage == VPX_VBR; oxcf->profile = cfg->g_profile; + oxcf->max_threads = (int)cfg->g_threads; oxcf->width = cfg->g_w; oxcf->height = cfg->g_h; oxcf->bit_depth = cfg->g_bit_depth; diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index 50d2365..8c9552c 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -31,6 +31,8 @@ VP9_CX_SRCS-yes += encoder/vp9_encodeframe.c VP9_CX_SRCS-yes += encoder/vp9_encodeframe.h VP9_CX_SRCS-yes += encoder/vp9_encodemb.c VP9_CX_SRCS-yes += encoder/vp9_encodemv.c +VP9_CX_SRCS-yes += encoder/vp9_ethread.h +VP9_CX_SRCS-yes += encoder/vp9_ethread.c VP9_CX_SRCS-yes += encoder/vp9_extend.c VP9_CX_SRCS-yes += encoder/vp9_firstpass.c VP9_CX_SRCS-yes += encoder/vp9_block.h -- 2.7.4