From 8b0c11c3588963fa02be0cad36a6a23cdb748cf9 Mon Sep 17 00:00:00 2001 From: Ranjit Kumar Tulabandu Date: Tue, 27 Dec 2016 18:45:43 +0530 Subject: [PATCH] Multi-threading of first pass stats collection (yunqingwang) 1. Rebased the patch. Incorporated recent first pass changes. 2. Turned on the first pass unit test. Change-Id: Ia2f7ba8152d0b6dd6bf8efb9dfaf505ba7d8edee --- test/vp9_ethread_test.cc | 9 +- vp9/encoder/vp9_block.h | 4 + vp9/encoder/vp9_encodemb.c | 7 +- vp9/encoder/vp9_encoder.c | 21 + vp9/encoder/vp9_encoder.h | 46 ++ vp9/encoder/vp9_ethread.c | 324 +++++++++++- vp9/encoder/vp9_ethread.h | 35 ++ vp9/encoder/vp9_firstpass.c | 1120 +++++++++++++++++++++++----------------- vp9/encoder/vp9_firstpass.h | 47 ++ vp9/encoder/vp9_job_queue.h | 46 ++ vp9/encoder/vp9_multi_thread.c | 282 ++++++++++ vp9/encoder/vp9_multi_thread.h | 38 ++ vp9/vp9_cx_iface.c | 13 + vp9/vp9cx.mk | 3 + vpx/vp8cx.h | 11 + vpxenc.c | 5 + 16 files changed, 1496 insertions(+), 515 deletions(-) create mode 100644 vp9/encoder/vp9_job_queue.h create mode 100644 vp9/encoder/vp9_multi_thread.c create mode 100644 vp9/encoder/vp9_multi_thread.h diff --git a/test/vp9_ethread_test.cc b/test/vp9_ethread_test.cc index ee52256..d779930 100644 --- a/test/vp9_ethread_test.cc +++ b/test/vp9_ethread_test.cc @@ -82,9 +82,8 @@ class VPxFirstPassEncoderThreadTest encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 0); // For now, new_mt_mode only works for 2-pass encoding. - // Enable this once the fp mt patch is checked in. 
- // if (encoding_mode_ == ::libvpx_test::kTwoPassGood) - // encoder->Control(VP9E_SET_NEW_MT, new_mt_mode_); + if (encoding_mode_ == ::libvpx_test::kTwoPassGood) + encoder->Control(VP9E_SET_NEW_MT, new_mt_mode_); encoder_initialized_ = true; } @@ -131,7 +130,7 @@ static void compare_fp_stats(vpx_fixed_buf_t *fp_stats) { for (j = 0; j < kDbl; ++j) { EXPECT_LE(fabs(*frame_stats1 - *frame_stats2), - fabs(*frame_stats1) / 1000.0); + fabs(*frame_stats1) / 10000.0); frame_stats1++; frame_stats2++; } @@ -146,7 +145,7 @@ static void compare_fp_stats(vpx_fixed_buf_t *fp_stats) { } TEST_P(VPxFirstPassEncoderThreadTest, FirstPassStatsTest) { - ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 50); + ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60); first_pass_only_ = 1; cfg_.rc_target_bitrate = 1000; diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index 0d5075c..91d07e3 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -131,6 +131,10 @@ struct macroblock { int use_lp32x32fdct; int skip_encode; + // In first pass, intra prediction is done based on source pixels + // at tile boundaries + int fp_src_pred; + // use fast quantization process int quant_fp; diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 2cb137d..1dc8d34 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -773,9 +773,10 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, } } - vp9_predict_intra_block(xd, bwl, tx_size, mode, x->skip_encode ? src : dst, - x->skip_encode ? src_stride : dst_stride, dst, - dst_stride, col, row, plane); + vp9_predict_intra_block( + xd, bwl, tx_size, mode, (x->skip_encode || x->fp_src_pred) ? src : dst, + (x->skip_encode || x->fp_src_pred) ? 
src_stride : dst_stride, dst, + dst_stride, col, row, plane); #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 26326fc..5dbe62d 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -50,6 +50,7 @@ #include "vp9/encoder/vp9_ethread.h" #include "vp9/encoder/vp9_firstpass.h" #include "vp9/encoder/vp9_mbgraph.h" +#include "vp9/encoder/vp9_multi_thread.h" #include "vp9/encoder/vp9_noise_estimate.h" #include "vp9/encoder/vp9_picklpf.h" #include "vp9/encoder/vp9_ratectrl.h" @@ -1563,6 +1564,13 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) { #if CONFIG_VP9_HIGHBITDEPTH highbd_set_var_fns(cpi); #endif + + // Enable multi-threading for first pass. + cpi->new_mt = 0; + if (((cpi->oxcf.mode == GOOD || cpi->oxcf.mode == BEST) && + cpi->oxcf.speed < 5 && cpi->oxcf.pass == 1) && + cpi->oxcf.new_mt) + cpi->new_mt = 1; } #ifndef M_LOG2_E @@ -1719,6 +1727,12 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf, } #endif +#if ENABLE_MT_BIT_MATCH + CHECK_MEM_ERROR( + cm, cpi->twopass.fp_mb_float_stats, + vpx_calloc(cm->MBs * sizeof(*cpi->twopass.fp_mb_float_stats), 1)); +#endif + cpi->refresh_alt_ref_frame = 0; cpi->multi_arf_last_grp_enabled = 0; @@ -2076,6 +2090,7 @@ void vp9_remove_compressor(VP9_COMP *cpi) { } vpx_free(cpi->tile_thr_data); vpx_free(cpi->workers); + vp9_row_mt_mem_dealloc(cpi); if (cpi->num_workers > 1) { vp9_loop_filter_dealloc(&cpi->lf_row_sync); @@ -2098,6 +2113,11 @@ void vp9_remove_compressor(VP9_COMP *cpi) { } #endif +#if ENABLE_MT_BIT_MATCH + vpx_free(cpi->twopass.fp_mb_float_stats); + cpi->twopass.fp_mb_float_stats = NULL; +#endif + vp9_remove_common(cm); vp9_free_ref_frame_buffers(cm->buffer_pool); #if CONFIG_VP9_POSTPROC @@ -4802,6 +4822,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, for (i = 0; i < MAX_REF_FRAMES; ++i) cpi->scaled_ref_idx[i] = INVALID_IDX; } + 
cpi->td.mb.fp_src_pred = 0; if (oxcf->pass == 1 && (!cpi->use_svc || is_two_pass_svc(cpi))) { const int lossless = is_lossless_requested(oxcf); #if CONFIG_VP9_HIGHBITDEPTH diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 00552e1..e1046f1 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -33,7 +33,9 @@ #include "vp9/encoder/vp9_aq_cyclicrefresh.h" #include "vp9/encoder/vp9_context_tree.h" #include "vp9/encoder/vp9_encodemb.h" +#include "vp9/encoder/vp9_ethread.h" #include "vp9/encoder/vp9_firstpass.h" +#include "vp9/encoder/vp9_job_queue.h" #include "vp9/encoder/vp9_lookahead.h" #include "vp9/encoder/vp9_mbgraph.h" #include "vp9/encoder/vp9_mcomp.h" @@ -256,6 +258,8 @@ typedef struct VP9EncoderConfig { int render_width; int render_height; VP9E_TEMPORAL_LAYERING_MODE temporal_layering_mode; + + int new_mt; } VP9EncoderConfig; static INLINE int is_lossless_requested(const VP9EncoderConfig *cfg) { @@ -269,8 +273,34 @@ typedef struct TileDataEnc { int mode_map[BLOCK_SIZES][MAX_MODES]; int m_search_count; int ex_search_count; + FIRSTPASS_DATA fp_data; + VP9RowMTSync row_mt_sync; } TileDataEnc; +typedef struct RowMTInfo { + JobQueueHandle job_queue_hdl; +#if CONFIG_MULTITHREAD + pthread_mutex_t job_mutex; +#endif +} RowMTInfo; + +typedef struct MultiThreadHandle { + int allocated_tile_rows; + int allocated_tile_cols; + int allocated_vert_unit_rows; + + // Frame level params + int num_tile_vert_sbs[MAX_NUM_TILE_ROWS]; + + // Job Queue structure and handles + JobQueue *job_queue; + + int jobs_per_tile_col; + + RowMTInfo row_mt_info[MAX_NUM_TILE_COLS]; + int thread_id_to_tile_id[MAX_NUM_THREADS]; // Mapping of threads to tiles +} MultiThreadHandle; + typedef struct RD_COUNTS { vp9_coeff_count coef_counts[TX_SIZES][PLANE_TYPES]; int64_t comp_pred_diff[REFERENCE_MODES]; @@ -629,6 +659,10 @@ typedef struct VP9_COMP { int keep_level_stats; Vp9LevelInfo level_info; + MultiThreadHandle multi_thread_ctxt; + void 
(*row_mt_sync_read_ptr)(VP9RowMTSync *const, int, int); + void (*row_mt_sync_write_ptr)(VP9RowMTSync *const, int, int, const int); + int new_mt; // Previous Partition Info BLOCK_SIZE *prev_partition; @@ -808,6 +842,18 @@ static INLINE int *cond_cost_list(const struct VP9_COMP *cpi, int *cost_list) { return cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? cost_list : NULL; } +static INLINE int get_num_vert_units(TileInfo tile, int shift) { + int num_vert_units = + (tile.mi_row_end - tile.mi_row_start + (1 << shift) - 1) >> shift; + return num_vert_units; +} + +static INLINE int get_num_cols(TileInfo tile, int shift) { + int num_cols = + (tile.mi_col_end - tile.mi_col_start + (1 << shift) - 1) >> shift; + return num_cols; +} + static INLINE int get_level_index(VP9_LEVEL level) { int i; for (i = 0; i < VP9_LEVELS; ++i) { diff --git a/vp9/encoder/vp9_ethread.c b/vp9/encoder/vp9_ethread.c index f4f7c7b..6cc103c 100644 --- a/vp9/encoder/vp9_ethread.c +++ b/vp9/encoder/vp9_ethread.c @@ -11,6 +11,8 @@ #include "vp9/encoder/vp9_encodeframe.h" #include "vp9/encoder/vp9_encoder.h" #include "vp9/encoder/vp9_ethread.h" +#include "vp9/encoder/vp9_firstpass.h" +#include "vp9/encoder/vp9_multi_thread.h" #include "vpx_dsp/vpx_dsp_common.h" static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) { @@ -64,15 +66,11 @@ static int get_max_tile_cols(VP9_COMP *cpi) { return (1 << log2_tile_cols); } -void vp9_encode_tiles_mt(VP9_COMP *cpi) { +static void create_enc_workers(VP9_COMP *cpi, int num_workers) { VP9_COMMON *const cm = &cpi->common; - const int tile_cols = 1 << cm->log2_tile_cols; const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); - const int num_workers = VPXMIN(cpi->oxcf.max_threads, tile_cols); int i; - vp9_init_tile_data(cpi); - // Only run once to create threads and allocate thread data. 
if (cpi->num_workers == 0) { int allocated_workers = num_workers; @@ -123,19 +121,57 @@ void vp9_encode_tiles_mt(VP9_COMP *cpi) { thread_data->cpi = cpi; thread_data->td = &cpi->td; } - winterface->sync(worker); } } +} + +static void launch_enc_workers(VP9_COMP *cpi, VPxWorkerHook hook, void *data2, + int num_workers) { + const VPxWorkerInterface *const winterface = vpx_get_worker_interface(); + int i; for (i = 0; i < num_workers; i++) { VPxWorker *const worker = &cpi->workers[i]; - EncWorkerData *thread_data; - - worker->hook = (VPxWorkerHook)enc_worker_hook; + worker->hook = (VPxWorkerHook)hook; worker->data1 = &cpi->tile_thr_data[i]; - worker->data2 = NULL; - thread_data = (EncWorkerData *)worker->data1; + worker->data2 = data2; + } + + // Encode a frame + for (i = 0; i < num_workers; i++) { + VPxWorker *const worker = &cpi->workers[i]; + EncWorkerData *const thread_data = (EncWorkerData *)worker->data1; + + // Set the starting tile for each thread. + thread_data->start = i; + + if (i == cpi->num_workers - 1) + winterface->execute(worker); + else + winterface->launch(worker); + } + + // Encoding ends. + for (i = 0; i < num_workers; i++) { + VPxWorker *const worker = &cpi->workers[i]; + winterface->sync(worker); + } +} + +void vp9_encode_tiles_mt(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + const int num_workers = VPXMIN(cpi->oxcf.max_threads, tile_cols); + int i; + + vp9_init_tile_data(cpi); + + create_enc_workers(cpi, num_workers); + + for (i = 0; i < num_workers; i++) { + EncWorkerData *thread_data; + thread_data = &cpi->tile_thr_data[i]; // Before encoding a frame, copy the thread data from cpi. 
if (thread_data->td != &cpi->td) { @@ -165,34 +201,266 @@ void vp9_encode_tiles_mt(VP9_COMP *cpi) { } } - // Encode a frame + launch_enc_workers(cpi, (VPxWorkerHook)enc_worker_hook, NULL, num_workers); + for (i = 0; i < num_workers; i++) { VPxWorker *const worker = &cpi->workers[i]; EncWorkerData *const thread_data = (EncWorkerData *)worker->data1; - // Set the starting tile for each thread. - thread_data->start = i; + // Accumulate counters. + if (i < cpi->num_workers - 1) { + vp9_accumulate_frame_counts(&cm->counts, thread_data->td->counts, 0); + accumulate_rd_opt(&cpi->td, thread_data->td); + } + } +} - if (i == cpi->num_workers - 1) - winterface->execute(worker); - else - winterface->launch(worker); +static void accumulate_fp_tile_stat(TileDataEnc *tile_data, + TileDataEnc *tile_data_t) { + tile_data->fp_data.intra_factor += tile_data_t->fp_data.intra_factor; + tile_data->fp_data.brightness_factor += + tile_data_t->fp_data.brightness_factor; + tile_data->fp_data.coded_error += tile_data_t->fp_data.coded_error; + tile_data->fp_data.sr_coded_error += tile_data_t->fp_data.sr_coded_error; + tile_data->fp_data.frame_noise_energy += + tile_data_t->fp_data.frame_noise_energy; + tile_data->fp_data.intra_error += tile_data_t->fp_data.intra_error; + tile_data->fp_data.intercount += tile_data_t->fp_data.intercount; + tile_data->fp_data.second_ref_count += tile_data_t->fp_data.second_ref_count; + tile_data->fp_data.neutral_count += tile_data_t->fp_data.neutral_count; + tile_data->fp_data.intra_skip_count += tile_data_t->fp_data.intra_skip_count; + tile_data->fp_data.mvcount += tile_data_t->fp_data.mvcount; + tile_data->fp_data.sum_mvr += tile_data_t->fp_data.sum_mvr; + tile_data->fp_data.sum_mvr_abs += tile_data_t->fp_data.sum_mvr_abs; + tile_data->fp_data.sum_mvc += tile_data_t->fp_data.sum_mvc; + tile_data->fp_data.sum_mvc_abs += tile_data_t->fp_data.sum_mvc_abs; + tile_data->fp_data.sum_mvrs += tile_data_t->fp_data.sum_mvrs; + tile_data->fp_data.sum_mvcs += 
tile_data_t->fp_data.sum_mvcs; + tile_data->fp_data.sum_in_vectors += tile_data_t->fp_data.sum_in_vectors; + tile_data->fp_data.intra_smooth_count += + tile_data_t->fp_data.intra_smooth_count; + tile_data->fp_data.image_data_start_row = + VPXMIN(tile_data->fp_data.image_data_start_row, + tile_data_t->fp_data.image_data_start_row) == INVALID_ROW + ? VPXMAX(tile_data->fp_data.image_data_start_row, + tile_data_t->fp_data.image_data_start_row) + : VPXMIN(tile_data->fp_data.image_data_start_row, + tile_data_t->fp_data.image_data_start_row); +} + +// Allocate memory for row synchronization +void vp9_row_mt_sync_mem_alloc(VP9RowMTSync *row_mt_sync, VP9_COMMON *cm, + int rows) { + row_mt_sync->rows = rows; +#if CONFIG_MULTITHREAD + { + int i; + + CHECK_MEM_ERROR(cm, row_mt_sync->mutex_, + vpx_malloc(sizeof(*row_mt_sync->mutex_) * rows)); + if (row_mt_sync->mutex_) { + for (i = 0; i < rows; ++i) { + pthread_mutex_init(&row_mt_sync->mutex_[i], NULL); + } + } + + CHECK_MEM_ERROR(cm, row_mt_sync->cond_, + vpx_malloc(sizeof(*row_mt_sync->cond_) * rows)); + if (row_mt_sync->cond_) { + for (i = 0; i < rows; ++i) { + pthread_cond_init(&row_mt_sync->cond_[i], NULL); + } + } } +#endif // CONFIG_MULTITHREAD - // Encoding ends. - for (i = 0; i < num_workers; i++) { - VPxWorker *const worker = &cpi->workers[i]; - winterface->sync(worker); + CHECK_MEM_ERROR(cm, row_mt_sync->cur_col, + vpx_malloc(sizeof(*row_mt_sync->cur_col) * rows)); + + // Set up nsync. 
+ row_mt_sync->sync_range = 1; +} + +// Deallocate row based multi-threading synchronization related mutex and data +void vp9_row_mt_sync_mem_dealloc(VP9RowMTSync *row_mt_sync) { + if (row_mt_sync != NULL) { +#if CONFIG_MULTITHREAD + int i; + + if (row_mt_sync->mutex_ != NULL) { + for (i = 0; i < row_mt_sync->rows; ++i) { + pthread_mutex_destroy(&row_mt_sync->mutex_[i]); + } + vpx_free(row_mt_sync->mutex_); + } + if (row_mt_sync->cond_ != NULL) { + for (i = 0; i < row_mt_sync->rows; ++i) { + pthread_cond_destroy(&row_mt_sync->cond_[i]); + } + vpx_free(row_mt_sync->cond_); + } +#endif // CONFIG_MULTITHREAD + vpx_free(row_mt_sync->cur_col); + // clear the structure as the source of this call may be dynamic change + // in tiles in which case this call will be followed by an _alloc() + // which may fail. + vp9_zero(*row_mt_sync); } +} + +void vp9_row_mt_sync_read(VP9RowMTSync *const row_mt_sync, int r, int c) { +#if CONFIG_MULTITHREAD + const int nsync = row_mt_sync->sync_range; + + if (r && !(c & (nsync - 1))) { + pthread_mutex_t *const mutex = &row_mt_sync->mutex_[r - 1]; + pthread_mutex_lock(mutex); + + while (c > row_mt_sync->cur_col[r - 1] - nsync) { + pthread_cond_wait(&row_mt_sync->cond_[r - 1], mutex); + } + pthread_mutex_unlock(mutex); + } +#else + (void)row_mt_sync; + (void)r; + (void)c; +#endif // CONFIG_MULTITHREAD +} + +void vp9_row_mt_sync_read_dummy(VP9RowMTSync *const row_mt_sync, int r, int c) { + (void)row_mt_sync; + (void)r; + (void)c; + return; +} + +void vp9_row_mt_sync_write(VP9RowMTSync *const row_mt_sync, int r, int c, + const int cols) { +#if CONFIG_MULTITHREAD + const int nsync = row_mt_sync->sync_range; + int cur; + // Only signal when there are enough filtered SB for next row to run. 
+ int sig = 1; + + if (c < cols - 1) { + cur = c; + if (c % nsync) sig = 0; + } else { + cur = cols + nsync; + } + + if (sig) { + pthread_mutex_lock(&row_mt_sync->mutex_[r]); + + row_mt_sync->cur_col[r] = cur; + + pthread_cond_signal(&row_mt_sync->cond_[r]); + pthread_mutex_unlock(&row_mt_sync->mutex_[r]); + } +#else + (void)row_mt_sync; + (void)r; + (void)c; + (void)cols; +#endif // CONFIG_MULTITHREAD +} + +void vp9_row_mt_sync_write_dummy(VP9RowMTSync *const row_mt_sync, int r, int c, + const int cols) { + (void)row_mt_sync; + (void)r; + (void)c; + (void)cols; + return; +} + +static int first_pass_worker_hook(EncWorkerData *const thread_data, + MultiThreadHandle *multi_thread_ctxt) { + VP9_COMP *const cpi = thread_data->cpi; + const VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + int tile_row, tile_col; + TileDataEnc *this_tile; + int end_of_frame; + int thread_id = thread_data->thread_id; + int cur_tile_id = multi_thread_ctxt->thread_id_to_tile_id[thread_id]; + JobNode *proc_job = NULL; + FIRSTPASS_DATA fp_acc_data; + MV zero_mv = { 0, 0 }; + MV best_ref_mv; + int mb_row; + + end_of_frame = 0; + while (0 == end_of_frame) { + // Get the next job in the queue + proc_job = + (JobNode *)vp9_enc_grp_get_next_job(multi_thread_ctxt, cur_tile_id); + if (NULL == proc_job) { + // Query for the status of other tiles + end_of_frame = vp9_get_tiles_proc_status( + multi_thread_ctxt, thread_data->tile_completion_status, &cur_tile_id, + tile_cols); + } else { + tile_col = proc_job->tile_col_id; + tile_row = proc_job->tile_row_id; + + this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; + mb_row = proc_job->vert_unit_row_num; + + best_ref_mv = zero_mv; + vp9_zero(fp_acc_data); + fp_acc_data.image_data_start_row = INVALID_ROW; + vp9_first_pass_encode_tile_mb_row(cpi, thread_data->td, &fp_acc_data, + this_tile, &best_ref_mv, mb_row); + } + } + return 0; +} + +void vp9_encode_fp_row_mt(VP9_COMP *cpi) { + VP9_COMMON *const cm = 
&cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt; + TileDataEnc *first_tile_col; + int num_workers = VPXMAX(cpi->oxcf.max_threads, 1); + int i; + + if (multi_thread_ctxt->allocated_tile_cols < tile_cols || + multi_thread_ctxt->allocated_tile_rows < tile_rows || + multi_thread_ctxt->allocated_vert_unit_rows < cm->mb_rows) { + vp9_row_mt_mem_dealloc(cpi); + vp9_init_tile_data(cpi); + vp9_row_mt_mem_alloc(cpi); + } else { + vp9_init_tile_data(cpi); + } + + create_enc_workers(cpi, num_workers); + + vp9_assign_tile_to_thread(multi_thread_ctxt, tile_cols, cpi->num_workers); + + vp9_prepare_job_queue(cpi, FIRST_PASS_JOB); + + vp9_multi_thread_tile_init(cpi); for (i = 0; i < num_workers; i++) { - VPxWorker *const worker = &cpi->workers[i]; - EncWorkerData *const thread_data = (EncWorkerData *)worker->data1; + EncWorkerData *thread_data; + thread_data = &cpi->tile_thr_data[i]; - // Accumulate counters. - if (i < cpi->num_workers - 1) { - vp9_accumulate_frame_counts(&cm->counts, thread_data->td->counts, 0); - accumulate_rd_opt(&cpi->td, thread_data->td); + // Before encoding a frame, copy the thread data from cpi. 
+ if (thread_data->td != &cpi->td) { + thread_data->td->mb = cpi->td.mb; } } + + launch_enc_workers(cpi, (VPxWorkerHook)first_pass_worker_hook, + multi_thread_ctxt, num_workers); + + first_tile_col = &cpi->tile_data[0]; + for (i = 1; i < tile_cols; i++) { + TileDataEnc *this_tile = &cpi->tile_data[i]; + accumulate_fp_tile_stat(first_tile_col, this_tile); + } } diff --git a/vp9/encoder/vp9_ethread.h b/vp9/encoder/vp9_ethread.h index 1efa4dc..968e500 100644 --- a/vp9/encoder/vp9_ethread.h +++ b/vp9/encoder/vp9_ethread.h @@ -15,6 +15,10 @@ extern "C" { #endif +#define MAX_NUM_TILE_COLS (1 << 6) +#define MAX_NUM_TILE_ROWS 4 +#define MAX_NUM_THREADS 80 + struct VP9_COMP; struct ThreadData; @@ -22,10 +26,41 @@ typedef struct EncWorkerData { struct VP9_COMP *cpi; struct ThreadData *td; int start; + int thread_id; + int tile_completion_status[MAX_NUM_TILE_COLS]; } EncWorkerData; +// Encoder row synchronization +typedef struct VP9RowMTSyncData { +#if CONFIG_MULTITHREAD + pthread_mutex_t *mutex_; + pthread_cond_t *cond_; +#endif + // Allocate memory to store the sb/mb block index in each row. + int *cur_col; + int sync_range; + int rows; +} VP9RowMTSync; + void vp9_encode_tiles_mt(struct VP9_COMP *cpi); +void vp9_encode_fp_row_mt(struct VP9_COMP *cpi); + +void vp9_row_mt_sync_read(VP9RowMTSync *const row_mt_sync, int r, int c); +void vp9_row_mt_sync_write(VP9RowMTSync *const row_mt_sync, int r, int c, + const int cols); + +void vp9_row_mt_sync_read_dummy(VP9RowMTSync *const row_mt_sync, int r, int c); +void vp9_row_mt_sync_write_dummy(VP9RowMTSync *const row_mt_sync, int r, int c, + const int cols); + +// Allocate memory for row based multi-threading synchronization. +void vp9_row_mt_sync_mem_alloc(VP9RowMTSync *row_mt_sync, struct VP9Common *cm, + int rows); + +// Deallocate row based multi-threading synchronization related mutex and data. 
+void vp9_row_mt_sync_mem_dealloc(VP9RowMTSync *row_mt_sync); + #ifdef __cplusplus } // extern "C" #endif diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 72e9ac7..59dd536 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -31,6 +31,7 @@ #include "vp9/encoder/vp9_encodemb.h" #include "vp9/encoder/vp9_encodemv.h" #include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_ethread.h" #include "vp9/encoder/vp9_extend.h" #include "vp9/encoder/vp9_firstpass.h" #include "vp9/encoder/vp9_mcomp.h" @@ -646,37 +647,150 @@ static int fp_estimate_block_noise(MACROBLOCK *x, BLOCK_SIZE bsize) { return block_noise << 2; // Scale << 2 to account for sampling. } -#define INVALID_ROW -1 -void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { +#if ENABLE_MT_BIT_MATCH +static void accumulate_floating_point_stats(VP9_COMP *cpi, + TileDataEnc *first_tile_col) { + VP9_COMMON *const cm = &cpi->common; int mb_row, mb_col; - MACROBLOCK *const x = &cpi->td.mb; + first_tile_col->fp_data.intra_factor = 0; + first_tile_col->fp_data.brightness_factor = 0; + first_tile_col->fp_data.neutral_count = 0; + for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) { + for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) { + const int mb_index = mb_row * cm->mb_cols + mb_col; + first_tile_col->fp_data.intra_factor += + cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_intra_factor; + first_tile_col->fp_data.brightness_factor += + cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_brightness_factor; + first_tile_col->fp_data.neutral_count += + cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_neutral_count; + } + } +} +#endif + +static void first_pass_stat_calc(VP9_COMP *cpi, FIRSTPASS_STATS *fps, + FIRSTPASS_DATA *fp_acc_data) { + VP9_COMMON *const cm = &cpi->common; + // The minimum error here insures some bit allocation to frames even + // in static regions. 
The allocation per MB declines for larger formats + // where the typical "real" energy per MB also falls. + // Initial estimate here uses sqrt(mbs) to define the min_err, where the + // number of mbs is proportional to the image area. + const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs + : cpi->common.MBs; + const double min_err = 200 * sqrt(num_mbs); + + // Clamp the image start to rows/2. This number of rows is discarded top + // and bottom as dead data so rows / 2 means the frame is blank. + if ((fp_acc_data->image_data_start_row > cm->mb_rows / 2) || + (fp_acc_data->image_data_start_row == INVALID_ROW)) { + fp_acc_data->image_data_start_row = cm->mb_rows / 2; + } + // Exclude any image dead zone + if (fp_acc_data->image_data_start_row > 0) { + fp_acc_data->intra_skip_count = + VPXMAX(0, fp_acc_data->intra_skip_count - + (fp_acc_data->image_data_start_row * cm->mb_cols * 2)); + } + + fp_acc_data->intra_factor = fp_acc_data->intra_factor / (double)num_mbs; + fp_acc_data->brightness_factor = + fp_acc_data->brightness_factor / (double)num_mbs; + fps->weight = fp_acc_data->intra_factor * fp_acc_data->brightness_factor; + + fps->frame = cm->current_video_frame; + fps->spatial_layer_id = cpi->svc.spatial_layer_id; + fps->coded_error = (double)(fp_acc_data->coded_error >> 8) + min_err; + fps->sr_coded_error = (double)(fp_acc_data->sr_coded_error >> 8) + min_err; + fps->intra_error = (double)(fp_acc_data->intra_error >> 8) + min_err; + fps->frame_noise_energy = + (double)(fp_acc_data->frame_noise_energy) / (double)num_mbs; + fps->count = 1.0; + fps->pcnt_inter = (double)(fp_acc_data->intercount) / num_mbs; + fps->pcnt_second_ref = (double)(fp_acc_data->second_ref_count) / num_mbs; + fps->pcnt_neutral = (double)(fp_acc_data->neutral_count) / num_mbs; + fps->intra_skip_pct = (double)(fp_acc_data->intra_skip_count) / num_mbs; + fps->intra_smooth_pct = (double)(fp_acc_data->intra_smooth_count) / num_mbs; + fps->inactive_zone_rows = 
(double)(fp_acc_data->image_data_start_row); + // Currently set to 0 as most issues relate to letter boxing. + fps->inactive_zone_cols = (double)0; + + if (fp_acc_data->mvcount > 0) { + fps->MVr = (double)(fp_acc_data->sum_mvr) / fp_acc_data->mvcount; + fps->mvr_abs = (double)(fp_acc_data->sum_mvr_abs) / fp_acc_data->mvcount; + fps->MVc = (double)(fp_acc_data->sum_mvc) / fp_acc_data->mvcount; + fps->mvc_abs = (double)(fp_acc_data->sum_mvc_abs) / fp_acc_data->mvcount; + fps->MVrv = ((double)(fp_acc_data->sum_mvrs) - + ((double)(fp_acc_data->sum_mvr) * (fp_acc_data->sum_mvr) / + fp_acc_data->mvcount)) / + fp_acc_data->mvcount; + fps->MVcv = ((double)(fp_acc_data->sum_mvcs) - + ((double)(fp_acc_data->sum_mvc) * (fp_acc_data->sum_mvc) / + fp_acc_data->mvcount)) / + fp_acc_data->mvcount; + fps->mv_in_out_count = + (double)(fp_acc_data->sum_in_vectors) / (fp_acc_data->mvcount * 2); + fps->pcnt_motion = (double)(fp_acc_data->mvcount) / num_mbs; + } else { + fps->MVr = 0.0; + fps->mvr_abs = 0.0; + fps->MVc = 0.0; + fps->mvc_abs = 0.0; + fps->MVrv = 0.0; + fps->MVcv = 0.0; + fps->mv_in_out_count = 0.0; + fps->pcnt_motion = 0.0; + } +} + +static void accumulate_fp_mb_row_stat(TileDataEnc *this_tile, + FIRSTPASS_DATA *fp_acc_data) { + this_tile->fp_data.intra_factor += fp_acc_data->intra_factor; + this_tile->fp_data.brightness_factor += fp_acc_data->brightness_factor; + this_tile->fp_data.coded_error += fp_acc_data->coded_error; + this_tile->fp_data.sr_coded_error += fp_acc_data->sr_coded_error; + this_tile->fp_data.frame_noise_energy += fp_acc_data->frame_noise_energy; + this_tile->fp_data.intra_error += fp_acc_data->intra_error; + this_tile->fp_data.intercount += fp_acc_data->intercount; + this_tile->fp_data.second_ref_count += fp_acc_data->second_ref_count; + this_tile->fp_data.neutral_count += fp_acc_data->neutral_count; + this_tile->fp_data.intra_skip_count += fp_acc_data->intra_skip_count; + this_tile->fp_data.mvcount += fp_acc_data->mvcount; + 
this_tile->fp_data.sum_mvr += fp_acc_data->sum_mvr; + this_tile->fp_data.sum_mvr_abs += fp_acc_data->sum_mvr_abs; + this_tile->fp_data.sum_mvc += fp_acc_data->sum_mvc; + this_tile->fp_data.sum_mvc_abs += fp_acc_data->sum_mvc_abs; + this_tile->fp_data.sum_mvrs += fp_acc_data->sum_mvrs; + this_tile->fp_data.sum_mvcs += fp_acc_data->sum_mvcs; + this_tile->fp_data.sum_in_vectors += fp_acc_data->sum_in_vectors; + this_tile->fp_data.intra_smooth_count += fp_acc_data->intra_smooth_count; + this_tile->fp_data.image_data_start_row = + VPXMIN(this_tile->fp_data.image_data_start_row, + fp_acc_data->image_data_start_row) == INVALID_ROW + ? VPXMAX(this_tile->fp_data.image_data_start_row, + fp_acc_data->image_data_start_row) + : VPXMIN(this_tile->fp_data.image_data_start_row, + fp_acc_data->image_data_start_row); +} + +void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, + FIRSTPASS_DATA *fp_acc_data, + TileDataEnc *tile_data, MV *best_ref_mv, + int mb_row) { + int mb_col; + MACROBLOCK *const x = &td->mb; VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; - TileInfo tile; + TileInfo tile = tile_data->tile_info; struct macroblock_plane *const p = x->plane; struct macroblockd_plane *const pd = xd->plane; - const PICK_MODE_CONTEXT *ctx = &cpi->td.pc_root->none; - int i; + const PICK_MODE_CONTEXT *ctx = &td->pc_root->none; + int i, c; + int num_mb_cols = get_num_cols(tile_data->tile_info, 1); int recon_yoffset, recon_uvoffset; - int64_t intra_error = 0; - int64_t coded_error = 0; - int64_t sr_coded_error = 0; - int64_t frame_noise_energy = 0; - - int sum_mvr = 0, sum_mvc = 0; - int sum_mvr_abs = 0, sum_mvc_abs = 0; - int64_t sum_mvrs = 0, sum_mvcs = 0; - int mvcount = 0; - int intercount = 0; - int second_ref_count = 0; const int intrapenalty = INTRA_MODE_PENALTY; - double neutral_count; - int intra_skip_count = 0; - int intra_smooth_count = 0; - int image_data_start_row = INVALID_ROW; - int sum_in_vectors = 0; - TWO_PASS *twopass = 
&cpi->twopass; const MV zero_mv = { 0, 0 }; int recon_y_stride, recon_uv_stride, uv_mb_height; @@ -688,50 +802,13 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { LAYER_CONTEXT *const lc = is_two_pass_svc(cpi) ? &cpi->svc.layer_context[cpi->svc.spatial_layer_id] : NULL; - double intra_factor; - double brightness_factor; - BufferPool *const pool = cm->buffer_pool; MODE_INFO mi_above, mi_left; // First pass code requires valid last and new frame buffers. assert(new_yv12 != NULL); assert((lc != NULL) || frame_is_intra_only(cm) || (lst_yv12 != NULL)); -#if CONFIG_FP_MB_STATS - if (cpi->use_fp_mb_stats) { - vp9_zero_array(cpi->twopass.frame_mb_stats_buf, cm->initial_mbs); - } -#endif - - vpx_clear_system_state(); - - intra_factor = 0.0; - brightness_factor = 0.0; - neutral_count = 0.0; - - set_first_pass_params(cpi); - vp9_set_quantizer(cm, find_fp_qindex(cm->bit_depth)); - if (lc != NULL) { - twopass = &lc->twopass; - - cpi->lst_fb_idx = cpi->svc.spatial_layer_id; - cpi->ref_frame_flags = VP9_LAST_FLAG; - - if (cpi->svc.number_spatial_layers + cpi->svc.spatial_layer_id < - REF_FRAMES) { - cpi->gld_fb_idx = - cpi->svc.number_spatial_layers + cpi->svc.spatial_layer_id; - cpi->ref_frame_flags |= VP9_GOLD_FLAG; - cpi->refresh_golden_frame = (lc->current_video_frame_in_layer == 0); - } else { - cpi->refresh_golden_frame = 0; - } - - if (lc->current_video_frame_in_layer == 0) cpi->ref_frame_flags = 0; - - vp9_scale_references(cpi); - // Use either last frame or alt frame for motion search. if (cpi->ref_frame_flags & VP9_LAST_FLAG) { first_ref_buf = vp9_get_scaled_ref_frame(cpi, LAST_FRAME); @@ -747,28 +824,11 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { } else { gld_yv12 = NULL; } - - set_ref_ptrs(cm, xd, - (cpi->ref_frame_flags & VP9_LAST_FLAG) ? LAST_FRAME : NONE, - (cpi->ref_frame_flags & VP9_GOLD_FLAG) ? 
GOLDEN_FRAME : NONE); - - cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source, - &cpi->scaled_source, 0); } - vp9_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y); - - vp9_setup_src_planes(x, cpi->Source, 0, 0); - vp9_setup_dst_planes(xd->plane, new_yv12, 0, 0); - - if (!frame_is_intra_only(cm)) { - vp9_setup_pre_planes(xd, 0, first_ref_buf, 0, 0, NULL); - } - - xd->mi = cm->mi_grid_visible; - xd->mi[0] = cm->mi; - - vp9_frame_init_quantizer(cpi); + xd->mi = cm->mi_grid_visible + xd->mi_stride * (mb_row << 1) + + (tile.mi_col_start >> 1); + xd->mi[0] = cm->mi + xd->mi_stride * (mb_row << 1) + (tile.mi_col_start >> 1); for (i = 0; i < MAX_MB_PLANE; ++i) { p[i].coeff = ctx->coeff_pbuf[i][1]; @@ -776,276 +836,363 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1]; p[i].eobs = ctx->eobs_pbuf[i][1]; } - x->skip_recode = 0; - - vp9_init_mv_probs(cm); - vp9_initialize_rd_consts(cpi); - - // Tiling is ignored in the first pass. - vp9_tile_init(&tile, cm, 0, 0); recon_y_stride = new_yv12->y_stride; recon_uv_stride = new_yv12->uv_stride; uv_mb_height = 16 >> (new_yv12->y_height > new_yv12->uv_height); - for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) { - MV best_ref_mv = { 0, 0 }; + // Reset above block coeffs. + recon_yoffset = + (mb_row * recon_y_stride * 16) + (tile.mi_col_start >> 1) * 16; + recon_uvoffset = (mb_row * recon_uv_stride * uv_mb_height) + + (tile.mi_col_start >> 1) * uv_mb_height; + + // Set up limit values for motion vectors to prevent them extending + // outside the UMV borders. 
+ x->mv_limits.row_min = -((mb_row * 16) + BORDER_MV_PIXELS_B16); + x->mv_limits.row_max = + ((cm->mb_rows - 1 - mb_row) * 16) + BORDER_MV_PIXELS_B16; + + for (mb_col = tile.mi_col_start >> 1, c = 0; mb_col < (tile.mi_col_end >> 1); + ++mb_col, c++) { + int this_error; + int this_intra_error; + const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row); + const BLOCK_SIZE bsize = get_bsize(cm, mb_row, mb_col); + double log_intra; + int level_sample; +#if ENABLE_MT_BIT_MATCH + const int mb_index = mb_row * cm->mb_cols + mb_col; +#endif - // Reset above block coeffs. - recon_yoffset = (mb_row * recon_y_stride * 16); - recon_uvoffset = (mb_row * recon_uv_stride * uv_mb_height); +#if CONFIG_FP_MB_STATS + const int mb_index = mb_row * cm->mb_cols + mb_col; +#endif - // Set up limit values for motion vectors to prevent them extending - // outside the UMV borders. - x->mv_limits.row_min = -((mb_row * 16) + BORDER_MV_PIXELS_B16); - x->mv_limits.row_max = - ((cm->mb_rows - 1 - mb_row) * 16) + BORDER_MV_PIXELS_B16; + (*(cpi->row_mt_sync_read_ptr))(&tile_data->row_mt_sync, mb_row, c - 1); - for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) { - int this_error; - int this_intra_error; - const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row); - const BLOCK_SIZE bsize = get_bsize(cm, mb_row, mb_col); - double log_intra; - int level_sample; + // Adjust to the next column of MBs. 
+ x->plane[0].src.buf = cpi->Source->y_buffer + + mb_row * 16 * x->plane[0].src.stride + mb_col * 16; + x->plane[1].src.buf = cpi->Source->u_buffer + + mb_row * uv_mb_height * x->plane[1].src.stride + + mb_col * uv_mb_height; + x->plane[2].src.buf = cpi->Source->v_buffer + + mb_row * uv_mb_height * x->plane[1].src.stride + + mb_col * uv_mb_height; -#if CONFIG_FP_MB_STATS - const int mb_index = mb_row * cm->mb_cols + mb_col; -#endif + vpx_clear_system_state(); - vpx_clear_system_state(); - - xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset; - xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset; - xd->plane[2].dst.buf = new_yv12->v_buffer + recon_uvoffset; - xd->mi[0]->sb_type = bsize; - xd->mi[0]->ref_frame[0] = INTRA_FRAME; - set_mi_row_col(xd, &tile, mb_row << 1, num_8x8_blocks_high_lookup[bsize], - mb_col << 1, num_8x8_blocks_wide_lookup[bsize], - cm->mi_rows, cm->mi_cols); - // Are edges available for intra prediction? - // Since the firstpass does not populate the mi_grid_visible, - // above_mi/left_mi must be overwritten with a nonzero value when edges - // are available. Required by vp9_predict_intra_block(). - xd->above_mi = (mb_row != 0) ? &mi_above : NULL; - xd->left_mi = (mb_col > tile.mi_col_start) ? &mi_left : NULL; - - // Do intra 16x16 prediction. - x->skip_encode = 0; - xd->mi[0]->mode = DC_PRED; - xd->mi[0]->tx_size = - use_dc_pred ? (bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4; - - // Set the 16x16 src_diff block to zero, which ensures correct this_error - // calculation for block sizes smaller than 16x16. - vp9_zero_array(x->plane[0].src_diff, 256); - vp9_encode_intra_block_plane(x, bsize, 0, 0); - this_error = vpx_get_mb_ss(x->plane[0].src_diff); - this_intra_error = this_error; - - // Keep a record of blocks that have very low intra error residual - // (i.e. are in effect completely flat and untextured in the intra - // domain). 
In natural videos this is uncommon, but it is much more - // common in animations, graphics and screen content, so may be used - // as a signal to detect these types of content. - if (this_error < get_ul_intra_threshold(cm)) { - ++intra_skip_count; - } else if ((mb_col > 0) && (image_data_start_row == INVALID_ROW)) { - image_data_start_row = mb_row; - } + xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset; + xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset; + xd->plane[2].dst.buf = new_yv12->v_buffer + recon_uvoffset; + xd->mi[0]->sb_type = bsize; + xd->mi[0]->ref_frame[0] = INTRA_FRAME; + set_mi_row_col(xd, &tile, mb_row << 1, num_8x8_blocks_high_lookup[bsize], + mb_col << 1, num_8x8_blocks_wide_lookup[bsize], cm->mi_rows, + cm->mi_cols); + // Are edges available for intra prediction? + // Since the firstpass does not populate the mi_grid_visible, + // above_mi/left_mi must be overwritten with a nonzero value when edges + // are available. Required by vp9_predict_intra_block(). + xd->above_mi = (mb_row != 0) ? &mi_above : NULL; + xd->left_mi = ((mb_col << 1) > tile.mi_col_start) ? &mi_left : NULL; + + // Do intra 16x16 prediction. + x->skip_encode = 0; + x->fp_src_pred = 0; + // Do intra prediction based on source pixels for tile boundaries + if ((mb_col == (tile.mi_col_start >> 1)) && mb_col != 0) { + xd->left_mi = &mi_left; + x->fp_src_pred = 1; + } + xd->mi[0]->mode = DC_PRED; + xd->mi[0]->tx_size = + use_dc_pred ? (bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4; + // Fix - zero the 16x16 block first. This ensures correct this_error for + // block sizes smaller than 16x16. + vp9_zero_array(x->plane[0].src_diff, 256); + vp9_encode_intra_block_plane(x, bsize, 0, 0); + this_error = vpx_get_mb_ss(x->plane[0].src_diff); + this_intra_error = this_error; + + // Keep a record of blocks that have very low intra error residual + // (i.e. are in effect completely flat and untextured in the intra + // domain). 
In natural videos this is uncommon, but it is much more + // common in animations, graphics and screen content, so may be used + // as a signal to detect these types of content. + if (this_error < get_ul_intra_threshold(cm)) { + ++(fp_acc_data->intra_skip_count); + } else if ((mb_col > 0) && + (fp_acc_data->image_data_start_row == INVALID_ROW)) { + fp_acc_data->image_data_start_row = mb_row; + } - // Blocks that are mainly smooth in the intra domain. - // Some special accounting for CQ but also these are better for testing - // noise levels. - if (this_error < get_smooth_intra_threshold(cm)) { - ++intra_smooth_count; - } + // Blocks that are mainly smooth in the intra domain. + // Some special accounting for CQ but also these are better for testing + // noise levels. + if (this_error < get_smooth_intra_threshold(cm)) { + ++(fp_acc_data->intra_smooth_count); + } - // Special case noise measurement for first frame. - if (cm->current_video_frame == 0) { - if (this_intra_error < scale_sse_threshold(cm, LOW_I_THRESH)) { - frame_noise_energy += fp_estimate_block_noise(x, bsize); - } else { - frame_noise_energy += (int64_t)SECTION_NOISE_DEF; - } + // Special case noise measurement for first frame. 
+ if (cm->current_video_frame == 0) { + if (this_intra_error < scale_sse_threshold(cm, LOW_I_THRESH)) { + fp_acc_data->frame_noise_energy += fp_estimate_block_noise(x, bsize); + } else { + fp_acc_data->frame_noise_energy += (int64_t)SECTION_NOISE_DEF; } + } #if CONFIG_VP9_HIGHBITDEPTH - if (cm->use_highbitdepth) { - switch (cm->bit_depth) { - case VPX_BITS_8: break; - case VPX_BITS_10: this_error >>= 4; break; - case VPX_BITS_12: this_error >>= 8; break; - default: - assert(0 && - "cm->bit_depth should be VPX_BITS_8, " - "VPX_BITS_10 or VPX_BITS_12"); - return; - } + if (cm->use_highbitdepth) { + switch (cm->bit_depth) { + case VPX_BITS_8: break; + case VPX_BITS_10: this_error >>= 4; break; + case VPX_BITS_12: this_error >>= 8; break; + default: + assert(0 && + "cm->bit_depth should be VPX_BITS_8, " + "VPX_BITS_10 or VPX_BITS_12"); + return; } + } #endif // CONFIG_VP9_HIGHBITDEPTH - vpx_clear_system_state(); - log_intra = log(this_error + 1.0); - if (log_intra < 10.0) - intra_factor += 1.0 + ((10.0 - log_intra) * 0.05); - else - intra_factor += 1.0; + vpx_clear_system_state(); + log_intra = log(this_error + 1.0); + if (log_intra < 10.0) { + fp_acc_data->intra_factor += 1.0 + ((10.0 - log_intra) * 0.05); +#if ENABLE_MT_BIT_MATCH + cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_intra_factor = + 1.0 + ((10.0 - log_intra) * 0.05); +#endif + } else { + fp_acc_data->intra_factor += 1.0; +#if ENABLE_MT_BIT_MATCH + cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_intra_factor = 1.0; +#endif + } #if CONFIG_VP9_HIGHBITDEPTH - if (cm->use_highbitdepth) - level_sample = CONVERT_TO_SHORTPTR(x->plane[0].src.buf)[0]; - else - level_sample = x->plane[0].src.buf[0]; -#else + if (cm->use_highbitdepth) + level_sample = CONVERT_TO_SHORTPTR(x->plane[0].src.buf)[0]; + else level_sample = x->plane[0].src.buf[0]; +#else + level_sample = x->plane[0].src.buf[0]; #endif - if ((level_sample < DARK_THRESH) && (log_intra < 9.0)) - brightness_factor += 1.0 + (0.01 * (DARK_THRESH - 
level_sample)); - else - brightness_factor += 1.0; + if ((level_sample < DARK_THRESH) && (log_intra < 9.0)) { + fp_acc_data->brightness_factor += + 1.0 + (0.01 * (DARK_THRESH - level_sample)); +#if ENABLE_MT_BIT_MATCH + cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_brightness_factor = + 1.0 + (0.01 * (DARK_THRESH - level_sample)); +#endif + } else { + fp_acc_data->brightness_factor += 1.0; +#if ENABLE_MT_BIT_MATCH + cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_brightness_factor = 1.0; +#endif + } - // Intrapenalty below deals with situations where the intra and inter - // error scores are very low (e.g. a plain black frame). - // We do not have special cases in first pass for 0,0 and nearest etc so - // all inter modes carry an overhead cost estimate for the mv. - // When the error score is very low this causes us to pick all or lots of - // INTRA modes and throw lots of key frames. - // This penalty adds a cost matching that of a 0,0 mv to the intra case. - this_error += intrapenalty; + // Intrapenalty below deals with situations where the intra and inter + // error scores are very low (e.g. a plain black frame). + // We do not have special cases in first pass for 0,0 and nearest etc so + // all inter modes carry an overhead cost estimate for the mv. + // When the error score is very low this causes us to pick all or lots of + // INTRA modes and throw lots of key frames. + // This penalty adds a cost matching that of a 0,0 mv to the intra case. + this_error += intrapenalty; - // Accumulate the intra error. - intra_error += (int64_t)this_error; + // Accumulate the intra error. + fp_acc_data->intra_error += (int64_t)this_error; #if CONFIG_FP_MB_STATS - if (cpi->use_fp_mb_stats) { - // initialization - cpi->twopass.frame_mb_stats_buf[mb_index] = 0; - } + if (cpi->use_fp_mb_stats) { + // initialization + cpi->twopass.frame_mb_stats_buf[mb_index] = 0; + } #endif - // Set up limit values for motion vectors to prevent them extending - // outside the UMV borders. 
- x->mv_limits.col_min = -((mb_col * 16) + BORDER_MV_PIXELS_B16); - x->mv_limits.col_max = - ((cm->mb_cols - 1 - mb_col) * 16) + BORDER_MV_PIXELS_B16; - - // Other than for the first frame do a motion search. - if ((lc == NULL && cm->current_video_frame > 0) || - (lc != NULL && lc->current_video_frame_in_layer > 0)) { - int tmp_err, motion_error, raw_motion_error; - // Assume 0,0 motion with no mv overhead. - MV mv = { 0, 0 }, tmp_mv = { 0, 0 }; - struct buf_2d unscaled_last_source_buf_2d; - - xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset; + // Set up limit values for motion vectors to prevent them extending + // outside the UMV borders. + x->mv_limits.col_min = -((mb_col * 16) + BORDER_MV_PIXELS_B16); + x->mv_limits.col_max = + ((cm->mb_cols - 1 - mb_col) * 16) + BORDER_MV_PIXELS_B16; + + // Other than for the first frame do a motion search. + if ((lc == NULL && cm->current_video_frame > 0) || + (lc != NULL && lc->current_video_frame_in_layer > 0)) { + int tmp_err, motion_error, raw_motion_error; + // Assume 0,0 motion with no mv overhead. 
+ MV mv = { 0, 0 }, tmp_mv = { 0, 0 }; + struct buf_2d unscaled_last_source_buf_2d; + + xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset; #if CONFIG_VP9_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - motion_error = highbd_get_prediction_error( - bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd); - } else { - motion_error = get_prediction_error(bsize, &x->plane[0].src, - &xd->plane[0].pre[0]); - } -#else + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + motion_error = highbd_get_prediction_error( + bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd); + } else { motion_error = get_prediction_error(bsize, &x->plane[0].src, &xd->plane[0].pre[0]); + } +#else + motion_error = + get_prediction_error(bsize, &x->plane[0].src, &xd->plane[0].pre[0]); #endif // CONFIG_VP9_HIGHBITDEPTH - // Compute the motion error of the 0,0 motion using the last source - // frame as the reference. Skip the further motion search on - // reconstructed frame if this error is small. - unscaled_last_source_buf_2d.buf = - cpi->unscaled_last_source->y_buffer + recon_yoffset; - unscaled_last_source_buf_2d.stride = - cpi->unscaled_last_source->y_stride; + // Compute the motion error of the 0,0 motion using the last source + // frame as the reference. Skip the further motion search on + // reconstructed frame if this error is small. 
+ unscaled_last_source_buf_2d.buf = + cpi->unscaled_last_source->y_buffer + recon_yoffset; + unscaled_last_source_buf_2d.stride = cpi->unscaled_last_source->y_stride; #if CONFIG_VP9_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - raw_motion_error = highbd_get_prediction_error( - bsize, &x->plane[0].src, &unscaled_last_source_buf_2d, xd->bd); - } else { - raw_motion_error = get_prediction_error(bsize, &x->plane[0].src, - &unscaled_last_source_buf_2d); - } -#else + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + raw_motion_error = highbd_get_prediction_error( + bsize, &x->plane[0].src, &unscaled_last_source_buf_2d, xd->bd); + } else { raw_motion_error = get_prediction_error(bsize, &x->plane[0].src, &unscaled_last_source_buf_2d); + } +#else + raw_motion_error = get_prediction_error(bsize, &x->plane[0].src, + &unscaled_last_source_buf_2d); #endif // CONFIG_VP9_HIGHBITDEPTH - // TODO(pengchong): Replace the hard-coded threshold - if (raw_motion_error > 25 || lc != NULL) { - // Test last reference frame using the previous best mv as the - // starting point (best reference) for the search. - first_pass_motion_search(cpi, x, &best_ref_mv, &mv, &motion_error); - - // If the current best reference mv is not centered on 0,0 then do a - // 0,0 based search as well. - if (!is_zero_mv(&best_ref_mv)) { - tmp_err = INT_MAX; - first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &tmp_err); - - if (tmp_err < motion_error) { - motion_error = tmp_err; - mv = tmp_mv; - } + // TODO(pengchong): Replace the hard-coded threshold + if (raw_motion_error > 25 || lc != NULL) { + // Test last reference frame using the previous best mv as the + // starting point (best reference) for the search. + first_pass_motion_search(cpi, x, best_ref_mv, &mv, &motion_error); + + // If the current best reference mv is not centered on 0,0 then do a + // 0,0 based search as well. 
+ if (!is_zero_mv(best_ref_mv)) { + tmp_err = INT_MAX; + first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &tmp_err); + + if (tmp_err < motion_error) { + motion_error = tmp_err; + mv = tmp_mv; } + } - // Search in an older reference frame. - if (((lc == NULL && cm->current_video_frame > 1) || - (lc != NULL && lc->current_video_frame_in_layer > 1)) && - gld_yv12 != NULL) { - // Assume 0,0 motion with no mv overhead. - int gf_motion_error; + // Search in an older reference frame. + if (((lc == NULL && cm->current_video_frame > 1) || + (lc != NULL && lc->current_video_frame_in_layer > 1)) && + gld_yv12 != NULL) { + // Assume 0,0 motion with no mv overhead. + int gf_motion_error; - xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset; + xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset; #if CONFIG_VP9_HIGHBITDEPTH - if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - gf_motion_error = highbd_get_prediction_error( - bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd); - } else { - gf_motion_error = get_prediction_error(bsize, &x->plane[0].src, - &xd->plane[0].pre[0]); - } -#else + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + gf_motion_error = highbd_get_prediction_error( + bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd); + } else { gf_motion_error = get_prediction_error(bsize, &x->plane[0].src, &xd->plane[0].pre[0]); + } +#else + gf_motion_error = get_prediction_error(bsize, &x->plane[0].src, + &xd->plane[0].pre[0]); #endif // CONFIG_VP9_HIGHBITDEPTH - first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, - &gf_motion_error); - - if (gf_motion_error < motion_error && gf_motion_error < this_error) - ++second_ref_count; - - // Reset to last frame as reference buffer. 
- xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset; - xd->plane[1].pre[0].buf = first_ref_buf->u_buffer + recon_uvoffset; - xd->plane[2].pre[0].buf = first_ref_buf->v_buffer + recon_uvoffset; - - // In accumulating a score for the older reference frame take the - // best of the motion predicted score and the intra coded error - // (just as will be done for) accumulation of "coded_error" for - // the last frame. - if (gf_motion_error < this_error) - sr_coded_error += gf_motion_error; - else - sr_coded_error += this_error; - } else { - sr_coded_error += motion_error; - } + first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &gf_motion_error); + + if (gf_motion_error < motion_error && gf_motion_error < this_error) + ++(fp_acc_data->second_ref_count); + + // Reset to last frame as reference buffer. + xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset; + xd->plane[1].pre[0].buf = first_ref_buf->u_buffer + recon_uvoffset; + xd->plane[2].pre[0].buf = first_ref_buf->v_buffer + recon_uvoffset; + + // In accumulating a score for the older reference frame take the + // best of the motion predicted score and the intra coded error + // (just as will be done for) accumulation of "coded_error" for + // the last frame. + if (gf_motion_error < this_error) + fp_acc_data->sr_coded_error += gf_motion_error; + else + fp_acc_data->sr_coded_error += this_error; } else { - sr_coded_error += motion_error; + fp_acc_data->sr_coded_error += motion_error; } + } else { + fp_acc_data->sr_coded_error += motion_error; + } - // Start by assuming that intra mode is best. - best_ref_mv.row = 0; - best_ref_mv.col = 0; + // Start by assuming that intra mode is best. 
+ best_ref_mv->row = 0; + best_ref_mv->col = 0; + +#if CONFIG_FP_MB_STATS + if (cpi->use_fp_mb_stats) { + // intra prediction statistics + cpi->twopass.frame_mb_stats_buf[mb_index] = 0; + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_DCINTRA_MASK; + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK; + if (this_error > FPMB_ERROR_LARGE_TH) { + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LARGE_MASK; + } else if (this_error < FPMB_ERROR_SMALL_TH) { + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_SMALL_MASK; + } + } +#endif + + if (motion_error <= this_error) { + vpx_clear_system_state(); + + // Keep a count of cases where the inter and intra were very close + // and very low. This helps with scene cut detection for example in + // cropped clips with black bars at the sides or top and bottom. + if (((this_error - intrapenalty) * 9 <= motion_error * 10) && + (this_error < (2 * intrapenalty))) { + fp_acc_data->neutral_count += 1.0; +#if ENABLE_MT_BIT_MATCH + cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_neutral_count = 1.0; +#endif + // Also track cases where the intra is not much worse than the inter + // and use this in limiting the GF/arf group length. 
+ } else if ((this_error > NCOUNT_INTRA_THRESH) && + (this_error < (NCOUNT_INTRA_FACTOR * motion_error))) { + fp_acc_data->neutral_count += + (double)motion_error / DOUBLE_DIVIDE_CHECK((double)this_error); +#if ENABLE_MT_BIT_MATCH + cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_neutral_count = + (double)motion_error / DOUBLE_DIVIDE_CHECK((double)this_error); +#endif + } + + mv.row *= 8; + mv.col *= 8; + this_error = motion_error; + xd->mi[0]->mode = NEWMV; + xd->mi[0]->mv[0].as_mv = mv; + xd->mi[0]->tx_size = TX_4X4; + xd->mi[0]->ref_frame[0] = LAST_FRAME; + xd->mi[0]->ref_frame[1] = NONE; + vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1, bsize); + vp9_encode_sby_pass1(x, bsize); + fp_acc_data->sum_mvr += mv.row; + fp_acc_data->sum_mvr_abs += abs(mv.row); + fp_acc_data->sum_mvc += mv.col; + fp_acc_data->sum_mvc_abs += abs(mv.col); + fp_acc_data->sum_mvrs += mv.row * mv.row; + fp_acc_data->sum_mvcs += mv.col * mv.col; + ++(fp_acc_data->intercount); + + *best_ref_mv = mv; #if CONFIG_FP_MB_STATS if (cpi->use_fp_mb_stats) { - // intra prediction statistics + // inter prediction statistics cpi->twopass.frame_mb_stats_buf[mb_index] = 0; - cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_DCINTRA_MASK; + cpi->twopass.frame_mb_stats_buf[mb_index] &= ~FPMB_DCINTRA_MASK; cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK; if (this_error > FPMB_ERROR_LARGE_TH) { cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LARGE_MASK; @@ -1055,214 +1202,229 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { } #endif - if (motion_error <= this_error) { - vpx_clear_system_state(); - - // Keep a count of cases where the inter and intra were very close - // and very low. This helps with scene cut detection for example in - // cropped clips with black bars at the sides or top and bottom. 
- if (((this_error - intrapenalty) * 9 <= motion_error * 10) && - (this_error < (2 * intrapenalty))) { - neutral_count += 1.0; - // Also track cases where the intra is not much worse than the inter - // and use this in limiting the GF/arf group length. - } else if ((this_error > NCOUNT_INTRA_THRESH) && - (this_error < (NCOUNT_INTRA_FACTOR * motion_error))) { - neutral_count += - (double)motion_error / DOUBLE_DIVIDE_CHECK((double)this_error); - } - - mv.row *= 8; - mv.col *= 8; - this_error = motion_error; - xd->mi[0]->mode = NEWMV; - xd->mi[0]->mv[0].as_mv = mv; - xd->mi[0]->tx_size = TX_4X4; - xd->mi[0]->ref_frame[0] = LAST_FRAME; - xd->mi[0]->ref_frame[1] = NONE; - vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1, bsize); - vp9_encode_sby_pass1(x, bsize); - sum_mvr += mv.row; - sum_mvr_abs += abs(mv.row); - sum_mvc += mv.col; - sum_mvc_abs += abs(mv.col); - sum_mvrs += mv.row * mv.row; - sum_mvcs += mv.col * mv.col; - ++intercount; - - best_ref_mv = mv; + if (!is_zero_mv(&mv)) { + ++(fp_acc_data->mvcount); #if CONFIG_FP_MB_STATS if (cpi->use_fp_mb_stats) { - // inter prediction statistics - cpi->twopass.frame_mb_stats_buf[mb_index] = 0; - cpi->twopass.frame_mb_stats_buf[mb_index] &= ~FPMB_DCINTRA_MASK; - cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK; - if (this_error > FPMB_ERROR_LARGE_TH) { + cpi->twopass.frame_mb_stats_buf[mb_index] &= ~FPMB_MOTION_ZERO_MASK; + // check estimated motion direction + if (mv.as_mv.col > 0 && mv.as_mv.col >= abs(mv.as_mv.row)) { + // right direction cpi->twopass.frame_mb_stats_buf[mb_index] |= - FPMB_ERROR_LARGE_MASK; - } else if (this_error < FPMB_ERROR_SMALL_TH) { + FPMB_MOTION_RIGHT_MASK; + } else if (mv.as_mv.row < 0 && + abs(mv.as_mv.row) >= abs(mv.as_mv.col)) { + // up direction + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_UP_MASK; + } else if (mv.as_mv.col < 0 && + abs(mv.as_mv.col) >= abs(mv.as_mv.row)) { + // left direction cpi->twopass.frame_mb_stats_buf[mb_index] |= - 
FPMB_ERROR_SMALL_MASK; + FPMB_MOTION_LEFT_MASK; + } else { + // down direction + cpi->twopass.frame_mb_stats_buf[mb_index] |= + FPMB_MOTION_DOWN_MASK; } } #endif - if (!is_zero_mv(&mv)) { - ++mvcount; + // Does the row vector point inwards or outwards? + if (mb_row < cm->mb_rows / 2) { + if (mv.row > 0) + --(fp_acc_data->sum_in_vectors); + else if (mv.row < 0) + ++(fp_acc_data->sum_in_vectors); + } else if (mb_row > cm->mb_rows / 2) { + if (mv.row > 0) + ++(fp_acc_data->sum_in_vectors); + else if (mv.row < 0) + --(fp_acc_data->sum_in_vectors); + } + + // Does the col vector point inwards or outwards? + if (mb_col < cm->mb_cols / 2) { + if (mv.col > 0) + --(fp_acc_data->sum_in_vectors); + else if (mv.col < 0) + ++(fp_acc_data->sum_in_vectors); + } else if (mb_col > cm->mb_cols / 2) { + if (mv.col > 0) + ++(fp_acc_data->sum_in_vectors); + else if (mv.col < 0) + --(fp_acc_data->sum_in_vectors); + } + fp_acc_data->frame_noise_energy += (int64_t)SECTION_NOISE_DEF; + } else if (this_intra_error < scale_sse_threshold(cm, LOW_I_THRESH)) { + fp_acc_data->frame_noise_energy += fp_estimate_block_noise(x, bsize); + } else { // 0,0 mv but high error + fp_acc_data->frame_noise_energy += (int64_t)SECTION_NOISE_DEF; + } + } else { // Intra < inter error + if (this_intra_error < scale_sse_threshold(cm, LOW_I_THRESH)) + fp_acc_data->frame_noise_energy += fp_estimate_block_noise(x, bsize); + else + fp_acc_data->frame_noise_energy += (int64_t)SECTION_NOISE_DEF; + } + } else { + fp_acc_data->sr_coded_error += (int64_t)this_error; + } + fp_acc_data->coded_error += (int64_t)this_error; + + recon_yoffset += 16; + recon_uvoffset += uv_mb_height; + + // Accumulate row level stats to the corresponding tile stats + if (cpi->new_mt && mb_col == (tile.mi_col_end >> 1) - 1) + accumulate_fp_mb_row_stat(tile_data, fp_acc_data); + + (*(cpi->row_mt_sync_write_ptr))(&tile_data->row_mt_sync, mb_row, c, + num_mb_cols); + } + vpx_clear_system_state(); +} + +static void first_pass_encode(VP9_COMP *cpi, 
FIRSTPASS_DATA *fp_acc_data) { + VP9_COMMON *const cm = &cpi->common; + int mb_row; + TileDataEnc tile_data; + TileInfo *tile = &tile_data.tile_info; + MV zero_mv = { 0, 0 }; + MV best_ref_mv; + // Tiling is ignored in the first pass. + vp9_tile_init(tile, cm, 0, 0); + + for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) { + best_ref_mv = zero_mv; + vp9_first_pass_encode_tile_mb_row(cpi, &cpi->td, fp_acc_data, &tile_data, + &best_ref_mv, mb_row); + } +} + +void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { + MACROBLOCK *const x = &cpi->td.mb; + VP9_COMMON *const cm = &cpi->common; + MACROBLOCKD *const xd = &x->e_mbd; + TWO_PASS *twopass = &cpi->twopass; + + YV12_BUFFER_CONFIG *const lst_yv12 = get_ref_frame_buffer(cpi, LAST_FRAME); + YV12_BUFFER_CONFIG *gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME); + YV12_BUFFER_CONFIG *const new_yv12 = get_frame_new_buffer(cm); + const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12; + + LAYER_CONTEXT *const lc = + is_two_pass_svc(cpi) ? &cpi->svc.layer_context[cpi->svc.spatial_layer_id] + : NULL; + BufferPool *const pool = cm->buffer_pool; + + FIRSTPASS_DATA fp_temp_data; + FIRSTPASS_DATA *fp_acc_data = &fp_temp_data; + + vpx_clear_system_state(); + vp9_zero(fp_temp_data); + fp_acc_data->image_data_start_row = INVALID_ROW; + + // First pass code requires valid last and new frame buffers. 
+ assert(new_yv12 != NULL); + assert((lc != NULL) || frame_is_intra_only(cm) || (lst_yv12 != NULL)); #if CONFIG_FP_MB_STATS - if (cpi->use_fp_mb_stats) { - cpi->twopass.frame_mb_stats_buf[mb_index] &= - ~FPMB_MOTION_ZERO_MASK; - // check estimated motion direction - if (mv.as_mv.col > 0 && mv.as_mv.col >= abs(mv.as_mv.row)) { - // right direction - cpi->twopass.frame_mb_stats_buf[mb_index] |= - FPMB_MOTION_RIGHT_MASK; - } else if (mv.as_mv.row < 0 && - abs(mv.as_mv.row) >= abs(mv.as_mv.col)) { - // up direction - cpi->twopass.frame_mb_stats_buf[mb_index] |= - FPMB_MOTION_UP_MASK; - } else if (mv.as_mv.col < 0 && - abs(mv.as_mv.col) >= abs(mv.as_mv.row)) { - // left direction - cpi->twopass.frame_mb_stats_buf[mb_index] |= - FPMB_MOTION_LEFT_MASK; - } else { - // down direction - cpi->twopass.frame_mb_stats_buf[mb_index] |= - FPMB_MOTION_DOWN_MASK; - } - } + if (cpi->use_fp_mb_stats) { + vp9_zero_array(cpi->twopass.frame_mb_stats_buf, cm->initial_mbs); + } #endif - // Does the row vector point inwards or outwards? - if (mb_row < cm->mb_rows / 2) { - if (mv.row > 0) - --sum_in_vectors; - else if (mv.row < 0) - ++sum_in_vectors; - } else if (mb_row > cm->mb_rows / 2) { - if (mv.row > 0) - ++sum_in_vectors; - else if (mv.row < 0) - --sum_in_vectors; - } + set_first_pass_params(cpi); + vp9_set_quantizer(cm, find_fp_qindex(cm->bit_depth)); - // Does the col vector point inwards or outwards? 
- if (mb_col < cm->mb_cols / 2) { - if (mv.col > 0) - --sum_in_vectors; - else if (mv.col < 0) - ++sum_in_vectors; - } else if (mb_col > cm->mb_cols / 2) { - if (mv.col > 0) - ++sum_in_vectors; - else if (mv.col < 0) - --sum_in_vectors; - } - frame_noise_energy += (int64_t)SECTION_NOISE_DEF; - } else if (this_intra_error < scale_sse_threshold(cm, LOW_I_THRESH)) { - frame_noise_energy += fp_estimate_block_noise(x, bsize); - } else { // 0,0 mv but high error - frame_noise_energy += (int64_t)SECTION_NOISE_DEF; - } - } else { // Intra < inter error - if (this_intra_error < scale_sse_threshold(cm, LOW_I_THRESH)) - frame_noise_energy += fp_estimate_block_noise(x, bsize); - else - frame_noise_energy += (int64_t)SECTION_NOISE_DEF; - } - } else { - sr_coded_error += (int64_t)this_error; - } - coded_error += (int64_t)this_error; + if (lc != NULL) { + twopass = &lc->twopass; - // Adjust to the next column of MBs. - x->plane[0].src.buf += 16; - x->plane[1].src.buf += uv_mb_height; - x->plane[2].src.buf += uv_mb_height; + cpi->lst_fb_idx = cpi->svc.spatial_layer_id; + cpi->ref_frame_flags = VP9_LAST_FLAG; - recon_yoffset += 16; - recon_uvoffset += uv_mb_height; + if (cpi->svc.number_spatial_layers + cpi->svc.spatial_layer_id < + REF_FRAMES) { + cpi->gld_fb_idx = + cpi->svc.number_spatial_layers + cpi->svc.spatial_layer_id; + cpi->ref_frame_flags |= VP9_GOLD_FLAG; + cpi->refresh_golden_frame = (lc->current_video_frame_in_layer == 0); + } else { + cpi->refresh_golden_frame = 0; } - // Adjust to the next row of MBs. - x->plane[0].src.buf += 16 * x->plane[0].src.stride - 16 * cm->mb_cols; - x->plane[1].src.buf += - uv_mb_height * x->plane[1].src.stride - uv_mb_height * cm->mb_cols; - x->plane[2].src.buf += - uv_mb_height * x->plane[1].src.stride - uv_mb_height * cm->mb_cols; + if (lc->current_video_frame_in_layer == 0) cpi->ref_frame_flags = 0; - vpx_clear_system_state(); - } + vp9_scale_references(cpi); - // Clamp the image start to rows/2. 
This number of rows is discarded top - // and bottom as dead data so rows / 2 means the frame is blank. - if ((image_data_start_row > cm->mb_rows / 2) || - (image_data_start_row == INVALID_ROW)) { - image_data_start_row = cm->mb_rows / 2; + // Use either last frame or alt frame for motion search. + if (cpi->ref_frame_flags & VP9_LAST_FLAG) { + first_ref_buf = vp9_get_scaled_ref_frame(cpi, LAST_FRAME); + if (first_ref_buf == NULL) + first_ref_buf = get_ref_frame_buffer(cpi, LAST_FRAME); + } + + if (cpi->ref_frame_flags & VP9_GOLD_FLAG) { + gld_yv12 = vp9_get_scaled_ref_frame(cpi, GOLDEN_FRAME); + if (gld_yv12 == NULL) { + gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME); + } + } else { + gld_yv12 = NULL; + } + + set_ref_ptrs(cm, xd, + (cpi->ref_frame_flags & VP9_LAST_FLAG) ? LAST_FRAME : NONE, + (cpi->ref_frame_flags & VP9_GOLD_FLAG) ? GOLDEN_FRAME : NONE); + + cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source, + &cpi->scaled_source, 0); } - // Exclude any image dead zone - if (image_data_start_row > 0) { - intra_skip_count = - VPXMAX(0, intra_skip_count - (image_data_start_row * cm->mb_cols * 2)); + + vp9_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y); + + vp9_setup_src_planes(x, cpi->Source, 0, 0); + vp9_setup_dst_planes(xd->plane, new_yv12, 0, 0); + + if (!frame_is_intra_only(cm)) { + vp9_setup_pre_planes(xd, 0, first_ref_buf, 0, 0, NULL); } + xd->mi = cm->mi_grid_visible; + xd->mi[0] = cm->mi; + + vp9_frame_init_quantizer(cpi); + + x->skip_recode = 0; + + vp9_init_mv_probs(cm); + vp9_initialize_rd_consts(cpi); + + cm->log2_tile_rows = 0; + { FIRSTPASS_STATS fps; - // The minimum error here insures some bit allocation to frames even - // in static regions. The allocation per MB declines for larger formats - // where the typical "real" energy per MB also falls. - // Initial estimate here uses sqrt(mbs) to define the min_err, where the - // number of mbs is proportional to the image area. 
- const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) - ? cpi->initial_mbs - : cpi->common.MBs; - const double min_err = 200 * sqrt(num_mbs); - - intra_factor = intra_factor / (double)num_mbs; - brightness_factor = brightness_factor / (double)num_mbs; - fps.weight = intra_factor * brightness_factor; - - fps.frame = cm->current_video_frame; - fps.spatial_layer_id = cpi->svc.spatial_layer_id; - fps.coded_error = (double)(coded_error >> 8) + min_err; - fps.sr_coded_error = (double)(sr_coded_error >> 8) + min_err; - fps.intra_error = (double)(intra_error >> 8) + min_err; - fps.frame_noise_energy = (double)frame_noise_energy / (double)num_mbs; - fps.count = 1.0; - fps.pcnt_inter = (double)intercount / num_mbs; - fps.pcnt_second_ref = (double)second_ref_count / num_mbs; - fps.pcnt_neutral = (double)neutral_count / num_mbs; - fps.intra_skip_pct = (double)intra_skip_count / num_mbs; - fps.intra_smooth_pct = (double)intra_smooth_count / num_mbs; - fps.inactive_zone_rows = (double)image_data_start_row; - // Currently set to 0 as most issues relate to letter boxing. 
- fps.inactive_zone_cols = (double)0; - - if (mvcount > 0) { - fps.MVr = (double)sum_mvr / mvcount; - fps.mvr_abs = (double)sum_mvr_abs / mvcount; - fps.MVc = (double)sum_mvc / mvcount; - fps.mvc_abs = (double)sum_mvc_abs / mvcount; - fps.MVrv = - ((double)sum_mvrs - ((double)sum_mvr * sum_mvr / mvcount)) / mvcount; - fps.MVcv = - ((double)sum_mvcs - ((double)sum_mvc * sum_mvc / mvcount)) / mvcount; - fps.mv_in_out_count = (double)sum_in_vectors / (mvcount * 2); - fps.pcnt_motion = (double)mvcount / num_mbs; + TileDataEnc *first_tile_col; + if (!cpi->new_mt) { + cm->log2_tile_cols = 0; + cpi->row_mt_sync_read_ptr = vp9_row_mt_sync_read_dummy; + cpi->row_mt_sync_write_ptr = vp9_row_mt_sync_write_dummy; + first_pass_encode(cpi, fp_acc_data); + first_pass_stat_calc(cpi, &fps, fp_acc_data); } else { - fps.MVr = 0.0; - fps.mvr_abs = 0.0; - fps.MVc = 0.0; - fps.mvc_abs = 0.0; - fps.MVrv = 0.0; - fps.MVcv = 0.0; - fps.mv_in_out_count = 0.0; - fps.pcnt_motion = 0.0; + cpi->row_mt_sync_read_ptr = vp9_row_mt_sync_read; + cpi->row_mt_sync_write_ptr = vp9_row_mt_sync_write; +#if ENABLE_MT_BIT_MATCH + cm->log2_tile_cols = 0; + vp9_zero_array(cpi->twopass.fp_mb_float_stats, cm->MBs); +#endif + vp9_encode_fp_row_mt(cpi); + first_tile_col = &cpi->tile_data[0]; +#if ENABLE_MT_BIT_MATCH + accumulate_floating_point_stats(cpi, first_tile_col); +#endif + first_pass_stat_calc(cpi, &fps, &(first_tile_col->fp_data)); } // Dont allow a value of 0 for duration. 
diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h index 5541893..ee6d5f3 100644 --- a/vp9/encoder/vp9_firstpass.h +++ b/vp9/encoder/vp9_firstpass.h @@ -39,6 +39,40 @@ typedef struct { } FIRSTPASS_MB_STATS; #endif +#define INVALID_ROW -1 + +#define ENABLE_MT_BIT_MATCH 0 +#if ENABLE_MT_BIT_MATCH +typedef struct { + double frame_mb_intra_factor; + double frame_mb_brightness_factor; + double frame_mb_neutral_count; +} FP_MB_FLOAT_STATS; +#endif + +typedef struct { + double intra_factor; + double brightness_factor; + int64_t coded_error; + int64_t sr_coded_error; + int64_t frame_noise_energy; + int64_t intra_error; + int intercount; + int second_ref_count; + double neutral_count; + int intra_skip_count; + int image_data_start_row; + int mvcount; + int sum_mvr; + int sum_mvr_abs; + int sum_mvc; + int sum_mvc_abs; + int64_t sum_mvrs; + int64_t sum_mvcs; + int sum_in_vectors; + int intra_smooth_count; +} FIRSTPASS_DATA; + typedef struct { double frame; double weight; @@ -114,6 +148,11 @@ typedef struct { uint8_t *this_frame_mb_stats; FIRSTPASS_MB_STATS firstpass_mb_stats; #endif + +#if ENABLE_MT_BIT_MATCH + FP_MB_FLOAT_STATS *fp_mb_float_stats; +#endif + // An indication of the content type of the current frame FRAME_CONTENT_TYPE fr_content_type; @@ -141,12 +180,20 @@ typedef struct { } TWO_PASS; struct VP9_COMP; +struct ThreadData; +struct TileDataEnc; void vp9_init_first_pass(struct VP9_COMP *cpi); void vp9_rc_get_first_pass_params(struct VP9_COMP *cpi); void vp9_first_pass(struct VP9_COMP *cpi, const struct lookahead_entry *source); void vp9_end_first_pass(struct VP9_COMP *cpi); +void vp9_first_pass_encode_tile_mb_row(struct VP9_COMP *cpi, + struct ThreadData *td, + FIRSTPASS_DATA *fp_acc_data, + struct TileDataEnc *tile_data, + MV *best_ref_mv, int mb_row); + void vp9_init_second_pass(struct VP9_COMP *cpi); void vp9_rc_get_second_pass_params(struct VP9_COMP *cpi); void vp9_twopass_postencode_update(struct VP9_COMP *cpi); diff --git 
a/vp9/encoder/vp9_job_queue.h b/vp9/encoder/vp9_job_queue.h new file mode 100644 index 0000000..89c08f2 --- /dev/null +++ b/vp9/encoder/vp9_job_queue.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_ENCODER_VP9_JOB_QUEUE_H_ +#define VP9_ENCODER_VP9_JOB_QUEUE_H_ + +typedef enum { + FIRST_PASS_JOB, + ENCODE_JOB, + ARNR_JOB, + NUM_JOB_TYPES, +} JOB_TYPE; + +// Encode job parameters +typedef struct { + int vert_unit_row_num; // Index of the vertical unit row + int tile_col_id; // tile col id within a tile + int tile_row_id; // tile row id within a tile +} JobNode; + +// Job queue element parameters +typedef struct { + // Pointer to the next link in the job queue + void *next; + + // Job information context of the module + JobNode job_info; +} JobQueue; + +// Job queue handle +typedef struct { + // Pointer to the next link in the job queue + void *next; + + // Counter to store the number of jobs picked up for processing + int num_jobs_acquired; +} JobQueueHandle; + +#endif // VP9_ENCODER_VP9_JOB_QUEUE_H_ diff --git a/vp9/encoder/vp9_multi_thread.c b/vp9/encoder/vp9_multi_thread.c new file mode 100644 index 0000000..23b0b42 --- /dev/null +++ b/vp9/encoder/vp9_multi_thread.c @@ -0,0 +1,282 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS.
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include + +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_ethread.h" +#include "vp9/encoder/vp9_multi_thread.h" + +void *vp9_enc_grp_get_next_job(MultiThreadHandle *multi_thread_ctxt, + int tile_id) { + RowMTInfo *row_mt_info; + JobQueueHandle *job_queue_hdl = NULL; + void *next = NULL; + JobNode *job_info = NULL; +#if CONFIG_MULTITHREAD + pthread_mutex_t *mutex_handle = NULL; +#endif + + row_mt_info = (RowMTInfo *)(&multi_thread_ctxt->row_mt_info[tile_id]); + job_queue_hdl = (JobQueueHandle *)&row_mt_info->job_queue_hdl; +#if CONFIG_MULTITHREAD + mutex_handle = &row_mt_info->job_mutex; +#endif + +// lock the mutex for queue access +#if CONFIG_MULTITHREAD + pthread_mutex_lock(mutex_handle); +#endif + next = job_queue_hdl->next; + if (NULL != next) { + JobQueue *job_queue = (JobQueue *)next; + job_info = &job_queue->job_info; + // Update the next job in the queue + job_queue_hdl->next = job_queue->next; + job_queue_hdl->num_jobs_acquired++; + } + +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(mutex_handle); +#endif + + return job_info; +} + +void vp9_row_mt_mem_alloc(VP9_COMP *cpi) { + struct VP9Common *cm = &cpi->common; + MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt; + int tile_row, tile_col; + const int tile_cols = 1 << cm->log2_tile_cols; + const int tile_rows = 1 << cm->log2_tile_rows; + const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; + int jobs_per_tile_col, total_jobs; + + jobs_per_tile_col = VPXMAX(cm->mb_rows, sb_rows); + // Calculate the total number of jobs + total_jobs = jobs_per_tile_col * tile_cols; + + multi_thread_ctxt->allocated_tile_cols = tile_cols; + multi_thread_ctxt->allocated_tile_rows = tile_rows; + multi_thread_ctxt->allocated_vert_unit_rows = jobs_per_tile_col; + + multi_thread_ctxt->job_queue = + (JobQueue *)vpx_memalign(32, total_jobs * sizeof(JobQueue)); + +#if 
CONFIG_MULTITHREAD + // Create mutex for each tile + for (tile_col = 0; tile_col < tile_cols; tile_col++) { + RowMTInfo *row_mt_info = &multi_thread_ctxt->row_mt_info[tile_col]; + pthread_mutex_init(&row_mt_info->job_mutex, NULL); + } +#endif + + // Allocate memory for row based multi-threading + for (tile_col = 0; tile_col < tile_cols; tile_col++) { + TileDataEnc *this_tile = &cpi->tile_data[tile_col]; + vp9_row_mt_sync_mem_alloc(&this_tile->row_mt_sync, cm, jobs_per_tile_col); + } + + // Assign the sync pointer of tile row zero for every tile row > 0 + for (tile_row = 1; tile_row < tile_rows; tile_row++) { + for (tile_col = 0; tile_col < tile_cols; tile_col++) { + TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col]; + TileDataEnc *this_col_tile = &cpi->tile_data[tile_col]; + this_tile->row_mt_sync = this_col_tile->row_mt_sync; + } + } + + // Calculate the number of vertical units in the given tile row + for (tile_row = 0; tile_row < tile_rows; tile_row++) { + TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols]; + TileInfo *tile_info = &this_tile->tile_info; + multi_thread_ctxt->num_tile_vert_sbs[tile_row] = + get_num_vert_units(*tile_info, MI_BLOCK_SIZE_LOG2); + } +} + +void vp9_row_mt_mem_dealloc(VP9_COMP *cpi) { + MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt; + int tile_col; + + // Deallocate memory for job queue + if (multi_thread_ctxt->job_queue) vpx_free(multi_thread_ctxt->job_queue); + +#if CONFIG_MULTITHREAD + // Destroy mutex for each tile + for (tile_col = 0; tile_col < multi_thread_ctxt->allocated_tile_cols; + tile_col++) { + RowMTInfo *row_mt_info = &multi_thread_ctxt->row_mt_info[tile_col]; + if (row_mt_info) pthread_mutex_destroy(&row_mt_info->job_mutex); + } +#endif + + // Free row based multi-threading sync memory + for (tile_col = 0; tile_col < multi_thread_ctxt->allocated_tile_cols; + tile_col++) { + TileDataEnc *this_tile = &cpi->tile_data[tile_col]; + 
vp9_row_mt_sync_mem_dealloc(&this_tile->row_mt_sync); + } +} + +void vp9_multi_thread_tile_init(VP9_COMP *cpi) { + VP9_COMMON *const cm = &cpi->common; + const int tile_cols = 1 << cm->log2_tile_cols; + const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; + int i; + + for (i = 0; i < tile_cols; i++) { + TileDataEnc *this_tile = &cpi->tile_data[i]; + int jobs_per_tile_col = cpi->oxcf.pass == 1 ? cm->mb_rows : sb_rows; + + // Initialize cur_col to -1 for all rows. + memset(this_tile->row_mt_sync.cur_col, -1, + sizeof(*this_tile->row_mt_sync.cur_col) * jobs_per_tile_col); + vp9_zero(this_tile->fp_data); + this_tile->fp_data.image_data_start_row = INVALID_ROW; + } +} + +void vp9_assign_tile_to_thread(MultiThreadHandle *multi_thread_ctxt, + int tile_cols, int num_workers) { + int tile_id = 0; + int i; + + // Allocating the threads for the tiles + for (i = 0; i < num_workers; i++) { + multi_thread_ctxt->thread_id_to_tile_id[i] = tile_id++; + if (tile_id == tile_cols) tile_id = 0; + } +} + +int vp9_get_job_queue_status(MultiThreadHandle *multi_thread_ctxt, + int cur_tile_id) { + RowMTInfo *row_mt_info; + JobQueueHandle *job_queue_hndl; +#if CONFIG_MULTITHREAD + pthread_mutex_t *mutex; +#endif + int num_jobs_remaining; + + row_mt_info = &multi_thread_ctxt->row_mt_info[cur_tile_id]; + job_queue_hndl = &row_mt_info->job_queue_hdl; +#if CONFIG_MULTITHREAD + mutex = &row_mt_info->job_mutex; +#endif + +#if CONFIG_MULTITHREAD + pthread_mutex_lock(mutex); +#endif + num_jobs_remaining = + multi_thread_ctxt->jobs_per_tile_col - job_queue_hndl->num_jobs_acquired; +#if CONFIG_MULTITHREAD + pthread_mutex_unlock(mutex); +#endif + + return (num_jobs_remaining); +} + +void vp9_prepare_job_queue(VP9_COMP *cpi, JOB_TYPE job_type) { + VP9_COMMON *const cm = &cpi->common; + MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt; + JobQueue *job_queue = multi_thread_ctxt->job_queue; + const int tile_cols = 1 << cm->log2_tile_cols; + int job_row_num, 
jobs_per_tile, jobs_per_tile_col, total_jobs; + const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; + int tile_col, i; + + jobs_per_tile_col = (job_type != ENCODE_JOB) ? cm->mb_rows : sb_rows; + total_jobs = jobs_per_tile_col * tile_cols; + + multi_thread_ctxt->jobs_per_tile_col = jobs_per_tile_col; + // memset the entire job queue buffer to zero + memset(job_queue, 0, total_jobs * sizeof(JobQueue)); + + // Job queue preparation + for (tile_col = 0; tile_col < tile_cols; tile_col++) { + RowMTInfo *tile_ctxt = &multi_thread_ctxt->row_mt_info[tile_col]; + JobQueue *job_queue_curr, *job_queue_temp; + int tile_row = 0; + + tile_ctxt->job_queue_hdl.next = (void *)job_queue; + tile_ctxt->job_queue_hdl.num_jobs_acquired = 0; + + job_queue_curr = job_queue; + job_queue_temp = job_queue; + + // loop over all the vertical rows + for (job_row_num = 0, jobs_per_tile = 0; job_row_num < jobs_per_tile_col; + job_row_num++, jobs_per_tile++) { + job_queue_curr->job_info.vert_unit_row_num = job_row_num; + job_queue_curr->job_info.tile_col_id = tile_col; + job_queue_curr->job_info.tile_row_id = tile_row; + job_queue_curr->next = (void *)(job_queue_temp + 1); + job_queue_curr = ++job_queue_temp; + + if (ENCODE_JOB == job_type) { + if (jobs_per_tile >= + multi_thread_ctxt->num_tile_vert_sbs[tile_row] - 1) { + tile_row++; + jobs_per_tile = -1; + } + } + } + + // Set the last pointer to NULL + job_queue_curr += -1; + job_queue_curr->next = (void *)NULL; + + // Move to the next tile + job_queue += jobs_per_tile_col; + } + + for (i = 0; i < cpi->num_workers; i++) { + EncWorkerData *thread_data; + thread_data = &cpi->tile_thr_data[i]; + thread_data->thread_id = i; + + for (tile_col = 0; tile_col < tile_cols; tile_col++) + thread_data->tile_completion_status[tile_col] = 0; + } +} + +int vp9_get_tiles_proc_status(MultiThreadHandle *multi_thread_ctxt, + int *tile_completion_status, int *cur_tile_id, + int tile_cols) { + int tile_col; + int tile_id = -1; // Stores the 
tile ID with minimum proc done + int max_num_jobs_remaining = 0; + int num_jobs_remaining; + + // Mark the completion to avoid check in the loop + tile_completion_status[*cur_tile_id] = 1; + // Check for the status of all the tiles + for (tile_col = 0; tile_col < tile_cols; tile_col++) { + if (tile_completion_status[tile_col] == 0) { + num_jobs_remaining = + vp9_get_job_queue_status(multi_thread_ctxt, tile_col); + // Mark the completion to avoid checks during future switches across tiles + if (num_jobs_remaining == 0) tile_completion_status[tile_col] = 1; + if (num_jobs_remaining > max_num_jobs_remaining) { + max_num_jobs_remaining = num_jobs_remaining; + tile_id = tile_col; + } + } + } + + if (-1 == tile_id) { + return 1; + } else { + // Update the cur ID to the next tile ID that will be processed, + // which will be the least processed tile + *cur_tile_id = tile_id; + return 0; + } +} diff --git a/vp9/encoder/vp9_multi_thread.h b/vp9/encoder/vp9_multi_thread.h new file mode 100644 index 0000000..bfc0c0a --- /dev/null +++ b/vp9/encoder/vp9_multi_thread.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2017 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#ifndef VP9_ENCODER_VP9_MULTI_THREAD_H +#define VP9_ENCODER_VP9_MULTI_THREAD_H + +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_job_queue.h" + +void *vp9_enc_grp_get_next_job(MultiThreadHandle *multi_thread_ctxt, + int tile_id); + +void vp9_prepare_job_queue(VP9_COMP *cpi, JOB_TYPE job_type); + +int vp9_get_job_queue_status(MultiThreadHandle *multi_thread_ctxt, + int cur_tile_id); + +void vp9_assign_tile_to_thread(MultiThreadHandle *multi_thread_ctxt, + int tile_cols, int num_workers); + +void vp9_multi_thread_tile_init(VP9_COMP *cpi); + +void vp9_row_mt_mem_alloc(VP9_COMP *cpi); + +void vp9_row_mt_mem_dealloc(VP9_COMP *cpi); + +int vp9_get_tiles_proc_status(MultiThreadHandle *multi_thread_ctxt, + int *tile_completion_status, int *cur_tile_id, + int tile_cols); + +#endif // VP9_ENCODER_VP9_MULTI_THREAD_H diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 1d5ed7c..c239ca6 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -51,6 +51,7 @@ struct vp9_extracfg { vpx_color_range_t color_range; int render_width; int render_height; + unsigned int new_mt; }; static struct vp9_extracfg default_extra_cfg = { @@ -82,6 +83,7 @@ static struct vp9_extracfg default_extra_cfg = { 0, // color range 0, // render width 0, // render height + 1, // new_mt }; struct vpx_codec_alg_priv { @@ -245,6 +247,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, "kf_min_dist not supported in auto mode, use 0 " "or kf_max_dist instead."); + RANGE_CHECK(extra_cfg, new_mt, 0, 1); RANGE_CHECK(extra_cfg, enable_auto_alt_ref, 0, 2); RANGE_CHECK(extra_cfg, cpu_used, -8, 8); RANGE_CHECK_HI(extra_cfg, noise_sensitivity, 6); @@ -554,6 +557,8 @@ static vpx_codec_err_t set_encoder_config( oxcf->target_level = extra_cfg->target_level; + oxcf->new_mt = extra_cfg->new_mt; + for (sl = 0; sl < oxcf->ss_number_layers; ++sl) { #if CONFIG_SPATIAL_SVC oxcf->ss_enable_auto_arf[sl] = cfg->ss_enable_auto_alt_ref[sl]; @@ -842,6 +847,13 @@ static vpx_codec_err_t 
ctrl_set_target_level(vpx_codec_alg_priv_t *ctx, return update_extra_cfg(ctx, &extra_cfg); } +static vpx_codec_err_t ctrl_set_new_mt(vpx_codec_alg_priv_t *ctx, + va_list args) { + struct vp9_extracfg extra_cfg = ctx->extra_cfg; + extra_cfg.new_mt = CAST(VP9E_SET_NEW_MT, args); + return update_extra_cfg(ctx, &extra_cfg); +} + static vpx_codec_err_t ctrl_get_level(vpx_codec_alg_priv_t *ctx, va_list args) { int *const arg = va_arg(args, int *); if (arg == NULL) return VPX_CODEC_INVALID_PARAM; @@ -1594,6 +1606,7 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { VP9E_SET_SVC_REF_FRAME_CONFIG, ctrl_set_svc_ref_frame_config }, { VP9E_SET_RENDER_SIZE, ctrl_set_render_size }, { VP9E_SET_TARGET_LEVEL, ctrl_set_target_level }, + { VP9E_SET_NEW_MT, ctrl_set_new_mt }, // Getters { VP8E_GET_LAST_QUANTIZER, ctrl_get_quantizer }, diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index a8ca0d5..87d9a77 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -39,9 +39,12 @@ VP9_CX_SRCS-yes += encoder/vp9_encodemb.h VP9_CX_SRCS-yes += encoder/vp9_encodemv.h VP9_CX_SRCS-yes += encoder/vp9_extend.h VP9_CX_SRCS-yes += encoder/vp9_firstpass.h +VP9_CX_SRCS-yes += encoder/vp9_job_queue.h VP9_CX_SRCS-yes += encoder/vp9_lookahead.c VP9_CX_SRCS-yes += encoder/vp9_lookahead.h VP9_CX_SRCS-yes += encoder/vp9_mcomp.h +VP9_CX_SRCS-yes += encoder/vp9_multi_thread.c +VP9_CX_SRCS-yes += encoder/vp9_multi_thread.h VP9_CX_SRCS-yes += encoder/vp9_encoder.h VP9_CX_SRCS-yes += encoder/vp9_quantize.h VP9_CX_SRCS-yes += encoder/vp9_ratectrl.h diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index cc90159..a04d7dd 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -547,6 +547,14 @@ enum vp8e_enc_control_id { */ VP9E_SET_TARGET_LEVEL, + /*!\brief Codec control function to set row level multi-threading. + * + * 0 : off, 1 : on + * + * Supported in codecs: VP9 + */ + VP9E_SET_NEW_MT, + /*!\brief Codec control function to get bitstream level. 
* * Supported in codecs: VP9 @@ -838,6 +846,9 @@ VPX_CTRL_USE_TYPE(VP9E_SET_RENDER_SIZE, int *) VPX_CTRL_USE_TYPE(VP9E_SET_TARGET_LEVEL, unsigned int) #define VPX_CTRL_VP9E_SET_TARGET_LEVEL +VPX_CTRL_USE_TYPE(VP9E_SET_NEW_MT, unsigned int) +#define VPX_CTRL_VP9E_SET_NEW_MT + VPX_CTRL_USE_TYPE(VP9E_GET_LEVEL, int *) #define VPX_CTRL_VP9E_GET_LEVEL diff --git a/vpxenc.c b/vpxenc.c index 9cd10ab..abb3bae 100644 --- a/vpxenc.c +++ b/vpxenc.c @@ -470,6 +470,9 @@ static const arg_def_t target_level = ARG_DEF( NULL, "target-level", 1, "Target level (255: off (default); 0: only keep level stats; 10: level 1.0;" " 11: level 1.1; ... 62: level 6.2)"); + +static const arg_def_t new_mt = + ARG_DEF(NULL, "new-mt", 1, "Enable row based multi-threading in VP9"); #endif #if CONFIG_VP9_ENCODER @@ -498,6 +501,7 @@ static const arg_def_t *vp9_args[] = { &cpu_used_vp9, &min_gf_interval, &max_gf_interval, &target_level, + &new_mt, #if CONFIG_VP9_HIGHBITDEPTH &bitdeptharg, &inbitdeptharg, @@ -528,6 +532,7 @@ static const int vp9_arg_ctrl_map[] = { VP8E_SET_CPUUSED, VP9E_SET_MIN_GF_INTERVAL, VP9E_SET_MAX_GF_INTERVAL, VP9E_SET_TARGET_LEVEL, + VP9E_SET_NEW_MT, 0 }; #endif -- 2.7.4