From: Yunqing Wang Date: Tue, 6 Jan 2015 22:14:26 +0000 (-0800) Subject: vp9_ethread: add parallel loopfilter X-Git-Tag: v1.4.0~240^2 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=e76eaf05b1a20b906d166b7e8db402a9883c6025;p=platform%2Fupstream%2Flibvpx.git vp9_ethread: add parallel loopfilter 1. Added row-based loopfilter in encoder; 2. Moved common multi-threaded loopfilter functions from decoder to common; 3. Merged multi-threaded loopfilter code, and made encoder/decoder call the same function to reduce code duplication. Encoder tests showed that 1% - 2% speedup was seen for good-quality 2-pass mode (at speed 3); 1% - 3% speedup using 2 threads and 4% - 6% speedup using 4 threads were seen for real-time mode (at speed 7). Change-Id: I8a4ac51c2ad9bab9fa7b864e90743931c53ec1c4 --- diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c index 58b2da7..2101ec5 100644 --- a/vp9/common/vp9_loopfilter.c +++ b/vp9/common/vp9_loopfilter.c @@ -1149,10 +1149,10 @@ static void highbd_filter_selectively_vert(uint16_t *s, int pitch, } #endif // CONFIG_VP9_HIGHBITDEPTH -static void filter_block_plane_non420(VP9_COMMON *cm, - struct macroblockd_plane *plane, - MODE_INFO *mi_8x8, - int mi_row, int mi_col) { +void vp9_filter_block_plane_non420(VP9_COMMON *cm, + struct macroblockd_plane *plane, + MODE_INFO *mi_8x8, + int mi_row, int mi_col) { const int ss_x = plane->subsampling_x; const int ss_y = plane->subsampling_y; const int row_step = 1 << ss_y; @@ -1598,8 +1598,8 @@ void vp9_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer, if (use_420) vp9_filter_block_plane(cm, &planes[plane], mi_row, &lfm); else - filter_block_plane_non420(cm, &planes[plane], mi + mi_col, - mi_row, mi_col); + vp9_filter_block_plane_non420(cm, &planes[plane], mi + mi_col, + mi_row, mi_col); } } } diff --git a/vp9/common/vp9_loopfilter.h b/vp9/common/vp9_loopfilter.h index 4c15e6b..6d7cabf 100644 --- a/vp9/common/vp9_loopfilter.h +++ b/vp9/common/vp9_loopfilter.h @@ -97,6 +97,11 @@ 
void vp9_filter_block_plane(struct VP9Common *const cm, int mi_row, LOOP_FILTER_MASK *lfm); +void vp9_filter_block_plane_non420(struct VP9Common *cm, + struct macroblockd_plane *plane, + MODE_INFO *mi_8x8, + int mi_row, int mi_col); + void vp9_loop_filter_init(struct VP9Common *cm); // Update the loop filter for the current frame. diff --git a/vp9/decoder/vp9_dthread.c b/vp9/common/vp9_loopfilter_thread.c similarity index 78% rename from vp9/decoder/vp9_dthread.c rename to vp9/common/vp9_loopfilter_thread.c index 3d2d0dd..bd29ee0 100644 --- a/vp9/decoder/vp9_dthread.c +++ b/vp9/common/vp9_loopfilter_thread.c @@ -9,14 +9,10 @@ */ #include "./vpx_config.h" - #include "vpx_mem/vpx_mem.h" - +#include "vp9/common/vp9_loopfilter_thread.h" #include "vp9/common/vp9_reconinter.h" -#include "vp9/decoder/vp9_dthread.h" -#include "vp9/decoder/vp9_decoder.h" - #if CONFIG_MULTITHREAD static INLINE void mutex_lock(pthread_mutex_t *const mutex) { const int kMaxTryLocks = 4000; @@ -88,21 +84,23 @@ static INLINE void sync_write(VP9LfSync *const lf_sync, int r, int c, } // Implement row loopfiltering for each thread. -static void loop_filter_rows_mt(const YV12_BUFFER_CONFIG *const frame_buffer, - VP9_COMMON *const cm, - struct macroblockd_plane planes[MAX_MB_PLANE], - int start, int stop, int y_only, - VP9LfSync *const lf_sync) { +static INLINE +void thread_loop_filter_rows(const YV12_BUFFER_CONFIG *const frame_buffer, + VP9_COMMON *const cm, + struct macroblockd_plane planes[MAX_MB_PLANE], + int start, int stop, int y_only, + VP9LfSync *const lf_sync) { const int num_planes = y_only ? 
1 : MAX_MB_PLANE; - int r, c; // SB row and col const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2; + int mi_row, mi_col; - for (r = start; r < stop; r += lf_sync->num_workers) { - const int mi_row = r << MI_BLOCK_SIZE_LOG2; + for (mi_row = start; mi_row < stop; + mi_row += lf_sync->num_workers * MI_BLOCK_SIZE) { MODE_INFO *const mi = cm->mi + mi_row * cm->mi_stride; - for (c = 0; c < sb_cols; ++c) { - const int mi_col = c << MI_BLOCK_SIZE_LOG2; + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) { + const int r = mi_row >> MI_BLOCK_SIZE_LOG2; + const int c = mi_col >> MI_BLOCK_SIZE_LOG2; LOOP_FILTER_MASK lfm; int plane; @@ -123,37 +121,33 @@ static void loop_filter_rows_mt(const YV12_BUFFER_CONFIG *const frame_buffer, // Row-based multi-threaded loopfilter hook static int loop_filter_row_worker(VP9LfSync *const lf_sync, LFWorkerData *const lf_data) { - loop_filter_rows_mt(lf_data->frame_buffer, lf_data->cm, lf_data->planes, - lf_data->start, lf_data->stop, lf_data->y_only, lf_sync); + thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes, + lf_data->start, lf_data->stop, lf_data->y_only, + lf_sync); return 1; } -// VP9 decoder: Implement multi-threaded loopfilter that uses the tile -// threads. 
-void vp9_loop_filter_frame_mt(VP9LfSync *lf_sync, - YV12_BUFFER_CONFIG *frame, - struct macroblockd_plane planes[MAX_MB_PLANE], - VP9_COMMON *cm, - VP9Worker *workers, int nworkers, - int frame_filter_level, - int y_only) { +static void loop_filter_rows_mt(YV12_BUFFER_CONFIG *frame, + VP9_COMMON *cm, + struct macroblockd_plane planes[MAX_MB_PLANE], + int start, int stop, int y_only, + VP9Worker *workers, int nworkers, + VP9LfSync *lf_sync) { const VP9WorkerInterface *const winterface = vp9_get_worker_interface(); // Number of superblock rows and cols const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2; + // Decoder may allocate more threads than number of tiles based on user's + // input. const int tile_cols = 1 << cm->log2_tile_cols; const int num_workers = MIN(nworkers, tile_cols); int i; - if (!frame_filter_level) return; - if (!lf_sync->sync_range || cm->last_height != cm->height || num_workers > lf_sync->num_workers) { vp9_loop_filter_dealloc(lf_sync); vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers); } - vp9_loop_filter_frame_init(cm, frame_filter_level); - // Initialize cur_sb_col to -1 for all SB rows. 
vpx_memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows); @@ -175,8 +169,8 @@ void vp9_loop_filter_frame_mt(VP9LfSync *lf_sync, // Loopfilter data vp9_loop_filter_data_reset(lf_data, frame, cm, planes); - lf_data->start = i; - lf_data->stop = sb_rows; + lf_data->start = start + i * MI_BLOCK_SIZE; + lf_data->stop = stop; lf_data->y_only = y_only; // Start loopfiltering @@ -193,8 +187,33 @@ void vp9_loop_filter_frame_mt(VP9LfSync *lf_sync, } } +void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, + VP9_COMMON *cm, + struct macroblockd_plane planes[MAX_MB_PLANE], + int frame_filter_level, + int y_only, int partial_frame, + VP9Worker *workers, int num_workers, + VP9LfSync *lf_sync) { + int start_mi_row, end_mi_row, mi_rows_to_filter; + + if (!frame_filter_level) return; + + start_mi_row = 0; + mi_rows_to_filter = cm->mi_rows; + if (partial_frame && cm->mi_rows > 8) { + start_mi_row = cm->mi_rows >> 1; + start_mi_row &= 0xfffffff8; + mi_rows_to_filter = MAX(cm->mi_rows / 8, 8); + } + end_mi_row = start_mi_row + mi_rows_to_filter; + vp9_loop_filter_frame_init(cm, frame_filter_level); + + loop_filter_rows_mt(frame, cm, planes, start_mi_row, end_mi_row, + y_only, workers, num_workers, lf_sync); +} + // Set up nsync by width. -static int get_sync_range(int width) { +static INLINE int get_sync_range(int width) { // nsync numbers are picked by testing. For example, for 4k // video, using 4 gives best performance. if (width < 640) diff --git a/vp9/decoder/vp9_dthread.h b/vp9/common/vp9_loopfilter_thread.h similarity index 71% rename from vp9/decoder/vp9_dthread.h rename to vp9/common/vp9_loopfilter_thread.h index 664aaa3..bca357e 100644 --- a/vp9/decoder/vp9_dthread.h +++ b/vp9/common/vp9_loopfilter_thread.h @@ -8,23 +8,13 @@ * be found in the AUTHORS file in the root of the source tree. 
*/ -#ifndef VP9_DECODER_VP9_DTHREAD_H_ -#define VP9_DECODER_VP9_DTHREAD_H_ - +#ifndef VP9_COMMON_VP9_LOOPFILTER_THREAD_H_ +#define VP9_COMMON_VP9_LOOPFILTER_THREAD_H_ #include "./vpx_config.h" +#include "vp9/common/vp9_loopfilter.h" #include "vp9/common/vp9_thread.h" -#include "vp9/decoder/vp9_reader.h" -#include "vpx/internal/vpx_codec_internal.h" struct VP9Common; -struct VP9Decoder; - -typedef struct TileWorkerData { - struct VP9Common *cm; - vp9_reader bit_reader; - DECLARE_ALIGNED(16, struct macroblockd, xd); - struct vpx_internal_error_info error_info; -} TileWorkerData; // Loopfilter row synchronization typedef struct VP9LfSyncData { @@ -45,19 +35,19 @@ typedef struct VP9LfSyncData { } VP9LfSync; // Allocate memory for loopfilter row synchronization. -void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows, +void vp9_loop_filter_alloc(VP9LfSync *lf_sync, struct VP9Common *cm, int rows, int width, int num_workers); // Deallocate loopfilter synchronization related mutex and data. void vp9_loop_filter_dealloc(VP9LfSync *lf_sync); // Multi-threaded loopfilter that uses the tile threads. 
-void vp9_loop_filter_frame_mt(VP9LfSync *lf_sync, - YV12_BUFFER_CONFIG *frame, - struct macroblockd_plane planes[MAX_MB_PLANE], +void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, struct VP9Common *cm, - VP9Worker *workers, int num_workers, + struct macroblockd_plane planes[MAX_MB_PLANE], int frame_filter_level, - int y_only); + int y_only, int partial_frame, + VP9Worker *workers, int num_workers, + VP9LfSync *lf_sync); -#endif // VP9_DECODER_VP9_DTHREAD_H_ +#endif // VP9_COMMON_VP9_LOOPFILTER_THREAD_H_ diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index d135199..ea4edbf 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ b/vp9/decoder/vp9_decodeframe.c @@ -36,7 +36,6 @@ #include "vp9/decoder/vp9_decodemv.h" #include "vp9/decoder/vp9_decoder.h" #include "vp9/decoder/vp9_dsubexp.h" -#include "vp9/decoder/vp9_dthread.h" #include "vp9/decoder/vp9_read_bit_buffer.h" #include "vp9/decoder/vp9_reader.h" @@ -1591,9 +1590,9 @@ void vp9_decode_frame(VP9Decoder *pbi, if (!xd->corrupted) { // If multiple threads are used to decode tiles, then we use those threads // to do parallel loopfiltering. - vp9_loop_filter_frame_mt(&pbi->lf_row_sync, new_fb, pbi->mb.plane, cm, - pbi->tile_workers, pbi->num_tile_workers, - cm->lf.filter_level, 0); + vp9_loop_filter_frame_mt(new_fb, cm, pbi->mb.plane, cm->lf.filter_level, + 0, 0, pbi->tile_workers, pbi->num_tile_workers, + &pbi->lf_row_sync); } else { vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME, "Decode failed. 
Frame data is corrupted."); diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c index 1d254d2..7bef265 100644 --- a/vp9/decoder/vp9_decoder.c +++ b/vp9/decoder/vp9_decoder.c @@ -32,7 +32,6 @@ #include "vp9/decoder/vp9_decodeframe.h" #include "vp9/decoder/vp9_decoder.h" #include "vp9/decoder/vp9_detokenize.h" -#include "vp9/decoder/vp9_dthread.h" static void initialize_dec(void) { static volatile int init_done = 0; diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h index 25b7339..1415019 100644 --- a/vp9/decoder/vp9_decoder.h +++ b/vp9/decoder/vp9_decoder.h @@ -15,12 +15,11 @@ #include "vpx/vpx_codec.h" #include "vpx_scale/yv12config.h" - +#include "vp9/common/vp9_loopfilter_thread.h" #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_ppflags.h" #include "vp9/common/vp9_thread.h" - -#include "vp9/decoder/vp9_dthread.h" +#include "vp9/decoder/vp9_reader.h" #ifdef __cplusplus extern "C" { @@ -33,6 +32,13 @@ typedef struct TileData { DECLARE_ALIGNED(16, MACROBLOCKD, xd); } TileData; +typedef struct TileWorkerData { + VP9_COMMON *cm; + vp9_reader bit_reader; + DECLARE_ALIGNED(16, MACROBLOCKD, xd); + struct vpx_internal_error_info error_info; +} TileWorkerData; + typedef struct VP9Decoder { DECLARE_ALIGNED(16, MACROBLOCKD, mb); diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 7cfe0de..35fea57 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -1786,7 +1786,7 @@ void vp9_remove_compressor(VP9_COMP *cpi) { for (t = 0; t < cpi->num_workers; ++t) { VP9Worker *const worker = &cpi->workers[t]; - EncWorkerData *const thread_data = (EncWorkerData*)worker->data1; + EncWorkerData *const thread_data = &cpi->tile_thr_data[t]; // Deallocate allocated threads. 
vp9_get_worker_interface()->end(worker); @@ -1797,11 +1797,13 @@ void vp9_remove_compressor(VP9_COMP *cpi) { vp9_free_pc_tree(thread_data->td); vpx_free(thread_data->td); } - - vpx_free(worker->data1); } + vpx_free(cpi->tile_thr_data); vpx_free(cpi->workers); + if (cpi->num_workers > 1) + vp9_loop_filter_dealloc(&cpi->lf_row_sync); + dealloc_compressor_data(cpi); for (i = 0; i < sizeof(cpi->mbgraph_stats) / @@ -2437,7 +2439,13 @@ static void loopfilter_frame(VP9_COMP *cpi, VP9_COMMON *cm) { } if (lf->filter_level > 0) { - vp9_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0); + if (cpi->num_workers > 1) + vp9_loop_filter_frame_mt(cm->frame_to_show, cm, xd->plane, + lf->filter_level, 0, 0, + cpi->workers, cpi->num_workers, + &cpi->lf_row_sync); + else + vp9_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0); } vp9_extend_frame_inner_borders(cm->frame_to_show); diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index a41284a..cf269c1 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -19,6 +19,7 @@ #include "vp9/common/vp9_ppflags.h" #include "vp9/common/vp9_entropymode.h" +#include "vp9/common/vp9_loopfilter_thread.h" #include "vp9/common/vp9_onyxc_int.h" #include "vp9/common/vp9_thread.h" @@ -36,6 +37,7 @@ #include "vp9/encoder/vp9_svc_layercontext.h" #include "vp9/encoder/vp9_tokenize.h" #include "vp9/encoder/vp9_variance.h" + #if CONFIG_VP9_TEMPORAL_DENOISING #include "vp9/encoder/vp9_denoiser.h" #endif @@ -262,6 +264,8 @@ typedef struct ThreadData { PC_TREE *pc_root; } ThreadData; +struct EncWorkerData; + typedef struct VP9_COMP { QUANTS quants; ThreadData td; @@ -447,6 +451,8 @@ typedef struct VP9_COMP { // Multi-threading int num_workers; VP9Worker *workers; + struct EncWorkerData *tile_thr_data; + VP9LfSync lf_row_sync; } VP9_COMP; void vp9_initialize_enc(void); diff --git a/vp9/encoder/vp9_ethread.c b/vp9/encoder/vp9_ethread.c index 9ee3236..12fb4d1 100644 --- 
a/vp9/encoder/vp9_ethread.c +++ b/vp9/encoder/vp9_ethread.c @@ -167,16 +167,15 @@ void vp9_encode_tiles_mt(VP9_COMP *cpi) { CHECK_MEM_ERROR(cm, cpi->workers, vpx_malloc(num_workers * sizeof(*cpi->workers))); + CHECK_MEM_ERROR(cm, cpi->tile_thr_data, + vpx_calloc(num_workers, sizeof(*cpi->tile_thr_data))); + for (i = 0; i < num_workers; i++) { VP9Worker *const worker = &cpi->workers[i]; - EncWorkerData *thread_data; + EncWorkerData *thread_data = &cpi->tile_thr_data[i]; ++cpi->num_workers; - winterface->init(worker); - CHECK_MEM_ERROR(cm, worker->data1, - (EncWorkerData*)vpx_calloc(1, sizeof(EncWorkerData))); - thread_data = (EncWorkerData*)worker->data1; if (i < num_workers - 1) { thread_data->cpi = cpi; @@ -205,17 +204,18 @@ void vp9_encode_tiles_mt(VP9_COMP *cpi) { thread_data->td = &cpi->td; } - // data2 is unused. - worker->data2 = NULL; - winterface->sync(worker); - worker->hook = (VP9WorkerHook)enc_worker_hook; } } for (i = 0; i < num_workers; i++) { VP9Worker *const worker = &cpi->workers[i]; - EncWorkerData *const thread_data = (EncWorkerData*)worker->data1; + EncWorkerData *thread_data; + + worker->hook = (VP9WorkerHook)enc_worker_hook; + worker->data1 = &cpi->tile_thr_data[i]; + worker->data2 = NULL; + thread_data = (EncWorkerData*)worker->data1; // Before encoding a frame, copy the thread data from cpi. 
thread_data->td->mb = cpi->td.mb; diff --git a/vp9/encoder/vp9_picklpf.c b/vp9/encoder/vp9_picklpf.c index 01cc519..a95f0f4 100644 --- a/vp9/encoder/vp9_picklpf.c +++ b/vp9/encoder/vp9_picklpf.c @@ -39,8 +39,14 @@ static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd, VP9_COMMON *const cm = &cpi->common; int64_t filt_err; - vp9_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level, 1, - partial_frame); + if (cpi->num_workers > 1) + vp9_loop_filter_frame_mt(cm->frame_to_show, cm, cpi->td.mb.e_mbd.plane, + filt_level, 1, partial_frame, + cpi->workers, cpi->num_workers, &cpi->lf_row_sync); + else + vp9_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level, + 1, partial_frame); + #if CONFIG_VP9_HIGHBITDEPTH if (cm->use_highbitdepth) { filt_err = vp9_highbd_get_y_sse(sd, cm->frame_to_show); diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index 76602c2..06096a6 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -33,6 +33,7 @@ VP9_COMMON_SRCS-yes += common/vp9_entropymv.h VP9_COMMON_SRCS-yes += common/vp9_enums.h VP9_COMMON_SRCS-yes += common/vp9_idct.h VP9_COMMON_SRCS-yes += common/vp9_loopfilter.h +VP9_COMMON_SRCS-yes += common/vp9_loopfilter_thread.h VP9_COMMON_SRCS-yes += common/vp9_mv.h VP9_COMMON_SRCS-yes += common/vp9_onyxc_int.h VP9_COMMON_SRCS-yes += common/vp9_pred_common.h @@ -56,6 +57,7 @@ VP9_COMMON_SRCS-yes += common/vp9_tile_common.h VP9_COMMON_SRCS-yes += common/vp9_tile_common.c VP9_COMMON_SRCS-yes += common/vp9_loopfilter.c VP9_COMMON_SRCS-yes += common/vp9_loopfilter_filters.c +VP9_COMMON_SRCS-yes += common/vp9_loopfilter_thread.c VP9_COMMON_SRCS-yes += common/vp9_mvref_common.c VP9_COMMON_SRCS-yes += common/vp9_mvref_common.h VP9_COMMON_SRCS-yes += common/vp9_quant_common.c diff --git a/vp9/vp9dx.mk b/vp9/vp9dx.mk index 1fcb36f..603158a 100644 --- a/vp9/vp9dx.mk +++ b/vp9/vp9dx.mk @@ -21,8 +21,6 @@ VP9_DX_SRCS-yes += decoder/vp9_decodemv.c VP9_DX_SRCS-yes += decoder/vp9_decodeframe.c VP9_DX_SRCS-yes += 
decoder/vp9_decodeframe.h VP9_DX_SRCS-yes += decoder/vp9_detokenize.c -VP9_DX_SRCS-yes += decoder/vp9_dthread.c -VP9_DX_SRCS-yes += decoder/vp9_dthread.h VP9_DX_SRCS-yes += decoder/vp9_reader.h VP9_DX_SRCS-yes += decoder/vp9_reader.c VP9_DX_SRCS-yes += decoder/vp9_read_bit_buffer.c