From 0d87098e080707947378dc7b192a56503c12c6bf Mon Sep 17 00:00:00 2001
From: Yunqing Wang
Date: Thu, 23 Jun 2011 13:54:02 -0400
Subject: [PATCH] Copy macroblock data to a buffer before encoding it

I got this idea from Pascal (thanks). Before encoding a macroblock,
copy it to a 16x16 buffer and then read the source data from there
instead. This helps keep the source data in cache and improves
encoding performance.
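For illustration only (not part of the patch): the copy performed by the
copy16x16 RECON function amounts to flattening a strided 16x16 luma block
into one contiguous 256-byte buffer. The helper below is a hypothetical
sketch of that operation; copy_mb_16x16 is an invented name, and the real
code dispatches through RECON_INVOKE to a platform-optimized routine.

    #include <string.h> /* memcpy */

    /* Sketch: copy a 16x16 macroblock from a strided source frame into
     * a contiguous destination buffer (x->thismb in this patch). */
    static void copy_mb_16x16(const unsigned char *src, int src_stride,
                              unsigned char *dst)
    {
        int r;

        for (r = 0; r < 16; r++)
        {
            memcpy(dst, src, 16); /* one 16-pixel row */
            src += src_stride;    /* rows are src_stride bytes apart */
            dst += 16;            /* rows are packed back to back */
        }
    }

Because every later read (subtraction, SAD, variance) then touches the
same 256 contiguous bytes instead of 16 rows scattered across the source
frame, the macroblock stays resident in L1 cache for the whole mode
search. The patch re-points all 16 Y BLOCKs at this buffer through the
base_src/thismb_ptr double indirection, so the mode-search code picks it
up without further changes.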
Change-Id: Id05f4cb601299150511d59dcba0ae62c49b5b757
---
 vp8/encoder/block.h       |  2 ++
 vp8/encoder/encodeframe.c | 16 +++++++++++++---
 vp8/encoder/encodeintra.c |  4 ++--
 vp8/encoder/encodemb.c    |  8 ++++++--
 vp8/encoder/ethreading.c  |  3 +++
 vp8/encoder/firstpass.c   |  3 +++
 vp8/encoder/pickinter.c   |  7 ++++---
 vp8/encoder/rdopt.c       | 28 +++++++++++++++-------------
 8 files changed, 48 insertions(+), 23 deletions(-)

diff --git a/vp8/encoder/block.h b/vp8/encoder/block.h
index e8a5b78..7d44493 100644
--- a/vp8/encoder/block.h
+++ b/vp8/encoder/block.h
@@ -65,7 +65,9 @@ typedef struct
 {
     DECLARE_ALIGNED(16, short, src_diff[400]); // 16x16 Y 8x8 U 8x8 V 4x4 2nd Y
     DECLARE_ALIGNED(16, short, coeff[400]);    // 16x16 Y 8x8 U 8x8 V 4x4 2nd Y
+    DECLARE_ALIGNED(16, unsigned char, thismb[256]);
 
+    unsigned char *thismb_ptr;
     // 16 Y blocks, 4 U blocks, 4 V blocks, 1 DC 2nd order block each with 16 entries
     BLOCK block[25];
 
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index 892284e..7343dcc 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -292,6 +292,9 @@ static void build_activity_map( VP8_COMP *cpi )
             xd->left_available = (mb_col != 0);
             recon_yoffset += 16;
 #endif
+            //Copy current mb to a buffer
+            RECON_INVOKE(&xd->rtcd->recon, copy16x16)(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
+
             // measure activity
             mb_activity = mb_activity_measure( cpi, x, mb_row, mb_col );
 
@@ -431,6 +434,9 @@ void encode_mb_row(VP8_COMP *cpi,
         x->rddiv = cpi->RDDIV;
         x->rdmult = cpi->RDMULT;
 
+        //Copy current mb to a buffer
+        RECON_INVOKE(&xd->rtcd->recon, copy16x16)(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
+
 #if CONFIG_MULTITHREAD
         if ((cpi->b_multi_threaded != 0) && (mb_row != 0))
         {
@@ -1014,14 +1020,18 @@ void vp8_build_block_offsets(MACROBLOCK *x)
     vp8_build_block_doffsets(&x->e_mbd);
 
     // y blocks
+    x->thismb_ptr = &x->thismb[0];
     for (br = 0; br < 4; br++)
     {
         for (bc = 0; bc < 4; bc++)
         {
             BLOCK *this_block = &x->block[block];
-            this_block->base_src = &x->src.y_buffer;
-            this_block->src_stride = x->src.y_stride;
-            this_block->src = 4 * br * this_block->src_stride + 4 * bc;
+            //this_block->base_src = &x->src.y_buffer;
+            //this_block->src_stride = x->src.y_stride;
+            //this_block->src = 4 * br * this_block->src_stride + 4 * bc;
+            this_block->base_src = &x->thismb_ptr;
+            this_block->src_stride = 16;
+            this_block->src = 4 * br * 16 + 4 * bc;
             ++block;
         }
     }
diff --git a/vp8/encoder/encodeintra.c b/vp8/encoder/encodeintra.c
index f076bbb..59db025 100644
--- a/vp8/encoder/encodeintra.c
+++ b/vp8/encoder/encodeintra.c
@@ -91,11 +91,11 @@ void vp8_encode_intra4x4mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *mb)
 
 void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
 {
-    int b;
+    BLOCK *b = &x->block[0];
 
     RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_mby)(&x->e_mbd);
 
-    ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, x->src.y_buffer, x->e_mbd.predictor, x->src.y_stride);
+    ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src), x->e_mbd.predictor, b->src_stride);
 
     vp8_transform_intra_mby(x);
 
diff --git a/vp8/encoder/encodemb.c b/vp8/encoder/encodemb.c
index fd42ee4..3ed16b6 100644
--- a/vp8/encoder/encodemb.c
+++ b/vp8/encoder/encodemb.c
@@ -101,7 +101,9 @@ void vp8_subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, in
 
 static void vp8_subtract_mb(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
 {
-    ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, x->src.y_buffer, x->e_mbd.predictor, x->src.y_stride);
+    BLOCK *b = &x->block[0];
+
+    ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src), x->e_mbd.predictor, b->src_stride);
     ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride);
 }
 
@@ -598,9 +600,11 @@ void vp8_encode_inter16x16(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
 /* this funciton is used by first pass only */
 void vp8_encode_inter16x16y(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x)
 {
+    BLOCK *b = &x->block[0];
+
     vp8_build_inter16x16_predictors_mby(&x->e_mbd);
 
-    ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, x->src.y_buffer, x->e_mbd.predictor, x->src.y_stride);
+    ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, *(b->base_src), x->e_mbd.predictor, b->src_stride);
 
     transform_mby(x);
 
diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c
index 420ed8e..1a37f03 100644
--- a/vp8/encoder/ethreading.c
+++ b/vp8/encoder/ethreading.c
@@ -148,6 +148,9 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
                     x->rddiv = cpi->RDDIV;
                     x->rdmult = cpi->RDMULT;
 
+                    //Copy current mb to a buffer
+                    RECON_INVOKE(&xd->rtcd->recon, copy16x16)(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
+
                     if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
                         vp8_activity_masking(cpi, x);
 
diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c
index a045760..2188f25 100644
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -577,6 +577,9 @@ void vp8_first_pass(VP8_COMP *cpi)
             xd->dst.v_buffer = new_yv12->v_buffer + recon_uvoffset;
             xd->left_available = (mb_col != 0);
 
+            //Copy current mb to a buffer
+            RECON_INVOKE(&xd->rtcd->recon, copy16x16)(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
+
             // do intra 16x16 prediction
             this_error = vp8_encode_intra(cpi, x, use_dc_pred);
 
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
index b60d241..99d69f6 100644
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -610,7 +610,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
                 rate2 += rate;
                 distortion2 = VARIANCE_INVOKE
                               (&cpi->rtcd.variance, var16x16)(
-                                  x->src.y_buffer, x->src.y_stride,
+                                  *(b->base_src), b->src_stride,
                                   x->e_mbd.predictor, 16, &sse);
                 this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
 
@@ -635,7 +635,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
             RECON_INVOKE(&cpi->common.rtcd.recon, build_intra_predictors_mby)
                 (&x->e_mbd);
             distortion2 = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16)
-                              (x->src.y_buffer, x->src.y_stride,
+                              (*(b->base_src), b->src_stride,
                                x->e_mbd.predictor, 16, &sse);
             rate2 += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mode_info_context->mbmi.mode];
             this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
 
@@ -904,6 +904,7 @@ void vp8_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate_)
     MB_PREDICTION_MODE mode, best_mode = DC_PRED;
     int this_rd;
     unsigned int sse;
+    BLOCK *b = &x->block[0];
 
     x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;
 
@@ -915,7 +916,7 @@ void vp8_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate_)
         RECON_INVOKE(&cpi->common.rtcd.recon, build_intra_predictors_mby)
             (&x->e_mbd);
         distortion = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16)
-                         (x->src.y_buffer, x->src.y_stride, x->e_mbd.predictor, 16, &sse);
+                         (*(b->base_src), b->src_stride, x->e_mbd.predictor, 16, &sse);
         rate = x->mbmode_cost[x->e_mbd.frame_type][mode];
         this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
 
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index 5ce61a0..c6e3461 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -543,8 +543,8 @@ static void macro_block_yrd( MACROBLOCK *mb,
     BLOCK  *beptr;
     int d;
 
-    ENCODEMB_INVOKE(rtcd, submby)( mb->src_diff, mb->src.y_buffer,
-                                   mb->e_mbd.predictor, mb->src.y_stride );
+    ENCODEMB_INVOKE(rtcd, submby)( mb->src_diff, *(mb->block[0].base_src),
+                                   mb->e_mbd.predictor, mb->block[0].src_stride );
 
     // Fdct and building the 2nd order block
     for (beptr = mb->block; beptr < mb->block + 16; beptr += 2)
@@ -1633,6 +1633,8 @@ void vp8_cal_sad(VP8_COMP *cpi, MACROBLOCKD *xd, MACROBLOCK *x, int recon_yoffse
 {
     int near_sad[8] = {0}; // 0-cf above, 1-cf left, 2-cf aboveleft, 3-lf current, 4-lf above, 5-lf left, 6-lf right, 7-lf below
+    BLOCK *b = &x->block[0];
+    unsigned char *src_y_ptr = *(b->base_src);
 
     //calculate sad for current frame 3 nearby MBs.
     if( xd->mb_to_top_edge==0 && xd->mb_to_left_edge ==0)
     {
@@ -1641,16 +1643,16 @@
     }else if(xd->mb_to_top_edge==0)
     {   //only has left MB for sad calculation.
         near_sad[0] = near_sad[2] = INT_MAX;
-        near_sad[1] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, xd->dst.y_buffer - 16,xd->dst.y_stride, 0x7fffffff);
+        near_sad[1] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - 16,xd->dst.y_stride, 0x7fffffff);
     }else if(xd->mb_to_left_edge ==0)
     {   //only has left MB for sad calculation.
         near_sad[1] = near_sad[2] = INT_MAX;
-        near_sad[0] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, xd->dst.y_buffer - xd->dst.y_stride *16,xd->dst.y_stride, 0x7fffffff);
+        near_sad[0] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - xd->dst.y_stride *16,xd->dst.y_stride, 0x7fffffff);
     }else
     {
-        near_sad[0] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, xd->dst.y_buffer - xd->dst.y_stride *16,xd->dst.y_stride, 0x7fffffff);
-        near_sad[1] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, xd->dst.y_buffer - 16,xd->dst.y_stride, 0x7fffffff);
-        near_sad[2] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, xd->dst.y_buffer - xd->dst.y_stride *16 -16,xd->dst.y_stride, 0x7fffffff);
+        near_sad[0] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - xd->dst.y_stride *16,xd->dst.y_stride, 0x7fffffff);
+        near_sad[1] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - 16,xd->dst.y_stride, 0x7fffffff);
+        near_sad[2] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, xd->dst.y_buffer - xd->dst.y_stride *16 -16,xd->dst.y_stride, 0x7fffffff);
     }
 
     if(cpi->common.last_frame_type != KEY_FRAME)
@@ -1665,14 +1667,14 @@ void vp8_cal_sad(VP8_COMP *cpi, MACROBLOCKD *xd, MACROBLOCK *x, int recon_yoffse
         if(xd->mb_to_bottom_edge==0) near_sad[7] = INT_MAX;
 
         if(near_sad[4] != INT_MAX)
-            near_sad[4] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, pre_y_buffer - pre_y_stride *16, pre_y_stride, 0x7fffffff);
+            near_sad[4] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer - pre_y_stride *16, pre_y_stride, 0x7fffffff);
         if(near_sad[5] != INT_MAX)
-            near_sad[5] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, pre_y_buffer - 16, pre_y_stride, 0x7fffffff);
-        near_sad[3] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, pre_y_buffer, pre_y_stride, 0x7fffffff);
+            near_sad[5] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer - 16, pre_y_stride, 0x7fffffff);
+        near_sad[3] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer, pre_y_stride, 0x7fffffff);
         if(near_sad[6] != INT_MAX)
-            near_sad[6] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, pre_y_buffer + 16, pre_y_stride, 0x7fffffff);
+            near_sad[6] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer + 16, pre_y_stride, 0x7fffffff);
         if(near_sad[7] != INT_MAX)
-            near_sad[7] = cpi->fn_ptr[BLOCK_16X16].sdf(x->src.y_buffer, x->src.y_stride, pre_y_buffer + pre_y_stride *16, pre_y_stride, 0x7fffffff);
+            near_sad[7] = cpi->fn_ptr[BLOCK_16X16].sdf(src_y_ptr, b->src_stride, pre_y_buffer + pre_y_stride *16, pre_y_stride, 0x7fffffff);
     }
 
     if(cpi->common.last_frame_type != KEY_FRAME)
@@ -2165,7 +2167,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
                 int threshold = x->encode_breakout;
 
                 var = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16)
-                        (x->src.y_buffer, x->src.y_stride,
+                        (*(b->base_src), b->src_stride,
                          x->e_mbd.predictor, 16, &sse);
 
                 if (sse < threshold)
-- 
2.7.4