From de8280944449db88e96f9f4909f383a542c35de0 Mon Sep 17 00:00:00 2001 From: Attila Nagy Date: Tue, 18 Oct 2011 09:48:50 +0300 Subject: [PATCH] Reduce partial frame copy in encoder's pick_filter_level_fast The partial frame copy function used to copy an extra 8 lines above and below. The partial frame filtering can only modify 3 pixel rows above the partial frame. Reduce copy to bare minimum needed, which is 4 lines, so that partial filtering on copied frame is possible. Define the "magic" fraction number for partial filtering in loopfilter.h . Change-Id: I4791ffc541b6884b12759a0d0714a8faf16147ec --- vp8/common/loopfilter.c | 20 ++++----- vp8/common/loopfilter.h | 5 ++- vp8/encoder/arm/arm_csystemdependent.c | 12 +++--- vp8/encoder/arm/neon/picklpf_arm.c | 40 +++++++++-------- vp8/encoder/arm/neon/vp8_memcpy_neon.asm | 8 ++-- vp8/encoder/generic/csystemdependent.c | 6 ++- vp8/encoder/picklpf.c | 73 +++++++++++++++++--------------- 7 files changed, 85 insertions(+), 79 deletions(-) diff --git a/vp8/common/loopfilter.c b/vp8/common/loopfilter.c index fe0644b..a38b49e 100644 --- a/vp8/common/loopfilter.c +++ b/vp8/common/loopfilter.c @@ -506,7 +506,8 @@ void vp8_loop_filter_partial_frame unsigned char *y_ptr; int mb_row; int mb_col; - int mb_cols = post->y_width >> 4; + int mb_cols = post->y_width >> 4; + int mb_rows = post->y_height >> 4; int linestocopy, i; @@ -521,15 +522,9 @@ void vp8_loop_filter_partial_frame int lvl_seg[MAX_MB_SEGMENTS]; - mode_info_context = cm->mi + (post->y_height >> 5) * (mb_cols + 1); - - /* 3 is a magic number. 4 is probably magic too */ - linestocopy = (post->y_height >> (4 + 3)); - - if (linestocopy < 1) - linestocopy = 1; - - linestocopy <<= 4; + /* number of MB rows to use in partial filtering */ + linestocopy = mb_rows / PARTIAL_FRAME_FRACTION; + linestocopy = linestocopy ? linestocopy << 4 : 16; /* 16 lines per MB */ /* Note the baseline filter values for each segment */ /* See vp8_loop_filter_frame_init. 
Rather than call that for each change @@ -554,8 +549,9 @@ void vp8_loop_filter_partial_frame } } - /* Set up the buffer pointers */ - y_ptr = post->y_buffer + (post->y_height >> 5) * 16 * post->y_stride; + /* Set up the buffer pointers; partial image starts at ~middle of frame */ + y_ptr = post->y_buffer + ((post->y_height >> 5) * 16) * post->y_stride; + mode_info_context = cm->mi + (post->y_height >> 5) * (mb_cols + 1); /* vp8_filter each macro block */ for (mb_row = 0; mb_row<(linestocopy >> 4); mb_row++) diff --git a/vp8/common/loopfilter.h b/vp8/common/loopfilter.h index 9887cf5..340339a 100644 --- a/vp8/common/loopfilter.h +++ b/vp8/common/loopfilter.h @@ -15,7 +15,10 @@ #include "vpx_ports/mem.h" #include "vpx_config.h" -#define MAX_LOOP_FILTER 63 +#define MAX_LOOP_FILTER 63 +/* fraction of total macroblock rows to be used in fast filter level picking */ +/* has to be > 2 */ +#define PARTIAL_FRAME_FRACTION 8 typedef enum { diff --git a/vp8/encoder/arm/arm_csystemdependent.c b/vp8/encoder/arm/arm_csystemdependent.c index 210a5a5..918d7d9 100644 --- a/vp8/encoder/arm/arm_csystemdependent.c +++ b/vp8/encoder/arm/arm_csystemdependent.c @@ -14,9 +14,9 @@ #include "vp8/encoder/variance.h" #include "vp8/encoder/onyx_int.h" -extern void (*vp8_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction); -extern void vp8_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction); -extern void vpxyv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction); +extern void (*vp8_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc); +extern void vp8_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc); +extern void vp8_yv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc); void vp8_arch_arm_encoder_init(VP8_COMP *cpi) { @@ -123,15 +123,15 @@ void 
vp8_arch_arm_encoder_init(VP8_COMP *cpi) cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_neon; cpi->rtcd.quantize.fastquantb_pair = vp8_fast_quantize_b_pair_neon; } -#endif +#endif /* HAVE_ARMV7 */ +#endif /* CONFIG_RUNTIME_CPU_DETECT */ #if HAVE_ARMV7 #if CONFIG_RUNTIME_CPU_DETECT if (flags & HAS_NEON) #endif { - vp8_yv12_copy_partial_frame_ptr = vpxyv12_copy_partial_frame_neon; + vp8_yv12_copy_partial_frame_ptr = vp8_yv12_copy_partial_frame_neon; } #endif -#endif } diff --git a/vp8/encoder/arm/neon/picklpf_arm.c b/vp8/encoder/arm/neon/picklpf_arm.c index 3fb370c..6610d2d 100644 --- a/vp8/encoder/arm/neon/picklpf_arm.c +++ b/vp8/encoder/arm/neon/picklpf_arm.c @@ -8,20 +8,16 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include "vp8/common/loopfilter.h" +#include "vpx_scale/yv12config.h" -#include "vp8/common/onyxc_int.h" -#include "vp8/encoder/onyx_int.h" -#include "vp8/encoder/quantize.h" -#include "vpx_mem/vpx_mem.h" -#include "vpx_scale/yv12extend.h" -#include "vpx_scale/vpxscale.h" -#include "vp8/common/alloccommon.h" +extern void vp8_memcpy_partial_neon(unsigned char *dst_ptr, + unsigned char *src_ptr, + int sz); -extern void vp8_memcpy_neon(unsigned char *dst_ptr, unsigned char *src_ptr, int sz); - -void -vpxyv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction) +void vp8_yv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc, + YV12_BUFFER_CONFIG *dst_ybc) { unsigned char *src_y, *dst_y; int yheight; @@ -34,17 +30,19 @@ vpxyv12_copy_partial_frame_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG yheight = src_ybc->y_height; ystride = src_ybc->y_stride; - linestocopy = (yheight >> (Fraction + 4)); - - if (linestocopy < 1) - linestocopy = 1; - - linestocopy <<= 4; - - yoffset = ystride * ((yheight >> 5) * 16 - 8); + /* number of MB rows to use in partial filtering */ + linestocopy = (yheight >> 4) / PARTIAL_FRAME_FRACTION; + linestocopy = linestocopy ? 
linestocopy << 4 : 16; /* 16 lines per MB */ + + /* Copy extra 4 so that full filter context is available if filtering done + * on the copied partial frame and not original. Partial filter does mb + * filtering for top row also, which can modify 3 pixels above. + */ + linestocopy += 4; + /* partial image starts at ~middle of frame (macroblock border) */ + yoffset = ystride * (((yheight >> 5) * 16) - 4); src_y = src_ybc->y_buffer + yoffset; dst_y = dst_ybc->y_buffer + yoffset; - //vpx_memcpy (dst_y, src_y, ystride * (linestocopy +16)); - vp8_memcpy_neon((unsigned char *)dst_y, (unsigned char *)src_y, (int)(ystride *(linestocopy + 16))); + vp8_memcpy_partial_neon(dst_y, src_y, ystride * linestocopy); } diff --git a/vp8/encoder/arm/neon/vp8_memcpy_neon.asm b/vp8/encoder/arm/neon/vp8_memcpy_neon.asm index b0450e5..5b9f11e 100644 --- a/vp8/encoder/arm/neon/vp8_memcpy_neon.asm +++ b/vp8/encoder/arm/neon/vp8_memcpy_neon.asm @@ -9,7 +9,7 @@ ; - EXPORT |vp8_memcpy_neon| + EXPORT |vp8_memcpy_partial_neon| ARM REQUIRE8 @@ -17,8 +17,10 @@ AREA ||.text||, CODE, READONLY, ALIGN=2 ;========================================= -;void vp8_memcpy_neon(unsigned char *dst_ptr, unsigned char *src_ptr, int sz); -|vp8_memcpy_neon| PROC +;this is not a full memcpy function!!! 
+;void vp8_memcpy_partial_neon(unsigned char *dst_ptr, unsigned char *src_ptr, +; int sz); +|vp8_memcpy_partial_neon| PROC ;pld [r1] ;preload pred data ;pld [r1, #128] ;pld [r1, #256] diff --git a/vp8/encoder/generic/csystemdependent.c b/vp8/encoder/generic/csystemdependent.c index 28526f3..1a6fce9 100644 --- a/vp8/encoder/generic/csystemdependent.c +++ b/vp8/encoder/generic/csystemdependent.c @@ -17,8 +17,10 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi); void vp8_arch_arm_encoder_init(VP8_COMP *cpi); -void (*vp8_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction); -extern void vp8_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction); +void (*vp8_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc, + YV12_BUFFER_CONFIG *dst_ybc); +extern void vp8_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc, + YV12_BUFFER_CONFIG *dst_ybc); void vp8_cmachine_specific_config(VP8_COMP *cpi) { diff --git a/vp8/encoder/picklpf.c b/vp8/encoder/picklpf.c index e01c59e..c1e5f77 100644 --- a/vp8/encoder/picklpf.c +++ b/vp8/encoder/picklpf.c @@ -29,12 +29,11 @@ extern int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, #define IF_RTCD(x) NULL #endif -extern void -(*vp8_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc, - YV12_BUFFER_CONFIG *dst_ybc, - int Fraction); -void -vp8_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst_ybc, int Fraction) +extern void (*vp8_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc, + YV12_BUFFER_CONFIG *dst_ybc); + +void vp8_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc, + YV12_BUFFER_CONFIG *dst_ybc) { unsigned char *src_y, *dst_y; int yheight; @@ -47,21 +46,26 @@ vp8_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc, YV12_BUFFER_CONFIG *dst yheight = src_ybc->y_height; ystride = src_ybc->y_stride; - linestocopy = (yheight >> (Fraction + 4)); - - if (linestocopy < 1) - linestocopy = 
1; - - linestocopy <<= 4; - - yoffset = ystride * ((yheight >> 5) * 16 - 8); + /* number of MB rows to use in partial filtering */ + linestocopy = (yheight >> 4) / PARTIAL_FRAME_FRACTION; + linestocopy = linestocopy ? linestocopy << 4 : 16; /* 16 lines per MB */ + + /* Copy extra 4 so that full filter context is available if filtering done + * on the copied partial frame and not original. Partial filter does mb + * filtering for top row also, which can modify 3 pixels above. + */ + linestocopy += 4; + /* partial image starts at ~middle of frame (macroblock border)*/ + yoffset = ystride * (((yheight >> 5) * 16) - 4); src_y = src_ybc->y_buffer + yoffset; dst_y = dst_ybc->y_buffer + yoffset; - vpx_memcpy(dst_y, src_y, ystride *(linestocopy + 16)); + vpx_memcpy(dst_y, src_y, ystride * linestocopy); } -static int vp8_calc_partial_ssl_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, int Fraction, const vp8_variance_rtcd_vtable_t *rtcd) +static int calc_partial_ssl_err(YV12_BUFFER_CONFIG *source, + YV12_BUFFER_CONFIG *dest, + const vp8_variance_rtcd_vtable_t *rtcd) { int i, j; int Total = 0; @@ -69,17 +73,16 @@ static int vp8_calc_partial_ssl_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONF unsigned char *src = source->y_buffer; unsigned char *dst = dest->y_buffer; - int linestocopy = (source->y_height >> (Fraction + 4)); - (void)rtcd; - - if (linestocopy < 1) - linestocopy = 1; + int linestocopy; - linestocopy <<= 4; + /* number of MB rows to use in partial filtering */ + linestocopy = (source->y_height >> 4) / PARTIAL_FRAME_FRACTION; + linestocopy = linestocopy ? 
linestocopy << 4 : 16; /* 16 lines per MB */ - srcoffset = source->y_stride * (dest->y_height >> 5) * 16; - dstoffset = dest->y_stride * (dest->y_height >> 5) * 16; + /* partial image starts at ~middle of frame (macroblock border)*/ + srcoffset = source->y_stride * ((dest->y_height >> 5) * 16); + dstoffset = dest->y_stride * ((dest->y_height >> 5) * 16); src += srcoffset; dst += dstoffset; @@ -90,7 +93,9 @@ static int vp8_calc_partial_ssl_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONF for (j = 0; j < source->y_width; j += 16) { unsigned int sse; - Total += VARIANCE_INVOKE(rtcd, mse16x16)(src + j, source->y_stride, dst + j, dest->y_stride, &sse); + Total += VARIANCE_INVOKE(rtcd, mse16x16)(src + j, source->y_stride, + dst + j, dest->y_stride, + &sse); } src += 16 * source->y_stride; @@ -105,7 +110,8 @@ static int get_min_filter_level(VP8_COMP *cpi, int base_qindex) { int min_filter_level; - if (cpi->source_alt_ref_active && cpi->common.refresh_golden_frame && !cpi->common.refresh_alt_ref_frame) + if (cpi->source_alt_ref_active && cpi->common.refresh_golden_frame && + !cpi->common.refresh_alt_ref_frame) min_filter_level = 0; else { @@ -148,7 +154,7 @@ void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) int best_filt_val = cm->filter_level; // Make a copy of the unfiltered / processed recon buffer - vp8_yv12_copy_partial_frame_ptr(cm->frame_to_show, &cpi->last_frame_uf, 3); + vp8_yv12_copy_partial_frame_ptr(cm->frame_to_show, &cpi->last_frame_uf); if (cm->frame_type == KEY_FRAME) cm->sharpness_level = 0; @@ -173,10 +179,10 @@ void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) // Get the err using the previous frame's filter value. 
vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val); - best_err = vp8_calc_partial_ssl_err(sd, cm->frame_to_show, 3, IF_RTCD(&cpi->rtcd.variance)); + best_err = calc_partial_ssl_err(sd, cm->frame_to_show, IF_RTCD(&cpi->rtcd.variance)); // Re-instate the unfiltered frame - vp8_yv12_copy_partial_frame_ptr(&cpi->last_frame_uf, cm->frame_to_show, 3); + vp8_yv12_copy_partial_frame_ptr(&cpi->last_frame_uf, cm->frame_to_show); filt_val -= (1 + ((filt_val > 10) ? 1 : 0)); @@ -187,11 +193,10 @@ void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val); // Get the err for filtered frame - filt_err = vp8_calc_partial_ssl_err(sd, cm->frame_to_show, 3, IF_RTCD(&cpi->rtcd.variance)); + filt_err = calc_partial_ssl_err(sd, cm->frame_to_show, IF_RTCD(&cpi->rtcd.variance)); // Re-instate the unfiltered frame - vp8_yv12_copy_partial_frame_ptr(&cpi->last_frame_uf, cm->frame_to_show, 3); - + vp8_yv12_copy_partial_frame_ptr(&cpi->last_frame_uf, cm->frame_to_show); // Update the best case record or exit loop. if (filt_err < best_err) @@ -220,10 +225,10 @@ void vp8cx_pick_filter_level_fast(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) vp8_loop_filter_partial_frame(cm, &cpi->mb.e_mbd, filt_val); // Get the err for filtered frame - filt_err = vp8_calc_partial_ssl_err(sd, cm->frame_to_show, 3, IF_RTCD(&cpi->rtcd.variance)); + filt_err = calc_partial_ssl_err(sd, cm->frame_to_show, IF_RTCD(&cpi->rtcd.variance)); // Re-instate the unfiltered frame - vp8_yv12_copy_partial_frame_ptr(&cpi->last_frame_uf, cm->frame_to_show, 3); + vp8_yv12_copy_partial_frame_ptr(&cpi->last_frame_uf, cm->frame_to_show); // Update the best case record or exit loop. if (filt_err < best_err) -- 2.7.4