From fd903902ef2c6fef505829dffa4aad6b4d85950e Mon Sep 17 00:00:00 2001
From: Johann
Date: Fri, 27 Jan 2012 10:23:52 -0800
Subject: [PATCH] RFC: Reorganize MFQE loops

Break MFQE code into its own file.

It is currently only valid for 16x16 and 8x8 Y blocks. It also filters
4x4 U/V blocks.

Refactor filtering and add associated assembly. Limited test cases show
--mfqe introduces a penalty of ~20% with HD content. The assembly
reduces the penalty to ~15%.

Change-Id: I4b8de6b5cdff5413037de5b6c42f437033ee55bf
---
 vp8/common/asm_com_offsets.c |  16 +++
 vp8/common/mfqe.c            | 271 ++++++++++++++++++++++++++++++++++++++++++
 vp8/common/postproc.c        | 209 --------------------------------
 vp8/common/postproc.h        |   4 +
 vp8/common/rtcd_defs.sh      |   9 ++
 vp8/common/x86/mfqe_sse2.asm | 281 +++++++++++++++++++++++++++++++++++++++++++
 vp8/vp8_common.mk            |   2 +
 7 files changed, 583 insertions(+), 209 deletions(-)
 create mode 100644 vp8/common/mfqe.c
 create mode 100644 vp8/common/x86/mfqe_sse2.asm

diff --git a/vp8/common/asm_com_offsets.c b/vp8/common/asm_com_offsets.c
index 5cf1519..ae22b5f 100644
--- a/vp8/common/asm_com_offsets.c
+++ b/vp8/common/asm_com_offsets.c
@@ -15,6 +15,10 @@
 #include "vpx_scale/yv12config.h"
 #include "vp8/common/blockd.h"
 
+#if CONFIG_POSTPROC
+#include "postproc.h"
+#endif /* CONFIG_POSTPROC */
+
 BEGIN
 
 /* vpx_scale */
@@ -30,6 +34,11 @@ DEFINE(yv12_buffer_config_v_buffer, offsetof(YV12_BUFFER_CONFIG, v_b
 DEFINE(yv12_buffer_config_border,         offsetof(YV12_BUFFER_CONFIG, border));
 DEFINE(VP8BORDERINPIXELS_VAL,             VP8BORDERINPIXELS);
 
+#if CONFIG_POSTPROC
+/* mfqe.c / filter_by_weight */
+DEFINE(MFQE_PRECISION_VAL,                MFQE_PRECISION);
+#endif /* CONFIG_POSTPROC */
+
 END
 
 /* add asserts for any offset that is not supported by assembly code */
@@ -53,3 +62,10 @@ ct_assert(B_HU_PRED, B_HU_PRED == 9);
 /* vp8_yv12_extend_frame_borders_neon makes several assumptions based on this */
 ct_assert(VP8BORDERINPIXELS_VAL, VP8BORDERINPIXELS == 32)
 #endif
+
+#if HAVE_SSE2
+#if CONFIG_POSTPROC
+/* vp8_filter_by_weight16x16 and 8x8 */
+ct_assert(MFQE_PRECISION_VAL, MFQE_PRECISION == 4)
+#endif /* CONFIG_POSTPROC */
+#endif /* HAVE_SSE2 */
diff --git a/vp8/common/mfqe.c b/vp8/common/mfqe.c
new file mode 100644
index 0000000..05d9539
--- /dev/null
+++ b/vp8/common/mfqe.c
@@ -0,0 +1,271 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/* MFQE: Multiframe Quality Enhancement
+ * In rate-limited situations keyframes may cause significant visual
+ * artifacts, commonly referred to as "popping." This file implements a
+ * postprocessing algorithm which blends data from the preceding frame
+ * when there is no motion and the q of the previous frame is lower,
+ * which indicates that it is of higher quality.
+ */
+
+#include "postproc.h"
+#include "variance.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_rtcd.h"
+#include "vpx_scale/yv12config.h"
+
+#include <limits.h>
+#include <stdlib.h>
+
+
+static inline void filter_by_weight(unsigned char *src, int src_stride,
+                                    unsigned char *dst, int dst_stride,
+                                    int block_size, int src_weight)
+{
+    int dst_weight = (1 << MFQE_PRECISION) - src_weight;
+    int rounding_bit = 1 << (MFQE_PRECISION - 1);
+    int r, c;
+
+    for (r = 0; r < block_size; r++)
+    {
+        for (c = 0; c < block_size; c++)
+        {
+            dst[c] = (src[c] * src_weight +
+                      dst[c] * dst_weight +
+                      rounding_bit) >> MFQE_PRECISION;
+        }
+        src += src_stride;
+        dst += dst_stride;
+    }
+}
+
+void vp8_filter_by_weight16x16_c(unsigned char *src, int src_stride,
+                                 unsigned char *dst, int dst_stride,
+                                 int src_weight)
+{
+    filter_by_weight(src, src_stride, dst, dst_stride, 16, src_weight);
+}
+
+void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride,
+                               unsigned char *dst, int dst_stride,
+                               int src_weight)
+{
+    filter_by_weight(src, src_stride, dst, dst_stride, 8, src_weight);
+}
+
+void vp8_filter_by_weight4x4_c(unsigned char *src, int src_stride,
+                               unsigned char *dst, int dst_stride,
+                               int src_weight)
+{
+    filter_by_weight(src, src_stride, dst, dst_stride, 4, src_weight);
+}
+
+static inline void apply_ifactor(unsigned char *y_src,
+                                 int y_src_stride,
+                                 unsigned char *y_dst,
+                                 int y_dst_stride,
+                                 unsigned char *u_src,
+                                 unsigned char *v_src,
+                                 int uv_src_stride,
+                                 unsigned char *u_dst,
+                                 unsigned char *v_dst,
+                                 int uv_dst_stride,
+                                 int block_size,
+                                 int src_weight)
+{
+    if (block_size == 16)
+    {
+        vp8_filter_by_weight16x16(y_src, y_src_stride, y_dst, y_dst_stride, src_weight);
+        vp8_filter_by_weight8x8(u_src, uv_src_stride, u_dst, uv_dst_stride, src_weight);
+        vp8_filter_by_weight8x8(v_src, uv_src_stride, v_dst, uv_dst_stride, src_weight);
+    }
+    else /* if (block_size == 8) */
+    {
+        vp8_filter_by_weight8x8(y_src, y_src_stride, y_dst, y_dst_stride, src_weight);
+        vp8_filter_by_weight4x4(u_src, uv_src_stride, u_dst, uv_dst_stride, src_weight);
+        vp8_filter_by_weight4x4(v_src, uv_src_stride, v_dst, uv_dst_stride, src_weight);
+    }
+}
+
+static void multiframe_quality_enhance_block
+(
+    int blksize, /* Currently only values supported are 16 and 8 */
+    int qcurr,
+    int qprev,
+    unsigned char *y,
+    unsigned char *u,
+    unsigned char *v,
+    int y_stride,
+    int uv_stride,
+    unsigned char *yd,
+    unsigned char *ud,
+    unsigned char *vd,
+    int yd_stride,
+    int uvd_stride
+)
+{
+    static const unsigned char VP8_ZEROS[16]=
+    {
+        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+    };
+
+    int uvblksize = blksize >> 1;
+    int qdiff = qcurr - qprev;
+
+    int i;
+    unsigned char *up;
+    unsigned char *udp;
+    unsigned char *vp;
+    unsigned char *vdp;
+
+    unsigned int act, sad, thr, sse;
+
+    if (blksize == 16)
+    {
+        act = (vp8_variance16x16(yd, yd_stride, VP8_ZEROS, 0, &sse)+128)>>8;
+        sad = (vp8_sad16x16(y, y_stride, yd, yd_stride, INT_MAX)+128)>>8;
+    }
+    else /* if (blksize == 8) */
+    {
+        act = (vp8_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse)+32)>>6;
+        sad = (vp8_sad8x8(y, y_stride, yd, yd_stride, INT_MAX)+32)>>6;
+    }
+
+    /* thr = qdiff/8 + log2(act) + log4(qprev) */
+    thr = (qdiff>>3);
+    while (act>>=1) thr++;
+    while (qprev>>=2) thr++;
+
+    if (sad < thr)
+    {
+        int ifactor = (sad << MFQE_PRECISION) / thr;
+        ifactor >>= (qdiff >> 5);
+
+        if (ifactor)
+        {
+            apply_ifactor(y, y_stride, yd, yd_stride,
+                          u, v, uv_stride,
+                          ud, vd, uvd_stride,
+                          blksize, ifactor);
+        }
+        /* else implicitly copy from previous frame */
+    }
+    else
+    {
+        if (blksize == 16)
+        {
+            vp8_copy_mem16x16(y, y_stride, yd, yd_stride);
+            vp8_copy_mem8x8(u, uv_stride, ud, uvd_stride);
+            vp8_copy_mem8x8(v, uv_stride, vd, uvd_stride);
+        }
+        else /* if (blksize == 8) */
+        {
+            vp8_copy_mem8x8(y, y_stride, yd, yd_stride);
+            for (up = u, udp = ud, i = 0; i < uvblksize; ++i, up += uv_stride, udp += uvd_stride)
+                vpx_memcpy(udp, up, uvblksize);
+            for (vp = v, vdp = vd, i = 0; i < uvblksize; ++i, vp += uv_stride, vdp += uvd_stride)
+                vpx_memcpy(vdp, vp, uvblksize);
+        }
+    }
+}
+
+void vp8_multiframe_quality_enhance
+(
+    VP8_COMMON *cm
+)
+{
+    YV12_BUFFER_CONFIG *show = cm->frame_to_show;
+    YV12_BUFFER_CONFIG *dest = &cm->post_proc_buffer;
+
+    FRAME_TYPE frame_type = cm->frame_type;
+    /* Point at base of MB MODE_INFO list; it has motion vectors etc. */
+    const MODE_INFO *mode_info_context = cm->mi;
+    int mb_row;
+    int mb_col;
+    int qcurr = cm->base_qindex;
+    int qprev = cm->postproc_state.last_base_qindex;
+
+    unsigned char *y_ptr, *u_ptr, *v_ptr;
+    unsigned char *yd_ptr, *ud_ptr, *vd_ptr;
+
+    /* Set up the buffer pointers */
+    y_ptr = show->y_buffer;
+    u_ptr = show->u_buffer;
+    v_ptr = show->v_buffer;
+    yd_ptr = dest->y_buffer;
+    ud_ptr = dest->u_buffer;
+    vd_ptr = dest->v_buffer;
+
+    /* postprocess each macro block */
+    for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
+    {
+        for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+        {
+            /* if motion is high there will likely be no benefit */
+            if (((frame_type == INTER_FRAME &&
+                  abs(mode_info_context->mbmi.mv.as_mv.row) <= 10 &&
+                  abs(mode_info_context->mbmi.mv.as_mv.col) <= 10) ||
+                 (frame_type == KEY_FRAME)))
+            {
+                if (mode_info_context->mbmi.mode == B_PRED || mode_info_context->mbmi.mode == SPLITMV)
+                {
+                    int i, j;
+                    for (i=0; i<2; ++i)
+                        for (j=0; j<2; ++j)
+                            multiframe_quality_enhance_block(8, qcurr, qprev,
+                                                             y_ptr + 8*(i*show->y_stride+j),
+                                                             u_ptr + 4*(i*show->uv_stride+j),
+                                                             v_ptr + 4*(i*show->uv_stride+j),
+                                                             show->y_stride,
+                                                             show->uv_stride,
+                                                             yd_ptr + 8*(i*dest->y_stride+j),
+                                                             ud_ptr + 4*(i*dest->uv_stride+j),
+                                                             vd_ptr + 4*(i*dest->uv_stride+j),
+                                                             dest->y_stride,
+                                                             dest->uv_stride);
+                }
+                else
+                {
+                    multiframe_quality_enhance_block(16, qcurr, qprev, y_ptr,
+                                                     u_ptr, v_ptr,
+                                                     show->y_stride,
+                                                     show->uv_stride,
+                                                     yd_ptr, ud_ptr, vd_ptr,
+                                                     dest->y_stride,
+                                                     dest->uv_stride);
+                }
+            }
+            else
+            {
+                vp8_copy_mem16x16(y_ptr, show->y_stride, yd_ptr, dest->y_stride);
+                vp8_copy_mem8x8(u_ptr, show->uv_stride, ud_ptr, dest->uv_stride);
+                vp8_copy_mem8x8(v_ptr, show->uv_stride, vd_ptr, dest->uv_stride);
+            }
+            y_ptr += 16;
+            u_ptr += 8;
+            v_ptr += 8;
+            yd_ptr += 16;
+            ud_ptr += 8;
+            vd_ptr += 8;
+            mode_info_context++;     /* step to next MB */
+        }
+
+        y_ptr += show->y_stride * 16 - 16 * cm->mb_cols;
+        u_ptr += show->uv_stride * 8 - 8 * cm->mb_cols;
+        v_ptr += show->uv_stride * 8 - 8 * cm->mb_cols;
+        yd_ptr += dest->y_stride * 16 - 16 * cm->mb_cols;
+        ud_ptr += dest->uv_stride * 8 - 8 * cm->mb_cols;
+        vd_ptr += dest->uv_stride * 8 - 8 * cm->mb_cols;
+
+        mode_info_context++;         /* Skip border mb */
+    }
+}
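For review purposes, the weighting and threshold math above is easy to check
in isolation. The sketch below mirrors filter_by_weight() and the thr
computation from mfqe.c; blend_pixel(), threshold(), and the sample values in
main() are illustrative only and not part of this change.

    #include <stdio.h>

    #define MFQE_PRECISION 4   /* must match postproc.h */

    /* Same arithmetic as filter_by_weight() for one pixel. src is the
     * current frame; dst holds the previous (lower-q, higher-quality)
     * data and receives the blended result. */
    static unsigned char blend_pixel(unsigned char src, unsigned char dst,
                                     int src_weight)
    {
        int dst_weight = (1 << MFQE_PRECISION) - src_weight;
        int rounding_bit = 1 << (MFQE_PRECISION - 1);
        return (unsigned char)((src * src_weight + dst * dst_weight +
                                rounding_bit) >> MFQE_PRECISION);
    }

    /* Same shift-based heuristic as multiframe_quality_enhance_block():
     * thr = qdiff/8 + log2(act) + log4(qprev) */
    static unsigned int threshold(int qcurr, int qprev, unsigned int act)
    {
        unsigned int thr = (unsigned int)((qcurr - qprev) >> 3);
        while (act >>= 1) thr++;
        while (qprev >>= 2) thr++;
        return thr;
    }

    int main(void)
    {
        int w;

        /* Sweeping the weight moves the output from dst (120) to src (100). */
        for (w = 0; w <= (1 << MFQE_PRECISION); w++)
            printf("src_weight %2d -> %3d\n", w, blend_pixel(100, 120, w));

        /* qdiff = 32 -> 4, act = 16 -> +4, qprev = 16 -> +2, so thr = 10 */
        printf("thr = %u\n", threshold(48, 16, 16));
        return 0;
    }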
diff --git a/vp8/common/postproc.c b/vp8/common/postproc.c
index 280ce02..4f578c4 100644
--- a/vp8/common/postproc.c
+++ b/vp8/common/postproc.c
@@ -17,7 +17,6 @@
 #include "vpx_scale/yv12extend.h"
 #include "vpx_scale/vpxscale.h"
 #include "systemdependent.h"
-#include "variance.h"
 
 #include <limits.h>
 #include <math.h>
@@ -30,7 +29,6 @@
     ( (0.439*(float)(t>>16)) - (0.368*(float)(t>>8&0xff)) - (0.071*(float)(t&0xff)) + 128)
 
 /* global constants */
-#define MFQE_PRECISION 4
 
 #if CONFIG_POSTPROC_VISUALIZER
 static const unsigned char MB_PREDICTION_MODE_colors[MB_MODE_COUNT][3] =
 {
@@ -694,213 +692,6 @@ static void constrain_line (int x0, int *x1, int y0, int *y1, int width, int hei
 }
 
-static void multiframe_quality_enhance_block
-(
-    int blksize, /* Currently only values supported are 16, 8, 4 */
-    int qcurr,
-    int qprev,
-    unsigned char *y,
-    unsigned char *u,
-    unsigned char *v,
-    int y_stride,
-    int uv_stride,
-    unsigned char *yd,
-    unsigned char *ud,
-    unsigned char *vd,
-    int yd_stride,
-    int uvd_stride
-)
-{
-    static const unsigned char VP8_ZEROS[16]=
-    {
-        0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
-    };
-    int blksizeby2 = blksize >> 1;
-    int qdiff = qcurr - qprev;
-
-    int i, j;
-    unsigned char *yp;
-    unsigned char *ydp;
-    unsigned char *up;
-    unsigned char *udp;
-    unsigned char *vp;
-    unsigned char *vdp;
-
-    unsigned int act, sse, sad, thr;
-    if (blksize == 16)
-    {
-        act = (vp8_variance16x16(yd, yd_stride, VP8_ZEROS, 0, &sse)+128)>>8;
-        sad = (vp8_sad16x16(y, y_stride, yd, yd_stride, INT_MAX)+128)>>8;
-    }
-    else if (blksize == 8)
-    {
-        act = (vp8_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse)+32)>>6;
-        sad = (vp8_sad8x8(y, y_stride, yd, yd_stride, INT_MAX)+32)>>6;
-    }
-    else
-    {
-        act = (vp8_variance4x4(yd, yd_stride, VP8_ZEROS, 0, &sse)+8)>>4;
-        sad = (vp8_sad4x4(y, y_stride, yd, yd_stride, INT_MAX)+8)>>4;
-    }
-    /* thr = qdiff/8 + log2(act) + log4(qprev) */
-    thr = (qdiff>>3);
-    while (act>>=1) thr++;
-    while (qprev>>=2) thr++;
-    if (sad < thr)
-    {
-        static const int roundoff = (1 << (MFQE_PRECISION - 1));
-        int ifactor = (sad << MFQE_PRECISION) / thr;
-        ifactor >>= (qdiff >> 5);
-        // TODO: SIMD optimize this section
-        if (ifactor)
-        {
-            int icfactor = (1 << MFQE_PRECISION) - ifactor;
-            for (yp = y, ydp = yd, i = 0; i < blksize; ++i, yp += y_stride, ydp += yd_stride)
-            {
-                for (j = 0; j < blksize; ++j)
-                    ydp[j] = (int)((yp[j] * ifactor + ydp[j] * icfactor + roundoff) >> MFQE_PRECISION);
-            }
-            for (up = u, udp = ud, i = 0; i < blksizeby2; ++i, up += uv_stride, udp += uvd_stride)
-            {
-                for (j = 0; j < blksizeby2; ++j)
-                    udp[j] = (int)((up[j] * ifactor + udp[j] * icfactor + roundoff) >> MFQE_PRECISION);
-            }
-            for (vp = v, vdp = vd, i = 0; i < blksizeby2; ++i, vp += uv_stride, vdp += uvd_stride)
-            {
-                for (j = 0; j < blksizeby2; ++j)
-                    vdp[j] = (int)((vp[j] * ifactor + vdp[j] * icfactor + roundoff) >> MFQE_PRECISION);
-            }
-        }
-    }
-    else
-    {
-        if (blksize == 16)
-        {
-            vp8_copy_mem16x16(y, y_stride, yd, yd_stride);
-            vp8_copy_mem8x8(u, uv_stride, ud, uvd_stride);
-            vp8_copy_mem8x8(v, uv_stride, vd, uvd_stride);
-        }
-        else if (blksize == 8)
-        {
-            vp8_copy_mem8x8(y, y_stride, yd, yd_stride);
-            for (up = u, udp = ud, i = 0; i < blksizeby2; ++i, up += uv_stride, udp += uvd_stride)
-                vpx_memcpy(udp, up, blksizeby2);
-            for (vp = v, vdp = vd, i = 0; i < blksizeby2; ++i, vp += uv_stride, vdp += uvd_stride)
-                vpx_memcpy(vdp, vp, blksizeby2);
-        }
-        else
-        {
-            for (yp = y, ydp = yd, i = 0; i < blksize; ++i, yp += y_stride, ydp += yd_stride)
-                vpx_memcpy(ydp, yp, blksize);
-            for (up = u, udp = ud, i = 0; i < blksizeby2; ++i, up += uv_stride, udp += uvd_stride)
-                vpx_memcpy(udp, up, blksizeby2);
-            for (vp = v, vdp = vd, i = 0; i < blksizeby2; ++i, vp += uv_stride, vdp += uvd_stride)
-                vpx_memcpy(vdp, vp, blksizeby2);
-        }
-    }
-}
-
-void vp8_multiframe_quality_enhance
-(
-    VP8_COMMON *cm
-)
-{
-    YV12_BUFFER_CONFIG *show = cm->frame_to_show;
-    YV12_BUFFER_CONFIG *dest = &cm->post_proc_buffer;
-
-    FRAME_TYPE frame_type = cm->frame_type;
-    /* Point at base of Mb MODE_INFO list has motion vectors etc */
-    const MODE_INFO *mode_info_context = cm->mi;
-    int mb_row;
-    int mb_col;
-    int qcurr = cm->base_qindex;
-    int qprev = cm->postproc_state.last_base_qindex;
-
-    unsigned char *y_ptr, *u_ptr, *v_ptr;
-    unsigned char *yd_ptr, *ud_ptr, *vd_ptr;
-
-    /* Set up the buffer pointers */
-    y_ptr = show->y_buffer;
-    u_ptr = show->u_buffer;
-    v_ptr = show->v_buffer;
-    yd_ptr = dest->y_buffer;
-    ud_ptr = dest->u_buffer;
-    vd_ptr = dest->v_buffer;
-
-    /* postprocess each macro block */
-    for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
-    {
-        for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
-        {
-            /* if motion is high there will likely be no benefit */
-            if (((frame_type == INTER_FRAME &&
-                  abs(mode_info_context->mbmi.mv.as_mv.row) <= 10 &&
-                  abs(mode_info_context->mbmi.mv.as_mv.col) <= 10) ||
-                 (frame_type == KEY_FRAME)))
-            {
-                if (mode_info_context->mbmi.mode == B_PRED || mode_info_context->mbmi.mode == SPLITMV)
-                {
-                    int i, j;
-                    for (i=0; i<2; ++i)
-                        for (j=0; j<2; ++j)
-                            multiframe_quality_enhance_block(8,
-                                                             qcurr,
-                                                             qprev,
-                                                             y_ptr + 8*(i*show->y_stride+j),
-                                                             u_ptr + 4*(i*show->uv_stride+j),
-                                                             v_ptr + 4*(i*show->uv_stride+j),
-                                                             show->y_stride,
-                                                             show->uv_stride,
-                                                             yd_ptr + 8*(i*dest->y_stride+j),
-                                                             ud_ptr + 4*(i*dest->uv_stride+j),
-                                                             vd_ptr + 4*(i*dest->uv_stride+j),
-                                                             dest->y_stride,
-                                                             dest->uv_stride);
-                }
-                else
-                {
-                    multiframe_quality_enhance_block(16,
-                                                     qcurr,
-                                                     qprev,
-                                                     y_ptr,
-                                                     u_ptr,
-                                                     v_ptr,
-                                                     show->y_stride,
-                                                     show->uv_stride,
-                                                     yd_ptr,
-                                                     ud_ptr,
-                                                     vd_ptr,
-                                                     dest->y_stride,
-                                                     dest->uv_stride);
-
-                }
-            }
-            else
-            {
-                vp8_copy_mem16x16(y_ptr, show->y_stride, yd_ptr, dest->y_stride);
-                vp8_copy_mem8x8(u_ptr, show->uv_stride, ud_ptr, dest->uv_stride);
-                vp8_copy_mem8x8(v_ptr, show->uv_stride, vd_ptr, dest->uv_stride);
-            }
-            y_ptr += 16;
-            u_ptr += 8;
-            v_ptr += 8;
-            yd_ptr += 16;
-            ud_ptr += 8;
-            vd_ptr += 8;
-            mode_info_context++;     /* step to next MB */
-        }
-
-        y_ptr += show->y_stride * 16 - 16 * cm->mb_cols;
-        u_ptr += show->uv_stride * 8 - 8 * cm->mb_cols;
-        v_ptr += show->uv_stride * 8 - 8 * cm->mb_cols;
-        yd_ptr += dest->y_stride * 16 - 16 * cm->mb_cols;
-        ud_ptr += dest->uv_stride * 8 - 8 * cm->mb_cols;
-        vd_ptr += dest->uv_stride * 8 - 8 * cm->mb_cols;
-
-        mode_info_context++;         /* Skip border mb */
-    }
-}
-
 int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *ppflags)
 {
     int q = oci->filter_level * 10 / 6;
diff --git a/vp8/common/postproc.h b/vp8/common/postproc.h
index 1db7437..4a792dc 100644
--- a/vp8/common/postproc.h
+++ b/vp8/common/postproc.h
@@ -40,4 +40,8 @@ void vp8_deblock(YV12_BUFFER_CONFIG         *source,
                  int                         q,
                  int                         low_var_thresh,
                  int                         flag);
+
+#define MFQE_PRECISION 4
+
+void vp8_multiframe_quality_enhance(struct VP8Common *cm);
 #endif
diff --git a/vp8/common/rtcd_defs.sh b/vp8/common/rtcd_defs.sh
index ff8e30c..0fdb4fa 100644
--- a/vp8/common/rtcd_defs.sh
+++ b/vp8/common/rtcd_defs.sh
@@ -166,6 +166,15 @@ if [ "$CONFIG_POSTPROC" = "yes" ]; then
 prototype void vp8_blend_b "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride"
 # no asm yet
+
+prototype void vp8_filter_by_weight16x16 "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight"
+specialize vp8_filter_by_weight16x16 sse2
+
+prototype void vp8_filter_by_weight8x8 "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight"
+specialize vp8_filter_by_weight8x8 sse2
+
+prototype void vp8_filter_by_weight4x4 "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight"
+# no asm yet
 fi
 
 #
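For reviewers less familiar with rtcd_defs.sh: the prototype/specialize pairs
above feed the run-time CPU dispatch generator. Roughly — and this is only a
sketch of the generated shape, not the actual output of the build scripts —
the result behaves like the following, so callers always use the bare
vp8_filter_by_weight16x16 name:

    /* Approximate shape of the generated dispatch (illustrative only). */
    void vp8_filter_by_weight16x16_c(unsigned char *src, int src_stride,
                                     unsigned char *dst, int dst_stride,
                                     int src_weight);
    void vp8_filter_by_weight16x16_sse2(unsigned char *src, int src_stride,
                                        unsigned char *dst, int dst_stride,
                                        int src_weight);

    /* Resolved once during vp8_rtcd() initialization. */
    extern void (*vp8_filter_by_weight16x16)(unsigned char *src, int src_stride,
                                             unsigned char *dst, int dst_stride,
                                             int src_weight);

    /* Setup logic, approximately:
     *
     *     vp8_filter_by_weight16x16 = vp8_filter_by_weight16x16_c;
     *     if (flags & HAS_SSE2)
     *         vp8_filter_by_weight16x16 = vp8_filter_by_weight16x16_sse2;
     *
     * vp8_filter_by_weight4x4 has no "specialize" line, so it always
     * resolves to vp8_filter_by_weight4x4_c. */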
diff --git a/vp8/common/x86/mfqe_sse2.asm b/vp8/common/x86/mfqe_sse2.asm
new file mode 100644
index 0000000..10d21f3
--- /dev/null
+++ b/vp8/common/x86/mfqe_sse2.asm
@@ -0,0 +1,281 @@
+;
+;  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_filter_by_weight16x16_sse2
+;(
+;    unsigned char *src,
+;    int            src_stride,
+;    unsigned char *dst,
+;    int            dst_stride,
+;    int            src_weight
+;)
+global sym(vp8_filter_by_weight16x16_sse2)
+sym(vp8_filter_by_weight16x16_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    movd        xmm0, arg(4)                ; src_weight
+    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
+    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words
+
+    movdqa      xmm1, [GLOBAL(tMFQE)]
+    psubw       xmm1, xmm0                  ; dst_weight
+
+    mov         rax, arg(0)                 ; src
+    mov         rsi, arg(1)                 ; src_stride
+    mov         rdx, arg(2)                 ; dst
+    mov         rdi, arg(3)                 ; dst_stride
+
+    mov         rcx, 16                     ; loop count
+    pxor        xmm6, xmm6
+
+.combine
+    movdqa      xmm2, [rax]
+    movdqa      xmm4, [rdx]
+    add         rax, rsi
+
+    ; src * src_weight
+    movdqa      xmm3, xmm2
+    punpcklbw   xmm2, xmm6
+    punpckhbw   xmm3, xmm6
+    pmullw      xmm2, xmm0
+    pmullw      xmm3, xmm0
+
+    ; dst * dst_weight
+    movdqa      xmm5, xmm4
+    punpcklbw   xmm4, xmm6
+    punpckhbw   xmm5, xmm6
+    pmullw      xmm4, xmm1
+    pmullw      xmm5, xmm1
+
+    ; sum, round and shift
+    paddw       xmm2, xmm4
+    paddw       xmm3, xmm5
+    paddw       xmm2, [GLOBAL(tMFQE_round)]
+    paddw       xmm3, [GLOBAL(tMFQE_round)]
+    psrlw       xmm2, 4
+    psrlw       xmm3, 4
+
+    packuswb    xmm2, xmm3
+    movdqa      [rdx], xmm2
+    add         rdx, rdi
+
+    dec         rcx
+    jnz         .combine
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+
+    ret
+
+;void vp8_filter_by_weight8x8_sse2
+;(
+;    unsigned char *src,
+;    int            src_stride,
+;    unsigned char *dst,
+;    int            dst_stride,
+;    int            src_weight
+;)
+global sym(vp8_filter_by_weight8x8_sse2)
+sym(vp8_filter_by_weight8x8_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    movd        xmm0, arg(4)                ; src_weight
+    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
+    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words
+
+    movdqa      xmm1, [GLOBAL(tMFQE)]
+    psubw       xmm1, xmm0                  ; dst_weight
+
+    mov         rax, arg(0)                 ; src
+    mov         rsi, arg(1)                 ; src_stride
+    mov         rdx, arg(2)                 ; dst
+    mov         rdi, arg(3)                 ; dst_stride
+
+    mov         rcx, 8                      ; loop count
+    pxor        xmm4, xmm4
+
+.combine
+    movq        xmm2, [rax]
+    movq        xmm3, [rdx]
+    add         rax, rsi
+
+    ; src * src_weight
+    punpcklbw   xmm2, xmm4
+    pmullw      xmm2, xmm0
+
+    ; dst * dst_weight
+    punpcklbw   xmm3, xmm4
+    pmullw      xmm3, xmm1
+
+    ; sum, round and shift
+    paddw       xmm2, xmm3
+    paddw       xmm2, [GLOBAL(tMFQE_round)]
+    psrlw       xmm2, 4
+
+    packuswb    xmm2, xmm4
+    movq        [rdx], xmm2
+    add         rdx, rdi
+
+    dec         rcx
+    jnz         .combine
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+
+    ret
+
+;void vp8_variance_and_sad_16x16_sse2 | arg
+;(
+;    unsigned char *src1,          0
+;    int            stride1,       1
+;    unsigned char *src2,          2
+;    int            stride2,       3
+;    unsigned int  *variance,      4
+;    unsigned int  *sad,           5
+;)
+global sym(vp8_variance_and_sad_16x16_sse2)
+sym(vp8_variance_and_sad_16x16_sse2):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 6
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    mov         rax, arg(0)                 ; src1
+    mov         rcx, arg(1)                 ; stride1
+    mov         rdx, arg(2)                 ; src2
+    mov         rdi, arg(3)                 ; stride2
+
+    mov         rsi, 16                     ; block height
+
+    ; Prep accumulator registers
+    pxor        xmm3, xmm3                  ; SAD
+    pxor        xmm4, xmm4                  ; sum of src2
+    pxor        xmm5, xmm5                  ; sum of src2^2
+
+    ; Because we're working with the actual output frames
+    ; we can't depend on any kind of data alignment.
+.accumulate
+    movdqa      xmm0, [rax]                 ; src1
+    movdqa      xmm1, [rdx]                 ; src2
+    add         rax, rcx                    ; src1 + stride1
+    add         rdx, rdi                    ; src2 + stride2
+
+    ; SAD(src1, src2)
+    psadbw      xmm0, xmm1
+    paddusw     xmm3, xmm0
+
+    ; SUM(src2)
+    pxor        xmm2, xmm2
+    psadbw      xmm2, xmm1                  ; sum src2 by misusing SAD against 0
+    paddusw     xmm4, xmm2
+
+    ; pmaddubsw would be ideal if it took two unsigned values. Instead
+    ; it expects a signed and an unsigned value, so we zero extend and
+    ; operate on words.
+    pxor        xmm2, xmm2
+    movdqa      xmm0, xmm1
+    punpcklbw   xmm0, xmm2
+    punpckhbw   xmm1, xmm2
+    pmaddwd     xmm0, xmm0
+    pmaddwd     xmm1, xmm1
+    paddd       xmm5, xmm0
+    paddd       xmm5, xmm1
+
+    sub         rsi, 1
+    jnz         .accumulate
+
+    ; phaddd only operates on adjacent double words.
+    ; Finalize SAD and store
+    movdqa      xmm0, xmm3
+    psrldq      xmm0, 8
+    paddusw     xmm0, xmm3
+    paddd       xmm0, [GLOBAL(t128)]
+    psrld       xmm0, 8
+
+    mov         rax, arg(5)
+    movd        [rax], xmm0
+
+    ; Accumulate sum of src2
+    movdqa      xmm0, xmm4
+    psrldq      xmm0, 8
+    paddusw     xmm0, xmm4
+    ; Square src2. Ignore high value
+    pmuludq     xmm0, xmm0
+    psrld       xmm0, 8
+
+    ; phaddw could be used to sum adjacent values but we want
+    ; all the values summed. Promote to doublewords, accumulate,
+    ; shift and sum.
+    pxor        xmm2, xmm2
+    movdqa      xmm1, xmm5
+    punpckldq   xmm1, xmm2
+    punpckhdq   xmm5, xmm2
+    paddd       xmm1, xmm5
+    movdqa      xmm2, xmm1
+    psrldq      xmm1, 8
+    paddd       xmm1, xmm2
+
+    psubd       xmm1, xmm0
+
+    ; (variance + 128) >> 8
+    paddd       xmm1, [GLOBAL(t128)]
+    psrld       xmm1, 8
+    mov         rax, arg(4)
+
+    movd        [rax], xmm1
+
+
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 16
+t128:
+    ddq 128
+align 16
+tMFQE: ; 1 << MFQE_PRECISION
+    times 8 dw 0x10
+align 16
+tMFQE_round: ; 1 << (MFQE_PRECISION - 1)
+    times 8 dw 0x08
+
diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index f68d007..3403557 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -79,6 +79,7 @@ VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/filter_x86.c
 VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/filter_x86.h
 VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp8_asm_stubs.c
 VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/loopfilter_x86.c
+VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/mfqe.c
 VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/postproc.h
 VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/postproc.c
 VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/dequantize_mmx.asm
@@ -112,6 +113,7 @@ VP8_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/sad_sse4.asm
 ifeq ($(CONFIG_POSTPROC),yes)
 VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/postproc_x86.c
 VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/postproc_mmx.asm
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/mfqe_sse2.asm
 VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/postproc_sse2.asm
 endif
-- 
2.7.4
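Since vp8_variance_and_sad_16x16_sse2 has no C counterpart in this patch, a
scalar model may help when reviewing the reductions. The sketch below is
illustrative only; the function name is made up, but the math follows the
assembly: both outputs are rounded and scaled by 256, and variance is SSE
minus SUM^2/256, matching how mfqe.c uses (vp8_variance16x16(...)+128)>>8.

    #include <stdlib.h>

    /* Scalar model of vp8_variance_and_sad_16x16_sse2 (illustrative only). */
    static void variance_and_sad_16x16_c(const unsigned char *src1, int stride1,
                                         const unsigned char *src2, int stride2,
                                         unsigned int *variance, unsigned int *sad)
    {
        unsigned int sad_acc = 0, sum = 0, sse = 0;
        int r, c;

        for (r = 0; r < 16; r++)
        {
            for (c = 0; c < 16; c++)
            {
                int a = src1[c], b = src2[c];
                sad_acc += (unsigned int)abs(a - b);  /* psadbw accumulation */
                sum     += (unsigned int)b;           /* psadbw against zero */
                sse     += (unsigned int)(b * b);     /* pmaddwd on zero-extended words */
            }
            src1 += stride1;
            src2 += stride2;
        }

        /* (SAD + 128) >> 8, the t128 round-and-shift in the asm */
        *sad = (sad_acc + 128) >> 8;

        /* The asm squares the sum with pmuludq and shifts right by 8, i.e.
         * SUM^2/256 for the 256-pixel block, then rounds the same way.
         * SUM is at most 256*255, so SUM^2 still fits in 32 bits, which is
         * why the asm can "ignore the high value". */
        *variance = (sse - ((sum * sum) >> 8) + 128) >> 8;
    }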