From f188a1e0ca12822fd6c607924169d678c7254838 Mon Sep 17 00:00:00 2001 From: Daniel Kang Date: Sun, 5 Jun 2011 18:33:23 -0400 Subject: [PATCH] H.264: Add x86 assembly for 10-bit MC Chroma H.264 functions. Mainly ported from 8-bit H.264 MC Chroma. Signed-off-by: Ronald S. Bultje --- libavcodec/x86/Makefile | 1 + libavcodec/x86/dsputil_mmx.c | 32 ++++ libavcodec/x86/h264_chromamc_10bit.asm | 273 +++++++++++++++++++++++++++++++++ 3 files changed, 306 insertions(+) create mode 100644 libavcodec/x86/h264_chromamc_10bit.asm diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile index 1c451c8..ea57bd1 100644 --- a/libavcodec/x86/Makefile +++ b/libavcodec/x86/Makefile @@ -44,6 +44,7 @@ MMX-OBJS-$(HAVE_YASM) += x86/dsputil_yasm.o \ x86/deinterlace.o \ x86/fmtconvert.o \ x86/h264_chromamc.o \ + x86/h264_chromamc_10bit.o \ $(YASM-OBJS-yes) MMX-OBJS-$(CONFIG_FFT) += x86/fft.o diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c index 214c6a3..b174b83 100644 --- a/libavcodec/x86/dsputil_mmx.c +++ b/libavcodec/x86/dsputil_mmx.c @@ -1938,6 +1938,19 @@ void ff_avg_h264_chroma_mc8_ssse3_rnd (uint8_t *dst, uint8_t *src, void ff_avg_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src, int stride, int h, int x, int y); +#define CHROMA_MC(OP, NUM, DEPTH, OPT) \ +void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT \ + (uint8_t *dst, uint8_t *src,\ + int stride, int h, int x, int y); + +CHROMA_MC(put, 2, 10, mmxext) +CHROMA_MC(avg, 2, 10, mmxext) +CHROMA_MC(put, 4, 10, mmxext) +CHROMA_MC(avg, 4, 10, mmxext) +CHROMA_MC(put, 8, 10, sse2) +CHROMA_MC(avg, 8, 10, sse2) +CHROMA_MC(put, 8, 10, avx) +CHROMA_MC(avg, 8, 10, avx) /* CAVS specific */ void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) { @@ -2420,6 +2433,7 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) { int mm_flags = av_get_cpu_flags(); const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8; + const int bit_depth = avctx->bits_per_raw_sample; if (avctx->dsp_mask) { if (avctx->dsp_mask & AV_CPU_FLAG_FORCE) @@ -2651,6 +2665,12 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) c->avg_h264_chroma_pixels_tab[2]= ff_avg_h264_chroma_mc2_mmx2; c->put_h264_chroma_pixels_tab[2]= ff_put_h264_chroma_mc2_mmx2; } + if (bit_depth == 10) { + c->put_h264_chroma_pixels_tab[2]= ff_put_h264_chroma_mc2_10_mmxext; + c->avg_h264_chroma_pixels_tab[2]= ff_avg_h264_chroma_mc2_10_mmxext; + c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_10_mmxext; + c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_10_mmxext; + } c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2; #endif @@ -2756,6 +2776,10 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) H264_QPEL_FUNCS(3, 2, sse2); H264_QPEL_FUNCS(3, 3, sse2); } + if (bit_depth == 10) { + c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_10_sse2; + c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_10_sse2; + } } #if HAVE_SSSE3 if(mm_flags & AV_CPU_FLAG_SSSE3){ @@ -2854,6 +2878,14 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx) } #endif } +#if HAVE_AVX + if (mm_flags & AV_CPU_FLAG_AVX) { + if (bit_depth == 10) { + c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_10_avx; + c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_10_avx; + } + } +#endif } if (CONFIG_ENCODERS) diff --git a/libavcodec/x86/h264_chromamc_10bit.asm b/libavcodec/x86/h264_chromamc_10bit.asm new file mode 100644 index 0000000..9d07543 --- /dev/null +++ b/libavcodec/x86/h264_chromamc_10bit.asm @@ -0,0 +1,273 @@ +;***************************************************************************** +;* MMX/SSE2/AVX-optimized 10-bit H.264 chroma MC code +;***************************************************************************** +;* Copyright (C) 2005-2011 x264 project +;* +;* Authors: Daniel Kang +;* +;* This file is part of Libav. +;* +;* Libav is free software; you can redistribute it and/or +;* modify it under the terms of the GNU Lesser General Public +;* License as published by the Free Software Foundation; either +;* version 2.1 of the License, or (at your option) any later version. +;* +;* Libav is distributed in the hope that it will be useful, +;* but WITHOUT ANY WARRANTY; without even the implied warranty of +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +;* Lesser General Public License for more details. +;* +;* You should have received a copy of the GNU Lesser General Public +;* License along with Libav; if not, write to the Free Software +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +;****************************************************************************** + +%include "x86inc.asm" +%include "x86util.asm" + +SECTION_RODATA + +cextern pw_4 +cextern pw_8 +cextern pw_32 +cextern pw_64 + +SECTION .text + + +%macro MV0_PIXELS_MC8 0 + lea r4, [r2*3 ] + lea r5, [r2*4 ] +.next4rows + movu m0, [r1 ] + movu m1, [r1+r2 ] + CHROMAMC_AVG m0, [r0 ] + CHROMAMC_AVG m1, [r0+r2 ] + mova [r0 ], m0 + mova [r0+r2 ], m1 + movu m0, [r1+r2*2] + movu m1, [r1+r4 ] + CHROMAMC_AVG m0, [r0+r2*2] + CHROMAMC_AVG m1, [r0+r4 ] + mova [r0+r2*2], m0 + mova [r0+r4 ], m1 + add r1, r5 + add r0, r5 + sub r3d, 4 + jne .next4rows +%endmacro + +;----------------------------------------------------------------------------- +; void put/avg_h264_chroma_mc8(pixel *dst, pixel *src, int stride, int h, int mx, int my) +;----------------------------------------------------------------------------- +%macro CHROMA_MC8 2 +; put/avg_h264_chroma_mc8_*(uint8_t *dst /*align 8*/, uint8_t *src /*align 1*/, +; int stride, int h, int mx, int my) +cglobal %1_h264_chroma_mc8_10_%2, 6,7,8 + movsxdifnidn r2, r2d + mov r6d, r5d + or r6d, r4d + jne .at_least_one_non_zero + ; mx == 0 AND my == 0 - no filter needed + MV0_PIXELS_MC8 + REP_RET + +.at_least_one_non_zero + mov r6d, 2 + test r5d, r5d + je .x_interpolation + mov r6, r2 ; dxy = x ? 1 : stride + test r4d, r4d + jne .xy_interpolation +.x_interpolation + ; mx == 0 XOR my == 0 - 1 dimensional filter only + or r4d, r5d ; x + y + movd m5, r4d + mova m4, [pw_8] + mova m6, [pw_4] ; mm6 = rnd >> 3 + SPLATW m5, m5 ; mm5 = B = x + psubw m4, m5 ; mm4 = A = 8-x + +.next1drow + movu m0, [r1 ] ; mm0 = src[0..7] + movu m2, [r1+r6] ; mm2 = src[1..8] + + pmullw m0, m4 ; mm0 = A * src[0..7] + pmullw m2, m5 ; mm2 = B * src[1..8] + + paddw m0, m6 + paddw m0, m2 + psrlw m0, 3 + CHROMAMC_AVG m0, [r0] + mova [r0], m0 ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3 + + add r0, r2 + add r1, r2 + dec r3d + jne .next1drow + REP_RET + +.xy_interpolation ; general case, bilinear + movd m4, r4m ; x + movd m6, r5m ; y + + SPLATW m4, m4 ; mm4 = x words + SPLATW m6, m6 ; mm6 = y words + psllw m5, m4, 3 ; mm5 = 8x + pmullw m4, m6 ; mm4 = x * y + psllw m6, 3 ; mm6 = 8y + paddw m1, m5, m6 ; mm7 = 8x+8y + mova m7, m4 ; DD = x * y + psubw m5, m4 ; mm5 = B = 8x - xy + psubw m6, m4 ; mm6 = C = 8y - xy + paddw m4, [pw_64] + psubw m4, m1 ; mm4 = A = xy - (8x+8y) + 64 + + movu m0, [r1 ] ; mm0 = src[0..7] + movu m1, [r1+2] ; mm1 = src[1..8] +.next2drow + add r1, r2 + + pmullw m2, m0, m4 + pmullw m1, m5 + paddw m2, m1 ; mm2 = A * src[0..7] + B * src[1..8] + + movu m0, [r1] + movu m1, [r1+2] + pmullw m3, m0, m6 + paddw m2, m3 ; mm2 += C * src[0..7+strde] + pmullw m3, m1, m7 + paddw m2, m3 ; mm2 += D * src[1..8+strde] + + paddw m2, [pw_32] + psrlw m2, 6 + CHROMAMC_AVG m2, [r0] + mova [r0], m2 ; dst[0..7] = (mm2 + 32) >> 6 + + add r0, r2 + dec r3d + jne .next2drow + REP_RET +%endmacro + +;----------------------------------------------------------------------------- +; void put/avg_h264_chroma_mc4(pixel *dst, pixel *src, int stride, int h, int mx, int my) +;----------------------------------------------------------------------------- +;TODO: xmm mc4 +%macro MC4_OP 2 + movq %1, [r1 ] + movq m1, [r1+2] + add r1, r2 + pmullw %1, m4 + pmullw m1, m2 + paddw m1, %1 + mova %1, m1 + + pmullw %2, m5 + pmullw m1, m3 + paddw %2, [pw_32] + paddw m1, %2 + psrlw m1, 6 + CHROMAMC_AVG m1, %2, [r0] + movq [r0], m1 + add r0, r2 +%endmacro + +%macro CHROMA_MC4 2 +cglobal %1_h264_chroma_mc4_10_%2, 6,6,7 + movsxdifnidn r2, r2d + movd m2, r4m ; x + movd m3, r5m ; y + mova m4, [pw_8] + mova m5, m4 + SPLATW m2, m2 + SPLATW m3, m3 + psubw m4, m2 + psubw m5, m3 + + movq m0, [r1 ] + movq m6, [r1+2] + add r1, r2 + pmullw m0, m4 + pmullw m6, m2 + paddw m6, m0 + +.next2rows + MC4_OP m0, m6 + MC4_OP m6, m0 + sub r3d, 2 + jnz .next2rows + REP_RET +%endmacro + +;----------------------------------------------------------------------------- +; void put/avg_h264_chroma_mc2(pixel *dst, pixel *src, int stride, int h, int mx, int my) +;----------------------------------------------------------------------------- +%macro CHROMA_MC2 2 +cglobal %1_h264_chroma_mc2_10_%2, 6,7 + movsxdifnidn r2, r2d + mov r6d, r4d + shl r4d, 16 + sub r4d, r6d + add r4d, 8 + imul r5d, r4d ; x*y<<16 | y*(8-x) + shl r4d, 3 + sub r4d, r5d ; x*(8-y)<<16 | (8-x)*(8-y) + + movd m5, r4d + movd m6, r5d + punpckldq m5, m5 ; mm5 = {A,B,A,B} + punpckldq m6, m6 ; mm6 = {C,D,C,D} + pxor m7, m7 + pshufw m2, [r1], 0x94 ; mm0 = src[0,1,1,2] + +.nextrow + add r1, r2 + movq m1, m2 + pmaddwd m1, m5 ; mm1 = A * src[0,1] + B * src[1,2] + pshufw m0, [r1], 0x94 ; mm0 = src[0,1,1,2] + movq m2, m0 + pmaddwd m0, m6 + paddw m1, [pw_32] + paddw m1, m0 ; mm1 += C * src[0,1] + D * src[1,2] + psrlw m1, 6 + packssdw m1, m7 + CHROMAMC_AVG m1, m3, [r0] + movd [r0], m1 + add r0, r2 + dec r3d + jnz .nextrow + REP_RET +%endmacro + +%macro NOTHING 2-3 +%endmacro +%macro AVG 2-3 +%if %0==3 + movq %2, %3 +%endif + PAVG %1, %2 +%endmacro + +%define CHROMAMC_AVG NOTHING +INIT_XMM +CHROMA_MC8 put, sse2 +%ifdef HAVE_AVX +INIT_AVX +CHROMA_MC8 put, avx +%endif +INIT_MMX +CHROMA_MC4 put, mmxext +CHROMA_MC2 put, mmxext + +%define CHROMAMC_AVG AVG +%define PAVG pavgw +INIT_XMM +CHROMA_MC8 avg, sse2 +%ifdef HAVE_AVX +INIT_AVX +CHROMA_MC8 avg, avx +%endif +INIT_MMX +CHROMA_MC4 avg, mmxext +CHROMA_MC2 avg, mmxext -- 2.7.4