Add x264 SSE2 iDCT functions to H.264 decoder.

author Jason Garrett-Glaser <darkshikari@gmail.com>

Sat, 3 Jan 2009 00:46:17 +0000 (00:46 +0000)

committer Jason Garrett-Glaser <darkshikari@gmail.com>

Sat, 3 Jan 2009 00:46:17 +0000 (00:46 +0000)
author Jason Garrett-Glaser <darkshikari@gmail.com>
Sat, 3 Jan 2009 00:46:17 +0000 (00:46 +0000)
committer Jason Garrett-Glaser <darkshikari@gmail.com>
Sat, 3 Jan 2009 00:46:17 +0000 (00:46 +0000)
diff --git a/libavcodec/Makefile b/libavcodec/Makefile

index ca94d16..e4217d6 100644 (file)
--- a/libavcodec/Makefile
+++ b/libavcodec/Makefile
@@ -407,6 +407,7 @@ MMX-OBJS-$(CONFIG_VP6F_DECODER)        += x86/vp3dsp_mmx.o x86/vp3dsp_sse2.o
  MMX-OBJS-$(CONFIG_WMV3_DECODER)        += x86/vc1dsp_mmx.o
  MMX-OBJS-$(HAVE_YASM)                  += x86/dsputil_yasm.o            \
                                            x86/h264_deblock_sse2.o       \
+                                          x86/h264_idct_sse2.o          \
  
  OBJS-$(HAVE_MMX)                       += x86/cpuid.o                   \
                                            x86/dnxhd_mmx.o               \
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c

index 52eaccd..c6588ed 100644 (file)
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -2872,14 +2872,17 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
              c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_mmxext;
              c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_mmxext;
  #endif
-#if defined(ARCH_X86_64) || !defined(__ICC) || __ICC > 1100
              if( mm_flags&FF_MM_SSE2 ){
+#if defined(ARCH_X86_64) || !defined(__ICC) || __ICC > 1100
                  c->h264_v_loop_filter_luma = ff_x264_deblock_v_luma_sse2;
                  c->h264_h_loop_filter_luma = ff_x264_deblock_h_luma_sse2;
                  c->h264_v_loop_filter_luma_intra = ff_x264_deblock_v_luma_intra_sse2;
                  c->h264_h_loop_filter_luma_intra = ff_x264_deblock_h_luma_intra_sse2;
-            }
  #endif
+                c->h264_idct_add16 = ff_h264_idct_add16_sse2;
+                c->h264_idct_add8  = ff_h264_idct_add8_sse2;
+                c->h264_idct_add16intra = ff_h264_idct_add16intra_sse2;
+            }
          }
  #endif
  
diff --git a/libavcodec/x86/h264_idct_sse2.asm b/libavcodec/x86/h264_idct_sse2.asm

new file mode 100755 (executable)

index 0000000..a46cd97
--- /dev/null
+++ b/libavcodec/x86/h264_idct_sse2.asm
@@ -0,0 +1,61 @@
+;*****************************************************************************\r
+;* dct-a.asm: h264 encoder library\r
+;*****************************************************************************\r
+;* Copyright (C) 2003-2008 x264 project\r
+;*\r
+;* Authors: Laurent Aimar <fenrir@via.ecp.fr>\r
+;*          Loren Merritt <lorenm@u.washington.edu>\r
+;*          Holger Lubitz <hal@duncan.ol.sub.de>\r
+;*          Min Chen <chenm001.163.com>\r
+;*\r
+;* This program is free software; you can redistribute it and/or modify\r
+;* it under the terms of the GNU General Public License as published by\r
+;* the Free Software Foundation; either version 2 of the License, or\r
+;* (at your option) any later version.\r
+;*\r
+;* This program is distributed in the hope that it will be useful,\r
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of\r
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\r
+;* GNU General Public License for more details.\r
+;*\r
+;* You should have received a copy of the GNU General Public License\r
+;* along with this program; if not, write to the Free Software\r
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.\r
+;*****************************************************************************\r
+\r
+%include "x86inc.asm"\r
+%include "x86util.asm"\r
+\r
+SECTION_RODATA\r
+pw_32: times 8 dw 32\r
+\r
+SECTION .text\r
+\r
+%macro IDCT4_1D 6 ; %1-%4: register numbers of the four input rows, %5-%6: scratch register numbers\r
+    SUMSUB_BA   m%3, m%1 ; m%3 = row3 + row1, m%1 = row1 - row3 (rows named by argument position)\r
+    SUMSUBD2_AB m%2, m%4, m%6, m%5 ; m%2 = row2 + (row4>>1), m%5 = (row2>>1) - row4; m%4 and m%6 are clobbered\r
+    SUMSUB_BADC m%2, m%3, m%5, m%1 ; final butterflies: sums land in m%2 and m%5, differences in m%3 and m%1\r
+    SWAP %1, %2, %5, %4, %3 ; register renaming (SWAP is defined in x86inc.asm) - presumably puts the results back in %1..%4 in row order\r
+%endmacro\r
+\r
+INIT_XMM\r
+cglobal x264_add8x4_idct_sse2, 3,3 ; presumably r0 = dst, r1 = block, r2 = stride, matching the C prototype (uint8_t *dst, int16_t *block, int stride)\r
+    movq   m0, [r1+ 0] ; low halves of m0-m3: four consecutive 8-byte rows starting at r1\r
+    movq   m1, [r1+ 8]\r
+    movq   m2, [r1+16]\r
+    movq   m3, [r1+24]\r
+    movhps m0, [r1+32] ; high halves of m0-m3: the four 8-byte rows starting at r1+32\r
+    movhps m1, [r1+40]\r
+    movhps m2, [r1+48]\r
+    movhps m3, [r1+56]\r
+    IDCT4_1D 0,1,2,3,4,5 ; first 1-D pass on m0-m3, with m4/m5 as scratch\r
+    TRANSPOSE2x4x4W 0,1,2,3,4 ; transpose between the two passes (macro from x86util.asm)\r
+    paddw m0, [pw_32 GLOBAL] ; add 32 to every word of m0: bias applied ahead of the >>6 done in STORE_DIFF\r
+    IDCT4_1D 0,1,2,3,4,5 ; second 1-D pass\r
+    pxor  m7, m7 ; zero register, used by STORE_DIFF to widen pixel bytes to words\r
+    STORE_DIFF  m0, m4, m7, [r0] ; output row at r0\r
+    STORE_DIFF  m1, m4, m7, [r0+r2] ; output row at r0 + stride\r
+    lea   r0, [r0+r2*2] ; advance the destination pointer by two rows\r
+    STORE_DIFF  m2, m4, m7, [r0] ; third output row\r
+    STORE_DIFF  m3, m4, m7, [r0+r2] ; fourth output row\r
+    RET\r
diff --git a/libavcodec/x86/h264dsp_mmx.c b/libavcodec/x86/h264dsp_mmx.c

index 7d19f99..511aeb4 100644 (file)
--- a/libavcodec/x86/h264dsp_mmx.c
+++ b/libavcodec/x86/h264dsp_mmx.c
@@ -472,6 +472,78 @@ static void ff_h264_idct_add8_mmx2(uint8_t **dest, const int *block_offset, DCTE
      }
  }
  
+#if defined(CONFIG_GPL) && defined(HAVE_YASM)
+static void ff_h264_idct_dc_add8_mmx2(uint8_t *dst, int16_t *block, int stride) // adds two rounded, >>6 values (from block[0] and block[16]) to four 8-byte rows of dst
+{
+    __asm__ volatile( // first statement: build the per-byte addend (mm0) and subtrahend (mm1)
+        "movd             %0, %%mm0 \n\t"   //  0 0 X D
+        "punpcklwd        %1, %%mm0 \n\t"   //  x X d D
+        "paddsw           %2, %%mm0 \n\t"   // signed saturating add of ff_pw_32 (presumably 32 in every word - rounding bias)
+        "psraw            $6, %%mm0 \n\t"   // arithmetic shift right by 6
+        "punpcklwd     %%mm0, %%mm0 \n\t"   //  d d D D
+        "pxor          %%mm1, %%mm1 \n\t"   //  0 0 0 0
+        "psubw         %%mm0, %%mm1 \n\t"   // -d-d-D-D
+        "packuswb      %%mm1, %%mm0 \n\t"   // -d-d-D-D d d D D
+        "pshufw $0xFA, %%mm0, %%mm1 \n\t"   // -d-d-d-d-D-D-D-D
+        "punpcklwd     %%mm0, %%mm0 \n\t"   //  d d d d D D D D
+        ::"m"(block[ 0]), // D: first coefficient of the first 16-coefficient block
+          "m"(block[16]), // d: first coefficient of the following 16-coefficient block
+          "m"(ff_pw_32)
+    );
+    __asm__ volatile( // second statement: uses mm0/mm1 left by the statement above (not expressed through operands or clobbers)
+        "movq          %0, %%mm2 \n\t" // load four rows of 8 bytes from dst
+        "movq          %1, %%mm3 \n\t"
+        "movq          %2, %%mm4 \n\t"
+        "movq          %3, %%mm5 \n\t"
+        "paddusb    %%mm0, %%mm2 \n\t" // add mm0 to each row with unsigned byte saturation
+        "paddusb    %%mm0, %%mm3 \n\t"
+        "paddusb    %%mm0, %%mm4 \n\t"
+        "paddusb    %%mm0, %%mm5 \n\t"
+        "psubusb    %%mm1, %%mm2 \n\t" // subtract mm1 from each row with unsigned byte saturation
+        "psubusb    %%mm1, %%mm3 \n\t"
+        "psubusb    %%mm1, %%mm4 \n\t"
+        "psubusb    %%mm1, %%mm5 \n\t"
+        "movq       %%mm2, %0    \n\t" // store the four rows back to dst
+        "movq       %%mm3, %1    \n\t"
+        "movq       %%mm4, %2    \n\t"
+        "movq       %%mm5, %3    \n\t"
+        :"+m"(*(uint64_t*)(dst+0*stride)), // the four rows are read-modify-write operands, 8 bytes each
+         "+m"(*(uint64_t*)(dst+1*stride)),
+         "+m"(*(uint64_t*)(dst+2*stride)),
+         "+m"(*(uint64_t*)(dst+3*stride))
+    );
+}
+
+extern void ff_x264_add8x4_idct_sse2(uint8_t *dst, int16_t *block, int stride);
+
+static void ff_h264_idct_add16_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ // runs the 8x4 SSE2 iDCT+add over block indices 0..15, two indices per call
+    int i;
+    for(i=0; i<16; i+=2) // indices i and i+1 are handled together by one call
+        if(nnzc[ scan8[i+0] ]|nnzc[ scan8[i+1] ]) // the pair is skipped when both nnzc entries are zero
+            ff_x264_add8x4_idct_sse2 (dst + block_offset[i], block + i*16, stride); // 16 coefficients per index, so block + i*16 is where index i starts
+}
+
+static void ff_h264_idct_add16intra_sse2(uint8_t *dst, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ // like ff_h264_idct_add16_sse2, plus a first-coefficient-only fallback
+    int i;
+    for(i=0; i<16; i+=2){ // indices i and i+1 are handled together
+        if(nnzc[ scan8[i+0] ]|nnzc[ scan8[i+1] ]) // either nnzc entry nonzero: full 8x4 iDCT+add
+            ff_x264_add8x4_idct_sse2 (dst + block_offset[i], block + i*16, stride);
+        else if(block[i*16]|block[i*16+16]) // both nnzc entries zero, but the first coefficient of index i or i+1 is set
+            ff_h264_idct_dc_add8_mmx2(dst + block_offset[i], block + i*16, stride);
+    }
+}
+
+static void ff_h264_idct_add8_sse2(uint8_t **dest, const int *block_offset, DCTELEM *block, int stride, const uint8_t nnzc[6*8]){ // iDCT+add for block indices 16..23; the plane is dest[0] or dest[1], chosen by bit 2 of i
+    int i;
+    for(i=16; i<16+8; i+=2){ // step by 2: each call below consumes indices i and i+1, so a step of 1 would add blocks twice and read index 24
+        if(nnzc[ scan8[i+0] ]|nnzc[ scan8[i+1] ]) // either nnzc entry nonzero: full 8x4 iDCT+add
+            ff_x264_add8x4_idct_sse2 (dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
+        else if(block[i*16]|block[i*16+16]) // both nnzc entries zero, but the first coefficient of index i or i+1 is set
+            ff_h264_idct_dc_add8_mmx2(dest[(i&4)>>2] + block_offset[i], block + i*16, stride);
+    }
+}
+#endif
+
  /***********************************/
  /* deblocking */
  
diff --git a/libavcodec/x86/x86util.asm b/libavcodec/x86/x86util.asm

new file mode 100644 (file)

index 0000000..2e318ef
--- /dev/null
+++ b/libavcodec/x86/x86util.asm
@@ -0,0 +1,240 @@
+;*****************************************************************************
+;* x86util.asm
+;*****************************************************************************
+;* Copyright (C) 2008 Loren Merritt <lorenm@u.washington.edu>
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+;*****************************************************************************
+
+%macro SBUTTERFLY 4 ; %1: punpck size suffix (wd/dq/qdq), %2,%3: register numbers (in/out), %4: scratch register number
+    mova      m%4, m%2 ; keep a copy of the first input
+    punpckl%1 m%2, m%3 ; m%2 = interleave of the low halves of the two inputs
+    punpckh%1 m%4, m%3 ; m%4 = interleave of the high halves of the two inputs
+    SWAP %3, %4 ; SWAP is defined in x86inc.asm - presumably renames so %3 refers to the high interleave and %4 to the leftover register
+%endmacro
+
+%macro TRANSPOSE4x4W 5
+    SBUTTERFLY wd, %1, %2, %5
+    SBUTTERFLY wd, %3, %4, %5
+    SBUTTERFLY dq, %1, %3, %5
+    SBUTTERFLY dq, %2, %4, %5
+    SWAP %2, %3
+%endmacro
+
+%macro TRANSPOSE2x4x4W 5
+    SBUTTERFLY wd,  %1, %2, %5
+    SBUTTERFLY wd,  %3, %4, %5
+    SBUTTERFLY dq,  %1, %3, %5
+    SBUTTERFLY dq,  %2, %4, %5
+    SBUTTERFLY qdq, %1, %2, %5
+    SBUTTERFLY qdq, %3, %4, %5
+%endmacro
+
+%macro TRANSPOSE4x4D 5
+    SBUTTERFLY dq,  %1, %2, %5
+    SBUTTERFLY dq,  %3, %4, %5
+    SBUTTERFLY qdq, %1, %3, %5
+    SBUTTERFLY qdq, %2, %4, %5
+    SWAP %2, %3
+%endmacro
+
+%macro TRANSPOSE8x8W 9-11 ; %1-%8: register numbers; x86_64: %9 is a scratch register number; otherwise %9,%10 are spill operands and passing an 11th argument selects the pre-spilled variant
+%ifdef ARCH_X86_64
+    SBUTTERFLY wd,  %1, %2, %9
+    SBUTTERFLY wd,  %3, %4, %9
+    SBUTTERFLY wd,  %5, %6, %9
+    SBUTTERFLY wd,  %7, %8, %9
+    SBUTTERFLY dq,  %1, %3, %9
+    SBUTTERFLY dq,  %2, %4, %9
+    SBUTTERFLY dq,  %5, %7, %9
+    SBUTTERFLY dq,  %6, %8, %9
+    SBUTTERFLY qdq, %1, %5, %9
+    SBUTTERFLY qdq, %2, %6, %9
+    SBUTTERFLY qdq, %3, %7, %9
+    SBUTTERFLY qdq, %4, %8, %9
+    SWAP %2, %5
+    SWAP %4, %7
+%else
+; in:  m0..m7, unless %11 in which case m6 is in %9
+; out: m0..m7, unless %11 in which case m4 is in %10
+; spills into %9 and %10
+%if %0<11
+    movdqa %9, m%7
+%endif
+    SBUTTERFLY wd,  %1, %2, %7
+    movdqa %10, m%2
+    movdqa m%7, %9
+    SBUTTERFLY wd,  %3, %4, %2
+    SBUTTERFLY wd,  %5, %6, %2
+    SBUTTERFLY wd,  %7, %8, %2
+    SBUTTERFLY dq,  %1, %3, %2
+    movdqa %9, m%3
+    movdqa m%2, %10
+    SBUTTERFLY dq,  %2, %4, %3
+    SBUTTERFLY dq,  %5, %7, %3
+    SBUTTERFLY dq,  %6, %8, %3
+    SBUTTERFLY qdq, %1, %5, %3
+    SBUTTERFLY qdq, %2, %6, %3
+    movdqa %10, m%2
+    movdqa m%3, %9
+    SBUTTERFLY qdq, %3, %7, %2
+    SBUTTERFLY qdq, %4, %8, %2
+    SWAP %2, %5
+    SWAP %4, %7
+%if %0<11 ; reload only when no 11th argument was given; with it the result stays in the second spill operand, as documented above
+    movdqa m%5, %10
+%endif
+%endif
+%endmacro
+
+%macro ABS1_MMX 2    ; a, tmp
+    pxor    %2, %2
+    psubw   %2, %1
+    pmaxsw  %1, %2
+%endmacro
+
+%macro ABS2_MMX 4    ; a, b, tmp0, tmp1
+    pxor    %3, %3
+    pxor    %4, %4
+    psubw   %3, %1
+    psubw   %4, %2
+    pmaxsw  %1, %3
+    pmaxsw  %2, %4
+%endmacro
+
+%macro ABS1_SSSE3 2
+    pabsw   %1, %1
+%endmacro
+
+%macro ABS2_SSSE3 4
+    pabsw   %1, %1
+    pabsw   %2, %2
+%endmacro
+
+%define ABS1 ABS1_MMX
+%define ABS2 ABS2_MMX
+
+%macro ABS4 6
+    ABS2 %1, %2, %5, %6
+    ABS2 %3, %4, %5, %6
+%endmacro
+
+%macro SPLATB_MMX 3
+    movd      %1, [%2-3] ;to avoid crossing a cacheline
+    punpcklbw %1, %1
+%if mmsize==16
+    pshuflw   %1, %1, 0xff
+    punpcklqdq %1, %1
+%else
+    pshufw    %1, %1, 0xff
+%endif
+%endmacro
+
+%macro SPLATB_SSSE3 3
+    movd      %1, [%2-3]
+    pshufb    %1, %3
+%endmacro
+
+%macro PALIGNR_MMX 4
+    %ifnidn %4, %2
+    mova    %4, %2
+    %endif
+    %if mmsize == 8
+    psllq   %1, (8-%3)*8
+    psrlq   %4, %3*8
+    %else
+    pslldq  %1, 16-%3
+    psrldq  %4, %3
+    %endif
+    por     %1, %4
+%endmacro
+
+%macro PALIGNR_SSSE3 4
+    palignr %1, %2, %3
+%endmacro
+
+%macro SUMSUB_BA 2 ; a, b (packed words): a = a+b, b = b-a (using the original a)
+    paddw   %1, %2 ; a = a+b
+    paddw   %2, %2 ; b = 2b
+    psubw   %2, %1 ; b = 2b-(a+b) = b-a
+%endmacro
+
+%macro SUMSUB_BADC 4 ; a, b, c, d (packed words): a = a+b, b = b-a, c = c+d, d = d-c (differences use the original a and c)
+    paddw   %1, %2 ; a = a+b
+    paddw   %3, %4 ; c = c+d
+    paddw   %2, %2 ; b = 2b
+    paddw   %4, %4 ; d = 2d
+    psubw   %2, %1 ; b = 2b-(a+b) = b-a
+    psubw   %4, %3 ; d = 2d-(c+d) = d-c
+%endmacro
+
+%macro HADAMARD8_1D 8
+    SUMSUB_BADC %1, %5, %2, %6
+    SUMSUB_BADC %3, %7, %4, %8
+    SUMSUB_BADC %1, %3, %2, %4
+    SUMSUB_BADC %5, %7, %6, %8
+    SUMSUB_BADC %1, %2, %3, %4
+    SUMSUB_BADC %5, %6, %7, %8
+%endmacro
+
+%macro SUMSUB2_AB 3 ; a, b, tmp (packed words): a = 2a+b, tmp = a-2b (original a); b is left unchanged
+    mova    %3, %1 ; tmp = a
+    paddw   %1, %1 ; a = 2a
+    paddw   %1, %2 ; a = 2a+b
+    psubw   %3, %2 ; tmp = a-b
+    psubw   %3, %2 ; tmp = a-2b
+%endmacro
+
+%macro SUMSUBD2_AB 4 ; a, b, tmp0, tmp1 (packed words): a = a+(b>>1), tmp1 = (a>>1)-b (original a and b)
+    mova    %4, %1 ; tmp1 = a
+    mova    %3, %2 ; tmp0 = b (kept unshifted for the subtraction below)
+    psraw   %2, 1 ; b = b>>1 (arithmetic); b is left holding this shifted value
+    psraw   %4, 1 ; tmp1 = a>>1 (arithmetic)
+    paddw   %1, %2 ; a = a+(b>>1)
+    psubw   %4, %3 ; tmp1 = (a>>1)-b
+%endmacro
+
+%macro LOAD_DIFF 5
+%ifidn %3, none
+    movh       %1, %4
+    movh       %2, %5
+    punpcklbw  %1, %2
+    punpcklbw  %2, %2
+    psubw      %1, %2
+%else
+    movh       %1, %4
+    punpcklbw  %1, %3
+    movh       %2, %5
+    punpcklbw  %2, %3
+    psubw      %1, %2
+%endif
+%endmacro
+
+%macro LOAD_DIFF_8x4P 6-8 r0,r2 ; 4x dest, 2x temp, 2x pointer
+    LOAD_DIFF %1, %5, none, [%7],      [%8]
+    LOAD_DIFF %2, %6, none, [%7+r1],   [%8+r3]
+    LOAD_DIFF %3, %5, none, [%7+2*r1], [%8+2*r3]
+    LOAD_DIFF %4, %6, none, [%7+r4],   [%8+r5]
+%endmacro
+
+%macro STORE_DIFF 4 ; %1: words to add (clobbered), %2: scratch register, %3: register the caller is expected to have zeroed, %4: pixel memory operand (read, then written)
+    psraw      %1, 6 ; arithmetic >>6 of the input words
+    movh       %2, %4 ; load the current pixels (movh is defined in x86inc.asm - presumably a half-register load)
+    punpcklbw  %2, %3 ; interleave the pixel bytes with %3, widening them to words when %3 is zero
+    paddsw     %1, %2 ; signed saturating add of the pixels
+    packuswb   %1, %1 ; pack words to bytes with unsigned saturation
+    movh       %4, %1 ; store the result back to the same location
+%endmacro
+
author	Jason Garrett-Glaser <darkshikari@gmail.com>
	Sat, 3 Jan 2009 00:46:17 +0000 (00:46 +0000)
committer	Jason Garrett-Glaser <darkshikari@gmail.com>
	Sat, 3 Jan 2009 00:46:17 +0000 (00:46 +0000)
libavcodec/Makefile		patch \| blob \| history
libavcodec/x86/dsputil_mmx.c		patch \| blob \| history
libavcodec/x86/h264_idct_sse2.asm	[new file with mode: 0755]	patch \| blob
libavcodec/x86/h264dsp_mmx.c		patch \| blob \| history
libavcodec/x86/x86util.asm	[new file with mode: 0644]	patch \| blob