Use H264 MMX chroma functions to accelerate RV40 decoding.

author Mathieu Velten <matmaul@gmail.com>

Sun, 4 Jan 2009 01:36:11 +0000 (01:36 +0000)

committer Carl Eugen Hoyos <cehoyos@rainbow.studorg.tuwien.ac.at>

Sun, 4 Jan 2009 01:36:11 +0000 (01:36 +0000)
author Mathieu Velten <matmaul@gmail.com>
Sun, 4 Jan 2009 01:36:11 +0000 (01:36 +0000)
committer Carl Eugen Hoyos <cehoyos@rainbow.studorg.tuwien.ac.at>
Sun, 4 Jan 2009 01:36:11 +0000 (01:36 +0000)
diff --git a/libavcodec/x86/dsputil_h264_template_mmx.c b/libavcodec/x86/dsputil_h264_template_mmx.c

index 0bf8732e352fb90c5b558ae7306d4fc0170cfc5e..43f4393098bc6a2f4edfa68c223afb440ad2e5ec 100644 (file)
--- a/libavcodec/x86/dsputil_h264_template_mmx.c
+++ b/libavcodec/x86/dsputil_h264_template_mmx.c
@@ -25,9 +25,8 @@
   * H264_CHROMA_OP must be defined to empty for put and pavgb/pavgusb for avg
   * H264_CHROMA_MC8_MV0 must be defined to a (put|avg)_pixels8 function
   */
-static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, int rnd)
+static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, const uint64_t *rnd_reg)
  {
-    const uint64_t *rnd_reg;
      DECLARE_ALIGNED_8(uint64_t, AA);
      DECLARE_ALIGNED_8(uint64_t, DD);
      int i;
@@ -45,17 +44,15 @@ static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*
          /* 1 dimensional filter only */
          const int dxy = x ? 1 : stride;
  
-        rnd_reg = rnd ? &ff_pw_4 : &ff_pw_3;
-
          __asm__ volatile(
              "movd %0, %%mm5\n\t"
              "movq %1, %%mm4\n\t"
-            "movq %2, %%mm6\n\t"         /* mm6 = rnd */
+            "movq %2, %%mm6\n\t"         /* mm6 = rnd >> 3 */
              "punpcklwd %%mm5, %%mm5\n\t"
              "punpckldq %%mm5, %%mm5\n\t" /* mm5 = B = x */
              "pxor %%mm7, %%mm7\n\t"
              "psubw %%mm5, %%mm4\n\t"     /* mm4 = A = 8-x */
-            :: "rm"(x+y), "m"(ff_pw_8), "m"(*rnd_reg));
+            :: "rm"(x+y), "m"(ff_pw_8), "m"(*(rnd_reg+1)));
  
          for(i=0; i<h; i++) {
              __asm__ volatile(
@@ -78,7 +75,7 @@ static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*
                  "pmullw %%mm5, %%mm2\n\t"
                  "pmullw %%mm5, %%mm3\n\t"
  
-                /* dst[0..7] = (A * src[0..7] + B * src[1..8] + 4) >> 3 */
+                /* dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3 */
                  "paddw %%mm6, %%mm0\n\t"
                  "paddw %%mm6, %%mm1\n\t"
                  "paddw %%mm2, %%mm0\n\t"
@@ -97,7 +94,6 @@ static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*
      }
  
      /* general case, bilinear */
-    rnd_reg = rnd ? &ff_pw_32.a : &ff_pw_28.a;
      __asm__ volatile("movd %2, %%mm4\n\t"
                   "movd %3, %%mm6\n\t"
                   "punpcklwd %%mm4, %%mm4\n\t"
@@ -172,7 +168,7 @@ static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*
              : : "m" (src[0]), "m" (src[1]), "m" (DD));
  
          __asm__ volatile(
-            /* dst[0..7] = ([mm2,mm3] + 32) >> 6 */
+            /* dst[0..7] = ([mm2,mm3] + rnd) >> 6 */
              "paddw %1, %%mm2\n\t"
              "paddw %1, %%mm3\n\t"
              "psrlw $6, %%mm2\n\t"
@@ -185,7 +181,7 @@ static void H264_CHROMA_MC8_TMPL(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*
      }
  }
  
-static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y, const uint64_t *rnd_reg)
  {
      __asm__ volatile(
          "pxor   %%mm7, %%mm7        \n\t"
@@ -249,7 +245,7 @@ static void H264_CHROMA_MC4_TMPL(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*
          "sub $2, %2                 \n\t"
          "jnz 1b                     \n\t"
          : "+r"(dst), "+r"(src), "+r"(h)
-        : "r"((x86_reg)stride), "m"(ff_pw_32), "m"(x), "m"(y)
+        : "r"((x86_reg)stride), "m"(*rnd_reg), "m"(x), "m"(y)
      );
  }
  
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c

index c6588ed02d17df32b574ca3c7153b974c4eae478..f76a4fa33a9a6ab6e989e20f7063c1b39eb78d12 100644 (file)
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -1733,6 +1733,7 @@ PREFETCH(prefetch_3dnow, prefetch)
  #undef PREFETCH
  
  #include "h264dsp_mmx.c"
+#include "rv40dsp_mmx.c"
  
  /* CAVS specific */
  void ff_cavsdsp_init_mmx2(DSPContext* c, AVCodecContext *avctx);
@@ -2638,6 +2639,9 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
          c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_mmx;
          c->put_no_rnd_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_mmx_nornd;
  
+        c->put_rv40_chroma_pixels_tab[0]= put_rv40_chroma_mc8_mmx;
+        c->put_rv40_chroma_pixels_tab[1]= put_rv40_chroma_mc4_mmx;
+
          c->h264_idct_dc_add=
          c->h264_idct_add= ff_h264_idct_add_mmx;
          c->h264_idct8_dc_add=
@@ -2723,6 +2727,9 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
              SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2);
              SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2);
  
+            c->avg_rv40_chroma_pixels_tab[0]= avg_rv40_chroma_mc8_mmx2;
+            c->avg_rv40_chroma_pixels_tab[1]= avg_rv40_chroma_mc4_mmx2;
+
              c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_mmx2_rnd;
              c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_mmx2;
              c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_mmx2;
@@ -2808,6 +2815,9 @@ void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
              c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_3dnow_rnd;
              c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_3dnow;
  
+            c->avg_rv40_chroma_pixels_tab[0]= avg_rv40_chroma_mc8_3dnow;
+            c->avg_rv40_chroma_pixels_tab[1]= avg_rv40_chroma_mc4_3dnow;
+
              if (ENABLE_CAVS_DECODER)
                  ff_cavsdsp_init_3dnow(c, avctx);
          }
diff --git a/libavcodec/x86/h264dsp_mmx.c b/libavcodec/x86/h264dsp_mmx.c

index 511aeb4c827dc2e7ef9127266652c1f8db01ad98..8eeb65779bd30e6216d6acfaab43035ff4d583f8 100644 (file)
--- a/libavcodec/x86/h264dsp_mmx.c
+++ b/libavcodec/x86/h264dsp_mmx.c
@@ -2084,22 +2084,30 @@ H264_MC_816(H264_MC_H, ssse3)
  H264_MC_816(H264_MC_HV, ssse3)
  #endif
  
+/* {rnd, rnd div 8} pairs: [0],[1] = 32, 4 (rounding); [2],[3] = 28, 3 (no rounding). Pass a pointer p to rnd; use p+1 to access rnd div 8 */
+DECLARE_ALIGNED_8(static const uint64_t, h264_rnd_reg[4]) = {
+    0x0020002000200020ULL, 0x0004000400040004ULL, 0x001C001C001C001CULL, 0x0003000300030003ULL
+};
  
  #define H264_CHROMA_OP(S,D)
  #define H264_CHROMA_OP4(S,D,T)
-#define H264_CHROMA_MC8_TMPL put_h264_chroma_mc8_mmx
-#define H264_CHROMA_MC4_TMPL put_h264_chroma_mc4_mmx
+#define H264_CHROMA_MC8_TMPL put_h264_chroma_generic_mc8_mmx
+#define H264_CHROMA_MC4_TMPL put_h264_chroma_generic_mc4_mmx
  #define H264_CHROMA_MC2_TMPL put_h264_chroma_mc2_mmx2
  #define H264_CHROMA_MC8_MV0 put_pixels8_mmx
  #include "dsputil_h264_template_mmx.c"
  
  static void put_h264_chroma_mc8_mmx_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
  {
-    put_h264_chroma_mc8_mmx(dst, src, stride, h, x, y, 1);
+    put_h264_chroma_generic_mc8_mmx(dst, src, stride, h, x, y, h264_rnd_reg);
  }
  static void put_h264_chroma_mc8_mmx_nornd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
  {
-    put_h264_chroma_mc8_mmx(dst, src, stride, h, x, y, 0);
+    put_h264_chroma_generic_mc8_mmx(dst, src, stride, h, x, y, h264_rnd_reg+2);
+}
+static void put_h264_chroma_mc4_mmx(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+    put_h264_chroma_generic_mc4_mmx(dst, src, stride, h, x, y, h264_rnd_reg); /* first pair of the table: rnd = 32 per 16-bit lane */
  }
  
  #undef H264_CHROMA_OP
@@ -2112,14 +2120,18 @@ static void put_h264_chroma_mc8_mmx_nornd(uint8_t *dst/*align 8*/, uint8_t *src/
  #define H264_CHROMA_OP(S,D) "pavgb " #S ", " #D " \n\t"
  #define H264_CHROMA_OP4(S,D,T) "movd  " #S ", " #T " \n\t"\
                                 "pavgb " #T ", " #D " \n\t"
-#define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_mmx2
-#define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_mmx2
+#define H264_CHROMA_MC8_TMPL avg_h264_chroma_generic_mc8_mmx2
+#define H264_CHROMA_MC4_TMPL avg_h264_chroma_generic_mc4_mmx2
  #define H264_CHROMA_MC2_TMPL avg_h264_chroma_mc2_mmx2
  #define H264_CHROMA_MC8_MV0 avg_pixels8_mmx2
  #include "dsputil_h264_template_mmx.c"
  static void avg_h264_chroma_mc8_mmx2_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
  {
-    avg_h264_chroma_mc8_mmx2(dst, src, stride, h, x, y, 1);
+    avg_h264_chroma_generic_mc8_mmx2(dst, src, stride, h, x, y, h264_rnd_reg);
+}
+static void avg_h264_chroma_mc4_mmx2(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+    avg_h264_chroma_generic_mc4_mmx2(dst, src, stride, h, x, y, h264_rnd_reg); /* first pair of the table: rnd = 32 per 16-bit lane */
+}
  #undef H264_CHROMA_OP
  #undef H264_CHROMA_OP4
@@ -2131,13 +2143,17 @@ static void avg_h264_chroma_mc8_mmx2_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*
  #define H264_CHROMA_OP(S,D) "pavgusb " #S ", " #D " \n\t"
  #define H264_CHROMA_OP4(S,D,T) "movd " #S ", " #T " \n\t"\
                                 "pavgusb " #T ", " #D " \n\t"
-#define H264_CHROMA_MC8_TMPL avg_h264_chroma_mc8_3dnow
-#define H264_CHROMA_MC4_TMPL avg_h264_chroma_mc4_3dnow
+#define H264_CHROMA_MC8_TMPL avg_h264_chroma_generic_mc8_3dnow
+#define H264_CHROMA_MC4_TMPL avg_h264_chroma_generic_mc4_3dnow
  #define H264_CHROMA_MC8_MV0 avg_pixels8_3dnow
  #include "dsputil_h264_template_mmx.c"
  static void avg_h264_chroma_mc8_3dnow_rnd(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
  {
-    avg_h264_chroma_mc8_3dnow(dst, src, stride, h, x, y, 1);
+    avg_h264_chroma_generic_mc8_3dnow(dst, src, stride, h, x, y, h264_rnd_reg);
+}
+static void avg_h264_chroma_mc4_3dnow(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+    avg_h264_chroma_generic_mc4_3dnow(dst, src, stride, h, x, y, h264_rnd_reg); /* first pair of the table: rnd = 32 per 16-bit lane */
+}
  #undef H264_CHROMA_OP
  #undef H264_CHROMA_OP4
diff --git a/libavcodec/x86/rv40dsp_mmx.c b/libavcodec/x86/rv40dsp_mmx.c

new file mode 100644 (file)

index 0000000..47461c6
--- /dev/null
+++ b/libavcodec/x86/rv40dsp_mmx.c
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2008 Konstantin Shishkov, Mathieu Velten
+ *
+ * MMX-optimized DSP functions for RV40, based on H.264 optimizations by
+ * Michael Niedermayer and Loren Merritt
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "dsputil_mmx.h"
+
+/* {bias, bias div 8} pairs, indexed as [y>>1][x&~1] (x and y must be in 0..7 to stay in bounds); use p+1 to access bias div 8 */
+DECLARE_ALIGNED_8(static const uint64_t, rv40_bias_reg[4][8]) = {
+    { 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0010001000100010ULL, 0x0002000200020002ULL,
+      0x0020002000200020ULL, 0x0004000400040004ULL, 0x0010001000100010ULL, 0x0002000200020002ULL },
+    { 0x0020002000200020ULL, 0x0004000400040004ULL, 0x001C001C001C001CULL, 0x0003000300030003ULL,
+      0x0020002000200020ULL, 0x0004000400040004ULL, 0x001C001C001C001CULL, 0x0003000300030003ULL },
+    { 0x0000000000000000ULL, 0x0000000000000000ULL, 0x0020002000200020ULL, 0x0004000400040004ULL,
+      0x0010001000100010ULL, 0x0002000200020002ULL, 0x0020002000200020ULL, 0x0004000400040004ULL },
+    { 0x0020002000200020ULL, 0x0004000400040004ULL, 0x001C001C001C001CULL, 0x0003000300030003ULL,
+      0x0020002000200020ULL, 0x0004000400040004ULL, 0x001C001C001C001CULL, 0x0003000300030003ULL }
+};
+
+static void put_rv40_chroma_mc8_mmx(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+    put_h264_chroma_generic_mc8_mmx(dst, src, stride, h, x, y, &rv40_bias_reg[y>>1][x&(~1)]); /* row y>>1, even column x&~1: start of a {bias, bias div 8} pair */
+}
+static void put_rv40_chroma_mc4_mmx(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+    put_h264_chroma_generic_mc4_mmx(dst, src, stride, h, x, y, &rv40_bias_reg[y>>1][x&(~1)]); /* row y>>1, even column x&~1: start of a {bias, bias div 8} pair */
+}
+static void avg_rv40_chroma_mc8_mmx2(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+    avg_h264_chroma_generic_mc8_mmx2(dst, src, stride, h, x, y, &rv40_bias_reg[y>>1][x&(~1)]); /* row y>>1, even column x&~1: start of a {bias, bias div 8} pair */
+}
+static void avg_rv40_chroma_mc4_mmx2(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+    avg_h264_chroma_generic_mc4_mmx2(dst, src, stride, h, x, y, &rv40_bias_reg[y>>1][x&(~1)]); /* row y>>1, even column x&~1: start of a {bias, bias div 8} pair */
+}
+static void avg_rv40_chroma_mc8_3dnow(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+    avg_h264_chroma_generic_mc8_3dnow(dst, src, stride, h, x, y, &rv40_bias_reg[y>>1][x&(~1)]); /* row y>>1, even column x&~1: start of a {bias, bias div 8} pair */
+}
+static void avg_rv40_chroma_mc4_3dnow(uint8_t *dst/*align 4*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y)
+{
+    avg_h264_chroma_generic_mc4_3dnow(dst, src, stride, h, x, y, &rv40_bias_reg[y>>1][x&(~1)]); /* row y>>1, even column x&~1: start of a {bias, bias div 8} pair */
+}
author	Mathieu Velten <matmaul@gmail.com>
	Sun, 4 Jan 2009 01:36:11 +0000 (01:36 +0000)
committer	Carl Eugen Hoyos <cehoyos@rainbow.studorg.tuwien.ac.at>
	Sun, 4 Jan 2009 01:36:11 +0000 (01:36 +0000)
libavcodec/x86/dsputil_h264_template_mmx.c		patch \| blob \| history
libavcodec/x86/dsputil_mmx.c		patch \| blob \| history
libavcodec/x86/h264dsp_mmx.c		patch \| blob \| history
libavcodec/x86/rv40dsp_mmx.c	[new file with mode: 0644]	patch \| blob