c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_mmx2;
c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_mmx2;
+ c->weight_h264_pixels_tab[0]= ff_h264_weight_16x16_mmx2;
+ c->weight_h264_pixels_tab[1]= ff_h264_weight_16x8_mmx2;
+ c->weight_h264_pixels_tab[2]= ff_h264_weight_8x16_mmx2;
+ c->weight_h264_pixels_tab[3]= ff_h264_weight_8x8_mmx2;
+ c->weight_h264_pixels_tab[4]= ff_h264_weight_8x4_mmx2;
+ c->weight_h264_pixels_tab[5]= ff_h264_weight_4x8_mmx2;
+ c->weight_h264_pixels_tab[6]= ff_h264_weight_4x4_mmx2;
+ c->weight_h264_pixels_tab[7]= ff_h264_weight_4x2_mmx2;
+
+ c->biweight_h264_pixels_tab[0]= ff_h264_biweight_16x16_mmx2;
+ c->biweight_h264_pixels_tab[1]= ff_h264_biweight_16x8_mmx2;
+ c->biweight_h264_pixels_tab[2]= ff_h264_biweight_8x16_mmx2;
+ c->biweight_h264_pixels_tab[3]= ff_h264_biweight_8x8_mmx2;
+ c->biweight_h264_pixels_tab[4]= ff_h264_biweight_8x4_mmx2;
+ c->biweight_h264_pixels_tab[5]= ff_h264_biweight_4x8_mmx2;
+ c->biweight_h264_pixels_tab[6]= ff_h264_biweight_4x4_mmx2;
+ c->biweight_h264_pixels_tab[7]= ff_h264_biweight_4x2_mmx2;
+
#ifdef CONFIG_ENCODERS
c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_mmx2;
#endif //CONFIG_ENCODERS
#undef H264_CHROMA_OP
#undef H264_CHROMA_MC8_TMPL
+/***********************************/
+/* weighted prediction */
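+/* Per pixel, the routines below compute the H.264 weighted-prediction
+ * formulas:
+ *   weight:   dst = clip_uint8(((dst*weight + 2^(log2_denom-1)) >> log2_denom) + offset)
+ *   biweight: dst = clip_uint8(((dst*weightd + src*weights + 2^log2_denom)
+ *                               >> (log2_denom+1)) + ((offsetd+offsets+1)>>1))
+ * with the offsets and rounding biases folded into single constants. */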
+
+static inline void ff_h264_weight_WxH_mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset, int w, int h)
+{
+ int x, y;
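+ /* fold offset and rounding bias into one constant:
+  * (x*w + (offset<<d) + (1<<d>>1)) >> d == ((x*w + (1<<d>>1)) >> d) + offset */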
+ offset <<= log2_denom;
+ offset += (1 << log2_denom) >> 1;
+ asm volatile(
+ "movd %0, %%mm4 \n\t"
+ "movd %1, %%mm5 \n\t"
+ "movd %2, %%mm6 \n\t" /* shift count stays in the low dword */
+ "pshufw $0, %%mm4, %%mm4 \n\t" /* broadcast weight to 4 words */
+ "pshufw $0, %%mm5, %%mm5 \n\t" /* broadcast offset to 4 words */
+ "pxor %%mm7, %%mm7 \n\t" /* mm7 = 0, used for unpacking and packing */
+ :: "g"(weight), "g"(offset), "g"(log2_denom)
+ );
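+ /* 4 pixels on each of two rows per iteration */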
+ for(y=0; y<h; y+=2){
+ for(x=0; x<w; x+=4){
+ asm volatile(
+ "movd %0, %%mm0 \n\t"
+ "movd %1, %%mm1 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm1 \n\t"
+ "pmullw %%mm4, %%mm0 \n\t"
+ "pmullw %%mm4, %%mm1 \n\t"
+ "paddw %%mm5, %%mm0 \n\t"
+ "paddw %%mm5, %%mm1 \n\t"
+ "psraw %%mm6, %%mm0 \n\t"
+ "psraw %%mm6, %%mm1 \n\t"
+ "packuswb %%mm7, %%mm0 \n\t"
+ "packuswb %%mm7, %%mm1 \n\t"
+ "movd %%mm0, %0 \n\t"
+ "movd %%mm1, %1 \n\t"
+ : "+m"(*(uint32_t*)(dst+x)),
+ "+m"(*(uint32_t*)(dst+x+stride))
+ );
+ }
+ dst += 2*stride;
+ }
+}
+
+static inline void ff_h264_biweight_WxH_mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offsetd, int offsets, int w, int h)
+{
+ int x, y;
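+ /* ((a|1)<<d) == ((a>>1)<<(d+1)) + (1<<d), so a single constant carries both
+  * the averaged offset (offsets+offsetd+1)>>1 and the rounding bias for the
+  * final >> (log2_denom+1) */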
+ int offset = ((offsets + offsetd + 1) | 1) << log2_denom;
+ asm volatile(
+ "movd %0, %%mm3 \n\t"
+ "movd %1, %%mm4 \n\t"
+ "movd %2, %%mm5 \n\t"
+ "movd %3, %%mm6 \n\t" /* shift count log2_denom+1 in the low dword */
+ "pshufw $0, %%mm3, %%mm3 \n\t" /* broadcast weightd to 4 words */
+ "pshufw $0, %%mm4, %%mm4 \n\t" /* broadcast weights to 4 words */
+ "pshufw $0, %%mm5, %%mm5 \n\t" /* broadcast combined offset */
+ "pxor %%mm7, %%mm7 \n\t" /* mm7 = 0, used for unpacking and packing */
+ :: "g"(weightd), "g"(weights), "g"(offset), "g"(log2_denom+1)
+ );
+ for(y=0; y<h; y++){
+ for(x=0; x<w; x+=4){
+ asm volatile(
+ "movd %0, %%mm0 \n\t"
+ "movd %1, %%mm1 \n\t"
+ "punpcklbw %%mm7, %%mm0 \n\t"
+ "punpcklbw %%mm7, %%mm1 \n\t"
+ "pmullw %%mm3, %%mm0 \n\t"
+ "pmullw %%mm4, %%mm1 \n\t"
+ "paddw %%mm5, %%mm0 \n\t"
+ "paddw %%mm1, %%mm0 \n\t"
+ "psraw %%mm6, %%mm0 \n\t"
+ "packuswb %%mm0, %%mm0 \n\t"
+ "movd %%mm0, %0 \n\t"
+ : "+m"(*(uint32_t*)(dst+x))
+ : "m"(*(uint32_t*)(src+x))
+ );
+ }
+ src += stride;
+ dst += stride;
+ }
+}
+
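+/* instantiate the fixed-size wrappers installed in weight_h264_pixels_tab[]
+ * and biweight_h264_pixels_tab[] above */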
+#define H264_WEIGHT(W,H) \
+static void ff_h264_biweight_ ## W ## x ## H ## _mmx2(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offsetd, int offsets){ \
+ ff_h264_biweight_WxH_mmx2(dst, src, stride, log2_denom, weightd, weights, offsetd, offsets, W, H); \
+} \
+static void ff_h264_weight_ ## W ## x ## H ## _mmx2(uint8_t *dst, int stride, int log2_denom, int weight, int offset){ \
+ ff_h264_weight_WxH_mmx2(dst, stride, log2_denom, weight, offset, W, H); \
+}
+
+H264_WEIGHT(16,16)
+H264_WEIGHT(16, 8)
+H264_WEIGHT( 8,16)
+H264_WEIGHT( 8, 8)
+H264_WEIGHT( 8, 4)
+H264_WEIGHT( 4, 8)
+H264_WEIGHT( 4, 4)
+H264_WEIGHT( 4, 2)
+