VP8: move zeroing of luma DC block into the WHT

author Jason Garrett-Glaser <darkshikari@gmail.com>

Mon, 2 Aug 2010 20:18:09 +0000 (20:18 +0000)

committer Jason Garrett-Glaser <darkshikari@gmail.com>

Mon, 2 Aug 2010 20:18:09 +0000 (20:18 +0000)
author Jason Garrett-Glaser <darkshikari@gmail.com>
Mon, 2 Aug 2010 20:18:09 +0000 (20:18 +0000)
committer Jason Garrett-Glaser <darkshikari@gmail.com>
Mon, 2 Aug 2010 20:18:09 +0000 (20:18 +0000)
diff --git a/libavcodec/vp8.c b/libavcodec/vp8.c

index 596cfc5a7673b61d2d4df66e102ccd456b7fd33d..651924196b03a380a05de555d17db264097394cb 100644 (file)
--- a/libavcodec/vp8.c
+++ b/libavcodec/vp8.c
@@ -117,6 +117,7 @@ typedef struct {
       */
      DECLARE_ALIGNED(16, uint8_t, non_zero_count_cache)[6][4];
      DECLARE_ALIGNED(16, DCTELEM, block)[6][4][16];
+    DECLARE_ALIGNED(16, DCTELEM, block_dc)[16];
      uint8_t intra4x4_pred_mode_mb[16];
  
      int chroma_pred_mode;    ///< 8x8c pred mode of the current macroblock
@@ -864,22 +865,19 @@ static av_always_inline
  void decode_mb_coeffs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
                        uint8_t t_nnz[9], uint8_t l_nnz[9])
  {
-    LOCAL_ALIGNED_16(DCTELEM, dc,[16]);
      int i, x, y, luma_start = 0, luma_ctx = 3;
      int nnz_pred, nnz, nnz_total = 0;
      int segment = s->segment;
  
      if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
-        AV_ZERO128(dc);
-        AV_ZERO128(dc+8);
          nnz_pred = t_nnz[8] + l_nnz[8];
  
          // decode DC values and do hadamard
-        nnz = decode_block_coeffs(c, dc, s->prob->token[1], 0, nnz_pred,
+        nnz = decode_block_coeffs(c, s->block_dc, s->prob->token[1], 0, nnz_pred,
                                    s->qmat[segment].luma_dc_qmul);
          l_nnz[8] = t_nnz[8] = !!nnz;
          nnz_total += nnz;
-        s->vp8dsp.vp8_luma_dc_wht(s->block, dc);
+        s->vp8dsp.vp8_luma_dc_wht(s->block, s->block_dc);
          luma_start = 1;
          luma_ctx = 0;
      }
diff --git a/libavcodec/vp8dsp.c b/libavcodec/vp8dsp.c

index f3f3fb6da0acd4cd8c6ff7e7428d642cdd98f4e5..5f5124803dd5f545fd92500c080e79d6ab373a76 100644 (file)
--- a/libavcodec/vp8dsp.c
+++ b/libavcodec/vp8dsp.c
@@ -46,6 +46,10 @@ static void vp8_luma_dc_wht_c(DCTELEM block[4][4][16], DCTELEM dc[16])
          t1 = dc[i*4+1] + dc[i*4+2];
          t2 = dc[i*4+1] - dc[i*4+2];
          t3 = dc[i*4+0] - dc[i*4+3] + 3; // rounding
+        dc[i*4+0] = 0;
+        dc[i*4+1] = 0;
+        dc[i*4+2] = 0;
+        dc[i*4+3] = 0;
  
          *block[i][0] = (t0 + t1) >> 3;
          *block[i][1] = (t3 + t2) >> 3;
diff --git a/libavcodec/x86/vp8dsp-init.c b/libavcodec/x86/vp8dsp-init.c

index 4bf49364e7ca7f88f896889af7f82f69a021b094..aceec6a346431f8b3d14993af8f3b8c4ccd250c0 100644 (file)
--- a/libavcodec/x86/vp8dsp-init.c
+++ b/libavcodec/x86/vp8dsp-init.c
@@ -224,6 +224,7 @@ extern void ff_vp8_idct_dc_add4y_mmx(uint8_t *dst, DCTELEM block[4][16], int str
  extern void ff_vp8_idct_dc_add4y_sse2(uint8_t *dst, DCTELEM block[4][16], int stride);
  extern void ff_vp8_idct_dc_add4uv_mmx(uint8_t *dst, DCTELEM block[2][16], int stride);
  extern void ff_vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16]);
+extern void ff_vp8_luma_dc_wht_sse(DCTELEM block[4][4][16], DCTELEM dc[16]);
  extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16], int stride);
  extern void ff_vp8_idct_add_sse(uint8_t *dst, DCTELEM block[16], int stride);
  
@@ -335,6 +336,7 @@ av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
  
      if (mm_flags & FF_MM_SSE) {
          c->vp8_idct_add                         = ff_vp8_idct_add_sse;
+        c->vp8_luma_dc_wht                      = ff_vp8_luma_dc_wht_sse;
          c->put_vp8_epel_pixels_tab[0][0][0]     =
          c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
      }
diff --git a/libavcodec/x86/vp8dsp.asm b/libavcodec/x86/vp8dsp.asm

index 4f430d80c82f7b6af8e95650f0ae8a8b70177ecb..6999e87b632afb7175a4aacdeafc4be394bf364d 100644 (file)
--- a/libavcodec/x86/vp8dsp.asm
+++ b/libavcodec/x86/vp8dsp.asm
@@ -1186,12 +1186,23 @@ VP8_IDCT_ADD sse
      SWAP %1, %4, %3
  %endmacro
  
-INIT_MMX
-cglobal vp8_luma_dc_wht_mmx, 2,3
+%macro VP8_DC_WHT 1
+cglobal vp8_luma_dc_wht_%1, 2,3
      movq          m0, [r1]
      movq          m1, [r1+8]
      movq          m2, [r1+16]
      movq          m3, [r1+24]
+%ifidn %1, sse
+    xorps      xmm0, xmm0
+    movaps  [r1+ 0], xmm0
+    movaps  [r1+16], xmm0
+%else
+    pxor         m4, m4
+    movq    [r1+ 0], m4
+    movq    [r1+ 8], m4
+    movq    [r1+16], m4
+    movq    [r1+24], m4
+%endif
      HADAMARD4_1D  0, 1, 2, 3
      TRANSPOSE4x4W 0, 1, 2, 3, 4
      paddw         m0, [pw_3]
@@ -1203,6 +1214,11 @@ cglobal vp8_luma_dc_wht_mmx, 2,3
      SCATTER_WHT   0, 1, 0
      SCATTER_WHT   2, 3, 2
      RET
+%endmacro
+
+INIT_MMX
+VP8_DC_WHT mmx
+VP8_DC_WHT sse
  
  ;-----------------------------------------------------------------------------
  ; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim);
author	Jason Garrett-Glaser <darkshikari@gmail.com>
	Mon, 2 Aug 2010 20:18:09 +0000 (20:18 +0000)
committer	Jason Garrett-Glaser <darkshikari@gmail.com>
	Mon, 2 Aug 2010 20:18:09 +0000 (20:18 +0000)
libavcodec/vp8.c		patch | blob | history
libavcodec/vp8dsp.c		patch | blob | history
libavcodec/x86/vp8dsp-init.c		patch | blob | history
libavcodec/x86/vp8dsp.asm		patch | blob | history