From a272ff25cd99f47950dddb55e94b370e95b70016 Mon Sep 17 00:00:00 2001
From: Scott LaVarnway <slavarnway@google.com>
Date: Wed, 15 May 2013 13:16:02 -0400
Subject: [PATCH] WIP: 16x16 idct/recon merge

This patch eliminates the intermediate diff buffer usage by
combining the short idct and the add residual into one function.
The encoder can use the same code as well.

Change-Id: Iea7976b22b1927d24b8004d2a3fddae7ecca3ba1
---
 test/dct16x16_test.cc                 |  29 ++++++----
 vp9/common/vp9_idct.c                 |  66 +++++++++++-----------
 vp9/common/vp9_rtcd_defs.sh           |  15 ++---
 vp9/common/x86/vp9_idct_intrin_sse2.c | 101 +++++++++++++++++-----------------
 vp9/decoder/vp9_idct_blk.c            |  21 +------
 vp9/decoder/x86/vp9_dequantize_sse2.c |  59 --------------------
 vp9/encoder/vp9_encodemb.c            |  15 ++---
 7 files changed, 117 insertions(+), 189 deletions(-)

diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index f6d2d59..9fb45d6 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -17,6 +17,7 @@
 extern "C" {
 #include "vp9/common/vp9_entropy.h"
 #include "vp9_rtcd.h"
+void vp9_short_idct16x16_add_c(short *input, uint8_t *output, int pitch);
 }
 
 #include "acm_random.h"
@@ -269,19 +270,23 @@ TEST(VP9Idct16x16Test, AccuracyCheck) {
   const int count_test_block = 1000;
   for (int i = 0; i < count_test_block; ++i) {
     int16_t in[256], coeff[256];
-    int16_t out_c[256];
+    uint8_t dst[256], src[256];
     double out_r[256];
 
+    for (int j = 0; j < 256; ++j) {
+      src[j] = rnd.Rand8();
+      dst[j] = rnd.Rand8();
+    }
     // Initialize a test block with input range [-255, 255].
     for (int j = 0; j < 256; ++j)
-      in[j] = rnd.Rand8() - rnd.Rand8();
+      in[j] = src[j] - dst[j];
 
     reference_16x16_dct_2d(in, out_r);
     for (int j = 0; j < 256; j++)
       coeff[j] = round(out_r[j]);
-    vp9_short_idct16x16_c(coeff, out_c, 32);
+    vp9_short_idct16x16_add_c(coeff, dst, 16);
     for (int j = 0; j < 256; ++j) {
-      const int diff = out_c[j] - in[j];
+      const int diff = dst[j] - src[j];
       const int error = diff * diff;
       EXPECT_GE(1, error)
           << "Error: 16x16 IDCT has error " << error
@@ -289,7 +294,7 @@ TEST(VP9Idct16x16Test, AccuracyCheck) {
     }
   }
 }
-#if 1
+
 // we need enable fdct test once we re-do the 16 point fdct.
 TEST(VP9Fdct16x16Test, AccuracyCheck) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
@@ -299,18 +304,22 @@ TEST(VP9Fdct16x16Test, AccuracyCheck) {
   for (int i = 0; i < count_test_block; ++i) {
     int16_t test_input_block[256];
     int16_t test_temp_block[256];
-    int16_t test_output_block[256];
+    uint8_t dst[256], src[256];
 
+    for (int j = 0; j < 256; ++j) {
+      src[j] = rnd.Rand8();
+      dst[j] = rnd.Rand8();
+    }
     // Initialize a test block with input range [-255, 255].
     for (int j = 0; j < 256; ++j)
-      test_input_block[j] = rnd.Rand8() - rnd.Rand8();
+      test_input_block[j] = src[j] - dst[j];
 
     const int pitch = 32;
     vp9_short_fdct16x16_c(test_input_block, test_temp_block, pitch);
-    vp9_short_idct16x16_c(test_temp_block, test_output_block, pitch);
+    vp9_short_idct16x16_add_c(test_temp_block, dst, 16);
 
     for (int j = 0; j < 256; ++j) {
-      const int diff = test_input_block[j] - test_output_block[j];
+      const int diff = dst[j] - src[j];
       const int error = diff * diff;
       if (max_error < error)
         max_error = error;
@@ -354,6 +363,4 @@ TEST(VP9Fdct16x16Test, CoeffSizeCheck) {
     }
   }
 }
-#endif
-
 }  // namespace
diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c
index 5e6384c..b166fcb 100644
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -621,10 +621,9 @@ static void idct16_1d(int16_t *input, int16_t *output) {
   output[15] = step2[0] - step2[15];
 }
 
-void vp9_short_idct16x16_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
   int16_t out[16 * 16];
   int16_t *outptr = out;
-  const int half_pitch = pitch >> 1;
   int i, j;
   int16_t temp_in[16], temp_out[16];
 
@@ -641,7 +640,8 @@ void vp9_short_idct16x16_c(int16_t *input, int16_t *output, int pitch) {
       temp_in[j] = out[j * 16 + i];
     idct16_1d(temp_in, temp_out);
     for (j = 0; j < 16; ++j)
-      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+                                  + dest[j * dest_stride + i]);
   }
 }
 
@@ -823,8 +823,8 @@ static const transform_2d IHT_16[] = {
   { iadst16_1d, iadst16_1d }   // ADST_ADST = 3
 };
 
-void vp9_short_iht16x16_c(int16_t *input, int16_t *output,
-                          int pitch, int tx_type) {
+void vp9_short_iht16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride,
+                              int tx_type) {
   int i, j;
   int16_t out[16 * 16];
   int16_t *outptr = out;
@@ -844,37 +844,37 @@ void vp9_short_iht16x16_c(int16_t *input, int16_t *output,
       temp_in[j] = out[j * 16 + i];
     ht.cols(temp_in, temp_out);
     for (j = 0; j < 16; ++j)
-      output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
-  }
+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+                                  + dest[j * dest_stride + i]);  }
 }
 
-void vp9_short_idct10_16x16_c(int16_t *input, int16_t *output, int pitch) {
-    int16_t out[16 * 16];
-    int16_t *outptr = out;
-    const int half_pitch = pitch >> 1;
-    int i, j;
-    int16_t temp_in[16], temp_out[16];
-
-    /* First transform rows. Since all non-zero dct coefficients are in
-     * upper-left 4x4 area, we only need to calculate first 4 rows here.
-     */
-    vpx_memset(out, 0, sizeof(out));
-    for (i = 0; i < 4; ++i) {
-      idct16_1d(input, outptr);
-      input += 16;
-      outptr += 16;
-    }
-
-    // Then transform columns
-    for (i = 0; i < 16; ++i) {
-      for (j = 0; j < 16; ++j)
-        temp_in[j] = out[j*16 + i];
-      idct16_1d(temp_in, temp_out);
-      for (j = 0; j < 16; ++j)
-        output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
-    }
-}
+void vp9_short_idct10_16x16_add_c(int16_t *input, uint8_t *dest,
+                                  int dest_stride) {
+  int16_t out[16 * 16];
+  int16_t *outptr = out;
+  int i, j;
+  int16_t temp_in[16], temp_out[16];
+
+  /* First transform rows. Since all non-zero dct coefficients are in
+   * upper-left 4x4 area, we only need to calculate first 4 rows here.
+   */
+  vpx_memset(out, 0, sizeof(out));
+  for (i = 0; i < 4; ++i) {
+    idct16_1d(input, outptr);
+    input += 16;
+    outptr += 16;
+  }
 
+  // Then transform columns
+  for (i = 0; i < 16; ++i) {
+    for (j = 0; j < 16; ++j)
+      temp_in[j] = out[j*16 + i];
+    idct16_1d(temp_in, temp_out);
+    for (j = 0; j < 16; ++j)
+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+                                  + dest[j * dest_stride + i]);
+  }
+}
 
 void vp9_short_idct1_16x16_c(int16_t *input, int16_t *output) {
   int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index c45d030..ea60fbb 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -91,9 +91,6 @@ specialize vp9_add_residual_4x4 sse2
 prototype void vp9_add_residual_8x8 "const int16_t *diff, uint8_t *dest, int stride"
 specialize vp9_add_residual_8x8 sse2
 
-prototype void vp9_add_residual_16x16 "const int16_t *diff, uint8_t *dest, int stride"
-specialize vp9_add_residual_16x16 sse2
-
 prototype void vp9_add_constant_residual_8x8 "const int16_t diff, uint8_t *dest, int stride"
 specialize vp9_add_constant_residual_8x8 sse2
 
@@ -200,11 +197,11 @@ specialize vp9_short_idct10_8x8 sse2
 prototype void vp9_short_idct1_8x8 "int16_t *input, int16_t *output"
 specialize vp9_short_idct1_8x8
 
-prototype void vp9_short_idct16x16 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct16x16 sse2
+prototype void vp9_short_idct16x16_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct16x16_add sse2
 
-prototype void vp9_short_idct10_16x16 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct10_16x16 sse2
+prototype void vp9_short_idct10_16x16_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct10_16x16_add sse2
 
 prototype void vp9_short_idct1_16x16 "int16_t *input, int16_t *output"
 specialize vp9_short_idct1_16x16
@@ -224,8 +221,8 @@ specialize vp9_short_iht8x8
 prototype void vp9_short_iht4x4 "int16_t *input, int16_t *output, int pitch, int tx_type"
 specialize vp9_short_iht4x4
 
-prototype void vp9_short_iht16x16 "int16_t *input, int16_t *output, int pitch, int tx_type"
-specialize vp9_short_iht16x16
+prototype void vp9_short_iht16x16_add "int16_t *input, uint8_t *output, int pitch, int tx_type"
+specialize vp9_short_iht16x16_add
 
 prototype void vp9_idct4_1d "int16_t *input, int16_t *output"
 specialize vp9_idct4_1d sse2
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index e53a937..667da33 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -752,8 +752,17 @@ void vp9_short_idct10_8x8_sse2(int16_t *input, int16_t *output, int pitch) {
                            stp2_10, stp2_13, stp2_11, stp2_12) \
   }
 
-void vp9_short_idct16x16_sse2(int16_t *input, int16_t *output, int pitch) {
-  const int half_pitch = pitch >> 1;
+#define RECON_AND_STORE(dest, in_x) \
+  {                                                     \
+     __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
+      d0 = _mm_unpacklo_epi8(d0, zero); \
+      in_x = _mm_add_epi16(in_x, d0); \
+      in_x = _mm_packus_epi16(in_x, in_x); \
+      _mm_storel_epi64((__m128i *)(dest), in_x); \
+      dest += stride; \
+  }
+
+void vp9_short_idct16x16_add_sse2(int16_t *input, uint8_t *dest, int stride) {
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i final_rounding = _mm_set1_epi16(1<<5);
   const __m128i zero = _mm_setzero_si128();
@@ -938,31 +947,30 @@ void vp9_short_idct16x16_sse2(int16_t *input, int16_t *output, int pitch) {
       in14 = _mm_srai_epi16(in14, 6);
       in15 = _mm_srai_epi16(in15, 6);
 
-      // Store results
-      _mm_store_si128((__m128i *)output, in0);
-      _mm_store_si128((__m128i *)(output + half_pitch * 1), in1);
-      _mm_store_si128((__m128i *)(output + half_pitch * 2), in2);
-      _mm_store_si128((__m128i *)(output + half_pitch * 3), in3);
-      _mm_store_si128((__m128i *)(output + half_pitch * 4), in4);
-      _mm_store_si128((__m128i *)(output + half_pitch * 5), in5);
-      _mm_store_si128((__m128i *)(output + half_pitch * 6), in6);
-      _mm_store_si128((__m128i *)(output + half_pitch * 7), in7);
-      _mm_store_si128((__m128i *)(output + half_pitch * 8), in8);
-      _mm_store_si128((__m128i *)(output + half_pitch * 9), in9);
-      _mm_store_si128((__m128i *)(output + half_pitch * 10), in10);
-      _mm_store_si128((__m128i *)(output + half_pitch * 11), in11);
-      _mm_store_si128((__m128i *)(output + half_pitch * 12), in12);
-      _mm_store_si128((__m128i *)(output + half_pitch * 13), in13);
-      _mm_store_si128((__m128i *)(output + half_pitch * 14), in14);
-      _mm_store_si128((__m128i *)(output + half_pitch * 15), in15);
-
-      output += 8;
+      RECON_AND_STORE(dest, in0);
+      RECON_AND_STORE(dest, in1);
+      RECON_AND_STORE(dest, in2);
+      RECON_AND_STORE(dest, in3);
+      RECON_AND_STORE(dest, in4);
+      RECON_AND_STORE(dest, in5);
+      RECON_AND_STORE(dest, in6);
+      RECON_AND_STORE(dest, in7);
+      RECON_AND_STORE(dest, in8);
+      RECON_AND_STORE(dest, in9);
+      RECON_AND_STORE(dest, in10);
+      RECON_AND_STORE(dest, in11);
+      RECON_AND_STORE(dest, in12);
+      RECON_AND_STORE(dest, in13);
+      RECON_AND_STORE(dest, in14);
+      RECON_AND_STORE(dest, in15);
+
+      dest += 8 - (stride * 16);
     }
   }
 }
 
-void vp9_short_idct10_16x16_sse2(int16_t *input, int16_t *output, int pitch) {
-  const int half_pitch = pitch >> 1;
+void vp9_short_idct10_16x16_add_sse2(int16_t *input, uint8_t *dest,
+                                     int stride) {
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i final_rounding = _mm_set1_epi16(1<<5);
   const __m128i zero = _mm_setzero_si128();
@@ -1007,7 +1015,6 @@ void vp9_short_idct10_16x16_sse2(int16_t *input, int16_t *output, int pitch) {
           stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   int i;
-
   // 1-D idct. Load input data.
   in0 = _mm_load_si128((__m128i *)input);
   in8 = _mm_load_si128((__m128i *)(input + 8 * 1));
@@ -1298,24 +1305,24 @@ void vp9_short_idct10_16x16_sse2(int16_t *input, int16_t *output, int pitch) {
     in14 = _mm_srai_epi16(in14, 6);
     in15 = _mm_srai_epi16(in15, 6);
 
-    // Store results
-    _mm_store_si128((__m128i *)output, in0);
-    _mm_store_si128((__m128i *)(output + half_pitch * 1), in1);
-    _mm_store_si128((__m128i *)(output + half_pitch * 2), in2);
-    _mm_store_si128((__m128i *)(output + half_pitch * 3), in3);
-    _mm_store_si128((__m128i *)(output + half_pitch * 4), in4);
-    _mm_store_si128((__m128i *)(output + half_pitch * 5), in5);
-    _mm_store_si128((__m128i *)(output + half_pitch * 6), in6);
-    _mm_store_si128((__m128i *)(output + half_pitch * 7), in7);
-    _mm_store_si128((__m128i *)(output + half_pitch * 8), in8);
-    _mm_store_si128((__m128i *)(output + half_pitch * 9), in9);
-    _mm_store_si128((__m128i *)(output + half_pitch * 10), in10);
-    _mm_store_si128((__m128i *)(output + half_pitch * 11), in11);
-    _mm_store_si128((__m128i *)(output + half_pitch * 12), in12);
-    _mm_store_si128((__m128i *)(output + half_pitch * 13), in13);
-    _mm_store_si128((__m128i *)(output + half_pitch * 14), in14);
-    _mm_store_si128((__m128i *)(output + half_pitch * 15), in15);
-    output += 8;
+    RECON_AND_STORE(dest, in0);
+    RECON_AND_STORE(dest, in1);
+    RECON_AND_STORE(dest, in2);
+    RECON_AND_STORE(dest, in3);
+    RECON_AND_STORE(dest, in4);
+    RECON_AND_STORE(dest, in5);
+    RECON_AND_STORE(dest, in6);
+    RECON_AND_STORE(dest, in7);
+    RECON_AND_STORE(dest, in8);
+    RECON_AND_STORE(dest, in9);
+    RECON_AND_STORE(dest, in10);
+    RECON_AND_STORE(dest, in11);
+    RECON_AND_STORE(dest, in12);
+    RECON_AND_STORE(dest, in13);
+    RECON_AND_STORE(dest, in14);
+    RECON_AND_STORE(dest, in15);
+
+    dest += 8 - (stride * 16);
   }
 }
 
@@ -1934,16 +1941,6 @@ void vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) {
       in30 = _mm_srai_epi16(in30, 6);
       in31 = _mm_srai_epi16(in31, 6);
 
-#define RECON_AND_STORE(dest, in_x) \
-  {                                                     \
-     __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
-      d0 = _mm_unpacklo_epi8(d0, zero); \
-      in_x = _mm_add_epi16(in_x, d0); \
-      in_x = _mm_packus_epi16(in_x, in_x); \
-      _mm_storel_epi64((__m128i *)(dest), in_x); \
-      dest += stride; \
-  }
-
       RECON_AND_STORE(dest, in0);
       RECON_AND_STORE(dest, in1);
       RECON_AND_STORE(dest, in2);
diff --git a/vp9/decoder/vp9_idct_blk.c b/vp9/decoder/vp9_idct_blk.c
index faaee73..bc943fa 100644
--- a/vp9/decoder/vp9_idct_blk.c
+++ b/vp9/decoder/vp9_idct_blk.c
@@ -105,10 +105,6 @@ void vp9_add_residual_8x8_c(const int16_t *diff, uint8_t *dest, int stride) {
   add_residual(diff, dest, stride, 8, 8);
 }
 
-void vp9_add_residual_16x16_c(const int16_t *diff, uint8_t *dest, int stride) {
-  add_residual(diff, dest, stride, 16, 16);
-}
-
 static void add_constant_residual(const int16_t diff, uint8_t *dest, int stride,
                                   int width, int height) {
   int r, c;
@@ -260,19 +256,14 @@ void vp9_iht_add_16x16_c(TX_TYPE tx_type, int16_t *input, uint8_t *dest,
   if (tx_type == DCT_DCT) {
     vp9_idct_add_16x16(input, dest, stride, eob);
   } else {
-    DECLARE_ALIGNED_ARRAY(16, int16_t, output, 256);
-
     if (eob > 0) {
-      vp9_short_iht16x16(input, output, 16, tx_type);
+      vp9_short_iht16x16_add(input, dest, stride, tx_type);
       vpx_memset(input, 0, 512);
-      vp9_add_residual_16x16(output, dest, stride);
     }
   }
 }
 
 void vp9_idct_add_16x16_c(int16_t *input, uint8_t *dest, int stride, int eob) {
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 256);
-
   /* The calculation can be simplified if there are not many non-zero dct
    * coefficients. Use eobs to separate different cases. */
   if (eob) {
@@ -288,21 +279,15 @@ void vp9_idct_add_16x16_c(int16_t *input, uint8_t *dest, int stride, int eob) {
       vp9_add_constant_residual_16x16(out, dest, stride);
 #if !CONFIG_SCATTERSCAN
     } else if (eob <= 10) {
-      // the idct halves ( >> 1) the pitch
-      vp9_short_idct10_16x16(input, output, 32);
-
+      vp9_short_idct10_16x16_add(input, dest, stride);
       input[0] = input[1] = input[2] = input[3] = 0;
       input[16] = input[17] = input[18] = 0;
       input[32] = input[33] = 0;
       input[48] = 0;
-
-      vp9_add_residual_16x16(output, dest, stride);
 #endif
     } else {
-      // the idct halves ( >> 1) the pitch
-      vp9_short_idct16x16(input, output, 16 << 1);
+      vp9_short_idct16x16_add(input, dest, stride);
       vpx_memset(input, 0, 512);
-      vp9_add_residual_16x16(output, dest, stride);
     }
   }
 }
diff --git a/vp9/decoder/x86/vp9_dequantize_sse2.c b/vp9/decoder/x86/vp9_dequantize_sse2.c
index 38fd5aa..796fc12 100644
--- a/vp9/decoder/x86/vp9_dequantize_sse2.c
+++ b/vp9/decoder/x86/vp9_dequantize_sse2.c
@@ -122,65 +122,6 @@ void vp9_add_residual_8x8_sse2(const int16_t *diff, uint8_t *dest, int stride) {
   _mm_storel_epi64((__m128i *)(dest + 7 * stride), p6);
 }
 
-void vp9_add_residual_16x16_sse2(const int16_t *diff, uint8_t *dest,
-                                 int stride) {
-  const int width = 16;
-  int i = 4;
-  const __m128i zero = _mm_setzero_si128();
-
-  // Diff data
-  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
-  __m128i p0, p1, p2, p3, p4, p5, p6, p7;
-
-  do {
-    d0 = _mm_load_si128((const __m128i *)(diff + 0 * width));
-    d1 = _mm_load_si128((const __m128i *)(diff + 0 * width + 8));
-    d2 = _mm_load_si128((const __m128i *)(diff + 1 * width));
-    d3 = _mm_load_si128((const __m128i *)(diff + 1 * width + 8));
-    d4 = _mm_load_si128((const __m128i *)(diff + 2 * width));
-    d5 = _mm_load_si128((const __m128i *)(diff + 2 * width + 8));
-    d6 = _mm_load_si128((const __m128i *)(diff + 3 * width));
-    d7 = _mm_load_si128((const __m128i *)(diff + 3 * width + 8));
-
-    // Prediction data.
-    p1 = _mm_load_si128((const __m128i *)(dest + 0 * stride));
-    p3 = _mm_load_si128((const __m128i *)(dest + 1 * stride));
-    p5 = _mm_load_si128((const __m128i *)(dest + 2 * stride));
-    p7 = _mm_load_si128((const __m128i *)(dest + 3 * stride));
-
-    p0 = _mm_unpacklo_epi8(p1, zero);
-    p1 = _mm_unpackhi_epi8(p1, zero);
-    p2 = _mm_unpacklo_epi8(p3, zero);
-    p3 = _mm_unpackhi_epi8(p3, zero);
-    p4 = _mm_unpacklo_epi8(p5, zero);
-    p5 = _mm_unpackhi_epi8(p5, zero);
-    p6 = _mm_unpacklo_epi8(p7, zero);
-    p7 = _mm_unpackhi_epi8(p7, zero);
-
-    p0 = _mm_add_epi16(p0, d0);
-    p1 = _mm_add_epi16(p1, d1);
-    p2 = _mm_add_epi16(p2, d2);
-    p3 = _mm_add_epi16(p3, d3);
-    p4 = _mm_add_epi16(p4, d4);
-    p5 = _mm_add_epi16(p5, d5);
-    p6 = _mm_add_epi16(p6, d6);
-    p7 = _mm_add_epi16(p7, d7);
-
-    p0 = _mm_packus_epi16(p0, p1);
-    p1 = _mm_packus_epi16(p2, p3);
-    p2 = _mm_packus_epi16(p4, p5);
-    p3 = _mm_packus_epi16(p6, p7);
-
-    _mm_store_si128((__m128i *)(dest + 0 * stride), p0);
-    _mm_store_si128((__m128i *)(dest + 1 * stride), p1);
-    _mm_store_si128((__m128i *)(dest + 2 * stride), p2);
-    _mm_store_si128((__m128i *)(dest + 3 * stride), p3);
-
-    diff += 4 * width;
-    dest += 4 * stride;
-  } while (--i);
-}
-
 void vp9_add_constant_residual_8x8_sse2(const int16_t diff, uint8_t *dest,
                                         int stride) {
   uint8_t abs_diff;
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index db18555..221de74 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -522,11 +522,12 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
     case TX_16X16:
       tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT;
       if (tx_type == DCT_DCT) {
-        vp9_short_idct16x16(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
-                            diff, bw * 2);
+        vp9_short_idct16x16_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff,
+                                block, 16), dst, xd->plane[plane].dst.stride);
       } else {
-        vp9_short_iht16x16(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
-                           diff, bw, tx_type);
+        vp9_short_iht16x16_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff,
+                               block, 16), dst, xd->plane[plane].dst.stride,
+                               tx_type);
       }
       *wip_txfrm_size = 16;
       break;
@@ -605,7 +606,7 @@ void vp9_encode_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
 
   foreach_transformed_block_uv(xd, bsize, encode_block, &arg);
 
-  if (wip_txfrm_size < 32)
+  if (wip_txfrm_size < 16)
     vp9_recon_sbuv(xd, bsize);
 }
 
@@ -627,13 +628,13 @@ void vp9_encode_sb(VP9_COMMON *const cm, MACROBLOCK *x,
   // wip version... will use foreach_transformed_block when done
   foreach_transformed_block_in_plane(xd, bsize, 0,
                                      encode_block, &arg);
-  if (wip_txfrm_size < 32)
+  if (wip_txfrm_size < 16)
     vp9_recon_sby(xd, bsize);
   wip_txfrm_size = 0;
 
   foreach_transformed_block_uv(xd, bsize, encode_block, &arg);
 
-  if (wip_txfrm_size < 32)
+  if (wip_txfrm_size < 16)
     vp9_recon_sbuv(xd, bsize);
 #endif
 }
-- 
2.7.4