From ba48a11130aa88cf20c2c54e43585968ce49e964 Mon Sep 17 00:00:00 2001
From: Scott LaVarnway
Date: Mon, 20 May 2013 13:03:17 -0400
Subject: [PATCH] WIP: 4x4 idct/recon merge

This patch eliminates the intermediate diff buffer usage by
combining the short idct and the add residual into one function.
The encoder can use the same code as well.

Change-Id: I296604bf73579c45105de0dd1adbcc91bcc53c22
---
 test/fdct4x4_test.cc                  | 12 +++--
 vp9/common/vp9_blockd.h               |  4 +-
 vp9/common/vp9_idct.c                 | 82 +++++++++++++++++------------------
 vp9/common/vp9_invtrans.c             |  9 ++--
 vp9/common/vp9_invtrans.h             |  5 +--
 vp9/common/vp9_rtcd_defs.sh           | 28 +++++-------
 vp9/common/x86/vp9_idct_intrin_sse2.c | 26 +++++++----
 vp9/decoder/vp9_decodframe.c          |  4 --
 vp9/decoder/vp9_idct_blk.c            | 47 +++-----------------
 vp9/decoder/x86/vp9_dequantize_sse2.c | 43 ------------------
 vp9/encoder/vp9_encodeframe.c         |  8 ++--
 vp9/encoder/vp9_encodeintra.c         | 13 ++----
 vp9/encoder/vp9_encodemb.c            | 60 ++++++-------------------
 vp9/encoder/vp9_onyx_if.c             |  8 ++--
 vp9/encoder/vp9_rdopt.c               | 21 ++++-----
 15 files changed, 124 insertions(+), 246 deletions(-)

diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index dfb64c3..1c887bb 100644
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -96,11 +96,15 @@ TEST(Vp9Fdct4x4Test, RoundTripErrorCheck) {
   for (int i = 0; i < count_test_block; ++i) {
     int16_t test_input_block[16];
     int16_t test_temp_block[16];
-    int16_t test_output_block[16];
+    uint8_t dst[16], src[16];
+    for (int j = 0; j < 16; ++j) {
+      src[j] = rnd.Rand8();
+      dst[j] = rnd.Rand8();
+    }
 
     // Initialize a test block with input range [-255, 255].
     for (int j = 0; j < 16; ++j)
-      test_input_block[j] = rnd.Rand8() - rnd.Rand8();
+      test_input_block[j] = src[j] - dst[j];
 
     // TODO(Yaowu): this should be converted to a parameterized test
     // to test optimized versions of this function.
@@ -120,10 +124,10 @@ TEST(Vp9Fdct4x4Test, RoundTripErrorCheck) {
     }
 
     // Because the bitstream is not frozen yet, use the idct in the codebase.
-    vp9_short_idct4x4_c(test_temp_block, test_output_block, pitch);
+    vp9_short_idct4x4_add_c(test_temp_block, dst, 4);
 
     for (int j = 0; j < 16; ++j) {
-      const int diff = test_input_block[j] - test_output_block[j];
+      const int diff = dst[j] - src[j];
       const int error = diff * diff;
       if (max_error < error)
         max_error = error;
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index 6a3fbb1..533f7f7 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -393,8 +393,8 @@ typedef struct macroblockd {
   int lossless;
   /* Inverse transform function pointers.
   */
-  void (*inv_txm4x4_1)(int16_t *input, int16_t *output, int pitch);
-  void (*inv_txm4x4)(int16_t *input, int16_t *output, int pitch);
+  void (*inv_txm4x4_1_add)(int16_t *input, uint8_t *dest, int stride);
+  void (*inv_txm4x4_add)(int16_t *input, uint8_t *dest, int stride);
   void (*itxm_add)(int16_t *input, uint8_t *dest, int stride, int eob);
   void (*itxm_add_y_block)(int16_t *q, uint8_t *dst, int stride,
                            struct macroblockd *xd);
diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c
index 2ff7696..80af49e 100644
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -18,12 +18,12 @@
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_idct.h"
 
-void vp9_short_iwalsh4x4_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_iwalsh4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
   int i;
+  int16_t output[16];
   int a1, b1, c1, d1;
   int16_t *ip = input;
   int16_t *op = output;
-  const int half_pitch = pitch >> 1;
 
   for (i = 0; i < 4; i++) {
     a1 = (ip[0] + ip[3]) >> WHT_UPSCALE_FACTOR;
@@ -37,63 +37,60 @@
     op[3] = (d1 - c1) >> 1;
 
     ip += 4;
-    op += half_pitch;
+    op += 4;
   }
 
   ip = output;
-  op = output;
   for (i = 0; i < 4; i++) {
-    a1 = ip[half_pitch * 0] + ip[half_pitch * 3];
-    b1 = ip[half_pitch * 1] + ip[half_pitch * 2];
-    c1 = ip[half_pitch * 1] - ip[half_pitch * 2];
-    d1 = ip[half_pitch * 0] - ip[half_pitch * 3];
+    a1 = ip[4 * 0] + ip[4 * 3];
+    b1 = ip[4 * 1] + ip[4 * 2];
+    c1 = ip[4 * 1] - ip[4 * 2];
+    d1 = ip[4 * 0] - ip[4 * 3];
 
-    op[half_pitch * 0] = (a1 + b1 + 1) >> 1;
-    op[half_pitch * 1] = (c1 + d1) >> 1;
-    op[half_pitch * 2] = (a1 - b1) >> 1;
-    op[half_pitch * 3] = (d1 - c1) >> 1;
+    dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] +
+                                       ((a1 + b1 + 1) >> 1));
+    dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] +
+                                       ((c1 + d1) >> 1));
+    dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] +
+                                       ((a1 - b1) >> 1));
+    dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] +
+                                       ((d1 - c1) >> 1));
 
     ip++;
-    op++;
+    dest++;
   }
 }
 
-void vp9_short_iwalsh4x4_1_c(int16_t *in, int16_t *out, int pitch) {
+void vp9_short_iwalsh4x4_1_add_c(int16_t *in, uint8_t *dest, int dest_stride) {
   int i;
   int16_t tmp[4];
   int16_t *ip = in;
   int16_t *op = tmp;
-  const int half_pitch = pitch >> 1;
 
   op[0] = ((ip[0] >> WHT_UPSCALE_FACTOR) + 1) >> 1;
   op[1] = op[2] = op[3] = (ip[0] >> WHT_UPSCALE_FACTOR) >> 1;
 
   ip = tmp;
-  op = out;
   for (i = 0; i < 4; i++) {
-    op[half_pitch * 0] = (ip[0] + 1) >> 1;
-    op[half_pitch * 1] = op[half_pitch * 2] = op[half_pitch * 3] = ip[0] >> 1;
+    dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] +
+                                       ((ip[0] + 1) >> 1));
+    dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] +
+                                       (ip[0] >> 1));
+    dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] +
+                                       (ip[0] >> 1));
+    dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] +
+                                       (ip[0] >> 1));
 
     ip++;
-    op++;
+    dest++;
   }
 }
 
 void vp9_dc_only_inv_walsh_add_c(int input_dc, uint8_t *pred_ptr,
                                  uint8_t *dst_ptr,
                                  int pitch, int stride) {
-  int r, c;
   int16_t dc = input_dc;
-  int16_t tmp[4 * 4];
-  vp9_short_iwalsh4x4_1_c(&dc, tmp, 4 << 1);
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++)
-      dst_ptr[c] = clip_pixel(tmp[r * 4 + c] + pred_ptr[c]);
-
-    dst_ptr += stride;
-    pred_ptr += pitch;
-  }
+  vp9_short_iwalsh4x4_1_add_c(&dc, dst_ptr, stride);
 }
 
 void vp9_idct4_1d_c(int16_t *input, int16_t *output) {
@@ -116,10 +113,9 @@
   output[3] = step[0] - step[3];
 }
 
-void vp9_short_idct4x4_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
   int16_t out[4 * 4];
   int16_t *outptr = out;
-  const int half_pitch = pitch >> 1;
   int i, j;
   int16_t temp_in[4], temp_out[4];
 
@@ -138,22 +134,24 @@
       temp_in[j] = out[j * 4 + i];
     vp9_idct4_1d(temp_in, temp_out);
     for (j = 0; j < 4; ++j)
-      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 4);
+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+                                  + dest[j * dest_stride + i]);
   }
 }
 
-void vp9_short_idct4x4_1_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
   int i;
   int a1;
-  int16_t *op = output;
-  const int half_pitch = pitch >> 1;
   int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
   out = dct_const_round_shift(out * cospi_16_64);
   a1 = ROUND_POWER_OF_TWO(out, 4);
 
   for (i = 0; i < 4; i++) {
-    op[0] = op[1] = op[2] = op[3] = a1;
-    op += half_pitch;
+    dest[0] = clip_pixel(dest[0] + a1);
+    dest[1] = clip_pixel(dest[1] + a1);
+    dest[2] = clip_pixel(dest[2] + a1);
+    dest[3] = clip_pixel(dest[3] + a1);
+    dest += dest_stride;
   }
 }
 
@@ -285,8 +283,8 @@ static void iadst4_1d(int16_t *input, int16_t *output) {
   output[3] = dct_const_round_shift(s3);
 }
 
-void vp9_short_iht4x4_c(int16_t *input, int16_t *output,
-                        int pitch, int tx_type) {
+void vp9_short_iht4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride,
+                            int tx_type) {
   const transform_2d IHT_4[] = {
     { vp9_idct4_1d, vp9_idct4_1d  },  // DCT_DCT  = 0
     { iadst4_1d,    vp9_idct4_1d  },  // ADST_DCT = 1
@@ -312,10 +310,10 @@ void vp9_short_iht4x4_c(int16_t *input, int16_t *output,
       temp_in[j] = out[j * 4 + i];
     IHT_4[tx_type].cols(temp_in, temp_out);
     for (j = 0; j < 4; ++j)
-      output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 4);
+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+                                  + dest[j * dest_stride + i]);
   }
 }
-
 static void iadst8_1d(int16_t *input, int16_t *output) {
   int s0, s1, s2, s3, s4, s5, s6, s7;
 
diff --git a/vp9/common/vp9_invtrans.c b/vp9/common/vp9_invtrans.c
index 01859df..d47fca1 100644
--- a/vp9/common/vp9_invtrans.c
+++ b/vp9/common/vp9_invtrans.c
@@ -11,11 +11,10 @@
 #include "vp9/common/vp9_invtrans.h"
 #include "./vp9_rtcd.h"
 
-void vp9_inverse_transform_b_4x4(MACROBLOCKD *xd, int eob,
-                                 int16_t *dqcoeff, int16_t *diff,
-                                 int pitch) {
+void vp9_inverse_transform_b_4x4_add(MACROBLOCKD *xd, int eob, int16_t *dqcoeff,
+                                     uint8_t *dest, int stride) {
   if (eob <= 1)
-    xd->inv_txm4x4_1(dqcoeff, diff, pitch);
+    xd->inv_txm4x4_1_add(dqcoeff, dest, stride);
   else
-    xd->inv_txm4x4(dqcoeff, diff, pitch);
+    xd->inv_txm4x4_add(dqcoeff, dest, stride);
 }
diff --git a/vp9/common/vp9_invtrans.h b/vp9/common/vp9_invtrans.h
index 2aeb584..dbdc50a 100644
--- a/vp9/common/vp9_invtrans.h
+++ b/vp9/common/vp9_invtrans.h
@@ -15,7 +15,6 @@
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_blockd.h"
 
-void vp9_inverse_transform_b_4x4(MACROBLOCKD *xd, int eob,
-                                 int16_t *dqcoeff, int16_t *diff,
-                                 int pitch);
+void vp9_inverse_transform_b_4x4_add(MACROBLOCKD *xd, int eob, int16_t *dqcoeff,
+                                     uint8_t *dest, int stride);
 #endif  // VP9_COMMON_VP9_INVTRANS_H_
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index cf8dd33..cb353b1 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -85,9 +85,6 @@ prototype void vp9_intra4x4_predict "struct macroblockd *xd, int block, enum BLO
 specialize vp9_intra4x4_predict;
 
 if [ "$CONFIG_VP9_DECODER" = "yes" ]; then
-prototype void vp9_add_residual_4x4 "const int16_t *diff, uint8_t *dest, int stride"
-specialize vp9_add_residual_4x4 sse2
-
 prototype void vp9_add_constant_residual_8x8 "const int16_t diff, uint8_t *dest, int stride"
 specialize vp9_add_constant_residual_8x8 sse2
 
@@ -179,11 +176,11 @@ specialize vp9_convolve8_avg_vert ssse3
 #
 # dct
 #
-prototype void vp9_short_idct4x4_1 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct4x4_1
+prototype void vp9_short_idct4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct4x4_1_add
 
-prototype void vp9_short_idct4x4 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct4x4 sse2
+prototype void vp9_short_idct4x4_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct4x4_add sse2
 
 prototype void vp9_short_idct8x8_add "int16_t *input, uint8_t *dest, int dest_stride"
 specialize vp9_short_idct8x8_add sse2
@@ -212,12 +209,12 @@ specialize vp9_short_idct1_32x32
 prototype void vp9_short_idct10_32x32_add "int16_t *input, uint8_t *dest, int dest_stride"
 specialize vp9_short_idct10_32x32_add
 
+prototype void vp9_short_iht4x4_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
+specialize vp9_short_iht4x4_add
+
 prototype void vp9_short_iht8x8_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
 specialize vp9_short_iht8x8_add
 
-prototype void vp9_short_iht4x4 "int16_t *input, int16_t *output, int pitch, int tx_type"
-specialize vp9_short_iht4x4
-
 prototype void vp9_short_iht16x16_add "int16_t *input, uint8_t *output, int pitch, int tx_type"
 specialize vp9_short_iht16x16_add
 
@@ -229,12 +226,11 @@ specialize vp9_idct4_1d sse2
 prototype void vp9_dc_only_idct_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride"
 specialize vp9_dc_only_idct_add sse2
 
-prototype void vp9_short_iwalsh4x4_1 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_iwalsh4x4_1
-prototype void vp9_short_iwalsh4x4 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_iwalsh4x4
-prototype void vp9_dc_only_inv_walsh_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride"
-specialize vp9_dc_only_inv_walsh_add
+prototype void vp9_short_iwalsh4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_iwalsh4x4_1_add
+
+prototype void vp9_short_iwalsh4x4_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_iwalsh4x4_add
 
 prototype unsigned int vp9_sad32x3 "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, int max_sad"
 specialize vp9_sad32x3
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index ab8604c..599dcff 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -73,7 +73,7 @@ void vp9_dc_only_idct_add_sse2(int input_dc, uint8_t *pred_ptr,
   *(int *)dst_ptr = _mm_cvtsi128_si32(p1);
 }
 
-void vp9_short_idct4x4_sse2(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct4x4_add_sse2(int16_t *input, uint8_t *dest, int stride) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i eight = _mm_set1_epi16(8);
   const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
@@ -81,7 +81,6 @@ void vp9_short_idct4x4_sse2(int16_t *input, int16_t *output, int pitch) {
                                      (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
                                      (int16_t)cospi_8_64, (int16_t)cospi_24_64);
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const int half_pitch = pitch >> 1;
   __m128i input0, input1, input2, input3;
 
   // Rows
@@ -188,14 +187,23 @@
   input2 = _mm_srai_epi16(input2, 4);
   input3 = _mm_srai_epi16(input3, 4);
 
-  // Store results
-  _mm_storel_epi64((__m128i *)output, input2);
-  input2 = _mm_srli_si128(input2, 8);
-  _mm_storel_epi64((__m128i *)(output + half_pitch), input2);
+#define RECON_AND_STORE4X4(dest, in_x) \
+  { \
+    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
+    d0 = _mm_unpacklo_epi8(d0, zero); \
+    d0 = _mm_add_epi16(in_x, d0); \
+    d0 = _mm_packus_epi16(d0, d0); \
+    *(int *)dest = _mm_cvtsi128_si32(d0); \
+    dest += stride; \
+  }
+
+  input0 = _mm_srli_si128(input2, 8);
+  input1 = _mm_srli_si128(input3, 8);
 
-  _mm_storel_epi64((__m128i *)(output + 3 * half_pitch), input3);
-  input3 = _mm_srli_si128(input3, 8);
-  _mm_storel_epi64((__m128i *)(output + 2 * half_pitch), input3);
+  RECON_AND_STORE4X4(dest, input2);
+  RECON_AND_STORE4X4(dest, input0);
+  RECON_AND_STORE4X4(dest, input1);
+  RECON_AND_STORE4X4(dest, input3);
 }
 
 void vp9_idct4_1d_sse2(int16_t *input, int16_t *output) {
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index b200e6c..b866e95 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -998,14 +998,10 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) {
                  pc->uv_dc_delta_q == 0 && pc->uv_ac_delta_q == 0;
 
   if (xd->lossless) {
-    xd->inv_txm4x4_1 = vp9_short_iwalsh4x4_1;
-    xd->inv_txm4x4 = vp9_short_iwalsh4x4;
    xd->itxm_add = vp9_idct_add_lossless_c;
    xd->itxm_add_y_block = vp9_idct_add_y_block_lossless_c;
    xd->itxm_add_uv_block = vp9_idct_add_uv_block_lossless_c;
   } else {
-    xd->inv_txm4x4_1 = vp9_short_idct4x4_1;
-    xd->inv_txm4x4 = vp9_short_idct4x4;
    xd->itxm_add = vp9_idct_add;
    xd->itxm_add_y_block = vp9_idct_add_y_block;
    xd->itxm_add_uv_block = vp9_idct_add_uv_block;
diff --git a/vp9/decoder/vp9_idct_blk.c b/vp9/decoder/vp9_idct_blk.c
index 10b585b..7726598 100644
--- a/vp9/decoder/vp9_idct_blk.c
+++ b/vp9/decoder/vp9_idct_blk.c
@@ -84,23 +84,6 @@ void vp9_idct_add_uv_block_lossless_c(int16_t *q, uint8_t *dst, int stride,
   }
 }
 
-static void add_residual(const int16_t *diff, uint8_t *dest, int stride,
-                         int width, int height) {
-  int r, c;
-
-  for (r = 0; r < height; r++) {
-    for (c = 0; c < width; c++)
-      dest[c] = clip_pixel(diff[c] + dest[c]);
-
-    dest += stride;
-    diff += width;
-  }
-}
-
-void vp9_add_residual_4x4_c(const int16_t *diff, uint8_t *dest, int stride) {
-  add_residual(diff, dest, stride, 4, 4);
-}
-
 static void add_constant_residual(const int16_t diff, uint8_t *dest,
                                   int stride, int width, int height) {
   int r, c;
@@ -133,11 +116,8 @@ void vp9_iht_add_c(TX_TYPE tx_type, int16_t *input, uint8_t *dest, int stride,
   if (tx_type == DCT_DCT) {
     vp9_idct_add(input, dest, stride, eob);
   } else {
-    DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
-
-    vp9_short_iht4x4(input, output, 4, tx_type);
+    vp9_short_iht4x4_add(input, dest, stride, tx_type);
     vpx_memset(input, 0, 32);
-    vp9_add_residual_4x4(output, dest, stride);
   }
 }
 
@@ -154,13 +134,9 @@ void vp9_iht_add_8x8_c(TX_TYPE tx_type, int16_t *input, uint8_t *dest,
 }
 
 void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob) {
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
-
   if (eob > 1) {
-    // the idct halves ( >> 1) the pitch
-    vp9_short_idct4x4(input, output, 4 << 1);
+    vp9_short_idct4x4_add(input, dest, stride);
     vpx_memset(input, 0, 32);
-    vp9_add_residual_4x4(output, dest, stride);
   } else {
     vp9_dc_only_idct_add(input[0], dest, dest, stride, stride);
     ((int *)input)[0] = 0;
@@ -168,38 +144,27 @@ void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob) {
 }
 
 void vp9_dc_idct_add_c(int16_t *input, uint8_t *dest, int stride, int dc) {
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
-
   input[0] = dc;
-
-  // the idct halves ( >> 1) the pitch
-  vp9_short_idct4x4(input, output, 4 << 1);
+  vp9_short_idct4x4_add(input, dest, stride);
   vpx_memset(input, 0, 32);
-  vp9_add_residual_4x4(output, dest, stride);
 }
 
 void vp9_idct_add_lossless_c(int16_t *input, uint8_t *dest, int stride,
                              int eob) {
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
-
   if (eob > 1) {
-    vp9_short_iwalsh4x4_c(input, output, 4 << 1);
+    vp9_short_iwalsh4x4_add(input, dest, stride);
     vpx_memset(input, 0, 32);
-    vp9_add_residual_4x4(output, dest, stride);
   } else {
-    vp9_dc_only_inv_walsh_add(input[0], dest, dest, stride, stride);
+    vp9_short_iwalsh4x4_1_add_c(input, dest, stride);
     ((int *)input)[0] = 0;
   }
 }
 
 void vp9_dc_idct_add_lossless_c(int16_t *input, uint8_t *dest,
                                 int stride, int dc) {
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
-
   input[0] = dc;
-  vp9_short_iwalsh4x4_c(input, output, 4 << 1);
+  vp9_short_iwalsh4x4_add(input, dest, stride);
   vpx_memset(input, 0, 32);
-  vp9_add_residual_4x4(output, dest, stride);
 }
 
 void vp9_idct_add_8x8_c(int16_t *input, uint8_t *dest, int stride, int eob) {
diff --git a/vp9/decoder/x86/vp9_dequantize_sse2.c b/vp9/decoder/x86/vp9_dequantize_sse2.c
index 72036c2..54ec67f 100644
--- a/vp9/decoder/x86/vp9_dequantize_sse2.c
+++ b/vp9/decoder/x86/vp9_dequantize_sse2.c
@@ -15,49 +15,6 @@
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_idct.h"
 
-void vp9_add_residual_4x4_sse2(const int16_t *diff, uint8_t *dest, int stride) {
-  const int width = 4;
-  const __m128i zero = _mm_setzero_si128();
-
-  // Diff data
-  const __m128i d0 = _mm_loadl_epi64((const __m128i *)(diff + 0 * width));
-  const __m128i d1 = _mm_loadl_epi64((const __m128i *)(diff + 1 * width));
-  const __m128i d2 = _mm_loadl_epi64((const __m128i *)(diff + 2 * width));
-  const __m128i d3 = _mm_loadl_epi64((const __m128i *)(diff + 3 * width));
-
-  // Prediction data.
-  __m128i p0 = _mm_cvtsi32_si128(*(const int *)(dest + 0 * stride));
-  __m128i p1 = _mm_cvtsi32_si128(*(const int *)(dest + 1 * stride));
-  __m128i p2 = _mm_cvtsi32_si128(*(const int *)(dest + 2 * stride));
-  __m128i p3 = _mm_cvtsi32_si128(*(const int *)(dest + 3 * stride));
-
-  p0 = _mm_unpacklo_epi8(p0, zero);
-  p1 = _mm_unpacklo_epi8(p1, zero);
-  p2 = _mm_unpacklo_epi8(p2, zero);
-  p3 = _mm_unpacklo_epi8(p3, zero);
-
-  p0 = _mm_add_epi16(p0, d0);
-  p1 = _mm_add_epi16(p1, d1);
-  p2 = _mm_add_epi16(p2, d2);
-  p3 = _mm_add_epi16(p3, d3);
-
-  p0 = _mm_packus_epi16(p0, p1);
-  p2 = _mm_packus_epi16(p2, p3);
-
-  *(int *)dest = _mm_cvtsi128_si32(p0);
-  dest += stride;
-
-  p0 = _mm_srli_si128(p0, 8);
-  *(int *)dest = _mm_cvtsi128_si32(p0);
-  dest += stride;
-
-  *(int *)dest = _mm_cvtsi128_si32(p2);
-  dest += stride;
-
-  p2 = _mm_srli_si128(p2, 8);
-  *(int *)dest = _mm_cvtsi128_si32(p2);
-}
-
 void vp9_add_constant_residual_8x8_sse2(const int16_t diff, uint8_t *dest,
                                         int stride) {
   uint8_t abs_diff;
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 69ab147..e210597 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -1207,8 +1207,8 @@ static void switch_lossless_mode(VP9_COMP *cpi, int lossless) {
   if (lossless) {
     cpi->mb.fwd_txm8x4 = vp9_short_walsh8x4;
     cpi->mb.fwd_txm4x4 = vp9_short_walsh4x4;
-    cpi->mb.e_mbd.inv_txm4x4_1 = vp9_short_iwalsh4x4_1;
-    cpi->mb.e_mbd.inv_txm4x4 = vp9_short_iwalsh4x4;
+    cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_iwalsh4x4_1_add;
+    cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_iwalsh4x4_add;
     cpi->mb.optimize = 0;
     cpi->common.filter_level = 0;
     cpi->zbin_mode_boost_enabled = 0;
@@ -1216,8 +1216,8 @@
   } else {
     cpi->mb.fwd_txm8x4 = vp9_short_fdct8x4;
     cpi->mb.fwd_txm4x4 = vp9_short_fdct4x4;
-    cpi->mb.e_mbd.inv_txm4x4_1 = vp9_short_idct4x4_1;
-    cpi->mb.e_mbd.inv_txm4x4 = vp9_short_idct4x4;
+    cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_idct4x4_1_add;
+    cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_idct4x4_add;
   }
 }
 
diff --git a/vp9/encoder/vp9_encodeintra.c b/vp9/encoder/vp9_encodeintra.c
index fe5bdb3..f8cf50f 100644
--- a/vp9/encoder/vp9_encodeintra.c
+++ b/vp9/encoder/vp9_encodeintra.c
@@ -53,9 +53,6 @@ static void encode_intra4x4block(MACROBLOCK *x, int ib,
   int16_t* const src_diff =
       raster_block_offset_int16(xd, bsize, 0, ib,
                                 x->plane[0].src_diff);
-  int16_t* const diff =
-      raster_block_offset_int16(xd, bsize, 0, ib,
-                                xd->plane[0].diff);
   int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff, ib, 16);
   const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
 
@@ -72,17 +69,15 @@ static void encode_intra4x4block(MACROBLOCK *x, int ib,
   if (tx_type != DCT_DCT) {
     vp9_short_fht4x4(src_diff, coeff, 4 << bwl, tx_type);
     x->quantize_b_4x4(x, ib, tx_type, 16);
-    vp9_short_iht4x4(BLOCK_OFFSET(xd->plane[0].dqcoeff, ib, 16),
-                     diff, 4 << bwl, tx_type);
+    vp9_short_iht4x4_add(BLOCK_OFFSET(xd->plane[0].dqcoeff, ib, 16), dst,
+                         xd->plane[0].dst.stride, tx_type);
   } else {
     x->fwd_txm4x4(src_diff, coeff, 8 << bwl);
     x->quantize_b_4x4(x, ib, tx_type, 16);
-    vp9_inverse_transform_b_4x4(&x->e_mbd, xd->plane[0].eobs[ib],
+    vp9_inverse_transform_b_4x4_add(&x->e_mbd, xd->plane[0].eobs[ib],
                                 BLOCK_OFFSET(xd->plane[0].dqcoeff, ib, 16),
-                                diff, 8 << bwl);
+                                dst, xd->plane[0].dst.stride);
   }
-
-  vp9_recon_b(dst, diff, 4 << bwl, dst, xd->plane[0].dst.stride);
 }
 
 void vp9_encode_intra4x4mby(MACROBLOCK *mb, BLOCK_SIZE_TYPE bsize) {
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index bbc97da..84b3507 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -425,7 +425,6 @@ struct encode_b_args {
   VP9_COMMON *cm;
   MACROBLOCK *x;
   struct optimize_ctx *ctx;
-  int *wip_txfrm_size;  // for "work in progress" only... will remove once done
 };
 
 static void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize,
@@ -494,14 +493,9 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
                          int ss_txfrm_size, void *arg) {
   struct encode_b_args* const args = arg;
   MACROBLOCK* const x = args->x;
-  int *wip_txfrm_size = args->wip_txfrm_size;
   MACROBLOCKD* const xd = &x->e_mbd;
-  const int bw = 4 << (b_width_log2(bsize) - xd->plane[plane].subsampling_x);
   const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane,
                                                        block, ss_txfrm_size);
-  int16_t* const diff = raster_block_offset_int16(xd, bsize, plane,
-                                                  raster_block,
-                                                  xd->plane[plane].diff);
   uint8_t* const dst = raster_block_offset_uint8(xd, bsize, plane,
                                                  raster_block,
                                                  xd->plane[plane].dst.buf,
@@ -517,7 +511,6 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
     case TX_32X32:
       vp9_short_idct32x32_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
                               dst, xd->plane[plane].dst.stride);
-      *wip_txfrm_size = 32;
       break;
     case TX_16X16:
       tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT;
@@ -529,7 +522,6 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
                                 block, 16), dst, xd->plane[plane].dst.stride,
                             tx_type);
       }
-      *wip_txfrm_size = 16;
       break;
     case TX_8X8:
       tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT;
@@ -541,7 +533,6 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
                                 block, 16), dst, xd->plane[plane].dst.stride,
                             tx_type);
       }
-      *wip_txfrm_size = 8;
      break;
     case TX_4X4:
       tx_type = plane == 0 ? get_tx_type_4x4(xd, raster_block) : DCT_DCT;
@@ -549,13 +540,13 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
         // this is like vp9_short_idct4x4 but has a special case around eob<=1
         // which is significant (not just an optimization) for the lossless
         // case.
-        vp9_inverse_transform_b_4x4(xd, xd->plane[plane].eobs[block],
-            BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16), diff, bw * 2);
+        vp9_inverse_transform_b_4x4_add(xd, xd->plane[plane].eobs[block],
+            BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16), dst,
+            xd->plane[plane].dst.stride);
       } else {
-        vp9_short_iht4x4(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
-                         diff, bw, tx_type);
+        vp9_short_iht4x4_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
+                             dst, xd->plane[plane].dst.stride, tx_type);
       }
-      *wip_txfrm_size = 4;
       break;
   }
 }
@@ -563,16 +554,15 @@ void vp9_xform_quant_sby(VP9_COMMON *const cm, MACROBLOCK *x,
                          BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD* const xd = &x->e_mbd;
-  struct encode_b_args arg = {cm, x, NULL, NULL};
+  struct encode_b_args arg = {cm, x, NULL};
 
-  foreach_transformed_block_in_plane(xd, bsize, 0,
-                                     xform_quant, &arg);
+  foreach_transformed_block_in_plane(xd, bsize, 0, xform_quant, &arg);
 }
 
 void vp9_xform_quant_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
                           BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD* const xd = &x->e_mbd;
-  struct encode_b_args arg = {cm, x, NULL, NULL};
+  struct encode_b_args arg = {cm, x, NULL};
 
   foreach_transformed_block_uv(xd, bsize, xform_quant, &arg);
 }
@@ -581,61 +571,37 @@ void vp9_encode_sby(VP9_COMMON *const cm, MACROBLOCK *x,
                     BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD* const xd = &x->e_mbd;
   struct optimize_ctx ctx;
-  int wip_txfrm_size = 0;
-  struct encode_b_args arg = {cm, x, &ctx, &wip_txfrm_size};
+  struct encode_b_args arg = {cm, x, &ctx};
 
   vp9_subtract_sby(x, bsize);
   if (x->optimize)
     vp9_optimize_init(xd, bsize, &ctx);
 
-  foreach_transformed_block_in_plane(xd, bsize, 0,
-                                     encode_block, &arg);
-  if (wip_txfrm_size < 8)
-    vp9_recon_sby(xd, bsize);
+  foreach_transformed_block_in_plane(xd, bsize, 0, encode_block, &arg);
 }
 
 void vp9_encode_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
                      BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD* const xd = &x->e_mbd;
   struct optimize_ctx ctx;
-  int wip_txfrm_size = 0;
-  struct encode_b_args arg = {cm, x, &ctx, &wip_txfrm_size};
+  struct encode_b_args arg = {cm, x, &ctx};
 
   vp9_subtract_sbuv(x, bsize);
   if (x->optimize)
     vp9_optimize_init(xd, bsize, &ctx);
 
   foreach_transformed_block_uv(xd, bsize, encode_block, &arg);
-
-  if (wip_txfrm_size < 8)
-    vp9_recon_sbuv(xd, bsize);
 }
 
 void vp9_encode_sb(VP9_COMMON *const cm, MACROBLOCK *x,
                    BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD* const xd = &x->e_mbd;
   struct optimize_ctx ctx;
-  int wip_txfrm_size = 0;
-  struct encode_b_args arg = {cm, x, &ctx, &wip_txfrm_size};
+  struct encode_b_args arg = {cm, x, &ctx};
 
   vp9_subtract_sb(x, bsize);
   if (x->optimize)
     vp9_optimize_init(xd, bsize, &ctx);
 
-#if 0
-  foreach_transformed_block(xd, bsize, encode_block, &arg);
-
-  vp9_recon_sb(xd, bsize);
-#else
-  // wip version... will use foreach_transformed_block when done
-  foreach_transformed_block_in_plane(xd, bsize, 0,
-                                     encode_block, &arg);
-  if (wip_txfrm_size < 8)
-    vp9_recon_sby(xd, bsize);
-  wip_txfrm_size = 0;
-  foreach_transformed_block_uv(xd, bsize, encode_block, &arg);
-
-  if (wip_txfrm_size < 8)
-    vp9_recon_sbuv(xd, bsize);
-#endif
+  foreach_transformed_block(xd, bsize, encode_block, &arg);
 }
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 2d3fea9..94aa0c2 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -1129,11 +1129,11 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) {
   cpi->oxcf.lossless = oxcf->lossless;
 
   if (cpi->oxcf.lossless) {
-    cpi->mb.e_mbd.inv_txm4x4_1 = vp9_short_iwalsh4x4_1;
-    cpi->mb.e_mbd.inv_txm4x4 = vp9_short_iwalsh4x4;
+    cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_iwalsh4x4_1_add;
+    cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_iwalsh4x4_add;
   } else {
-    cpi->mb.e_mbd.inv_txm4x4_1 = vp9_short_idct4x4_1;
-    cpi->mb.e_mbd.inv_txm4x4 = vp9_short_idct4x4;
+    cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_idct4x4_1_add;
+    cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_idct4x4_add;
   }
 
   cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL;
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index f928e7a..93068b1 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -592,11 +592,6 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
                                        BLOCK_SIZE_SB8X8,
                                        0, ib,
                                        x->plane[0].src_diff);
-  int16_t* const diff =
-      raster_block_offset_int16(xd,
-                                BLOCK_SIZE_SB8X8,
-                                0, ib,
-                                xd->plane[0].diff);
   int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff, ib, 16);
   uint8_t* const dst =
       raster_block_offset_uint8(xd,
@@ -668,18 +663,18 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
   xd->mode_info_context->bmi[ib].as_mode.first =
     (B_PREDICTION_MODE)(*best_mode);
 
-  // inverse transform
-  if (best_tx_type != DCT_DCT)
-    vp9_short_iht4x4(best_dqcoeff, diff, 8, best_tx_type);
-  else
-    xd->inv_txm4x4(best_dqcoeff, diff, 16);
-
   vp9_intra4x4_predict(xd, ib, BLOCK_SIZE_SB8X8, *best_mode,
                        dst, xd->plane[0].dst.stride);
-  vp9_recon_b(dst, diff, 8,
-              dst, xd->plane[0].dst.stride);
+
+  // inverse transform
+  if (best_tx_type != DCT_DCT) {
+    vp9_short_iht4x4_add(best_dqcoeff, dst, xd->plane[0].dst.stride,
+                         best_tx_type);
+  } else {
+    xd->inv_txm4x4_add(best_dqcoeff, dst, xd->plane[0].dst.stride);
+  }
 
   return best_rd;
 }
-- 
2.7.4
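
Appendix (not part of the patch): for readers unfamiliar with the idct/recon
merge, below is a minimal standalone sketch of the pattern, modeled on the new
vp9_short_iwalsh4x4_add_c() above. The row pass still uses a small local temp,
but the column pass clips and adds the reconstructed residual directly into the
uint8_t destination; that is what removes the caller-visible int16_t diff
buffer and the separate vp9_add_residual_4x4() pass. clamp_u8() and main() are
illustrative stand-ins (clamp_u8 plays the role of the codec's clip_pixel()),
and the WHT_UPSCALE_FACTOR value is an assumption here; the real definition
lives in vp9_idct.h.

#include <stdint.h>
#include <stdio.h>

#define WHT_UPSCALE_FACTOR 2  /* assumed; the codec's value is in vp9_idct.h */

/* Stand-in for the codec's clip_pixel(): clamp to the uint8_t pixel range. */
static uint8_t clamp_u8(int v) {
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

/* 4x4 inverse Walsh-Hadamard transform with the residual add folded in:
 * instead of writing an int16_t diff block for a later recon pass, the
 * second (column) pass adds each reconstructed residual sample to the
 * prediction already sitting in dest[] and clips it. */
static void iwalsh4x4_add(const int16_t *input, uint8_t *dest, int stride) {
  int16_t output[16];
  const int16_t *ip = input;
  int16_t *op = output;
  int i;

  /* Row pass: transform into a local temp that never leaves the function. */
  for (i = 0; i < 4; i++) {
    const int a1 = (ip[0] + ip[3]) >> WHT_UPSCALE_FACTOR;
    const int b1 = (ip[1] + ip[2]) >> WHT_UPSCALE_FACTOR;
    const int c1 = (ip[1] - ip[2]) >> WHT_UPSCALE_FACTOR;
    const int d1 = (ip[0] - ip[3]) >> WHT_UPSCALE_FACTOR;

    op[0] = (a1 + b1 + 1) >> 1;
    op[1] = (c1 + d1) >> 1;
    op[2] = (a1 - b1) >> 1;
    op[3] = (d1 - c1) >> 1;

    ip += 4;
    op += 4;
  }

  /* Column pass: reconstruct straight into the destination pixels. */
  ip = output;
  for (i = 0; i < 4; i++) {
    const int a1 = ip[4 * 0] + ip[4 * 3];
    const int b1 = ip[4 * 1] + ip[4 * 2];
    const int c1 = ip[4 * 1] - ip[4 * 2];
    const int d1 = ip[4 * 0] - ip[4 * 3];

    dest[stride * 0] = clamp_u8(dest[stride * 0] + ((a1 + b1 + 1) >> 1));
    dest[stride * 1] = clamp_u8(dest[stride * 1] + ((c1 + d1) >> 1));
    dest[stride * 2] = clamp_u8(dest[stride * 2] + ((a1 - b1) >> 1));
    dest[stride * 3] = clamp_u8(dest[stride * 3] + ((d1 - c1) >> 1));

    ip++;
    dest++;
  }
}

int main(void) {
  int16_t coeffs[16] = { 0 };
  uint8_t dst[16];
  int r, c;

  coeffs[0] = 4 << WHT_UPSCALE_FACTOR;  /* DC-only residual */
  for (r = 0; r < 16; r++)
    dst[r] = 100;                       /* flat prediction */

  iwalsh4x4_add(coeffs, dst, 4);        /* every pixel becomes 101 */

  for (r = 0; r < 4; r++) {
    for (c = 0; c < 4; c++)
      printf("%4d", dst[r * 4 + c]);
    printf("\n");
  }
  return 0;
}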