From 2cf0d4be122f9951b34115401aad069a9464b4c5 Mon Sep 17 00:00:00 2001
From: Scott LaVarnway <slavarnway@google.com>
Date: Tue, 14 May 2013 11:58:13 -0400
Subject: [PATCH] WIP: 32x32 idct/recon merge

This patch eliminates the intermediate diff buffer for the 32x32
transform by combining the short idct and the add-residual step into
one function that writes the reconstruction directly to the
destination. The encoder can use the same code as well.

Change-Id: I4ea09df0e162591e420d869b7431c2e7f89a8c1a
---
 test/dct32x32_test.cc                 | 29 ++++++++-----
 vp9/common/vp9_idct.c                 | 13 +++---
 vp9/common/vp9_rtcd_defs.sh           | 12 ++---
 vp9/common/x86/vp9_idct_intrin_sse2.c | 82 ++++++++++++++++++++---------------
 vp9/decoder/vp9_idct_blk.c            | 12 +----
 vp9/decoder/x86/vp9_dequantize_sse2.c | 59 -------------------------
 vp9/encoder/vp9_encodemb.c            | 49 ++++++++++++++++-----
 7 files changed, 115 insertions(+), 141 deletions(-)

diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc
index a565270..e05d482 100644
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@@ -18,7 +18,7 @@ extern "C" {
 #include "vp9/common/vp9_entropy.h"
 #include "./vp9_rtcd.h"
   void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch);
-  void vp9_short_idct32x32_c(short *input, short *output, int pitch);
+  void vp9_short_idct32x32_add_c(short *input, uint8_t *output, int pitch);
 }
 
 #include "test/acm_random.h"
@@ -91,28 +91,31 @@ static void reference_32x32_dct_2d(int16_t input[32*32], double output[32*32]) {
   }
 }
 
-
 TEST(VP9Idct32x32Test, AccuracyCheck) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   const int count_test_block = 1000;
   for (int i = 0; i < count_test_block; ++i) {
     int16_t in[1024], coeff[1024];
-    int16_t out_c[1024];
+    uint8_t dst[1024], src[1024];
     double out_r[1024];
 
+    for (int j = 0; j < 1024; ++j) {
+      src[j] = rnd.Rand8();
+      dst[j] = rnd.Rand8();
+    }
     // Initialize a test block with input range [-255, 255].
     for (int j = 0; j < 1024; ++j)
-      in[j] = rnd.Rand8() - rnd.Rand8();
+      in[j] = src[j] - dst[j];
 
     reference_32x32_dct_2d(in, out_r);
     for (int j = 0; j < 1024; j++)
       coeff[j] = round(out_r[j]);
-    vp9_short_idct32x32_c(coeff, out_c, 64);
+    vp9_short_idct32x32_add_c(coeff, dst, 32);
     for (int j = 0; j < 1024; ++j) {
-      const int diff = out_c[j] - in[j];
+      const int diff = dst[j] - src[j];
       const int error = diff * diff;
       EXPECT_GE(1, error)
-          << "Error: 3x32 IDCT has error " << error
+          << "Error: 32x32 IDCT has error " << error
           << " at index " << j;
     }
   }
@@ -126,18 +129,22 @@ TEST(VP9Fdct32x32Test, AccuracyCheck) {
   for (int i = 0; i < count_test_block; ++i) {
     int16_t test_input_block[1024];
     int16_t test_temp_block[1024];
-    int16_t test_output_block[1024];
+    uint8_t dst[1024], src[1024];
 
+    for (int j = 0; j < 1024; ++j) {
+      src[j] = rnd.Rand8();
+      dst[j] = rnd.Rand8();
+    }
     // Initialize a test block with input range [-255, 255].
     for (int j = 0; j < 1024; ++j)
-      test_input_block[j] = rnd.Rand8() - rnd.Rand8();
+      test_input_block[j] = src[j] - dst[j];
 
     const int pitch = 64;
     vp9_short_fdct32x32_c(test_input_block, test_temp_block, pitch);
-    vp9_short_idct32x32_c(test_temp_block, test_output_block, pitch);
+    vp9_short_idct32x32_add_c(test_temp_block, dst, 32);
 
     for (int j = 0; j < 1024; ++j) {
-      const unsigned diff = test_input_block[j] - test_output_block[j];
+      const unsigned diff = dst[j] - src[j];
       const unsigned error = diff * diff;
       if (max_error < error)
         max_error = error;
diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c
index 3ec093f..5e6384c 100644
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -1249,10 +1249,9 @@ static void idct32_1d(int16_t *input, int16_t *output) {
   output[31] = step1[0] - step1[31];
 }
 
-void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct32x32_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
   int16_t out[32 * 32];
   int16_t *outptr = out;
-  const int half_pitch = pitch >> 1;
   int i, j;
   int16_t temp_in[32], temp_out[32];
 
@@ -1269,7 +1268,8 @@ void vp9_short_idct32x32_c(int16_t *input, int16_t *output, int pitch) {
       temp_in[j] = out[j * 32 + i];
     idct32_1d(temp_in, temp_out);
     for (j = 0; j < 32; ++j)
-      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+                                  + dest[j * dest_stride + i]);
   }
 }
 
@@ -1279,10 +1279,10 @@ void vp9_short_idct1_32x32_c(int16_t *input, int16_t *output) {
   output[0] = ROUND_POWER_OF_TWO(out, 6);
 }
 
-void vp9_short_idct10_32x32_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct10_32x32_add_c(int16_t *input, uint8_t *dest,
+                                  int dest_stride) {
   int16_t out[32 * 32];
   int16_t *outptr = out;
-  const int half_pitch = pitch >> 1;
   int i, j;
   int16_t temp_in[32], temp_out[32];
 
@@ -1302,6 +1302,7 @@ void vp9_short_idct10_32x32_c(int16_t *input, int16_t *output, int pitch) {
       temp_in[j] = out[j * 32 + i];
     idct32_1d(temp_in, temp_out);
     for (j = 0; j < 32; ++j)
-      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+                                  + dest[j * dest_stride + i]);
   }
 }
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 02d3253..c45d030 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -94,9 +94,6 @@ specialize vp9_add_residual_8x8 sse2
 prototype void vp9_add_residual_16x16 "const int16_t *diff, uint8_t *dest, int stride"
 specialize vp9_add_residual_16x16 sse2
 
-prototype void vp9_add_residual_32x32 "const int16_t *diff, uint8_t *dest, int stride"
-specialize vp9_add_residual_32x32 sse2
-
 prototype void vp9_add_constant_residual_8x8 "const int16_t diff, uint8_t *dest, int stride"
 specialize vp9_add_constant_residual_8x8 sse2
 
@@ -212,15 +209,14 @@ specialize vp9_short_idct10_16x16 sse2
 prototype void vp9_short_idct1_16x16 "int16_t *input, int16_t *output"
 specialize vp9_short_idct1_16x16
 
-
-prototype void vp9_short_idct32x32 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct32x32 sse2
+prototype void vp9_short_idct32x32_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct32x32_add sse2
 
 prototype void vp9_short_idct1_32x32 "int16_t *input, int16_t *output"
 specialize vp9_short_idct1_32x32
 
-prototype void vp9_short_idct10_32x32 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct10_32x32
+prototype void vp9_short_idct10_32x32_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct10_32x32_add
 
 prototype void vp9_short_iht8x8 "int16_t *input, int16_t *output, int pitch, int tx_type"
 specialize vp9_short_iht8x8
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index dd7e68a..e53a937 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -1319,8 +1319,7 @@ void vp9_short_idct10_16x16_sse2(int16_t *input, int16_t *output, int pitch) {
   }
 }
 
-void vp9_short_idct32x32_sse2(int16_t *input, int16_t *output, int pitch) {
-  const int half_pitch = pitch >> 1;
+void vp9_short_idct32x32_add_sse2(int16_t *input, uint8_t *dest, int stride) {
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i final_rounding = _mm_set1_epi16(1<<5);
 
@@ -1832,6 +1831,8 @@ void vp9_short_idct32x32_sse2(int16_t *input, int16_t *output, int pitch) {
       col[i * 32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
       col[i * 32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
     } else {
+      const __m128i zero = _mm_setzero_si128();
+
       // 2_D: Calculate the results and store them to destination.
       in0 = _mm_add_epi16(stp1_0, stp1_31);
       in1 = _mm_add_epi16(stp1_1, stp1_30);
@@ -1933,41 +1934,50 @@ void vp9_short_idct32x32_sse2(int16_t *input, int16_t *output, int pitch) {
       in30 = _mm_srai_epi16(in30, 6);
       in31 = _mm_srai_epi16(in31, 6);
 
-      // Store results
-      _mm_store_si128((__m128i *)output, in0);
-      _mm_store_si128((__m128i *)(output + half_pitch * 1), in1);
-      _mm_store_si128((__m128i *)(output + half_pitch * 2), in2);
-      _mm_store_si128((__m128i *)(output + half_pitch * 3), in3);
-      _mm_store_si128((__m128i *)(output + half_pitch * 4), in4);
-      _mm_store_si128((__m128i *)(output + half_pitch * 5), in5);
-      _mm_store_si128((__m128i *)(output + half_pitch * 6), in6);
-      _mm_store_si128((__m128i *)(output + half_pitch * 7), in7);
-      _mm_store_si128((__m128i *)(output + half_pitch * 8), in8);
-      _mm_store_si128((__m128i *)(output + half_pitch * 9), in9);
-      _mm_store_si128((__m128i *)(output + half_pitch * 10), in10);
-      _mm_store_si128((__m128i *)(output + half_pitch * 11), in11);
-      _mm_store_si128((__m128i *)(output + half_pitch * 12), in12);
-      _mm_store_si128((__m128i *)(output + half_pitch * 13), in13);
-      _mm_store_si128((__m128i *)(output + half_pitch * 14), in14);
-      _mm_store_si128((__m128i *)(output + half_pitch * 15), in15);
-      _mm_store_si128((__m128i *)(output + half_pitch * 16), in16);
-      _mm_store_si128((__m128i *)(output + half_pitch * 17), in17);
-      _mm_store_si128((__m128i *)(output + half_pitch * 18), in18);
-      _mm_store_si128((__m128i *)(output + half_pitch * 19), in19);
-      _mm_store_si128((__m128i *)(output + half_pitch * 20), in20);
-      _mm_store_si128((__m128i *)(output + half_pitch * 21), in21);
-      _mm_store_si128((__m128i *)(output + half_pitch * 22), in22);
-      _mm_store_si128((__m128i *)(output + half_pitch * 23), in23);
-      _mm_store_si128((__m128i *)(output + half_pitch * 24), in24);
-      _mm_store_si128((__m128i *)(output + half_pitch * 25), in25);
-      _mm_store_si128((__m128i *)(output + half_pitch * 26), in26);
-      _mm_store_si128((__m128i *)(output + half_pitch * 27), in27);
-      _mm_store_si128((__m128i *)(output + half_pitch * 28), in28);
-      _mm_store_si128((__m128i *)(output + half_pitch * 29), in29);
-      _mm_store_si128((__m128i *)(output + half_pitch * 30), in30);
-      _mm_store_si128((__m128i *)(output + half_pitch * 31), in31);
+#define RECON_AND_STORE(dest, in_x) \
+  {                                                     \
+     __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
+      d0 = _mm_unpacklo_epi8(d0, zero); \
+      in_x = _mm_add_epi16(in_x, d0); \
+      in_x = _mm_packus_epi16(in_x, in_x); \
+      _mm_storel_epi64((__m128i *)(dest), in_x); \
+      dest += stride; \
+  }
 
-      output += 8;
+      RECON_AND_STORE(dest, in0);
+      RECON_AND_STORE(dest, in1);
+      RECON_AND_STORE(dest, in2);
+      RECON_AND_STORE(dest, in3);
+      RECON_AND_STORE(dest, in4);
+      RECON_AND_STORE(dest, in5);
+      RECON_AND_STORE(dest, in6);
+      RECON_AND_STORE(dest, in7);
+      RECON_AND_STORE(dest, in8);
+      RECON_AND_STORE(dest, in9);
+      RECON_AND_STORE(dest, in10);
+      RECON_AND_STORE(dest, in11);
+      RECON_AND_STORE(dest, in12);
+      RECON_AND_STORE(dest, in13);
+      RECON_AND_STORE(dest, in14);
+      RECON_AND_STORE(dest, in15);
+      RECON_AND_STORE(dest, in16);
+      RECON_AND_STORE(dest, in17);
+      RECON_AND_STORE(dest, in18);
+      RECON_AND_STORE(dest, in19);
+      RECON_AND_STORE(dest, in20);
+      RECON_AND_STORE(dest, in21);
+      RECON_AND_STORE(dest, in22);
+      RECON_AND_STORE(dest, in23);
+      RECON_AND_STORE(dest, in24);
+      RECON_AND_STORE(dest, in25);
+      RECON_AND_STORE(dest, in26);
+      RECON_AND_STORE(dest, in27);
+      RECON_AND_STORE(dest, in28);
+      RECON_AND_STORE(dest, in29);
+      RECON_AND_STORE(dest, in30);
+      RECON_AND_STORE(dest, in31);
+
+      dest += 8 - (stride * 32);
     }
   }
 }
diff --git a/vp9/decoder/vp9_idct_blk.c b/vp9/decoder/vp9_idct_blk.c
index 3480df2..faaee73 100644
--- a/vp9/decoder/vp9_idct_blk.c
+++ b/vp9/decoder/vp9_idct_blk.c
@@ -109,10 +109,6 @@ void vp9_add_residual_16x16_c(const int16_t *diff, uint8_t *dest, int stride) {
   add_residual(diff, dest, stride, 16, 16);
 }
 
-void vp9_add_residual_32x32_c(const int16_t *diff, uint8_t *dest, int stride) {
-  add_residual(diff, dest, stride, 32, 32);
-}
-
 static void add_constant_residual(const int16_t diff, uint8_t *dest, int stride,
                                   int width, int height) {
   int r, c;
@@ -321,20 +317,16 @@ void vp9_idct_add_32x32_c(int16_t *input, uint8_t *dest, int stride, int eob) {
       input[0] = 0;
 #if !CONFIG_SCATTERSCAN
     } else if (eob <= 10) {
-      // the idct halves ( >> 1) the pitch
-      vp9_short_idct10_32x32(input, output, 64);
-
+      vp9_short_idct10_32x32_add_c(input, dest, stride);
       input[0] = input[1] = input[2] = input[3] = 0;
       input[32] = input[33] = input[34] = 0;
       input[64] = input[65] = 0;
       input[96] = 0;
 
-      vp9_add_residual_32x32(output, dest, stride);
 #endif
     } else {
-      vp9_short_idct32x32(input, output, 64);
+      vp9_short_idct32x32_add(input, dest, stride);
       vpx_memset(input, 0, 2048);
-      vp9_add_residual_32x32(output, dest, stride);
     }
   }
 }
diff --git a/vp9/decoder/x86/vp9_dequantize_sse2.c b/vp9/decoder/x86/vp9_dequantize_sse2.c
index 1296b70..38fd5aa 100644
--- a/vp9/decoder/x86/vp9_dequantize_sse2.c
+++ b/vp9/decoder/x86/vp9_dequantize_sse2.c
@@ -181,65 +181,6 @@ void vp9_add_residual_16x16_sse2(const int16_t *diff, uint8_t *dest,
   } while (--i);
 }
 
-void vp9_add_residual_32x32_sse2(const int16_t *diff, uint8_t *dest,
-                                 int stride) {
-  const int width = 32;
-  int i = 16;
-  const __m128i zero = _mm_setzero_si128();
-
-  // Diff data
-  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
-  __m128i p0, p1, p2, p3, p4, p5, p6, p7;
-
-  do {
-    d0 = _mm_load_si128((const __m128i *)(diff + 0 * width));
-    d1 = _mm_load_si128((const __m128i *)(diff + 0 * width + 8));
-    d2 = _mm_load_si128((const __m128i *)(diff + 0 * width + 16));
-    d3 = _mm_load_si128((const __m128i *)(diff + 0 * width + 24));
-    d4 = _mm_load_si128((const __m128i *)(diff + 1 * width));
-    d5 = _mm_load_si128((const __m128i *)(diff + 1 * width + 8));
-    d6 = _mm_load_si128((const __m128i *)(diff + 1 * width + 16));
-    d7 = _mm_load_si128((const __m128i *)(diff + 1 * width + 24));
-
-    // Prediction data.
-    p1 = _mm_load_si128((const __m128i *)(dest + 0 * stride));
-    p3 = _mm_load_si128((const __m128i *)(dest + 0 * stride + 16));
-    p5 = _mm_load_si128((const __m128i *)(dest + 1 * stride));
-    p7 = _mm_load_si128((const __m128i *)(dest + 1 * stride + 16));
-
-    p0 = _mm_unpacklo_epi8(p1, zero);
-    p1 = _mm_unpackhi_epi8(p1, zero);
-    p2 = _mm_unpacklo_epi8(p3, zero);
-    p3 = _mm_unpackhi_epi8(p3, zero);
-    p4 = _mm_unpacklo_epi8(p5, zero);
-    p5 = _mm_unpackhi_epi8(p5, zero);
-    p6 = _mm_unpacklo_epi8(p7, zero);
-    p7 = _mm_unpackhi_epi8(p7, zero);
-
-    p0 = _mm_add_epi16(p0, d0);
-    p1 = _mm_add_epi16(p1, d1);
-    p2 = _mm_add_epi16(p2, d2);
-    p3 = _mm_add_epi16(p3, d3);
-    p4 = _mm_add_epi16(p4, d4);
-    p5 = _mm_add_epi16(p5, d5);
-    p6 = _mm_add_epi16(p6, d6);
-    p7 = _mm_add_epi16(p7, d7);
-
-    p0 = _mm_packus_epi16(p0, p1);
-    p1 = _mm_packus_epi16(p2, p3);
-    p2 = _mm_packus_epi16(p4, p5);
-    p3 = _mm_packus_epi16(p6, p7);
-
-    _mm_store_si128((__m128i *)(dest + 0 * stride), p0);
-    _mm_store_si128((__m128i *)(dest + 0 * stride + 16), p1);
-    _mm_store_si128((__m128i *)(dest + 1 * stride), p2);
-    _mm_store_si128((__m128i *)(dest + 1 * stride + 16), p3);
-
-    diff += 2 * width;
-    dest += 2 * stride;
-  } while (--i);
-}
-
 void vp9_add_constant_residual_8x8_sse2(const int16_t diff, uint8_t *dest,
                                         int stride) {
   uint8_t abs_diff;
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index e4002d6..db18555 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -425,6 +425,7 @@ struct encode_b_args {
   VP9_COMMON *cm;
   MACROBLOCK *x;
   struct optimize_ctx *ctx;
+  int *wip_txfrm_size;  // for "work in progress" only... will remove once done
 };
 
 static void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize,
@@ -493,6 +494,7 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
                          int ss_txfrm_size, void *arg) {
   struct encode_b_args* const args = arg;
   MACROBLOCK* const x = args->x;
+  int *wip_txfrm_size = args->wip_txfrm_size;
   MACROBLOCKD* const xd = &x->e_mbd;
   const int bw = 4 << (b_width_log2(bsize) - xd->plane[plane].subsampling_x);
   const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane,
@@ -500,6 +502,10 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
   int16_t* const diff = raster_block_offset_int16(xd, bsize, plane,
                                                   raster_block,
                                                   xd->plane[plane].diff);
+  uint8_t* const dst = raster_block_offset_uint8(xd, bsize, plane,
+                                                 raster_block,
+                                                 xd->plane[plane].dst.buf,
+                                                 xd->plane[plane].dst.stride);
   TX_TYPE tx_type = DCT_DCT;
 
   xform_quant(plane, block, bsize, ss_txfrm_size, arg);
@@ -509,8 +515,9 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
 
   switch (ss_txfrm_size / 2) {
     case TX_32X32:
-      vp9_short_idct32x32(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
-                          diff, bw * 2);
+        vp9_short_idct32x32_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff,
+                                block, 16), dst, xd->plane[plane].dst.stride);
+        *wip_txfrm_size = 32;
       break;
     case TX_16X16:
       tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT;
@@ -521,6 +528,7 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
         vp9_short_iht16x16(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
                            diff, bw, tx_type);
       }
+      *wip_txfrm_size = 16;
       break;
     case TX_8X8:
       tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT;
@@ -531,6 +539,7 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
         vp9_short_iht8x8(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
                          diff, bw, tx_type);
       }
+      *wip_txfrm_size = 8;
       break;
     case TX_4X4:
       tx_type = plane == 0 ? get_tx_type_4x4(xd, raster_block) : DCT_DCT;
@@ -544,6 +553,7 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
         vp9_short_iht4x4(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
                          diff, bw, tx_type);
       }
+      *wip_txfrm_size = 4;
       break;
   }
 }
@@ -551,7 +561,7 @@ static void encode_block(int plane, int block, BLOCK_SIZE_TYPE bsize,
 void vp9_xform_quant_sby(VP9_COMMON *const cm, MACROBLOCK *x,
                          BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD* const xd = &x->e_mbd;
-  struct encode_b_args arg = {cm, x, NULL};
+  struct encode_b_args arg = {cm, x, NULL, NULL};
 
   foreach_transformed_block_in_plane(xd, bsize, 0,
                                      xform_quant, &arg);
@@ -560,7 +570,7 @@ void vp9_xform_quant_sby(VP9_COMMON *const cm, MACROBLOCK *x,
 void vp9_xform_quant_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
                          BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD* const xd = &x->e_mbd;
-  struct encode_b_args arg = {cm, x, NULL};
+  struct encode_b_args arg = {cm, x, NULL, NULL};
 
   foreach_transformed_block_uv(xd, bsize, xform_quant, &arg);
 }
@@ -569,7 +579,8 @@ void vp9_encode_sby(VP9_COMMON *const cm, MACROBLOCK *x,
                     BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD* const xd = &x->e_mbd;
   struct optimize_ctx ctx;
-  struct encode_b_args arg = {cm, x, &ctx};
+  int wip_txfrm_size = 0;
+  struct encode_b_args arg = {cm, x, &ctx, &wip_txfrm_size};
 
   vp9_subtract_sby(x, bsize);
   if (x->optimize)
@@ -577,15 +588,16 @@ void vp9_encode_sby(VP9_COMMON *const cm, MACROBLOCK *x,
 
   foreach_transformed_block_in_plane(xd, bsize, 0,
                                      encode_block, &arg);
-
-  vp9_recon_sby(xd, bsize);
+  if (wip_txfrm_size < 32)
+    vp9_recon_sby(xd, bsize);
 }
 
 void vp9_encode_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
                      BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD* const xd = &x->e_mbd;
   struct optimize_ctx ctx;
-  struct encode_b_args arg = {cm, x, &ctx};
+  int wip_txfrm_size = 0;
+  struct encode_b_args arg = {cm, x, &ctx, &wip_txfrm_size};
 
   vp9_subtract_sbuv(x, bsize);
   if (x->optimize)
@@ -593,20 +605,35 @@ void vp9_encode_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
 
   foreach_transformed_block_uv(xd, bsize, encode_block, &arg);
 
-  vp9_recon_sbuv(xd, bsize);
+  if (wip_txfrm_size < 32)
+    vp9_recon_sbuv(xd, bsize);
 }
 
 void vp9_encode_sb(VP9_COMMON *const cm, MACROBLOCK *x,
                    BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD* const xd = &x->e_mbd;
   struct optimize_ctx ctx;
-  struct encode_b_args arg = {cm, x, &ctx};
+  int wip_txfrm_size = 0;
+  struct encode_b_args arg = {cm, x, &ctx, &wip_txfrm_size};
 
   vp9_subtract_sb(x, bsize);
   if (x->optimize)
     vp9_optimize_init(xd, bsize, &ctx);
-
+#if 0
   foreach_transformed_block(xd, bsize, encode_block, &arg);
 
   vp9_recon_sb(xd, bsize);
+#else
+  // wip version... will use foreach_transformed_block when done
+  foreach_transformed_block_in_plane(xd, bsize, 0,
+                                     encode_block, &arg);
+  if (wip_txfrm_size < 32)
+    vp9_recon_sby(xd, bsize);
+  wip_txfrm_size = 0;
+
+  foreach_transformed_block_uv(xd, bsize, encode_block, &arg);
+
+  if (wip_txfrm_size < 32)
+    vp9_recon_sbuv(xd, bsize);
+#endif
 }
-- 
2.7.4