Enable 16x16 Hadamard transform in SATD based mode decision

author Jingning Han <jingning@google.com>
Mon, 30 Mar 2015 19:31:46 +0000 (12:31 -0700)

committer Gerrit Code Review <gerrit@gerrit.golo.chromium.org>
Mon, 30 Mar 2015 22:43:31 +0000 (15:43 -0700)
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl

index ed43010..887f407 100644 (file)
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -1112,8 +1112,8 @@ specialize qw/vp9_avg_4x4 sse2/;
  add_proto qw/void vp9_hadamard_8x8/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
  specialize qw/vp9_hadamard_8x8 sse2/;
  
-add_proto qw/void vp9_hadamard_16x16/, "int16_t *coeff";
-specialize qw/vp9_hadamard_16x16/;
+add_proto qw/void vp9_hadamard_16x16/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
+specialize qw/vp9_hadamard_16x16 sse2/;
  
  add_proto qw/int16_t vp9_satd/, "const int16_t *coeff, int length";
  specialize qw/vp9_satd sse2/;
diff --git a/vp9/encoder/vp9_avg.c b/vp9/encoder/vp9_avg.c

index b5632a2..58daa3a 100644 (file)
--- a/vp9/encoder/vp9_avg.c
+++ b/vp9/encoder/vp9_avg.c
@@ -78,8 +78,15 @@ void vp9_hadamard_8x8_c(int16_t const *src_diff, int src_stride,
  }
  
  // In place 16x16 2D Hadamard transform
-void vp9_hadamard_16x16_c(int16_t *coeff) {
+void vp9_hadamard_16x16_c(int16_t const *src_diff, int src_stride,
+                          int16_t *coeff) {
    int idx;
+  for (idx = 0; idx < 4; ++idx) {
+    int16_t const *src_ptr = src_diff + (idx >> 1) * 8 * src_stride
+                                + (idx & 0x01) * 8;
+    vp9_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64);
+  }
+
    for (idx = 0; idx < 64; ++idx) {
      int16_t a0 = coeff[0];
      int16_t a1 = coeff[64];
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c

index fa1f94d..1221b3a 100644 (file)
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -375,7 +375,7 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
                                    scan_order->scan, scan_order->iscan);
              break;
            case TX_16X16:
-            vp9_fdct16x16(src_diff, coeff, diff_stride);
+            vp9_hadamard_16x16(src_diff, diff_stride, (int16_t *)coeff);
              vp9_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
                              p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
                              pd->dequant, eob,
diff --git a/vp9/encoder/x86/vp9_avg_intrin_sse2.c b/vp9/encoder/x86/vp9_avg_intrin_sse2.c

index 3f78b8a..ee3ead4 100644 (file)
--- a/vp9/encoder/x86/vp9_avg_intrin_sse2.c
+++ b/vp9/encoder/x86/vp9_avg_intrin_sse2.c
@@ -165,6 +165,44 @@ void vp9_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
    _mm_storeu_si128((__m128i *)coeff, src[7]);
  }
  
+void vp9_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride,
+                             int16_t *coeff) {
+  int idx;
+  for (idx = 0; idx < 4; ++idx) {
+    int16_t const *src_ptr = src_diff + (idx >> 1) * 8 * src_stride
+                                + (idx & 0x01) * 8;
+    vp9_hadamard_8x8_sse2(src_ptr, src_stride, coeff + idx * 64);
+  }
+
+  for (idx = 0; idx < 64; idx += 8) {
+    __m128i coeff0 = _mm_load_si128((const __m128i *)coeff);
+    __m128i coeff1 = _mm_load_si128((const __m128i *)(coeff + 64));
+    __m128i coeff2 = _mm_load_si128((const __m128i *)(coeff + 128));
+    __m128i coeff3 = _mm_load_si128((const __m128i *)(coeff + 192));
+
+    __m128i b0 = _mm_add_epi16(coeff0, coeff1);
+    __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
+    __m128i b2 = _mm_add_epi16(coeff2, coeff3);
+    __m128i b3 = _mm_sub_epi16(coeff2, coeff3);
+
+    coeff0 = _mm_add_epi16(b0, b2);
+    coeff1 = _mm_add_epi16(b1, b3);
+    coeff0 = _mm_srai_epi16(coeff0, 1);
+    coeff1 = _mm_srai_epi16(coeff1, 1);
+    _mm_store_si128((__m128i *)coeff, coeff0);
+    _mm_store_si128((__m128i *)(coeff + 64), coeff1);
+
+    coeff2 = _mm_sub_epi16(b0, b2);
+    coeff3 = _mm_sub_epi16(b1, b3);
+    coeff2 = _mm_srai_epi16(coeff2, 1);
+    coeff3 = _mm_srai_epi16(coeff3, 1);
+    _mm_store_si128((__m128i *)(coeff + 128), coeff2);
+    _mm_store_si128((__m128i *)(coeff + 192), coeff3);
+
+    coeff += 8;
+  }
+}
+
  int16_t vp9_satd_sse2(const int16_t *coeff, int length) {
    int i;
    __m128i sum = _mm_load_si128((const __m128i *)coeff);
author	Jingning Han <jingning@google.com>
	Mon, 30 Mar 2015 19:31:46 +0000 (12:31 -0700)
committer	Gerrit Code Review <gerrit@gerrit.golo.chromium.org>
	Mon, 30 Mar 2015 22:43:31 +0000 (15:43 -0700)
vp9/common/vp9_rtcd_defs.pl		patch | blob | history
vp9/encoder/vp9_avg.c		patch | blob | history
vp9/encoder/vp9_pickmode.c		patch | blob | history
vp9/encoder/x86/vp9_avg_intrin_sse2.c		patch | blob | history