Merge "Use bigdia search with pruned subpel search"

author Deb Mukherjee <debargha@google.com>

Fri, 12 Sep 2014 23:42:18 +0000 (16:42 -0700)

committer Gerrit Code Review <gerrit@gerrit.golo.chromium.org>

Fri, 12 Sep 2014 23:42:18 +0000 (16:42 -0700)
author Deb Mukherjee <debargha@google.com>
Fri, 12 Sep 2014 23:42:18 +0000 (16:42 -0700)
committer Gerrit Code Review <gerrit@gerrit.golo.chromium.org>
Fri, 12 Sep 2014 23:42:18 +0000 (16:42 -0700)
diff --git a/build/make/rtcd.pl b/build/make/rtcd.pl

index 5b0cefa..40bcb33 100755 (executable)
--- a/build/make/rtcd.pl
+++ b/build/make/rtcd.pl
@@ -49,7 +49,7 @@ open CONFIG_FILE, $opts{config} or
  
  my %config = ();
  while (<CONFIG_FILE>) {
-  next if !/^CONFIG_/;
+  next if !/^(?:CONFIG_|HAVE_)/;
    chomp;
    my @pair = split /=/;
    $config{$pair[0]} = $pair[1];
diff --git a/configure b/configure

index 32b70f1..7b9c211 100755 (executable)
--- a/configure
+++ b/configure
@@ -281,6 +281,7 @@ EXPERIMENT_LIST="
      spatial_svc
      vp9_temporal_denoising
      fp_mb_stats
+    emulate_hardware_highbitdepth
  "
  CONFIG_LIST="
      external_build
diff --git a/examples.mk b/examples.mk

index bd38c41..fd67a44 100644 (file)
--- a/examples.mk
+++ b/examples.mk
@@ -114,7 +114,7 @@ vpx_temporal_svc_encoder.SRCS        += video_common.h
  vpx_temporal_svc_encoder.SRCS        += video_writer.h video_writer.c
  vpx_temporal_svc_encoder.GUID        = B18C08F2-A439-4502-A78E-849BE3D60947
  vpx_temporal_svc_encoder.DESCRIPTION = Temporal SVC Encoder
-EXAMPLES-$(CONFIG_VP8_DECODER)     += simple_decoder.c
+EXAMPLES-$(CONFIG_DECODERS)        += simple_decoder.c
  simple_decoder.GUID                 = D3BBF1E9-2427-450D-BBFF-B2843C1D44CC
  simple_decoder.SRCS                += ivfdec.h ivfdec.c
  simple_decoder.SRCS                += tools_common.h tools_common.c
@@ -123,7 +123,7 @@ simple_decoder.SRCS                += video_reader.h video_reader.c
  simple_decoder.SRCS                += vpx_ports/mem_ops.h
  simple_decoder.SRCS                += vpx_ports/mem_ops_aligned.h
  simple_decoder.DESCRIPTION          = Simplified decoder loop
-EXAMPLES-$(CONFIG_VP8_DECODER)     += postproc.c
+EXAMPLES-$(CONFIG_DECODERS)        += postproc.c
  postproc.SRCS                      += ivfdec.h ivfdec.c
  postproc.SRCS                      += tools_common.h tools_common.c
  postproc.SRCS                      += video_common.h
@@ -132,7 +132,7 @@ postproc.SRCS                      += vpx_ports/mem_ops.h
  postproc.SRCS                      += vpx_ports/mem_ops_aligned.h
  postproc.GUID                       = 65E33355-F35E-4088-884D-3FD4905881D7
  postproc.DESCRIPTION                = Decoder postprocessor control
-EXAMPLES-$(CONFIG_VP8_DECODER)     += decode_to_md5.c
+EXAMPLES-$(CONFIG_DECODERS)        += decode_to_md5.c
  decode_to_md5.SRCS                 += md5_utils.h md5_utils.c
  decode_to_md5.SRCS                 += ivfdec.h ivfdec.c
  decode_to_md5.SRCS                 += tools_common.h tools_common.c
@@ -142,29 +142,34 @@ decode_to_md5.SRCS                 += vpx_ports/mem_ops.h
  decode_to_md5.SRCS                 += vpx_ports/mem_ops_aligned.h
  decode_to_md5.GUID                  = 59120B9B-2735-4BFE-B022-146CA340FE42
  decode_to_md5.DESCRIPTION           = Frame by frame MD5 checksum
-EXAMPLES-$(CONFIG_VP8_ENCODER)  += simple_encoder.c
+EXAMPLES-$(CONFIG_ENCODERS)     += simple_encoder.c
  simple_encoder.SRCS             += ivfenc.h ivfenc.c
  simple_encoder.SRCS             += tools_common.h tools_common.c
  simple_encoder.SRCS             += video_common.h
  simple_encoder.SRCS             += video_writer.h video_writer.c
  simple_encoder.GUID              = 4607D299-8A71-4D2C-9B1D-071899B6FBFD
  simple_encoder.DESCRIPTION       = Simplified encoder loop
-EXAMPLES-$(CONFIG_VP8_ENCODER)  += twopass_encoder.c
+EXAMPLES-$(CONFIG_VP9_ENCODER)  += vp9_lossless_encoder.c
+vp9_lossless_encoder.SRCS       += ivfenc.h ivfenc.c
+vp9_lossless_encoder.SRCS       += tools_common.h tools_common.c
+vp9_lossless_encoder.SRCS       += video_common.h
+vp9_lossless_encoder.SRCS       += video_writer.h video_writer.c
+vp9_lossless_encoder.GUID        = B63C7C88-5348-46DC-A5A6-CC151EF93366
+vp9_lossless_encoder.DESCRIPTION = Simplified lossless VP9 encoder
+EXAMPLES-$(CONFIG_ENCODERS)     += twopass_encoder.c
  twopass_encoder.SRCS            += ivfenc.h ivfenc.c
  twopass_encoder.SRCS            += tools_common.h tools_common.c
  twopass_encoder.SRCS            += video_common.h
  twopass_encoder.SRCS            += video_writer.h video_writer.c
  twopass_encoder.GUID             = 73494FA6-4AF9-4763-8FBB-265C92402FD8
  twopass_encoder.DESCRIPTION      = Two-pass encoder loop
-ifeq ($(CONFIG_DECODERS),yes)
-EXAMPLES-$(CONFIG_VP8_ENCODER)  += decode_with_drops.c
+EXAMPLES-$(CONFIG_DECODERS)     += decode_with_drops.c
  decode_with_drops.SRCS          += ivfdec.h ivfdec.c
  decode_with_drops.SRCS          += tools_common.h tools_common.c
  decode_with_drops.SRCS          += video_common.h
  decode_with_drops.SRCS          += video_reader.h video_reader.c
  decode_with_drops.SRCS          += vpx_ports/mem_ops.h
  decode_with_drops.SRCS          += vpx_ports/mem_ops_aligned.h
-endif
  decode_with_drops.GUID           = CE5C53C4-8DDA-438A-86ED-0DDD3CDB8D26
  decode_with_drops.DESCRIPTION    = Drops frames while decoding
  EXAMPLES-$(CONFIG_ENCODERS)        += set_maps.c
diff --git a/examples/decode_to_md5.c b/examples/decode_to_md5.c

index 1c56303..fbc0f4a 100644 (file)
--- a/examples/decode_to_md5.c
+++ b/examples/decode_to_md5.c
@@ -33,8 +33,6 @@
  #include <stdlib.h>
  #include <string.h>
  
-#define VPX_CODEC_DISABLE_COMPAT 1
-
  #include "vpx/vp8dx.h"
  #include "vpx/vpx_decoder.h"
  
diff --git a/examples/decode_with_drops.c b/examples/decode_with_drops.c

index a20fdac..9423e38 100644 (file)
--- a/examples/decode_with_drops.c
+++ b/examples/decode_with_drops.c
@@ -56,8 +56,6 @@
  #include <stdlib.h>
  #include <string.h>
  
-#define VPX_CODEC_DISABLE_COMPAT 1
-
  #include "vpx/vp8dx.h"
  #include "vpx/vpx_decoder.h"
  
diff --git a/examples/postproc.c b/examples/postproc.c

index 59c50b1..c74347c 100644 (file)
--- a/examples/postproc.c
+++ b/examples/postproc.c
@@ -43,8 +43,6 @@
  #include <stdlib.h>
  #include <string.h>
  
-#define VPX_CODEC_DISABLE_COMPAT 1
-
  #include "vpx/vp8dx.h"
  #include "vpx/vpx_decoder.h"
  
diff --git a/examples/set_maps.c b/examples/set_maps.c

index 2ee5bca..851adc4 100644 (file)
--- a/examples/set_maps.c
+++ b/examples/set_maps.c
@@ -47,7 +47,6 @@
  #include <stdlib.h>
  #include <string.h>
  
-#define VPX_CODEC_DISABLE_COMPAT 1
  #include "vpx/vp8cx.h"
  #include "vpx/vpx_encoder.h"
  
diff --git a/examples/simple_decoder.c b/examples/simple_decoder.c

index 3318758..3f7d6aa 100644 (file)
--- a/examples/simple_decoder.c
+++ b/examples/simple_decoder.c
@@ -29,9 +29,7 @@
  // -----------------
  // For decoders, you only have to include `vpx_decoder.h` and then any
  // header files for the specific codecs you use. In this case, we're using
-// vp8. The `VPX_CODEC_DISABLE_COMPAT` macro can be defined to ensure
-// strict compliance with the latest SDK by disabling some backwards
-// compatibility features. Defining this macro is encouraged.
+// vp8.
  //
  // Initializing The Codec
  // ----------------------
@@ -81,8 +79,6 @@
  #include <stdlib.h>
  #include <string.h>
  
-#define VPX_CODEC_DISABLE_COMPAT 1
-
  #include "vpx/vp8dx.h"
  #include "vpx/vpx_decoder.h"
  
diff --git a/examples/simple_encoder.c b/examples/simple_encoder.c

index 30bb73a..f20c246 100644 (file)
--- a/examples/simple_encoder.c
+++ b/examples/simple_encoder.c
@@ -28,9 +28,7 @@
  // -----------------
  // For encoders, you only have to include `vpx_encoder.h` and then any
  // header files for the specific codecs you use. In this case, we're using
-// vp8. The `VPX_CODEC_DISABLE_COMPAT` macro can be defined to ensure
-// strict compliance with the latest SDK by disabling some backwards
-// compatibility features. Defining this macro is encouraged.
+// vp8.
  //
  // Getting The Default Configuration
  // ---------------------------------
@@ -101,7 +99,6 @@
  #include <stdlib.h>
  #include <string.h>
  
-#define VPX_CODEC_DISABLE_COMPAT 1
  #include "vpx/vpx_encoder.h"
  
  #include "./tools_common.h"
diff --git a/examples/twopass_encoder.c b/examples/twopass_encoder.c

index 76d5a28..653ae94 100644 (file)
--- a/examples/twopass_encoder.c
+++ b/examples/twopass_encoder.c
@@ -28,9 +28,8 @@
  // Encoding A Frame
  // ----------------
  // Encoding a frame in two pass mode is identical to the simple encoder
-// example, except the deadline is set to VPX_DL_BEST_QUALITY to get the
-// best quality possible. VPX_DL_GOOD_QUALITY could also be used.
-//
+// example. To increase the quality while sacrificing encoding speed,
+// VPX_DL_BEST_QUALITY can be used in place of VPX_DL_GOOD_QUALITY.
  //
  // Processing Statistics Packets
  // -----------------------------
@@ -52,7 +51,6 @@
  #include <stdlib.h>
  #include <string.h>
  
-#define VPX_CODEC_DISABLE_COMPAT 1
  #include "vpx/vpx_encoder.h"
  
  #include "./tools_common.h"
@@ -142,13 +140,13 @@ static vpx_fixed_buf_t pass0(vpx_image_t *raw,
    // Calculate frame statistics.
    while (vpx_img_read(raw, infile)) {
      ++frame_count;
-    get_frame_stats(&codec, raw, frame_count, 1, 0, VPX_DL_BEST_QUALITY,
+    get_frame_stats(&codec, raw, frame_count, 1, 0, VPX_DL_GOOD_QUALITY,
                      &stats);
    }
  
    // Flush encoder.
    while (get_frame_stats(&codec, NULL, frame_count, 1, 0,
-                         VPX_DL_BEST_QUALITY, &stats)) {}
+                         VPX_DL_GOOD_QUALITY, &stats)) {}
  
    printf("Pass 0 complete. Processed %d frames.\n", frame_count);
    if (vpx_codec_destroy(&codec))
@@ -182,11 +180,11 @@ static void pass1(vpx_image_t *raw,
    // Encode frames.
    while (vpx_img_read(raw, infile)) {
      ++frame_count;
-    encode_frame(&codec, raw, frame_count, 1, 0, VPX_DL_BEST_QUALITY, writer);
+    encode_frame(&codec, raw, frame_count, 1, 0, VPX_DL_GOOD_QUALITY, writer);
    }
  
    // Flush encoder.
-  while (encode_frame(&codec, NULL, -1, 1, 0, VPX_DL_BEST_QUALITY, writer)) {}
+  while (encode_frame(&codec, NULL, -1, 1, 0, VPX_DL_GOOD_QUALITY, writer)) {}
  
    printf("\n");
  
diff --git a/examples/vp8_multi_resolution_encoder.c b/examples/vp8_multi_resolution_encoder.c

index 7c050fa..9f50dc7 100644 (file)
--- a/examples/vp8_multi_resolution_encoder.c
+++ b/examples/vp8_multi_resolution_encoder.c
@@ -24,7 +24,6 @@
  #include "third_party/libyuv/include/libyuv/scale.h"
  #include "third_party/libyuv/include/libyuv/cpu_id.h"
  
-#define VPX_CODEC_DISABLE_COMPAT 1
  #include "vpx/vpx_encoder.h"
  #include "vpx/vp8cx.h"
  
diff --git a/examples/vp8cx_set_ref.c b/examples/vp8cx_set_ref.c

index 5f3f0a3..b0961a2 100644 (file)
--- a/examples/vp8cx_set_ref.c
+++ b/examples/vp8cx_set_ref.c
@@ -50,7 +50,6 @@
  #include <stdlib.h>
  #include <string.h>
  
-#define VPX_CODEC_DISABLE_COMPAT 1
  #include "vpx/vp8cx.h"
  #include "vpx/vpx_encoder.h"
  
diff --git a/examples/vp9_lossless_encoder.c b/examples/vp9_lossless_encoder.c

new file mode 100644 (file)

index 0000000..3fcda0c
--- /dev/null
+++ b/examples/vp9_lossless_encoder.c
@@ -0,0 +1,144 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "vpx/vpx_encoder.h"
+#include "vpx/vp8cx.h"
+
+#include "./tools_common.h"
+#include "./video_writer.h"
+
+static const char *exec_name;
+
+void usage_exit() {
+  fprintf(stderr, "vp9_lossless_encoder: Example demonstrating VP9 lossless "
+                  "encoding feature. Supports raw input only.\n");
+  fprintf(stderr, "Usage: %s <width> <height> <infile> <outfile>\n", exec_name);
+  exit(EXIT_FAILURE);
+}
+
+static int encode_frame(vpx_codec_ctx_t *codec,
+                        vpx_image_t *img,
+                        int frame_index,
+                        int flags,
+                        VpxVideoWriter *writer) {
+  int got_pkts = 0;
+  vpx_codec_iter_t iter = NULL;
+  const vpx_codec_cx_pkt_t *pkt = NULL;
+  const vpx_codec_err_t res = vpx_codec_encode(codec, img, frame_index, 1,
+                                               flags, VPX_DL_GOOD_QUALITY);
+  if (res != VPX_CODEC_OK)
+    die_codec(codec, "Failed to encode frame");
+
+  while ((pkt = vpx_codec_get_cx_data(codec, &iter)) != NULL) {
+    got_pkts = 1;
+
+    if (pkt->kind == VPX_CODEC_CX_FRAME_PKT) {
+      const int keyframe = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) != 0;
+      if (!vpx_video_writer_write_frame(writer,
+                                        pkt->data.frame.buf,
+                                        pkt->data.frame.sz,
+                                        pkt->data.frame.pts)) {
+        die_codec(codec, "Failed to write compressed frame");
+      }
+      printf(keyframe ? "K" : ".");
+      fflush(stdout);
+    }
+  }
+
+  return got_pkts;
+}
+
+int main(int argc, char **argv) {
+  FILE *infile = NULL;
+  vpx_codec_ctx_t codec;
+  vpx_codec_enc_cfg_t cfg;
+  int frame_count = 0;
+  vpx_image_t raw;
+  vpx_codec_err_t res;
+  VpxVideoInfo info = {0};
+  VpxVideoWriter *writer = NULL;
+  const VpxInterface *encoder = NULL;
+  const int fps = 30;
+
+  exec_name = argv[0];
+
+  if (argc < 5)
+    die("Invalid number of arguments");
+
+  encoder = get_vpx_encoder_by_name("vp9");
+  if (!encoder)
+     die("Unsupported codec.");
+
+  info.codec_fourcc = encoder->fourcc;
+  info.frame_width = strtol(argv[1], NULL, 0);
+  info.frame_height = strtol(argv[2], NULL, 0);
+  info.time_base.numerator = 1;
+  info.time_base.denominator = fps;
+
+  if (info.frame_width <= 0 ||
+      info.frame_height <= 0 ||
+      (info.frame_width % 2) != 0 ||
+      (info.frame_height % 2) != 0) {
+    die("Invalid frame size: %dx%d", info.frame_width, info.frame_height);
+  }
+
+  if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, info.frame_width,
+                                             info.frame_height, 1)) {
+    die("Failed to allocate image.");
+  }
+
+  printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface()));
+
+  res = vpx_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
+  if (res)
+    die_codec(&codec, "Failed to get default codec config.");
+
+  cfg.g_w = info.frame_width;
+  cfg.g_h = info.frame_height;
+  cfg.g_timebase.num = info.time_base.numerator;
+  cfg.g_timebase.den = info.time_base.denominator;
+
+  writer = vpx_video_writer_open(argv[4], kContainerIVF, &info);
+  if (!writer)
+    die("Failed to open %s for writing.", argv[4]);
+
+  if (!(infile = fopen(argv[3], "rb")))
+    die("Failed to open %s for reading.", argv[3]);
+
+  if (vpx_codec_enc_init(&codec, encoder->codec_interface(), &cfg, 0))
+    die_codec(&codec, "Failed to initialize encoder");
+
+  if (vpx_codec_control_(&codec, VP9E_SET_LOSSLESS, 1))
+    die_codec(&codec, "Failed to use lossless mode");
+
+  // Encode frames.
+  while (vpx_img_read(&raw, infile)) {
+    encode_frame(&codec, &raw, frame_count++, 0, writer);
+  }
+
+  // Flush encoder.
+  while (encode_frame(&codec, NULL, -1, 0, writer)) {}
+
+  printf("\n");
+  fclose(infile);
+  printf("Processed %d frames.\n", frame_count);
+
+  vpx_img_free(&raw);
+  if (vpx_codec_destroy(&codec))
+    die_codec(&codec, "Failed to destroy codec.");
+
+  vpx_video_writer_close(writer);
+
+  return EXIT_SUCCESS;
+}
diff --git a/examples/vp9_spatial_svc_encoder.c b/examples/vp9_spatial_svc_encoder.c

index 81d3800..7ce3403 100644 (file)
--- a/examples/vp9_spatial_svc_encoder.c
+++ b/examples/vp9_spatial_svc_encoder.c
@@ -282,7 +282,7 @@ int main(int argc, const char **argv) {
    int frame_duration = 1; /* 1 timebase tick per frame */
    FILE *infile = NULL;
    int end_of_stream = 0;
-  int frame_size;
+  int frames_received = 0;
  
    memset(&svc_ctx, 0, sizeof(svc_ctx));
    svc_ctx.log_print = 1;
@@ -325,6 +325,8 @@ int main(int argc, const char **argv) {
  
    // Encode frames
    while (!end_of_stream) {
+    vpx_codec_iter_t iter = NULL;
+    const vpx_codec_cx_pkt_t *cx_pkt;
      if (frame_cnt >= app_input.frames_to_code || !vpx_img_read(&raw, infile)) {
        // We need one extra vpx_svc_encode call at end of stream to flush
        // encoder and get remaining data
@@ -337,18 +339,34 @@ int main(int argc, const char **argv) {
      if (res != VPX_CODEC_OK) {
        die_codec(&codec, "Failed to encode frame");
      }
-    if (!(app_input.passes == 2 && app_input.pass == 1)) {
-      while ((frame_size = vpx_svc_get_frame_size(&svc_ctx)) > 0) {
-        vpx_video_writer_write_frame(writer,
-                                     vpx_svc_get_buffer(&svc_ctx),
-                                     frame_size, pts);
+
+    while ((cx_pkt = vpx_codec_get_cx_data(&codec, &iter)) != NULL) {
+      switch (cx_pkt->kind) {
+        case VPX_CODEC_CX_FRAME_PKT: {
+          if (cx_pkt->data.frame.sz > 0)
+            vpx_video_writer_write_frame(writer,
+                                         cx_pkt->data.frame.buf,
+                                         cx_pkt->data.frame.sz,
+                                         cx_pkt->data.frame.pts);
+
+          printf("SVC frame: %d, kf: %d, size: %d, pts: %d\n", frames_received,
+                 !!(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY),
+                 (int)cx_pkt->data.frame.sz, (int)cx_pkt->data.frame.pts);
+          ++frames_received;
+          break;
+        }
+        case VPX_CODEC_STATS_PKT: {
+          stats_write(&app_input.rc_stats,
+                      cx_pkt->data.twopass_stats.buf,
+                      cx_pkt->data.twopass_stats.sz);
+          break;
+        }
+        default: {
+          break;
+        }
        }
      }
-    if (vpx_svc_get_rc_stats_buffer_size(&svc_ctx) > 0) {
-      stats_write(&app_input.rc_stats,
-                  vpx_svc_get_rc_stats_buffer(&svc_ctx),
-                  vpx_svc_get_rc_stats_buffer_size(&svc_ctx));
-    }
+
      if (!end_of_stream) {
        ++frame_cnt;
        pts += frame_duration;
diff --git a/examples/vpx_temporal_svc_encoder.c b/examples/vpx_temporal_svc_encoder.c

index 5eac92c..3de983e 100644 (file)
--- a/examples/vpx_temporal_svc_encoder.c
+++ b/examples/vpx_temporal_svc_encoder.c
@@ -18,7 +18,6 @@
  #include <stdlib.h>
  #include <string.h>
  
-#define VPX_CODEC_DISABLE_COMPAT 1
  #include "./vpx_config.h"
  #include "vpx_ports/vpx_timer.h"
  #include "vpx/vp8cx.h"
diff --git a/libs.mk b/libs.mk

index c7c2748..f9f2d80 100644 (file)
--- a/libs.mk
+++ b/libs.mk
@@ -531,7 +531,6 @@ libs.doxy: $(CODEC_DOC_SRCS)
         @echo "    [CREATE] $@"
         @rm -f $@
         @echo "INPUT += $^" >> $@
-       @echo "PREDEFINED = VPX_CODEC_DISABLE_COMPAT" >> $@
         @echo "INCLUDE_PATH += ." >> $@;
         @echo "ENABLED_SECTIONS += $(sort $(CODEC_DOC_SECTIONS))" >> $@
  
diff --git a/test/convolve_test.cc b/test/convolve_test.cc

index 1724db3..d1e7f1d 100644 (file)
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -581,6 +581,8 @@ TEST_P(ConvolveTest, CheckScalingFiltering) {
  
  using std::tr1::make_tuple;
  
+#if CONFIG_VP9_HIGHBITDEPTH
+#else
  const ConvolveFunctions convolve8_c(
      vp9_convolve8_horiz_c, vp9_convolve8_avg_horiz_c,
      vp9_convolve8_vert_c, vp9_convolve8_avg_vert_c,
@@ -600,8 +602,11 @@ INSTANTIATE_TEST_CASE_P(C, ConvolveTest, ::testing::Values(
      make_tuple(64, 32, &convolve8_c),
      make_tuple(32, 64, &convolve8_c),
      make_tuple(64, 64, &convolve8_c)));
+#endif
  
-#if HAVE_SSE2
+#if HAVE_SSE2 && ARCH_X86_64
+#if CONFIG_VP9_HIGHBITDEPTH
+#else
  const ConvolveFunctions convolve8_sse2(
      vp9_convolve8_horiz_sse2, vp9_convolve8_avg_horiz_sse2,
      vp9_convolve8_vert_sse2, vp9_convolve8_avg_vert_sse2,
@@ -622,6 +627,7 @@ INSTANTIATE_TEST_CASE_P(SSE2, ConvolveTest, ::testing::Values(
      make_tuple(32, 64, &convolve8_sse2),
      make_tuple(64, 64, &convolve8_sse2)));
  #endif
+#endif
  
  #if HAVE_SSSE3
  const ConvolveFunctions convolve8_ssse3(
@@ -645,7 +651,7 @@ INSTANTIATE_TEST_CASE_P(SSSE3, ConvolveTest, ::testing::Values(
      make_tuple(64, 64, &convolve8_ssse3)));
  #endif
  
-#if HAVE_AVX2
+#if HAVE_AVX2 && HAVE_SSSE3
  const ConvolveFunctions convolve8_avx2(
      vp9_convolve8_horiz_avx2, vp9_convolve8_avg_horiz_ssse3,
      vp9_convolve8_vert_avx2, vp9_convolve8_avg_vert_ssse3,
@@ -665,7 +671,7 @@ INSTANTIATE_TEST_CASE_P(AVX2, ConvolveTest, ::testing::Values(
      make_tuple(64, 32, &convolve8_avx2),
      make_tuple(32, 64, &convolve8_avx2),
      make_tuple(64, 64, &convolve8_avx2)));
-#endif
+#endif  // HAVE_AVX2 && HAVE_SSSE3
  
  #if HAVE_NEON_ASM
  const ConvolveFunctions convolve8_neon(
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc

index c38cc2e..d1ce109 100644 (file)
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -20,12 +20,9 @@
  
  #include "./vp9_rtcd.h"
  #include "vp9/common/vp9_entropy.h"
+#include "vpx/vpx_codec.h"
  #include "vpx/vpx_integer.h"
  
-extern "C" {
-void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *output, int pitch);
-}
-
  using libvpx_test::ACMRandom;
  
  namespace {
@@ -258,42 +255,72 @@ void reference_16x16_dct_2d(int16_t input[256], double output[256]) {
    }
  }
  
-typedef void (*FdctFunc)(const int16_t *in, int16_t *out, int stride);
-typedef void (*IdctFunc)(const int16_t *in, uint8_t *out, int stride);
-typedef void (*FhtFunc)(const int16_t *in, int16_t *out, int stride,
+typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride);
+typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride);
+typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride,
                          int tx_type);
-typedef void (*IhtFunc)(const int16_t *in, uint8_t *out, int stride,
+typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                          int tx_type);
  
-typedef std::tr1::tuple<FdctFunc, IdctFunc, int> Dct16x16Param;
-typedef std::tr1::tuple<FhtFunc, IhtFunc, int> Ht16x16Param;
+typedef std::tr1::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t> Dct16x16Param;
+typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht16x16Param;
  
-void fdct16x16_ref(const int16_t *in, int16_t *out, int stride,
+void fdct16x16_ref(const int16_t *in, tran_low_t *out, int stride,
                     int /*tx_type*/) {
    vp9_fdct16x16_c(in, out, stride);
  }
  
-void idct16x16_ref(const int16_t *in, uint8_t *dest, int stride,
+void idct16x16_ref(const tran_low_t *in, uint8_t *dest, int stride,
                     int /*tx_type*/) {
    vp9_idct16x16_256_add_c(in, dest, stride);
  }
  
-void fht16x16_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
+void fht16x16_ref(const int16_t *in, tran_low_t *out, int stride,
+                  int tx_type) {
    vp9_fht16x16_c(in, out, stride, tx_type);
  }
  
-void iht16x16_ref(const int16_t *in, uint8_t *dest, int stride, int tx_type) {
+void iht16x16_ref(const tran_low_t *in, uint8_t *dest, int stride,
+                  int tx_type) {
    vp9_iht16x16_256_add_c(in, dest, stride, tx_type);
  }
  
+#if CONFIG_VP9_HIGHBITDEPTH
+void idct16x16_10(const tran_low_t *in, uint8_t *out, int stride) {
+  vp9_high_idct16x16_256_add_c(in, out, stride, 10);
+}
+
+void idct16x16_12(const tran_low_t *in, uint8_t *out, int stride) {
+  vp9_high_idct16x16_256_add_c(in, out, stride, 12);
+}
+
+void idct16x16_10_ref(const tran_low_t *in, uint8_t *out, int stride,
+                      int tx_type) {
+  idct16x16_10(in, out, stride);
+}
+
+void idct16x16_12_ref(const tran_low_t *in, uint8_t *out, int stride,
+                      int tx_type) {
+  idct16x16_12(in, out, stride);
+}
+
+void iht16x16_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
+  vp9_high_iht16x16_256_add_c(in, out, stride, tx_type, 10);
+}
+
+void iht16x16_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
+  vp9_high_iht16x16_256_add_c(in, out, stride, tx_type, 12);
+}
+#endif
+
  class Trans16x16TestBase {
   public:
    virtual ~Trans16x16TestBase() {}
  
   protected:
-  virtual void RunFwdTxfm(int16_t *in, int16_t *out, int stride) = 0;
+  virtual void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) = 0;
  
-  virtual void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) = 0;
+  virtual void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) = 0;
  
    void RunAccuracyCheck() {
      ACMRandom rnd(ACMRandom::DeterministicSeed());
@@ -302,23 +329,48 @@ class Trans16x16TestBase {
      const int count_test_block = 10000;
      for (int i = 0; i < count_test_block; ++i) {
        DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, kNumCoeffs);
-      DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, kNumCoeffs);
+      DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_temp_block, kNumCoeffs);
        DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
        DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+#if CONFIG_VP9_HIGHBITDEPTH
+      DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);
+      DECLARE_ALIGNED_ARRAY(16, uint16_t, src16, kNumCoeffs);
+#endif
  
-      // Initialize a test block with input range [-255, 255].
+      // Initialize a test block with input range [-mask_, mask_].
        for (int j = 0; j < kNumCoeffs; ++j) {
-        src[j] = rnd.Rand8();
-        dst[j] = rnd.Rand8();
-        test_input_block[j] = src[j] - dst[j];
+        if (bit_depth_ == VPX_BITS_8) {
+          src[j] = rnd.Rand8();
+          dst[j] = rnd.Rand8();
+          test_input_block[j] = src[j] - dst[j];
+#if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          src16[j] = rnd.Rand16() & mask_;
+          dst16[j] = rnd.Rand16() & mask_;
+          test_input_block[j] = src16[j] - dst16[j];
+#endif
+        }
        }
  
        ASM_REGISTER_STATE_CHECK(RunFwdTxfm(test_input_block,
                                            test_temp_block, pitch_));
-      ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_));
+      if (bit_depth_ == VPX_BITS_8) {
+        ASM_REGISTER_STATE_CHECK(
+            RunInvTxfm(test_temp_block, dst, pitch_));
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        ASM_REGISTER_STATE_CHECK(
+            RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_));
+#endif
+      }
  
        for (int j = 0; j < kNumCoeffs; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        const uint32_t diff =
+            bit_depth_ == VPX_BITS_8 ?  dst[j] - src[j] : dst16[j] - src16[j];
+#else
          const uint32_t diff = dst[j] - src[j];
+#endif
          const uint32_t error = diff * diff;
          if (max_error < error)
            max_error = error;
@@ -326,10 +378,10 @@ class Trans16x16TestBase {
        }
      }
  
-    EXPECT_GE(1u, max_error)
+    EXPECT_GE(1u  << 2 * (bit_depth_ - 8), max_error)
          << "Error: 16x16 FHT/IHT has an individual round trip error > 1";
  
-    EXPECT_GE(count_test_block , total_error)
+    EXPECT_GE(count_test_block << 2 * (bit_depth_ - 8), total_error)
          << "Error: 16x16 FHT/IHT has average round trip error > 1 per block";
    }
  
@@ -337,13 +389,13 @@ class Trans16x16TestBase {
      ACMRandom rnd(ACMRandom::DeterministicSeed());
      const int count_test_block = 1000;
      DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_ref_block, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_block, kNumCoeffs);
  
      for (int i = 0; i < count_test_block; ++i) {
-      // Initialize a test block with input range [-255, 255].
+      // Initialize a test block with input range [-mask_, mask_].
        for (int j = 0; j < kNumCoeffs; ++j)
-        input_block[j] = rnd.Rand8() - rnd.Rand8();
+        input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
  
        fwd_txfm_ref(input_block, output_ref_block, pitch_, tx_type_);
        ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, pitch_));
@@ -359,21 +411,21 @@ class Trans16x16TestBase {
      const int count_test_block = 1000;
      DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
      DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_ref_block, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_block, kNumCoeffs);
  
      for (int i = 0; i < count_test_block; ++i) {
-      // Initialize a test block with input range [-255, 255].
+      // Initialize a test block with input range [-mask_, mask_].
        for (int j = 0; j < kNumCoeffs; ++j) {
-        input_block[j] = rnd.Rand8() - rnd.Rand8();
-        input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
+        input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
+        input_extreme_block[j] = rnd.Rand8() % 2 ? mask_ : -mask_;
        }
        if (i == 0) {
          for (int j = 0; j < kNumCoeffs; ++j)
-          input_extreme_block[j] = 255;
+          input_extreme_block[j] = mask_;
        } else if (i == 1) {
          for (int j = 0; j < kNumCoeffs; ++j)
-          input_extreme_block[j] = -255;
+          input_extreme_block[j] = -mask_;
        }
  
        fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_);
@@ -383,7 +435,7 @@ class Trans16x16TestBase {
        // The minimum quant value is 4.
        for (int j = 0; j < kNumCoeffs; ++j) {
          EXPECT_EQ(output_block[j], output_ref_block[j]);
-        EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_block[j]))
+        EXPECT_GE(4 * DCT_MAX_VALUE << (bit_depth_ - 8), abs(output_block[j]))
              << "Error: 16x16 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
        }
      }
@@ -394,39 +446,65 @@ class Trans16x16TestBase {
      const int count_test_block = 1000;
      DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
      DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_ref_block, kNumCoeffs);
  
      DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
      DECLARE_ALIGNED_ARRAY(16, uint8_t, ref, kNumCoeffs);
+#if CONFIG_VP9_HIGHBITDEPTH
+    DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, uint16_t, ref16, kNumCoeffs);
+#endif
  
      for (int i = 0; i < count_test_block; ++i) {
-      // Initialize a test block with input range [-255, 255].
+      // Initialize a test block with input range [-mask_, mask_].
        for (int j = 0; j < kNumCoeffs; ++j) {
-        input_block[j] = rnd.Rand8() - rnd.Rand8();
-        input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
+        if (bit_depth_ == VPX_BITS_8)
+          input_block[j] = rnd.Rand8() - rnd.Rand8();
+        else
+          input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
+        input_extreme_block[j] = rnd.Rand8() % 2 ? mask_ : -mask_;
        }
        if (i == 0)
          for (int j = 0; j < kNumCoeffs; ++j)
-          input_extreme_block[j] = 255;
+          input_extreme_block[j] = mask_;
        if (i == 1)
          for (int j = 0; j < kNumCoeffs; ++j)
-          input_extreme_block[j] = -255;
+          input_extreme_block[j] = -mask_;
  
        fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_);
  
        // clear reconstructed pixel buffers
        vpx_memset(dst, 0, kNumCoeffs * sizeof(uint8_t));
        vpx_memset(ref, 0, kNumCoeffs * sizeof(uint8_t));
+#if CONFIG_VP9_HIGHBITDEPTH
+      vpx_memset(dst16, 0, kNumCoeffs * sizeof(uint16_t));
+      vpx_memset(ref16, 0, kNumCoeffs * sizeof(uint16_t));
+#endif
  
        // quantization with maximum allowed step sizes
        output_ref_block[0] = (output_ref_block[0] / dc_thred) * dc_thred;
        for (int j = 1; j < kNumCoeffs; ++j)
          output_ref_block[j] = (output_ref_block[j] / ac_thred) * ac_thred;
-      inv_txfm_ref(output_ref_block, ref, pitch_, tx_type_);
-      ASM_REGISTER_STATE_CHECK(RunInvTxfm(output_ref_block, dst, pitch_));
-
-      for (int j = 0; j < kNumCoeffs; ++j)
-        EXPECT_EQ(ref[j], dst[j]);
+      if (bit_depth_ == VPX_BITS_8) {
+        inv_txfm_ref(output_ref_block, ref, pitch_, tx_type_);
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(output_ref_block, dst, pitch_));
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        inv_txfm_ref(output_ref_block, CONVERT_TO_BYTEPTR(ref16), pitch_,
+                     tx_type_);
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(output_ref_block,
+                                            CONVERT_TO_BYTEPTR(dst16), pitch_));
+#endif
+      }
+      if (bit_depth_ == VPX_BITS_8) {
+        for (int j = 0; j < kNumCoeffs; ++j)
+          EXPECT_EQ(ref[j], dst[j]);
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        for (int j = 0; j < kNumCoeffs; ++j)
+          EXPECT_EQ(ref16[j], dst16[j]);
+#endif
+      }
      }
    }
  
@@ -434,28 +512,52 @@ class Trans16x16TestBase {
      ACMRandom rnd(ACMRandom::DeterministicSeed());
      const int count_test_block = 1000;
      DECLARE_ALIGNED_ARRAY(16, int16_t, in, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, coeff, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, kNumCoeffs);
      DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
      DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+#if CONFIG_VP9_HIGHBITDEPTH
+    DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, uint16_t, src16, kNumCoeffs);
+#endif
  
      for (int i = 0; i < count_test_block; ++i) {
        double out_r[kNumCoeffs];
  
        // Initialize a test block with input range [-255, 255].
        for (int j = 0; j < kNumCoeffs; ++j) {
-        src[j] = rnd.Rand8();
-        dst[j] = rnd.Rand8();
-        in[j] = src[j] - dst[j];
+        if (bit_depth_ == VPX_BITS_8) {
+          src[j] = rnd.Rand8();
+          dst[j] = rnd.Rand8();
+          in[j] = src[j] - dst[j];
+#if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          src16[j] = rnd.Rand16() & mask_;
+          dst16[j] = rnd.Rand16() & mask_;
+          in[j] = src16[j] - dst16[j];
+#endif
+        }
        }
  
        reference_16x16_dct_2d(in, out_r);
        for (int j = 0; j < kNumCoeffs; ++j)
          coeff[j] = round(out_r[j]);
  
-      ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, 16));
+      if (bit_depth_ == VPX_BITS_8) {
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, 16));
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16),
+                                            16));
+#endif
+      }
  
        for (int j = 0; j < kNumCoeffs; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        const uint32_t diff =
+            bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
+#else
          const uint32_t diff = dst[j] - src[j];
+#endif
          const uint32_t error = diff * diff;
          EXPECT_GE(1u, error)
              << "Error: 16x16 IDCT has error " << error
@@ -465,6 +567,8 @@ class Trans16x16TestBase {
    }
    int pitch_;
    int tx_type_;
+  vpx_bit_depth_t bit_depth_;
+  int mask_;
    FhtFunc fwd_txfm_ref;
    IhtFunc inv_txfm_ref;
  };
@@ -479,17 +583,34 @@ class Trans16x16DCT
      fwd_txfm_ = GET_PARAM(0);
      inv_txfm_ = GET_PARAM(1);
      tx_type_  = GET_PARAM(2);
+    bit_depth_ = GET_PARAM(3);
      pitch_    = 16;
      fwd_txfm_ref = fdct16x16_ref;
      inv_txfm_ref = idct16x16_ref;
+    mask_ = (1 << bit_depth_) - 1;
+#if CONFIG_VP9_HIGHBITDEPTH
+    switch (bit_depth_) {
+      case 10:
+        inv_txfm_ref = idct16x16_10_ref;
+        break;
+      case 12:
+        inv_txfm_ref = idct16x16_12_ref;
+        break;
+      default:
+        inv_txfm_ref = idct16x16_ref;
+        break;
+    }
+#else
+    inv_txfm_ref = idct16x16_ref;
+#endif
    }
    virtual void TearDown() { libvpx_test::ClearSystemState(); }
  
   protected:
-  void RunFwdTxfm(int16_t *in, int16_t *out, int stride) {
+  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {
      fwd_txfm_(in, out, stride);
    }
-  void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) {
+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
      inv_txfm_(out, dst, stride);
    }
  
@@ -529,17 +650,34 @@ class Trans16x16HT
      fwd_txfm_ = GET_PARAM(0);
      inv_txfm_ = GET_PARAM(1);
      tx_type_  = GET_PARAM(2);
+    bit_depth_ = GET_PARAM(3);
      pitch_    = 16;
      fwd_txfm_ref = fht16x16_ref;
      inv_txfm_ref = iht16x16_ref;
+    mask_ = (1 << bit_depth_) - 1;
+#if CONFIG_VP9_HIGHBITDEPTH
+    switch (bit_depth_) {
+      case VPX_BITS_10:
+        inv_txfm_ref = iht16x16_10;
+        break;
+      case VPX_BITS_12:
+        inv_txfm_ref = iht16x16_12;
+        break;
+      default:
+        inv_txfm_ref = iht16x16_ref;
+        break;
+    }
+#else
+    inv_txfm_ref = iht16x16_ref;
+#endif
    }
    virtual void TearDown() { libvpx_test::ClearSystemState(); }
  
   protected:
-  void RunFwdTxfm(int16_t *in, int16_t *out, int stride) {
+  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {
      fwd_txfm_(in, out, stride, tx_type_);
    }
-  void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) {
+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
      inv_txfm_(out, dst, stride, tx_type_);
    }
  
@@ -567,45 +705,78 @@ TEST_P(Trans16x16HT, QuantCheck) {
  
  using std::tr1::make_tuple;
  
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+    C, Trans16x16DCT,
+    ::testing::Values(
+        make_tuple(&vp9_high_fdct16x16_c, &idct16x16_10, 0, VPX_BITS_10),
+        make_tuple(&vp9_high_fdct16x16_c, &idct16x16_12, 0, VPX_BITS_12),
+        make_tuple(&vp9_fdct16x16_c, &vp9_idct16x16_256_add_c, 0, VPX_BITS_8)));
+#else
  INSTANTIATE_TEST_CASE_P(
      C, Trans16x16DCT,
      ::testing::Values(
-        make_tuple(&vp9_fdct16x16_c, &vp9_idct16x16_256_add_c, 0)));
+        make_tuple(&vp9_fdct16x16_c, &vp9_idct16x16_256_add_c, 0, VPX_BITS_8)));
+#endif
+
+#if CONFIG_VP9_HIGHBITDEPTH
  INSTANTIATE_TEST_CASE_P(
      C, Trans16x16HT,
      ::testing::Values(
-        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 0),
-        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1),
-        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2),
-        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3)));
+        make_tuple(&vp9_high_fht16x16_c, &iht16x16_10, 0, VPX_BITS_10),
+        make_tuple(&vp9_high_fht16x16_c, &iht16x16_10, 1, VPX_BITS_10),
+        make_tuple(&vp9_high_fht16x16_c, &iht16x16_10, 2, VPX_BITS_10),
+        make_tuple(&vp9_high_fht16x16_c, &iht16x16_10, 3, VPX_BITS_10),
+        make_tuple(&vp9_high_fht16x16_c, &iht16x16_12, 0, VPX_BITS_12),
+        make_tuple(&vp9_high_fht16x16_c, &iht16x16_12, 1, VPX_BITS_12),
+        make_tuple(&vp9_high_fht16x16_c, &iht16x16_12, 2, VPX_BITS_12),
+        make_tuple(&vp9_high_fht16x16_c, &iht16x16_12, 3, VPX_BITS_12),
+        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8)));
+#else
+INSTANTIATE_TEST_CASE_P(
+    C, Trans16x16HT,
+    ::testing::Values(
+        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_c, &vp9_iht16x16_256_add_c, 3, VPX_BITS_8)));
+#endif
  
-#if HAVE_NEON_ASM
+#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH
  INSTANTIATE_TEST_CASE_P(
      NEON, Trans16x16DCT,
      ::testing::Values(
          make_tuple(&vp9_fdct16x16_c,
-                   &vp9_idct16x16_256_add_neon, 0)));
+                   &vp9_idct16x16_256_add_neon, 0, VPX_BITS_8)));
  #endif
  
-#if HAVE_SSE2
+#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH
  INSTANTIATE_TEST_CASE_P(
      SSE2, Trans16x16DCT,
      ::testing::Values(
          make_tuple(&vp9_fdct16x16_sse2,
-                   &vp9_idct16x16_256_add_sse2, 0)));
+                   &vp9_idct16x16_256_add_sse2, 0, VPX_BITS_8)));
  INSTANTIATE_TEST_CASE_P(
      SSE2, Trans16x16HT,
      ::testing::Values(
-        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 0),
-        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 1),
-        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 2),
-        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 3)));
+        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 0,
+                   VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 1,
+                   VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 2,
+                   VPX_BITS_8),
+        make_tuple(&vp9_fht16x16_sse2, &vp9_iht16x16_256_add_sse2, 3,
+                   VPX_BITS_8)));
  #endif
  
-#if HAVE_SSSE3
+#if HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH
  INSTANTIATE_TEST_CASE_P(
      SSSE3, Trans16x16DCT,
      ::testing::Values(
-        make_tuple(&vp9_fdct16x16_c, &vp9_idct16x16_256_add_ssse3, 0)));
+        make_tuple(&vp9_fdct16x16_c, &vp9_idct16x16_256_add_ssse3, 0,
+                   VPX_BITS_8)));
  #endif
  }  // namespace
diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc

index d2d437c..c7a1931 100644 (file)
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@@ -21,6 +21,7 @@
  #include "./vpx_config.h"
  #include "./vp9_rtcd.h"
  #include "vp9/common/vp9_entropy.h"
+#include "vpx/vpx_codec.h"
  #include "vpx/vpx_integer.h"
  
  using libvpx_test::ACMRandom;
@@ -71,10 +72,21 @@ void reference_32x32_dct_2d(const int16_t input[kNumCoeffs],
    }
  }
  
-typedef void (*FwdTxfmFunc)(const int16_t *in, int16_t *out, int stride);
-typedef void (*InvTxfmFunc)(const int16_t *in, uint8_t *out, int stride);
+typedef void (*FwdTxfmFunc)(const int16_t *in, tran_low_t *out, int stride);
+typedef void (*InvTxfmFunc)(const tran_low_t *in, uint8_t *out, int stride);
  
-typedef std::tr1::tuple<FwdTxfmFunc, InvTxfmFunc, int> Trans32x32Param;
+typedef std::tr1::tuple<FwdTxfmFunc, InvTxfmFunc, int, vpx_bit_depth_t>
+    Trans32x32Param;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void idct32x32_10(const tran_low_t *in, uint8_t *out, int stride) {
+  vp9_high_idct32x32_1024_add_c(in, out, stride, 10);
+}
+
+void idct32x32_12(const tran_low_t *in, uint8_t *out, int stride) {
+  vp9_high_idct32x32_1024_add_c(in, out, stride, 12);
+}
+#endif
  
  class Trans32x32Test : public ::testing::TestWithParam<Trans32x32Param> {
   public:
@@ -84,12 +96,16 @@ class Trans32x32Test : public ::testing::TestWithParam<Trans32x32Param> {
      inv_txfm_ = GET_PARAM(1);
      version_  = GET_PARAM(2);  // 0: high precision forward transform
                                 // 1: low precision version for rd loop
+    bit_depth_ = GET_PARAM(3);
+    mask_ = (1 << bit_depth_) - 1;
    }
  
    virtual void TearDown() { libvpx_test::ClearSystemState(); }
  
   protected:
    int version_;
+  vpx_bit_depth_t bit_depth_;
+  int mask_;
    FwdTxfmFunc fwd_txfm_;
    InvTxfmFunc inv_txfm_;
  };
@@ -100,23 +116,47 @@ TEST_P(Trans32x32Test, AccuracyCheck) {
    int64_t total_error = 0;
    const int count_test_block = 1000;
    DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_temp_block, kNumCoeffs);
    DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
    DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+#if CONFIG_VP9_HIGHBITDEPTH
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, src16, kNumCoeffs);
+#endif
  
    for (int i = 0; i < count_test_block; ++i) {
-    // Initialize a test block with input range [-255, 255].
+    // Initialize a test block with input range [-mask_, mask_].
      for (int j = 0; j < kNumCoeffs; ++j) {
-      src[j] = rnd.Rand8();
-      dst[j] = rnd.Rand8();
-      test_input_block[j] = src[j] - dst[j];
+      if (bit_depth_ == 8) {
+        src[j] = rnd.Rand8();
+        dst[j] = rnd.Rand8();
+        test_input_block[j] = src[j] - dst[j];
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        src16[j] = rnd.Rand16() & mask_;
+        dst16[j] = rnd.Rand16() & mask_;
+        test_input_block[j] = src16[j] - dst16[j];
+#endif
+      }
      }
  
      ASM_REGISTER_STATE_CHECK(fwd_txfm_(test_input_block, test_temp_block, 32));
-    ASM_REGISTER_STATE_CHECK(inv_txfm_(test_temp_block, dst, 32));
+    if (bit_depth_ == VPX_BITS_8) {
+      ASM_REGISTER_STATE_CHECK(inv_txfm_(test_temp_block, dst, 32));
+#if CONFIG_VP9_HIGHBITDEPTH
+    } else {
+      ASM_REGISTER_STATE_CHECK(inv_txfm_(test_temp_block,
+                                         CONVERT_TO_BYTEPTR(dst16), 32));
+#endif
+    }
  
      for (int j = 0; j < kNumCoeffs; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+      const uint32_t diff =
+          bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
+#else
        const uint32_t diff = dst[j] - src[j];
+#endif
        const uint32_t error = diff * diff;
        if (max_error < error)
          max_error = error;
@@ -129,10 +169,10 @@ TEST_P(Trans32x32Test, AccuracyCheck) {
      total_error /= 45;
    }
  
-  EXPECT_GE(1u, max_error)
+  EXPECT_GE(1u << 2 * (bit_depth_ - 8), max_error)
        << "Error: 32x32 FDCT/IDCT has an individual round-trip error > 1";
  
-  EXPECT_GE(count_test_block, total_error)
+  EXPECT_GE(count_test_block << 2 * (bit_depth_ - 8), total_error)
        << "Error: 32x32 FDCT/IDCT has average round-trip error > 1 per block";
  }
  
@@ -141,12 +181,12 @@ TEST_P(Trans32x32Test, CoeffCheck) {
    const int count_test_block = 1000;
  
    DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_ref_block, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_block, kNumCoeffs);
  
    for (int i = 0; i < count_test_block; ++i) {
      for (int j = 0; j < kNumCoeffs; ++j)
-      input_block[j] = rnd.Rand8() - rnd.Rand8();
+      input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
  
      const int stride = 32;
      vp9_fdct32x32_c(input_block, output_ref_block, stride);
@@ -170,21 +210,21 @@ TEST_P(Trans32x32Test, MemCheck) {
  
    DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
    DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_ref_block, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_block, kNumCoeffs);
  
    for (int i = 0; i < count_test_block; ++i) {
-    // Initialize a test block with input range [-255, 255].
+    // Initialize a test block with input range [-mask_, mask_].
      for (int j = 0; j < kNumCoeffs; ++j) {
-      input_block[j] = rnd.Rand8() - rnd.Rand8();
-      input_extreme_block[j] = rnd.Rand8() & 1 ? 255 : -255;
+      input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
+      input_extreme_block[j] = rnd.Rand8() & 1 ? mask_ : -mask_;
      }
      if (i == 0) {
        for (int j = 0; j < kNumCoeffs; ++j)
-        input_extreme_block[j] = 255;
+        input_extreme_block[j] = mask_;
      } else if (i == 1) {
        for (int j = 0; j < kNumCoeffs; ++j)
-        input_extreme_block[j] = -255;
+        input_extreme_block[j] = -mask_;
      }
  
      const int stride = 32;
@@ -201,9 +241,9 @@ TEST_P(Trans32x32Test, MemCheck) {
          EXPECT_GE(6, abs(output_block[j] - output_ref_block[j]))
              << "Error: 32x32 FDCT rd has mismatched coefficients";
        }
-      EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_ref_block[j]))
+      EXPECT_GE(4 * DCT_MAX_VALUE << (bit_depth_ - 8), abs(output_ref_block[j]))
            << "Error: 32x32 FDCT C has coefficient larger than 4*DCT_MAX_VALUE";
-      EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_block[j]))
+      EXPECT_GE(4 * DCT_MAX_VALUE << (bit_depth_ - 8), abs(output_block[j]))
            << "Error: 32x32 FDCT has coefficient larger than "
            << "4*DCT_MAX_VALUE";
      }
@@ -214,26 +254,49 @@ TEST_P(Trans32x32Test, InverseAccuracy) {
    ACMRandom rnd(ACMRandom::DeterministicSeed());
    const int count_test_block = 1000;
    DECLARE_ALIGNED_ARRAY(16, int16_t, in, kNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, coeff, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, kNumCoeffs);
    DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
    DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+#if CONFIG_VP9_HIGHBITDEPTH
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, src16, kNumCoeffs);
+#endif
  
    for (int i = 0; i < count_test_block; ++i) {
      double out_r[kNumCoeffs];
  
      // Initialize a test block with input range [-255, 255]
      for (int j = 0; j < kNumCoeffs; ++j) {
-      src[j] = rnd.Rand8();
-      dst[j] = rnd.Rand8();
-      in[j] = src[j] - dst[j];
+      if (bit_depth_ == VPX_BITS_8) {
+        src[j] = rnd.Rand8();
+        dst[j] = rnd.Rand8();
+        in[j] = src[j] - dst[j];
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        src16[j] = rnd.Rand16() & mask_;
+        dst16[j] = rnd.Rand16() & mask_;
+        in[j] = src16[j] - dst16[j];
+#endif
+      }
      }
  
      reference_32x32_dct_2d(in, out_r);
      for (int j = 0; j < kNumCoeffs; ++j)
        coeff[j] = round(out_r[j]);
-    ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, dst, 32));
+    if (bit_depth_ == VPX_BITS_8) {
+      ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, dst, 32));
+#if CONFIG_VP9_HIGHBITDEPTH
+    } else {
+      ASM_REGISTER_STATE_CHECK(inv_txfm_(coeff, CONVERT_TO_BYTEPTR(dst16), 32));
+#endif
+    }
      for (int j = 0; j < kNumCoeffs; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+      const int diff =
+          bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
+#else
        const int diff = dst[j] - src[j];
+#endif
        const int error = diff * diff;
        EXPECT_GE(1, error)
            << "Error: 32x32 IDCT has error " << error
@@ -244,39 +307,59 @@ TEST_P(Trans32x32Test, InverseAccuracy) {
  
  using std::tr1::make_tuple;
  
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+    C, Trans32x32Test,
+    ::testing::Values(
+        make_tuple(&vp9_high_fdct32x32_c,
+                   &idct32x32_10, 0, VPX_BITS_10),
+        make_tuple(&vp9_high_fdct32x32_rd_c,
+                   &idct32x32_10, 1, VPX_BITS_10),
+        make_tuple(&vp9_high_fdct32x32_c,
+                   &idct32x32_12, 0, VPX_BITS_12),
+        make_tuple(&vp9_high_fdct32x32_rd_c,
+                   &idct32x32_12, 1, VPX_BITS_12),
+        make_tuple(&vp9_fdct32x32_c,
+                   &vp9_idct32x32_1024_add_c, 0, VPX_BITS_8),
+        make_tuple(&vp9_fdct32x32_rd_c,
+                   &vp9_idct32x32_1024_add_c, 1, VPX_BITS_8)));
+#else
  INSTANTIATE_TEST_CASE_P(
      C, Trans32x32Test,
      ::testing::Values(
-        make_tuple(&vp9_fdct32x32_c, &vp9_idct32x32_1024_add_c, 0),
-        make_tuple(&vp9_fdct32x32_rd_c, &vp9_idct32x32_1024_add_c, 1)));
+        make_tuple(&vp9_fdct32x32_c,
+                   &vp9_idct32x32_1024_add_c, 0, VPX_BITS_8),
+        make_tuple(&vp9_fdct32x32_rd_c,
+                   &vp9_idct32x32_1024_add_c, 1, VPX_BITS_8)));
+#endif
  
-#if HAVE_NEON_ASM
+#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH
  INSTANTIATE_TEST_CASE_P(
      NEON, Trans32x32Test,
      ::testing::Values(
          make_tuple(&vp9_fdct32x32_c,
-                   &vp9_idct32x32_1024_add_neon, 0),
+                   &vp9_idct32x32_1024_add_neon, 0, VPX_BITS_8),
          make_tuple(&vp9_fdct32x32_rd_c,
-                   &vp9_idct32x32_1024_add_neon, 1)));
+                   &vp9_idct32x32_1024_add_neon, 1, VPX_BITS_8)));
  #endif
  
-#if HAVE_SSE2
+#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH
  INSTANTIATE_TEST_CASE_P(
      SSE2, Trans32x32Test,
      ::testing::Values(
          make_tuple(&vp9_fdct32x32_sse2,
-                   &vp9_idct32x32_1024_add_sse2, 0),
+                   &vp9_idct32x32_1024_add_sse2, 0, VPX_BITS_8),
          make_tuple(&vp9_fdct32x32_rd_sse2,
-                   &vp9_idct32x32_1024_add_sse2, 1)));
+                   &vp9_idct32x32_1024_add_sse2, 1, VPX_BITS_8)));
  #endif
  
-#if HAVE_AVX2
+#if HAVE_AVX2 && !CONFIG_VP9_HIGHBITDEPTH
  INSTANTIATE_TEST_CASE_P(
      AVX2, Trans32x32Test,
      ::testing::Values(
          make_tuple(&vp9_fdct32x32_avx2,
-                   &vp9_idct32x32_1024_add_sse2, 0),
+                   &vp9_idct32x32_1024_add_sse2, 0, VPX_BITS_8),
          make_tuple(&vp9_fdct32x32_rd_avx2,
-                   &vp9_idct32x32_1024_add_sse2, 1)));
+                   &vp9_idct32x32_1024_add_sse2, 1, VPX_BITS_8)));
  #endif
  }  // namespace
diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc

index 08a69ab..f803c8e 100644 (file)
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -20,46 +20,71 @@
  
  #include "./vp9_rtcd.h"
  #include "vp9/common/vp9_entropy.h"
+#include "vpx/vpx_codec.h"
  #include "vpx/vpx_integer.h"
  
-extern "C" {
-void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *output, int pitch);
-}
-
  using libvpx_test::ACMRandom;
  
  namespace {
  const int kNumCoeffs = 16;
-typedef void (*FdctFunc)(const int16_t *in, int16_t *out, int stride);
-typedef void (*IdctFunc)(const int16_t *in, uint8_t *out, int stride);
-typedef void (*FhtFunc)(const int16_t *in, int16_t *out, int stride,
+typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride);
+typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride);
+typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride,
                          int tx_type);
-typedef void (*IhtFunc)(const int16_t *in, uint8_t *out, int stride,
+typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                          int tx_type);
  
-typedef std::tr1::tuple<FdctFunc, IdctFunc, int> Dct4x4Param;
-typedef std::tr1::tuple<FhtFunc, IhtFunc, int> Ht4x4Param;
+typedef std::tr1::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t> Dct4x4Param;
+typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht4x4Param;
  
-void fdct4x4_ref(const int16_t *in, int16_t *out, int stride, int /*tx_type*/) {
+void fdct4x4_ref(const int16_t *in, tran_low_t *out, int stride,
+                 int tx_type) {
    vp9_fdct4x4_c(in, out, stride);
  }
  
-void fht4x4_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
+void fht4x4_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) {
    vp9_fht4x4_c(in, out, stride, tx_type);
  }
  
-void fwht4x4_ref(const int16_t *in, int16_t *out, int stride, int /*tx_type*/) {
+void fwht4x4_ref(const int16_t *in, tran_low_t *out, int stride,
+                 int tx_type) {
    vp9_fwht4x4_c(in, out, stride);
  }
  
+#if CONFIG_VP9_HIGHBITDEPTH
+void idct4x4_10(const tran_low_t *in, uint8_t *out, int stride) {
+  vp9_high_idct4x4_16_add_c(in, out, stride, 10);
+}
+
+void idct4x4_12(const tran_low_t *in, uint8_t *out, int stride) {
+  vp9_high_idct4x4_16_add_c(in, out, stride, 12);
+}
+
+void iht4x4_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
+  vp9_high_iht4x4_16_add_c(in, out, stride, tx_type, 10);
+}
+
+void iht4x4_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
+  vp9_high_iht4x4_16_add_c(in, out, stride, tx_type, 12);
+}
+
+void iwht4x4_10(const tran_low_t *in, uint8_t *out, int stride) {
+  vp9_high_iwht4x4_16_add_c(in, out, stride, 10);
+}
+
+void iwht4x4_12(const tran_low_t *in, uint8_t *out, int stride) {
+  vp9_high_iwht4x4_16_add_c(in, out, stride, 12);
+}
+#endif
+
  class Trans4x4TestBase {
   public:
    virtual ~Trans4x4TestBase() {}
  
   protected:
-  virtual void RunFwdTxfm(const int16_t *in, int16_t *out, int stride) = 0;
+  virtual void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) = 0;
  
-  virtual void RunInvTxfm(const int16_t *out, uint8_t *dst, int stride) = 0;
+  virtual void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) = 0;
  
    void RunAccuracyCheck(int limit) {
      ACMRandom rnd(ACMRandom::DeterministicSeed());
@@ -68,23 +93,47 @@ class Trans4x4TestBase {
      const int count_test_block = 10000;
      for (int i = 0; i < count_test_block; ++i) {
        DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, kNumCoeffs);
-      DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, kNumCoeffs);
+      DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_temp_block, kNumCoeffs);
        DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
        DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+#if CONFIG_VP9_HIGHBITDEPTH
+      DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);
+      DECLARE_ALIGNED_ARRAY(16, uint16_t, src16, kNumCoeffs);
+#endif
  
        // Initialize a test block with input range [-255, 255].
        for (int j = 0; j < kNumCoeffs; ++j) {
-        src[j] = rnd.Rand8();
-        dst[j] = rnd.Rand8();
-        test_input_block[j] = src[j] - dst[j];
+        if (bit_depth_ == VPX_BITS_8) {
+          src[j] = rnd.Rand8();
+          dst[j] = rnd.Rand8();
+          test_input_block[j] = src[j] - dst[j];
+#if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          src16[j] = rnd.Rand16() & mask_;
+          dst16[j] = rnd.Rand16() & mask_;
+          test_input_block[j] = src16[j] - dst16[j];
+#endif
+        }
        }
  
        ASM_REGISTER_STATE_CHECK(RunFwdTxfm(test_input_block,
                                            test_temp_block, pitch_));
-      ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_));
+      if (bit_depth_ == VPX_BITS_8) {
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_));
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block,
+                                            CONVERT_TO_BYTEPTR(dst16), pitch_));
+#endif
+      }
  
        for (int j = 0; j < kNumCoeffs; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        const uint32_t diff =
+            bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
+#else
          const uint32_t diff = dst[j] - src[j];
+#endif
          const uint32_t error = diff * diff;
          if (max_error < error)
            max_error = error;
@@ -105,13 +154,13 @@ class Trans4x4TestBase {
      ACMRandom rnd(ACMRandom::DeterministicSeed());
      const int count_test_block = 5000;
      DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_ref_block, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_block, kNumCoeffs);
  
      for (int i = 0; i < count_test_block; ++i) {
-      // Initialize a test block with input range [-255, 255].
+      // Initialize a test block with input range [-mask_, mask_].
        for (int j = 0; j < kNumCoeffs; ++j)
-        input_block[j] = rnd.Rand8() - rnd.Rand8();
+        input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
  
        fwd_txfm_ref(input_block, output_ref_block, pitch_, tx_type_);
        ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, pitch_));
@@ -127,21 +176,21 @@ class Trans4x4TestBase {
      const int count_test_block = 5000;
      DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
      DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_ref_block, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_block, kNumCoeffs);
  
      for (int i = 0; i < count_test_block; ++i) {
-      // Initialize a test block with input range [-255, 255].
+      // Initialize a test block with input range [-mask_, mask_].
        for (int j = 0; j < kNumCoeffs; ++j) {
-        input_block[j] = rnd.Rand8() - rnd.Rand8();
-        input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
+        input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
+        input_extreme_block[j] = rnd.Rand8() % 2 ? mask_ : -mask_;
        }
        if (i == 0) {
          for (int j = 0; j < kNumCoeffs; ++j)
-          input_extreme_block[j] = 255;
+          input_extreme_block[j] = mask_;
        } else if (i == 1) {
          for (int j = 0; j < kNumCoeffs; ++j)
-          input_extreme_block[j] = -255;
+          input_extreme_block[j] = -mask_;
        }
  
        fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_);
@@ -151,8 +200,8 @@ class Trans4x4TestBase {
        // The minimum quant value is 4.
        for (int j = 0; j < kNumCoeffs; ++j) {
          EXPECT_EQ(output_block[j], output_ref_block[j]);
-        EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_block[j]))
-            << "Error: 16x16 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
+        EXPECT_GE(4 * DCT_MAX_VALUE << (bit_depth_ - 8), abs(output_block[j]))
+            << "Error: 4x4 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
        }
      }
    }
@@ -161,24 +210,48 @@ class Trans4x4TestBase {
      ACMRandom rnd(ACMRandom::DeterministicSeed());
      const int count_test_block = 1000;
      DECLARE_ALIGNED_ARRAY(16, int16_t, in, kNumCoeffs);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, coeff, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, kNumCoeffs);
      DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
      DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+#if CONFIG_VP9_HIGHBITDEPTH
+    DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, uint16_t, src16, kNumCoeffs);
+#endif
  
      for (int i = 0; i < count_test_block; ++i) {
-      // Initialize a test block with input range [-255, 255].
+      // Initialize a test block with input range [-mask_, mask_].
        for (int j = 0; j < kNumCoeffs; ++j) {
-        src[j] = rnd.Rand8();
-        dst[j] = rnd.Rand8();
-        in[j] = src[j] - dst[j];
+        if (bit_depth_ == VPX_BITS_8) {
+          src[j] = rnd.Rand8();
+          dst[j] = rnd.Rand8();
+          in[j] = src[j] - dst[j];
+#if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          src16[j] = rnd.Rand16() & mask_;
+          dst16[j] = rnd.Rand16() & mask_;
+          in[j] = src16[j] - dst16[j];
+#endif
+        }
        }
  
        fwd_txfm_ref(in, coeff, pitch_, tx_type_);
  
-      ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
+      if (bit_depth_ == VPX_BITS_8) {
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16),
+                                            pitch_));
+#endif
+      }
  
        for (int j = 0; j < kNumCoeffs; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        const uint32_t diff =
+            bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
+#else
          const uint32_t diff = dst[j] - src[j];
+#endif
          const uint32_t error = diff * diff;
          EXPECT_GE(static_cast<uint32_t>(limit), error)
              << "Error: 4x4 IDCT has error " << error
@@ -190,6 +263,8 @@ class Trans4x4TestBase {
    int pitch_;
    int tx_type_;
    FhtFunc fwd_txfm_ref;
+  vpx_bit_depth_t bit_depth_;
+  int mask_;
  };
  
  class Trans4x4DCT
@@ -204,14 +279,16 @@ class Trans4x4DCT
      tx_type_  = GET_PARAM(2);
      pitch_    = 4;
      fwd_txfm_ref = fdct4x4_ref;
+    bit_depth_ = GET_PARAM(3);
+    mask_ = (1 << bit_depth_) - 1;
    }
    virtual void TearDown() { libvpx_test::ClearSystemState(); }
  
   protected:
-  void RunFwdTxfm(const int16_t *in, int16_t *out, int stride) {
+  void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
      fwd_txfm_(in, out, stride);
    }
-  void RunInvTxfm(const int16_t *out, uint8_t *dst, int stride) {
+  void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
      inv_txfm_(out, dst, stride);
    }
  
@@ -247,15 +324,17 @@ class Trans4x4HT
      tx_type_  = GET_PARAM(2);
      pitch_    = 4;
      fwd_txfm_ref = fht4x4_ref;
+    bit_depth_ = GET_PARAM(3);
+    mask_ = (1 << bit_depth_) - 1;
    }
    virtual void TearDown() { libvpx_test::ClearSystemState(); }
  
   protected:
-  void RunFwdTxfm(const int16_t *in, int16_t *out, int stride) {
+  void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
      fwd_txfm_(in, out, stride, tx_type_);
    }
  
-  void RunInvTxfm(const int16_t *out, uint8_t *dst, int stride) {
+  void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
      inv_txfm_(out, dst, stride, tx_type_);
    }
  
@@ -291,14 +370,16 @@ class Trans4x4WHT
      tx_type_  = GET_PARAM(2);
      pitch_    = 4;
      fwd_txfm_ref = fwht4x4_ref;
+    bit_depth_ = GET_PARAM(3);
+    mask_ = (1 << bit_depth_) - 1;
    }
    virtual void TearDown() { libvpx_test::ClearSystemState(); }
  
   protected:
-  void RunFwdTxfm(const int16_t *in, int16_t *out, int stride) {
+  void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
      fwd_txfm_(in, out, stride);
    }
-  void RunInvTxfm(const int16_t *out, uint8_t *dst, int stride) {
+  void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
      inv_txfm_(out, dst, stride);
    }
  
@@ -323,57 +404,95 @@ TEST_P(Trans4x4WHT, InvAccuracyCheck) {
  }
  using std::tr1::make_tuple;
  
+#if CONFIG_VP9_HIGHBITDEPTH
  INSTANTIATE_TEST_CASE_P(
      C, Trans4x4DCT,
      ::testing::Values(
-        make_tuple(&vp9_fdct4x4_c, &vp9_idct4x4_16_add_c, 0)));
+        make_tuple(&vp9_high_fdct4x4_c, &idct4x4_10, 0, VPX_BITS_10),
+        make_tuple(&vp9_high_fdct4x4_c, &idct4x4_12, 0, VPX_BITS_12),
+        make_tuple(&vp9_fdct4x4_c, &vp9_idct4x4_16_add_c, 0, VPX_BITS_8)));
+#else
+INSTANTIATE_TEST_CASE_P(
+    C, Trans4x4DCT,
+    ::testing::Values(
+        make_tuple(&vp9_fdct4x4_c, &vp9_idct4x4_16_add_c, 0, VPX_BITS_8)));
+#endif
+
+#if CONFIG_VP9_HIGHBITDEPTH
  INSTANTIATE_TEST_CASE_P(
      C, Trans4x4HT,
      ::testing::Values(
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 0),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 1),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 2),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 3)));
+        make_tuple(&vp9_high_fht4x4_c, &iht4x4_10, 0, VPX_BITS_10),
+        make_tuple(&vp9_high_fht4x4_c, &iht4x4_10, 1, VPX_BITS_10),
+        make_tuple(&vp9_high_fht4x4_c, &iht4x4_10, 2, VPX_BITS_10),
+        make_tuple(&vp9_high_fht4x4_c, &iht4x4_10, 3, VPX_BITS_10),
+        make_tuple(&vp9_high_fht4x4_c, &iht4x4_12, 0, VPX_BITS_12),
+        make_tuple(&vp9_high_fht4x4_c, &iht4x4_12, 1, VPX_BITS_12),
+        make_tuple(&vp9_high_fht4x4_c, &iht4x4_12, 2, VPX_BITS_12),
+        make_tuple(&vp9_high_fht4x4_c, &iht4x4_12, 3, VPX_BITS_12),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8)));
+#else
+INSTANTIATE_TEST_CASE_P(
+    C, Trans4x4HT,
+    ::testing::Values(
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8)));
+#endif
+
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+    C, Trans4x4WHT,
+    ::testing::Values(
+        make_tuple(&vp9_high_fwht4x4_c, &iwht4x4_10, 0, VPX_BITS_10),
+        make_tuple(&vp9_high_fwht4x4_c, &iwht4x4_12, 0, VPX_BITS_12),
+        make_tuple(&vp9_fwht4x4_c, &vp9_iwht4x4_16_add_c, 0, VPX_BITS_8)));
+#else
  INSTANTIATE_TEST_CASE_P(
      C, Trans4x4WHT,
      ::testing::Values(
-        make_tuple(&vp9_fwht4x4_c, &vp9_iwht4x4_16_add_c, 0)));
+        make_tuple(&vp9_fwht4x4_c, &vp9_iwht4x4_16_add_c, 0, VPX_BITS_8)));
+#endif
  
-#if HAVE_NEON_ASM
+#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH
  INSTANTIATE_TEST_CASE_P(
      NEON, Trans4x4DCT,
      ::testing::Values(
          make_tuple(&vp9_fdct4x4_c,
-                   &vp9_idct4x4_16_add_neon, 0)));
+                   &vp9_idct4x4_16_add_neon, 0, VPX_BITS_8)));
  INSTANTIATE_TEST_CASE_P(
      DISABLED_NEON, Trans4x4HT,
      ::testing::Values(
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 0),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 1),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 2),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 3)));
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 3, VPX_BITS_8)));
  #endif
  
-#if CONFIG_USE_X86INC && HAVE_MMX
+#if CONFIG_USE_X86INC && HAVE_MMX && !CONFIG_VP9_HIGHBITDEPTH
  INSTANTIATE_TEST_CASE_P(
      MMX, Trans4x4WHT,
      ::testing::Values(
-        make_tuple(&vp9_fwht4x4_mmx, &vp9_iwht4x4_16_add_c, 0)));
+        make_tuple(&vp9_fwht4x4_mmx, &vp9_iwht4x4_16_add_c, 0, VPX_BITS_8)));
  #endif
  
-#if HAVE_SSE2
+#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH
  INSTANTIATE_TEST_CASE_P(
      SSE2, Trans4x4DCT,
      ::testing::Values(
          make_tuple(&vp9_fdct4x4_sse2,
-                   &vp9_idct4x4_16_add_sse2, 0)));
+                   &vp9_idct4x4_16_add_sse2, 0, VPX_BITS_8)));
  INSTANTIATE_TEST_CASE_P(
      SSE2, Trans4x4HT,
      ::testing::Values(
-        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 0),
-        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 1),
-        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 2),
-        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 3)));
+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 3, VPX_BITS_8)));
  #endif
  
  }  // namespace
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc

index a694f0c..5455534 100644 (file)
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -20,45 +20,96 @@
  
  #include "./vp9_rtcd.h"
  #include "vp9/common/vp9_entropy.h"
+#include "vpx/vpx_codec.h"
  #include "vpx/vpx_integer.h"
  
-extern "C" {
-void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *output, int pitch);
+const int kNumCoeffs = 64;
+const double kPi = 3.141592653589793238462643383279502884;
+void reference_8x8_dct_1d(const double in[8], double out[8], int stride) {
+  const double kInvSqrt2 = 0.707106781186547524400844362104;
+  for (int k = 0; k < 8; k++) {
+    out[k] = 0.0;
+    for (int n = 0; n < 8; n++)
+      out[k] += in[n] * cos(kPi * (2 * n + 1) * k / 16.0);
+    if (k == 0)
+      out[k] = out[k] * kInvSqrt2;
+  }
+}
+
+void reference_8x8_dct_2d(const int16_t input[kNumCoeffs],
+                          double output[kNumCoeffs]) {
+  // First transform columns
+  for (int i = 0; i < 8; ++i) {
+    double temp_in[8], temp_out[8];
+    for (int j = 0; j < 8; ++j)
+      temp_in[j] = input[j*8 + i];
+    reference_8x8_dct_1d(temp_in, temp_out, 1);
+    for (int j = 0; j < 8; ++j)
+      output[j * 8 + i] = temp_out[j];
+  }
+  // Then transform rows
+  for (int i = 0; i < 8; ++i) {
+    double temp_in[8], temp_out[8];
+    for (int j = 0; j < 8; ++j)
+      temp_in[j] = output[j + i*8];
+    reference_8x8_dct_1d(temp_in, temp_out, 1);
+    // Scale by some magic number
+    for (int j = 0; j < 8; ++j)
+      output[j + i * 8] = temp_out[j] * 2;
+  }
  }
  
  using libvpx_test::ACMRandom;
  
  namespace {
-typedef void (*FdctFunc)(const int16_t *in, int16_t *out, int stride);
-typedef void (*IdctFunc)(const int16_t *in, uint8_t *out, int stride);
-typedef void (*FhtFunc)(const int16_t *in, int16_t *out, int stride,
+typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride);
+typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride);
+typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride,
                          int tx_type);
-typedef void (*IhtFunc)(const int16_t *in, uint8_t *out, int stride,
+typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                          int tx_type);
  
-typedef std::tr1::tuple<FdctFunc, IdctFunc, int> Dct8x8Param;
-typedef std::tr1::tuple<FhtFunc, IhtFunc, int> Ht8x8Param;
+typedef std::tr1::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t> Dct8x8Param;
+typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht8x8Param;
  
-void fdct8x8_ref(const int16_t *in, int16_t *out, int stride, int /*tx_type*/) {
+void fdct8x8_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) {
    vp9_fdct8x8_c(in, out, stride);
  }
  
-void fht8x8_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
+void fht8x8_ref(const int16_t *in, tran_low_t *out, int stride, int tx_type) {
    vp9_fht8x8_c(in, out, stride, tx_type);
  }
  
+#if CONFIG_VP9_HIGHBITDEPTH
+void idct8x8_10(const tran_low_t *in, uint8_t *out, int stride) {
+  vp9_high_idct8x8_64_add_c(in, out, stride, 10);
+}
+
+void idct8x8_12(const tran_low_t *in, uint8_t *out, int stride) {
+  vp9_high_idct8x8_64_add_c(in, out, stride, 12);
+}
+
+void iht8x8_10(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
+  vp9_high_iht8x8_64_add_c(in, out, stride, tx_type, 10);
+}
+
+void iht8x8_12(const tran_low_t *in, uint8_t *out, int stride, int tx_type) {
+  vp9_high_iht8x8_64_add_c(in, out, stride, tx_type, 12);
+}
+#endif
+
  class FwdTrans8x8TestBase {
   public:
    virtual ~FwdTrans8x8TestBase() {}
  
   protected:
-  virtual void RunFwdTxfm(int16_t *in, int16_t *out, int stride) = 0;
-  virtual void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) = 0;
+  virtual void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) = 0;
+  virtual void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) = 0;
  
    void RunSignBiasCheck() {
      ACMRandom rnd(ACMRandom::DeterministicSeed());
      DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, test_output_block, 64);
+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_output_block, 64);
      int count_sign_block[64][2];
      const int count_test_block = 100000;
  
@@ -67,7 +118,8 @@ class FwdTrans8x8TestBase {
      for (int i = 0; i < count_test_block; ++i) {
        // Initialize a test block with input range [-255, 255].
        for (int j = 0; j < 64; ++j)
-        test_input_block[j] = rnd.Rand8() - rnd.Rand8();
+        test_input_block[j] = ((rnd.Rand16() >> (16 - bit_depth_)) & mask_) -
+                              ((rnd.Rand16() >> (16 - bit_depth_)) & mask_);
        ASM_REGISTER_STATE_CHECK(
            RunFwdTxfm(test_input_block, test_output_block, pitch_));
  
@@ -82,7 +134,7 @@ class FwdTrans8x8TestBase {
      for (int j = 0; j < 64; ++j) {
        const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]);
        const int max_diff = 1125;
-      EXPECT_LT(diff, max_diff)
+      EXPECT_LT(diff, max_diff << (bit_depth_ - 8))
            << "Error: 8x8 FDCT/FHT has a sign bias > "
            << 1. * max_diff / count_test_block * 100 << "%"
            << " for input range [-255, 255] at index " << j
@@ -111,7 +163,7 @@ class FwdTrans8x8TestBase {
      for (int j = 0; j < 64; ++j) {
        const int diff = abs(count_sign_block[j][0] - count_sign_block[j][1]);
        const int max_diff = 10000;
-      EXPECT_LT(diff, max_diff)
+      EXPECT_LT(diff, max_diff << (bit_depth_ - 8))
            << "Error: 4x4 FDCT/FHT has a sign bias > "
            << 1. * max_diff / count_test_block * 100 << "%"
            << " for input range [-15, 15] at index " << j
@@ -127,16 +179,28 @@ class FwdTrans8x8TestBase {
      int total_error = 0;
      const int count_test_block = 100000;
      DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 64);
+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_temp_block, 64);
      DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 64);
      DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 64);
+#if CONFIG_VP9_HIGHBITDEPTH
+    DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, 64);
+    DECLARE_ALIGNED_ARRAY(16, uint16_t, src16, 64);
+#endif
  
      for (int i = 0; i < count_test_block; ++i) {
        // Initialize a test block with input range [-255, 255].
        for (int j = 0; j < 64; ++j) {
-        src[j] = rnd.Rand8();
-        dst[j] = rnd.Rand8();
-        test_input_block[j] = src[j] - dst[j];
+        if (bit_depth_ == VPX_BITS_8) {
+          src[j] = rnd.Rand8();
+          dst[j] = rnd.Rand8();
+          test_input_block[j] = src[j] - dst[j];
+#if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          src16[j] = rnd.Rand16() & mask_;
+          dst16[j] = rnd.Rand16() & mask_;
+          test_input_block[j] = src16[j] - dst16[j];
+#endif
+        }
        }
  
        ASM_REGISTER_STATE_CHECK(
@@ -152,11 +216,23 @@ class FwdTrans8x8TestBase {
              test_temp_block[j] *= 4;
            }
        }
-      ASM_REGISTER_STATE_CHECK(
-          RunInvTxfm(test_temp_block, dst, pitch_));
+      if (bit_depth_ == VPX_BITS_8) {
+        ASM_REGISTER_STATE_CHECK(
+            RunInvTxfm(test_temp_block, dst, pitch_));
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        ASM_REGISTER_STATE_CHECK(
+            RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_));
+#endif
+      }
  
        for (int j = 0; j < 64; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        const int diff =
+            bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
+#else
          const int diff = dst[j] - src[j];
+#endif
          const int error = diff * diff;
          if (max_error < error)
            max_error = error;
@@ -164,11 +240,11 @@ class FwdTrans8x8TestBase {
        }
      }
  
-    EXPECT_GE(1, max_error)
+    EXPECT_GE(1 << 2 * (bit_depth_ - 8), max_error)
        << "Error: 8x8 FDCT/IDCT or FHT/IHT has an individual"
        << " roundtrip error > 1";
  
-    EXPECT_GE(count_test_block/5, total_error)
+    EXPECT_GE((count_test_block << 2 * (bit_depth_ - 8))/5, total_error)
        << "Error: 8x8 FDCT/IDCT or FHT/IHT has average roundtrip "
        << "error > 1/5 per block";
    }
@@ -180,37 +256,68 @@ class FwdTrans8x8TestBase {
      int total_coeff_error = 0;
      const int count_test_block = 100000;
      DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 64);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 64);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, ref_temp_block, 64);
+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_temp_block, 64);
+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, ref_temp_block, 64);
      DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 64);
      DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 64);
+#if CONFIG_VP9_HIGHBITDEPTH
+    DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, 64);
+    DECLARE_ALIGNED_ARRAY(16, uint16_t, src16, 64);
+#endif
  
      for (int i = 0; i < count_test_block; ++i) {
-      // Initialize a test block with input range [-255, 255].
+      // Initialize a test block with input range [-mask_, mask_].
        for (int j = 0; j < 64; ++j) {
-        if (i == 0) {
-          src[j] = 255;
-          dst[j] = 0;
-        } else if (i == 1) {
-          src[j] = 0;
-          dst[j] = 255;
+        if (bit_depth_ == VPX_BITS_8) {
+          if (i == 0) {
+            src[j] = 255;
+            dst[j] = 0;
+          } else if (i == 1) {
+            src[j] = 0;
+            dst[j] = 255;
+          } else {
+            src[j] = rnd.Rand8() % 2 ? 255 : 0;
+            dst[j] = rnd.Rand8() % 2 ? 255 : 0;
+          }
+          test_input_block[j] = src[j] - dst[j];
+#if CONFIG_VP9_HIGHBITDEPTH
          } else {
-          src[j] = rnd.Rand8() % 2 ? 255 : 0;
-          dst[j] = rnd.Rand8() % 2 ? 255 : 0;
+          if (i == 0) {
+            src16[j] = mask_;
+            dst16[j] = 0;
+          } else if (i == 1) {
+            src16[j] = 0;
+            dst16[j] = mask_;
+          } else {
+            src16[j] = rnd.Rand8() % 2 ? mask_ : 0;
+            dst16[j] = rnd.Rand8() % 2 ? mask_ : 0;
+          }
+          test_input_block[j] = src16[j] - dst16[j];
+#endif
          }
-
-        test_input_block[j] = src[j] - dst[j];
        }
  
        ASM_REGISTER_STATE_CHECK(
            RunFwdTxfm(test_input_block, test_temp_block, pitch_));
        ASM_REGISTER_STATE_CHECK(
            fwd_txfm_ref(test_input_block, ref_temp_block, pitch_, tx_type_));
-      ASM_REGISTER_STATE_CHECK(
-          RunInvTxfm(test_temp_block, dst, pitch_));
+      if (bit_depth_ == VPX_BITS_8) {
+        ASM_REGISTER_STATE_CHECK(
+            RunInvTxfm(test_temp_block, dst, pitch_));
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        ASM_REGISTER_STATE_CHECK(
+            RunInvTxfm(test_temp_block, CONVERT_TO_BYTEPTR(dst16), pitch_));
+#endif
+      }
  
        for (int j = 0; j < 64; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        const int diff =
+            bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
+#else
          const int diff = dst[j] - src[j];
+#endif
          const int error = diff * diff;
          if (max_error < error)
            max_error = error;
@@ -220,11 +327,11 @@ class FwdTrans8x8TestBase {
          total_coeff_error += abs(coeff_diff);
        }
  
-      EXPECT_GE(1, max_error)
+      EXPECT_GE(1 << 2 * (bit_depth_ - 8), max_error)
            << "Error: Extremal 8x8 FDCT/IDCT or FHT/IHT has"
            << "an individual roundtrip error > 1";
  
-      EXPECT_GE(count_test_block/5, total_error)
+      EXPECT_GE((count_test_block << 2 * (bit_depth_ - 8))/5, total_error)
            << "Error: Extremal 8x8 FDCT/IDCT or FHT/IHT has average"
            << " roundtrip error > 1/5 per block";
  
@@ -234,9 +341,97 @@ class FwdTrans8x8TestBase {
      }
    }
  
+  void RunInvAccuracyCheck() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 1000;
+    DECLARE_ALIGNED_ARRAY(16, int16_t, in, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+#if CONFIG_VP9_HIGHBITDEPTH
+    DECLARE_ALIGNED_ARRAY(16, uint16_t, src16, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, uint16_t, dst16, kNumCoeffs);
+#endif
+
+    for (int i = 0; i < count_test_block; ++i) {
+      double out_r[kNumCoeffs];
+
+      // Initialize a test block with input range [-255, 255].
+      for (int j = 0; j < kNumCoeffs; ++j) {
+        if (bit_depth_ == VPX_BITS_8) {
+          src[j] = rnd.Rand8() % 2 ? 255 : 0;
+          dst[j] = src[j] > 0 ? 0 : 255;
+          in[j] = src[j] - dst[j];
+#if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          src16[j] = rnd.Rand8() % 2 ? mask_ : 0;
+          dst16[j] = src16[j] > 0 ? 0 : mask_;
+          in[j] = src16[j] - dst16[j];
+#endif
+        }
+      }
+
+      reference_8x8_dct_2d(in, out_r);
+      for (int j = 0; j < kNumCoeffs; ++j)
+        coeff[j] = round(out_r[j]);
+
+      if (bit_depth_ == VPX_BITS_8) {
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16),
+                                            pitch_));
+#endif
+      }
+
+      for (int j = 0; j < kNumCoeffs; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        const uint32_t diff =
+            bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
+#else
+        const uint32_t diff = dst[j] - src[j];
+#endif
+        const uint32_t error = diff * diff;
+        EXPECT_GE(1u << 2 * (bit_depth_ - 8), error)
+            << "Error: 8x8 IDCT has error " << error
+            << " at index " << j;
+      }
+    }
+  }
+
+  void RunFwdAccuracyCheck() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 1000;
+    DECLARE_ALIGNED_ARRAY(16, int16_t, in, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff_r, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, tran_low_t, coeff, kNumCoeffs);
+
+    for (int i = 0; i < count_test_block; ++i) {
+      double out_r[kNumCoeffs];
+
+      // Initialize a test block with input range [-mask_, mask_].
+      for (int j = 0; j < kNumCoeffs; ++j)
+        in[j] = rnd.Rand8() % 2 == 0 ? mask_ : -mask_;
+
+      RunFwdTxfm(in, coeff, pitch_);
+      reference_8x8_dct_2d(in, out_r);
+      for (int j = 0; j < kNumCoeffs; ++j)
+        coeff_r[j] = round(out_r[j]);
+
+      for (int j = 0; j < kNumCoeffs; ++j) {
+        const uint32_t diff = coeff[j] - coeff_r[j];
+        const uint32_t error = diff * diff;
+        EXPECT_GE(9u << 2 * (bit_depth_ - 8), error)
+            << "Error: 8x8 DCT has error " << error
+            << " at index " << j;
+      }
+    }
+  }
    int pitch_;
    int tx_type_;
    FhtFunc fwd_txfm_ref;
+  vpx_bit_depth_t bit_depth_;
+  int mask_;
  };
  
  class FwdTrans8x8DCT
@@ -251,15 +446,17 @@ class FwdTrans8x8DCT
      tx_type_  = GET_PARAM(2);
      pitch_    = 8;
      fwd_txfm_ref = fdct8x8_ref;
+    bit_depth_ = GET_PARAM(3);
+    mask_ = (1 << bit_depth_) - 1;
    }
  
    virtual void TearDown() { libvpx_test::ClearSystemState(); }
  
   protected:
-  void RunFwdTxfm(int16_t *in, int16_t *out, int stride) {
+  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {
      fwd_txfm_(in, out, stride);
    }
-  void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) {
+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
      inv_txfm_(out, dst, stride);
    }
  
@@ -279,6 +476,14 @@ TEST_P(FwdTrans8x8DCT, ExtremalCheck) {
    RunExtremalCheck();
  }
  
+TEST_P(FwdTrans8x8DCT, FwdAccuracyCheck) {
+  RunFwdAccuracyCheck();
+}
+
+TEST_P(FwdTrans8x8DCT, InvAccuracyCheck) {
+  RunInvAccuracyCheck();
+}
+
  class FwdTrans8x8HT
      : public FwdTrans8x8TestBase,
        public ::testing::TestWithParam<Ht8x8Param> {
@@ -291,15 +496,17 @@ class FwdTrans8x8HT
      tx_type_  = GET_PARAM(2);
      pitch_    = 8;
      fwd_txfm_ref = fht8x8_ref;
+    bit_depth_ = GET_PARAM(3);
+    mask_ = (1 << bit_depth_) - 1;
    }
  
    virtual void TearDown() { libvpx_test::ClearSystemState(); }
  
   protected:
-  void RunFwdTxfm(int16_t *in, int16_t *out, int stride) {
+  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {
      fwd_txfm_(in, out, stride, tx_type_);
    }
-  void RunInvTxfm(int16_t *out, uint8_t *dst, int stride) {
+  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
      inv_txfm_(out, dst, stride, tx_type_);
    }
  
@@ -321,50 +528,81 @@ TEST_P(FwdTrans8x8HT, ExtremalCheck) {
  
  using std::tr1::make_tuple;
  
+#if CONFIG_VP9_HIGHBITDEPTH
  INSTANTIATE_TEST_CASE_P(
      C, FwdTrans8x8DCT,
      ::testing::Values(
-        make_tuple(&vp9_fdct8x8_c, &vp9_idct8x8_64_add_c, 0)));
+        make_tuple(&vp9_high_fdct8x8_c, &idct8x8_10, 0, VPX_BITS_10),
+        make_tuple(&vp9_high_fdct8x8_c, &idct8x8_12, 0, VPX_BITS_12),
+        make_tuple(&vp9_fdct8x8_c, &vp9_idct8x8_64_add_c, 0, VPX_BITS_8)));
+#else
+INSTANTIATE_TEST_CASE_P(
+    C, FwdTrans8x8DCT,
+    ::testing::Values(
+        make_tuple(&vp9_fdct8x8_c, &vp9_idct8x8_64_add_c, 0, VPX_BITS_8)));
+#endif
+
+#if CONFIG_VP9_HIGHBITDEPTH
  INSTANTIATE_TEST_CASE_P(
      C, FwdTrans8x8HT,
      ::testing::Values(
-        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 0),
-        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 1),
-        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 2),
-        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 3)));
+        make_tuple(&vp9_high_fht8x8_c, &iht8x8_10, 0, VPX_BITS_10),
+        make_tuple(&vp9_high_fht8x8_c, &iht8x8_10, 1, VPX_BITS_10),
+        make_tuple(&vp9_high_fht8x8_c, &iht8x8_10, 2, VPX_BITS_10),
+        make_tuple(&vp9_high_fht8x8_c, &iht8x8_10, 3, VPX_BITS_10),
+        make_tuple(&vp9_high_fht8x8_c, &iht8x8_12, 0, VPX_BITS_12),
+        make_tuple(&vp9_high_fht8x8_c, &iht8x8_12, 1, VPX_BITS_12),
+        make_tuple(&vp9_high_fht8x8_c, &iht8x8_12, 2, VPX_BITS_12),
+        make_tuple(&vp9_high_fht8x8_c, &iht8x8_12, 3, VPX_BITS_12),
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 3, VPX_BITS_8)));
+#else
+INSTANTIATE_TEST_CASE_P(
+    C, FwdTrans8x8HT,
+    ::testing::Values(
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_c, 3, VPX_BITS_8)));
+#endif
  
-#if HAVE_NEON_ASM
+#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH
  INSTANTIATE_TEST_CASE_P(
      NEON, FwdTrans8x8DCT,
      ::testing::Values(
-        make_tuple(&vp9_fdct8x8_neon, &vp9_idct8x8_64_add_neon, 0)));
+        make_tuple(&vp9_fdct8x8_neon, &vp9_idct8x8_64_add_neon, 0,
+                   VPX_BITS_8)));
  INSTANTIATE_TEST_CASE_P(
      DISABLED_NEON, FwdTrans8x8HT,
      ::testing::Values(
-        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 0),
-        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 1),
-        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 2),
-        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 3)));
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_c, &vp9_iht8x8_64_add_neon, 3, VPX_BITS_8)));
  #endif
  
-#if HAVE_SSE2
+#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH
  INSTANTIATE_TEST_CASE_P(
      SSE2, FwdTrans8x8DCT,
      ::testing::Values(
-        make_tuple(&vp9_fdct8x8_sse2, &vp9_idct8x8_64_add_sse2, 0)));
+        make_tuple(&vp9_fdct8x8_sse2, &vp9_idct8x8_64_add_sse2, 0,
+                   VPX_BITS_8)));
  INSTANTIATE_TEST_CASE_P(
      SSE2, FwdTrans8x8HT,
      ::testing::Values(
-        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 0),
-        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 1),
-        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 2),
-        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 3)));
+        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 0, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 1, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 2, VPX_BITS_8),
+        make_tuple(&vp9_fht8x8_sse2, &vp9_iht8x8_64_add_sse2, 3, VPX_BITS_8)));
  #endif
  
-#if HAVE_SSSE3 && ARCH_X86_64
+#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_VP9_HIGHBITDEPTH
  INSTANTIATE_TEST_CASE_P(
      SSSE3, FwdTrans8x8DCT,
      ::testing::Values(
-        make_tuple(&vp9_fdct8x8_ssse3, &vp9_idct8x8_64_add_ssse3, 0)));
+        make_tuple(&vp9_fdct8x8_ssse3, &vp9_idct8x8_64_add_ssse3, 0,
+                   VPX_BITS_8)));
  #endif
  }  // namespace
diff --git a/test/idct8x8_test.cc b/test/idct8x8_test.cc

index 5f4c33a..f488cb4 100644 (file)
--- a/test/idct8x8_test.cc
+++ b/test/idct8x8_test.cc
@@ -109,7 +109,8 @@ TEST(VP9Idct8x8Test, AccuracyCheck) {
    ACMRandom rnd(ACMRandom::DeterministicSeed());
    const int count_test_block = 10000;
    for (int i = 0; i < count_test_block; ++i) {
-    int16_t input[64], coeff[64];
+    int16_t input[64];
+    tran_low_t coeff[64];
      double output_r[64];
      uint8_t dst[64], src[64];
  
diff --git a/test/invalid_file_test.cc b/test/invalid_file_test.cc

index 3cfb292..b61d490 100644 (file)
--- a/test/invalid_file_test.cc
+++ b/test/invalid_file_test.cc
@@ -118,8 +118,9 @@ const DecodeParam kVP9InvalidFileTests[] = {
    {1, "invalid-vp90-2-00-quantizer-11.webm.ivf.s52984_r01-05_b6-z.ivf"},
    {1, "invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf"},
    {1, "invalid-vp90-2-05-resize.ivf.s59293_r01-05_b6-.ivf"},
-  {1, "invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.ivf"},
+  {1, "invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.v2.ivf"},
    {1, "invalid-vp91-2-mixedrefcsp-444to420.ivf"},
+  {1, "invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf"},
  };
  
  VP9_INSTANTIATE_TEST_CASE(InvalidFileTest,
@@ -150,8 +151,8 @@ const DecodeParam kMultiThreadedVP9InvalidFileTests[] = {
    {4, "invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm"},
    {4, "invalid-"
        "vp90-2-08-tile_1x2_frame_parallel.webm.ivf.s47039_r01-05_b6-.ivf"},
-  {2, "invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.ivf"},
-  {4, "invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.ivf"},
+  {2, "invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.v2.ivf"},
+  {4, "invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.v2.ivf"},
  };
  
  INSTANTIATE_TEST_CASE_P(
diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc

index 15f4e6c..9c24fee 100644 (file)
--- a/test/partial_idct_test.cc
+++ b/test/partial_idct_test.cc
@@ -26,8 +26,8 @@
  using libvpx_test::ACMRandom;
  
  namespace {
-typedef void (*FwdTxfmFunc)(const int16_t *in, int16_t *out, int stride);
-typedef void (*InvTxfmFunc)(const int16_t *in, uint8_t *out, int stride);
+typedef void (*FwdTxfmFunc)(const int16_t *in, tran_low_t *out, int stride);
+typedef void (*InvTxfmFunc)(const tran_low_t *in, uint8_t *out, int stride);
  typedef std::tr1::tuple<FwdTxfmFunc,
                          InvTxfmFunc,
                          InvTxfmFunc,
@@ -74,8 +74,8 @@ TEST_P(PartialIDctTest, RunQuantCheck) {
        FAIL() << "Wrong Size!";
        break;
    }
-  DECLARE_ALIGNED_ARRAY(16, int16_t, test_coef_block1, kMaxNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, test_coef_block2, kMaxNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_coef_block1, kMaxNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_coef_block2, kMaxNumCoeffs);
    DECLARE_ALIGNED_ARRAY(16, uint8_t, dst1, kMaxNumCoeffs);
    DECLARE_ALIGNED_ARRAY(16, uint8_t, dst2, kMaxNumCoeffs);
  
@@ -83,7 +83,7 @@ TEST_P(PartialIDctTest, RunQuantCheck) {
    const int block_size = size * size;
  
    DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kMaxNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kMaxNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, output_ref_block, kMaxNumCoeffs);
  
    int max_error = 0;
    for (int i = 0; i < count_test_block; ++i) {
@@ -153,8 +153,8 @@ TEST_P(PartialIDctTest, ResultsMatch) {
        FAIL() << "Wrong Size!";
        break;
    }
-  DECLARE_ALIGNED_ARRAY(16, int16_t, test_coef_block1, kMaxNumCoeffs);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, test_coef_block2, kMaxNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_coef_block1, kMaxNumCoeffs);
+  DECLARE_ALIGNED_ARRAY(16, tran_low_t, test_coef_block2, kMaxNumCoeffs);
    DECLARE_ALIGNED_ARRAY(16, uint8_t, dst1, kMaxNumCoeffs);
    DECLARE_ALIGNED_ARRAY(16, uint8_t, dst2, kMaxNumCoeffs);
    const int count_test_block = 1000;
@@ -229,6 +229,7 @@ INSTANTIATE_TEST_CASE_P(
                     &vp9_idct4x4_16_add_c,
                     &vp9_idct4x4_1_add_c,
                     TX_4X4, 1)));
+
  #if HAVE_NEON_ASM
  INSTANTIATE_TEST_CASE_P(
      NEON, PartialIDctTest,
@@ -259,7 +260,7 @@ INSTANTIATE_TEST_CASE_P(
                     TX_4X4, 1)));
  #endif
  
-#if HAVE_SSE2
+#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH
  INSTANTIATE_TEST_CASE_P(
      SSE2, PartialIDctTest,
      ::testing::Values(
@@ -293,7 +294,7 @@ INSTANTIATE_TEST_CASE_P(
                     TX_4X4, 1)));
  #endif
  
-#if HAVE_SSSE3 && ARCH_X86_64
+#if HAVE_SSSE3 && ARCH_X86_64 && !CONFIG_VP9_HIGHBITDEPTH
  INSTANTIATE_TEST_CASE_P(
      SSSE3_64, PartialIDctTest,
      ::testing::Values(
@@ -303,7 +304,7 @@ INSTANTIATE_TEST_CASE_P(
                     TX_8X8, 12)));
  #endif
  
-#if HAVE_SSSE3
+#if HAVE_SSSE3 && !CONFIG_VP9_HIGHBITDEPTH
  INSTANTIATE_TEST_CASE_P(
      SSSE3, PartialIDctTest,
      ::testing::Values(
diff --git a/test/svc_test.cc b/test/svc_test.cc

index 218f53d..5035dee 100644 (file)
--- a/test/svc_test.cc
+++ b/test/svc_test.cc
@@ -74,6 +74,7 @@ class SvcTest : public ::testing::Test {
      const vpx_codec_err_t res =
          vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
      EXPECT_EQ(VPX_CODEC_OK, res);
+    vpx_codec_control(&codec_, VP8E_SET_CPUUSED, 4);  // Make the test faster
      codec_initialized_ = true;
    }
  
@@ -83,11 +84,23 @@ class SvcTest : public ::testing::Test {
      codec_initialized_ = false;
    }
  
+  void GetStatsData(std::string *const stats_buf) {
+    vpx_codec_iter_t iter = NULL;
+    const vpx_codec_cx_pkt_t *cx_pkt;
+
+    while ((cx_pkt = vpx_codec_get_cx_data(&codec_, &iter)) != NULL) {
+      if (cx_pkt->kind == VPX_CODEC_STATS_PKT) {
+        EXPECT_GT(cx_pkt->data.twopass_stats.sz, 0U);
+        ASSERT_TRUE(cx_pkt->data.twopass_stats.buf != NULL);
+        stats_buf->append(static_cast<char*>(cx_pkt->data.twopass_stats.buf),
+                          cx_pkt->data.twopass_stats.sz);
+      }
+    }
+  }
+
    void Pass1EncodeNFrames(const int n, const int layers,
                            std::string *const stats_buf) {
      vpx_codec_err_t res;
-    size_t stats_size = 0;
-    const char *stats_data = NULL;
  
      ASSERT_GT(n, 0);
      ASSERT_GT(layers, 0);
@@ -104,22 +117,15 @@ class SvcTest : public ::testing::Test {
        res = vpx_svc_encode(&svc_, &codec_, video.img(), video.pts(),
                             video.duration(), VPX_DL_GOOD_QUALITY);
        ASSERT_EQ(VPX_CODEC_OK, res);
-      stats_size = vpx_svc_get_rc_stats_buffer_size(&svc_);
-      EXPECT_GT(stats_size, 0U);
-      stats_data = vpx_svc_get_rc_stats_buffer(&svc_);
-      ASSERT_TRUE(stats_data != NULL);
-      stats_buf->append(stats_data, stats_size);
+      GetStatsData(stats_buf);
        video.Next();
      }
  
      // Flush encoder and test EOS packet.
      res = vpx_svc_encode(&svc_, &codec_, NULL, video.pts(),
                           video.duration(), VPX_DL_GOOD_QUALITY);
-    stats_size = vpx_svc_get_rc_stats_buffer_size(&svc_);
-    EXPECT_GT(stats_size, 0U);
-    stats_data = vpx_svc_get_rc_stats_buffer(&svc_);
-    ASSERT_TRUE(stats_data != NULL);
-    stats_buf->append(stats_data, stats_size);
+    ASSERT_EQ(VPX_CODEC_OK, res);
+    GetStatsData(stats_buf);
  
      ReleaseEncoder();
    }
@@ -127,20 +133,27 @@ class SvcTest : public ::testing::Test {
    void StoreFrames(const size_t max_frame_received,
                     struct vpx_fixed_buf *const outputs,
                     size_t *const frame_received) {
-    size_t frame_size;
-    while ((frame_size = vpx_svc_get_frame_size(&svc_)) > 0) {
-      ASSERT_LT(*frame_received, max_frame_received);
-
-      if (*frame_received == 0) {
-        EXPECT_EQ(1, vpx_svc_is_keyframe(&svc_));
+    vpx_codec_iter_t iter = NULL;
+    const vpx_codec_cx_pkt_t *cx_pkt;
+
+    while ((cx_pkt = vpx_codec_get_cx_data(&codec_, &iter)) != NULL) {
+      if (cx_pkt->kind == VPX_CODEC_CX_FRAME_PKT) {
+        const size_t frame_size = cx_pkt->data.frame.sz;
+
+        EXPECT_GT(frame_size, 0U);
+        ASSERT_TRUE(cx_pkt->data.frame.buf != NULL);
+        ASSERT_LT(*frame_received, max_frame_received);
+
+        if (*frame_received == 0)
+          EXPECT_EQ(1, !!(cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY));
+
+        outputs[*frame_received].buf = malloc(frame_size + 16);
+        ASSERT_TRUE(outputs[*frame_received].buf != NULL);
+        memcpy(outputs[*frame_received].buf, cx_pkt->data.frame.buf,
+               frame_size);
+        outputs[*frame_received].sz = frame_size;
+        ++(*frame_received);
        }
-
-      outputs[*frame_received].buf = malloc(frame_size + 16);
-      ASSERT_TRUE(outputs[*frame_received].buf != NULL);
-      memcpy(outputs[*frame_received].buf, vpx_svc_get_buffer(&svc_),
-             frame_size);
-      outputs[*frame_received].sz = frame_size;
-      ++(*frame_received);
      }
    }
  
diff --git a/test/test-data.sha1 b/test/test-data.sha1

index ee1997f..e6114ab 100644 (file)
--- a/test/test-data.sha1
+++ b/test/test-data.sha1
@@ -683,10 +683,10 @@ c12918cf0a716417fba2de35c3fc5ab90e52dfce  vp90-2-18-resize.ivf.md5
  717da707afcaa1f692ff1946f291054eb75a4f06  screendata.y4m
  b7c1296630cdf1a7ef493d15ff4f9eb2999202f6  invalid-vp90-2-08-tile_1x2_frame_parallel.webm.ivf.s47039_r01-05_b6-.ivf
  0a3884edb3fd8f9d9b500223e650f7de257b67d8  invalid-vp90-2-08-tile_1x2_frame_parallel.webm.ivf.s47039_r01-05_b6-.ivf.res
-fac89b5735be8a86b0dc05159f996a5c3208ae32  invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.ivf
-22e0ee8babe574722baf4ef6d7ff5d7cf80d386c  invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.ivf.res
-4506dfdcdf8ee4250924b075a0dcf1f070f72e5a  invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.ivf
-d3ea592c8d7b05d14c7ed48befc0a3aaf7709b7a  invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.ivf.res
+fac89b5735be8a86b0dc05159f996a5c3208ae32  invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.v2.ivf
+0a3884edb3fd8f9d9b500223e650f7de257b67d8  invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.v2.ivf.res
+4506dfdcdf8ee4250924b075a0dcf1f070f72e5a  invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.v2.ivf
+bcdedaf168ac225575468fda77502d2dc9fd5baa  invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.v2.ivf.res
  65e93f9653bcf65b022f7d225268d1a90a76e7bb  vp90-2-19-skip.webm
  368dccdde5288c13c25695d2eacdc7402cadf613  vp90-2-19-skip.webm.md5
  ffe460282df2b0e7d4603c2158653ad96f574b02  vp90-2-19-skip-01.webm
@@ -695,7 +695,9 @@ b03c408cf23158638da18dbc3323b99a1635c68a  invalid-vp90-2-12-droppable_1.ivf.s367
  0a3884edb3fd8f9d9b500223e650f7de257b67d8  invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf.res
  5e67e24e7f53fd189e565513cef8519b1bd6c712  invalid-vp90-2-05-resize.ivf.s59293_r01-05_b6-.ivf
  741158f67c0d9d23726624d06bdc482ad368afc9  invalid-vp90-2-05-resize.ivf.s59293_r01-05_b6-.ivf.res
-8b1f7bf7e86c0976d277f60e8fcd9539e75a079a  invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.ivf
-fb79dcbbbb8c82d5a750e339acce66e39a32f15f  invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.ivf.res
+8b1f7bf7e86c0976d277f60e8fcd9539e75a079a  invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.v2.ivf
+9c6bdf048fb2e66f07d4b4db5b32e6f303bd6109  invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.v2.ivf.res
  552e372e9b78127389fb06b34545df2cec15ba6d  invalid-vp91-2-mixedrefcsp-444to420.ivf
  a61774cf03fc584bd9f0904fc145253bb8ea6c4c  invalid-vp91-2-mixedrefcsp-444to420.ivf.res
+812d05a64a0d83c1b504d0519927ddc5a2cdb273  invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf
+1e472baaf5f6113459f0399a38a5a5e68d17799d  invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf.res
diff --git a/test/test.mk b/test/test.mk

index 6d2dbd6..b92b6da 100644 (file)
--- a/test/test.mk
+++ b/test/test.mk
@@ -811,14 +811,16 @@ LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-08-tile_1x2_frame_paral
  LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-08-tile_1x2_frame_parallel.webm.ivf.s47039_r01-05_b6-.ivf.res
  LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm
  LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-08-tile_1x4_frame_parallel_all_key.webm.res
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.ivf.res
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.ivf.res
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.ivf
-LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.v2.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-aq2.webm.ivf.s3984_r01-05_b6-.v2.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.v2.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s19552_r01-05_b6-.v2.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.v2.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-09-subpixel-00.ivf.s20492_r01-05_b6-.v2.ivf.res
  LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf
  LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s3676_r01-05_b6-.ivf.res
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp90-2-12-droppable_1.ivf.s73804_r01-05_b6-.ivf.res
  LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp91-2-mixedrefcsp-444to420.ivf
  LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += invalid-vp91-2-mixedrefcsp-444to420.ivf.res
  
diff --git a/vp8/encoder/denoising.c b/vp8/encoder/denoising.c

index d4f68a9..2f33d4a 100644 (file)
--- a/vp8/encoder/denoising.c
+++ b/vp8/encoder/denoising.c
@@ -68,6 +68,10 @@ int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride,
      int adj_val[3] = {3, 4, 6};
      int shift_inc1 = 0;
      int shift_inc2 = 1;
+    int col_sum[16] = {0, 0, 0, 0,
+                       0, 0, 0, 0,
+                       0, 0, 0, 0,
+                       0, 0, 0, 0};
      /* If motion_magnitude is small, making the denoiser more aggressive by
       * increasing the adjustment for each level. Add another increment for
       * blocks that are labeled for increase denoising. */
@@ -98,11 +102,11 @@ int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride,
              if (absdiff <= 3 + shift_inc1)
              {
                  running_avg_y[c] = mc_running_avg_y[c];
-                sum_diff += diff;
+                col_sum[c] += diff;
              }
              else
              {
-                if (absdiff >= 4 && absdiff <= 7)
+                if (absdiff >= 4 + shift_inc1 && absdiff <= 7)
                      adjustment = adj_val[0];
                  else if (absdiff >= 8 && absdiff <= 15)
                      adjustment = adj_val[1];
@@ -116,7 +120,7 @@ int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride,
                      else
                          running_avg_y[c] = sig[c] + adjustment;
  
-                    sum_diff += adjustment;
+                    col_sum[c] += adjustment;
                  }
                  else
                  {
@@ -125,7 +129,7 @@ int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride,
                      else
                          running_avg_y[c] = sig[c] - adjustment;
  
-                    sum_diff -= adjustment;
+                    col_sum[c] -= adjustment;
                  }
              }
          }
@@ -136,6 +140,23 @@ int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride,
          running_avg_y += avg_y_stride;
      }
  
+    for (c = 0; c < 16; ++c) {
+      // Below we clip the value in the same way which SSE code use.
+      // When adopting aggressive denoiser, the adj_val for each pixel
+      // could be at most 8 (this is current max adjustment of the map).
+      // In SSE code, we calculate the sum of adj_val for
+      // the columns, so the sum could be upto 128(16 rows). However,
+      // the range of the value is -128 ~ 127 in SSE code, that's why
+      // we do this change in C code.
+      // We don't do this for UV denoiser, since there are only 8 rows,
+      // and max adjustments <= 8, so the sum of the columns will not
+      // exceed 64.
+      if (col_sum[c] >= 128) {
+        col_sum[c] = 127;
+      }
+      sum_diff += col_sum[c];
+    }
+
      sum_diff_thresh= SUM_DIFF_THRESHOLD;
      if (increase_denoising) sum_diff_thresh = SUM_DIFF_THRESHOLD_HIGH;
      if (abs(sum_diff) > sum_diff_thresh) {
@@ -166,14 +187,14 @@ int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride,
                  running_avg_y[c] = 0;
                else
                  running_avg_y[c] = running_avg_y[c] - adjustment;
-              sum_diff -= adjustment;
+              col_sum[c] -= adjustment;
              } else if (diff < 0) {
                // Bring denoised signal up.
                if (running_avg_y[c] + adjustment > 255)
                  running_avg_y[c] = 255;
                else
                  running_avg_y[c] = running_avg_y[c] + adjustment;
-              sum_diff += adjustment;
+              col_sum[c] += adjustment;
              }
            }
            // TODO(marpan): Check here if abs(sum_diff) has gone below the
@@ -182,6 +203,15 @@ int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride,
            mc_running_avg_y += mc_avg_y_stride;
            running_avg_y += avg_y_stride;
          }
+
+        sum_diff = 0;
+        for (c = 0; c < 16; ++c) {
+          if (col_sum[c] >= 128) {
+            col_sum[c] = 127;
+          }
+          sum_diff += col_sum[c];
+        }
+
          if (abs(sum_diff) > sum_diff_thresh)
            return COPY_BLOCK;
        } else {
diff --git a/vp8/encoder/encodemb.c b/vp8/encoder/encodemb.c

index 7ed2fe1..eb0619d 100644 (file)
--- a/vp8/encoder/encodemb.c
+++ b/vp8/encoder/encodemb.c
@@ -258,13 +258,6 @@ static void optimize_b(MACROBLOCK *mb, int ib, int type,
      b = &mb->block[ib];
      d = &mb->e_mbd.block[ib];
  
-    /* Enable this to test the effect of RDO as a replacement for the dynamic
-     *  zero bin instead of an augmentation of it.
-     */
-#if 0
-    vp8_strict_quantize_b(b, d);
-#endif
-
      dequant_ptr = d->dequant;
      coeff_ptr = b->coeff;
      qcoeff_ptr = d->qcoeff;
diff --git a/vp8/encoder/quantize.c b/vp8/encoder/quantize.c

index fda997f..9953bd6 100644 (file)
--- a/vp8/encoder/quantize.c
+++ b/vp8/encoder/quantize.c
@@ -16,57 +16,6 @@
  #include "quantize.h"
  #include "vp8/common/quant_common.h"
  
-#define EXACT_QUANT
-
-#ifdef EXACT_FASTQUANT
-void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
-{
-    int i, rc, eob;
-    int zbin;
-    int x, y, z, sz;
-    short *coeff_ptr       = b->coeff;
-    short *zbin_ptr        = b->zbin;
-    short *round_ptr       = b->round;
-    short *quant_ptr       = b->quant_fast;
-    unsigned char *quant_shift_ptr = b->quant_shift;
-    short *qcoeff_ptr      = d->qcoeff;
-    short *dqcoeff_ptr     = d->dqcoeff;
-    short *dequant_ptr     = d->dequant;
-
-    vpx_memset(qcoeff_ptr, 0, 32);
-    vpx_memset(dqcoeff_ptr, 0, 32);
-
-    eob = -1;
-
-    for (i = 0; i < 16; i++)
-    {
-        rc   = vp8_default_zig_zag1d[i];
-        z    = coeff_ptr[rc];
-        zbin = zbin_ptr[rc] ;
-
-        sz = (z >> 31);                              /* sign of z */
-        x  = (z ^ sz) - sz;                          /* x = abs(z) */
-
-        if (x >= zbin)
-        {
-            x += round_ptr[rc];
-            y  = ((((x * quant_ptr[rc]) >> 16) + x)
-                 * quant_shift_ptr[rc]) >> 16;       /* quantize (x) */
-            x  = (y ^ sz) - sz;                      /* get the sign back */
-            qcoeff_ptr[rc] = x;                      /* write to destination */
-            dqcoeff_ptr[rc] = x * dequant_ptr[rc];   /* dequantized value */
-
-            if (y)
-            {
-                eob = i;                             /* last nonzero coeffs */
-            }
-        }
-    }
-    *d->eob = (char)(eob + 1);
-}
-
-#else
-
  void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
  {
      int i, rc, eob;
@@ -100,9 +49,6 @@ void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
      *d->eob = (char)(eob + 1);
  }
  
-#endif
-
-#ifdef EXACT_QUANT
  void vp8_regular_quantize_b_c(BLOCK *b, BLOCKD *d)
  {
      int i, rc, eob;
@@ -155,117 +101,6 @@ void vp8_regular_quantize_b_c(BLOCK *b, BLOCKD *d)
      *d->eob = (char)(eob + 1);
  }
  
-/* Perform regular quantization, with unbiased rounding and no zero bin. */
-void vp8_strict_quantize_b_c(BLOCK *b, BLOCKD *d)
-{
-    int i;
-    int rc;
-    int eob;
-    int x;
-    int y;
-    int z;
-    int sz;
-    short *coeff_ptr;
-    short *quant_ptr;
-    short *quant_shift_ptr;
-    short *qcoeff_ptr;
-    short *dqcoeff_ptr;
-    short *dequant_ptr;
-
-    coeff_ptr       = b->coeff;
-    quant_ptr       = b->quant;
-    quant_shift_ptr = b->quant_shift;
-    qcoeff_ptr      = d->qcoeff;
-    dqcoeff_ptr     = d->dqcoeff;
-    dequant_ptr     = d->dequant;
-    eob = - 1;
-    vpx_memset(qcoeff_ptr, 0, 32);
-    vpx_memset(dqcoeff_ptr, 0, 32);
-    for (i = 0; i < 16; i++)
-    {
-        int dq;
-        int rounding;
-
-        /*TODO: These arrays should be stored in zig-zag order.*/
-        rc = vp8_default_zig_zag1d[i];
-        z = coeff_ptr[rc];
-        dq = dequant_ptr[rc];
-        rounding = dq >> 1;
-        /* Sign of z. */
-        sz = -(z < 0);
-        x = (z + sz) ^ sz;
-        x += rounding;
-        if (x >= dq)
-        {
-            /* Quantize x. */
-            y  = ((((x * quant_ptr[rc]) >> 16) + x) * quant_shift_ptr[rc]) >> 16;
-            /* Put the sign back. */
-            x = (y + sz) ^ sz;
-            /* Save the coefficient and its dequantized value. */
-            qcoeff_ptr[rc] = x;
-            dqcoeff_ptr[rc] = x * dq;
-            /* Remember the last non-zero coefficient. */
-            if (y)
-                eob = i;
-        }
-    }
-
-    *d->eob = (char)(eob + 1);
-}
-
-#else
-
-void vp8_regular_quantize_b_c(BLOCK *b, BLOCKD *d)
-{
-    int i, rc, eob;
-    int zbin;
-    int x, y, z, sz;
-    short *zbin_boost_ptr = b->zrun_zbin_boost;
-    short *coeff_ptr      = b->coeff;
-    short *zbin_ptr       = b->zbin;
-    short *round_ptr      = b->round;
-    short *quant_ptr      = b->quant;
-    short *qcoeff_ptr     = d->qcoeff;
-    short *dqcoeff_ptr    = d->dqcoeff;
-    short *dequant_ptr    = d->dequant;
-    short zbin_oq_value   = b->zbin_extra;
-
-    vpx_memset(qcoeff_ptr, 0, 32);
-    vpx_memset(dqcoeff_ptr, 0, 32);
-
-    eob = -1;
-
-    for (i = 0; i < 16; i++)
-    {
-        rc   = vp8_default_zig_zag1d[i];
-        z    = coeff_ptr[rc];
-
-        zbin = zbin_ptr[rc] + *zbin_boost_ptr + zbin_oq_value;
-
-        zbin_boost_ptr ++;
-        sz = (z >> 31);                              /* sign of z */
-        x  = (z ^ sz) - sz;                          /* x = abs(z) */
-
-        if (x >= zbin)
-        {
-            y  = ((x + round_ptr[rc]) * quant_ptr[rc]) >> 16; /* quantize (x) */
-            x  = (y ^ sz) - sz;                      /* get the sign back */
-            qcoeff_ptr[rc]  = x;                     /* write to destination */
-            dqcoeff_ptr[rc] = x * dequant_ptr[rc];   /* dequantized value */
-
-            if (y)
-            {
-                eob = i;                             /* last nonzero coeffs */
-                zbin_boost_ptr = &b->zrun_zbin_boost[0]; /* reset zrl */
-            }
-        }
-    }
-
-    *d->eob = (char)(eob + 1);
-}
-
-#endif
-
  void vp8_quantize_mby_c(MACROBLOCK *x)
  {
      int i;
@@ -403,8 +238,6 @@ static const int qzbin_factors_y2[129] =
  };
  
  
-#define EXACT_QUANT
-#ifdef EXACT_QUANT
  static void invert_quant(int improved_quant, short *quant,
                           short *shift, short d)
  {
@@ -526,68 +359,6 @@ void vp8cx_init_quantizer(VP8_COMP *cpi)
          }
      }
  }
-#else
-void vp8cx_init_quantizer(VP8_COMP *cpi)
-{
-    int i;
-    int quant_val;
-    int Q;
-
-    int zbin_boost[16] = {0, 0, 8, 10, 12, 14, 16, 20, 24, 28, 32, 36, 40, 44, 44, 44};
-
-    for (Q = 0; Q < QINDEX_RANGE; Q++)
-    {
-        /* dc values */
-        quant_val = vp8_dc_quant(Q, cpi->common.y1dc_delta_q);
-        cpi->Y1quant[Q][0] = (1 << 16) / quant_val;
-        cpi->Y1zbin[Q][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
-        cpi->Y1round[Q][0] = (qrounding_factors[Q] * quant_val) >> 7;
-        cpi->common.Y1dequant[Q][0] = quant_val;
-        cpi->zrun_zbin_boost_y1[Q][0] = (quant_val * zbin_boost[0]) >> 7;
-
-        quant_val = vp8_dc2quant(Q, cpi->common.y2dc_delta_q);
-        cpi->Y2quant[Q][0] = (1 << 16) / quant_val;
-        cpi->Y2zbin[Q][0] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7;
-        cpi->Y2round[Q][0] = (qrounding_factors_y2[Q] * quant_val) >> 7;
-        cpi->common.Y2dequant[Q][0] = quant_val;
-        cpi->zrun_zbin_boost_y2[Q][0] = (quant_val * zbin_boost[0]) >> 7;
-
-        quant_val = vp8_dc_uv_quant(Q, cpi->common.uvdc_delta_q);
-        cpi->UVquant[Q][0] = (1 << 16) / quant_val;
-        cpi->UVzbin[Q][0] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;;
-        cpi->UVround[Q][0] = (qrounding_factors[Q] * quant_val) >> 7;
-        cpi->common.UVdequant[Q][0] = quant_val;
-        cpi->zrun_zbin_boost_uv[Q][0] = (quant_val * zbin_boost[0]) >> 7;
-
-        /* all the ac values = ; */
-        for (i = 1; i < 16; i++)
-        {
-            int rc = vp8_default_zig_zag1d[i];
-
-            quant_val = vp8_ac_yquant(Q);
-            cpi->Y1quant[Q][rc] = (1 << 16) / quant_val;
-            cpi->Y1zbin[Q][rc] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
-            cpi->Y1round[Q][rc] = (qrounding_factors[Q] * quant_val) >> 7;
-            cpi->common.Y1dequant[Q][rc] = quant_val;
-            cpi->zrun_zbin_boost_y1[Q][i] = (quant_val * zbin_boost[i]) >> 7;
-
-            quant_val = vp8_ac2quant(Q, cpi->common.y2ac_delta_q);
-            cpi->Y2quant[Q][rc] = (1 << 16) / quant_val;
-            cpi->Y2zbin[Q][rc] = ((qzbin_factors_y2[Q] * quant_val) + 64) >> 7;
-            cpi->Y2round[Q][rc] = (qrounding_factors_y2[Q] * quant_val) >> 7;
-            cpi->common.Y2dequant[Q][rc] = quant_val;
-            cpi->zrun_zbin_boost_y2[Q][i] = (quant_val * zbin_boost[i]) >> 7;
-
-            quant_val = vp8_ac_uv_quant(Q, cpi->common.uvac_delta_q);
-            cpi->UVquant[Q][rc] = (1 << 16) / quant_val;
-            cpi->UVzbin[Q][rc] = ((qzbin_factors[Q] * quant_val) + 64) >> 7;
-            cpi->UVround[Q][rc] = (qrounding_factors[Q] * quant_val) >> 7;
-            cpi->common.UVdequant[Q][rc] = quant_val;
-            cpi->zrun_zbin_boost_uv[Q][i] = (quant_val * zbin_boost[i]) >> 7;
-        }
-    }
-}
-#endif
  
  #define ZBIN_EXTRA_Y \
      (( cpi->common.Y1dequant[QIndex][1] *  \
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h

index 951e6e0..7d4ed95 100644 (file)
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -21,6 +21,7 @@
  #include "vp9/common/vp9_common_data.h"
  #include "vp9/common/vp9_enums.h"
  #include "vp9/common/vp9_filter.h"
+#include "vp9/common/vp9_idct.h"
  #include "vp9/common/vp9_mv.h"
  #include "vp9/common/vp9_scale.h"
  #include "vp9/common/vp9_seg_common.h"
@@ -176,7 +177,7 @@ struct buf_2d {
  };
  
  struct macroblockd_plane {
-  int16_t *dqcoeff;
+  tran_low_t *dqcoeff;
    PLANE_TYPE plane_type;
    int subsampling_x;
    int subsampling_y;
@@ -223,11 +224,17 @@ typedef struct macroblockd {
    /* mc buffer */
    DECLARE_ALIGNED(16, uint8_t, mc_buf[80 * 2 * 80 * 2]);
  
+#if CONFIG_VP9_HIGHBITDEPTH
+  /* Bit depth: 8, 10, 12 */
+  int bd;
+  DECLARE_ALIGNED(16, uint16_t, mc_buf_high[80 * 2 * 80 * 2]);
+#endif
+
    int lossless;
  
    int corrupted;
  
-  DECLARE_ALIGNED(16, int16_t, dqcoeff[MAX_MB_PLANE][64 * 64]);
+  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[MAX_MB_PLANE][64 * 64]);
  
    ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
    ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16];
diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c

index 856d41e..b196fc5 100644 (file)
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -18,14 +18,47 @@
  #include "vp9/common/vp9_common.h"
  #include "vp9/common/vp9_idct.h"
  
-void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {
+#if CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH
+// When CONFIG_EMULATE_HW_HIGHBITDEPTH is 1 the transform performs strict
+// overflow wrapping to match expected hardware implementations.
+// bd of 8 uses trans_low with 16bits, need to remove 16bits
+// bd of 10 uses trans_low with 18bits, need to remove 14bits
+// bd of 12 uses trans_low with 20bits, need to remove 12bits
+// bd of x uses trans_low with 8+x bits, need to remove 24-x bits
+#define WRAPLOW(x) ((((int32_t)x) << (24 - bd)) >> (24 - bd))
+#else
+#define WRAPLOW(x) (x)
+#endif  // CONFIG_EMULATE_HARDWARE_HIGHBITDEPTH
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE tran_low_t clamp_high(tran_high_t value, tran_low_t low,
+                                    tran_low_t high) {
+  return value < low ? low : (value > high ? high : value);
+}
+
+static INLINE tran_low_t clip_pixel_bd_high(tran_high_t dest,
+                                            tran_high_t trans, int bd) {
+  trans = WRAPLOW(trans);
+  switch (bd) {
+    case 8:
+    default:
+      return clamp_high(WRAPLOW(dest + trans), 0, 255);
+    case 10:
+      return clamp_high(WRAPLOW(dest + trans), 0, 1023);
+    case 12:
+      return clamp_high(WRAPLOW(dest + trans), 0, 4095);
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+void vp9_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
     0.5 shifts per pixel. */
    int i;
-  int16_t output[16];
-  int a1, b1, c1, d1, e1;
-  const int16_t *ip = input;
-  int16_t *op = output;
+  tran_low_t output[16];
+  tran_high_t a1, b1, c1, d1, e1;
+  const tran_low_t *ip = input;
+  tran_low_t *op = output;
  
    for (i = 0; i < 4; i++) {
      a1 = ip[0] >> UNIT_QUANT_SHIFT;
@@ -70,12 +103,12 @@ void vp9_iwht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {
    }
  }
  
-void vp9_iwht4x4_1_add_c(const int16_t *in, uint8_t *dest, int dest_stride) {
+void vp9_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest, int dest_stride) {
    int i;
-  int a1, e1;
-  int16_t tmp[4];
-  const int16_t *ip = in;
-  int16_t *op = tmp;
+  tran_high_t a1, e1;
+  tran_low_t tmp[4];
+  const tran_low_t *ip = in;
+  tran_low_t *op = tmp;
  
    a1 = ip[0] >> UNIT_QUANT_SHIFT;
    e1 = a1 >> 1;
@@ -96,9 +129,9 @@ void vp9_iwht4x4_1_add_c(const int16_t *in, uint8_t *dest, int dest_stride) {
    }
  }
  
-static void idct4(const int16_t *input, int16_t *output) {
-  int16_t step[4];
-  int temp1, temp2;
+static void idct4(const tran_low_t *input, tran_low_t *output) {
+  tran_low_t step[4];
+  tran_high_t temp1, temp2;
    // stage 1
    temp1 = (input[0] + input[2]) * cospi_16_64;
    temp2 = (input[0] - input[2]) * cospi_16_64;
@@ -116,11 +149,11 @@ static void idct4(const int16_t *input, int16_t *output) {
    output[3] = step[0] - step[3];
  }
  
-void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {
-  int16_t out[4 * 4];
-  int16_t *outptr = out;
+void vp9_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+  tran_low_t out[4 * 4];
+  tran_low_t *outptr = out;
    int i, j;
-  int16_t temp_in[4], temp_out[4];
+  tran_low_t temp_in[4], temp_out[4];
  
    // Rows
    for (i = 0; i < 4; ++i) {
@@ -140,10 +173,11 @@ void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride) {
    }
  }
  
-void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride) {
+void vp9_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest,
+                         int dest_stride) {
    int i;
-  int a1;
-  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+  tran_high_t a1;
+  tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64);
    out = dct_const_round_shift(out * cospi_16_64);
    a1 = ROUND_POWER_OF_TWO(out, 4);
  
@@ -156,9 +190,9 @@ void vp9_idct4x4_1_add_c(const int16_t *input, uint8_t *dest, int dest_stride) {
    }
  }
  
-static void idct8(const int16_t *input, int16_t *output) {
-  int16_t step1[8], step2[8];
-  int temp1, temp2;
+static void idct8(const tran_low_t *input, tran_low_t *output) {
+  tran_low_t step1[8], step2[8];
+  tran_high_t temp1, temp2;
    // stage 1
    step1[0] = input[0];
    step1[2] = input[4];
@@ -201,11 +235,11 @@ static void idct8(const int16_t *input, int16_t *output) {
    output[7] = step1[0] - step1[7];
  }
  
-void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride) {
-  int16_t out[8 * 8];
-  int16_t *outptr = out;
+void vp9_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+  tran_low_t out[8 * 8];
+  tran_low_t *outptr = out;
    int i, j;
-  int16_t temp_in[8], temp_out[8];
+  tran_low_t temp_in[8], temp_out[8];
  
    // First transform rows
    for (i = 0; i < 8; ++i) {
@@ -225,10 +259,10 @@ void vp9_idct8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride) {
    }
  }
  
-void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
    int i, j;
-  int a1;
-  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+  tran_high_t a1;
+  tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64);
    out = dct_const_round_shift(out * cospi_16_64);
    a1 = ROUND_POWER_OF_TWO(out, 5);
    for (j = 0; j < 8; ++j) {
@@ -238,13 +272,13 @@ void vp9_idct8x8_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
    }
  }
  
-static void iadst4(const int16_t *input, int16_t *output) {
-  int s0, s1, s2, s3, s4, s5, s6, s7;
+static void iadst4(const tran_low_t *input, tran_low_t *output) {
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
  
-  int x0 = input[0];
-  int x1 = input[1];
-  int x2 = input[2];
-  int x3 = input[3];
+  tran_high_t x0 = input[0];
+  tran_high_t x1 = input[1];
+  tran_high_t x2 = input[2];
+  tran_high_t x3 = input[3];
  
    if (!(x0 | x1 | x2 | x3)) {
      output[0] = output[1] = output[2] = output[3] = 0;
@@ -280,7 +314,7 @@ static void iadst4(const int16_t *input, int16_t *output) {
    output[3] = dct_const_round_shift(s3);
  }
  
-void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride,
+void vp9_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                           int tx_type) {
    const transform_2d IHT_4[] = {
      { idct4, idct4  },  // DCT_DCT  = 0
@@ -290,9 +324,9 @@ void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride,
    };
  
    int i, j;
-  int16_t out[4 * 4];
-  int16_t *outptr = out;
-  int16_t temp_in[4], temp_out[4];
+  tran_low_t out[4 * 4];
+  tran_low_t *outptr = out;
+  tran_low_t temp_in[4], temp_out[4];
  
    // inverse transform row vectors
    for (i = 0; i < 4; ++i) {
@@ -311,17 +345,17 @@ void vp9_iht4x4_16_add_c(const int16_t *input, uint8_t *dest, int stride,
                                    + dest[j * stride + i]);
    }
  }
-static void iadst8(const int16_t *input, int16_t *output) {
+static void iadst8(const tran_low_t *input, tran_low_t *output) {
    int s0, s1, s2, s3, s4, s5, s6, s7;
  
-  int x0 = input[7];
-  int x1 = input[0];
-  int x2 = input[5];
-  int x3 = input[2];
-  int x4 = input[3];
-  int x5 = input[4];
-  int x6 = input[1];
-  int x7 = input[6];
+  tran_high_t x0 = input[7];
+  tran_high_t x1 = input[0];
+  tran_high_t x2 = input[5];
+  tran_high_t x3 = input[2];
+  tran_high_t x4 = input[3];
+  tran_high_t x5 = input[4];
+  tran_high_t x6 = input[1];
+  tran_high_t x7 = input[6];
  
    if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
      output[0] = output[1] = output[2] = output[3] = output[4]
@@ -395,12 +429,12 @@ static const transform_2d IHT_8[] = {
    { iadst8, iadst8 }   // ADST_ADST = 3
  };
  
-void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride,
+void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                           int tx_type) {
    int i, j;
-  int16_t out[8 * 8];
-  int16_t *outptr = out;
-  int16_t temp_in[8], temp_out[8];
+  tran_low_t out[8 * 8];
+  tran_low_t *outptr = out;
+  tran_low_t temp_in[8], temp_out[8];
    const transform_2d ht = IHT_8[tx_type];
  
    // inverse transform row vectors
@@ -421,11 +455,11 @@ void vp9_iht8x8_64_add_c(const int16_t *input, uint8_t *dest, int stride,
    }
  }
  
-void vp9_idct8x8_12_add_c(const int16_t *input, uint8_t *dest, int stride) {
-  int16_t out[8 * 8] = { 0 };
-  int16_t *outptr = out;
+void vp9_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
+  tran_low_t out[8 * 8] = { 0 };
+  tran_low_t *outptr = out;
    int i, j;
-  int16_t temp_in[8], temp_out[8];
+  tran_low_t temp_in[8], temp_out[8];
  
    // First transform rows
    // only first 4 row has non-zero coefs
@@ -446,9 +480,9 @@ void vp9_idct8x8_12_add_c(const int16_t *input, uint8_t *dest, int stride) {
    }
  }
  
-static void idct16(const int16_t *input, int16_t *output) {
-  int16_t step1[16], step2[16];
-  int temp1, temp2;
+static void idct16(const tran_low_t *input, tran_low_t *output) {
+  tran_low_t step1[16], step2[16];
+  tran_high_t temp1, temp2;
  
    // stage 1
    step1[0] = input[0/2];
@@ -611,11 +645,12 @@ static void idct16(const int16_t *input, int16_t *output) {
    output[15] = step2[0] - step2[15];
  }
  
-void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride) {
-  int16_t out[16 * 16];
-  int16_t *outptr = out;
+void vp9_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
+                             int stride) {
+  tran_low_t out[16 * 16];
+  tran_low_t *outptr = out;
    int i, j;
-  int16_t temp_in[16], temp_out[16];
+  tran_low_t temp_in[16], temp_out[16];
  
    // First transform rows
    for (i = 0; i < 16; ++i) {
@@ -635,25 +670,26 @@ void vp9_idct16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride) {
    }
  }
  
-static void iadst16(const int16_t *input, int16_t *output) {
-  int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
-
-  int x0 = input[15];
-  int x1 = input[0];
-  int x2 = input[13];
-  int x3 = input[2];
-  int x4 = input[11];
-  int x5 = input[4];
-  int x6 = input[9];
-  int x7 = input[6];
-  int x8 = input[7];
-  int x9 = input[8];
-  int x10 = input[5];
-  int x11 = input[10];
-  int x12 = input[3];
-  int x13 = input[12];
-  int x14 = input[1];
-  int x15 = input[14];
+static void iadst16(const tran_low_t *input, tran_low_t *output) {
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
+  tran_high_t s9, s10, s11, s12, s13, s14, s15;
+
+  tran_high_t x0 = input[15];
+  tran_high_t x1 = input[0];
+  tran_high_t x2 = input[13];
+  tran_high_t x3 = input[2];
+  tran_high_t x4 = input[11];
+  tran_high_t x5 = input[4];
+  tran_high_t x6 = input[9];
+  tran_high_t x7 = input[6];
+  tran_high_t x8 = input[7];
+  tran_high_t x9 = input[8];
+  tran_high_t x10 = input[5];
+  tran_high_t x11 = input[10];
+  tran_high_t x12 = input[3];
+  tran_high_t x13 = input[12];
+  tran_high_t x14 = input[1];
+  tran_high_t x15 = input[14];
  
    if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
             | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
@@ -813,12 +849,12 @@ static const transform_2d IHT_16[] = {
    { iadst16, iadst16 }   // ADST_ADST = 3
  };
  
-void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride,
+void vp9_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                              int tx_type) {
    int i, j;
-  int16_t out[16 * 16];
-  int16_t *outptr = out;
-  int16_t temp_in[16], temp_out[16];
+  tran_low_t out[16 * 16];
+  tran_low_t *outptr = out;
+  tran_low_t temp_in[16], temp_out[16];
    const transform_2d ht = IHT_16[tx_type];
  
    // Rows
@@ -839,11 +875,12 @@ void vp9_iht16x16_256_add_c(const int16_t *input, uint8_t *dest, int stride,
    }
  }
  
-void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
-  int16_t out[16 * 16] = { 0 };
-  int16_t *outptr = out;
+void vp9_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
+                            int stride) {
+  tran_low_t out[16 * 16] = { 0 };
+  tran_low_t *outptr = out;
    int i, j;
-  int16_t temp_in[16], temp_out[16];
+  tran_low_t temp_in[16], temp_out[16];
  
    // First transform rows. Since all non-zero dct coefficients are in
    // upper-left 4x4 area, we only need to calculate first 4 rows here.
@@ -864,10 +901,10 @@ void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
    }
  }
  
-void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
    int i, j;
-  int a1;
-  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+  tran_high_t a1;
+  tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64);
    out = dct_const_round_shift(out * cospi_16_64);
    a1 = ROUND_POWER_OF_TWO(out, 6);
    for (j = 0; j < 16; ++j) {
@@ -877,9 +914,9 @@ void vp9_idct16x16_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
    }
  }
  
-static void idct32(const int16_t *input, int16_t *output) {
-  int16_t step1[32], step2[32];
-  int temp1, temp2;
+static void idct32(const tran_low_t *input, tran_low_t *output) {
+  tran_low_t step1[32], step2[32];
+  tran_high_t temp1, temp2;
  
    // stage 1
    step1[0] = input[0];
@@ -1244,11 +1281,12 @@ static void idct32(const int16_t *input, int16_t *output) {
    output[31] = step1[0] - step1[31];
  }
  
-void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) {
-  int16_t out[32 * 32];
-  int16_t *outptr = out;
+void vp9_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
+                              int stride) {
+  tran_low_t out[32 * 32];
+  tran_low_t *outptr = out;
    int i, j;
-  int16_t temp_in[32], temp_out[32];
+  tran_low_t temp_in[32], temp_out[32];
  
    // Rows
    for (i = 0; i < 32; ++i) {
@@ -1265,7 +1303,7 @@ void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) {
      if (zero_coeff[0] | zero_coeff[1])
        idct32(input, outptr);
      else
-      vpx_memset(outptr, 0, sizeof(int16_t) * 32);
+      vpx_memset(outptr, 0, sizeof(tran_low_t) * 32);
      input += 32;
      outptr += 32;
    }
@@ -1281,11 +1319,12 @@ void vp9_idct32x32_1024_add_c(const int16_t *input, uint8_t *dest, int stride) {
    }
  }
  
-void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int stride) {
-  int16_t out[32 * 32] = {0};
-  int16_t *outptr = out;
+void vp9_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
+                            int stride) {
+  tran_low_t out[32 * 32] = {0};
+  tran_low_t *outptr = out;
    int i, j;
-  int16_t temp_in[32], temp_out[32];
+  tran_low_t temp_in[32], temp_out[32];
  
    // Rows
    // only upper-left 8x8 has non-zero coeff
@@ -1306,11 +1345,11 @@ void vp9_idct32x32_34_add_c(const int16_t *input, uint8_t *dest, int stride) {
    }
  }
  
-void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
+void vp9_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
    int i, j;
-  int a1;
+  tran_high_t a1;
  
-  int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
+  tran_low_t out = dct_const_round_shift(input[0] * cospi_16_64);
    out = dct_const_round_shift(out * cospi_16_64);
    a1 = ROUND_POWER_OF_TWO(out, 6);
  
@@ -1322,7 +1361,8 @@ void vp9_idct32x32_1_add_c(const int16_t *input, uint8_t *dest, int stride) {
  }
  
  // idct
-void vp9_idct4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
+void vp9_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                     int eob) {
    if (eob > 1)
      vp9_idct4x4_16_add(input, dest, stride);
    else
@@ -1330,14 +1370,16 @@ void vp9_idct4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
  }
  
  
-void vp9_iwht4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
+void vp9_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                     int eob) {
    if (eob > 1)
      vp9_iwht4x4_16_add(input, dest, stride);
    else
      vp9_iwht4x4_1_add(input, dest, stride);
  }
  
-void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
+void vp9_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
+                     int eob) {
    // If dc is 1, then input[0] is the reconstructed value, do not need
    // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.
  
@@ -1354,7 +1396,7 @@ void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob) {
      vp9_idct8x8_64_add(input, dest, stride);
  }
  
-void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride,
+void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride,
                         int eob) {
    /* The calculation can be simplified if there are not many non-zero dct
     * coefficients. Use eobs to separate different cases. */
@@ -1367,7 +1409,7 @@ void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride,
      vp9_idct16x16_256_add(input, dest, stride);
  }
  
-void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride,
+void vp9_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
                         int eob) {
    if (eob == 1)
      vp9_idct32x32_1_add(input, dest, stride);
@@ -1379,7 +1421,7 @@ void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride,
  }
  
  // iht
-void vp9_iht4x4_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+void vp9_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
                      int stride, int eob) {
    if (tx_type == DCT_DCT)
      vp9_idct4x4_add(input, dest, stride, eob);
@@ -1387,7 +1429,7 @@ void vp9_iht4x4_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
      vp9_iht4x4_16_add(input, dest, stride, tx_type);
  }
  
-void vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+void vp9_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
                      int stride, int eob) {
    if (tx_type == DCT_DCT) {
      vp9_idct8x8_add(input, dest, stride, eob);
@@ -1396,7 +1438,7 @@ void vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
    }
  }
  
-void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
                        int stride, int eob) {
    if (tx_type == DCT_DCT) {
      vp9_idct16x16_add(input, dest, stride, eob);
@@ -1404,3 +1446,1433 @@ void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
      vp9_iht16x16_256_add(input, dest, stride, tx_type);
    }
  }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_high_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
+                               int stride, int bd) {
+  /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
+     0.5 shifts per pixel. */
+  int i;
+  tran_low_t output[16];
+  tran_high_t a1, b1, c1, d1, e1;
+  const tran_low_t *ip = input;
+  tran_low_t *op = output;
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  for (i = 0; i < 4; i++) {
+    a1 = ip[0] >> UNIT_QUANT_SHIFT;
+    c1 = ip[1] >> UNIT_QUANT_SHIFT;
+    d1 = ip[2] >> UNIT_QUANT_SHIFT;
+    b1 = ip[3] >> UNIT_QUANT_SHIFT;
+    a1 += c1;
+    d1 -= b1;
+    e1 = (a1 - d1) >> 1;
+    b1 = e1 - b1;
+    c1 = e1 - c1;
+    a1 -= b1;
+    d1 += c1;
+    op[0] = WRAPLOW(a1);
+    op[1] = WRAPLOW(b1);
+    op[2] = WRAPLOW(c1);
+    op[3] = WRAPLOW(d1);
+    ip += 4;
+    op += 4;
+  }
+
+  ip = output;
+  for (i = 0; i < 4; i++) {
+    a1 = ip[4 * 0];
+    c1 = ip[4 * 1];
+    d1 = ip[4 * 2];
+    b1 = ip[4 * 3];
+    a1 += c1;
+    d1 -= b1;
+    e1 = (a1 - d1) >> 1;
+    b1 = e1 - b1;
+    c1 = e1 - c1;
+    a1 -= b1;
+    d1 += c1;
+    dest[stride * 0] = clip_pixel_bd_high(dest[stride * 0], a1, bd);
+    dest[stride * 1] = clip_pixel_bd_high(dest[stride * 1], b1, bd);
+    dest[stride * 2] = clip_pixel_bd_high(dest[stride * 2], c1, bd);
+    dest[stride * 3] = clip_pixel_bd_high(dest[stride * 3], d1, bd);
+
+    ip++;
+    dest++;
+  }
+}
+
+static void high_idct4(const tran_low_t *input, tran_low_t *output, int bd) {
+  tran_low_t step[4];
+  tran_high_t temp1, temp2;
+  (void) bd;
+  // stage 1
+  temp1 = (input[0] + input[2]) * cospi_16_64;
+  temp2 = (input[0] - input[2]) * cospi_16_64;
+  step[0] = WRAPLOW(dct_const_round_shift(temp1));
+  step[1] = WRAPLOW(dct_const_round_shift(temp2));
+  temp1 = input[1] * cospi_24_64 - input[3] * cospi_8_64;
+  temp2 = input[1] * cospi_8_64 + input[3] * cospi_24_64;
+  step[2] = WRAPLOW(dct_const_round_shift(temp1));
+  step[3] = WRAPLOW(dct_const_round_shift(temp2));
+
+  // stage 2
+  output[0] = WRAPLOW(step[0] + step[3]);
+  output[1] = WRAPLOW(step[1] + step[2]);
+  output[2] = WRAPLOW(step[1] - step[2]);
+  output[3] = WRAPLOW(step[0] - step[3]);
+}
+
+void vp9_high_iwht4x4_1_add_c(const tran_low_t *in, uint8_t *dest8,
+                              int dest_stride, int bd) {
+  int i;
+  tran_high_t a1, e1;
+  tran_low_t tmp[4];
+  const tran_low_t *ip = in;
+  tran_low_t *op = tmp;
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  (void) bd;
+
+  a1 = ip[0] >> UNIT_QUANT_SHIFT;
+  e1 = a1 >> 1;
+  a1 -= e1;
+  op[0] = WRAPLOW(a1);
+  op[1] = op[2] = op[3] = WRAPLOW(e1);
+
+  ip = tmp;
+  for (i = 0; i < 4; i++) {
+    e1 = ip[0] >> 1;
+    a1 = ip[0] - e1;
+    dest[dest_stride * 0] = clip_pixel_bd_high(dest[dest_stride * 0], a1, bd);
+    dest[dest_stride * 1] = clip_pixel_bd_high(dest[dest_stride * 1], e1, bd);
+    dest[dest_stride * 2] = clip_pixel_bd_high(dest[dest_stride * 2], e1, bd);
+    dest[dest_stride * 3] = clip_pixel_bd_high(dest[dest_stride * 3], e1, bd);
+    ip++;
+    dest++;
+  }
+}
+
+void vp9_high_idct4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
+                               int stride, int bd) {
+  tran_low_t out[4 * 4];
+  tran_low_t *outptr = out;
+  int i, j;
+  tran_low_t temp_in[4], temp_out[4];
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  // Rows
+  for (i = 0; i < 4; ++i) {
+    high_idct4(input, outptr, bd);
+    input += 4;
+    outptr += 4;
+  }
+
+  // Columns
+  for (i = 0; i < 4; ++i) {
+    for (j = 0; j < 4; ++j)
+      temp_in[j] = out[j * 4 + i];
+    high_idct4(temp_in, temp_out, bd);
+    for (j = 0; j < 4; ++j)
+      dest[j * stride + i] = clip_pixel_bd_high(
+          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
+  }
+}
+
+void vp9_high_idct4x4_1_add_c(const tran_low_t *input, uint8_t *dest8,
+                              int dest_stride, int bd) {
+  int i;
+  tran_high_t a1;
+  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
+  a1 = ROUND_POWER_OF_TWO(out, 4);
+
+  for (i = 0; i < 4; i++) {
+    dest[0] = clip_pixel_bd_high(dest[0], a1, bd);
+    dest[1] = clip_pixel_bd_high(dest[1], a1, bd);
+    dest[2] = clip_pixel_bd_high(dest[2], a1, bd);
+    dest[3] = clip_pixel_bd_high(dest[3], a1, bd);
+    dest += dest_stride;
+  }
+}
+
+static void high_idct8(const tran_low_t *input, tran_low_t *output, int bd) {
+  tran_low_t step1[8], step2[8];
+  tran_high_t temp1, temp2;
+  // stage 1
+  step1[0] = input[0];
+  step1[2] = input[4];
+  step1[1] = input[2];
+  step1[3] = input[6];
+  temp1 = input[1] * cospi_28_64 - input[7] * cospi_4_64;
+  temp2 = input[1] * cospi_4_64 + input[7] * cospi_28_64;
+  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
+  temp1 = input[5] * cospi_12_64 - input[3] * cospi_20_64;
+  temp2 = input[5] * cospi_20_64 + input[3] * cospi_12_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+
+  // stage 2 & stage 3 - even half
+  high_idct4(step1, step1, bd);
+
+  // stage 2 - odd half
+  step2[4] = WRAPLOW(step1[4] + step1[5]);
+  step2[5] = WRAPLOW(step1[4] - step1[5]);
+  step2[6] = WRAPLOW(-step1[6] + step1[7]);
+  step2[7] = WRAPLOW(step1[6] + step1[7]);
+
+  // stage 3 - odd half
+  step1[4] = step2[4];
+  temp1 = (step2[6] - step2[5]) * cospi_16_64;
+  temp2 = (step2[5] + step2[6]) * cospi_16_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+  step1[7] = step2[7];
+
+  // stage 4
+  output[0] = WRAPLOW(step1[0] + step1[7]);
+  output[1] = WRAPLOW(step1[1] + step1[6]);
+  output[2] = WRAPLOW(step1[2] + step1[5]);
+  output[3] = WRAPLOW(step1[3] + step1[4]);
+  output[4] = WRAPLOW(step1[3] - step1[4]);
+  output[5] = WRAPLOW(step1[2] - step1[5]);
+  output[6] = WRAPLOW(step1[1] - step1[6]);
+  output[7] = WRAPLOW(step1[0] - step1[7]);
+}
+
+void vp9_high_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
+                               int stride, int bd) {
+  tran_low_t out[8 * 8];
+  tran_low_t *outptr = out;
+  int i, j;
+  tran_low_t temp_in[8], temp_out[8];
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  // First transform rows.
+  for (i = 0; i < 8; ++i) {
+    high_idct8(input, outptr, bd);
+    input += 8;
+    outptr += 8;
+  }
+
+  // Then transform columns.
+  for (i = 0; i < 8; ++i) {
+    for (j = 0; j < 8; ++j)
+      temp_in[j] = out[j * 8 + i];
+    high_idct8(temp_in, temp_out, bd);
+    for (j = 0; j < 8; ++j)
+      dest[j * stride + i] = clip_pixel_bd_high(dest[j * stride + i],
+                                        ROUND_POWER_OF_TWO(temp_out[j], 5),
+                                        bd);
+  }
+}
+
+void vp9_high_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest8,
+                              int stride, int bd) {
+  int i, j;
+  tran_high_t a1;
+  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
+  a1 = ROUND_POWER_OF_TWO(out, 5);
+  for (j = 0; j < 8; ++j) {
+    for (i = 0; i < 8; ++i)
+      dest[i] = clip_pixel_bd_high(dest[i], a1, bd);
+    dest += stride;
+  }
+}
+
+static void high_iadst4(const tran_low_t *input, tran_low_t *output, int bd) {
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+  tran_high_t x0 = input[0];
+  tran_high_t x1 = input[1];
+  tran_high_t x2 = input[2];
+  tran_high_t x3 = input[3];
+  (void) bd;
+
+  if (!(x0 | x1 | x2 | x3)) {
+    vpx_memset(output, 0, 4 * sizeof(*output));
+    return;
+  }
+
+  s0 = sinpi_1_9 * x0;
+  s1 = sinpi_2_9 * x0;
+  s2 = sinpi_3_9 * x1;
+  s3 = sinpi_4_9 * x2;
+  s4 = sinpi_1_9 * x2;
+  s5 = sinpi_2_9 * x3;
+  s6 = sinpi_4_9 * x3;
+  s7 = x0 - x2 + x3;
+
+  x0 = s0 + s3 + s5;
+  x1 = s1 - s4 - s6;
+  x2 = sinpi_3_9 * s7;
+  x3 = s2;
+
+  s0 = x0 + x3;
+  s1 = x1 + x3;
+  s2 = x2;
+  s3 = x0 + x1 - x3;
+
+  // 1-D transform scaling factor is sqrt(2).
+  // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
+  // + 1b (addition) = 29b.
+  // Hence the output bit depth is 15b.
+  output[0] = WRAPLOW(dct_const_round_shift(s0));
+  output[1] = WRAPLOW(dct_const_round_shift(s1));
+  output[2] = WRAPLOW(dct_const_round_shift(s2));
+  output[3] = WRAPLOW(dct_const_round_shift(s3));
+}
+
+void vp9_high_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
+                              int stride, int tx_type, int bd) {
+  const high_transform_2d IHT_4[] = {
+    { high_idct4, high_idct4  },    // DCT_DCT  = 0
+    { high_iadst4, high_idct4 },    // ADST_DCT = 1
+    { high_idct4, high_iadst4 },    // DCT_ADST = 2
+    { high_iadst4, high_iadst4 }    // ADST_ADST = 3
+  };
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  int i, j;
+  tran_low_t out[4 * 4];
+  tran_low_t *outptr = out;
+  tran_low_t temp_in[4], temp_out[4];
+
+  // Inverse transform row vectors.
+  for (i = 0; i < 4; ++i) {
+    IHT_4[tx_type].rows(input, outptr, bd);
+    input  += 4;
+    outptr += 4;
+  }
+
+  // Inverse transform column vectors.
+  for (i = 0; i < 4; ++i) {
+    for (j = 0; j < 4; ++j)
+      temp_in[j] = out[j * 4 + i];
+    IHT_4[tx_type].cols(temp_in, temp_out, bd);
+    for (j = 0; j < 4; ++j)
+      dest[j * stride + i] = clip_pixel_bd_high(
+          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
+  }
+}
+
+static void high_iadst8(const tran_low_t *input, tran_low_t *output, int bd) {
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
+
+  tran_high_t x0 = input[7];
+  tran_high_t x1 = input[0];
+  tran_high_t x2 = input[5];
+  tran_high_t x3 = input[2];
+  tran_high_t x4 = input[3];
+  tran_high_t x5 = input[4];
+  tran_high_t x6 = input[1];
+  tran_high_t x7 = input[6];
+  (void) bd;
+
+  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7)) {
+    vpx_memset(output, 0, 8 * sizeof(*output));
+    return;
+  }
+
+  // stage 1
+  s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
+  s1 = cospi_30_64 * x0 - cospi_2_64  * x1;
+  s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
+  s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
+  s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
+  s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
+  s6 = cospi_26_64 * x6 + cospi_6_64  * x7;
+  s7 = cospi_6_64  * x6 - cospi_26_64 * x7;
+
+  x0 = WRAPLOW(dct_const_round_shift(s0 + s4));
+  x1 = WRAPLOW(dct_const_round_shift(s1 + s5));
+  x2 = WRAPLOW(dct_const_round_shift(s2 + s6));
+  x3 = WRAPLOW(dct_const_round_shift(s3 + s7));
+  x4 = WRAPLOW(dct_const_round_shift(s0 - s4));
+  x5 = WRAPLOW(dct_const_round_shift(s1 - s5));
+  x6 = WRAPLOW(dct_const_round_shift(s2 - s6));
+  x7 = WRAPLOW(dct_const_round_shift(s3 - s7));
+
+  // stage 2
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 =  cospi_8_64  * x4 + cospi_24_64 * x5;
+  s5 =  cospi_24_64 * x4 - cospi_8_64  * x5;
+  s6 = -cospi_24_64 * x6 + cospi_8_64  * x7;
+  s7 =  cospi_8_64  * x6 + cospi_24_64 * x7;
+
+  x0 = s0 + s2;
+  x1 = s1 + s3;
+  x2 = s0 - s2;
+  x3 = s1 - s3;
+  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
+  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
+  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
+  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
+
+  // stage 3
+  s2 = cospi_16_64 * (x2 + x3);
+  s3 = cospi_16_64 * (x2 - x3);
+  s6 = cospi_16_64 * (x6 + x7);
+  s7 = cospi_16_64 * (x6 - x7);
+
+  x2 = WRAPLOW(dct_const_round_shift(s2));
+  x3 = WRAPLOW(dct_const_round_shift(s3));
+  x6 = WRAPLOW(dct_const_round_shift(s6));
+  x7 = WRAPLOW(dct_const_round_shift(s7));
+
+  output[0] = WRAPLOW(x0);
+  output[1] = WRAPLOW(-x4);
+  output[2] = WRAPLOW(x6);
+  output[3] = WRAPLOW(-x2);
+  output[4] = WRAPLOW(x3);
+  output[5] = WRAPLOW(-x7);
+  output[6] = WRAPLOW(x5);
+  output[7] = WRAPLOW(-x1);
+}
+
+static const high_transform_2d HIGH_IHT_8[] = {
+  { high_idct8,  high_idct8  },  // DCT_DCT  = 0
+  { high_iadst8, high_idct8  },  // ADST_DCT = 1
+  { high_idct8,  high_iadst8 },  // DCT_ADST = 2
+  { high_iadst8, high_iadst8 }   // ADST_ADST = 3
+};
+
+void vp9_high_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
+                              int stride, int tx_type, int bd) {
+  int i, j;
+  tran_low_t out[8 * 8];
+  tran_low_t *outptr = out;
+  tran_low_t temp_in[8], temp_out[8];
+  const high_transform_2d ht = HIGH_IHT_8[tx_type];
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  // Inverse transform row vectors.
+  for (i = 0; i < 8; ++i) {
+    ht.rows(input, outptr, bd);
+    input += 8;
+    outptr += 8;
+  }
+
+  // Inverse transform column vectors.
+  for (i = 0; i < 8; ++i) {
+    for (j = 0; j < 8; ++j)
+      temp_in[j] = out[j * 8 + i];
+    ht.cols(temp_in, temp_out, bd);
+    for (j = 0; j < 8; ++j)
+      dest[j * stride + i] = clip_pixel_bd_high(
+          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+  }
+}
+
+void vp9_high_idct8x8_10_add_c(const tran_low_t *input, uint8_t *dest8,
+                               int stride, int bd) {
+  tran_low_t out[8 * 8] = { 0 };
+  tran_low_t *outptr = out;
+  int i, j;
+  tran_low_t temp_in[8], temp_out[8];
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  // First transform rows.
+  // Only first 4 row has non-zero coefs.
+  for (i = 0; i < 4; ++i) {
+    high_idct8(input, outptr, bd);
+    input += 8;
+    outptr += 8;
+  }
+  // Then transform columns.
+  for (i = 0; i < 8; ++i) {
+    for (j = 0; j < 8; ++j)
+      temp_in[j] = out[j * 8 + i];
+    high_idct8(temp_in, temp_out, bd);
+    for (j = 0; j < 8; ++j)
+      dest[j * stride + i] = clip_pixel_bd_high(
+          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+  }
+}
+
+static void high_idct16(const tran_low_t *input, tran_low_t *output, int bd) {
+  tran_low_t step1[16], step2[16];
+  tran_high_t temp1, temp2;
+  (void) bd;
+
+  // stage 1
+  step1[0] = input[0/2];
+  step1[1] = input[16/2];
+  step1[2] = input[8/2];
+  step1[3] = input[24/2];
+  step1[4] = input[4/2];
+  step1[5] = input[20/2];
+  step1[6] = input[12/2];
+  step1[7] = input[28/2];
+  step1[8] = input[2/2];
+  step1[9] = input[18/2];
+  step1[10] = input[10/2];
+  step1[11] = input[26/2];
+  step1[12] = input[6/2];
+  step1[13] = input[22/2];
+  step1[14] = input[14/2];
+  step1[15] = input[30/2];
+
+  // stage 2
+  step2[0] = step1[0];
+  step2[1] = step1[1];
+  step2[2] = step1[2];
+  step2[3] = step1[3];
+  step2[4] = step1[4];
+  step2[5] = step1[5];
+  step2[6] = step1[6];
+  step2[7] = step1[7];
+
+  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
+  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+  step2[8] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[15] = WRAPLOW(dct_const_round_shift(temp2));
+
+  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
+  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
+
+  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
+  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
+
+  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
+  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
+
+  // stage 3
+  step1[0] = step2[0];
+  step1[1] = step2[1];
+  step1[2] = step2[2];
+  step1[3] = step2[3];
+
+  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
+  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
+  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
+  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+
+  step1[8] = WRAPLOW(step2[8] + step2[9]);
+  step1[9] = WRAPLOW(step2[8] - step2[9]);
+  step1[10] = WRAPLOW(-step2[10] + step2[11]);
+  step1[11] = WRAPLOW(step2[10] + step2[11]);
+  step1[12] = WRAPLOW(step2[12] + step2[13]);
+  step1[13] = WRAPLOW(step2[12] - step2[13]);
+  step1[14] = WRAPLOW(-step2[14] + step2[15]);
+  step1[15] = WRAPLOW(step2[14] + step2[15]);
+
+  // stage 4
+  temp1 = (step1[0] + step1[1]) * cospi_16_64;
+  temp2 = (step1[0] - step1[1]) * cospi_16_64;
+  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
+  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
+  step2[4] = WRAPLOW(step1[4] + step1[5]);
+  step2[5] = WRAPLOW(step1[4] - step1[5]);
+  step2[6] = WRAPLOW(-step1[6] + step1[7]);
+  step2[7] = WRAPLOW(step1[6] + step1[7]);
+
+  step2[8] = step1[8];
+  step2[15] = step1[15];
+  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
+  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
+  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
+  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+
+  // stage 5
+  step1[0] = WRAPLOW(step2[0] + step2[3]);
+  step1[1] = WRAPLOW(step2[1] + step2[2]);
+  step1[2] = WRAPLOW(step2[1] - step2[2]);
+  step1[3] = WRAPLOW(step2[0] - step2[3]);
+  step1[4] = step2[4];
+  temp1 = (step2[6] - step2[5]) * cospi_16_64;
+  temp2 = (step2[5] + step2[6]) * cospi_16_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+  step1[7] = step2[7];
+
+  step1[8] = WRAPLOW(step2[8] + step2[11]);
+  step1[9] = WRAPLOW(step2[9] + step2[10]);
+  step1[10] = WRAPLOW(step2[9] - step2[10]);
+  step1[11] = WRAPLOW(step2[8] - step2[11]);
+  step1[12] = WRAPLOW(-step2[12] + step2[15]);
+  step1[13] = WRAPLOW(-step2[13] + step2[14]);
+  step1[14] = WRAPLOW(step2[13] + step2[14]);
+  step1[15] = WRAPLOW(step2[12] + step2[15]);
+
+  // stage 6
+  step2[0] = WRAPLOW(step1[0] + step1[7]);
+  step2[1] = WRAPLOW(step1[1] + step1[6]);
+  step2[2] = WRAPLOW(step1[2] + step1[5]);
+  step2[3] = WRAPLOW(step1[3] + step1[4]);
+  step2[4] = WRAPLOW(step1[3] - step1[4]);
+  step2[5] = WRAPLOW(step1[2] - step1[5]);
+  step2[6] = WRAPLOW(step1[1] - step1[6]);
+  step2[7] = WRAPLOW(step1[0] - step1[7]);
+  step2[8] = step1[8];
+  step2[9] = step1[9];
+  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
+  temp2 = (step1[10] + step1[13]) * cospi_16_64;
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
+  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
+  temp2 = (step1[11] + step1[12]) * cospi_16_64;
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
+  step2[14] = step1[14];
+  step2[15] = step1[15];
+
+  // stage 7
+  output[0] = WRAPLOW(step2[0] + step2[15]);
+  output[1] = WRAPLOW(step2[1] + step2[14]);
+  output[2] = WRAPLOW(step2[2] + step2[13]);
+  output[3] = WRAPLOW(step2[3] + step2[12]);
+  output[4] = WRAPLOW(step2[4] + step2[11]);
+  output[5] = WRAPLOW(step2[5] + step2[10]);
+  output[6] = WRAPLOW(step2[6] + step2[9]);
+  output[7] = WRAPLOW(step2[7] + step2[8]);
+  output[8] = WRAPLOW(step2[7] - step2[8]);
+  output[9] = WRAPLOW(step2[6] - step2[9]);
+  output[10] = WRAPLOW(step2[5] - step2[10]);
+  output[11] = WRAPLOW(step2[4] - step2[11]);
+  output[12] = WRAPLOW(step2[3] - step2[12]);
+  output[13] = WRAPLOW(step2[2] - step2[13]);
+  output[14] = WRAPLOW(step2[1] - step2[14]);
+  output[15] = WRAPLOW(step2[0] - step2[15]);
+}
+
+void vp9_high_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
+                                  int stride, int bd) {
+  tran_low_t out[16 * 16];
+  tran_low_t *outptr = out;
+  int i, j;
+  tran_low_t temp_in[16], temp_out[16];
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  // First transform rows.
+  for (i = 0; i < 16; ++i) {
+    high_idct16(input, outptr, bd);
+    input += 16;
+    outptr += 16;
+  }
+
+  // Then transform columns.
+  for (i = 0; i < 16; ++i) {
+    for (j = 0; j < 16; ++j)
+      temp_in[j] = out[j * 16 + i];
+    high_idct16(temp_in, temp_out, bd);
+    for (j = 0; j < 16; ++j)
+      dest[j * stride + i] = clip_pixel_bd_high(
+          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+  }
+}
+
+static void high_iadst16(const tran_low_t *input, tran_low_t *output, int bd) {
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
+  tran_high_t s9, s10, s11, s12, s13, s14, s15;
+
+  tran_high_t x0 = input[15];
+  tran_high_t x1 = input[0];
+  tran_high_t x2 = input[13];
+  tran_high_t x3 = input[2];
+  tran_high_t x4 = input[11];
+  tran_high_t x5 = input[4];
+  tran_high_t x6 = input[9];
+  tran_high_t x7 = input[6];
+  tran_high_t x8 = input[7];
+  tran_high_t x9 = input[8];
+  tran_high_t x10 = input[5];
+  tran_high_t x11 = input[10];
+  tran_high_t x12 = input[3];
+  tran_high_t x13 = input[12];
+  tran_high_t x14 = input[1];
+  tran_high_t x15 = input[14];
+  (void) bd;
+
+  if (!(x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8
+           | x9 | x10 | x11 | x12 | x13 | x14 | x15)) {
+    vpx_memset(output, 0, 16 * sizeof(*output));
+    return;
+  }
+
+  // stage 1
+  s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
+  s1 = x0 * cospi_31_64 - x1 * cospi_1_64;
+  s2 = x2 * cospi_5_64  + x3 * cospi_27_64;
+  s3 = x2 * cospi_27_64 - x3 * cospi_5_64;
+  s4 = x4 * cospi_9_64  + x5 * cospi_23_64;
+  s5 = x4 * cospi_23_64 - x5 * cospi_9_64;
+  s6 = x6 * cospi_13_64 + x7 * cospi_19_64;
+  s7 = x6 * cospi_19_64 - x7 * cospi_13_64;
+  s8 = x8 * cospi_17_64 + x9 * cospi_15_64;
+  s9 = x8 * cospi_15_64 - x9 * cospi_17_64;
+  s10 = x10 * cospi_21_64 + x11 * cospi_11_64;
+  s11 = x10 * cospi_11_64 - x11 * cospi_21_64;
+  s12 = x12 * cospi_25_64 + x13 * cospi_7_64;
+  s13 = x12 * cospi_7_64  - x13 * cospi_25_64;
+  s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
+  s15 = x14 * cospi_3_64  - x15 * cospi_29_64;
+
+  x0 = WRAPLOW(dct_const_round_shift(s0 + s8));
+  x1 = WRAPLOW(dct_const_round_shift(s1 + s9));
+  x2 = WRAPLOW(dct_const_round_shift(s2 + s10));
+  x3 = WRAPLOW(dct_const_round_shift(s3 + s11));
+  x4 = WRAPLOW(dct_const_round_shift(s4 + s12));
+  x5 = WRAPLOW(dct_const_round_shift(s5 + s13));
+  x6 = WRAPLOW(dct_const_round_shift(s6 + s14));
+  x7 = WRAPLOW(dct_const_round_shift(s7 + s15));
+  x8  = WRAPLOW(dct_const_round_shift(s0 - s8));
+  x9  = WRAPLOW(dct_const_round_shift(s1 - s9));
+  x10 = WRAPLOW(dct_const_round_shift(s2 - s10));
+  x11 = WRAPLOW(dct_const_round_shift(s3 - s11));
+  x12 = WRAPLOW(dct_const_round_shift(s4 - s12));
+  x13 = WRAPLOW(dct_const_round_shift(s5 - s13));
+  x14 = WRAPLOW(dct_const_round_shift(s6 - s14));
+  x15 = WRAPLOW(dct_const_round_shift(s7 - s15));
+
+  // stage 2
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = x4;
+  s5 = x5;
+  s6 = x6;
+  s7 = x7;
+  s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
+  s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
+  s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
+  s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
+  s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
+  s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
+  s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
+  s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
+
+  x0 = WRAPLOW(s0 + s4);
+  x1 = WRAPLOW(s1 + s5);
+  x2 = WRAPLOW(s2 + s6);
+  x3 = WRAPLOW(s3 + s7);
+  x4 = WRAPLOW(s0 - s4);
+  x5 = WRAPLOW(s1 - s5);
+  x6 = WRAPLOW(s2 - s6);
+  x7 = WRAPLOW(s3 - s7);
+  x8 = WRAPLOW(dct_const_round_shift(s8 + s12));
+  x9 = WRAPLOW(dct_const_round_shift(s9 + s13));
+  x10 = WRAPLOW(dct_const_round_shift(s10 + s14));
+  x11 = WRAPLOW(dct_const_round_shift(s11 + s15));
+  x12 = WRAPLOW(dct_const_round_shift(s8 - s12));
+  x13 = WRAPLOW(dct_const_round_shift(s9 - s13));
+  x14 = WRAPLOW(dct_const_round_shift(s10 - s14));
+  x15 = WRAPLOW(dct_const_round_shift(s11 - s15));
+
+  // stage 3
+  s0 = x0;
+  s1 = x1;
+  s2 = x2;
+  s3 = x3;
+  s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
+  s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
+  s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
+  s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
+  s8 = x8;
+  s9 = x9;
+  s10 = x10;
+  s11 = x11;
+  s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
+  s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
+  s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
+  s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
+
+  x0 = WRAPLOW(s0 + s2);
+  x1 = WRAPLOW(s1 + s3);
+  x2 = WRAPLOW(s0 - s2);
+  x3 = WRAPLOW(s1 - s3);
+  x4 = WRAPLOW(dct_const_round_shift(s4 + s6));
+  x5 = WRAPLOW(dct_const_round_shift(s5 + s7));
+  x6 = WRAPLOW(dct_const_round_shift(s4 - s6));
+  x7 = WRAPLOW(dct_const_round_shift(s5 - s7));
+  x8 = WRAPLOW(s8 + s10);
+  x9 = WRAPLOW(s9 + s11);
+  x10 = WRAPLOW(s8 - s10);
+  x11 = WRAPLOW(s9 - s11);
+  x12 = WRAPLOW(dct_const_round_shift(s12 + s14));
+  x13 = WRAPLOW(dct_const_round_shift(s13 + s15));
+  x14 = WRAPLOW(dct_const_round_shift(s12 - s14));
+  x15 = WRAPLOW(dct_const_round_shift(s13 - s15));
+
+  // stage 4
+  s2 = (- cospi_16_64) * (x2 + x3);
+  s3 = cospi_16_64 * (x2 - x3);
+  s6 = cospi_16_64 * (x6 + x7);
+  s7 = cospi_16_64 * (-x6 + x7);
+  s10 = cospi_16_64 * (x10 + x11);
+  s11 = cospi_16_64 * (-x10 + x11);
+  s14 = (- cospi_16_64) * (x14 + x15);
+  s15 = cospi_16_64 * (x14 - x15);
+
+  x2 = WRAPLOW(dct_const_round_shift(s2));
+  x3 = WRAPLOW(dct_const_round_shift(s3));
+  x6 = WRAPLOW(dct_const_round_shift(s6));
+  x7 = WRAPLOW(dct_const_round_shift(s7));
+  x10 = WRAPLOW(dct_const_round_shift(s10));
+  x11 = WRAPLOW(dct_const_round_shift(s11));
+  x14 = WRAPLOW(dct_const_round_shift(s14));
+  x15 = WRAPLOW(dct_const_round_shift(s15));
+
+  output[0] = WRAPLOW(x0);
+  output[1] = WRAPLOW(-x8);
+  output[2] = WRAPLOW(x12);
+  output[3] = WRAPLOW(-x4);
+  output[4] = WRAPLOW(x6);
+  output[5] = WRAPLOW(x14);
+  output[6] = WRAPLOW(x10);
+  output[7] = WRAPLOW(x2);
+  output[8] = WRAPLOW(x3);
+  output[9] = WRAPLOW(x11);
+  output[10] = WRAPLOW(x15);
+  output[11] = WRAPLOW(x7);
+  output[12] = WRAPLOW(x5);
+  output[13] = WRAPLOW(-x13);
+  output[14] = WRAPLOW(x9);
+  output[15] = WRAPLOW(-x1);
+}
+
+static const high_transform_2d HIGH_IHT_16[] = {
+  { high_idct16,  high_idct16  },  // DCT_DCT  = 0
+  { high_iadst16, high_idct16  },  // ADST_DCT = 1
+  { high_idct16,  high_iadst16 },  // DCT_ADST = 2
+  { high_iadst16, high_iadst16 }   // ADST_ADST = 3
+};
+
+void vp9_high_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
+                                 int stride, int tx_type, int bd) {
+  int i, j;
+  tran_low_t out[16 * 16];
+  tran_low_t *outptr = out;
+  tran_low_t temp_in[16], temp_out[16];
+  const high_transform_2d ht = HIGH_IHT_16[tx_type];
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  // Rows
+  for (i = 0; i < 16; ++i) {
+    ht.rows(input, outptr, bd);
+    input += 16;
+    outptr += 16;
+  }
+
+  // Columns
+  for (i = 0; i < 16; ++i) {
+    for (j = 0; j < 16; ++j)
+      temp_in[j] = out[j * 16 + i];
+    ht.cols(temp_in, temp_out, bd);
+    for (j = 0; j < 16; ++j)
+      dest[j * stride + i] = clip_pixel_bd_high(
+          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+  }
+}
+
+void vp9_high_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest8,
+                                 int stride, int bd) {
+  tran_low_t out[16 * 16] = { 0 };
+  tran_low_t *outptr = out;
+  int i, j;
+  tran_low_t temp_in[16], temp_out[16];
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  // First transform rows. Since all non-zero dct coefficients are in
+  // upper-left 4x4 area, we only need to calculate first 4 rows here.
+  for (i = 0; i < 4; ++i) {
+    high_idct16(input, outptr, bd);
+    input += 16;
+    outptr += 16;
+  }
+
+  // Then transform columns.
+  for (i = 0; i < 16; ++i) {
+    for (j = 0; j < 16; ++j)
+      temp_in[j] = out[j*16 + i];
+    high_idct16(temp_in, temp_out, bd);
+    for (j = 0; j < 16; ++j)
+      dest[j * stride + i] = clip_pixel_bd_high(
+          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+  }
+}
+
+void vp9_high_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest8,
+                                int stride, int bd) {
+  int i, j;
+  tran_high_t a1;
+  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
+  a1 = ROUND_POWER_OF_TWO(out, 6);
+  for (j = 0; j < 16; ++j) {
+    for (i = 0; i < 16; ++i)
+      dest[i] = clip_pixel_bd_high(dest[i], a1, bd);
+    dest += stride;
+  }
+}
+
+static void high_idct32(const tran_low_t *input, tran_low_t *output, int bd) {
+  tran_low_t step1[32], step2[32];
+  tran_high_t temp1, temp2;
+  (void) bd;
+
+  // stage 1
+  step1[0] = input[0];
+  step1[1] = input[16];
+  step1[2] = input[8];
+  step1[3] = input[24];
+  step1[4] = input[4];
+  step1[5] = input[20];
+  step1[6] = input[12];
+  step1[7] = input[28];
+  step1[8] = input[2];
+  step1[9] = input[18];
+  step1[10] = input[10];
+  step1[11] = input[26];
+  step1[12] = input[6];
+  step1[13] = input[22];
+  step1[14] = input[14];
+  step1[15] = input[30];
+
+  temp1 = input[1] * cospi_31_64 - input[31] * cospi_1_64;
+  temp2 = input[1] * cospi_1_64 + input[31] * cospi_31_64;
+  step1[16] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[31] = WRAPLOW(dct_const_round_shift(temp2));
+
+  temp1 = input[17] * cospi_15_64 - input[15] * cospi_17_64;
+  temp2 = input[17] * cospi_17_64 + input[15] * cospi_15_64;
+  step1[17] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[30] = WRAPLOW(dct_const_round_shift(temp2));
+
+  temp1 = input[9] * cospi_23_64 - input[23] * cospi_9_64;
+  temp2 = input[9] * cospi_9_64 + input[23] * cospi_23_64;
+  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
+
+  temp1 = input[25] * cospi_7_64 - input[7] * cospi_25_64;
+  temp2 = input[25] * cospi_25_64 + input[7] * cospi_7_64;
+  step1[19] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[28] = WRAPLOW(dct_const_round_shift(temp2));
+
+  temp1 = input[5] * cospi_27_64 - input[27] * cospi_5_64;
+  temp2 = input[5] * cospi_5_64 + input[27] * cospi_27_64;
+  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
+
+  temp1 = input[21] * cospi_11_64 - input[11] * cospi_21_64;
+  temp2 = input[21] * cospi_21_64 + input[11] * cospi_11_64;
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
+
+  temp1 = input[13] * cospi_19_64 - input[19] * cospi_13_64;
+  temp2 = input[13] * cospi_13_64 + input[19] * cospi_19_64;
+  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
+
+  temp1 = input[29] * cospi_3_64 - input[3] * cospi_29_64;
+  temp2 = input[29] * cospi_29_64 + input[3] * cospi_3_64;
+  step1[23] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[24] = WRAPLOW(dct_const_round_shift(temp2));
+
+  // stage 2
+  step2[0] = step1[0];
+  step2[1] = step1[1];
+  step2[2] = step1[2];
+  step2[3] = step1[3];
+  step2[4] = step1[4];
+  step2[5] = step1[5];
+  step2[6] = step1[6];
+  step2[7] = step1[7];
+
+  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
+  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+  step2[8] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[15] = WRAPLOW(dct_const_round_shift(temp2));
+
+  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
+  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
+
+  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
+  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
+
+  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
+  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
+
+  step2[16] = WRAPLOW(step1[16] + step1[17]);
+  step2[17] = WRAPLOW(step1[16] - step1[17]);
+  step2[18] = WRAPLOW(-step1[18] + step1[19]);
+  step2[19] = WRAPLOW(step1[18] + step1[19]);
+  step2[20] = WRAPLOW(step1[20] + step1[21]);
+  step2[21] = WRAPLOW(step1[20] - step1[21]);
+  step2[22] = WRAPLOW(-step1[22] + step1[23]);
+  step2[23] = WRAPLOW(step1[22] + step1[23]);
+  step2[24] = WRAPLOW(step1[24] + step1[25]);
+  step2[25] = WRAPLOW(step1[24] - step1[25]);
+  step2[26] = WRAPLOW(-step1[26] + step1[27]);
+  step2[27] = WRAPLOW(step1[26] + step1[27]);
+  step2[28] = WRAPLOW(step1[28] + step1[29]);
+  step2[29] = WRAPLOW(step1[28] - step1[29]);
+  step2[30] = WRAPLOW(-step1[30] + step1[31]);
+  step2[31] = WRAPLOW(step1[30] + step1[31]);
+
+  // stage 3
+  step1[0] = step2[0];
+  step1[1] = step2[1];
+  step1[2] = step2[2];
+  step1[3] = step2[3];
+
+  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
+  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+  step1[4] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[7] = WRAPLOW(dct_const_round_shift(temp2));
+  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
+  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+
+  step1[8] = WRAPLOW(step2[8] + step2[9]);
+  step1[9] = WRAPLOW(step2[8] - step2[9]);
+  step1[10] = WRAPLOW(-step2[10] + step2[11]);
+  step1[11] = WRAPLOW(step2[10] + step2[11]);
+  step1[12] = WRAPLOW(step2[12] + step2[13]);
+  step1[13] = WRAPLOW(step2[12] - step2[13]);
+  step1[14] = WRAPLOW(-step2[14] + step2[15]);
+  step1[15] = WRAPLOW(step2[14] + step2[15]);
+
+  step1[16] = step2[16];
+  step1[31] = step2[31];
+  temp1 = -step2[17] * cospi_4_64 + step2[30] * cospi_28_64;
+  temp2 = step2[17] * cospi_28_64 + step2[30] * cospi_4_64;
+  step1[17] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[30] = WRAPLOW(dct_const_round_shift(temp2));
+  temp1 = -step2[18] * cospi_28_64 - step2[29] * cospi_4_64;
+  temp2 = -step2[18] * cospi_4_64 + step2[29] * cospi_28_64;
+  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
+  step1[19] = step2[19];
+  step1[20] = step2[20];
+  temp1 = -step2[21] * cospi_20_64 + step2[26] * cospi_12_64;
+  temp2 = step2[21] * cospi_12_64 + step2[26] * cospi_20_64;
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
+  temp1 = -step2[22] * cospi_12_64 - step2[25] * cospi_20_64;
+  temp2 = -step2[22] * cospi_20_64 + step2[25] * cospi_12_64;
+  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
+  step1[23] = step2[23];
+  step1[24] = step2[24];
+  step1[27] = step2[27];
+  step1[28] = step2[28];
+
+  // stage 4
+  temp1 = (step1[0] + step1[1]) * cospi_16_64;
+  temp2 = (step1[0] - step1[1]) * cospi_16_64;
+  step2[0] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[1] = WRAPLOW(dct_const_round_shift(temp2));
+  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+  step2[2] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[3] = WRAPLOW(dct_const_round_shift(temp2));
+  step2[4] = WRAPLOW(step1[4] + step1[5]);
+  step2[5] = WRAPLOW(step1[4] - step1[5]);
+  step2[6] = WRAPLOW(-step1[6] + step1[7]);
+  step2[7] = WRAPLOW(step1[6] + step1[7]);
+
+  step2[8] = step1[8];
+  step2[15] = step1[15];
+  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
+  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2));
+  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
+  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+
+  step2[16] = WRAPLOW(step1[16] + step1[19]);
+  step2[17] = WRAPLOW(step1[17] + step1[18]);
+  step2[18] = WRAPLOW(step1[17] - step1[18]);
+  step2[19] = WRAPLOW(step1[16] - step1[19]);
+  step2[20] = WRAPLOW(-step1[20] + step1[23]);
+  step2[21] = WRAPLOW(-step1[21] + step1[22]);
+  step2[22] = WRAPLOW(step1[21] + step1[22]);
+  step2[23] = WRAPLOW(step1[20] + step1[23]);
+
+  step2[24] = WRAPLOW(step1[24] + step1[27]);
+  step2[25] = WRAPLOW(step1[25] + step1[26]);
+  step2[26] = WRAPLOW(step1[25] - step1[26]);
+  step2[27] = WRAPLOW(step1[24] - step1[27]);
+  step2[28] = WRAPLOW(-step1[28] + step1[31]);
+  step2[29] = WRAPLOW(-step1[29] + step1[30]);
+  step2[30] = WRAPLOW(step1[29] + step1[30]);
+  step2[31] = WRAPLOW(step1[28] + step1[31]);
+
+  // stage 5
+  step1[0] = WRAPLOW(step2[0] + step2[3]);
+  step1[1] = WRAPLOW(step2[1] + step2[2]);
+  step1[2] = WRAPLOW(step2[1] - step2[2]);
+  step1[3] = WRAPLOW(step2[0] - step2[3]);
+  step1[4] = step2[4];
+  temp1 = (step2[6] - step2[5]) * cospi_16_64;
+  temp2 = (step2[5] + step2[6]) * cospi_16_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2));
+  step1[7] = step2[7];
+
+  step1[8] = WRAPLOW(step2[8] + step2[11]);
+  step1[9] = WRAPLOW(step2[9] + step2[10]);
+  step1[10] = WRAPLOW(step2[9] - step2[10]);
+  step1[11] = WRAPLOW(step2[8] - step2[11]);
+  step1[12] = WRAPLOW(-step2[12] + step2[15]);
+  step1[13] = WRAPLOW(-step2[13] + step2[14]);
+  step1[14] = WRAPLOW(step2[13] + step2[14]);
+  step1[15] = WRAPLOW(step2[12] + step2[15]);
+
+  step1[16] = step2[16];
+  step1[17] = step2[17];
+  temp1 = -step2[18] * cospi_8_64 + step2[29] * cospi_24_64;
+  temp2 = step2[18] * cospi_24_64 + step2[29] * cospi_8_64;
+  step1[18] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[29] = WRAPLOW(dct_const_round_shift(temp2));
+  temp1 = -step2[19] * cospi_8_64 + step2[28] * cospi_24_64;
+  temp2 = step2[19] * cospi_24_64 + step2[28] * cospi_8_64;
+  step1[19] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[28] = WRAPLOW(dct_const_round_shift(temp2));
+  temp1 = -step2[20] * cospi_24_64 - step2[27] * cospi_8_64;
+  temp2 = -step2[20] * cospi_8_64 + step2[27] * cospi_24_64;
+  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
+  temp1 = -step2[21] * cospi_24_64 - step2[26] * cospi_8_64;
+  temp2 = -step2[21] * cospi_8_64 + step2[26] * cospi_24_64;
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
+  step1[22] = step2[22];
+  step1[23] = step2[23];
+  step1[24] = step2[24];
+  step1[25] = step2[25];
+  step1[30] = step2[30];
+  step1[31] = step2[31];
+
+  // stage 6
+  step2[0] = WRAPLOW(step1[0] + step1[7]);
+  step2[1] = WRAPLOW(step1[1] + step1[6]);
+  step2[2] = WRAPLOW(step1[2] + step1[5]);
+  step2[3] = WRAPLOW(step1[3] + step1[4]);
+  step2[4] = WRAPLOW(step1[3] - step1[4]);
+  step2[5] = WRAPLOW(step1[2] - step1[5]);
+  step2[6] = WRAPLOW(step1[1] - step1[6]);
+  step2[7] = WRAPLOW(step1[0] - step1[7]);
+  step2[8] = step1[8];
+  step2[9] = step1[9];
+  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
+  temp2 = (step1[10] + step1[13]) * cospi_16_64;
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2));
+  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
+  temp2 = (step1[11] + step1[12]) * cospi_16_64;
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1));
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2));
+  step2[14] = WRAPLOW(step1[14]);
+  step2[15] = WRAPLOW(step1[15]);
+
+  step2[16] = WRAPLOW(step1[16] + step1[23]);
+  step2[17] = WRAPLOW(step1[17] + step1[22]);
+  step2[18] = WRAPLOW(step1[18] + step1[21]);
+  step2[19] = WRAPLOW(step1[19] + step1[20]);
+  step2[20] = WRAPLOW(step1[19] - step1[20]);
+  step2[21] = WRAPLOW(step1[18] - step1[21]);
+  step2[22] = WRAPLOW(step1[17] - step1[22]);
+  step2[23] = WRAPLOW(step1[16] - step1[23]);
+
+  step2[24] = WRAPLOW(-step1[24] + step1[31]);
+  step2[25] = WRAPLOW(-step1[25] + step1[30]);
+  step2[26] = WRAPLOW(-step1[26] + step1[29]);
+  step2[27] = WRAPLOW(-step1[27] + step1[28]);
+  step2[28] = WRAPLOW(step1[27] + step1[28]);
+  step2[29] = WRAPLOW(step1[26] + step1[29]);
+  step2[30] = WRAPLOW(step1[25] + step1[30]);
+  step2[31] = WRAPLOW(step1[24] + step1[31]);
+
+  // stage 7
+  step1[0] = WRAPLOW(step2[0] + step2[15]);
+  step1[1] = WRAPLOW(step2[1] + step2[14]);
+  step1[2] = WRAPLOW(step2[2] + step2[13]);
+  step1[3] = WRAPLOW(step2[3] + step2[12]);
+  step1[4] = WRAPLOW(step2[4] + step2[11]);
+  step1[5] = WRAPLOW(step2[5] + step2[10]);
+  step1[6] = WRAPLOW(step2[6] + step2[9]);
+  step1[7] = WRAPLOW(step2[7] + step2[8]);
+  step1[8] = WRAPLOW(step2[7] - step2[8]);
+  step1[9] = WRAPLOW(step2[6] - step2[9]);
+  step1[10] = WRAPLOW(step2[5] - step2[10]);
+  step1[11] = WRAPLOW(step2[4] - step2[11]);
+  step1[12] = WRAPLOW(step2[3] - step2[12]);
+  step1[13] = WRAPLOW(step2[2] - step2[13]);
+  step1[14] = WRAPLOW(step2[1] - step2[14]);
+  step1[15] = WRAPLOW(step2[0] - step2[15]);
+
+  step1[16] = step2[16];
+  step1[17] = step2[17];
+  step1[18] = step2[18];
+  step1[19] = step2[19];
+  temp1 = (-step2[20] + step2[27]) * cospi_16_64;
+  temp2 = (step2[20] + step2[27]) * cospi_16_64;
+  step1[20] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[27] = WRAPLOW(dct_const_round_shift(temp2));
+  temp1 = (-step2[21] + step2[26]) * cospi_16_64;
+  temp2 = (step2[21] + step2[26]) * cospi_16_64;
+  step1[21] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[26] = WRAPLOW(dct_const_round_shift(temp2));
+  temp1 = (-step2[22] + step2[25]) * cospi_16_64;
+  temp2 = (step2[22] + step2[25]) * cospi_16_64;
+  step1[22] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[25] = WRAPLOW(dct_const_round_shift(temp2));
+  temp1 = (-step2[23] + step2[24]) * cospi_16_64;
+  temp2 = (step2[23] + step2[24]) * cospi_16_64;
+  step1[23] = WRAPLOW(dct_const_round_shift(temp1));
+  step1[24] = WRAPLOW(dct_const_round_shift(temp2));
+  step1[28] = step2[28];
+  step1[29] = step2[29];
+  step1[30] = step2[30];
+  step1[31] = step2[31];
+
+  // final stage
+  output[0] = WRAPLOW(step1[0] + step1[31]);
+  output[1] = WRAPLOW(step1[1] + step1[30]);
+  output[2] = WRAPLOW(step1[2] + step1[29]);
+  output[3] = WRAPLOW(step1[3] + step1[28]);
+  output[4] = WRAPLOW(step1[4] + step1[27]);
+  output[5] = WRAPLOW(step1[5] + step1[26]);
+  output[6] = WRAPLOW(step1[6] + step1[25]);
+  output[7] = WRAPLOW(step1[7] + step1[24]);
+  output[8] = WRAPLOW(step1[8] + step1[23]);
+  output[9] = WRAPLOW(step1[9] + step1[22]);
+  output[10] = WRAPLOW(step1[10] + step1[21]);
+  output[11] = WRAPLOW(step1[11] + step1[20]);
+  output[12] = WRAPLOW(step1[12] + step1[19]);
+  output[13] = WRAPLOW(step1[13] + step1[18]);
+  output[14] = WRAPLOW(step1[14] + step1[17]);
+  output[15] = WRAPLOW(step1[15] + step1[16]);
+  output[16] = WRAPLOW(step1[15] - step1[16]);
+  output[17] = WRAPLOW(step1[14] - step1[17]);
+  output[18] = WRAPLOW(step1[13] - step1[18]);
+  output[19] = WRAPLOW(step1[12] - step1[19]);
+  output[20] = WRAPLOW(step1[11] - step1[20]);
+  output[21] = WRAPLOW(step1[10] - step1[21]);
+  output[22] = WRAPLOW(step1[9] - step1[22]);
+  output[23] = WRAPLOW(step1[8] - step1[23]);
+  output[24] = WRAPLOW(step1[7] - step1[24]);
+  output[25] = WRAPLOW(step1[6] - step1[25]);
+  output[26] = WRAPLOW(step1[5] - step1[26]);
+  output[27] = WRAPLOW(step1[4] - step1[27]);
+  output[28] = WRAPLOW(step1[3] - step1[28]);
+  output[29] = WRAPLOW(step1[2] - step1[29]);
+  output[30] = WRAPLOW(step1[1] - step1[30]);
+  output[31] = WRAPLOW(step1[0] - step1[31]);
+}
+
+void vp9_high_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
+                                   int stride, int bd) {
+  tran_low_t out[32 * 32];
+  tran_low_t *outptr = out;
+  int i, j;
+  tran_low_t temp_in[32], temp_out[32];
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  // Rows
+  for (i = 0; i < 32; ++i) {
+    tran_low_t zero_coeff[16];
+    for (j = 0; j < 16; ++j)
+      zero_coeff[j] = input[2 * j] | input[2 * j + 1];
+    for (j = 0; j < 8; ++j)
+      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+    for (j = 0; j < 4; ++j)
+      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+    for (j = 0; j < 2; ++j)
+      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
+
+    if (zero_coeff[0] | zero_coeff[1])
+      high_idct32(input, outptr, bd);
+    else
+      vpx_memset(outptr, 0, sizeof(tran_low_t) * 32);
+    input += 32;
+    outptr += 32;
+  }
+
+  // Columns
+  for (i = 0; i < 32; ++i) {
+    for (j = 0; j < 32; ++j)
+      temp_in[j] = out[j * 32 + i];
+    high_idct32(temp_in, temp_out, bd);
+    for (j = 0; j < 32; ++j)
+      dest[j * stride + i] = clip_pixel_bd_high(
+          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+  }
+}
+
+void vp9_high_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest8,
+                                 int stride, int bd) {
+  tran_low_t out[32 * 32] = {0};
+  tran_low_t *outptr = out;
+  int i, j;
+  tran_low_t temp_in[32], temp_out[32];
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  // Rows
+  // Only upper-left 8x8 has non-zero coeff.
+  for (i = 0; i < 8; ++i) {
+    high_idct32(input, outptr, bd);
+    input += 32;
+    outptr += 32;
+  }
+  // Columns
+  for (i = 0; i < 32; ++i) {
+    for (j = 0; j < 32; ++j)
+      temp_in[j] = out[j * 32 + i];
+    high_idct32(temp_in, temp_out, bd);
+    for (j = 0; j < 32; ++j)
+      dest[j * stride + i] = clip_pixel_bd_high(
+          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+  }
+}
+
+void vp9_high_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest8,
+                                int stride, int bd) {
+  int i, j;
+  int a1;
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
+  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
+  a1 = ROUND_POWER_OF_TWO(out, 6);
+
+  for (j = 0; j < 32; ++j) {
+    for (i = 0; i < 32; ++i)
+      dest[i] = clip_pixel_bd_high(dest[i], a1, bd);
+    dest += stride;
+  }
+}
+
+// idct
+void vp9_high_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                          int eob, int bd) {
+  if (eob > 1)
+    vp9_high_idct4x4_16_add(input, dest, stride, bd);
+  else
+    vp9_high_idct4x4_1_add(input, dest, stride, bd);
+}
+
+
+void vp9_high_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                          int eob, int bd) {
+  if (eob > 1)
+    vp9_high_iwht4x4_16_add(input, dest, stride, bd);
+  else
+    vp9_high_iwht4x4_1_add(input, dest, stride, bd);
+}
+
+void vp9_high_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
+                          int eob, int bd) {
+  // If dc is 1, then input[0] is the reconstructed value, do not need
+  // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.
+
+  // The calculation can be simplified if there are not many non-zero dct
+  // coefficients. Use eobs to decide what to do.
+  // TODO(yunqingwang): "eobs = 1" case is also handled in vp9_short_idct8x8_c.
+  // Combine that with code here.
+  // DC only DCT coefficient
+  if (eob == 1) {
+    vp9_high_idct8x8_1_add(input, dest, stride, bd);
+  } else if (eob <= 10) {
+    vp9_high_idct8x8_10_add(input, dest, stride, bd);
+  } else {
+    vp9_high_idct8x8_64_add(input, dest, stride, bd);
+  }
+}
+
+void vp9_high_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride,
+                       int eob, int bd) {
+  // The calculation can be simplified if there are not many non-zero dct
+  // coefficients. Use eobs to separate different cases.
+  // DC only DCT coefficient.
+  if (eob == 1) {
+    vp9_high_idct16x16_1_add(input, dest, stride, bd);
+  } else if (eob <= 10) {
+    vp9_high_idct16x16_10_add(input, dest, stride, bd);
+  } else {
+    vp9_high_idct16x16_256_add(input, dest, stride, bd);
+  }
+}
+
+void vp9_high_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
+                       int eob, int bd) {
+  // Non-zero coeff only in upper-left 8x8
+  if (eob == 1) {
+    vp9_high_idct32x32_1_add(input, dest, stride, bd);
+  } else if (eob <= 34) {
+    vp9_high_idct32x32_34_add(input, dest, stride, bd);
+  } else {
+    vp9_high_idct32x32_1024_add(input, dest, stride, bd);
+  }
+}
+
+// iht
+void vp9_high_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input,
+                         uint8_t *dest, int stride, int eob, int bd) {
+  if (tx_type == DCT_DCT)
+    vp9_high_idct4x4_add(input, dest, stride, eob, bd);
+  else
+    vp9_high_iht4x4_16_add(input, dest, stride, tx_type, bd);
+}
+
+void vp9_high_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,
+                         uint8_t *dest, int stride, int eob, int bd) {
+  if (tx_type == DCT_DCT) {
+    vp9_high_idct8x8_add(input, dest, stride, eob, bd);
+  } else {
+    vp9_high_iht8x8_64_add(input, dest, stride, tx_type, bd);
+  }
+}
+
+void vp9_high_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,
+                           uint8_t *dest, int stride, int eob, int bd) {
+  if (tx_type == DCT_DCT) {
+    vp9_high_idct16x16_add(input, dest, stride, eob, bd);
+  } else {
+    vp9_high_iht16x16_256_add(input, dest, stride, tx_type, bd);
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp9/common/vp9_idct.h b/vp9/common/vp9_idct.h

index 7f595e1..694be3c 100644 (file)
--- a/vp9/common/vp9_idct.h
+++ b/vp9/common/vp9_idct.h
@@ -36,52 +36,69 @@ extern "C" {
  #define dual_set_epi16(a, b) \
    _mm_set_epi16(b, b, b, b, a, a, a, a)
  
+// Note:
+// tran_low_t  is the datatype used for final transform coefficients.
+// tran_high_t is the datatype used for intermediate transform stages.
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef int64_t tran_high_t;
+typedef int32_t tran_low_t;
+#else
+typedef int32_t tran_high_t;
+typedef int16_t tran_low_t;
+#endif
+
  // Constants:
  //  for (int i = 1; i< 32; ++i)
  //    printf("static const int cospi_%d_64 = %.0f;\n", i,
  //           round(16384 * cos(i*M_PI/64)));
  // Note: sin(k*Pi/64) = cos((32-k)*Pi/64)
-static const int cospi_1_64  = 16364;
-static const int cospi_2_64  = 16305;
-static const int cospi_3_64  = 16207;
-static const int cospi_4_64  = 16069;
-static const int cospi_5_64  = 15893;
-static const int cospi_6_64  = 15679;
-static const int cospi_7_64  = 15426;
-static const int cospi_8_64  = 15137;
-static const int cospi_9_64  = 14811;
-static const int cospi_10_64 = 14449;
-static const int cospi_11_64 = 14053;
-static const int cospi_12_64 = 13623;
-static const int cospi_13_64 = 13160;
-static const int cospi_14_64 = 12665;
-static const int cospi_15_64 = 12140;
-static const int cospi_16_64 = 11585;
-static const int cospi_17_64 = 11003;
-static const int cospi_18_64 = 10394;
-static const int cospi_19_64 = 9760;
-static const int cospi_20_64 = 9102;
-static const int cospi_21_64 = 8423;
-static const int cospi_22_64 = 7723;
-static const int cospi_23_64 = 7005;
-static const int cospi_24_64 = 6270;
-static const int cospi_25_64 = 5520;
-static const int cospi_26_64 = 4756;
-static const int cospi_27_64 = 3981;
-static const int cospi_28_64 = 3196;
-static const int cospi_29_64 = 2404;
-static const int cospi_30_64 = 1606;
-static const int cospi_31_64 = 804;
+static const tran_high_t cospi_1_64  = 16364;
+static const tran_high_t cospi_2_64  = 16305;
+static const tran_high_t cospi_3_64  = 16207;
+static const tran_high_t cospi_4_64  = 16069;
+static const tran_high_t cospi_5_64  = 15893;
+static const tran_high_t cospi_6_64  = 15679;
+static const tran_high_t cospi_7_64  = 15426;
+static const tran_high_t cospi_8_64  = 15137;
+static const tran_high_t cospi_9_64  = 14811;
+static const tran_high_t cospi_10_64 = 14449;
+static const tran_high_t cospi_11_64 = 14053;
+static const tran_high_t cospi_12_64 = 13623;
+static const tran_high_t cospi_13_64 = 13160;
+static const tran_high_t cospi_14_64 = 12665;
+static const tran_high_t cospi_15_64 = 12140;
+static const tran_high_t cospi_16_64 = 11585;
+static const tran_high_t cospi_17_64 = 11003;
+static const tran_high_t cospi_18_64 = 10394;
+static const tran_high_t cospi_19_64 = 9760;
+static const tran_high_t cospi_20_64 = 9102;
+static const tran_high_t cospi_21_64 = 8423;
+static const tran_high_t cospi_22_64 = 7723;
+static const tran_high_t cospi_23_64 = 7005;
+static const tran_high_t cospi_24_64 = 6270;
+static const tran_high_t cospi_25_64 = 5520;
+static const tran_high_t cospi_26_64 = 4756;
+static const tran_high_t cospi_27_64 = 3981;
+static const tran_high_t cospi_28_64 = 3196;
+static const tran_high_t cospi_29_64 = 2404;
+static const tran_high_t cospi_30_64 = 1606;
+static const tran_high_t cospi_31_64 = 804;
  
  //  16384 * sqrt(2) * sin(kPi/9) * 2 / 3
-static const int sinpi_1_9 = 5283;
-static const int sinpi_2_9 = 9929;
-static const int sinpi_3_9 = 13377;
-static const int sinpi_4_9 = 15212;
-
-static INLINE int dct_const_round_shift(int input) {
-  int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
-#if CONFIG_COEFFICIENT_RANGE_CHECKING
+static const tran_high_t sinpi_1_9 = 5283;
+static const tran_high_t sinpi_2_9 = 9929;
+static const tran_high_t sinpi_3_9 = 13377;
+static const tran_high_t sinpi_4_9 = 15212;
+
+static INLINE tran_low_t dct_const_round_shift(tran_high_t input) {
+  tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+#if CONFIG_VP9_HIGHBITDEPTH
+  // For valid highbitdepth VP9 streams, intermediate stage coefficients will
+  // stay within the ranges:
+  // - 8 bit: signed 16 bit integer
+  // - 10 bit: signed 18 bit integer
+  // - 12 bit: signed 20 bit integer
+#elif CONFIG_COEFFICIENT_RANGE_CHECKING
    // For valid VP9 input streams, intermediate stage coefficients should always
    // stay within the range of a signed 16 bit integer. Coefficients can go out
    // of this range for invalid/corrupt VP9 streams. However, strictly checking
@@ -91,32 +108,59 @@ static INLINE int dct_const_round_shift(int input) {
    assert(INT16_MIN <= rv);
    assert(rv <= INT16_MAX);
  #endif
-  return (int16_t)rv;
+  return (tran_low_t)rv;
  }
  
-typedef void (*transform_1d)(const int16_t*, int16_t*);
+typedef void (*transform_1d)(const tran_low_t*, tran_low_t*);
  
  typedef struct {
    transform_1d cols, rows;  // vertical and horizontal
  } transform_2d;
  
-void vp9_iwht4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob);
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef void (*high_transform_1d)(const tran_low_t*, tran_low_t*, int bd);
  
-void vp9_idct4x4_add(const int16_t *input, uint8_t *dest, int stride, int eob);
-void vp9_idct8x8_add(const int16_t *input, uint8_t *dest, int stride, int eob);
-void vp9_idct16x16_add(const int16_t *input, uint8_t *dest, int stride, int
+typedef struct {
+  high_transform_1d cols, rows;  // vertical and horizontal
+} high_transform_2d;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+void vp9_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                     int eob);
+void vp9_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                     int eob);
+void vp9_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
+                     int eob);
+void vp9_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride, int
                         eob);
-void vp9_idct32x32_add(const int16_t *input, uint8_t *dest, int stride,
+void vp9_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
                         int eob);
  
-void vp9_iht4x4_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+void vp9_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
                      int stride, int eob);
-void vp9_iht8x8_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+void vp9_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
                      int stride, int eob);
-void vp9_iht16x16_add(TX_TYPE tx_type, const int16_t *input, uint8_t *dest,
+void vp9_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input, uint8_t *dest,
                        int stride, int eob);
  
-
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_high_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                          int eob, int bd);
+void vp9_high_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
+                          int eob, int bd);
+void vp9_high_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
+                          int eob, int bd);
+void vp9_high_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride,
+                            int eob, int bd);
+void vp9_high_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
+                            int eob, int bd);
+void vp9_high_iht4x4_add(TX_TYPE tx_type, const tran_low_t *input,
+                         uint8_t *dest, int stride, int eob, int bd);
+void vp9_high_iht8x8_add(TX_TYPE tx_type, const tran_low_t *input,
+                         uint8_t *dest, int stride, int eob, int bd);
+void vp9_high_iht16x16_add(TX_TYPE tx_type, const tran_low_t *input,
+                           uint8_t *dest, int stride, int eob, int bd);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
  #ifdef __cplusplus
  }  // extern "C"
  #endif
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c

index 3b39d42..4d03c4d 100644 (file)
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -1193,7 +1193,7 @@ void vp9_filter_block_plane(VP9_COMMON *const cm,
    }
  }
  
-void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer,
+void vp9_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer,
                            VP9_COMMON *cm,
                            struct macroblockd_plane planes[MAX_MB_PLANE],
                            int start, int stop, int y_only) {
@@ -1247,9 +1247,8 @@ void vp9_loop_filter_frame(YV12_BUFFER_CONFIG *frame,
                         y_only);
  }
  
-int vp9_loop_filter_worker(void *arg1, void *arg2) {
-  LFWorkerData *const lf_data = (LFWorkerData*)arg1;
-  (void)arg2;
+int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused) {
+  (void)unused;
    vp9_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
                         lf_data->start, lf_data->stop, lf_data->y_only);
    return 1;
diff --git a/vp9/common/vp9_loopfilter.h b/vp9/common/vp9_loopfilter.h

index 6fa2773..69e7dd0 100644 (file)
--- a/vp9/common/vp9_loopfilter.h
+++ b/vp9/common/vp9_loopfilter.h
@@ -111,13 +111,13 @@ void vp9_loop_filter_frame(YV12_BUFFER_CONFIG *frame,
                             int y_only, int partial_frame);
  
  // Apply the loop filter to [start, stop) macro block rows in frame_buffer.
-void vp9_loop_filter_rows(const YV12_BUFFER_CONFIG *frame_buffer,
+void vp9_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer,
                            struct VP9Common *cm,
                            struct macroblockd_plane planes[MAX_MB_PLANE],
                            int start, int stop, int y_only);
  
  typedef struct LoopFilterWorkerData {
-  const YV12_BUFFER_CONFIG *frame_buffer;
+  YV12_BUFFER_CONFIG *frame_buffer;
    struct VP9Common *cm;
    struct macroblockd_plane planes[MAX_MB_PLANE];
  
@@ -129,8 +129,8 @@ typedef struct LoopFilterWorkerData {
    int num_lf_workers;
  } LFWorkerData;
  
-// Operates on the rows described by LFWorkerData passed as 'arg1'.
-int vp9_loop_filter_worker(void *arg1, void *arg2);
+// Operates on the rows described by 'lf_data'.
+int vp9_loop_filter_worker(LFWorkerData *const lf_data, void *unused);
  #ifdef __cplusplus
  }  // extern "C"
  #endif
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl

index 667e057..32bcf9a 100644 (file)
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -6,6 +6,7 @@ print <<EOF
  
  #include "vpx/vpx_integer.h"
  #include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_idct.h"
  
  struct macroblockd;
  
@@ -45,6 +46,13 @@ if ($opts{arch} eq "x86_64") {
    $avx_x86_64 = $avx2_x86_64 = '';
  }
  
+# optimizations which depend on multiple features
+if ((vpx_config("HAVE_AVX2") eq "yes") && (vpx_config("HAVE_SSSE3") eq "yes")) {
+  $avx2_ssse3 = 'avx2';
+} else {
+  $avx2_ssse3 = '';
+}
+
  #
  # RECON
  #
@@ -296,15 +304,15 @@ specialize qw/vp9_convolve_avg neon_asm dspr2/, "$sse2_x86inc";
  $vp9_convolve_avg_neon_asm=vp9_convolve_avg_neon;
  
  add_proto qw/void vp9_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8 sse2 ssse3 neon_asm dspr2 avx2/;
+specialize qw/vp9_convolve8 sse2 ssse3 neon_asm dspr2/, "$avx2_ssse3";
  $vp9_convolve8_neon_asm=vp9_convolve8_neon;
  
  add_proto qw/void vp9_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_horiz sse2 ssse3 neon_asm dspr2 avx2/;
+specialize qw/vp9_convolve8_horiz sse2 ssse3 neon_asm dspr2/, "$avx2_ssse3";
  $vp9_convolve8_horiz_neon_asm=vp9_convolve8_horiz_neon;
  
  add_proto qw/void vp9_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vp9_convolve8_vert sse2 ssse3 neon_asm dspr2 avx2/;
+specialize qw/vp9_convolve8_vert sse2 ssse3 neon_asm dspr2/, "$avx2_ssse3";
  $vp9_convolve8_vert_neon_asm=vp9_convolve8_vert_neon;
  
  add_proto qw/void vp9_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
@@ -322,68 +330,177 @@ $vp9_convolve8_avg_vert_neon_asm=vp9_convolve8_avg_vert_neon;
  #
  # dct
  #
-add_proto qw/void vp9_idct4x4_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct4x4_1_add sse2 neon_asm dspr2/;
-$vp9_idct4x4_1_add_neon_asm=vp9_idct4x4_1_add_neon;
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  add_proto qw/void vp9_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct4x4_1_add/;
+
+  add_proto qw/void vp9_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct4x4_16_add/;
+
+  add_proto qw/void vp9_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct8x8_1_add/;
+
+  add_proto qw/void vp9_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct8x8_64_add/;
+
+  add_proto qw/void vp9_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct8x8_12_add/;
+
+  add_proto qw/void vp9_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct16x16_1_add/;
+
+  add_proto qw/void vp9_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct16x16_256_add/;
+
+  add_proto qw/void vp9_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct16x16_10_add/;
+
+  add_proto qw/void vp9_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct32x32_1024_add/;
+
+  add_proto qw/void vp9_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct32x32_34_add/;
+
+  add_proto qw/void vp9_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct32x32_1_add/;
+
+  add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+  specialize qw/vp9_iht4x4_16_add/;
+
+  add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+  specialize qw/vp9_iht8x8_64_add/;
+
+  add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+  specialize qw/vp9_iht16x16_256_add/;
+
+  # dct and add
+
+  add_proto qw/void vp9_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_iwht4x4_1_add/;
  
-add_proto qw/void vp9_idct4x4_16_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct4x4_16_add sse2 neon_asm dspr2/;
-$vp9_idct4x4_16_add_neon_asm=vp9_idct4x4_16_add_neon;
+  add_proto qw/void vp9_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_iwht4x4_16_add/;
+} else {
+  add_proto qw/void vp9_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct4x4_1_add sse2 neon_asm dspr2/;
+  $vp9_idct4x4_1_add_neon_asm=vp9_idct4x4_1_add_neon;
+
+  add_proto qw/void vp9_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct4x4_16_add sse2 neon_asm dspr2/;
+  $vp9_idct4x4_16_add_neon_asm=vp9_idct4x4_16_add_neon;
+
+  add_proto qw/void vp9_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct8x8_1_add sse2 neon_asm dspr2/;
+  $vp9_idct8x8_1_add_neon_asm=vp9_idct8x8_1_add_neon;
+
+  add_proto qw/void vp9_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct8x8_64_add sse2 neon_asm dspr2/, "$ssse3_x86_64";
+  $vp9_idct8x8_64_add_neon_asm=vp9_idct8x8_64_add_neon;
+
+  add_proto qw/void vp9_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct8x8_12_add sse2 neon_asm dspr2/, "$ssse3_x86_64";
+  $vp9_idct8x8_12_add_neon_asm=vp9_idct8x8_12_add_neon;
+
+  add_proto qw/void vp9_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct16x16_1_add sse2 neon_asm dspr2/;
+  $vp9_idct16x16_1_add_neon_asm=vp9_idct16x16_1_add_neon;
+
+  add_proto qw/void vp9_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct16x16_256_add sse2 ssse3 neon_asm dspr2/;
+  $vp9_idct16x16_256_add_neon_asm=vp9_idct16x16_256_add_neon;
+
+  add_proto qw/void vp9_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct16x16_10_add sse2 ssse3 neon_asm dspr2/;
+  $vp9_idct16x16_10_add_neon_asm=vp9_idct16x16_10_add_neon;
+
+  add_proto qw/void vp9_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct32x32_1024_add sse2 neon_asm dspr2/;
+  $vp9_idct32x32_1024_add_neon_asm=vp9_idct32x32_1024_add_neon;
  
-add_proto qw/void vp9_idct8x8_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct8x8_1_add sse2 neon_asm dspr2/;
-$vp9_idct8x8_1_add_neon_asm=vp9_idct8x8_1_add_neon;
+  add_proto qw/void vp9_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct32x32_34_add sse2 neon_asm dspr2/;
+  $vp9_idct32x32_34_add_neon_asm=vp9_idct32x32_1024_add_neon;
  
-add_proto qw/void vp9_idct8x8_64_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct8x8_64_add sse2 neon_asm dspr2/, "$ssse3_x86_64";
-$vp9_idct8x8_64_add_neon_asm=vp9_idct8x8_64_add_neon;
+  add_proto qw/void vp9_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_idct32x32_1_add sse2 neon_asm dspr2/;
+  $vp9_idct32x32_1_add_neon_asm=vp9_idct32x32_1_add_neon;
  
-add_proto qw/void vp9_idct8x8_12_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct8x8_12_add sse2 neon_asm dspr2/, "$ssse3_x86_64";
-$vp9_idct8x8_12_add_neon_asm=vp9_idct8x8_12_add_neon;
+  add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+  specialize qw/vp9_iht4x4_16_add sse2 neon_asm dspr2/;
+  $vp9_iht4x4_16_add_neon_asm=vp9_iht4x4_16_add_neon;
  
-add_proto qw/void vp9_idct16x16_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct16x16_1_add sse2 neon_asm dspr2/;
-$vp9_idct16x16_1_add_neon_asm=vp9_idct16x16_1_add_neon;
+  add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
+  specialize qw/vp9_iht8x8_64_add sse2 neon_asm dspr2/;
+  $vp9_iht8x8_64_add_neon_asm=vp9_iht8x8_64_add_neon;
  
-add_proto qw/void vp9_idct16x16_256_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct16x16_256_add sse2 ssse3 neon_asm dspr2/;
-$vp9_idct16x16_256_add_neon_asm=vp9_idct16x16_256_add_neon;
+  add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
+  specialize qw/vp9_iht16x16_256_add sse2 dspr2/;
+
+  # dct and add
+
+  add_proto qw/void vp9_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_iwht4x4_1_add/;
+
+  add_proto qw/void vp9_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
+  specialize qw/vp9_iwht4x4_16_add/;
+}
+
+
+# High bitdepth functions
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+#
+# dct
+#
+add_proto qw/void vp9_high_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_idct4x4_1_add/;
  
-add_proto qw/void vp9_idct16x16_10_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct16x16_10_add sse2 ssse3 neon_asm dspr2/;
-$vp9_idct16x16_10_add_neon_asm=vp9_idct16x16_10_add_neon;
+add_proto qw/void vp9_high_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_idct4x4_16_add/;
  
-add_proto qw/void vp9_idct32x32_1024_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct32x32_1024_add sse2 neon_asm dspr2/;
-$vp9_idct32x32_1024_add_neon_asm=vp9_idct32x32_1024_add_neon;
+add_proto qw/void vp9_high_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_idct8x8_1_add/;
  
-add_proto qw/void vp9_idct32x32_34_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct32x32_34_add sse2 neon_asm dspr2/;
-$vp9_idct32x32_34_add_neon_asm=vp9_idct32x32_1024_add_neon;
+add_proto qw/void vp9_high_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_idct8x8_64_add/;
  
-add_proto qw/void vp9_idct32x32_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_idct32x32_1_add sse2 neon_asm dspr2/;
-$vp9_idct32x32_1_add_neon_asm=vp9_idct32x32_1_add_neon;
+add_proto qw/void vp9_high_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_idct8x8_10_add/;
  
-add_proto qw/void vp9_iht4x4_16_add/, "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type";
-specialize qw/vp9_iht4x4_16_add sse2 neon_asm dspr2/;
-$vp9_iht4x4_16_add_neon_asm=vp9_iht4x4_16_add_neon;
+add_proto qw/void vp9_high_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_idct16x16_1_add/;
  
-add_proto qw/void vp9_iht8x8_64_add/, "const int16_t *input, uint8_t *dest, int dest_stride, int tx_type";
-specialize qw/vp9_iht8x8_64_add sse2 neon_asm dspr2/;
-$vp9_iht8x8_64_add_neon_asm=vp9_iht8x8_64_add_neon;
+add_proto qw/void vp9_high_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_idct16x16_256_add/;
  
-add_proto qw/void vp9_iht16x16_256_add/, "const int16_t *input, uint8_t *output, int pitch, int tx_type";
-specialize qw/vp9_iht16x16_256_add sse2 dspr2/;
+add_proto qw/void vp9_high_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_idct16x16_10_add/;
+
+add_proto qw/void vp9_high_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_idct32x32_1024_add/;
+
+add_proto qw/void vp9_high_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_idct32x32_34_add/;
+
+add_proto qw/void vp9_high_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_idct32x32_1_add/;
+
+add_proto qw/void vp9_high_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+specialize qw/vp9_high_iht4x4_16_add/;
+
+add_proto qw/void vp9_high_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd";
+specialize qw/vp9_high_iht8x8_64_add/;
+
+add_proto qw/void vp9_high_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd";
+specialize qw/vp9_high_iht16x16_256_add/;
  
  # dct and add
  
-add_proto qw/void vp9_iwht4x4_1_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_iwht4x4_1_add/;
+add_proto qw/void vp9_high_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_iwht4x4_1_add/;
  
-add_proto qw/void vp9_iwht4x4_16_add/, "const int16_t *input, uint8_t *dest, int dest_stride";
-specialize qw/vp9_iwht4x4_16_add/;
+add_proto qw/void vp9_high_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd";
+specialize qw/vp9_high_iwht4x4_16_add/;
+}
  
  #
  # Encoder functions below this point.
@@ -699,23 +816,42 @@ add_proto qw/unsigned int vp9_get_mb_ss/, "const int16_t *";
  specialize qw/vp9_get_mb_ss/, "$sse2_x86inc";
  # ENCODEMB INVOKE
  
-add_proto qw/int64_t vp9_block_error/, "const int16_t *coeff, const int16_t *dqcoeff, intptr_t block_size, int64_t *ssz";
-specialize qw/vp9_block_error avx2/, "$sse2_x86inc";
-
  add_proto qw/void vp9_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
  specialize qw/vp9_subtract_block neon/, "$sse2_x86inc";
  
-add_proto qw/void vp9_quantize_fp/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-specialize qw/vp9_quantize_fp neon/, "$ssse3_x86_64";
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+# the transform coefficients are held in 32-bit
+# values, so the assembler code for  vp9_block_error can no longer be used.
+  add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
+  specialize qw/vp9_block_error/;
+
+  add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_quantize_fp/;
  
-add_proto qw/void vp9_quantize_fp_32x32/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64";
+  add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_quantize_fp_32x32/;
  
-add_proto qw/void vp9_quantize_b/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-specialize qw/vp9_quantize_b/, "$ssse3_x86_64";
+  add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_quantize_b/;
  
-add_proto qw/void vp9_quantize_b_32x32/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-specialize qw/vp9_quantize_b_32x32/, "$ssse3_x86_64";
+  add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_quantize_b_32x32/;
+} else {
+  add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
+  specialize qw/vp9_block_error avx2/;
+
+  add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_quantize_fp neon/, "$ssse3_x86_64";
+
+  add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64";
+
+  add_proto qw/void vp9_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_quantize_b/, "$ssse3_x86_64";
+
+  add_proto qw/void vp9_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_quantize_b_32x32/;
+}
  
  #
  # Structured Similarity (SSIM)
@@ -729,44 +865,86 @@ if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") {
  }
  
  # fdct functions
-add_proto qw/void vp9_fht4x4/, "const int16_t *input, int16_t *output, int stride, int tx_type";
-specialize qw/vp9_fht4x4 sse2/;
  
-add_proto qw/void vp9_fht8x8/, "const int16_t *input, int16_t *output, int stride, int tx_type";
-specialize qw/vp9_fht8x8 sse2/;
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp9_fht4x4/;
+
+  add_proto qw/void vp9_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp9_fht8x8/;
  
-add_proto qw/void vp9_fht16x16/, "const int16_t *input, int16_t *output, int stride, int tx_type";
-specialize qw/vp9_fht16x16 sse2/;
+  add_proto qw/void vp9_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp9_fht16x16/;
  
-add_proto qw/void vp9_fwht4x4/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fwht4x4/, "$mmx_x86inc";
+  add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_fwht4x4/;
  
-add_proto qw/void vp9_fdct4x4_1/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct4x4_1 sse2/;
+  add_proto qw/void vp9_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_fdct4x4_1/;
  
-add_proto qw/void vp9_fdct4x4/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct4x4 sse2/;
+  add_proto qw/void vp9_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_fdct4x4/;
  
-add_proto qw/void vp9_fdct8x8_1/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct8x8_1 sse2 neon/;
+  add_proto qw/void vp9_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_fdct8x8_1/;
  
-add_proto qw/void vp9_fdct8x8/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct8x8 sse2 neon/, "$ssse3_x86_64";
+  add_proto qw/void vp9_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_fdct8x8/;
  
-add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct16x16_1 sse2/;
+  add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_fdct16x16_1/;
  
-add_proto qw/void vp9_fdct16x16/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct16x16 sse2/;
+  add_proto qw/void vp9_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_fdct16x16/;
  
-add_proto qw/void vp9_fdct32x32_1/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct32x32_1 sse2/;
+  add_proto qw/void vp9_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_fdct32x32_1/;
  
-add_proto qw/void vp9_fdct32x32/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct32x32 sse2 avx2/;
+  add_proto qw/void vp9_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_fdct32x32/;
+
+  add_proto qw/void vp9_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_fdct32x32_rd/;
+} else {
+  add_proto qw/void vp9_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp9_fht4x4 sse2/;
  
-add_proto qw/void vp9_fdct32x32_rd/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct32x32_rd sse2 avx2/;
+  add_proto qw/void vp9_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp9_fht8x8 sse2/;
+
+  add_proto qw/void vp9_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp9_fht16x16 sse2/;
+
+  add_proto qw/void vp9_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_fwht4x4/, "$mmx_x86inc";
+
+  add_proto qw/void vp9_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_fdct4x4_1 sse2/;
+
+  add_proto qw/void vp9_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_fdct4x4 sse2/;
+
+  add_proto qw/void vp9_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_fdct8x8_1 sse2 neon/;
+
+  add_proto qw/void vp9_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_fdct8x8 sse2 neon/, "$ssse3_x86_64";
+
+  add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_fdct16x16_1 sse2/;
+
+  add_proto qw/void vp9_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_fdct16x16 sse2/;
+
+  add_proto qw/void vp9_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_fdct32x32_1 sse2/;
+
+  add_proto qw/void vp9_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_fdct32x32 sse2 avx2/;
+
+  add_proto qw/void vp9_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_fdct32x32_rd sse2 avx2/;
+}
  
  #
  # Motion search
@@ -788,6 +966,654 @@ specialize qw/vp9_full_range_search/;
  add_proto qw/void vp9_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
  specialize qw/vp9_temporal_filter_apply sse2/;
  
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+
+  # variance
+  add_proto qw/unsigned int vp9_high_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_variance32x16/;
+
+  add_proto qw/unsigned int vp9_high_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_variance16x32/;
+
+  add_proto qw/unsigned int vp9_high_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_variance64x32/;
+
+  add_proto qw/unsigned int vp9_high_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_variance32x64/;
+
+  add_proto qw/unsigned int vp9_high_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_variance32x32/;
+
+  add_proto qw/unsigned int vp9_high_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_variance64x64/;
+
+  add_proto qw/unsigned int vp9_high_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_variance16x16/;
+
+  add_proto qw/unsigned int vp9_high_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_variance16x8/;
+
+  add_proto qw/unsigned int vp9_high_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_variance8x16/;
+
+  add_proto qw/unsigned int vp9_high_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_variance8x8/;
+
+  add_proto qw/unsigned int vp9_high_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_variance8x4/;
+
+  add_proto qw/unsigned int vp9_high_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_variance4x8/;
+
+  add_proto qw/unsigned int vp9_high_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_variance4x4/;
+
+  add_proto qw/void vp9_high_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+  specialize qw/vp9_high_get8x8var/;
+
+  add_proto qw/void vp9_high_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+  specialize qw/vp9_high_get16x16var/;
+
+  add_proto qw/unsigned int vp9_high_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_variance32x16/;
+
+  add_proto qw/unsigned int vp9_high_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_variance16x32/;
+
+  add_proto qw/unsigned int vp9_high_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_variance64x32/;
+
+  add_proto qw/unsigned int vp9_high_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_variance32x64/;
+
+  add_proto qw/unsigned int vp9_high_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_variance32x32/;
+
+  add_proto qw/unsigned int vp9_high_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_variance64x64/;
+
+  add_proto qw/unsigned int vp9_high_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_variance16x16/;
+
+  add_proto qw/unsigned int vp9_high_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_variance16x8/;
+
+  add_proto qw/unsigned int vp9_high_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_variance8x16/;
+
+  add_proto qw/unsigned int vp9_high_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_variance8x8/;
+
+  add_proto qw/unsigned int vp9_high_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_variance8x4/;
+
+  add_proto qw/unsigned int vp9_high_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_variance4x8/;
+
+  add_proto qw/unsigned int vp9_high_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_variance4x4/;
+
+  add_proto qw/void vp9_high_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+  specialize qw/vp9_high_10_get8x8var/;
+
+  add_proto qw/void vp9_high_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+  specialize qw/vp9_high_10_get16x16var/;
+
+  add_proto qw/unsigned int vp9_high_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_variance32x16/;
+
+  add_proto qw/unsigned int vp9_high_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_variance16x32/;
+
+  add_proto qw/unsigned int vp9_high_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_variance64x32/;
+
+  add_proto qw/unsigned int vp9_high_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_variance32x64/;
+
+  add_proto qw/unsigned int vp9_high_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_variance32x32/;
+
+  add_proto qw/unsigned int vp9_high_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_variance64x64/;
+
+  add_proto qw/unsigned int vp9_high_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_variance16x16/;
+
+  add_proto qw/unsigned int vp9_high_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_variance16x8/;
+
+  add_proto qw/unsigned int vp9_high_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_variance8x16/;
+
+  add_proto qw/unsigned int vp9_high_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_variance8x8/;
+
+  add_proto qw/unsigned int vp9_high_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_variance8x4/;
+
+  add_proto qw/unsigned int vp9_high_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_variance4x8/;
+
+  add_proto qw/unsigned int vp9_high_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_variance4x4/;
+
+  add_proto qw/void vp9_high_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+  specialize qw/vp9_high_12_get8x8var/;
+
+  add_proto qw/void vp9_high_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+  specialize qw/vp9_high_12_get16x16var/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_sub_pixel_variance64x64/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_sub_pixel_avg_variance64x64/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_sub_pixel_variance32x64/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_sub_pixel_avg_variance32x64/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_sub_pixel_variance64x32/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_sub_pixel_avg_variance64x32/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_sub_pixel_variance32x16/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_sub_pixel_avg_variance32x16/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_sub_pixel_variance16x32/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_sub_pixel_avg_variance16x32/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_sub_pixel_variance32x32/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_sub_pixel_avg_variance32x32/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_sub_pixel_variance16x16/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_sub_pixel_avg_variance16x16/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_sub_pixel_variance8x16/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_sub_pixel_avg_variance8x16/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_sub_pixel_variance16x8/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_sub_pixel_avg_variance16x8/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_sub_pixel_variance8x8/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_sub_pixel_avg_variance8x8/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_sub_pixel_variance8x4/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_sub_pixel_avg_variance8x4/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_sub_pixel_variance4x8/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_sub_pixel_avg_variance4x8/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_sub_pixel_variance4x4/;
+
+  add_proto qw/unsigned int vp9_high_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_sub_pixel_avg_variance4x4/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_sub_pixel_variance64x64/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_10_sub_pixel_avg_variance64x64/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_sub_pixel_variance32x64/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_10_sub_pixel_avg_variance32x64/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_sub_pixel_variance64x32/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_10_sub_pixel_avg_variance64x32/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_sub_pixel_variance32x16/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_10_sub_pixel_avg_variance32x16/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_sub_pixel_variance16x32/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_10_sub_pixel_avg_variance16x32/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_sub_pixel_variance32x32/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_10_sub_pixel_avg_variance32x32/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_sub_pixel_variance16x16/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_10_sub_pixel_avg_variance16x16/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_sub_pixel_variance8x16/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_10_sub_pixel_avg_variance8x16/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_sub_pixel_variance16x8/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_10_sub_pixel_avg_variance16x8/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_sub_pixel_variance8x8/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_10_sub_pixel_avg_variance8x8/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_sub_pixel_variance8x4/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_10_sub_pixel_avg_variance8x4/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_sub_pixel_variance4x8/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_10_sub_pixel_avg_variance4x8/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_sub_pixel_variance4x4/;
+
+  add_proto qw/unsigned int vp9_high_10_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_10_sub_pixel_avg_variance4x4/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_sub_pixel_variance64x64/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_12_sub_pixel_avg_variance64x64/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_sub_pixel_variance32x64/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_12_sub_pixel_avg_variance32x64/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_sub_pixel_variance64x32/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_12_sub_pixel_avg_variance64x32/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_sub_pixel_variance32x16/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_12_sub_pixel_avg_variance32x16/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_sub_pixel_variance16x32/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_12_sub_pixel_avg_variance16x32/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_sub_pixel_variance32x32/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_12_sub_pixel_avg_variance32x32/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_sub_pixel_variance16x16/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_12_sub_pixel_avg_variance16x16/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_sub_pixel_variance8x16/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_12_sub_pixel_avg_variance8x16/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_sub_pixel_variance16x8/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_12_sub_pixel_avg_variance16x8/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_sub_pixel_variance8x8/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_12_sub_pixel_avg_variance8x8/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_sub_pixel_variance8x4/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_12_sub_pixel_avg_variance8x4/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_sub_pixel_variance4x8/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_12_sub_pixel_avg_variance4x8/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_sub_pixel_variance4x4/;
+
+  add_proto qw/unsigned int vp9_high_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
+  specialize qw/vp9_high_12_sub_pixel_avg_variance4x4/;
+
+  add_proto qw/unsigned int vp9_high_sad64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
+  specialize qw/vp9_high_sad64x64/;
+
+  add_proto qw/unsigned int vp9_high_sad32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vp9_high_sad32x64/;
+
+  add_proto qw/unsigned int vp9_high_sad64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vp9_high_sad64x32/;
+
+  add_proto qw/unsigned int vp9_high_sad32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vp9_high_sad32x16/;
+
+  add_proto qw/unsigned int vp9_high_sad16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vp9_high_sad16x32/;
+
+  add_proto qw/unsigned int vp9_high_sad32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
+  specialize qw/vp9_high_sad32x32/;
+
+  add_proto qw/unsigned int vp9_high_sad16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
+  specialize qw/vp9_high_sad16x16/;
+
+  add_proto qw/unsigned int vp9_high_sad16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
+  specialize qw/vp9_high_sad16x8/;
+
+  add_proto qw/unsigned int vp9_high_sad8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
+  specialize qw/vp9_high_sad8x16/;
+
+  add_proto qw/unsigned int vp9_high_sad8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
+  specialize qw/vp9_high_sad8x8/;
+
+  add_proto qw/unsigned int vp9_high_sad8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vp9_high_sad8x4/;
+
+  add_proto qw/unsigned int vp9_high_sad4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride";
+  specialize qw/vp9_high_sad4x8/;
+
+  add_proto qw/unsigned int vp9_high_sad4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride";
+  specialize qw/vp9_high_sad4x4/;
+
+  add_proto qw/unsigned int vp9_high_sad64x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
+  specialize qw/vp9_high_sad64x64_avg/;
+
+  add_proto qw/unsigned int vp9_high_sad32x64_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+  specialize qw/vp9_high_sad32x64_avg/;
+
+  add_proto qw/unsigned int vp9_high_sad64x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+  specialize qw/vp9_high_sad64x32_avg/;
+
+  add_proto qw/unsigned int vp9_high_sad32x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+  specialize qw/vp9_high_sad32x16_avg/;
+
+  add_proto qw/unsigned int vp9_high_sad16x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+  specialize qw/vp9_high_sad16x32_avg/;
+
+  add_proto qw/unsigned int vp9_high_sad32x32_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
+  specialize qw/vp9_high_sad32x32_avg/;
+
+  add_proto qw/unsigned int vp9_high_sad16x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
+  specialize qw/vp9_high_sad16x16_avg/;
+
+  add_proto qw/unsigned int vp9_high_sad16x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
+  specialize qw/vp9_high_sad16x8_avg/;
+
+  add_proto qw/unsigned int vp9_high_sad8x16_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
+  specialize qw/vp9_high_sad8x16_avg/;
+
+  add_proto qw/unsigned int vp9_high_sad8x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
+  specialize qw/vp9_high_sad8x8_avg/;
+
+  add_proto qw/unsigned int vp9_high_sad8x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+  specialize qw/vp9_high_sad8x4_avg/;
+
+  add_proto qw/unsigned int vp9_high_sad4x8_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+  specialize qw/vp9_high_sad4x8_avg/;
+
+  add_proto qw/unsigned int vp9_high_sad4x4_avg/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, const uint8_t *second_pred";
+  specialize qw/vp9_high_sad4x4_avg/;
+
+  add_proto qw/void vp9_high_sad64x64x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_high_sad64x64x3/;
+
+  add_proto qw/void vp9_high_sad32x32x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_high_sad32x32x3/;
+
+  add_proto qw/void vp9_high_sad16x16x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_high_sad16x16x3/;
+
+  add_proto qw/void vp9_high_sad16x8x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_high_sad16x8x3/;
+
+  add_proto qw/void vp9_high_sad8x16x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_high_sad8x16x3/;
+
+  add_proto qw/void vp9_high_sad8x8x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_high_sad8x8x3/;
+
+  add_proto qw/void vp9_high_sad4x4x3/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_high_sad4x4x3/;
+
+  add_proto qw/void vp9_high_sad64x64x8/, "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array";
+  specialize qw/vp9_high_sad64x64x8/;
+
+  add_proto qw/void vp9_high_sad32x32x8/, "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array";
+  specialize qw/vp9_high_sad32x32x8/;
+
+  add_proto qw/void vp9_high_sad16x16x8/, "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array";
+  specialize qw/vp9_high_sad16x16x8/;
+
+  add_proto qw/void vp9_high_sad16x8x8/, "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array";
+  specialize qw/vp9_high_sad16x8x8/;
+
+  add_proto qw/void vp9_high_sad8x16x8/, "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array";
+  specialize qw/vp9_high_sad8x16x8/;
+
+  add_proto qw/void vp9_high_sad8x8x8/, "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array";
+  specialize qw/vp9_high_sad8x8x8/;
+
+  add_proto qw/void vp9_high_sad8x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  specialize qw/vp9_high_sad8x4x8/;
+
+  add_proto qw/void vp9_high_sad4x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  specialize qw/vp9_high_sad4x8x8/;
+
+  add_proto qw/void vp9_high_sad4x4x8/, "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int  ref_stride, uint32_t *sad_array";
+  specialize qw/vp9_high_sad4x4x8/;
+
+  add_proto qw/void vp9_high_sad64x64x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_high_sad64x64x4d/;
+
+  add_proto qw/void vp9_high_sad32x64x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_high_sad32x64x4d/;
+
+  add_proto qw/void vp9_high_sad64x32x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_high_sad64x32x4d/;
+
+  add_proto qw/void vp9_high_sad32x16x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_high_sad32x16x4d/;
+
+  add_proto qw/void vp9_high_sad16x32x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_high_sad16x32x4d/;
+
+  add_proto qw/void vp9_high_sad32x32x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_high_sad32x32x4d/;
+
+  add_proto qw/void vp9_high_sad16x16x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_high_sad16x16x4d/;
+
+  add_proto qw/void vp9_high_sad16x8x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_high_sad16x8x4d/;
+
+  add_proto qw/void vp9_high_sad8x16x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_high_sad8x16x4d/;
+
+  add_proto qw/void vp9_high_sad8x8x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_high_sad8x8x4d/;
+
+  # TODO(jingning): need to convert these 4x8/8x4 functions into sse2 form
+  add_proto qw/void vp9_high_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_high_sad8x4x4d/;
+
+  add_proto qw/void vp9_high_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_high_sad4x8x4d/;
+
+  add_proto qw/void vp9_high_sad4x4x4d/, "const uint8_t *src_ptr, int  src_stride, const uint8_t* const ref_ptr[], int  ref_stride, unsigned int *sad_array";
+  specialize qw/vp9_high_sad4x4x4d/;
+
+  add_proto qw/unsigned int vp9_high_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vp9_high_mse16x16/;
+
+  add_proto qw/unsigned int vp9_high_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vp9_high_mse8x16/;
+
+  add_proto qw/unsigned int vp9_high_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vp9_high_mse16x8/;
+
+  add_proto qw/unsigned int vp9_high_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vp9_high_mse8x8/;
+
+  add_proto qw/unsigned int vp9_high_10_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_mse16x16/;
+
+  add_proto qw/unsigned int vp9_high_10_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_mse8x16/;
+
+  add_proto qw/unsigned int vp9_high_10_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_mse16x8/;
+
+  add_proto qw/unsigned int vp9_high_10_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vp9_high_10_mse8x8/;
+
+  add_proto qw/unsigned int vp9_high_12_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_mse16x16/;
+
+  add_proto qw/unsigned int vp9_high_12_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_mse8x16/;
+
+  add_proto qw/unsigned int vp9_high_12_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_mse16x8/;
+
+  add_proto qw/unsigned int vp9_high_12_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+  specialize qw/vp9_high_12_mse8x8/;
+
+  # ENCODEMB INVOKE
+
+  add_proto qw/int64_t vp9_high_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
+  specialize qw/vp9_high_block_error/;
+
+  add_proto qw/void vp9_high_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
+  specialize qw/vp9_high_subtract_block/;
+
+  add_proto qw/void vp9_high_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_high_quantize_fp/;
+
+  add_proto qw/void vp9_high_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_high_quantize_fp_32x32/;
+
+  add_proto qw/void vp9_high_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_high_quantize_b/;
+
+  add_proto qw/void vp9_high_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vp9_high_quantize_b_32x32/;
+
+  #
+  # Structured Similarity (SSIM)
+  #
+  if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") {
+    add_proto qw/void vp9_high_ssim_parms_8x8/, "uint16_t *s, int sp, uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
+    specialize qw/vp9_high_ssim_parms_8x8/;
+
+    add_proto qw/void vp9_high_ssim_parms_8x8_shift/, "uint16_t *s, int sp, uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr, unsigned int bd, unsigned int shift";
+    specialize qw/vp9_high_ssim_parms_8x8_shift/;
+  }
+
+  # fdct functions
+  add_proto qw/void vp9_high_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp9_high_fht4x4/;
+
+  add_proto qw/void vp9_high_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp9_high_fht8x8/;
+
+  add_proto qw/void vp9_high_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp9_high_fht16x16/;
+
+  add_proto qw/void vp9_high_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_high_fwht4x4/;
+
+  add_proto qw/void vp9_high_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_high_fdct4x4/;
+
+  add_proto qw/void vp9_high_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_high_fdct8x8_1/;
+
+  add_proto qw/void vp9_high_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_high_fdct8x8/;
+
+  add_proto qw/void vp9_high_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_high_fdct16x16_1/;
+
+  add_proto qw/void vp9_high_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_high_fdct16x16/;
+
+  add_proto qw/void vp9_high_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_high_fdct32x32_1/;
+
+  add_proto qw/void vp9_high_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_high_fdct32x32/;
+
+  add_proto qw/void vp9_high_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
+  specialize qw/vp9_high_fdct32x32_rd/;
+
+  add_proto qw/void vp9_high_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
+  specialize qw/vp9_high_temporal_filter_apply/;
+
+}
+# End vp9_high encoder functions
+
  }
  # end encoder functions
  1;
diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c

index 1b4904c..b6847b9 100644 (file)
--- a/vp9/common/x86/vp9_asm_stubs.c
+++ b/vp9/common/x86/vp9_asm_stubs.c
@@ -139,25 +139,25 @@ void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
                             filter_x, x_step_q4, filter_y, y_step_q4, w, h); \
    } \
  }
-#if HAVE_AVX2
+#if HAVE_AVX2 && HAVE_SSSE3
  filter8_1dfunction vp9_filter_block1d16_v8_avx2;
  filter8_1dfunction vp9_filter_block1d16_h8_avx2;
  filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
-#if (ARCH_X86_64)
+#if ARCH_X86_64
  filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
  filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
  filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
  #define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_intrin_ssse3
  #define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_intrin_ssse3
  #define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_intrin_ssse3
-#else
+#else  // ARCH_X86
  filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
  filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
  filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
  #define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3
  #define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3
  #define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3
-#endif
+#endif  // ARCH_X86_64 / ARCH_X86
  filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
  filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
  filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
@@ -190,9 +190,9 @@ FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
  //                          const int16_t *filter_y, int y_step_q4,
  //                          int w, int h);
  FUN_CONV_2D(, avx2);
-#endif
+#endif  // HAVE_AX2 && HAVE_SSSE3
  #if HAVE_SSSE3
-#if (ARCH_X86_64)
+#if ARCH_X86_64
  filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3;
  filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3;
  filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
@@ -204,14 +204,14 @@ filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
  #define vp9_filter_block1d8_v8_ssse3 vp9_filter_block1d8_v8_intrin_ssse3
  #define vp9_filter_block1d8_h8_ssse3 vp9_filter_block1d8_h8_intrin_ssse3
  #define vp9_filter_block1d4_h8_ssse3 vp9_filter_block1d4_h8_intrin_ssse3
-#else
+#else  // ARCH_X86
  filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
  filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
  filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
  filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
  filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
  filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
-#endif
+#endif  // ARCH_X86_64 / ARCH_X86
  filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;
  filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;
  filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;
@@ -270,7 +270,7 @@ FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
  //                              int w, int h);
  FUN_CONV_2D(, ssse3);
  FUN_CONV_2D(avg_ , ssse3);
-#endif
+#endif  // HAVE_SSSE3
  
  #if HAVE_SSE2
  filter8_1dfunction vp9_filter_block1d16_v8_sse2;
@@ -336,4 +336,4 @@ FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_, sse2);
  //                             int w, int h);
  FUN_CONV_2D(, sse2);
  FUN_CONV_2D(avg_ , sse2);
-#endif
+#endif  // HAVE_SSE2
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c

index aecd906..7615cdd 100644 (file)
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -195,7 +195,7 @@ static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block,
    struct macroblockd_plane *const pd = &xd->plane[plane];
    if (eob > 0) {
      TX_TYPE tx_type = DCT_DCT;
-    int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+    tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
      if (xd->lossless) {
        tx_type = DCT_DCT;
        vp9_iwht4x4_add(dqcoeff, dst, stride, eob);
@@ -960,9 +960,8 @@ static const uint8_t *decode_tiles(VP9Decoder *pbi,
    return vp9_reader_find_end(&tile_data->bit_reader);
  }
  
-static int tile_worker_hook(void *arg1, void *arg2) {
-  TileWorkerData *const tile_data = (TileWorkerData*)arg1;
-  const TileInfo *const tile = (TileInfo*)arg2;
+static int tile_worker_hook(TileWorkerData *const tile_data,
+                            const TileInfo *const tile) {
    int mi_row, mi_col;
  
    for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
@@ -1223,6 +1222,7 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
      }
  
      setup_frame_size(cm, rb);
+    pbi->need_resync = 0;
    } else {
      cm->intra_only = cm->show_frame ? 0 : vp9_rb_read_bit(rb);
  
@@ -1246,6 +1246,7 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
  
        pbi->refresh_frame_flags = vp9_rb_read_literal(rb, REF_FRAMES);
        setup_frame_size(cm, rb);
+      pbi->need_resync = 0;
      } else {
        pbi->refresh_frame_flags = vp9_rb_read_literal(rb, REF_FRAMES);
        for (i = 0; i < REFS_PER_FRAME; ++i) {
@@ -1274,6 +1275,12 @@ static size_t read_uncompressed_header(VP9Decoder *pbi,
      }
    }
  
+  if (pbi->need_resync) {
+    vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
+                       "Keyframe / intra-only frame required to reset decoder"
+                       " state");
+  }
+
    if (!cm->error_resilient_mode) {
      cm->refresh_frame_context = vp9_rb_read_bit(rb);
      cm->frame_parallel_decoding_mode = vp9_rb_read_bit(rb);
diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c

index 9106b0d..6ee3d70 100644 (file)
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c
@@ -60,6 +60,7 @@ VP9Decoder *vp9_decoder_create() {
    }
  
    cm->error.setjmp = 1;
+  pbi->need_resync = 1;
    initialize_dec();
  
    // Initialize the references to not point to any frame buffers.
@@ -238,6 +239,7 @@ int vp9_receive_compressed_data(VP9Decoder *pbi,
    cm->new_fb_idx = get_free_fb(cm);
  
    if (setjmp(cm->error.jmp)) {
+    pbi->need_resync = 1;
      cm->error.setjmp = 0;
      vp9_clear_system_state();
  
diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h

index 848d212..4f52bb9 100644 (file)
--- a/vp9/decoder/vp9_decoder.h
+++ b/vp9/decoder/vp9_decoder.h
@@ -58,6 +58,7 @@ typedef struct VP9Decoder {
  
    int max_threads;
    int inv_tile_order;
+  int need_resync;  // wait for key/intra-only frame
  } VP9Decoder;
  
  int vp9_receive_compressed_data(struct VP9Decoder *pbi,
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c

index 91cdf38..76ca1ae 100644 (file)
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -51,7 +51,7 @@
    } while (0)
  
  static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, PLANE_TYPE type,
-                       int16_t *dqcoeff, TX_SIZE tx_size, const int16_t *dq,
+                       tran_low_t *dqcoeff, TX_SIZE tx_size, const int16_t *dq,
                         int ctx, const int16_t *scan, const int16_t *nb,
                         vp9_reader *r) {
    const int max_eob = 16 << (tx_size << 1);
diff --git a/vp9/decoder/vp9_dthread.c b/vp9/decoder/vp9_dthread.c

index b82ea6a..6635880 100644 (file)
--- a/vp9/decoder/vp9_dthread.c
+++ b/vp9/decoder/vp9_dthread.c
@@ -121,10 +121,10 @@ static void loop_filter_rows_mt(const YV12_BUFFER_CONFIG *const frame_buffer,
  }
  
  // Row-based multi-threaded loopfilter hook
-static int loop_filter_row_worker(void *arg1, void *arg2) {
-  TileWorkerData *const tile_data = (TileWorkerData*)arg1;
+static int loop_filter_row_worker(TileWorkerData *const tile_data,
+                                  void *unused) {
    LFWorkerData *const lf_data = &tile_data->lfdata;
-  (void) arg2;
+  (void)unused;
    loop_filter_rows_mt(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
                        lf_data->start, lf_data->stop, lf_data->y_only,
                        lf_data->lf_sync, lf_data->num_lf_workers);
@@ -145,15 +145,13 @@ void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
    const int num_workers = MIN(pbi->max_threads & ~1, tile_cols);
    int i;
  
-  // Allocate memory used in thread synchronization.
-  // This always needs to be done even if frame_filter_level is 0.
+  if (!frame_filter_level) return;
+
    if (!lf_sync->sync_range || cm->last_height != cm->height) {
      vp9_loop_filter_dealloc(lf_sync);
-    vp9_loop_filter_alloc(cm, lf_sync, sb_rows, cm->width);
+    vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width);
    }
  
-  if (!frame_filter_level) return;
-
    vp9_loop_filter_frame_init(cm, frame_filter_level);
  
    // Initialize cur_sb_col to -1 for all SB rows.
@@ -216,7 +214,7 @@ static int get_sync_range(int width) {
  }
  
  // Allocate memory for lf row synchronization
-void vp9_loop_filter_alloc(VP9_COMMON *cm, VP9LfSync *lf_sync, int rows,
+void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,
                             int width) {
    lf_sync->rows = rows;
  #if CONFIG_MULTITHREAD
diff --git a/vp9/decoder/vp9_dthread.h b/vp9/decoder/vp9_dthread.h

index 8b02ef7..b1fbdeb 100644 (file)
--- a/vp9/decoder/vp9_dthread.h
+++ b/vp9/decoder/vp9_dthread.h
@@ -42,8 +42,8 @@ typedef struct VP9LfSyncData {
  } VP9LfSync;
  
  // Allocate memory for loopfilter row synchronization.
-void vp9_loop_filter_alloc(struct VP9Common *cm, VP9LfSync *lf_sync,
-                           int rows, int width);
+void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,
+                           int width);
  
  // Deallocate loopfilter synchronization related mutex and data.
  void vp9_loop_filter_dealloc(VP9LfSync *lf_sync);
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h

index b726383..767bd7f 100644 (file)
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -28,8 +28,8 @@ typedef struct {
  
  struct macroblock_plane {
    DECLARE_ALIGNED(16, int16_t, src_diff[64 * 64]);
-  int16_t *qcoeff;
-  int16_t *coeff;
+  tran_low_t *qcoeff;
+  tran_low_t *coeff;
    uint16_t *eobs;
    struct buf_2d src;
  
@@ -119,8 +119,12 @@ struct macroblock {
    // Used to store sub partition's choices.
    MV pred_mv[MAX_REF_FRAMES];
  
-  void (*fwd_txm4x4)(const int16_t *input, int16_t *output, int stride);
-  void (*itxm_add)(const int16_t *input, uint8_t *dest, int stride, int eob);
+  void (*fwd_txm4x4)(const int16_t *input, tran_low_t *output, int stride);
+  void (*itxm_add)(const tran_low_t *input, uint8_t *dest, int stride, int eob);
+#if CONFIG_VP9_HIGHBITDEPTH
+  void (*high_itxm_add)(const tran_low_t *input, uint8_t *dest, int stride,
+                        int eob, int bd);
+#endif
  };
  
  #ifdef __cplusplus
diff --git a/vp9/encoder/vp9_context_tree.c b/vp9/encoder/vp9_context_tree.c

index 9b7a932..12acc51 100644 (file)
--- a/vp9/encoder/vp9_context_tree.c
+++ b/vp9/encoder/vp9_context_tree.c
@@ -30,13 +30,13 @@ static void alloc_mode_context(VP9_COMMON *cm, int num_4x4_blk,
    for (i = 0; i < MAX_MB_PLANE; ++i) {
      for (k = 0; k < 3; ++k) {
        CHECK_MEM_ERROR(cm, ctx->coeff[i][k],
-                      vpx_memalign(16, num_pix * sizeof(int16_t)));
+                      vpx_memalign(16, num_pix * sizeof(*ctx->coeff[i][k])));
        CHECK_MEM_ERROR(cm, ctx->qcoeff[i][k],
-                      vpx_memalign(16, num_pix * sizeof(int16_t)));
+                      vpx_memalign(16, num_pix * sizeof(*ctx->qcoeff[i][k])));
        CHECK_MEM_ERROR(cm, ctx->dqcoeff[i][k],
-                      vpx_memalign(16, num_pix * sizeof(int16_t)));
+                      vpx_memalign(16, num_pix * sizeof(*ctx->dqcoeff[i][k])));
        CHECK_MEM_ERROR(cm, ctx->eobs[i][k],
-                      vpx_memalign(16, num_pix * sizeof(uint16_t)));
+                      vpx_memalign(16, num_pix * sizeof(*ctx->eobs[i][k])));
        ctx->coeff_pbuf[i][k]   = ctx->coeff[i][k];
        ctx->qcoeff_pbuf[i][k]  = ctx->qcoeff[i][k];
        ctx->dqcoeff_pbuf[i][k] = ctx->dqcoeff[i][k];
diff --git a/vp9/encoder/vp9_context_tree.h b/vp9/encoder/vp9_context_tree.h

index 236389b..97f0741 100644 (file)
--- a/vp9/encoder/vp9_context_tree.h
+++ b/vp9/encoder/vp9_context_tree.h
@@ -19,15 +19,15 @@ struct VP9_COMP;
  typedef struct {
    MODE_INFO mic;
    uint8_t *zcoeff_blk;
-  int16_t *coeff[MAX_MB_PLANE][3];
-  int16_t *qcoeff[MAX_MB_PLANE][3];
-  int16_t *dqcoeff[MAX_MB_PLANE][3];
+  tran_low_t *coeff[MAX_MB_PLANE][3];
+  tran_low_t *qcoeff[MAX_MB_PLANE][3];
+  tran_low_t *dqcoeff[MAX_MB_PLANE][3];
    uint16_t *eobs[MAX_MB_PLANE][3];
  
    // dual buffer pointers, 0: in use, 1: best in store
-  int16_t *coeff_pbuf[MAX_MB_PLANE][3];
-  int16_t *qcoeff_pbuf[MAX_MB_PLANE][3];
-  int16_t *dqcoeff_pbuf[MAX_MB_PLANE][3];
+  tran_low_t *coeff_pbuf[MAX_MB_PLANE][3];
+  tran_low_t *qcoeff_pbuf[MAX_MB_PLANE][3];
+  tran_low_t *dqcoeff_pbuf[MAX_MB_PLANE][3];
    uint16_t *eobs_pbuf[MAX_MB_PLANE][3];
  
    int is_coded;
diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c

index 59222f0..eff8996 100644 (file)
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -18,15 +18,17 @@
  #include "vp9/common/vp9_idct.h"
  #include "vp9/common/vp9_systemdependent.h"
  
-static INLINE int fdct_round_shift(int input) {
-  int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
-  assert(INT16_MIN <= rv && rv <= INT16_MAX);
+static INLINE tran_high_t fdct_round_shift(tran_high_t input) {
+  tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+  // TODO(debargha, peter.derivaz): Find new bounds for this assert
+  // and make the bounds consts.
+  // assert(INT16_MIN <= rv && rv <= INT16_MAX);
    return rv;
  }
  
-static void fdct4(const int16_t *input, int16_t *output) {
-  int16_t step[4];
-  int temp1, temp2;
+static void fdct4(const tran_low_t *input, tran_low_t *output) {
+  tran_high_t step[4];
+  tran_high_t temp1, temp2;
  
    step[0] = input[0] + input[3];
    step[1] = input[1] + input[2];
@@ -43,9 +45,9 @@ static void fdct4(const int16_t *input, int16_t *output) {
    output[3] = fdct_round_shift(temp2);
  }
  
-void vp9_fdct4x4_1_c(const int16_t *input, int16_t *output, int stride) {
+void vp9_fdct4x4_1_c(const int16_t *input, tran_low_t *output, int stride) {
    int r, c;
-  int16_t sum = 0;
+  tran_low_t sum = 0;
    for (r = 0; r < 4; ++r)
      for (c = 0; c < 4; ++c)
        sum += input[r * stride + c];
@@ -54,7 +56,7 @@ void vp9_fdct4x4_1_c(const int16_t *input, int16_t *output, int stride) {
    output[1] = 0;
  }
  
-void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) {
+void vp9_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
    // The 2D transform is done with two passes which are actually pretty
    // similar. In the first one, we transform the columns and transpose
    // the results. In the second one, we transform the rows. To achieve that,
@@ -63,22 +65,23 @@ void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) {
    // in normal/row positions).
    int pass;
    // We need an intermediate buffer between passes.
-  int16_t intermediate[4 * 4];
-  const int16_t *in = input;
-  int16_t *out = intermediate;
+  tran_low_t intermediate[4 * 4];
+  const int16_t *in_pass0 = input;
+  const tran_low_t *in = NULL;
+  tran_low_t *out = intermediate;
    // Do the two transform/transpose passes
    for (pass = 0; pass < 2; ++pass) {
-    /*canbe16*/ int input[4];
-    /*canbe16*/ int step[4];
-    /*needs32*/ int temp1, temp2;
+    tran_high_t input[4];      // canbe16
+    tran_high_t step[4];       // canbe16
+    tran_high_t temp1, temp2;  // needs32
      int i;
      for (i = 0; i < 4; ++i) {
        // Load inputs.
        if (0 == pass) {
-        input[0] = in[0 * stride] * 16;
-        input[1] = in[1 * stride] * 16;
-        input[2] = in[2 * stride] * 16;
-        input[3] = in[3 * stride] * 16;
+        input[0] = in_pass0[0 * stride] * 16;
+        input[1] = in_pass0[1 * stride] * 16;
+        input[2] = in_pass0[2 * stride] * 16;
+        input[3] = in_pass0[3 * stride] * 16;
          if (i == 0 && input[0]) {
            input[0] += 1;
          }
@@ -102,6 +105,7 @@ void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) {
        out[1] = fdct_round_shift(temp1);
        out[3] = fdct_round_shift(temp2);
        // Do next column (which is a transposed row in second/horizontal pass)
+      in_pass0++;
        in++;
        out += 4;
      }
@@ -119,9 +123,9 @@ void vp9_fdct4x4_c(const int16_t *input, int16_t *output, int stride) {
    }
  }
  
-static void fadst4(const int16_t *input, int16_t *output) {
-  int x0, x1, x2, x3;
-  int s0, s1, s2, s3, s4, s5, s6, s7;
+static void fadst4(const tran_low_t *input, tran_low_t *output) {
+  tran_high_t x0, x1, x2, x3;
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
  
    x0 = input[0];
    x1 = input[1];
@@ -166,15 +170,15 @@ static const transform_2d FHT_4[] = {
    { fadst4, fadst4 }   // ADST_ADST = 3
  };
  
-void vp9_fht4x4_c(const int16_t *input, int16_t *output,
+void vp9_fht4x4_c(const int16_t *input, tran_low_t *output,
                    int stride, int tx_type) {
    if (tx_type == DCT_DCT) {
      vp9_fdct4x4_c(input, output, stride);
    } else {
-    int16_t out[4 * 4];
-    int16_t *outptr = &out[0];
+    tran_low_t out[4 * 4];
+    tran_low_t *outptr = &out[0];
      int i, j;
-    int16_t temp_in[4], temp_out[4];
+    tran_low_t temp_in[4], temp_out[4];
      const transform_2d ht = FHT_4[tx_type];
  
      // Columns
@@ -199,10 +203,10 @@ void vp9_fht4x4_c(const int16_t *input, int16_t *output,
    }
  }
  
-static void fdct8(const int16_t *input, int16_t *output) {
-  /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
-  /*needs32*/ int t0, t1, t2, t3;
-  /*canbe16*/ int x0, x1, x2, x3;
+static void fdct8(const tran_low_t *input, tran_low_t *output) {
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
+  tran_high_t t0, t1, t2, t3;                  // needs32
+  tran_high_t x0, x1, x2, x3;                  // canbe16
  
    // stage 1
    s0 = input[0] + input[7];
@@ -251,9 +255,9 @@ static void fdct8(const int16_t *input, int16_t *output) {
    output[7] = fdct_round_shift(t3);
  }
  
-void vp9_fdct8x8_1_c(const int16_t *input, int16_t *output, int stride) {
+void vp9_fdct8x8_1_c(const int16_t *input, tran_low_t *output, int stride) {
    int r, c;
-  int16_t sum = 0;
+  tran_low_t sum = 0;
    for (r = 0; r < 8; ++r)
      for (c = 0; c < 8; ++c)
        sum += input[r * stride + c];
@@ -262,16 +266,16 @@ void vp9_fdct8x8_1_c(const int16_t *input, int16_t *output, int stride) {
    output[1] = 0;
  }
  
-void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) {
+void vp9_fdct8x8_c(const int16_t *input, tran_low_t *final_output, int stride) {
    int i, j;
-  int16_t intermediate[64];
+  tran_low_t intermediate[64];
  
    // Transform columns
    {
-    int16_t *output = intermediate;
-    /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
-    /*needs32*/ int t0, t1, t2, t3;
-    /*canbe16*/ int x0, x1, x2, x3;
+    tran_low_t *output = intermediate;
+    tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
+    tran_high_t t0, t1, t2, t3;                  // needs32
+    tran_high_t x0, x1, x2, x3;                  // canbe16
  
      int i;
      for (i = 0; i < 8; i++) {
@@ -333,9 +337,9 @@ void vp9_fdct8x8_c(const int16_t *input, int16_t *final_output, int stride) {
    }
  }
  
-void vp9_fdct16x16_1_c(const int16_t *input, int16_t *output, int stride) {
+void vp9_fdct16x16_1_c(const int16_t *input, tran_low_t *output, int stride) {
    int r, c;
-  int16_t sum = 0;
+  tran_low_t sum = 0;
    for (r = 0; r < 16; ++r)
      for (c = 0; c < 16; ++c)
        sum += input[r * stride + c];
@@ -344,7 +348,7 @@ void vp9_fdct16x16_1_c(const int16_t *input, int16_t *output, int stride) {
    output[1] = 0;
  }
  
-void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) {
+void vp9_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
    // The 2D transform is done with two passes which are actually pretty
    // similar. In the first one, we transform the columns and transpose
    // the results. In the second one, we transform the rows. To achieve that,
@@ -353,37 +357,38 @@ void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) {
    // in normal/row positions).
    int pass;
    // We need an intermediate buffer between passes.
-  int16_t intermediate[256];
-  const int16_t *in = input;
-  int16_t *out = intermediate;
+  tran_low_t intermediate[256];
+  const int16_t *in_pass0 = input;
+  const tran_low_t *in = NULL;
+  tran_low_t *out = intermediate;
    // Do the two transform/transpose passes
    for (pass = 0; pass < 2; ++pass) {
-    /*canbe16*/ int step1[8];
-    /*canbe16*/ int step2[8];
-    /*canbe16*/ int step3[8];
-    /*canbe16*/ int input[8];
-    /*needs32*/ int temp1, temp2;
+    tran_high_t step1[8];      // canbe16
+    tran_high_t step2[8];      // canbe16
+    tran_high_t step3[8];      // canbe16
+    tran_high_t input[8];      // canbe16
+    tran_high_t temp1, temp2;  // needs32
      int i;
      for (i = 0; i < 16; i++) {
        if (0 == pass) {
          // Calculate input for the first 8 results.
-        input[0] = (in[0 * stride] + in[15 * stride]) * 4;
-        input[1] = (in[1 * stride] + in[14 * stride]) * 4;
-        input[2] = (in[2 * stride] + in[13 * stride]) * 4;
-        input[3] = (in[3 * stride] + in[12 * stride]) * 4;
-        input[4] = (in[4 * stride] + in[11 * stride]) * 4;
-        input[5] = (in[5 * stride] + in[10 * stride]) * 4;
-        input[6] = (in[6 * stride] + in[ 9 * stride]) * 4;
-        input[7] = (in[7 * stride] + in[ 8 * stride]) * 4;
+        input[0] = (in_pass0[0 * stride] + in_pass0[15 * stride]) * 4;
+        input[1] = (in_pass0[1 * stride] + in_pass0[14 * stride]) * 4;
+        input[2] = (in_pass0[2 * stride] + in_pass0[13 * stride]) * 4;
+        input[3] = (in_pass0[3 * stride] + in_pass0[12 * stride]) * 4;
+        input[4] = (in_pass0[4 * stride] + in_pass0[11 * stride]) * 4;
+        input[5] = (in_pass0[5 * stride] + in_pass0[10 * stride]) * 4;
+        input[6] = (in_pass0[6 * stride] + in_pass0[ 9 * stride]) * 4;
+        input[7] = (in_pass0[7 * stride] + in_pass0[ 8 * stride]) * 4;
          // Calculate input for the next 8 results.
-        step1[0] = (in[7 * stride] - in[ 8 * stride]) * 4;
-        step1[1] = (in[6 * stride] - in[ 9 * stride]) * 4;
-        step1[2] = (in[5 * stride] - in[10 * stride]) * 4;
-        step1[3] = (in[4 * stride] - in[11 * stride]) * 4;
-        step1[4] = (in[3 * stride] - in[12 * stride]) * 4;
-        step1[5] = (in[2 * stride] - in[13 * stride]) * 4;
-        step1[6] = (in[1 * stride] - in[14 * stride]) * 4;
-        step1[7] = (in[0 * stride] - in[15 * stride]) * 4;
+        step1[0] = (in_pass0[7 * stride] - in_pass0[ 8 * stride]) * 4;
+        step1[1] = (in_pass0[6 * stride] - in_pass0[ 9 * stride]) * 4;
+        step1[2] = (in_pass0[5 * stride] - in_pass0[10 * stride]) * 4;
+        step1[3] = (in_pass0[4 * stride] - in_pass0[11 * stride]) * 4;
+        step1[4] = (in_pass0[3 * stride] - in_pass0[12 * stride]) * 4;
+        step1[5] = (in_pass0[2 * stride] - in_pass0[13 * stride]) * 4;
+        step1[6] = (in_pass0[1 * stride] - in_pass0[14 * stride]) * 4;
+        step1[7] = (in_pass0[0 * stride] - in_pass0[15 * stride]) * 4;
        } else {
          // Calculate input for the first 8 results.
          input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2);
@@ -406,9 +411,9 @@ void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) {
        }
        // Work on the first eight values; fdct8(input, even_results);
        {
-        /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
-        /*needs32*/ int t0, t1, t2, t3;
-        /*canbe16*/ int x0, x1, x2, x3;
+        tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
+        tran_high_t t0, t1, t2, t3;                  // needs32
+        tran_high_t x0, x1, x2, x3;                  // canbe16
  
          // stage 1
          s0 = input[0] + input[7];
@@ -514,6 +519,7 @@ void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) {
        }
        // Do next column (which is a transposed row in second/horizontal pass)
        in++;
+      in_pass0++;
        out += 16;
      }
      // Setup in/out for next pass.
@@ -522,17 +528,17 @@ void vp9_fdct16x16_c(const int16_t *input, int16_t *output, int stride) {
    }
  }
  
-static void fadst8(const int16_t *input, int16_t *output) {
-  int s0, s1, s2, s3, s4, s5, s6, s7;
+static void fadst8(const tran_low_t *input, tran_low_t *output) {
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
  
-  int x0 = input[7];
-  int x1 = input[0];
-  int x2 = input[5];
-  int x3 = input[2];
-  int x4 = input[3];
-  int x5 = input[4];
-  int x6 = input[1];
-  int x7 = input[6];
+  tran_high_t x0 = input[7];
+  tran_high_t x1 = input[0];
+  tran_high_t x2 = input[5];
+  tran_high_t x3 = input[2];
+  tran_high_t x4 = input[3];
+  tran_high_t x5 = input[4];
+  tran_high_t x6 = input[1];
+  tran_high_t x7 = input[6];
  
    // stage 1
    s0 = cospi_2_64  * x0 + cospi_30_64 * x1;
@@ -600,15 +606,15 @@ static const transform_2d FHT_8[] = {
    { fadst8, fadst8 }   // ADST_ADST = 3
  };
  
-void vp9_fht8x8_c(const int16_t *input, int16_t *output,
+void vp9_fht8x8_c(const int16_t *input, tran_low_t *output,
                    int stride, int tx_type) {
    if (tx_type == DCT_DCT) {
      vp9_fdct8x8_c(input, output, stride);
    } else {
-    int16_t out[64];
-    int16_t *outptr = &out[0];
+    tran_low_t out[64];
+    tran_low_t *outptr = &out[0];
      int i, j;
-    int16_t temp_in[8], temp_out[8];
+    tran_low_t temp_in[8], temp_out[8];
      const transform_2d ht = FHT_8[tx_type];
  
      // Columns
@@ -633,17 +639,18 @@ void vp9_fht8x8_c(const int16_t *input, int16_t *output,
  
  /* 4-point reversible, orthonormal Walsh-Hadamard in 3.5 adds, 0.5 shifts per
     pixel. */
-void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride) {
+void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
    int i;
-  int a1, b1, c1, d1, e1;
-  const int16_t *ip = input;
-  int16_t *op = output;
+  tran_high_t a1, b1, c1, d1, e1;
+  const int16_t *ip_pass0 = input;
+  const tran_low_t *ip = NULL;
+  tran_low_t *op = output;
  
    for (i = 0; i < 4; i++) {
-    a1 = ip[0 * stride];
-    b1 = ip[1 * stride];
-    c1 = ip[2 * stride];
-    d1 = ip[3 * stride];
+    a1 = ip_pass0[0 * stride];
+    b1 = ip_pass0[1 * stride];
+    c1 = ip_pass0[2 * stride];
+    d1 = ip_pass0[3 * stride];
  
      a1 += b1;
      d1 = d1 - c1;
@@ -657,7 +664,7 @@ void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride) {
      op[8] = d1;
      op[12] = b1;
  
-    ip++;
+    ip_pass0++;
      op++;
    }
    ip = output;
@@ -687,12 +694,12 @@ void vp9_fwht4x4_c(const int16_t *input, int16_t *output, int stride) {
  }
  
  // Rewrote to use same algorithm as others.
-static void fdct16(const int16_t in[16], int16_t out[16]) {
-  /*canbe16*/ int step1[8];
-  /*canbe16*/ int step2[8];
-  /*canbe16*/ int step3[8];
-  /*canbe16*/ int input[8];
-  /*needs32*/ int temp1, temp2;
+static void fdct16(const tran_low_t in[16], tran_low_t out[16]) {
+  tran_high_t step1[8];      // canbe16
+  tran_high_t step2[8];      // canbe16
+  tran_high_t step3[8];      // canbe16
+  tran_high_t input[8];      // canbe16
+  tran_high_t temp1, temp2;  // needs32
  
    // step 1
    input[0] = in[0] + in[15];
@@ -715,9 +722,9 @@ static void fdct16(const int16_t in[16], int16_t out[16]) {
  
    // fdct8(step, step);
    {
-    /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7;
-    /*needs32*/ int t0, t1, t2, t3;
-    /*canbe16*/ int x0, x1, x2, x3;
+    tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
+    tran_high_t t0, t1, t2, t3;                  // needs32
+    tran_high_t x0, x1, x2, x3;                  // canbe16
  
      // stage 1
      s0 = input[0] + input[7];
@@ -828,25 +835,26 @@ static void fdct16(const int16_t in[16], int16_t out[16]) {
    out[15] = fdct_round_shift(temp2);
  }
  
-static void fadst16(const int16_t *input, int16_t *output) {
-  int s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15;
-
-  int x0 = input[15];
-  int x1 = input[0];
-  int x2 = input[13];
-  int x3 = input[2];
-  int x4 = input[11];
-  int x5 = input[4];
-  int x6 = input[9];
-  int x7 = input[6];
-  int x8 = input[7];
-  int x9 = input[8];
-  int x10 = input[5];
-  int x11 = input[10];
-  int x12 = input[3];
-  int x13 = input[12];
-  int x14 = input[1];
-  int x15 = input[14];
+static void fadst16(const tran_low_t *input, tran_low_t *output) {
+  tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
+  tran_high_t s9, s10, s11, s12, s13, s14, s15;
+
+  tran_high_t x0 = input[15];
+  tran_high_t x1 = input[0];
+  tran_high_t x2 = input[13];
+  tran_high_t x3 = input[2];
+  tran_high_t x4 = input[11];
+  tran_high_t x5 = input[4];
+  tran_high_t x6 = input[9];
+  tran_high_t x7 = input[6];
+  tran_high_t x8 = input[7];
+  tran_high_t x9 = input[8];
+  tran_high_t x10 = input[5];
+  tran_high_t x11 = input[10];
+  tran_high_t x12 = input[3];
+  tran_high_t x13 = input[12];
+  tran_high_t x14 = input[1];
+  tran_high_t x15 = input[14];
  
    // stage 1
    s0 = x0 * cospi_1_64  + x1 * cospi_31_64;
@@ -997,15 +1005,15 @@ static const transform_2d FHT_16[] = {
    { fadst16, fadst16 }   // ADST_ADST = 3
  };
  
-void vp9_fht16x16_c(const int16_t *input, int16_t *output,
+void vp9_fht16x16_c(const int16_t *input, tran_low_t *output,
                      int stride, int tx_type) {
    if (tx_type == DCT_DCT) {
      vp9_fdct16x16_c(input, output, stride);
    } else {
-    int16_t out[256];
-    int16_t *outptr = &out[0];
+    tran_low_t out[256];
+    tran_low_t *outptr = &out[0];
      int i, j;
-    int16_t temp_in[16], temp_out[16];
+    tran_low_t temp_in[16], temp_out[16];
      const transform_2d ht = FHT_16[tx_type];
  
      // Columns
@@ -1028,19 +1036,21 @@ void vp9_fht16x16_c(const int16_t *input, int16_t *output,
    }
  }
  
-static INLINE int dct_32_round(int input) {
-  int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
-  assert(-131072 <= rv && rv <= 131071);
+static INLINE tran_high_t dct_32_round(tran_high_t input) {
+  tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+  // TODO(debargha, peter.derivaz): Find new bounds for this assert,
+  // and make the bounds consts.
+  // assert(-131072 <= rv && rv <= 131071);
    return rv;
  }
  
-static INLINE int half_round_shift(int input) {
-  int rv = (input + 1 + (input < 0)) >> 2;
+static INLINE tran_high_t half_round_shift(tran_high_t input) {
+  tran_high_t rv = (input + 1 + (input < 0)) >> 2;
    return rv;
  }
  
-static void fdct32(const int *input, int *output, int round) {
-  int step[32];
+static void fdct32(const tran_high_t *input, tran_high_t *output, int round) {
+  tran_high_t step[32];
    // Stage 1
    step[0] = input[0] + input[(32 - 1)];
    step[1] = input[1] + input[(32 - 2)];
@@ -1362,9 +1372,9 @@ static void fdct32(const int *input, int *output, int round) {
    output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64);
  }
  
-void vp9_fdct32x32_1_c(const int16_t *input, int16_t *output, int stride) {
+void vp9_fdct32x32_1_c(const int16_t *input, tran_low_t *output, int stride) {
    int r, c;
-  int16_t sum = 0;
+  tran_low_t sum = 0;
    for (r = 0; r < 32; ++r)
      for (c = 0; c < 32; ++c)
        sum += input[r * stride + c];
@@ -1373,13 +1383,13 @@ void vp9_fdct32x32_1_c(const int16_t *input, int16_t *output, int stride) {
    output[1] = 0;
  }
  
-void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) {
+void vp9_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
    int i, j;
-  int output[32 * 32];
+  tran_high_t output[32 * 32];
  
    // Columns
    for (i = 0; i < 32; ++i) {
-    int temp_in[32], temp_out[32];
+    tran_high_t temp_in[32], temp_out[32];
      for (j = 0; j < 32; ++j)
        temp_in[j] = input[j * stride + i] * 4;
      fdct32(temp_in, temp_out, 0);
@@ -1389,7 +1399,7 @@ void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) {
  
    // Rows
    for (i = 0; i < 32; ++i) {
-    int temp_in[32], temp_out[32];
+    tran_high_t temp_in[32], temp_out[32];
      for (j = 0; j < 32; ++j)
        temp_in[j] = output[j + i * 32];
      fdct32(temp_in, temp_out, 0);
@@ -1401,13 +1411,13 @@ void vp9_fdct32x32_c(const int16_t *input, int16_t *out, int stride) {
  // Note that although we use dct_32_round in dct32 computation flow,
  // this 2d fdct32x32 for rate-distortion optimization loop is operating
  // within 16 bits precision.
-void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) {
+void vp9_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
    int i, j;
-  int output[32 * 32];
+  tran_high_t output[32 * 32];
  
    // Columns
    for (i = 0; i < 32; ++i) {
-    int temp_in[32], temp_out[32];
+    tran_high_t temp_in[32], temp_out[32];
      for (j = 0; j < 32; ++j)
        temp_in[j] = input[j * stride + i] * 4;
      fdct32(temp_in, temp_out, 0);
@@ -1420,7 +1430,7 @@ void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) {
  
    // Rows
    for (i = 0; i < 32; ++i) {
-    int temp_in[32], temp_out[32];
+    tran_high_t temp_in[32], temp_out[32];
      for (j = 0; j < 32; ++j)
        temp_in[j] = output[j + i * 32];
      fdct32(temp_in, temp_out, 1);
@@ -1428,3 +1438,61 @@ void vp9_fdct32x32_rd_c(const int16_t *input, int16_t *out, int stride) {
        out[j + i * 32] = temp_out[j];
    }
  }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_high_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
+  vp9_fdct4x4_c(input, output, stride);
+}
+
+void vp9_high_fht4x4_c(const int16_t *input, tran_low_t *output,
+                       int stride, int tx_type) {
+  vp9_fht4x4_c(input, output, stride, tx_type);
+}
+
+void vp9_high_fdct8x8_1_c(const int16_t *input, tran_low_t *final_output,
+                          int stride) {
+  vp9_fdct8x8_1_c(input, final_output, stride);
+}
+
+void vp9_high_fdct8x8_c(const int16_t *input, tran_low_t *final_output,
+                        int stride) {
+  vp9_fdct8x8_c(input, final_output, stride);
+}
+
+void vp9_high_fdct16x16_1_c(const int16_t *input, tran_low_t *output,
+                            int stride) {
+  vp9_fdct16x16_1_c(input, output, stride);
+}
+
+void vp9_high_fdct16x16_c(const int16_t *input, tran_low_t *output,
+                          int stride) {
+  vp9_fdct16x16_c(input, output, stride);
+}
+
+void vp9_high_fht8x8_c(const int16_t *input, tran_low_t *output,
+                  int stride, int tx_type) {
+  vp9_fht8x8_c(input, output, stride, tx_type);
+}
+
+void vp9_high_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride) {
+  vp9_fwht4x4_c(input, output, stride);
+}
+
+void vp9_high_fht16x16_c(const int16_t *input, tran_low_t *output,
+                    int stride, int tx_type) {
+  vp9_fht16x16_c(input, output, stride, tx_type);
+}
+
+void vp9_high_fdct32x32_1_c(const int16_t *input, tran_low_t *out, int stride) {
+  vp9_fdct32x32_1_c(input, out, stride);
+}
+
+void vp9_high_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
+  vp9_fdct32x32_c(input, out, stride);
+}
+
+void vp9_high_fdct32x32_rd_c(const int16_t *input, tran_low_t *out,
+                             int stride) {
+  vp9_fdct32x32_rd_c(input, out, stride);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c

index c4cf5ee..75b9449 100644 (file)
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -89,9 +89,9 @@ static VP9_DENOISER_DECISION denoiser_filter(const uint8_t *sig, int sig_stride,
    int total_adj = 0;
    int shift_inc = 1;
  
-  /* If motion_magnitude is small, making the denoiser more aggressive by
-   * increasing the adjustment for each level. Add another increment for
-   * blocks that are labeled for increase denoising. */
+  // If motion_magnitude is small, making the denoiser more aggressive by
+  // increasing the adjustment for each level. Add another increment for
+  // blocks that are labeled for increase denoising.
    if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD) {
      if (increase_denoising) {
        shift_inc = 2;
diff --git a/vp9/encoder/vp9_denoiser.h b/vp9/encoder/vp9_denoiser.h

index a913add..fa714b1 100644 (file)
--- a/vp9/encoder/vp9_denoiser.h
+++ b/vp9/encoder/vp9_denoiser.h
@@ -18,7 +18,7 @@
  extern "C" {
  #endif
  
-#define MOTION_MAGNITUDE_THRESHOLD (8*3)
+#define MOTION_MAGNITUDE_THRESHOLD (8 * 3)
  
  typedef enum vp9_denoiser_decision {
    COPY_BLOCK,
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c

index 6678450..794e6d0 100644 (file)
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -107,9 +107,9 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
    vp9_token_state tokens[1025][2];
    unsigned best_index[1025][2];
    uint8_t token_cache[1024];
-  const int16_t *const coeff = BLOCK_OFFSET(mb->plane[plane].coeff, block);
-  int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
-  int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  const tran_low_t *const coeff = BLOCK_OFFSET(mb->plane[plane].coeff, block);
+  tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
    const int eob = p->eobs[block];
    const PLANE_TYPE type = pd->plane_type;
    const int default_eob = 16 << (tx_size << 1);
@@ -294,22 +294,33 @@ static int optimize_b(MACROBLOCK *mb, int plane, int block,
  }
  
  static INLINE void fdct32x32(int rd_transform,
-                             const int16_t *src, int16_t *dst, int src_stride) {
+                             const int16_t *src, tran_low_t *dst,
+                             int src_stride) {
    if (rd_transform)
      vp9_fdct32x32_rd(src, dst, src_stride);
    else
      vp9_fdct32x32(src, dst, src_stride);
  }
  
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE void high_fdct32x32(int rd_transform, const int16_t *src,
+                                  tran_low_t *dst, int src_stride) {
+  if (rd_transform)
+    vp9_high_fdct32x32_rd(src, dst, src_stride);
+  else
+    vp9_high_fdct32x32(src, dst, src_stride);
+}
+#endif
+
  void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block,
                          BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
    MACROBLOCKD *const xd = &x->e_mbd;
    const struct macroblock_plane *const p = &x->plane[plane];
    const struct macroblockd_plane *const pd = &xd->plane[plane];
    const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];
-  int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
-  int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
-  int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+  tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
    uint16_t *const eob = &p->eobs[block];
    const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
    int i, j;
@@ -357,9 +368,9 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block,
    MACROBLOCKD *const xd = &x->e_mbd;
    const struct macroblock_plane *const p = &x->plane[plane];
    const struct macroblockd_plane *const pd = &xd->plane[plane];
-  int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
-  int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
-  int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+  tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
    uint16_t *const eob = &p->eobs[block];
    const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
    int i, j;
@@ -405,9 +416,9 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block,
    const struct macroblock_plane *const p = &x->plane[plane];
    const struct macroblockd_plane *const pd = &xd->plane[plane];
    const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];
-  int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
-  int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
-  int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+  tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
    uint16_t *const eob = &p->eobs[block];
    const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
    int i, j;
@@ -458,7 +469,7 @@ static void encode_block(int plane, int block, BLOCK_SIZE plane_bsize,
    struct optimize_ctx *const ctx = args->ctx;
    struct macroblock_plane *const p = &x->plane[plane];
    struct macroblockd_plane *const pd = &xd->plane[plane];
-  int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
    int i, j;
    uint8_t *dst;
    ENTROPY_CONTEXT *a, *l;
@@ -538,7 +549,7 @@ static void encode_block_pass1(int plane, int block, BLOCK_SIZE plane_bsize,
    MACROBLOCKD *const xd = &x->e_mbd;
    struct macroblock_plane *const p = &x->plane[plane];
    struct macroblockd_plane *const pd = &xd->plane[plane];
-  int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
    int i, j;
    uint8_t *dst;
    txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
@@ -587,9 +598,9 @@ static void encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
    MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
    struct macroblock_plane *const p = &x->plane[plane];
    struct macroblockd_plane *const pd = &xd->plane[plane];
-  int16_t *coeff = BLOCK_OFFSET(p->coeff, block);
-  int16_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
-  int16_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  tran_low_t *coeff = BLOCK_OFFSET(p->coeff, block);
+  tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
    const scan_order *scan_order;
    TX_TYPE tx_type;
    PREDICTION_MODE mode;
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c

index b6e606d..b3884d0 100644 (file)
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -556,6 +556,9 @@ static void init_config(struct VP9_COMP *cpi, VP9EncoderConfig *oxcf) {
  
    cm->profile = oxcf->profile;
    cm->bit_depth = oxcf->bit_depth;
+#if CONFIG_VP9_HIGHBITDEPTH
+  cm->use_highbitdepth = oxcf->use_highbitdepth;
+#endif
    cm->color_space = UNKNOWN;
  
    cm->width = oxcf->width;
@@ -613,6 +616,11 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
      assert(cm->bit_depth > VPX_BITS_8);
  
    cpi->oxcf = *oxcf;
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (cpi->oxcf.use_highbitdepth) {
+    cpi->mb.e_mbd.bd = (int)cm->bit_depth;
+  }
+#endif
  
    rc->baseline_gf_interval = DEFAULT_GF_INTERVAL;
  
@@ -1272,7 +1280,10 @@ static void generate_psnr_packet(VP9_COMP *cpi) {
      pkt.data.psnr.psnr[i] = psnr.psnr[i];
    }
    pkt.kind = VPX_CODEC_PSNR_PKT;
-  vpx_codec_pkt_list_add(cpi->output_pkt_list, &pkt);
+  if (is_two_pass_svc(cpi))
+    cpi->svc.layer_context[cpi->svc.spatial_layer_id].psnr_pkt = pkt.data.psnr;
+  else
+    vpx_codec_pkt_list_add(cpi->output_pkt_list, &pkt);
  }
  
  int vp9_use_as_reference(VP9_COMP *cpi, int ref_frame_flags) {
@@ -2768,7 +2779,16 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
    if (oxcf->pass == 1 &&
        (!cpi->use_svc || is_two_pass_svc(cpi))) {
      const int lossless = is_lossless_requested(oxcf);
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (cpi->oxcf.use_highbitdepth)
+      cpi->mb.fwd_txm4x4 = lossless ? vp9_high_fwht4x4 : vp9_high_fdct4x4;
+    else
+      cpi->mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vp9_fdct4x4;
+    cpi->mb.high_itxm_add = lossless ? vp9_high_iwht4x4_add :
+                                       vp9_high_idct4x4_add;
+#else
      cpi->mb.fwd_txm4x4 = lossless ? vp9_fwht4x4 : vp9_fdct4x4;
+#endif
      cpi->mb.itxm_add = lossless ? vp9_iwht4x4_add : vp9_idct4x4_add;
      vp9_first_pass(cpi, source);
    } else if (oxcf->pass == 2 &&
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h

index 0d3c4c1..80774de 100644 (file)
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -217,6 +217,9 @@ typedef struct VP9EncoderConfig {
  
    vp8e_tuning tuning;
    vp9e_tune_content content;
+#if CONFIG_VP9_HIGHBITDEPTH
+  int use_highbitdepth;
+#endif
  } VP9EncoderConfig;
  
  static INLINE int is_lossless_requested(const VP9EncoderConfig *cfg) {
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c

index 8041b59..54b57cf 100644 (file)
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -437,41 +437,51 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
    vp9_set_quantizer(cm, find_fp_qindex());
  
    if (lc != NULL) {
-    MV_REFERENCE_FRAME ref_frame = LAST_FRAME;
      twopass = &lc->twopass;
  
-    if (cpi->common.current_video_frame == 0) {
-      cpi->ref_frame_flags = 0;
+    cpi->lst_fb_idx = cpi->svc.spatial_layer_id;
+    cpi->ref_frame_flags = VP9_LAST_FLAG;
+
+    if (cpi->svc.number_spatial_layers + cpi->svc.spatial_layer_id <
+        REF_FRAMES) {
+      cpi->gld_fb_idx =
+          cpi->svc.number_spatial_layers + cpi->svc.spatial_layer_id;
+      cpi->ref_frame_flags |= VP9_GOLD_FLAG;
+      cpi->refresh_golden_frame = (lc->current_video_frame_in_layer == 0);
      } else {
-    if (lc->current_video_frame_in_layer <
-        (unsigned int)cpi->svc.number_temporal_layers)
-        cpi->ref_frame_flags = VP9_GOLD_FLAG;
-      else
-        cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
+      cpi->refresh_golden_frame = 0;
      }
  
+    if (lc->current_video_frame_in_layer == 0)
+      cpi->ref_frame_flags = 0;
+
      vp9_scale_references(cpi);
  
      // Use either last frame or alt frame for motion search.
      if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
        first_ref_buf = vp9_get_scaled_ref_frame(cpi, LAST_FRAME);
-      ref_frame = LAST_FRAME;
        if (first_ref_buf == NULL)
          first_ref_buf = get_ref_frame_buffer(cpi, LAST_FRAME);
-    } else if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
-      first_ref_buf = vp9_get_scaled_ref_frame(cpi, GOLDEN_FRAME);
-      ref_frame = GOLDEN_FRAME;
-      if (first_ref_buf == NULL)
-        first_ref_buf = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+    }
+
+    if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
+      const int ref_idx =
+          cm->ref_frame_map[get_ref_frame_idx(cpi, GOLDEN_FRAME)];
+      const int scaled_idx = cpi->scaled_ref_idx[GOLDEN_FRAME - 1];
+
+      gld_yv12 = (scaled_idx != ref_idx) ? &cm->frame_bufs[scaled_idx].buf :
+                 get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+    } else {
+      gld_yv12 = NULL;
      }
  
      recon_y_stride = new_yv12->y_stride;
      recon_uv_stride = new_yv12->uv_stride;
      uv_mb_height = 16 >> (new_yv12->y_height > new_yv12->uv_height);
  
-    // Disable golden frame for svc first pass for now.
-    gld_yv12 = NULL;
-    set_ref_ptrs(cm, xd, ref_frame, NONE);
+    set_ref_ptrs(cm, xd,
+                 (cpi->ref_frame_flags & VP9_LAST_FLAG) ? LAST_FRAME: NONE,
+                 (cpi->ref_frame_flags & VP9_GOLD_FLAG) ? GOLDEN_FRAME : NONE);
  
      cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source,
                                          &cpi->scaled_source);
@@ -581,7 +591,8 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
        x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + BORDER_MV_PIXELS_B16;
  
        // Other than for the first frame do a motion search.
-      if (cm->current_video_frame > 0) {
+      if ((lc == NULL && cm->current_video_frame > 0) ||
+          (lc != NULL && lc->current_video_frame_in_layer > 0)) {
          int tmp_err, motion_error, raw_motion_error;
          // Assume 0,0 motion with no mv overhead.
          MV mv = {0, 0} , tmp_mv = {0, 0};
@@ -628,7 +639,9 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
            }
  
            // Search in an older reference frame.
-          if (cm->current_video_frame > 1 && gld_yv12 != NULL) {
+          if (((lc == NULL && cm->current_video_frame > 1) ||
+               (lc != NULL && lc->current_video_frame_in_layer > 1))
+              && gld_yv12 != NULL) {
              // Assume 0,0 motion with no mv overhead.
              int gf_motion_error;
  
@@ -893,7 +906,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
  
    // Special case for the first frame. Copy into the GF buffer as a second
    // reference.
-  if (cm->current_video_frame == 0 && gld_yv12 != NULL) {
+  if (cm->current_video_frame == 0 && gld_yv12 != NULL && lc == NULL) {
      vp8_yv12_copy_frame(lst_yv12, gld_yv12);
    }
  
diff --git a/vp9/encoder/vp9_picklpf.c b/vp9/encoder/vp9_picklpf.c

index d365489..5557d7f 100644 (file)
--- a/vp9/encoder/vp9_picklpf.c
+++ b/vp9/encoder/vp9_picklpf.c
@@ -77,7 +77,6 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi,
    while (filter_step > 0) {
      const int filt_high = MIN(filt_mid + filter_step, max_filter_level);
      const int filt_low = MAX(filt_mid - filter_step, min_filter_level);
-    int filt_err;
  
      // Bias against raising loop filter in favor of lowering it.
      int bias = (best_err >> (15 - (filt_mid / 8))) * filter_step;
@@ -92,17 +91,14 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi,
      if (filt_direction <= 0 && filt_low != filt_mid) {
        // Get Low filter error score
        if (ss_err[filt_low] < 0) {
-        filt_err = try_filter_frame(sd, cpi, filt_low, partial_frame);
-        ss_err[filt_low] = filt_err;
-      } else {
-        filt_err = ss_err[filt_low];
+        ss_err[filt_low] = try_filter_frame(sd, cpi, filt_low, partial_frame);
        }
        // If value is close to the best so far then bias towards a lower loop
        // filter value.
-      if ((filt_err - bias) < best_err) {
+      if ((ss_err[filt_low] - bias) < best_err) {
          // Was it actually better than the previous best?
-        if (filt_err < best_err)
-          best_err = filt_err;
+        if (ss_err[filt_low] < best_err)
+          best_err = ss_err[filt_low];
  
          filt_best = filt_low;
        }
@@ -111,14 +107,11 @@ static int search_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi,
      // Now look at filt_high
      if (filt_direction >= 0 && filt_high != filt_mid) {
        if (ss_err[filt_high] < 0) {
-        filt_err = try_filter_frame(sd, cpi, filt_high, partial_frame);
-        ss_err[filt_high] = filt_err;
-      } else {
-        filt_err = ss_err[filt_high];
+        ss_err[filt_high] = try_filter_frame(sd, cpi, filt_high, partial_frame);
        }
        // Was it better than the previous best?
-      if (filt_err < (best_err - bias)) {
-        best_err = filt_err;
+      if (ss_err[filt_high] < (best_err - bias)) {
+        best_err = ss_err[filt_high];
          filt_best = filt_high;
        }
      }
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c

index eababdb..d49eb95 100644 (file)
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -19,9 +19,9 @@
  #include "vp9/encoder/vp9_quantize.h"
  #include "vp9/encoder/vp9_rd.h"
  
-void vp9_quantize_dc(const int16_t *coeff_ptr, int skip_block,
+void vp9_quantize_dc(const tran_low_t *coeff_ptr, int skip_block,
                       const int16_t *round_ptr, const int16_t quant,
-                     int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+                     tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                       const int16_t dequant_ptr, uint16_t *eob_ptr) {
    const int rc = 0;
    const int coeff = coeff_ptr[rc];
@@ -40,9 +40,9 @@ void vp9_quantize_dc(const int16_t *coeff_ptr, int skip_block,
    *eob_ptr = eob + 1;
  }
  
-void vp9_quantize_dc_32x32(const int16_t *coeff_ptr, int skip_block,
+void vp9_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
                             const int16_t *round_ptr, const int16_t quant,
-                           int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                             const int16_t dequant_ptr, uint16_t *eob_ptr) {
    const int rc = 0;
    const int coeff = coeff_ptr[rc];
@@ -62,11 +62,11 @@ void vp9_quantize_dc_32x32(const int16_t *coeff_ptr, int skip_block,
    *eob_ptr = eob + 1;
  }
  
-void vp9_quantize_fp_c(const int16_t *coeff_ptr, intptr_t count,
+void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                         int skip_block,
                         const int16_t *zbin_ptr, const int16_t *round_ptr,
                         const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
-                       int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+                       tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                         const int16_t *dequant_ptr,
                         int zbin_oq_value, uint16_t *eob_ptr,
                         const int16_t *scan, const int16_t *iscan) {
@@ -78,13 +78,13 @@ void vp9_quantize_fp_c(const int16_t *coeff_ptr, intptr_t count,
    (void)zbin_oq_value;
    (void)iscan;
  
-  vpx_memset(qcoeff_ptr, 0, count * sizeof(int16_t));
-  vpx_memset(dqcoeff_ptr, 0, count * sizeof(int16_t));
+  vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
  
    if (!skip_block) {
      // Quantization pass: All coefficients with index >= zero_flag are
      // skippable. Note: zero_flag can be zero.
-    for (i = 0; i < count; i++) {
+    for (i = 0; i < n_coeffs; i++) {
        const int rc = scan[i];
        const int coeff = coeff_ptr[rc];
        const int coeff_sign = (coeff >> 31);
@@ -105,12 +105,12 @@ void vp9_quantize_fp_c(const int16_t *coeff_ptr, intptr_t count,
  
  // TODO(jingning) Refactor this file and combine functions with similar
  // operations.
-void vp9_quantize_fp_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs,
+void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                               int skip_block,
                               const int16_t *zbin_ptr, const int16_t *round_ptr,
                               const int16_t *quant_ptr,
                               const int16_t *quant_shift_ptr,
-                             int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                               const int16_t *dequant_ptr,
                               int zbin_oq_value, uint16_t *eob_ptr,
                               const int16_t *scan, const int16_t *iscan) {
@@ -120,8 +120,8 @@ void vp9_quantize_fp_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs,
    (void)zbin_oq_value;
    (void)iscan;
  
-  vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(int16_t));
-  vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(int16_t));
+  vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
  
    if (!skip_block) {
      for (i = 0; i < n_coeffs; i++) {
@@ -146,27 +146,27 @@ void vp9_quantize_fp_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs,
    *eob_ptr = eob + 1;
  }
  
-void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t count,
+void vp9_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                        int skip_block,
                        const int16_t *zbin_ptr, const int16_t *round_ptr,
                        const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
-                      int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+                      tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                        const int16_t *dequant_ptr,
                        int zbin_oq_value, uint16_t *eob_ptr,
                        const int16_t *scan, const int16_t *iscan) {
-  int i, non_zero_count = (int)count, eob = -1;
+  int i, non_zero_count = (int)n_coeffs, eob = -1;
    const int zbins[2] = { zbin_ptr[0] + zbin_oq_value,
                           zbin_ptr[1] + zbin_oq_value };
    const int nzbins[2] = { zbins[0] * -1,
                            zbins[1] * -1 };
    (void)iscan;
  
-  vpx_memset(qcoeff_ptr, 0, count * sizeof(int16_t));
-  vpx_memset(dqcoeff_ptr, 0, count * sizeof(int16_t));
+  vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
  
    if (!skip_block) {
      // Pre-scan pass
-    for (i = (int)count - 1; i >= 0; i--) {
+    for (i = (int)n_coeffs - 1; i >= 0; i--) {
        const int rc = scan[i];
        const int coeff = coeff_ptr[rc];
  
@@ -199,12 +199,12 @@ void vp9_quantize_b_c(const int16_t *coeff_ptr, intptr_t count,
    *eob_ptr = eob + 1;
  }
  
-void vp9_quantize_b_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs,
+void vp9_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                              int skip_block,
                              const int16_t *zbin_ptr, const int16_t *round_ptr,
                              const int16_t *quant_ptr,
                              const int16_t *quant_shift_ptr,
-                            int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                              const int16_t *dequant_ptr,
                              int zbin_oq_value, uint16_t *eob_ptr,
                              const int16_t *scan, const int16_t *iscan) {
@@ -217,8 +217,8 @@ void vp9_quantize_b_32x32_c(const int16_t *coeff_ptr, intptr_t n_coeffs,
    int i, eob = -1;
    (void)iscan;
  
-  vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(int16_t));
-  vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(int16_t));
+  vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
  
    if (!skip_block) {
      // Pre-scan pass
@@ -280,13 +280,19 @@ static void invert_quant(int16_t *quant, int16_t *shift, int d) {
    *shift = 1 << (16 - l);
  }
  
+static int get_qzbin_factor(int q, vpx_bit_depth_t bit_depth) {
+  int quant = vp9_dc_quant(q, 0);
+  (void) bit_depth;
+  return q == 0 ? 64 : (quant < 148 ? 84 : 80);
+}
+
  void vp9_init_quantizer(VP9_COMP *cpi) {
    VP9_COMMON *const cm = &cpi->common;
    QUANTS *const quants = &cpi->quants;
    int i, q, quant;
  
    for (q = 0; q < QINDEX_RANGE; q++) {
-    const int qzbin_factor = q == 0 ? 64 : (vp9_dc_quant(q, 0) < 148 ? 84 : 80);
+    const int qzbin_factor = get_qzbin_factor(q, cm->bit_depth);
      const int qrounding_factor = q == 0 ? 64 : 48;
  
      for (i = 0; i < 2; ++i) {
diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h

index 262529b..d7edb0b 100644 (file)
--- a/vp9/encoder/vp9_quantize.h
+++ b/vp9/encoder/vp9_quantize.h
@@ -37,17 +37,29 @@ typedef struct {
    DECLARE_ALIGNED(16, int16_t, uv_round[QINDEX_RANGE][8]);
  } QUANTS;
  
-void vp9_quantize_dc(const int16_t *coeff_ptr, int skip_block,
+void vp9_quantize_dc(const tran_low_t *coeff_ptr, int skip_block,
                       const int16_t *round_ptr, const int16_t quant_ptr,
-                     int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+                     tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                       const int16_t dequant_ptr, uint16_t *eob_ptr);
-void vp9_quantize_dc_32x32(const int16_t *coeff_ptr, int skip_block,
+void vp9_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
                             const int16_t *round_ptr, const int16_t quant_ptr,
-                           int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                             const int16_t dequant_ptr, uint16_t *eob_ptr);
  void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
                                  const int16_t *scan, const int16_t *iscan);
  
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_high_quantize_dc(const tran_low_t *coeff_ptr, int skip_block,
+                          const int16_t *round_ptr, const int16_t quant_ptr,
+                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                          const int16_t dequant_ptr, uint16_t *eob_ptr);
+void vp9_high_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
+                                const int16_t *round_ptr,
+                                const int16_t quant_ptr, tran_low_t *qcoeff_ptr,
+                                tran_low_t *dqcoeff_ptr,
+                                const int16_t dequant_ptr, uint16_t *eob_ptr);
+#endif
+
  struct VP9_COMP;
  struct VP9Common;
  
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c

index 22505fb..9df85de 100644 (file)
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -41,9 +41,14 @@
  #define RD_THRESH_MAX_FACT 64
  #define RD_THRESH_INC      1
  
-#define LAST_FRAME_MODE_MASK    0xFFEDCD60
-#define GOLDEN_FRAME_MODE_MASK  0xFFDA3BB0
-#define ALT_REF_MODE_MASK       0xFFC648D0
+#define LAST_FRAME_MODE_MASK    ((1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME) | \
+                                 (1 << INTRA_FRAME))
+#define GOLDEN_FRAME_MODE_MASK  ((1 << LAST_FRAME) | (1 << ALTREF_FRAME) | \
+                                 (1 << INTRA_FRAME))
+#define ALT_REF_MODE_MASK       ((1 << LAST_FRAME) | (1 << GOLDEN_FRAME) | \
+                                 (1 << INTRA_FRAME))
+
+#define SECOND_REF_FRAME_MASK   ((1 << ALTREF_FRAME) | 0x01)
  
  #define MIN_EARLY_TERM_INDEX    3
  
@@ -244,7 +249,7 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize,
    *out_dist_sum = dist_sum << 4;
  }
  
-int64_t vp9_block_error_c(const int16_t *coeff, const int16_t *dqcoeff,
+int64_t vp9_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
                            intptr_t block_size, int64_t *ssz) {
    int i;
    int64_t error = 0, sqcoeff = 0;
@@ -283,7 +288,7 @@ static INLINE int cost_coeffs(MACROBLOCK *x,
    const PLANE_TYPE type = pd->plane_type;
    const int16_t *band_count = &band_counts[tx_size][1];
    const int eob = p->eobs[block];
-  const int16_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  const tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
    unsigned int (*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
                     x->token_costs[tx_size][type][is_inter_block(mbmi)];
    uint8_t token_cache[32 * 32];
@@ -353,8 +358,8 @@ static void dist_block(int plane, int block, TX_SIZE tx_size,
    const struct macroblockd_plane *const pd = &xd->plane[plane];
    int64_t this_sse;
    int shift = tx_size == TX_32X32 ? 0 : 2;
-  int16_t *const coeff = BLOCK_OFFSET(p->coeff, block);
-  int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
    args->dist = vp9_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
                                 &this_sse) >> shift;
    args->sse  = this_sse >> shift;
@@ -400,8 +405,8 @@ static void block_rd_txfm(int plane, int block, BLOCK_SIZE plane_bsize,
        dist_block(plane, block, tx_size, args);
      } else if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] == 2) {
        // compute DC coefficient
-      int16_t *const coeff   = BLOCK_OFFSET(x->plane[plane].coeff, block);
-      int16_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block);
+      tran_low_t *const coeff   = BLOCK_OFFSET(x->plane[plane].coeff, block);
+      tran_low_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block);
        vp9_xform_quant_dc(x, plane, block, plane_bsize, tx_size);
        args->sse  = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4;
        args->dist = args->sse;
@@ -685,7 +690,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
          uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
          int16_t *const src_diff = raster_block_offset_int16(BLOCK_8X8, block,
                                                              p->src_diff);
-        int16_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
+        tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
          xd->mi[0]->bmi[block].as_mode = mode;
          vp9_predict_intra_block(xd, block, 1,
                                  TX_4X4, mode,
@@ -1132,7 +1137,7 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi,
    for (idy = 0; idy < height / 4; ++idy) {
      for (idx = 0; idx < width / 4; ++idx) {
        int64_t ssz, rd, rd1, rd2;
-      int16_t* coeff;
+      tran_low_t* coeff;
  
        k += (idy * 2 + idx);
        coeff = BLOCK_OFFSET(p->coeff, k);
@@ -1224,11 +1229,9 @@ static INLINE int mv_has_subpel(const MV *mv) {
  // TODO(aconverse): Find out if this is still productive then clean up or remove
  static int check_best_zero_mv(
      const VP9_COMP *cpi, const uint8_t mode_context[MAX_REF_FRAMES],
-    int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
-    int inter_mode_mask, int this_mode,
+    int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES], int this_mode,
      const MV_REFERENCE_FRAME ref_frames[2]) {
-  if ((inter_mode_mask & (1 << ZEROMV)) &&
-      (this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) &&
+  if ((this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) &&
        frame_mv[this_mode][ref_frames[0]].as_int == 0 &&
        (ref_frames[1] == NONE ||
         frame_mv[this_mode][ref_frames[1]].as_int == 0)) {
@@ -1346,7 +1349,6 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
            continue;
  
          if (!check_best_zero_mv(cpi, mbmi->mode_context, frame_mv,
-                                inter_mode_mask,
                                  this_mode, mbmi->ref_frame))
            continue;
  
@@ -2575,6 +2577,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
    vp9_prob comp_mode_p;
    int64_t best_intra_rd = INT64_MAX;
    int64_t best_inter_rd = INT64_MAX;
+  unsigned int best_pred_sse = UINT_MAX;
    PREDICTION_MODE best_intra_mode = DC_PRED;
    MV_REFERENCE_FRAME best_inter_ref_frame = LAST_FRAME;
    int rate_uv_intra[TX_SIZES], rate_uv_tokenonly[TX_SIZES];
@@ -2583,14 +2586,12 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
    PREDICTION_MODE mode_uv[TX_SIZES];
    int intra_cost_penalty = 20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q);
    int best_skip2 = 0;
-  int mode_skip_mask = 0;
+  uint8_t ref_frame_skip_mask[2] = { 0 };
+  uint16_t mode_skip_mask[MAX_REF_FRAMES] = { 0 };
    int mode_skip_start = cpi->sf.mode_skip_start + 1;
    const int *const rd_threshes = rd_opt->threshes[segment_id][bsize];
    const int *const rd_thresh_freq_fact = rd_opt->thresh_freq_fact[bsize];
    const int mode_search_skip_flags = cpi->sf.mode_search_skip_flags;
-  const int intra_y_mode_mask =
-      cpi->sf.intra_y_mode_mask[max_txsize_lookup[bsize]];
-  int inter_mode_mask = cpi->sf.inter_mode_mask[bsize];
    vp9_zero(best_mbmode);
    x->skip_encode = cpi->sf.skip_encode_frame && x->q_index < QIDX_SKIP_THRESH;
  
@@ -2627,23 +2628,17 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
    }
  
    for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
-    // All modes from vp9_mode_order that use this frame as any ref
-    static const int ref_frame_mask_all[] = {
-        0x0, 0x123291, 0x25c444, 0x39b722
-    };
-    // Fixed mv modes (NEARESTMV, NEARMV, ZEROMV) from vp9_mode_order that use
-    // this frame as their primary ref
-    static const int ref_frame_mask_fixedmv[] = {
-        0x0, 0x121281, 0x24c404, 0x080102
-    };
      if (!(cpi->ref_frame_flags & flag_list[ref_frame])) {
-      // Skip modes for missing references
-      mode_skip_mask |= ref_frame_mask_all[ref_frame];
+      // Skip checking missing references in both single and compound reference
+      // modes. Note that a mode will be skipped iff both reference frames
+      // are masked out.
+      ref_frame_skip_mask[0] |= (1 << ref_frame);
+      ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
      } else if (cpi->sf.reference_masking) {
        for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
          // Skip fixed mv modes for poor references
          if ((x->pred_mv_sad[ref_frame] >> 2) > x->pred_mv_sad[i]) {
-          mode_skip_mask |= ref_frame_mask_fixedmv[ref_frame];
+          mode_skip_mask[ref_frame] |= INTER_NEAREST_NEAR_ZERO;
            break;
          }
        }
@@ -2652,7 +2647,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
      // then do nothing if the current ref frame is not allowed..
      if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
          vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
-      mode_skip_mask |= ref_frame_mask_all[ref_frame];
+      ref_frame_skip_mask[0] |= (1 << ref_frame);
+      ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
      }
    }
  
@@ -2665,15 +2661,32 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
      // an unfiltered alternative. We allow near/nearest as well
      // because they may result in zero-zero MVs but be cheaper.
      if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
-      mode_skip_mask =
-          ~((1 << THR_NEARESTA) | (1 << THR_NEARA) | (1 << THR_ZEROA));
+      ref_frame_skip_mask[0] = (1 << LAST_FRAME) | (1 << GOLDEN_FRAME);
+      ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK;
+      mode_skip_mask[ALTREF_FRAME] = ~INTER_NEAREST_NEAR_ZERO;
        if (frame_mv[NEARMV][ALTREF_FRAME].as_int != 0)
-        mode_skip_mask |= (1 << THR_NEARA);
+        mode_skip_mask[ALTREF_FRAME] |= (1 << NEARMV);
        if (frame_mv[NEARESTMV][ALTREF_FRAME].as_int != 0)
-        mode_skip_mask |= (1 << THR_NEARESTA);
+        mode_skip_mask[ALTREF_FRAME] |= (1 << NEARESTMV);
+    }
+  }
+
+  if (cpi->rc.is_src_frame_alt_ref) {
+    if (cpi->sf.alt_ref_search_fp) {
+      mode_skip_mask[ALTREF_FRAME] = 0;
+      ref_frame_skip_mask[0] = ~(1 << ALTREF_FRAME);
+      ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK;
      }
    }
  
+  if (bsize > cpi->sf.max_intra_bsize) {
+    ref_frame_skip_mask[0] |= (1 << INTRA_FRAME);
+    ref_frame_skip_mask[1] |= (1 << INTRA_FRAME);
+  }
+
+  mode_skip_mask[INTRA_FRAME] |=
+      ~(cpi->sf.intra_y_mode_mask[max_txsize_lookup[bsize]]);
+
    for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
      int mode_excluded = 0;
      int64_t this_rd = INT64_MAX;
@@ -2690,8 +2703,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
  
      this_mode = vp9_mode_order[mode_index].mode;
      ref_frame = vp9_mode_order[mode_index].ref_frame[0];
-    if (ref_frame != INTRA_FRAME && !(inter_mode_mask & (1 << this_mode)))
-      continue;
      second_ref_frame = vp9_mode_order[mode_index].ref_frame[1];
  
      // Look at the reference frame of the best mode so far and set the
@@ -2701,13 +2712,15 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
          case INTRA_FRAME:
            break;
          case LAST_FRAME:
-          mode_skip_mask |= LAST_FRAME_MODE_MASK;
+          ref_frame_skip_mask[0] |= LAST_FRAME_MODE_MASK;
+          ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
            break;
          case GOLDEN_FRAME:
-          mode_skip_mask |= GOLDEN_FRAME_MODE_MASK;
+          ref_frame_skip_mask[0] |= GOLDEN_FRAME_MODE_MASK;
+          ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
            break;
          case ALTREF_FRAME:
-          mode_skip_mask |= ALT_REF_MODE_MASK;
+          ref_frame_skip_mask[0] |= ALT_REF_MODE_MASK;
            break;
          case NONE:
          case MAX_REF_FRAMES:
@@ -2716,17 +2729,11 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
        }
      }
  
-    if (cpi->sf.alt_ref_search_fp && cpi->rc.is_src_frame_alt_ref) {
-      mode_skip_mask = 0;
-      if (!(ref_frame == ALTREF_FRAME && second_ref_frame == NONE))
-        continue;
-    }
-
-    if (bsize > cpi->sf.max_intra_bsize)
-      if (ref_frame == INTRA_FRAME)
-        continue;
+    if (ref_frame_skip_mask[0] & (1 << ref_frame) &&
+        ref_frame_skip_mask[1] & (1 << MAX(0, second_ref_frame)))
+      continue;
  
-    if (mode_skip_mask & (1 << mode_index))
+    if (mode_skip_mask[ref_frame] & (1 << this_mode))
        continue;
  
      // Test best rd so far against threshold for trying this mode.
@@ -2804,11 +2811,9 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
  
      if (ref_frame == INTRA_FRAME) {
        if (cpi->sf.adaptive_mode_search)
-        if ((x->source_variance << num_pels_log2_lookup[bsize]) > best_intra_rd)
+        if ((x->source_variance << num_pels_log2_lookup[bsize]) > best_pred_sse)
            continue;
  
-      if (!(intra_y_mode_mask & (1 << this_mode)))
-        continue;
        if (this_mode != DC_PRED) {
          // Disable intra modes other than DC_PRED for blocks with low variance
          // Threshold for intra skipping based on source variance
@@ -2833,7 +2838,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
      } else {
        const MV_REFERENCE_FRAME ref_frames[2] = {ref_frame, second_ref_frame};
        if (!check_best_zero_mv(cpi, mbmi->mode_context, frame_mv,
-                              inter_mode_mask, this_mode, ref_frames))
+                              this_mode, ref_frames))
          continue;
      }
  
@@ -2983,7 +2988,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
            mbmi->mv[0].as_int = 0;
            max_plane = 1;
          } else {
-          best_intra_rd = x->pred_sse[ref_frame];
+          best_pred_sse = x->pred_sse[ref_frame];
          }
  
          *returnrate = rate2;
@@ -3170,7 +3175,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
      vp9_zero(best_tx_diff);
    }
  
-  set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
    store_coding_context(x, ctx, best_mode_index, best_pred_diff,
                         best_tx_diff, best_filter_diff, best_mode_skippable);
  
@@ -3450,14 +3454,12 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
      // If the segment reference frame feature is enabled....
      // then do nothing if the current ref frame is not allowed..
      if (vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME) &&
-        vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) !=
-            (int)ref_frame) {
+        vp9_get_segdata(seg, segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame) {
        continue;
      // Disable this drop out case if the ref frame
      // segment level feature is enabled for this segment. This is to
      // prevent the possibility that we end up unable to pick any mode.
-    } else if (!vp9_segfeature_active(seg, segment_id,
-                                      SEG_LVL_REF_FRAME)) {
+    } else if (!vp9_segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) {
        // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
        // unless ARNR filtering is enabled in which case we want
        // an unfiltered alternative. We allow near/nearest as well
@@ -3883,7 +3885,6 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x,
      vp9_zero(best_filter_diff);
    }
  
-  set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
    store_coding_context(x, ctx, best_ref_index,
                         best_pred_diff, best_tx_diff, best_filter_diff, 0);
  
diff --git a/vp9/encoder/vp9_sad.c b/vp9/encoder/vp9_sad.c

index d062636..cee6ce1 100644 (file)
--- a/vp9/encoder/vp9_sad.c
+++ b/vp9/encoder/vp9_sad.c
@@ -14,6 +14,9 @@
  #include "./vpx_config.h"
  
  #include "vpx/vpx_integer.h"
+#if CONFIG_VP9_HIGHBITDEPTH
+#include "vp9/common/vp9_common.h"
+#endif
  #include "vp9/encoder/vp9_variance.h"
  
  static INLINE unsigned int sad(const uint8_t *a, int a_stride,
@@ -131,3 +134,138 @@ sadMxN(4, 4)
  sadMxNxK(4, 4, 3)
  sadMxNxK(4, 4, 8)
  sadMxNx4D(4, 4)
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE unsigned int high_sad(const uint8_t *a8, int a_stride,
+                                    const uint8_t *b8, int b_stride,
+                                    int width, int height) {
+  int y, x;
+  unsigned int sad = 0;
+  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  for (y = 0; y < height; y++) {
+    for (x = 0; x < width; x++)
+      sad += abs(a[x] - b[x]);
+
+    a += a_stride;
+    b += b_stride;
+  }
+  return sad;
+}
+
+static INLINE unsigned int high_sadb(const uint8_t *a8, int a_stride,
+                                     const uint16_t *b, int b_stride,
+                                     int width, int height) {
+  int y, x;
+  unsigned int sad = 0;
+  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  for (y = 0; y < height; y++) {
+    for (x = 0; x < width; x++)
+      sad += abs(a[x] - b[x]);
+
+    a += a_stride;
+    b += b_stride;
+  }
+  return sad;
+}
+
+#define high_sadMxN(m, n) \
+unsigned int vp9_high_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
+                                       const uint8_t *ref, int ref_stride) { \
+  return high_sad(src, src_stride, ref, ref_stride, m, n); \
+} \
+unsigned int vp9_high_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \
+                                           const uint8_t *ref, int ref_stride, \
+                                           const uint8_t *second_pred) { \
+  uint16_t comp_pred[m * n]; \
+  vp9_high_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \
+  return high_sadb(src, src_stride, comp_pred, m, m, n); \
+}
+
+#define high_sadMxNxK(m, n, k) \
+void vp9_high_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \
+                                     const uint8_t *ref, int ref_stride, \
+                                     unsigned int *sads) { \
+  int i; \
+  for (i = 0; i < k; ++i) \
+    sads[i] = vp9_high_sad##m##x##n##_c(src, src_stride, &ref[i], ref_stride); \
+}
+
+#define high_sadMxNx4D(m, n) \
+void vp9_high_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
+                                  const uint8_t *const refs[], \
+                                  int ref_stride, unsigned int *sads) { \
+  int i; \
+  for (i = 0; i < 4; ++i) \
+    sads[i] = vp9_high_sad##m##x##n##_c(src, src_stride, refs[i], ref_stride); \
+}
+
+// 64x64
+high_sadMxN(64, 64)
+high_sadMxNxK(64, 64, 3)
+high_sadMxNxK(64, 64, 8)
+high_sadMxNx4D(64, 64)
+
+// 64x32
+high_sadMxN(64, 32)
+high_sadMxNx4D(64, 32)
+
+// 32x64
+high_sadMxN(32, 64)
+high_sadMxNx4D(32, 64)
+
+// 32x32
+high_sadMxN(32, 32)
+high_sadMxNxK(32, 32, 3)
+high_sadMxNxK(32, 32, 8)
+high_sadMxNx4D(32, 32)
+
+// 32x16
+high_sadMxN(32, 16)
+high_sadMxNx4D(32, 16)
+
+// 16x32
+high_sadMxN(16, 32)
+high_sadMxNx4D(16, 32)
+
+// 16x16
+high_sadMxN(16, 16)
+high_sadMxNxK(16, 16, 3)
+high_sadMxNxK(16, 16, 8)
+high_sadMxNx4D(16, 16)
+
+// 16x8
+high_sadMxN(16, 8)
+high_sadMxNxK(16, 8, 3)
+high_sadMxNxK(16, 8, 8)
+high_sadMxNx4D(16, 8)
+
+// 8x16
+high_sadMxN(8, 16)
+high_sadMxNxK(8, 16, 3)
+high_sadMxNxK(8, 16, 8)
+high_sadMxNx4D(8, 16)
+
+// 8x8
+high_sadMxN(8, 8)
+high_sadMxNxK(8, 8, 3)
+high_sadMxNxK(8, 8, 8)
+high_sadMxNx4D(8, 8)
+
+// 8x4
+high_sadMxN(8, 4)
+high_sadMxNxK(8, 4, 8)
+high_sadMxNx4D(8, 4)
+
+// 4x8
+high_sadMxN(4, 8)
+high_sadMxNxK(4, 8, 8)
+high_sadMxNx4D(4, 8)
+
+// 4x4
+high_sadMxN(4, 4)
+high_sadMxNxK(4, 4, 3)
+high_sadMxNxK(4, 4, 8)
+high_sadMxNx4D(4, 4)
+
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c

index 0dcbb08..889a8be 100644 (file)
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -13,43 +13,6 @@
  #include "vp9/encoder/vp9_encoder.h"
  #include "vp9/encoder/vp9_speed_features.h"
  
-enum {
-  INTRA_ALL       = (1 << DC_PRED) |
-                    (1 << V_PRED) | (1 << H_PRED) |
-                    (1 << D45_PRED) | (1 << D135_PRED) |
-                    (1 << D117_PRED) | (1 << D153_PRED) |
-                    (1 << D207_PRED) | (1 << D63_PRED) |
-                    (1 << TM_PRED),
-  INTRA_DC        = (1 << DC_PRED),
-  INTRA_DC_TM     = (1 << DC_PRED) | (1 << TM_PRED),
-  INTRA_DC_H_V    = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED),
-  INTRA_DC_TM_H_V = (1 << DC_PRED) | (1 << TM_PRED) | (1 << V_PRED) |
-                    (1 << H_PRED)
-};
-
-enum {
-  INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) | (1 << NEWMV),
-  INTER_NEAREST = (1 << NEARESTMV),
-  INTER_NEAREST_NEAR_NEW = (1 << NEARESTMV) | (1 << NEARMV) | (1 << NEWMV)
-};
-
-enum {
-  DISABLE_ALL_INTER_SPLIT   = (1 << THR_COMP_GA) |
-                              (1 << THR_COMP_LA) |
-                              (1 << THR_ALTR) |
-                              (1 << THR_GOLD) |
-                              (1 << THR_LAST),
-
-  DISABLE_ALL_SPLIT         = (1 << THR_INTRA) | DISABLE_ALL_INTER_SPLIT,
-
-  DISABLE_COMPOUND_SPLIT    = (1 << THR_COMP_GA) | (1 << THR_COMP_LA),
-
-  LAST_AND_INTRA_SPLIT_ONLY = (1 << THR_COMP_GA) |
-                              (1 << THR_COMP_LA) |
-                              (1 << THR_ALTR) |
-                              (1 << THR_GOLD)
-};
-
  // Intra only frames, golden frames (except alt ref overlays) and
  // alt ref frames tend to be coded at a higher than ambient quality
  static int frame_is_boosted(const VP9_COMP *cpi) {
diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h

index 33c441f..0f49154 100644 (file)
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -17,6 +17,44 @@
  extern "C" {
  #endif
  
+enum {
+  INTRA_ALL       = (1 << DC_PRED) |
+                    (1 << V_PRED) | (1 << H_PRED) |
+                    (1 << D45_PRED) | (1 << D135_PRED) |
+                    (1 << D117_PRED) | (1 << D153_PRED) |
+                    (1 << D207_PRED) | (1 << D63_PRED) |
+                    (1 << TM_PRED),
+  INTRA_DC        = (1 << DC_PRED),
+  INTRA_DC_TM     = (1 << DC_PRED) | (1 << TM_PRED),
+  INTRA_DC_H_V    = (1 << DC_PRED) | (1 << V_PRED) | (1 << H_PRED),
+  INTRA_DC_TM_H_V = (1 << DC_PRED) | (1 << TM_PRED) | (1 << V_PRED) |
+                    (1 << H_PRED)
+};
+
+enum {
+  INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) | (1 << NEWMV),
+  INTER_NEAREST = (1 << NEARESTMV),
+  INTER_NEAREST_NEAR_NEW = (1 << NEARESTMV) | (1 << NEARMV) | (1 << NEWMV),
+  INTER_NEAREST_NEAR_ZERO = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV),
+};
+
+enum {
+  DISABLE_ALL_INTER_SPLIT   = (1 << THR_COMP_GA) |
+                              (1 << THR_COMP_LA) |
+                              (1 << THR_ALTR) |
+                              (1 << THR_GOLD) |
+                              (1 << THR_LAST),
+
+  DISABLE_ALL_SPLIT         = (1 << THR_INTRA) | DISABLE_ALL_INTER_SPLIT,
+
+  DISABLE_COMPOUND_SPLIT    = (1 << THR_COMP_GA) | (1 << THR_COMP_LA),
+
+  LAST_AND_INTRA_SPLIT_ONLY = (1 << THR_COMP_GA) |
+                              (1 << THR_COMP_LA) |
+                              (1 << THR_ALTR) |
+                              (1 << THR_GOLD)
+};
+
  typedef enum {
    DIAMOND = 0,
    NSTEP = 1,
diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h

index 1fc43a4..d180d1a 100644 (file)
--- a/vp9/encoder/vp9_svc_layercontext.h
+++ b/vp9/encoder/vp9_svc_layercontext.h
@@ -36,6 +36,7 @@ typedef struct {
    int gold_ref_idx;
    int has_alt_frame;
    size_t layer_size;
+  struct vpx_psnr_pkt psnr_pkt;
  } LAYER_CONTEXT;
  
  typedef struct {
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c

index 6068b85..263883f 100644 (file)
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -212,7 +212,7 @@ static void tokenize_b(int plane, int block, BLOCK_SIZE plane_bsize,
    TOKENEXTRA *t = *tp;        /* store tokens starting here */
    int eob = p->eobs[block];
    const PLANE_TYPE type = pd->plane_type;
-  const int16_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
    const int segment_id = mbmi->segment_id;
    const int16_t *scan, *nb;
    const scan_order *so;
diff --git a/vp9/encoder/vp9_variance.c b/vp9/encoder/vp9_variance.c

index afbb191..c97f93f 100644 (file)
--- a/vp9/encoder/vp9_variance.c
+++ b/vp9/encoder/vp9_variance.c
@@ -267,3 +267,375 @@ void vp9_comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
      ref += ref_stride;
    }
  }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void high_variance64(const uint8_t *a8, int  a_stride,
+                     const uint8_t *b8, int  b_stride,
+                     int w, int h, uint64_t *sse,
+                     uint64_t *sum) {
+  int i, j;
+
+  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  *sum = 0;
+  *sse = 0;
+
+  for (i = 0; i < h; i++) {
+    for (j = 0; j < w; j++) {
+      const int diff = a[j] - b[j];
+      *sum += diff;
+      *sse += diff * diff;
+    }
+    a += a_stride;
+    b += b_stride;
+  }
+}
+
+void high_variance(const uint8_t *a8, int  a_stride,
+                   const uint8_t *b8, int  b_stride,
+                   int w, int h, unsigned int *sse,
+                   int *sum) {
+  uint64_t sse_long = 0;
+  uint64_t sum_long = 0;
+  high_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
+  *sse = sse_long;
+  *sum = sum_long;
+}
+
+void high_10_variance(const uint8_t *a8, int  a_stride,
+                      const uint8_t *b8, int  b_stride,
+                      int w, int h, unsigned int *sse,
+                      int *sum) {
+  uint64_t sse_long = 0;
+  uint64_t sum_long = 0;
+  high_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
+  *sum = ROUND_POWER_OF_TWO(sum_long, 2);
+  *sse = ROUND_POWER_OF_TWO(sse_long, 4);
+}
+
+void high_12_variance(const uint8_t *a8, int  a_stride,
+                      const uint8_t *b8, int  b_stride,
+                      int w, int h, unsigned int *sse,
+                      int *sum) {
+  uint64_t sse_long = 0;
+  uint64_t sum_long = 0;
+  high_variance64(a8, a_stride, b8, b_stride, w, h, &sse_long, &sum_long);
+  *sum = ROUND_POWER_OF_TWO(sum_long, 4);
+  *sse = ROUND_POWER_OF_TWO(sse_long, 8);
+}
+
+static void high_var_filter_block2d_bil_first_pass(
+    const uint8_t *src_ptr8,
+    uint16_t *output_ptr,
+    unsigned int src_pixels_per_line,
+    int pixel_step,
+    unsigned int output_height,
+    unsigned int output_width,
+    const int16_t *vp9_filter) {
+  unsigned int i, j;
+  uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src_ptr8);
+  for (i = 0; i < output_height; i++) {
+    for (j = 0; j < output_width; j++) {
+      output_ptr[j] =
+          ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
+                             (int)src_ptr[pixel_step] * vp9_filter[1],
+                             FILTER_BITS);
+
+      src_ptr++;
+    }
+
+    // Next row...
+    src_ptr += src_pixels_per_line - output_width;
+    output_ptr += output_width;
+  }
+}
+
+static void high_var_filter_block2d_bil_second_pass(
+    const uint16_t *src_ptr,
+    uint16_t *output_ptr,
+    unsigned int src_pixels_per_line,
+    unsigned int pixel_step,
+    unsigned int output_height,
+    unsigned int output_width,
+    const int16_t *vp9_filter) {
+  unsigned int  i, j;
+
+  for (i = 0; i < output_height; i++) {
+    for (j = 0; j < output_width; j++) {
+      output_ptr[j] =
+          ROUND_POWER_OF_TWO((int)src_ptr[0] * vp9_filter[0] +
+                             (int)src_ptr[pixel_step] * vp9_filter[1],
+                             FILTER_BITS);
+      src_ptr++;
+    }
+
+    src_ptr += src_pixels_per_line - output_width;
+    output_ptr += output_width;
+  }
+}
+
+#define HIGH_VAR(W, H) \
+unsigned int vp9_high_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+                                            const uint8_t *b, int b_stride, \
+                                            unsigned int *sse) { \
+  int sum; \
+  high_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+  return *sse - (((int64_t)sum * sum) / (W * H)); \
+} \
+\
+unsigned int vp9_high_10_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+                                               const uint8_t *b, int b_stride, \
+                                                unsigned int *sse) { \
+  int sum; \
+  high_10_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+  return *sse - (((int64_t)sum * sum) / (W * H)); \
+} \
+\
+unsigned int vp9_high_12_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+                                               const uint8_t *b, int b_stride, \
+                                               unsigned int *sse) { \
+  int sum; \
+  high_12_variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+  return *sse - (((int64_t)sum * sum) / (W * H)); \
+}
+
+#define HIGH_SUBPIX_VAR(W, H) \
+unsigned int vp9_high_sub_pixel_variance##W##x##H##_c( \
+  const uint8_t *src, int  src_stride, \
+  int xoffset, int  yoffset, \
+  const uint8_t *dst, int dst_stride, \
+  unsigned int *sse) { \
+  uint16_t fdata3[(H + 1) * W]; \
+  uint16_t temp2[H * W]; \
+\
+  high_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+                                         W, BILINEAR_FILTERS_2TAP(xoffset)); \
+  high_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                          BILINEAR_FILTERS_2TAP(yoffset)); \
+\
+  return vp9_high_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
+                                        dst_stride, sse); \
+} \
+\
+unsigned int vp9_high_10_sub_pixel_variance##W##x##H##_c( \
+  const uint8_t *src, int  src_stride, \
+  int xoffset, int  yoffset, \
+  const uint8_t *dst, int dst_stride, \
+  unsigned int *sse) { \
+  uint16_t fdata3[(H + 1) * W]; \
+  uint16_t temp2[H * W]; \
+\
+  high_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+                                         W, BILINEAR_FILTERS_2TAP(xoffset)); \
+  high_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                          BILINEAR_FILTERS_2TAP(yoffset)); \
+\
+  return vp9_high_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
+                                           dst_stride, sse); \
+} \
+\
+unsigned int vp9_high_12_sub_pixel_variance##W##x##H##_c( \
+  const uint8_t *src, int  src_stride, \
+  int xoffset, int  yoffset, \
+  const uint8_t *dst, int dst_stride, \
+  unsigned int *sse) { \
+  uint16_t fdata3[(H + 1) * W]; \
+  uint16_t temp2[H * W]; \
+\
+  high_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+                                         W, BILINEAR_FILTERS_2TAP(xoffset)); \
+  high_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                          BILINEAR_FILTERS_2TAP(yoffset)); \
+\
+  return vp9_high_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
+                                           dst_stride, sse); \
+}
+
+#define HIGH_SUBPIX_AVG_VAR(W, H) \
+unsigned int vp9_high_sub_pixel_avg_variance##W##x##H##_c( \
+  const uint8_t *src, int  src_stride, \
+  int xoffset, int  yoffset, \
+  const uint8_t *dst, int dst_stride, \
+  unsigned int *sse, \
+  const uint8_t *second_pred) { \
+  uint16_t fdata3[(H + 1) * W]; \
+  uint16_t temp2[H * W]; \
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, temp3, H * W); \
+\
+  high_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+                                         W, BILINEAR_FILTERS_2TAP(xoffset)); \
+  high_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                          BILINEAR_FILTERS_2TAP(yoffset)); \
+\
+  vp9_high_comp_avg_pred(temp3, second_pred, W, H, CONVERT_TO_BYTEPTR(temp2), \
+                         W); \
+\
+  return vp9_high_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
+                                        dst_stride, sse); \
+} \
+\
+unsigned int vp9_high_10_sub_pixel_avg_variance##W##x##H##_c( \
+  const uint8_t *src, int  src_stride, \
+  int xoffset, int  yoffset, \
+  const uint8_t *dst, int dst_stride, \
+  unsigned int *sse, \
+  const uint8_t *second_pred) { \
+  uint16_t fdata3[(H + 1) * W]; \
+  uint16_t temp2[H * W]; \
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, temp3, H * W); \
+\
+  high_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+                                         W, BILINEAR_FILTERS_2TAP(xoffset)); \
+  high_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                          BILINEAR_FILTERS_2TAP(yoffset)); \
+\
+  vp9_high_comp_avg_pred(temp3, second_pred, W, H, CONVERT_TO_BYTEPTR(temp2), \
+                         W); \
+\
+  return vp9_high_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
+                                        dst_stride, sse); \
+} \
+\
+unsigned int vp9_high_12_sub_pixel_avg_variance##W##x##H##_c( \
+  const uint8_t *src, int  src_stride, \
+  int xoffset, int  yoffset, \
+  const uint8_t *dst, int dst_stride, \
+  unsigned int *sse, \
+  const uint8_t *second_pred) { \
+  uint16_t fdata3[(H + 1) * W]; \
+  uint16_t temp2[H * W]; \
+  DECLARE_ALIGNED_ARRAY(16, uint16_t, temp3, H * W); \
+\
+  high_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
+                                         W, BILINEAR_FILTERS_2TAP(xoffset)); \
+  high_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                          BILINEAR_FILTERS_2TAP(yoffset)); \
+\
+  vp9_high_comp_avg_pred(temp3, second_pred, W, H, CONVERT_TO_BYTEPTR(temp2), \
+                         W); \
+\
+  return vp9_high_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
+                                        dst_stride, sse); \
+}
+
+#define HIGH_GET_VAR(S) \
+void vp9_high_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
+                                  const uint8_t *ref, int ref_stride, \
+                                  unsigned int *sse, int *sum) { \
+  high_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
+} \
+\
+void vp9_high_10_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
+                                     const uint8_t *ref, int ref_stride, \
+                                     unsigned int *sse, int *sum) { \
+  high_10_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
+} \
+\
+void vp9_high_12_get##S##x##S##var_c(const uint8_t *src, int src_stride, \
+                                     const uint8_t *ref, int ref_stride, \
+                                     unsigned int *sse, int *sum) { \
+  high_12_variance(src, src_stride, ref, ref_stride, S, S, sse, sum); \
+}
+
+#define HIGH_MSE(W, H) \
+unsigned int vp9_high_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
+                                       const uint8_t *ref, int ref_stride, \
+                                       unsigned int *sse) { \
+  int sum; \
+  high_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
+  return *sse; \
+} \
+\
+unsigned int vp9_high_10_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
+                                          const uint8_t *ref, int ref_stride, \
+                                          unsigned int *sse) { \
+  int sum; \
+  high_10_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
+  return *sse; \
+} \
+\
+unsigned int vp9_high_12_mse##W##x##H##_c(const uint8_t *src, int src_stride, \
+                                          const uint8_t *ref, int ref_stride, \
+                                          unsigned int *sse) { \
+  int sum; \
+  high_12_variance(src, src_stride, ref, ref_stride, W, H, sse, &sum); \
+  return *sse; \
+}
+
+HIGH_GET_VAR(8)
+HIGH_GET_VAR(16)
+
+HIGH_MSE(16, 16)
+HIGH_MSE(16, 8)
+HIGH_MSE(8, 16)
+HIGH_MSE(8, 8)
+
+HIGH_VAR(4, 4)
+HIGH_SUBPIX_VAR(4, 4)
+HIGH_SUBPIX_AVG_VAR(4, 4)
+
+HIGH_VAR(4, 8)
+HIGH_SUBPIX_VAR(4, 8)
+HIGH_SUBPIX_AVG_VAR(4, 8)
+
+HIGH_VAR(8, 4)
+HIGH_SUBPIX_VAR(8, 4)
+HIGH_SUBPIX_AVG_VAR(8, 4)
+
+HIGH_VAR(8, 8)
+HIGH_SUBPIX_VAR(8, 8)
+HIGH_SUBPIX_AVG_VAR(8, 8)
+
+HIGH_VAR(8, 16)
+HIGH_SUBPIX_VAR(8, 16)
+HIGH_SUBPIX_AVG_VAR(8, 16)
+
+HIGH_VAR(16, 8)
+HIGH_SUBPIX_VAR(16, 8)
+HIGH_SUBPIX_AVG_VAR(16, 8)
+
+HIGH_VAR(16, 16)
+HIGH_SUBPIX_VAR(16, 16)
+HIGH_SUBPIX_AVG_VAR(16, 16)
+
+HIGH_VAR(16, 32)
+HIGH_SUBPIX_VAR(16, 32)
+HIGH_SUBPIX_AVG_VAR(16, 32)
+
+HIGH_VAR(32, 16)
+HIGH_SUBPIX_VAR(32, 16)
+HIGH_SUBPIX_AVG_VAR(32, 16)
+
+HIGH_VAR(32, 32)
+HIGH_SUBPIX_VAR(32, 32)
+HIGH_SUBPIX_AVG_VAR(32, 32)
+
+HIGH_VAR(32, 64)
+HIGH_SUBPIX_VAR(32, 64)
+HIGH_SUBPIX_AVG_VAR(32, 64)
+
+HIGH_VAR(64, 32)
+HIGH_SUBPIX_VAR(64, 32)
+HIGH_SUBPIX_AVG_VAR(64, 32)
+
+HIGH_VAR(64, 64)
+HIGH_SUBPIX_VAR(64, 64)
+HIGH_SUBPIX_AVG_VAR(64, 64)
+
+void vp9_high_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8,
+                            int width, int height, const uint8_t *ref8,
+                            int ref_stride) {
+  int i, j;
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  for (i = 0; i < height; i++) {
+    for (j = 0; j < width; j++) {
+      const int tmp = pred[j] + ref[j];
+      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
+    }
+    comp_pred += width;
+    pred += width;
+    ref += ref_stride;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp9/encoder/vp9_variance.h b/vp9/encoder/vp9_variance.h

index 4a194b7..c51d08d 100644 (file)
--- a/vp9/encoder/vp9_variance.h
+++ b/vp9/encoder/vp9_variance.h
@@ -22,6 +22,23 @@ void variance(const uint8_t *a, int a_stride,
                int  w, int  h,
                unsigned int *sse, int *sum);
  
+#if CONFIG_VP9_HIGHBITDEPTH
+void high_variance(const uint8_t *a8, int a_stride,
+                   const uint8_t *b8, int b_stride,
+                   int w, int h,
+                   unsigned int *sse, int *sum);
+
+void high_10_variance(const uint8_t *a8, int a_stride,
+                      const uint8_t *b8, int b_stride,
+                      int w, int h,
+                      unsigned int *sse, int *sum);
+
+void high_12_variance(const uint8_t *a8, int a_stride,
+                      const uint8_t *b8, int b_stride,
+                      int w, int h,
+                      unsigned int *sse, int *sum);
+#endif
+
  typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr,
                                      int source_stride,
                                      const uint8_t *ref_ptr,
@@ -81,6 +98,11 @@ typedef struct vp9_variance_vtable {
  void vp9_comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int width,
                         int height, const uint8_t *ref, int ref_stride);
  
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp9_high_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred, int width,
+                            int height, const uint8_t *ref, int ref_stride);
+#endif
+
  #ifdef __cplusplus
  }  // extern "C"
  #endif
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c

index 0f0b7a5..0851d8a 100644 (file)
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -686,6 +686,10 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx,
  
      if (res == VPX_CODEC_OK) {
        set_encoder_config(&priv->oxcf, &priv->cfg, &priv->extra_cfg);
+#if CONFIG_VP9_HIGHBITDEPTH
+      priv->oxcf.use_highbitdepth =
+          (ctx->init_flags & VPX_CODEC_USE_HIGHBITDEPTH) ? 1 : 0;
+#endif
        priv->cpi = vp9_create_compressor(&priv->oxcf);
        if (priv->cpi == NULL)
          res = VPX_CODEC_MEM_ERROR;
@@ -981,15 +985,20 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t  *ctx,
          cx_data_sz -= size;
  #if CONFIG_SPATIAL_SVC
          if (is_two_pass_svc(cpi)) {
-          vpx_codec_cx_pkt_t pkt;
+          vpx_codec_cx_pkt_t pkt_sizes, pkt_psnr;
            int i;
-          vp9_zero(pkt);
-          pkt.kind = VPX_CODEC_SPATIAL_SVC_LAYER_SIZES;
+          vp9_zero(pkt_sizes);
+          vp9_zero(pkt_psnr);
+          pkt_sizes.kind = VPX_CODEC_SPATIAL_SVC_LAYER_SIZES;
+          pkt_psnr.kind = VPX_CODEC_SPATIAL_SVC_LAYER_PSNR;
            for (i = 0; i < cpi->svc.number_spatial_layers; ++i) {
-            pkt.data.layer_sizes[i] = cpi->svc.layer_context[i].layer_size;
-            cpi->svc.layer_context[i].layer_size = 0;
+            LAYER_CONTEXT *lc = &cpi->svc.layer_context[i];
+            pkt_sizes.data.layer_sizes[i] = lc->layer_size;
+            pkt_psnr.data.layer_psnr[i] = lc->psnr_pkt;
+            lc->layer_size = 0;
            }
-          vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt);
+          vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt_sizes);
+          vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt_psnr);
          }
  #endif
        }
@@ -1333,6 +1342,9 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = {
  CODEC_INTERFACE(vpx_codec_vp9_cx) = {
    "WebM Project VP9 Encoder" VERSION_STRING,
    VPX_CODEC_INTERNAL_ABI_VERSION,
+#if CONFIG_VP9_HIGHBITDEPTH
+  VPX_CODEC_CAP_HIGHBITDEPTH |
+#endif
    VPX_CODEC_CAP_ENCODER | VPX_CODEC_CAP_PSNR,  // vpx_codec_caps_t
    encoder_init,       // vpx_codec_init_fn_t
    encoder_destroy,    // vpx_codec_destroy_fn_t
diff --git a/vpx/exports_enc b/vpx/exports_enc

index 07f0280..40478eb 100644 (file)
--- a/vpx/exports_enc
+++ b/vpx/exports_enc
@@ -8,17 +8,10 @@ text vpx_codec_get_preview_frame
  text vpx_codec_set_cx_data_buf
  text vpx_svc_dump_statistics
  text vpx_svc_encode
-text vpx_svc_get_buffer
-text vpx_svc_get_encode_frame_count
-text vpx_svc_get_frame_size
  text vpx_svc_get_message
  text vpx_svc_init
-text vpx_svc_is_keyframe
  text vpx_svc_release
-text vpx_svc_set_keyframe
  text vpx_svc_set_options
  text vpx_svc_set_quantizers
  text vpx_svc_set_scale_factors
  text vpx_svc_get_layer_resolution
-text vpx_svc_get_rc_stats_buffer_size
-text vpx_svc_get_rc_stats_buffer
diff --git a/vpx/src/svc_encodeframe.c b/vpx/src/svc_encodeframe.c

index c465a38..ff54b8d 100644 (file)
--- a/vpx/src/svc_encodeframe.c
+++ b/vpx/src/svc_encodeframe.c
@@ -21,7 +21,6 @@
  #include <stdlib.h>
  #include <string.h>
  #define VPX_DISABLE_CTRL_TYPECHECKS 1
-#define VPX_CODEC_DISABLE_COMPAT 1
  #include "./vpx_config.h"
  #include "vpx/svc_context.h"
  #include "vpx/vp8cx.h"
@@ -107,73 +106,14 @@ typedef struct SvcInternal {
    int kf_dist;  // distance between keyframes
  
    // state variables
-  int encode_frame_count;
-  int frame_received;
-  int frame_within_gop;
-  int layers;
+  int psnr_pkt_received;
    int layer;
-  int is_keyframe;
    int use_multiple_frame_contexts;
  
-  FrameData *frame_list;
-  FrameData *frame_temp;
-
-  char *rc_stats_buf;
-  size_t rc_stats_buf_size;
-  size_t rc_stats_buf_used;
-
    char message_buffer[2048];
    vpx_codec_ctx_t *codec_ctx;
  } SvcInternal;
  
-// create FrameData from encoder output
-static struct FrameData *fd_create(void *buf, size_t size,
-                                   vpx_codec_frame_flags_t flags) {
-  struct FrameData *const frame_data =
-      (struct FrameData *)vpx_malloc(sizeof(*frame_data));
-  if (frame_data == NULL) {
-    return NULL;
-  }
-  frame_data->buf = vpx_malloc(size);
-  if (frame_data->buf == NULL) {
-    vpx_free(frame_data);
-    return NULL;
-  }
-  vpx_memcpy(frame_data->buf, buf, size);
-  frame_data->size = size;
-  frame_data->flags = flags;
-  return frame_data;
-}
-
-// free FrameData
-static void fd_free(struct FrameData *p) {
-  if (p) {
-    if (p->buf)
-      vpx_free(p->buf);
-    vpx_free(p);
-  }
-}
-
-// add FrameData to list
-static void fd_list_add(struct FrameData **list, struct FrameData *layer_data) {
-  struct FrameData **p = list;
-
-  while (*p != NULL) p = &(*p)->next;
-  *p = layer_data;
-  layer_data->next = NULL;
-}
-
-// free FrameData list
-static void fd_free_list(struct FrameData *list) {
-  struct FrameData *p = list;
-
-  while (p) {
-    list = list->next;
-    fd_free(p);
-    p = list;
-  }
-}
-
  static SvcInternal *get_svc_internal(SvcContext *svc_ctx) {
    if (svc_ctx == NULL) return NULL;
    if (svc_ctx->internal == NULL) {
@@ -398,13 +338,14 @@ vpx_codec_err_t vpx_svc_set_scale_factors(SvcContext *svc_ctx,
    return VPX_CODEC_OK;
  }
  
-void assign_layer_bitrates(const SvcInternal *const si,
+void assign_layer_bitrates(const SvcContext *svc_ctx,
                             vpx_codec_enc_cfg_t *const enc_cfg) {
    int i;
+  const SvcInternal *const si = get_const_svc_internal(svc_ctx);
  
    if (si->bitrates[0] != 0) {
      enc_cfg->rc_target_bitrate = 0;
-    for (i = 0; i < si->layers; ++i) {
+    for (i = 0; i < svc_ctx->spatial_layers; ++i) {
        enc_cfg->ss_target_bitrate[i] = (unsigned int)si->bitrates[i];
        enc_cfg->rc_target_bitrate += si->bitrates[i];
      }
@@ -412,7 +353,7 @@ void assign_layer_bitrates(const SvcInternal *const si,
      float total = 0;
      float alloc_ratio[VPX_SS_MAX_LAYERS] = {0};
  
-    for (i = 0; i < si->layers; ++i) {
+    for (i = 0; i < svc_ctx->spatial_layers; ++i) {
        if (si->scaling_factor_den[i] > 0) {
          alloc_ratio[i] = (float)(si->scaling_factor_num[i] * 1.0 /
              si->scaling_factor_den[i]);
@@ -422,7 +363,7 @@ void assign_layer_bitrates(const SvcInternal *const si,
        }
      }
  
-    for (i = 0; i < si->layers; ++i) {
+    for (i = 0; i < svc_ctx->spatial_layers; ++i) {
        if (total > 0) {
          enc_cfg->ss_target_bitrate[i] = (unsigned int)
              (enc_cfg->rc_target_bitrate * alloc_ratio[i] / total);
@@ -501,12 +442,10 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
    if (svc_ctx->temporal_layers > VPX_TS_MAX_LAYERS)
      svc_ctx->temporal_layers = VPX_TS_MAX_LAYERS;
  
-  si->layers = svc_ctx->spatial_layers;
-
-  assign_layer_bitrates(si, enc_cfg);
+  assign_layer_bitrates(svc_ctx, enc_cfg);
  
  #if CONFIG_SPATIAL_SVC
-  for (i = 0; i < si->layers; ++i)
+  for (i = 0; i < svc_ctx->spatial_layers; ++i)
      enc_cfg->ss_enable_auto_alt_ref[i] = si->enable_auto_alt_ref[i];
  #endif
  
@@ -520,24 +459,9 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
    }
  
    // modify encoder configuration
-  enc_cfg->ss_number_layers = si->layers;
+  enc_cfg->ss_number_layers = svc_ctx->spatial_layers;
    enc_cfg->ts_number_layers = svc_ctx->temporal_layers;
  
-  // TODO(ivanmaltz): determine if these values need to be set explicitly for
-  // svc, or if the normal default/override mechanism can be used
-  enc_cfg->rc_dropframe_thresh = 0;
-  enc_cfg->rc_resize_allowed = 0;
-
-  if (enc_cfg->g_pass == VPX_RC_ONE_PASS) {
-    enc_cfg->rc_min_quantizer = 33;
-    enc_cfg->rc_max_quantizer = 33;
-  }
-
-  enc_cfg->rc_undershoot_pct = 100;
-  enc_cfg->rc_overshoot_pct = 15;
-  enc_cfg->rc_buf_initial_sz = 500;
-  enc_cfg->rc_buf_optimal_sz = 600;
-  enc_cfg->rc_buf_sz = 1000;
    if (enc_cfg->g_error_resilient == 0 && si->use_multiple_frame_contexts == 0)
      enc_cfg->g_error_resilient = 1;
  
@@ -549,7 +473,6 @@ vpx_codec_err_t vpx_svc_init(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
    }
  
    vpx_codec_control(codec_ctx, VP9E_SET_SVC, 1);
-  vpx_codec_control(codec_ctx, VP8E_SET_TOKEN_PARTITIONS, 1);
  
    return VPX_CODEC_OK;
  }
@@ -561,10 +484,10 @@ vpx_codec_err_t vpx_svc_get_layer_resolution(const SvcContext *svc_ctx,
    int w, h, num, den;
    const SvcInternal *const si = get_const_svc_internal(svc_ctx);
  
-  if (svc_ctx == NULL || si == NULL || width == NULL || height == NULL) {
+  if (svc_ctx == NULL || si == NULL || width == NULL || height == NULL)
+    return VPX_CODEC_INVALID_PARAM;
+  if (layer < 0 || layer >= svc_ctx->spatial_layers)
      return VPX_CODEC_INVALID_PARAM;
-  }
-  if (layer < 0 || layer >= si->layers) return VPX_CODEC_INVALID_PARAM;
  
    num = si->scaling_factor_num[layer];
    den = si->scaling_factor_den[layer];
@@ -608,7 +531,6 @@ static void set_svc_parameters(SvcContext *svc_ctx,
      svc_params.max_quantizer = codec_ctx->config.enc->rc_max_quantizer;
    }
  
-  svc_params.distance_from_i_frame = si->frame_within_gop;
    vpx_codec_control(codec_ctx, VP9E_SET_SVC_PARAMETERS, &svc_params);
  }
  
@@ -622,31 +544,16 @@ vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
    vpx_codec_err_t res;
    vpx_codec_iter_t iter;
    const vpx_codec_cx_pkt_t *cx_pkt;
-  int layer_for_psnr = 0;
    SvcInternal *const si = get_svc_internal(svc_ctx);
    if (svc_ctx == NULL || codec_ctx == NULL || si == NULL) {
      return VPX_CODEC_INVALID_PARAM;
    }
  
    svc_log_reset(svc_ctx);
-  si->rc_stats_buf_used = 0;
-
-  si->layers = svc_ctx->spatial_layers;
-  if (si->encode_frame_count == 0) {
-    si->frame_within_gop = 0;
-  }
-  si->is_keyframe = (si->frame_within_gop == 0);
-
-  if (rawimg != NULL) {
-    svc_log(svc_ctx, SVC_LOG_DEBUG,
-            "vpx_svc_encode  layers: %d, frame_count: %d, "
-            "frame_within_gop: %d\n", si->layers, si->encode_frame_count,
-            si->frame_within_gop);
-  }
  
    if (rawimg != NULL) {
      // encode each layer
-    for (si->layer = 0; si->layer < si->layers; ++si->layer) {
+    for (si->layer = 0; si->layer < svc_ctx->spatial_layers; ++si->layer) {
        set_svc_parameters(svc_ctx, codec_ctx);
      }
    }
@@ -660,64 +567,40 @@ vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
    iter = NULL;
    while ((cx_pkt = vpx_codec_get_cx_data(codec_ctx, &iter))) {
      switch (cx_pkt->kind) {
-      case VPX_CODEC_CX_FRAME_PKT: {
-        fd_list_add(&si->frame_list, fd_create(cx_pkt->data.frame.buf,
-                                               cx_pkt->data.frame.sz,
-                                               cx_pkt->data.frame.flags));
-
-        svc_log(svc_ctx, SVC_LOG_DEBUG, "SVC frame: %d, kf: %d, size: %d, "
-                "pts: %d\n", si->frame_received,
-                (cx_pkt->data.frame.flags & VPX_FRAME_IS_KEY) ? 1 : 0,
-                (int)cx_pkt->data.frame.sz, (int)cx_pkt->data.frame.pts);
-
-        ++si->frame_received;
-        layer_for_psnr = 0;
-        break;
-      }
-      case VPX_CODEC_PSNR_PKT: {
+#if CONFIG_SPATIAL_SVC
+      case VPX_CODEC_SPATIAL_SVC_LAYER_PSNR: {
          int i;
-        svc_log(svc_ctx, SVC_LOG_DEBUG,
-                "SVC frame: %d, layer: %d, PSNR(Total/Y/U/V): "
-                "%2.3f  %2.3f  %2.3f  %2.3f \n",
-                si->frame_received, layer_for_psnr,
-                cx_pkt->data.psnr.psnr[0], cx_pkt->data.psnr.psnr[1],
-                cx_pkt->data.psnr.psnr[2], cx_pkt->data.psnr.psnr[3]);
-        svc_log(svc_ctx, SVC_LOG_DEBUG,
-                "SVC frame: %d, layer: %d, SSE(Total/Y/U/V): "
-                "%2.3f  %2.3f  %2.3f  %2.3f \n",
-                si->frame_received, layer_for_psnr,
-                cx_pkt->data.psnr.sse[0], cx_pkt->data.psnr.sse[1],
-                cx_pkt->data.psnr.sse[2], cx_pkt->data.psnr.sse[3]);
-        for (i = 0; i < COMPONENTS; i++) {
-          si->psnr_sum[layer_for_psnr][i] += cx_pkt->data.psnr.psnr[i];
-          si->sse_sum[layer_for_psnr][i] += cx_pkt->data.psnr.sse[i];
-        }
-        ++layer_for_psnr;
-        break;
-      }
-      case VPX_CODEC_STATS_PKT: {
-        size_t new_size = si->rc_stats_buf_used +
-            cx_pkt->data.twopass_stats.sz;
-
-        if (new_size > si->rc_stats_buf_size) {
-          char *p = (char*)realloc(si->rc_stats_buf, new_size);
-          if (p == NULL) {
-            svc_log(svc_ctx, SVC_LOG_ERROR, "Error allocating stats buf\n");
-            return VPX_CODEC_MEM_ERROR;
+        for (i = 0; i < svc_ctx->spatial_layers; ++i) {
+          int j;
+          svc_log(svc_ctx, SVC_LOG_DEBUG,
+                  "SVC frame: %d, layer: %d, PSNR(Total/Y/U/V): "
+                  "%2.3f  %2.3f  %2.3f  %2.3f \n",
+                  si->psnr_pkt_received, i,
+                  cx_pkt->data.layer_psnr[i].psnr[0],
+                  cx_pkt->data.layer_psnr[i].psnr[1],
+                  cx_pkt->data.layer_psnr[i].psnr[2],
+                  cx_pkt->data.layer_psnr[i].psnr[3]);
+          svc_log(svc_ctx, SVC_LOG_DEBUG,
+                  "SVC frame: %d, layer: %d, SSE(Total/Y/U/V): "
+                  "%2.3f  %2.3f  %2.3f  %2.3f \n",
+                  si->psnr_pkt_received, i,
+                  cx_pkt->data.layer_psnr[i].sse[0],
+                  cx_pkt->data.layer_psnr[i].sse[1],
+                  cx_pkt->data.layer_psnr[i].sse[2],
+                  cx_pkt->data.layer_psnr[i].sse[3]);
+
+          for (j = 0; j < COMPONENTS; ++j) {
+            si->psnr_sum[i][j] +=
+                cx_pkt->data.layer_psnr[i].psnr[j];
+            si->sse_sum[i][j] += cx_pkt->data.layer_psnr[i].sse[j];
            }
-          si->rc_stats_buf = p;
-          si->rc_stats_buf_size = new_size;
          }
-
-        memcpy(si->rc_stats_buf + si->rc_stats_buf_used,
-               cx_pkt->data.twopass_stats.buf, cx_pkt->data.twopass_stats.sz);
-        si->rc_stats_buf_used += cx_pkt->data.twopass_stats.sz;
+        ++si->psnr_pkt_received;
          break;
        }
-#if CONFIG_SPATIAL_SVC
        case VPX_CODEC_SPATIAL_SVC_LAYER_SIZES: {
          int i;
-        for (i = 0; i < si->layers; ++i)
+        for (i = 0; i < svc_ctx->spatial_layers; ++i)
            si->bytes_sum[i] += cx_pkt->data.layer_sizes[i];
          break;
        }
@@ -728,11 +611,6 @@ vpx_codec_err_t vpx_svc_encode(SvcContext *svc_ctx, vpx_codec_ctx_t *codec_ctx,
      }
    }
  
-  if (rawimg != NULL) {
-    ++si->frame_within_gop;
-    ++si->encode_frame_count;
-  }
-
    return VPX_CODEC_OK;
  }
  
@@ -742,47 +620,6 @@ const char *vpx_svc_get_message(const SvcContext *svc_ctx) {
    return si->message_buffer;
  }
  
-// We will maintain a list of output frame buffers since with lag_in_frame
-// we need to output all frame buffers at the end. vpx_svc_get_buffer() will
-// remove a frame buffer from the list the put it to a temporal pointer, which
-// will be removed at the next vpx_svc_get_buffer() or when closing encoder.
-void *vpx_svc_get_buffer(SvcContext *svc_ctx) {
-  SvcInternal *const si = get_svc_internal(svc_ctx);
-  if (svc_ctx == NULL || si == NULL || si->frame_list == NULL) return NULL;
-
-  if (si->frame_temp)
-    fd_free(si->frame_temp);
-
-  si->frame_temp = si->frame_list;
-  si->frame_list = si->frame_list->next;
-
-  return si->frame_temp->buf;
-}
-
-size_t vpx_svc_get_frame_size(const SvcContext *svc_ctx) {
-  const SvcInternal *const si = get_const_svc_internal(svc_ctx);
-  if (svc_ctx == NULL || si == NULL || si->frame_list == NULL) return 0;
-  return si->frame_list->size;
-}
-
-int vpx_svc_get_encode_frame_count(const SvcContext *svc_ctx) {
-  const SvcInternal *const si = get_const_svc_internal(svc_ctx);
-  if (svc_ctx == NULL || si == NULL) return 0;
-  return si->encode_frame_count;
-}
-
-int vpx_svc_is_keyframe(const SvcContext *svc_ctx) {
-  const SvcInternal *const si = get_const_svc_internal(svc_ctx);
-  if (svc_ctx == NULL || si == NULL || si->frame_list == NULL) return 0;
-  return (si->frame_list->flags & VPX_FRAME_IS_KEY) != 0;
-}
-
-void vpx_svc_set_keyframe(SvcContext *svc_ctx) {
-  SvcInternal *const si = get_svc_internal(svc_ctx);
-  if (svc_ctx == NULL || si == NULL) return;
-  si->frame_within_gop = 0;
-}
-
  static double calc_psnr(double d) {
    if (d == 0) return 100;
    return -10.0 * log(d) / log(10.0);
@@ -790,7 +627,7 @@ static double calc_psnr(double d) {
  
  // dump accumulated statistics and reset accumulated values
  const char *vpx_svc_dump_statistics(SvcContext *svc_ctx) {
-  int number_of_frames, encode_frame_count;
+  int number_of_frames;
    int i, j;
    uint32_t bytes_total = 0;
    double scale[COMPONENTS];
@@ -803,12 +640,11 @@ const char *vpx_svc_dump_statistics(SvcContext *svc_ctx) {
  
    svc_log_reset(svc_ctx);
  
-  encode_frame_count = si->encode_frame_count;
-  if (si->encode_frame_count <= 0) return vpx_svc_get_message(svc_ctx);
+  number_of_frames = si->psnr_pkt_received;
+  if (number_of_frames <= 0) return vpx_svc_get_message(svc_ctx);
  
    svc_log(svc_ctx, SVC_LOG_INFO, "\n");
-  for (i = 0; i < si->layers; ++i) {
-    number_of_frames = encode_frame_count;
+  for (i = 0; i < svc_ctx->spatial_layers; ++i) {
  
      svc_log(svc_ctx, SVC_LOG_INFO,
              "Layer %d Average PSNR=[%2.3f, %2.3f, %2.3f, %2.3f], Bytes=[%u]\n",
@@ -843,7 +679,7 @@ const char *vpx_svc_dump_statistics(SvcContext *svc_ctx) {
    }
  
    // only display statistics once
-  si->encode_frame_count = 0;
+  si->psnr_pkt_received = 0;
  
    svc_log(svc_ctx, SVC_LOG_INFO, "Total Bytes=[%u]\n", bytes_total);
    return vpx_svc_get_message(svc_ctx);
@@ -856,26 +692,8 @@ void vpx_svc_release(SvcContext *svc_ctx) {
    // SvcInternal if it was not already allocated
    si = (SvcInternal *)svc_ctx->internal;
    if (si != NULL) {
-    fd_free(si->frame_temp);
-    fd_free_list(si->frame_list);
-    if (si->rc_stats_buf) {
-      free(si->rc_stats_buf);
-    }
      free(si);
      svc_ctx->internal = NULL;
    }
  }
  
-size_t vpx_svc_get_rc_stats_buffer_size(const SvcContext *svc_ctx) {
-  const SvcInternal *const si = get_const_svc_internal(svc_ctx);
-  if (svc_ctx == NULL || si == NULL) return 0;
-  return si->rc_stats_buf_used;
-}
-
-char *vpx_svc_get_rc_stats_buffer(const SvcContext *svc_ctx) {
-  const SvcInternal *const si = get_const_svc_internal(svc_ctx);
-  if (svc_ctx == NULL || si == NULL) return NULL;
-  return si->rc_stats_buf;
-}
-
-
diff --git a/vpx/src/vpx_encoder.c b/vpx/src/vpx_encoder.c

index 1903b55..cd10c41 100644 (file)
--- a/vpx/src/vpx_encoder.c
+++ b/vpx/src/vpx_encoder.c
@@ -15,8 +15,8 @@
   */
  #include <limits.h>
  #include <string.h>
-#include "vpx/internal/vpx_codec_internal.h"
  #include "vpx_config.h"
+#include "vpx/internal/vpx_codec_internal.h"
  
  #define SAVE_STATUS(ctx,var) (ctx?(ctx->err = var):var)
  
diff --git a/vpx/svc_context.h b/vpx/svc_context.h

index eea3b13..fe638e4 100644 (file)
--- a/vpx/svc_context.h
+++ b/vpx/svc_context.h
@@ -96,49 +96,12 @@ const char *vpx_svc_dump_statistics(SvcContext *svc_ctx);
  const char *vpx_svc_get_message(const SvcContext *svc_ctx);
  
  /**
- * return size of encoded data to be returned by vpx_svc_get_buffer.
- * it needs to be called before vpx_svc_get_buffer.
- */
-size_t vpx_svc_get_frame_size(const SvcContext *svc_ctx);
-
-/**
- * return buffer with encoded data. encoder will maintain a list of frame
- * buffers. each call of vpx_svc_get_buffer() will return one frame.
- */
-void *vpx_svc_get_buffer(SvcContext *svc_ctx);
-
-/**
- * return size of two pass rate control stats data to be returned by
- * vpx_svc_get_rc_stats_buffer
- */
-size_t vpx_svc_get_rc_stats_buffer_size(const SvcContext *svc_ctx);
-
-/**
- * return buffer two pass of rate control stats data
- */
-char *vpx_svc_get_rc_stats_buffer(const SvcContext *svc_ctx);
-
-/**
   * return spatial resolution of the specified layer
   */
  vpx_codec_err_t vpx_svc_get_layer_resolution(const SvcContext *svc_ctx,
                                               int layer,
                                               unsigned int *width,
                                               unsigned int *height);
-/**
- * return number of frames that have been encoded
- */
-int vpx_svc_get_encode_frame_count(const SvcContext *svc_ctx);
-
-/**
- * return 1 if last encoded frame was a keyframe
- */
-int vpx_svc_is_keyframe(const SvcContext *svc_ctx);
-
-/**
- * force the next frame to be a keyframe
- */
-void vpx_svc_set_keyframe(SvcContext *svc_ctx);
  
  #ifdef __cplusplus
  }  // extern "C"
diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h

index 796a7a1..1ee9726 100644 (file)
--- a/vpx/vp8cx.h
+++ b/vpx/vp8cx.h
@@ -307,7 +307,6 @@ typedef struct vpx_svc_parameters {
    int temporal_layer;         /**< current temporal layer number - 0 = base */
    int max_quantizer;          /**< max quantizer for current layer */
    int min_quantizer;          /**< min quantizer for current layer */
-  int distance_from_i_frame;  /**< frame number within current gop */
    int lst_fb_idx;             /**< last frame frame buffer index */
    int gld_fb_idx;             /**< golden frame frame buffer index */
    int alt_fb_idx;             /**< alt reference frame frame buffer index */
diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h

index fdabed1..996fcb4 100644 (file)
--- a/vpx/vpx_encoder.h
+++ b/vpx/vpx_encoder.h
@@ -163,6 +163,7 @@ extern "C" {
      VPX_CODEC_PSNR_PKT,        /**< PSNR statistics for this frame */
  #if CONFIG_SPATIAL_SVC
      VPX_CODEC_SPATIAL_SVC_LAYER_SIZES, /**< Sizes for each layer in this frame*/
+    VPX_CODEC_SPATIAL_SVC_LAYER_PSNR, /**< PSNR for each layer in this frame*/
  #endif
      VPX_CODEC_CUSTOM_PKT = 256 /**< Algorithm extensions  */
    };
@@ -202,6 +203,7 @@ extern "C" {
        vpx_fixed_buf_t raw;     /**< data for arbitrary packets */
  #if CONFIG_SPATIAL_SVC
        size_t layer_sizes[VPX_SS_MAX_LAYERS];
+      struct vpx_psnr_pkt layer_psnr[VPX_SS_MAX_LAYERS];
  #endif
  
        /* This packet size is fixed to allow codecs to extend this
diff --git a/vpx/vpx_image.h b/vpx/vpx_image.h

index 0b7bb90..ef6d1dd 100644 (file)
--- a/vpx/vpx_image.h
+++ b/vpx/vpx_image.h
@@ -64,40 +64,6 @@ extern "C" {
      VPX_IMG_FMT_I44416    = VPX_IMG_FMT_I444 | VPX_IMG_FMT_HIGHBITDEPTH
    } vpx_img_fmt_t; /**< alias for enum vpx_img_fmt */
  
-#if !defined(VPX_CODEC_DISABLE_COMPAT) || !VPX_CODEC_DISABLE_COMPAT
-#define IMG_FMT_PLANAR         VPX_IMG_FMT_PLANAR     /**< \deprecated Use #VPX_IMG_FMT_PLANAR */
-#define IMG_FMT_UV_FLIP        VPX_IMG_FMT_UV_FLIP    /**< \deprecated Use #VPX_IMG_FMT_UV_FLIP */
-#define IMG_FMT_HAS_ALPHA      VPX_IMG_FMT_HAS_ALPHA  /**< \deprecated Use #VPX_IMG_FMT_HAS_ALPHA */
-
-  /*!\brief Deprecated list of supported image formats
-   * \deprecated New code should use #vpx_img_fmt
-   */
-#define img_fmt   vpx_img_fmt
-  /*!\brief alias for enum img_fmt.
-   * \deprecated New code should use #vpx_img_fmt_t
-   */
-#define img_fmt_t vpx_img_fmt_t
-
-#define IMG_FMT_NONE       VPX_IMG_FMT_NONE       /**< \deprecated Use #VPX_IMG_FMT_NONE */
-#define IMG_FMT_RGB24      VPX_IMG_FMT_RGB24      /**< \deprecated Use #VPX_IMG_FMT_RGB24 */
-#define IMG_FMT_RGB32      VPX_IMG_FMT_RGB32      /**< \deprecated Use #VPX_IMG_FMT_RGB32 */
-#define IMG_FMT_RGB565     VPX_IMG_FMT_RGB565     /**< \deprecated Use #VPX_IMG_FMT_RGB565 */
-#define IMG_FMT_RGB555     VPX_IMG_FMT_RGB555     /**< \deprecated Use #VPX_IMG_FMT_RGB555 */
-#define IMG_FMT_UYVY       VPX_IMG_FMT_UYVY       /**< \deprecated Use #VPX_IMG_FMT_UYVY */
-#define IMG_FMT_YUY2       VPX_IMG_FMT_YUY2       /**< \deprecated Use #VPX_IMG_FMT_YUY2 */
-#define IMG_FMT_YVYU       VPX_IMG_FMT_YVYU       /**< \deprecated Use #VPX_IMG_FMT_YVYU */
-#define IMG_FMT_BGR24      VPX_IMG_FMT_BGR24      /**< \deprecated Use #VPX_IMG_FMT_BGR24 */
-#define IMG_FMT_RGB32_LE   VPX_IMG_FMT_RGB32_LE   /**< \deprecated Use #VPX_IMG_FMT_RGB32_LE */
-#define IMG_FMT_ARGB       VPX_IMG_FMT_ARGB       /**< \deprecated Use #VPX_IMG_FMT_ARGB */
-#define IMG_FMT_ARGB_LE    VPX_IMG_FMT_ARGB_LE    /**< \deprecated Use #VPX_IMG_FMT_ARGB_LE */
-#define IMG_FMT_RGB565_LE  VPX_IMG_FMT_RGB565_LE  /**< \deprecated Use #VPX_IMG_FMT_RGB565_LE */
-#define IMG_FMT_RGB555_LE  VPX_IMG_FMT_RGB555_LE  /**< \deprecated Use #VPX_IMG_FMT_RGB555_LE */
-#define IMG_FMT_YV12       VPX_IMG_FMT_YV12       /**< \deprecated Use #VPX_IMG_FMT_YV12 */
-#define IMG_FMT_I420       VPX_IMG_FMT_I420       /**< \deprecated Use #VPX_IMG_FMT_I420 */
-#define IMG_FMT_VPXYV12    VPX_IMG_FMT_VPXYV12    /**< \deprecated Use #VPX_IMG_FMT_VPXYV12 */
-#define IMG_FMT_VPXI420    VPX_IMG_FMT_VPXI420    /**< \deprecated Use #VPX_IMG_FMT_VPXI420 */
-#endif /* VPX_CODEC_DISABLE_COMPAT */
-
    /**\brief Image Descriptor */
    typedef struct vpx_image {
      vpx_img_fmt_t fmt; /**< Image Format */
@@ -121,13 +87,6 @@ extern "C" {
  #define VPX_PLANE_U      1   /**< U (Chroma) plane */
  #define VPX_PLANE_V      2   /**< V (Chroma) plane */
  #define VPX_PLANE_ALPHA  3   /**< A (Transparency) plane */
-#if !defined(VPX_CODEC_DISABLE_COMPAT) || !VPX_CODEC_DISABLE_COMPAT
-#define PLANE_PACKED     VPX_PLANE_PACKED
-#define PLANE_Y          VPX_PLANE_Y
-#define PLANE_U          VPX_PLANE_U
-#define PLANE_V          VPX_PLANE_V
-#define PLANE_ALPHA      VPX_PLANE_ALPHA
-#endif
      unsigned char *planes[4];  /**< pointer to the top left pixel for each plane */
      int      stride[4];  /**< stride between rows for each plane */
  
diff --git a/vpxdec.c b/vpxdec.c

index 6470081..091522f 100644 (file)
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -24,7 +24,6 @@
  #include "./args.h"
  #include "./ivfdec.h"
  
-#define VPX_CODEC_DISABLE_COMPAT 1
  #include "vpx/vpx_decoder.h"
  #include "vpx_ports/mem_ops.h"
  #include "vpx_ports/vpx_timer.h"
diff --git a/y4minput.c b/y4minput.c

index bcc742a..34ea96d 100644 (file)
--- a/y4minput.c
+++ b/y4minput.c
@@ -1041,12 +1041,12 @@ int y4m_input_fetch_frame(y4m_input *_y4m, FILE *_fin, vpx_image_t *_img) {
    c_w *= bytes_per_sample;
    c_h = (_y4m->pic_h + _y4m->dst_c_dec_v - 1) / _y4m->dst_c_dec_v;
    c_sz = c_w * c_h;
-  _img->stride[PLANE_Y] = _img->stride[PLANE_ALPHA] =
+  _img->stride[VPX_PLANE_Y] = _img->stride[VPX_PLANE_ALPHA] =
        _y4m->pic_w * bytes_per_sample;
-  _img->stride[PLANE_U] = _img->stride[PLANE_V] = c_w;
-  _img->planes[PLANE_Y] = _y4m->dst_buf;
-  _img->planes[PLANE_U] = _y4m->dst_buf + pic_sz;
-  _img->planes[PLANE_V] = _y4m->dst_buf + pic_sz + c_sz;
-  _img->planes[PLANE_ALPHA] = _y4m->dst_buf + pic_sz + 2 * c_sz;
+  _img->stride[VPX_PLANE_U] = _img->stride[VPX_PLANE_V] = c_w;
+  _img->planes[VPX_PLANE_Y] = _y4m->dst_buf;
+  _img->planes[VPX_PLANE_U] = _y4m->dst_buf + pic_sz;
+  _img->planes[VPX_PLANE_V] = _y4m->dst_buf + pic_sz + c_sz;
+  _img->planes[VPX_PLANE_ALPHA] = _y4m->dst_buf + pic_sz + 2 * c_sz;
    return 1;
  }
author	Deb Mukherjee <debargha@google.com>
	Fri, 12 Sep 2014 23:42:18 +0000 (16:42 -0700)
committer	Gerrit Code Review <gerrit@gerrit.golo.chromium.org>
	Fri, 12 Sep 2014 23:42:18 +0000 (16:42 -0700)
build/make/rtcd.pl		patch \| blob \| history
configure		patch \| blob \| history
examples.mk		patch \| blob \| history
examples/decode_to_md5.c		patch \| blob \| history
examples/decode_with_drops.c		patch \| blob \| history
examples/postproc.c		patch \| blob \| history
examples/set_maps.c		patch \| blob \| history
examples/simple_decoder.c		patch \| blob \| history
examples/simple_encoder.c		patch \| blob \| history
examples/twopass_encoder.c		patch \| blob \| history
examples/vp8_multi_resolution_encoder.c		patch \| blob \| history
examples/vp8cx_set_ref.c		patch \| blob \| history
examples/vp9_lossless_encoder.c	[new file with mode: 0644]	patch \| blob
examples/vp9_spatial_svc_encoder.c		patch \| blob \| history
examples/vpx_temporal_svc_encoder.c		patch \| blob \| history
libs.mk		patch \| blob \| history
test/convolve_test.cc		patch \| blob \| history
test/dct16x16_test.cc		patch \| blob \| history
test/dct32x32_test.cc		patch \| blob \| history
test/fdct4x4_test.cc		patch \| blob \| history
test/fdct8x8_test.cc		patch \| blob \| history
test/idct8x8_test.cc		patch \| blob \| history
test/invalid_file_test.cc		patch \| blob \| history
test/partial_idct_test.cc		patch \| blob \| history
test/svc_test.cc		patch \| blob \| history
test/test-data.sha1		patch \| blob \| history
test/test.mk		patch \| blob \| history
vp8/encoder/denoising.c		patch \| blob \| history
vp8/encoder/encodemb.c		patch \| blob \| history
vp8/encoder/quantize.c		patch \| blob \| history
vp9/common/vp9_blockd.h		patch \| blob \| history
vp9/common/vp9_idct.c		patch \| blob \| history
vp9/common/vp9_idct.h		patch \| blob \| history
vp9/common/vp9_loopfilter.c		patch \| blob \| history
vp9/common/vp9_loopfilter.h		patch \| blob \| history
vp9/common/vp9_rtcd_defs.pl		patch \| blob \| history
vp9/common/x86/vp9_asm_stubs.c		patch \| blob \| history
vp9/decoder/vp9_decodeframe.c		patch \| blob \| history
vp9/decoder/vp9_decoder.c		patch \| blob \| history
vp9/decoder/vp9_decoder.h		patch \| blob \| history
vp9/decoder/vp9_detokenize.c		patch \| blob \| history
vp9/decoder/vp9_dthread.c		patch \| blob \| history
vp9/decoder/vp9_dthread.h		patch \| blob \| history
vp9/encoder/vp9_block.h		patch \| blob \| history
vp9/encoder/vp9_context_tree.c		patch \| blob \| history
vp9/encoder/vp9_context_tree.h		patch \| blob \| history
vp9/encoder/vp9_dct.c		patch \| blob \| history
vp9/encoder/vp9_denoiser.c		patch \| blob \| history
vp9/encoder/vp9_denoiser.h		patch \| blob \| history
vp9/encoder/vp9_encodemb.c		patch \| blob \| history
vp9/encoder/vp9_encoder.c		patch \| blob \| history
vp9/encoder/vp9_encoder.h		patch \| blob \| history
vp9/encoder/vp9_firstpass.c		patch \| blob \| history
vp9/encoder/vp9_picklpf.c		patch \| blob \| history
vp9/encoder/vp9_quantize.c		patch \| blob \| history
vp9/encoder/vp9_quantize.h		patch \| blob \| history
vp9/encoder/vp9_rdopt.c		patch \| blob \| history
vp9/encoder/vp9_sad.c		patch \| blob \| history
vp9/encoder/vp9_speed_features.c		patch \| blob \| history
vp9/encoder/vp9_speed_features.h		patch \| blob \| history
vp9/encoder/vp9_svc_layercontext.h		patch \| blob \| history
vp9/encoder/vp9_tokenize.c		patch \| blob \| history
vp9/encoder/vp9_variance.c		patch \| blob \| history
vp9/encoder/vp9_variance.h		patch \| blob \| history
vp9/vp9_cx_iface.c		patch \| blob \| history
vpx/exports_enc		patch \| blob \| history
vpx/src/svc_encodeframe.c		patch \| blob \| history
vpx/src/vpx_encoder.c		patch \| blob \| history
vpx/svc_context.h		patch \| blob \| history
vpx/vp8cx.h		patch \| blob \| history
vpx/vpx_encoder.h		patch \| blob \| history
vpx/vpx_image.h		patch \| blob \| history
vpxdec.c		patch \| blob \| history
y4minput.c		patch \| blob \| history