From 8b0c11c3588963fa02be0cad36a6a23cdb748cf9 Mon Sep 17 00:00:00 2001
From: Ranjit Kumar Tulabandu <ranjit.tulabandu@ittiam.com>
Date: Tue, 27 Dec 2016 18:45:43 +0530
Subject: [PATCH] Multi-threading of first pass stats collection

(yunqingwang)
1. Rebased the patch. Incorporated recent first pass changes.
2. Turned on the first pass unit test.

Change-Id: Ia2f7ba8152d0b6dd6bf8efb9dfaf505ba7d8edee
---
 test/vp9_ethread_test.cc       |    9 +-
 vp9/encoder/vp9_block.h        |    4 +
 vp9/encoder/vp9_encodemb.c     |    7 +-
 vp9/encoder/vp9_encoder.c      |   21 +
 vp9/encoder/vp9_encoder.h      |   46 ++
 vp9/encoder/vp9_ethread.c      |  324 +++++++++++-
 vp9/encoder/vp9_ethread.h      |   35 ++
 vp9/encoder/vp9_firstpass.c    | 1120 +++++++++++++++++++++++-----------------
 vp9/encoder/vp9_firstpass.h    |   47 ++
 vp9/encoder/vp9_job_queue.h    |   46 ++
 vp9/encoder/vp9_multi_thread.c |  282 ++++++++++
 vp9/encoder/vp9_multi_thread.h |   38 ++
 vp9/vp9_cx_iface.c             |   13 +
 vp9/vp9cx.mk                   |    3 +
 vpx/vp8cx.h                    |   11 +
 vpxenc.c                       |    5 +
 16 files changed, 1496 insertions(+), 515 deletions(-)
 create mode 100644 vp9/encoder/vp9_job_queue.h
 create mode 100644 vp9/encoder/vp9_multi_thread.c
 create mode 100644 vp9/encoder/vp9_multi_thread.h

diff --git a/test/vp9_ethread_test.cc b/test/vp9_ethread_test.cc
index ee52256..d779930 100644
--- a/test/vp9_ethread_test.cc
+++ b/test/vp9_ethread_test.cc
@@ -82,9 +82,8 @@ class VPxFirstPassEncoderThreadTest
       encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 0);
 
       // For now, new_mt_mode only works for 2-pass encoding.
-      // Enable this once the fp mt patch is checked in.
-      // if (encoding_mode_ == ::libvpx_test::kTwoPassGood)
-      //  encoder->Control(VP9E_SET_NEW_MT, new_mt_mode_);
+      if (encoding_mode_ == ::libvpx_test::kTwoPassGood)
+        encoder->Control(VP9E_SET_NEW_MT, new_mt_mode_);
 
       encoder_initialized_ = true;
     }
@@ -131,7 +130,7 @@ static void compare_fp_stats(vpx_fixed_buf_t *fp_stats) {
 
     for (j = 0; j < kDbl; ++j) {
       EXPECT_LE(fabs(*frame_stats1 - *frame_stats2),
-                fabs(*frame_stats1) / 1000.0);
+                fabs(*frame_stats1) / 10000.0);
       frame_stats1++;
       frame_stats2++;
     }
@@ -146,7 +145,7 @@ static void compare_fp_stats(vpx_fixed_buf_t *fp_stats) {
 }
 
 TEST_P(VPxFirstPassEncoderThreadTest, FirstPassStatsTest) {
-  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 50);
+  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
 
   first_pass_only_ = 1;
   cfg_.rc_target_bitrate = 1000;
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 0d5075c..91d07e3 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -131,6 +131,10 @@ struct macroblock {
   int use_lp32x32fdct;
   int skip_encode;
 
+  // In first pass, intra prediction is done based on source pixels
+  // at tile boundaries
+  int fp_src_pred;
+
   // use fast quantization process
   int quant_fp;
 
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 2cb137d..1dc8d34 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -773,9 +773,10 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
     }
   }
 
-  vp9_predict_intra_block(xd, bwl, tx_size, mode, x->skip_encode ? src : dst,
-                          x->skip_encode ? src_stride : dst_stride, dst,
-                          dst_stride, col, row, plane);
+  vp9_predict_intra_block(
+      xd, bwl, tx_size, mode, (x->skip_encode || x->fp_src_pred) ? src : dst,
+      (x->skip_encode || x->fp_src_pred) ? src_stride : dst_stride, dst,
+      dst_stride, col, row, plane);
 
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 26326fc..5dbe62d 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -50,6 +50,7 @@
 #include "vp9/encoder/vp9_ethread.h"
 #include "vp9/encoder/vp9_firstpass.h"
 #include "vp9/encoder/vp9_mbgraph.h"
+#include "vp9/encoder/vp9_multi_thread.h"
 #include "vp9/encoder/vp9_noise_estimate.h"
 #include "vp9/encoder/vp9_picklpf.h"
 #include "vp9/encoder/vp9_ratectrl.h"
@@ -1563,6 +1564,13 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
 #if CONFIG_VP9_HIGHBITDEPTH
   highbd_set_var_fns(cpi);
 #endif
+
+  // Enable multi-threading for first pass.
+  cpi->new_mt = 0;
+  if (((cpi->oxcf.mode == GOOD || cpi->oxcf.mode == BEST) &&
+       cpi->oxcf.speed < 5 && cpi->oxcf.pass == 1) &&
+      cpi->oxcf.new_mt)
+    cpi->new_mt = 1;
 }
 
 #ifndef M_LOG2_E
@@ -1719,6 +1727,12 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
   }
 #endif
 
+#if ENABLE_MT_BIT_MATCH
+  CHECK_MEM_ERROR(
+      cm, cpi->twopass.fp_mb_float_stats,
+      vpx_calloc(cm->MBs * sizeof(*cpi->twopass.fp_mb_float_stats), 1));
+#endif
+
   cpi->refresh_alt_ref_frame = 0;
   cpi->multi_arf_last_grp_enabled = 0;
 
@@ -2076,6 +2090,7 @@ void vp9_remove_compressor(VP9_COMP *cpi) {
   }
   vpx_free(cpi->tile_thr_data);
   vpx_free(cpi->workers);
+  vp9_row_mt_mem_dealloc(cpi);
 
   if (cpi->num_workers > 1) {
     vp9_loop_filter_dealloc(&cpi->lf_row_sync);
@@ -2098,6 +2113,11 @@ void vp9_remove_compressor(VP9_COMP *cpi) {
   }
 #endif
 
+#if ENABLE_MT_BIT_MATCH
+  vpx_free(cpi->twopass.fp_mb_float_stats);
+  cpi->twopass.fp_mb_float_stats = NULL;
+#endif
+
   vp9_remove_common(cm);
   vp9_free_ref_frame_buffers(cm->buffer_pool);
 #if CONFIG_VP9_POSTPROC
@@ -4802,6 +4822,7 @@ int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags,
     for (i = 0; i < MAX_REF_FRAMES; ++i) cpi->scaled_ref_idx[i] = INVALID_IDX;
   }
 
+  cpi->td.mb.fp_src_pred = 0;
   if (oxcf->pass == 1 && (!cpi->use_svc || is_two_pass_svc(cpi))) {
     const int lossless = is_lossless_requested(oxcf);
 #if CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 00552e1..e1046f1 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -33,7 +33,9 @@
 #include "vp9/encoder/vp9_aq_cyclicrefresh.h"
 #include "vp9/encoder/vp9_context_tree.h"
 #include "vp9/encoder/vp9_encodemb.h"
+#include "vp9/encoder/vp9_ethread.h"
 #include "vp9/encoder/vp9_firstpass.h"
+#include "vp9/encoder/vp9_job_queue.h"
 #include "vp9/encoder/vp9_lookahead.h"
 #include "vp9/encoder/vp9_mbgraph.h"
 #include "vp9/encoder/vp9_mcomp.h"
@@ -256,6 +258,8 @@ typedef struct VP9EncoderConfig {
   int render_width;
   int render_height;
   VP9E_TEMPORAL_LAYERING_MODE temporal_layering_mode;
+
+  int new_mt;
 } VP9EncoderConfig;
 
 static INLINE int is_lossless_requested(const VP9EncoderConfig *cfg) {
@@ -269,8 +273,34 @@ typedef struct TileDataEnc {
   int mode_map[BLOCK_SIZES][MAX_MODES];
   int m_search_count;
   int ex_search_count;
+  FIRSTPASS_DATA fp_data;
+  VP9RowMTSync row_mt_sync;
 } TileDataEnc;
 
+typedef struct RowMTInfo {
+  JobQueueHandle job_queue_hdl;
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t job_mutex;
+#endif
+} RowMTInfo;
+
+typedef struct MultiThreadHandle {
+  int allocated_tile_rows;
+  int allocated_tile_cols;
+  int allocated_vert_unit_rows;
+
+  // Frame level params
+  int num_tile_vert_sbs[MAX_NUM_TILE_ROWS];
+
+  // Job Queue structure and handles
+  JobQueue *job_queue;
+
+  int jobs_per_tile_col;
+
+  RowMTInfo row_mt_info[MAX_NUM_TILE_COLS];
+  int thread_id_to_tile_id[MAX_NUM_THREADS];  // Mapping of threads to tiles
+} MultiThreadHandle;
+
 typedef struct RD_COUNTS {
   vp9_coeff_count coef_counts[TX_SIZES][PLANE_TYPES];
   int64_t comp_pred_diff[REFERENCE_MODES];
@@ -629,6 +659,10 @@ typedef struct VP9_COMP {
 
   int keep_level_stats;
   Vp9LevelInfo level_info;
+  MultiThreadHandle multi_thread_ctxt;
+  void (*row_mt_sync_read_ptr)(VP9RowMTSync *const, int, int);
+  void (*row_mt_sync_write_ptr)(VP9RowMTSync *const, int, int, const int);
+  int new_mt;
 
   // Previous Partition Info
   BLOCK_SIZE *prev_partition;
@@ -808,6 +842,18 @@ static INLINE int *cond_cost_list(const struct VP9_COMP *cpi, int *cost_list) {
   return cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? cost_list : NULL;
 }
 
+static INLINE int get_num_vert_units(TileInfo tile, int shift) {
+  int num_vert_units =
+      (tile.mi_row_end - tile.mi_row_start + (1 << shift) - 1) >> shift;
+  return num_vert_units;
+}
+
+static INLINE int get_num_cols(TileInfo tile, int shift) {
+  int num_cols =
+      (tile.mi_col_end - tile.mi_col_start + (1 << shift) - 1) >> shift;
+  return num_cols;
+}
+
 static INLINE int get_level_index(VP9_LEVEL level) {
   int i;
   for (i = 0; i < VP9_LEVELS; ++i) {
diff --git a/vp9/encoder/vp9_ethread.c b/vp9/encoder/vp9_ethread.c
index f4f7c7b..6cc103c 100644
--- a/vp9/encoder/vp9_ethread.c
+++ b/vp9/encoder/vp9_ethread.c
@@ -11,6 +11,8 @@
 #include "vp9/encoder/vp9_encodeframe.h"
 #include "vp9/encoder/vp9_encoder.h"
 #include "vp9/encoder/vp9_ethread.h"
+#include "vp9/encoder/vp9_firstpass.h"
+#include "vp9/encoder/vp9_multi_thread.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 
 static void accumulate_rd_opt(ThreadData *td, ThreadData *td_t) {
@@ -64,15 +66,11 @@ static int get_max_tile_cols(VP9_COMP *cpi) {
   return (1 << log2_tile_cols);
 }
 
-void vp9_encode_tiles_mt(VP9_COMP *cpi) {
+static void create_enc_workers(VP9_COMP *cpi, int num_workers) {
   VP9_COMMON *const cm = &cpi->common;
-  const int tile_cols = 1 << cm->log2_tile_cols;
   const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
-  const int num_workers = VPXMIN(cpi->oxcf.max_threads, tile_cols);
   int i;
 
-  vp9_init_tile_data(cpi);
-
   // Only run once to create threads and allocate thread data.
   if (cpi->num_workers == 0) {
     int allocated_workers = num_workers;
@@ -123,19 +121,57 @@ void vp9_encode_tiles_mt(VP9_COMP *cpi) {
         thread_data->cpi = cpi;
         thread_data->td = &cpi->td;
       }
-
       winterface->sync(worker);
     }
   }
+}
+
+static void launch_enc_workers(VP9_COMP *cpi, VPxWorkerHook hook, void *data2,
+                               int num_workers) {
+  const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
+  int i;
 
   for (i = 0; i < num_workers; i++) {
     VPxWorker *const worker = &cpi->workers[i];
-    EncWorkerData *thread_data;
-
-    worker->hook = (VPxWorkerHook)enc_worker_hook;
+    worker->hook = (VPxWorkerHook)hook;
     worker->data1 = &cpi->tile_thr_data[i];
-    worker->data2 = NULL;
-    thread_data = (EncWorkerData *)worker->data1;
+    worker->data2 = data2;
+  }
+
+  // Encode a frame
+  for (i = 0; i < num_workers; i++) {
+    VPxWorker *const worker = &cpi->workers[i];
+    EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
+
+    // Set the starting tile for each thread.
+    thread_data->start = i;
+
+    if (i == cpi->num_workers - 1)
+      winterface->execute(worker);
+    else
+      winterface->launch(worker);
+  }
+
+  // Encoding ends.
+  for (i = 0; i < num_workers; i++) {
+    VPxWorker *const worker = &cpi->workers[i];
+    winterface->sync(worker);
+  }
+}
+
+void vp9_encode_tiles_mt(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  const int num_workers = VPXMIN(cpi->oxcf.max_threads, tile_cols);
+  int i;
+
+  vp9_init_tile_data(cpi);
+
+  create_enc_workers(cpi, num_workers);
+
+  for (i = 0; i < num_workers; i++) {
+    EncWorkerData *thread_data;
+    thread_data = &cpi->tile_thr_data[i];
 
     // Before encoding a frame, copy the thread data from cpi.
     if (thread_data->td != &cpi->td) {
@@ -165,34 +201,266 @@ void vp9_encode_tiles_mt(VP9_COMP *cpi) {
     }
   }
 
-  // Encode a frame
+  launch_enc_workers(cpi, (VPxWorkerHook)enc_worker_hook, NULL, num_workers);
+
   for (i = 0; i < num_workers; i++) {
     VPxWorker *const worker = &cpi->workers[i];
     EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
 
-    // Set the starting tile for each thread.
-    thread_data->start = i;
+    // Accumulate counters.
+    if (i < cpi->num_workers - 1) {
+      vp9_accumulate_frame_counts(&cm->counts, thread_data->td->counts, 0);
+      accumulate_rd_opt(&cpi->td, thread_data->td);
+    }
+  }
+}
 
-    if (i == cpi->num_workers - 1)
-      winterface->execute(worker);
-    else
-      winterface->launch(worker);
+static void accumulate_fp_tile_stat(TileDataEnc *tile_data,
+                                    TileDataEnc *tile_data_t) {
+  tile_data->fp_data.intra_factor += tile_data_t->fp_data.intra_factor;
+  tile_data->fp_data.brightness_factor +=
+      tile_data_t->fp_data.brightness_factor;
+  tile_data->fp_data.coded_error += tile_data_t->fp_data.coded_error;
+  tile_data->fp_data.sr_coded_error += tile_data_t->fp_data.sr_coded_error;
+  tile_data->fp_data.frame_noise_energy +=
+      tile_data_t->fp_data.frame_noise_energy;
+  tile_data->fp_data.intra_error += tile_data_t->fp_data.intra_error;
+  tile_data->fp_data.intercount += tile_data_t->fp_data.intercount;
+  tile_data->fp_data.second_ref_count += tile_data_t->fp_data.second_ref_count;
+  tile_data->fp_data.neutral_count += tile_data_t->fp_data.neutral_count;
+  tile_data->fp_data.intra_skip_count += tile_data_t->fp_data.intra_skip_count;
+  tile_data->fp_data.mvcount += tile_data_t->fp_data.mvcount;
+  tile_data->fp_data.sum_mvr += tile_data_t->fp_data.sum_mvr;
+  tile_data->fp_data.sum_mvr_abs += tile_data_t->fp_data.sum_mvr_abs;
+  tile_data->fp_data.sum_mvc += tile_data_t->fp_data.sum_mvc;
+  tile_data->fp_data.sum_mvc_abs += tile_data_t->fp_data.sum_mvc_abs;
+  tile_data->fp_data.sum_mvrs += tile_data_t->fp_data.sum_mvrs;
+  tile_data->fp_data.sum_mvcs += tile_data_t->fp_data.sum_mvcs;
+  tile_data->fp_data.sum_in_vectors += tile_data_t->fp_data.sum_in_vectors;
+  tile_data->fp_data.intra_smooth_count +=
+      tile_data_t->fp_data.intra_smooth_count;
+  tile_data->fp_data.image_data_start_row =
+      VPXMIN(tile_data->fp_data.image_data_start_row,
+             tile_data_t->fp_data.image_data_start_row) == INVALID_ROW
+          ? VPXMAX(tile_data->fp_data.image_data_start_row,
+                   tile_data_t->fp_data.image_data_start_row)
+          : VPXMIN(tile_data->fp_data.image_data_start_row,
+                   tile_data_t->fp_data.image_data_start_row);
+}
+
+// Allocate memory for row synchronization
+void vp9_row_mt_sync_mem_alloc(VP9RowMTSync *row_mt_sync, VP9_COMMON *cm,
+                               int rows) {
+  row_mt_sync->rows = rows;
+#if CONFIG_MULTITHREAD
+  {
+    int i;
+
+    CHECK_MEM_ERROR(cm, row_mt_sync->mutex_,
+                    vpx_malloc(sizeof(*row_mt_sync->mutex_) * rows));
+    if (row_mt_sync->mutex_) {
+      for (i = 0; i < rows; ++i) {
+        pthread_mutex_init(&row_mt_sync->mutex_[i], NULL);
+      }
+    }
+
+    CHECK_MEM_ERROR(cm, row_mt_sync->cond_,
+                    vpx_malloc(sizeof(*row_mt_sync->cond_) * rows));
+    if (row_mt_sync->cond_) {
+      for (i = 0; i < rows; ++i) {
+        pthread_cond_init(&row_mt_sync->cond_[i], NULL);
+      }
+    }
   }
+#endif  // CONFIG_MULTITHREAD
 
-  // Encoding ends.
-  for (i = 0; i < num_workers; i++) {
-    VPxWorker *const worker = &cpi->workers[i];
-    winterface->sync(worker);
+  CHECK_MEM_ERROR(cm, row_mt_sync->cur_col,
+                  vpx_malloc(sizeof(*row_mt_sync->cur_col) * rows));
+
+  // Set up nsync.
+  row_mt_sync->sync_range = 1;
+}
+
+// Deallocate row based multi-threading synchronization related mutex and data
+void vp9_row_mt_sync_mem_dealloc(VP9RowMTSync *row_mt_sync) {
+  if (row_mt_sync != NULL) {
+#if CONFIG_MULTITHREAD
+    int i;
+
+    if (row_mt_sync->mutex_ != NULL) {
+      for (i = 0; i < row_mt_sync->rows; ++i) {
+        pthread_mutex_destroy(&row_mt_sync->mutex_[i]);
+      }
+      vpx_free(row_mt_sync->mutex_);
+    }
+    if (row_mt_sync->cond_ != NULL) {
+      for (i = 0; i < row_mt_sync->rows; ++i) {
+        pthread_cond_destroy(&row_mt_sync->cond_[i]);
+      }
+      vpx_free(row_mt_sync->cond_);
+    }
+#endif  // CONFIG_MULTITHREAD
+    vpx_free(row_mt_sync->cur_col);
+    // clear the structure as the source of this call may be dynamic change
+    // in tiles in which case this call will be followed by an _alloc()
+    // which may fail.
+    vp9_zero(*row_mt_sync);
   }
+}
+
+void vp9_row_mt_sync_read(VP9RowMTSync *const row_mt_sync, int r, int c) {
+#if CONFIG_MULTITHREAD
+  const int nsync = row_mt_sync->sync_range;
+
+  if (r && !(c & (nsync - 1))) {
+    pthread_mutex_t *const mutex = &row_mt_sync->mutex_[r - 1];
+    pthread_mutex_lock(mutex);
+
+    while (c > row_mt_sync->cur_col[r - 1] - nsync) {
+      pthread_cond_wait(&row_mt_sync->cond_[r - 1], mutex);
+    }
+    pthread_mutex_unlock(mutex);
+  }
+#else
+  (void)row_mt_sync;
+  (void)r;
+  (void)c;
+#endif  // CONFIG_MULTITHREAD
+}
+
+void vp9_row_mt_sync_read_dummy(VP9RowMTSync *const row_mt_sync, int r, int c) {
+  (void)row_mt_sync;
+  (void)r;
+  (void)c;
+  return;
+}
+
+void vp9_row_mt_sync_write(VP9RowMTSync *const row_mt_sync, int r, int c,
+                           const int cols) {
+#if CONFIG_MULTITHREAD
+  const int nsync = row_mt_sync->sync_range;
+  int cur;
+  // Only signal when there are enough filtered SB for next row to run.
+  int sig = 1;
+
+  if (c < cols - 1) {
+    cur = c;
+    if (c % nsync) sig = 0;
+  } else {
+    cur = cols + nsync;
+  }
+
+  if (sig) {
+    pthread_mutex_lock(&row_mt_sync->mutex_[r]);
+
+    row_mt_sync->cur_col[r] = cur;
+
+    pthread_cond_signal(&row_mt_sync->cond_[r]);
+    pthread_mutex_unlock(&row_mt_sync->mutex_[r]);
+  }
+#else
+  (void)row_mt_sync;
+  (void)r;
+  (void)c;
+  (void)cols;
+#endif  // CONFIG_MULTITHREAD
+}
+
+void vp9_row_mt_sync_write_dummy(VP9RowMTSync *const row_mt_sync, int r, int c,
+                                 const int cols) {
+  (void)row_mt_sync;
+  (void)r;
+  (void)c;
+  (void)cols;
+  return;
+}
+
+static int first_pass_worker_hook(EncWorkerData *const thread_data,
+                                  MultiThreadHandle *multi_thread_ctxt) {
+  VP9_COMP *const cpi = thread_data->cpi;
+  const VP9_COMMON *const cm = &cpi->common;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  int tile_row, tile_col;
+  TileDataEnc *this_tile;
+  int end_of_frame;
+  int thread_id = thread_data->thread_id;
+  int cur_tile_id = multi_thread_ctxt->thread_id_to_tile_id[thread_id];
+  JobNode *proc_job = NULL;
+  FIRSTPASS_DATA fp_acc_data;
+  MV zero_mv = { 0, 0 };
+  MV best_ref_mv;
+  int mb_row;
+
+  end_of_frame = 0;
+  while (0 == end_of_frame) {
+    // Get the next job in the queue
+    proc_job =
+        (JobNode *)vp9_enc_grp_get_next_job(multi_thread_ctxt, cur_tile_id);
+    if (NULL == proc_job) {
+      // Query for the status of other tiles
+      end_of_frame = vp9_get_tiles_proc_status(
+          multi_thread_ctxt, thread_data->tile_completion_status, &cur_tile_id,
+          tile_cols);
+    } else {
+      tile_col = proc_job->tile_col_id;
+      tile_row = proc_job->tile_row_id;
+
+      this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
+      mb_row = proc_job->vert_unit_row_num;
+
+      best_ref_mv = zero_mv;
+      vp9_zero(fp_acc_data);
+      fp_acc_data.image_data_start_row = INVALID_ROW;
+      vp9_first_pass_encode_tile_mb_row(cpi, thread_data->td, &fp_acc_data,
+                                        this_tile, &best_ref_mv, mb_row);
+    }
+  }
+  return 0;
+}
+
+void vp9_encode_fp_row_mt(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  const int tile_rows = 1 << cm->log2_tile_rows;
+  MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt;
+  TileDataEnc *first_tile_col;
+  int num_workers = VPXMAX(cpi->oxcf.max_threads, 1);
+  int i;
+
+  if (multi_thread_ctxt->allocated_tile_cols < tile_cols ||
+      multi_thread_ctxt->allocated_tile_rows < tile_rows ||
+      multi_thread_ctxt->allocated_vert_unit_rows < cm->mb_rows) {
+    vp9_row_mt_mem_dealloc(cpi);
+    vp9_init_tile_data(cpi);
+    vp9_row_mt_mem_alloc(cpi);
+  } else {
+    vp9_init_tile_data(cpi);
+  }
+
+  create_enc_workers(cpi, num_workers);
+
+  vp9_assign_tile_to_thread(multi_thread_ctxt, tile_cols, cpi->num_workers);
+
+  vp9_prepare_job_queue(cpi, FIRST_PASS_JOB);
+
+  vp9_multi_thread_tile_init(cpi);
 
   for (i = 0; i < num_workers; i++) {
-    VPxWorker *const worker = &cpi->workers[i];
-    EncWorkerData *const thread_data = (EncWorkerData *)worker->data1;
+    EncWorkerData *thread_data;
+    thread_data = &cpi->tile_thr_data[i];
 
-    // Accumulate counters.
-    if (i < cpi->num_workers - 1) {
-      vp9_accumulate_frame_counts(&cm->counts, thread_data->td->counts, 0);
-      accumulate_rd_opt(&cpi->td, thread_data->td);
+    // Before encoding a frame, copy the thread data from cpi.
+    if (thread_data->td != &cpi->td) {
+      thread_data->td->mb = cpi->td.mb;
     }
   }
+
+  launch_enc_workers(cpi, (VPxWorkerHook)first_pass_worker_hook,
+                     multi_thread_ctxt, num_workers);
+
+  first_tile_col = &cpi->tile_data[0];
+  for (i = 1; i < tile_cols; i++) {
+    TileDataEnc *this_tile = &cpi->tile_data[i];
+    accumulate_fp_tile_stat(first_tile_col, this_tile);
+  }
 }
diff --git a/vp9/encoder/vp9_ethread.h b/vp9/encoder/vp9_ethread.h
index 1efa4dc..968e500 100644
--- a/vp9/encoder/vp9_ethread.h
+++ b/vp9/encoder/vp9_ethread.h
@@ -15,6 +15,10 @@
 extern "C" {
 #endif
 
+#define MAX_NUM_TILE_COLS (1 << 6)
+#define MAX_NUM_TILE_ROWS 4
+#define MAX_NUM_THREADS 80
+
 struct VP9_COMP;
 struct ThreadData;
 
@@ -22,10 +26,41 @@ typedef struct EncWorkerData {
   struct VP9_COMP *cpi;
   struct ThreadData *td;
   int start;
+  int thread_id;
+  int tile_completion_status[MAX_NUM_TILE_COLS];
 } EncWorkerData;
 
+// Encoder row synchronization
+typedef struct VP9RowMTSyncData {
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *mutex_;
+  pthread_cond_t *cond_;
+#endif
+  // Allocate memory to store the sb/mb block index in each row.
+  int *cur_col;
+  int sync_range;
+  int rows;
+} VP9RowMTSync;
+
 void vp9_encode_tiles_mt(struct VP9_COMP *cpi);
 
+void vp9_encode_fp_row_mt(struct VP9_COMP *cpi);
+
+void vp9_row_mt_sync_read(VP9RowMTSync *const row_mt_sync, int r, int c);
+void vp9_row_mt_sync_write(VP9RowMTSync *const row_mt_sync, int r, int c,
+                           const int cols);
+
+void vp9_row_mt_sync_read_dummy(VP9RowMTSync *const row_mt_sync, int r, int c);
+void vp9_row_mt_sync_write_dummy(VP9RowMTSync *const row_mt_sync, int r, int c,
+                                 const int cols);
+
+// Allocate memory for row based multi-threading synchronization.
+void vp9_row_mt_sync_mem_alloc(VP9RowMTSync *row_mt_sync, struct VP9Common *cm,
+                               int rows);
+
+// Deallocate row based multi-threading synchronization related mutex and data.
+void vp9_row_mt_sync_mem_dealloc(VP9RowMTSync *row_mt_sync);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 72e9ac7..59dd536 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -31,6 +31,7 @@
 #include "vp9/encoder/vp9_encodemb.h"
 #include "vp9/encoder/vp9_encodemv.h"
 #include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_ethread.h"
 #include "vp9/encoder/vp9_extend.h"
 #include "vp9/encoder/vp9_firstpass.h"
 #include "vp9/encoder/vp9_mcomp.h"
@@ -646,37 +647,150 @@ static int fp_estimate_block_noise(MACROBLOCK *x, BLOCK_SIZE bsize) {
   return block_noise << 2;  // Scale << 2 to account for sampling.
 }
 
-#define INVALID_ROW -1
-void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
+#if ENABLE_MT_BIT_MATCH
+static void accumulate_floating_point_stats(VP9_COMP *cpi,
+                                            TileDataEnc *first_tile_col) {
+  VP9_COMMON *const cm = &cpi->common;
   int mb_row, mb_col;
-  MACROBLOCK *const x = &cpi->td.mb;
+  first_tile_col->fp_data.intra_factor = 0;
+  first_tile_col->fp_data.brightness_factor = 0;
+  first_tile_col->fp_data.neutral_count = 0;
+  for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) {
+    for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) {
+      const int mb_index = mb_row * cm->mb_cols + mb_col;
+      first_tile_col->fp_data.intra_factor +=
+          cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_intra_factor;
+      first_tile_col->fp_data.brightness_factor +=
+          cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_brightness_factor;
+      first_tile_col->fp_data.neutral_count +=
+          cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_neutral_count;
+    }
+  }
+}
+#endif
+
+static void first_pass_stat_calc(VP9_COMP *cpi, FIRSTPASS_STATS *fps,
+                                 FIRSTPASS_DATA *fp_acc_data) {
+  VP9_COMMON *const cm = &cpi->common;
+  // The minimum error here insures some bit allocation to frames even
+  // in static regions. The allocation per MB declines for larger formats
+  // where the typical "real" energy per MB also falls.
+  // Initial estimate here uses sqrt(mbs) to define the min_err, where the
+  // number of mbs is proportional to the image area.
+  const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE) ? cpi->initial_mbs
+                                                             : cpi->common.MBs;
+  const double min_err = 200 * sqrt(num_mbs);
+
+  // Clamp the image start to rows/2. This number of rows is discarded top
+  // and bottom as dead data so rows / 2 means the frame is blank.
+  if ((fp_acc_data->image_data_start_row > cm->mb_rows / 2) ||
+      (fp_acc_data->image_data_start_row == INVALID_ROW)) {
+    fp_acc_data->image_data_start_row = cm->mb_rows / 2;
+  }
+  // Exclude any image dead zone
+  if (fp_acc_data->image_data_start_row > 0) {
+    fp_acc_data->intra_skip_count =
+        VPXMAX(0, fp_acc_data->intra_skip_count -
+                      (fp_acc_data->image_data_start_row * cm->mb_cols * 2));
+  }
+
+  fp_acc_data->intra_factor = fp_acc_data->intra_factor / (double)num_mbs;
+  fp_acc_data->brightness_factor =
+      fp_acc_data->brightness_factor / (double)num_mbs;
+  fps->weight = fp_acc_data->intra_factor * fp_acc_data->brightness_factor;
+
+  fps->frame = cm->current_video_frame;
+  fps->spatial_layer_id = cpi->svc.spatial_layer_id;
+  fps->coded_error = (double)(fp_acc_data->coded_error >> 8) + min_err;
+  fps->sr_coded_error = (double)(fp_acc_data->sr_coded_error >> 8) + min_err;
+  fps->intra_error = (double)(fp_acc_data->intra_error >> 8) + min_err;
+  fps->frame_noise_energy =
+      (double)(fp_acc_data->frame_noise_energy) / (double)num_mbs;
+  fps->count = 1.0;
+  fps->pcnt_inter = (double)(fp_acc_data->intercount) / num_mbs;
+  fps->pcnt_second_ref = (double)(fp_acc_data->second_ref_count) / num_mbs;
+  fps->pcnt_neutral = (double)(fp_acc_data->neutral_count) / num_mbs;
+  fps->intra_skip_pct = (double)(fp_acc_data->intra_skip_count) / num_mbs;
+  fps->intra_smooth_pct = (double)(fp_acc_data->intra_smooth_count) / num_mbs;
+  fps->inactive_zone_rows = (double)(fp_acc_data->image_data_start_row);
+  // Currently set to 0 as most issues relate to letter boxing.
+  fps->inactive_zone_cols = (double)0;
+
+  if (fp_acc_data->mvcount > 0) {
+    fps->MVr = (double)(fp_acc_data->sum_mvr) / fp_acc_data->mvcount;
+    fps->mvr_abs = (double)(fp_acc_data->sum_mvr_abs) / fp_acc_data->mvcount;
+    fps->MVc = (double)(fp_acc_data->sum_mvc) / fp_acc_data->mvcount;
+    fps->mvc_abs = (double)(fp_acc_data->sum_mvc_abs) / fp_acc_data->mvcount;
+    fps->MVrv = ((double)(fp_acc_data->sum_mvrs) -
+                 ((double)(fp_acc_data->sum_mvr) * (fp_acc_data->sum_mvr) /
+                  fp_acc_data->mvcount)) /
+                fp_acc_data->mvcount;
+    fps->MVcv = ((double)(fp_acc_data->sum_mvcs) -
+                 ((double)(fp_acc_data->sum_mvc) * (fp_acc_data->sum_mvc) /
+                  fp_acc_data->mvcount)) /
+                fp_acc_data->mvcount;
+    fps->mv_in_out_count =
+        (double)(fp_acc_data->sum_in_vectors) / (fp_acc_data->mvcount * 2);
+    fps->pcnt_motion = (double)(fp_acc_data->mvcount) / num_mbs;
+  } else {
+    fps->MVr = 0.0;
+    fps->mvr_abs = 0.0;
+    fps->MVc = 0.0;
+    fps->mvc_abs = 0.0;
+    fps->MVrv = 0.0;
+    fps->MVcv = 0.0;
+    fps->mv_in_out_count = 0.0;
+    fps->pcnt_motion = 0.0;
+  }
+}
+
+static void accumulate_fp_mb_row_stat(TileDataEnc *this_tile,
+                                      FIRSTPASS_DATA *fp_acc_data) {
+  this_tile->fp_data.intra_factor += fp_acc_data->intra_factor;
+  this_tile->fp_data.brightness_factor += fp_acc_data->brightness_factor;
+  this_tile->fp_data.coded_error += fp_acc_data->coded_error;
+  this_tile->fp_data.sr_coded_error += fp_acc_data->sr_coded_error;
+  this_tile->fp_data.frame_noise_energy += fp_acc_data->frame_noise_energy;
+  this_tile->fp_data.intra_error += fp_acc_data->intra_error;
+  this_tile->fp_data.intercount += fp_acc_data->intercount;
+  this_tile->fp_data.second_ref_count += fp_acc_data->second_ref_count;
+  this_tile->fp_data.neutral_count += fp_acc_data->neutral_count;
+  this_tile->fp_data.intra_skip_count += fp_acc_data->intra_skip_count;
+  this_tile->fp_data.mvcount += fp_acc_data->mvcount;
+  this_tile->fp_data.sum_mvr += fp_acc_data->sum_mvr;
+  this_tile->fp_data.sum_mvr_abs += fp_acc_data->sum_mvr_abs;
+  this_tile->fp_data.sum_mvc += fp_acc_data->sum_mvc;
+  this_tile->fp_data.sum_mvc_abs += fp_acc_data->sum_mvc_abs;
+  this_tile->fp_data.sum_mvrs += fp_acc_data->sum_mvrs;
+  this_tile->fp_data.sum_mvcs += fp_acc_data->sum_mvcs;
+  this_tile->fp_data.sum_in_vectors += fp_acc_data->sum_in_vectors;
+  this_tile->fp_data.intra_smooth_count += fp_acc_data->intra_smooth_count;
+  this_tile->fp_data.image_data_start_row =
+      VPXMIN(this_tile->fp_data.image_data_start_row,
+             fp_acc_data->image_data_start_row) == INVALID_ROW
+          ? VPXMAX(this_tile->fp_data.image_data_start_row,
+                   fp_acc_data->image_data_start_row)
+          : VPXMIN(this_tile->fp_data.image_data_start_row,
+                   fp_acc_data->image_data_start_row);
+}
+
+void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td,
+                                       FIRSTPASS_DATA *fp_acc_data,
+                                       TileDataEnc *tile_data, MV *best_ref_mv,
+                                       int mb_row) {
+  int mb_col;
+  MACROBLOCK *const x = &td->mb;
   VP9_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
-  TileInfo tile;
+  TileInfo tile = tile_data->tile_info;
   struct macroblock_plane *const p = x->plane;
   struct macroblockd_plane *const pd = xd->plane;
-  const PICK_MODE_CONTEXT *ctx = &cpi->td.pc_root->none;
-  int i;
+  const PICK_MODE_CONTEXT *ctx = &td->pc_root->none;
+  int i, c;
+  int num_mb_cols = get_num_cols(tile_data->tile_info, 1);
 
   int recon_yoffset, recon_uvoffset;
-  int64_t intra_error = 0;
-  int64_t coded_error = 0;
-  int64_t sr_coded_error = 0;
-  int64_t frame_noise_energy = 0;
-
-  int sum_mvr = 0, sum_mvc = 0;
-  int sum_mvr_abs = 0, sum_mvc_abs = 0;
-  int64_t sum_mvrs = 0, sum_mvcs = 0;
-  int mvcount = 0;
-  int intercount = 0;
-  int second_ref_count = 0;
   const int intrapenalty = INTRA_MODE_PENALTY;
-  double neutral_count;
-  int intra_skip_count = 0;
-  int intra_smooth_count = 0;
-  int image_data_start_row = INVALID_ROW;
-  int sum_in_vectors = 0;
-  TWO_PASS *twopass = &cpi->twopass;
   const MV zero_mv = { 0, 0 };
   int recon_y_stride, recon_uv_stride, uv_mb_height;
 
@@ -688,50 +802,13 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
   LAYER_CONTEXT *const lc =
       is_two_pass_svc(cpi) ? &cpi->svc.layer_context[cpi->svc.spatial_layer_id]
                            : NULL;
-  double intra_factor;
-  double brightness_factor;
-  BufferPool *const pool = cm->buffer_pool;
   MODE_INFO mi_above, mi_left;
 
   // First pass code requires valid last and new frame buffers.
   assert(new_yv12 != NULL);
   assert((lc != NULL) || frame_is_intra_only(cm) || (lst_yv12 != NULL));
 
-#if CONFIG_FP_MB_STATS
-  if (cpi->use_fp_mb_stats) {
-    vp9_zero_array(cpi->twopass.frame_mb_stats_buf, cm->initial_mbs);
-  }
-#endif
-
-  vpx_clear_system_state();
-
-  intra_factor = 0.0;
-  brightness_factor = 0.0;
-  neutral_count = 0.0;
-
-  set_first_pass_params(cpi);
-  vp9_set_quantizer(cm, find_fp_qindex(cm->bit_depth));
-
   if (lc != NULL) {
-    twopass = &lc->twopass;
-
-    cpi->lst_fb_idx = cpi->svc.spatial_layer_id;
-    cpi->ref_frame_flags = VP9_LAST_FLAG;
-
-    if (cpi->svc.number_spatial_layers + cpi->svc.spatial_layer_id <
-        REF_FRAMES) {
-      cpi->gld_fb_idx =
-          cpi->svc.number_spatial_layers + cpi->svc.spatial_layer_id;
-      cpi->ref_frame_flags |= VP9_GOLD_FLAG;
-      cpi->refresh_golden_frame = (lc->current_video_frame_in_layer == 0);
-    } else {
-      cpi->refresh_golden_frame = 0;
-    }
-
-    if (lc->current_video_frame_in_layer == 0) cpi->ref_frame_flags = 0;
-
-    vp9_scale_references(cpi);
-
     // Use either last frame or alt frame for motion search.
     if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
       first_ref_buf = vp9_get_scaled_ref_frame(cpi, LAST_FRAME);
@@ -747,28 +824,11 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
     } else {
       gld_yv12 = NULL;
     }
-
-    set_ref_ptrs(cm, xd,
-                 (cpi->ref_frame_flags & VP9_LAST_FLAG) ? LAST_FRAME : NONE,
-                 (cpi->ref_frame_flags & VP9_GOLD_FLAG) ? GOLDEN_FRAME : NONE);
-
-    cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source,
-                                        &cpi->scaled_source, 0);
   }
 
-  vp9_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
-
-  vp9_setup_src_planes(x, cpi->Source, 0, 0);
-  vp9_setup_dst_planes(xd->plane, new_yv12, 0, 0);
-
-  if (!frame_is_intra_only(cm)) {
-    vp9_setup_pre_planes(xd, 0, first_ref_buf, 0, 0, NULL);
-  }
-
-  xd->mi = cm->mi_grid_visible;
-  xd->mi[0] = cm->mi;
-
-  vp9_frame_init_quantizer(cpi);
+  xd->mi = cm->mi_grid_visible + xd->mi_stride * (mb_row << 1) +
+           (tile.mi_col_start >> 1);
+  xd->mi[0] = cm->mi + xd->mi_stride * (mb_row << 1) + (tile.mi_col_start >> 1);
 
   for (i = 0; i < MAX_MB_PLANE; ++i) {
     p[i].coeff = ctx->coeff_pbuf[i][1];
@@ -776,276 +836,363 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
     pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1];
     p[i].eobs = ctx->eobs_pbuf[i][1];
   }
-  x->skip_recode = 0;
-
-  vp9_init_mv_probs(cm);
-  vp9_initialize_rd_consts(cpi);
-
-  // Tiling is ignored in the first pass.
-  vp9_tile_init(&tile, cm, 0, 0);
 
   recon_y_stride = new_yv12->y_stride;
   recon_uv_stride = new_yv12->uv_stride;
   uv_mb_height = 16 >> (new_yv12->y_height > new_yv12->uv_height);
 
-  for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) {
-    MV best_ref_mv = { 0, 0 };
+  // Reset above block coeffs.
+  recon_yoffset =
+      (mb_row * recon_y_stride * 16) + (tile.mi_col_start >> 1) * 16;
+  recon_uvoffset = (mb_row * recon_uv_stride * uv_mb_height) +
+                   (tile.mi_col_start >> 1) * uv_mb_height;
+
+  // Set up limit values for motion vectors to prevent them extending
+  // outside the UMV borders.
+  x->mv_limits.row_min = -((mb_row * 16) + BORDER_MV_PIXELS_B16);
+  x->mv_limits.row_max =
+      ((cm->mb_rows - 1 - mb_row) * 16) + BORDER_MV_PIXELS_B16;
+
+  for (mb_col = tile.mi_col_start >> 1, c = 0; mb_col < (tile.mi_col_end >> 1);
+       ++mb_col, c++) {
+    int this_error;
+    int this_intra_error;
+    const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
+    const BLOCK_SIZE bsize = get_bsize(cm, mb_row, mb_col);
+    double log_intra;
+    int level_sample;
+#if ENABLE_MT_BIT_MATCH
+    const int mb_index = mb_row * cm->mb_cols + mb_col;
+#endif
 
-    // Reset above block coeffs.
-    recon_yoffset = (mb_row * recon_y_stride * 16);
-    recon_uvoffset = (mb_row * recon_uv_stride * uv_mb_height);
+#if CONFIG_FP_MB_STATS
+    const int mb_index = mb_row * cm->mb_cols + mb_col;
+#endif
 
-    // Set up limit values for motion vectors to prevent them extending
-    // outside the UMV borders.
-    x->mv_limits.row_min = -((mb_row * 16) + BORDER_MV_PIXELS_B16);
-    x->mv_limits.row_max =
-        ((cm->mb_rows - 1 - mb_row) * 16) + BORDER_MV_PIXELS_B16;
+    (*(cpi->row_mt_sync_read_ptr))(&tile_data->row_mt_sync, mb_row, c - 1);
 
-    for (mb_col = 0; mb_col < cm->mb_cols; ++mb_col) {
-      int this_error;
-      int this_intra_error;
-      const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
-      const BLOCK_SIZE bsize = get_bsize(cm, mb_row, mb_col);
-      double log_intra;
-      int level_sample;
+    // Adjust to the next column of MBs.
+    x->plane[0].src.buf = cpi->Source->y_buffer +
+                          mb_row * 16 * x->plane[0].src.stride + mb_col * 16;
+    x->plane[1].src.buf = cpi->Source->u_buffer +
+                          mb_row * uv_mb_height * x->plane[1].src.stride +
+                          mb_col * uv_mb_height;
+    x->plane[2].src.buf = cpi->Source->v_buffer +
+                          mb_row * uv_mb_height * x->plane[1].src.stride +
+                          mb_col * uv_mb_height;
 
-#if CONFIG_FP_MB_STATS
-      const int mb_index = mb_row * cm->mb_cols + mb_col;
-#endif
+    vpx_clear_system_state();
 
-      vpx_clear_system_state();
-
-      xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset;
-      xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset;
-      xd->plane[2].dst.buf = new_yv12->v_buffer + recon_uvoffset;
-      xd->mi[0]->sb_type = bsize;
-      xd->mi[0]->ref_frame[0] = INTRA_FRAME;
-      set_mi_row_col(xd, &tile, mb_row << 1, num_8x8_blocks_high_lookup[bsize],
-                     mb_col << 1, num_8x8_blocks_wide_lookup[bsize],
-                     cm->mi_rows, cm->mi_cols);
-      // Are edges available for intra prediction?
-      // Since the firstpass does not populate the mi_grid_visible,
-      // above_mi/left_mi must be overwritten with a nonzero value when edges
-      // are available.  Required by vp9_predict_intra_block().
-      xd->above_mi = (mb_row != 0) ? &mi_above : NULL;
-      xd->left_mi = (mb_col > tile.mi_col_start) ? &mi_left : NULL;
-
-      // Do intra 16x16 prediction.
-      x->skip_encode = 0;
-      xd->mi[0]->mode = DC_PRED;
-      xd->mi[0]->tx_size =
-          use_dc_pred ? (bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4;
-
-      // Set the 16x16 src_diff block to zero, which ensures correct this_error
-      // calculation for block sizes smaller than 16x16.
-      vp9_zero_array(x->plane[0].src_diff, 256);
-      vp9_encode_intra_block_plane(x, bsize, 0, 0);
-      this_error = vpx_get_mb_ss(x->plane[0].src_diff);
-      this_intra_error = this_error;
-
-      // Keep a record of blocks that have very low intra error residual
-      // (i.e. are in effect completely flat and untextured in the intra
-      // domain). In natural videos this is uncommon, but it is much more
-      // common in animations, graphics and screen content, so may be used
-      // as a signal to detect these types of content.
-      if (this_error < get_ul_intra_threshold(cm)) {
-        ++intra_skip_count;
-      } else if ((mb_col > 0) && (image_data_start_row == INVALID_ROW)) {
-        image_data_start_row = mb_row;
-      }
+    xd->plane[0].dst.buf = new_yv12->y_buffer + recon_yoffset;
+    xd->plane[1].dst.buf = new_yv12->u_buffer + recon_uvoffset;
+    xd->plane[2].dst.buf = new_yv12->v_buffer + recon_uvoffset;
+    xd->mi[0]->sb_type = bsize;
+    xd->mi[0]->ref_frame[0] = INTRA_FRAME;
+    set_mi_row_col(xd, &tile, mb_row << 1, num_8x8_blocks_high_lookup[bsize],
+                   mb_col << 1, num_8x8_blocks_wide_lookup[bsize], cm->mi_rows,
+                   cm->mi_cols);
+    // Are edges available for intra prediction?
+    // Since the firstpass does not populate the mi_grid_visible,
+    // above_mi/left_mi must be overwritten with a nonzero value when edges
+    // are available.  Required by vp9_predict_intra_block().
+    xd->above_mi = (mb_row != 0) ? &mi_above : NULL;
+    xd->left_mi = ((mb_col << 1) > tile.mi_col_start) ? &mi_left : NULL;
+
+    // Do intra 16x16 prediction.
+    x->skip_encode = 0;
+    x->fp_src_pred = 0;
+    // Do intra prediction based on source pixels for tile boundaries
+    if ((mb_col == (tile.mi_col_start >> 1)) && mb_col != 0) {
+      xd->left_mi = &mi_left;
+      x->fp_src_pred = 1;
+    }
+    xd->mi[0]->mode = DC_PRED;
+    xd->mi[0]->tx_size =
+        use_dc_pred ? (bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4;
+    // Fix - zero the 16x16 block first. This ensures correct this_error for
+    // block sizes smaller than 16x16.
+    vp9_zero_array(x->plane[0].src_diff, 256);
+    vp9_encode_intra_block_plane(x, bsize, 0, 0);
+    this_error = vpx_get_mb_ss(x->plane[0].src_diff);
+    this_intra_error = this_error;
+
+    // Keep a record of blocks that have very low intra error residual
+    // (i.e. are in effect completely flat and untextured in the intra
+    // domain). In natural videos this is uncommon, but it is much more
+    // common in animations, graphics and screen content, so may be used
+    // as a signal to detect these types of content.
+    if (this_error < get_ul_intra_threshold(cm)) {
+      ++(fp_acc_data->intra_skip_count);
+    } else if ((mb_col > 0) &&
+               (fp_acc_data->image_data_start_row == INVALID_ROW)) {
+      fp_acc_data->image_data_start_row = mb_row;
+    }
 
-      // Blocks that are mainly smooth in the intra domain.
-      // Some special accounting for CQ but also these are better for testing
-      // noise levels.
-      if (this_error < get_smooth_intra_threshold(cm)) {
-        ++intra_smooth_count;
-      }
+    // Blocks that are mainly smooth in the intra domain.
+    // Some special accounting for CQ but also these are better for testing
+    // noise levels.
+    if (this_error < get_smooth_intra_threshold(cm)) {
+      ++(fp_acc_data->intra_smooth_count);
+    }
 
-      // Special case noise measurement for first frame.
-      if (cm->current_video_frame == 0) {
-        if (this_intra_error < scale_sse_threshold(cm, LOW_I_THRESH)) {
-          frame_noise_energy += fp_estimate_block_noise(x, bsize);
-        } else {
-          frame_noise_energy += (int64_t)SECTION_NOISE_DEF;
-        }
+    // Special case noise measurement for first frame.
+    if (cm->current_video_frame == 0) {
+      if (this_intra_error < scale_sse_threshold(cm, LOW_I_THRESH)) {
+        fp_acc_data->frame_noise_energy += fp_estimate_block_noise(x, bsize);
+      } else {
+        fp_acc_data->frame_noise_energy += (int64_t)SECTION_NOISE_DEF;
       }
+    }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-      if (cm->use_highbitdepth) {
-        switch (cm->bit_depth) {
-          case VPX_BITS_8: break;
-          case VPX_BITS_10: this_error >>= 4; break;
-          case VPX_BITS_12: this_error >>= 8; break;
-          default:
-            assert(0 &&
-                   "cm->bit_depth should be VPX_BITS_8, "
-                   "VPX_BITS_10 or VPX_BITS_12");
-            return;
-        }
+    if (cm->use_highbitdepth) {
+      switch (cm->bit_depth) {
+        case VPX_BITS_8: break;
+        case VPX_BITS_10: this_error >>= 4; break;
+        case VPX_BITS_12: this_error >>= 8; break;
+        default:
+          assert(0 &&
+                 "cm->bit_depth should be VPX_BITS_8, "
+                 "VPX_BITS_10 or VPX_BITS_12");
+          return;
       }
+    }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-      vpx_clear_system_state();
-      log_intra = log(this_error + 1.0);
-      if (log_intra < 10.0)
-        intra_factor += 1.0 + ((10.0 - log_intra) * 0.05);
-      else
-        intra_factor += 1.0;
+    vpx_clear_system_state();
+    log_intra = log(this_error + 1.0);
+    if (log_intra < 10.0) {
+      fp_acc_data->intra_factor += 1.0 + ((10.0 - log_intra) * 0.05);
+#if ENABLE_MT_BIT_MATCH
+      cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_intra_factor =
+          1.0 + ((10.0 - log_intra) * 0.05);
+#endif
+    } else {
+      fp_acc_data->intra_factor += 1.0;
+#if ENABLE_MT_BIT_MATCH
+      cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_intra_factor = 1.0;
+#endif
+    }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-      if (cm->use_highbitdepth)
-        level_sample = CONVERT_TO_SHORTPTR(x->plane[0].src.buf)[0];
-      else
-        level_sample = x->plane[0].src.buf[0];
-#else
+    if (cm->use_highbitdepth)
+      level_sample = CONVERT_TO_SHORTPTR(x->plane[0].src.buf)[0];
+    else
       level_sample = x->plane[0].src.buf[0];
+#else
+    level_sample = x->plane[0].src.buf[0];
 #endif
-      if ((level_sample < DARK_THRESH) && (log_intra < 9.0))
-        brightness_factor += 1.0 + (0.01 * (DARK_THRESH - level_sample));
-      else
-        brightness_factor += 1.0;
+    if ((level_sample < DARK_THRESH) && (log_intra < 9.0)) {
+      fp_acc_data->brightness_factor +=
+          1.0 + (0.01 * (DARK_THRESH - level_sample));
+#if ENABLE_MT_BIT_MATCH
+      cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_brightness_factor =
+          1.0 + (0.01 * (DARK_THRESH - level_sample));
+#endif
+    } else {
+      fp_acc_data->brightness_factor += 1.0;
+#if ENABLE_MT_BIT_MATCH
+      cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_brightness_factor = 1.0;
+#endif
+    }
 
-      // Intrapenalty below deals with situations where the intra and inter
-      // error scores are very low (e.g. a plain black frame).
-      // We do not have special cases in first pass for 0,0 and nearest etc so
-      // all inter modes carry an overhead cost estimate for the mv.
-      // When the error score is very low this causes us to pick all or lots of
-      // INTRA modes and throw lots of key frames.
-      // This penalty adds a cost matching that of a 0,0 mv to the intra case.
-      this_error += intrapenalty;
+    // Intrapenalty below deals with situations where the intra and inter
+    // error scores are very low (e.g. a plain black frame).
+    // We do not have special cases in first pass for 0,0 and nearest etc so
+    // all inter modes carry an overhead cost estimate for the mv.
+    // When the error score is very low this causes us to pick all or lots of
+    // INTRA modes and throw lots of key frames.
+    // This penalty adds a cost matching that of a 0,0 mv to the intra case.
+    this_error += intrapenalty;
 
-      // Accumulate the intra error.
-      intra_error += (int64_t)this_error;
+    // Accumulate the intra error.
+    fp_acc_data->intra_error += (int64_t)this_error;
 
 #if CONFIG_FP_MB_STATS
-      if (cpi->use_fp_mb_stats) {
-        // initialization
-        cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
-      }
+    if (cpi->use_fp_mb_stats) {
+      // initialization
+      cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
+    }
 #endif
 
-      // Set up limit values for motion vectors to prevent them extending
-      // outside the UMV borders.
-      x->mv_limits.col_min = -((mb_col * 16) + BORDER_MV_PIXELS_B16);
-      x->mv_limits.col_max =
-          ((cm->mb_cols - 1 - mb_col) * 16) + BORDER_MV_PIXELS_B16;
-
-      // Other than for the first frame do a motion search.
-      if ((lc == NULL && cm->current_video_frame > 0) ||
-          (lc != NULL && lc->current_video_frame_in_layer > 0)) {
-        int tmp_err, motion_error, raw_motion_error;
-        // Assume 0,0 motion with no mv overhead.
-        MV mv = { 0, 0 }, tmp_mv = { 0, 0 };
-        struct buf_2d unscaled_last_source_buf_2d;
-
-        xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
+    // Set up limit values for motion vectors to prevent them extending
+    // outside the UMV borders.
+    x->mv_limits.col_min = -((mb_col * 16) + BORDER_MV_PIXELS_B16);
+    x->mv_limits.col_max =
+        ((cm->mb_cols - 1 - mb_col) * 16) + BORDER_MV_PIXELS_B16;
+
+    // Other than for the first frame do a motion search.
+    if ((lc == NULL && cm->current_video_frame > 0) ||
+        (lc != NULL && lc->current_video_frame_in_layer > 0)) {
+      int tmp_err, motion_error, raw_motion_error;
+      // Assume 0,0 motion with no mv overhead.
+      MV mv = { 0, 0 }, tmp_mv = { 0, 0 };
+      struct buf_2d unscaled_last_source_buf_2d;
+
+      xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
 #if CONFIG_VP9_HIGHBITDEPTH
-        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-          motion_error = highbd_get_prediction_error(
-              bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd);
-        } else {
-          motion_error = get_prediction_error(bsize, &x->plane[0].src,
-                                              &xd->plane[0].pre[0]);
-        }
-#else
+      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+        motion_error = highbd_get_prediction_error(
+            bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd);
+      } else {
         motion_error =
             get_prediction_error(bsize, &x->plane[0].src, &xd->plane[0].pre[0]);
+      }
+#else
+      motion_error =
+          get_prediction_error(bsize, &x->plane[0].src, &xd->plane[0].pre[0]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-        // Compute the motion error of the 0,0 motion using the last source
-        // frame as the reference. Skip the further motion search on
-        // reconstructed frame if this error is small.
-        unscaled_last_source_buf_2d.buf =
-            cpi->unscaled_last_source->y_buffer + recon_yoffset;
-        unscaled_last_source_buf_2d.stride =
-            cpi->unscaled_last_source->y_stride;
+      // Compute the motion error of the 0,0 motion using the last source
+      // frame as the reference. Skip the further motion search on
+      // reconstructed frame if this error is small.
+      unscaled_last_source_buf_2d.buf =
+          cpi->unscaled_last_source->y_buffer + recon_yoffset;
+      unscaled_last_source_buf_2d.stride = cpi->unscaled_last_source->y_stride;
 #if CONFIG_VP9_HIGHBITDEPTH
-        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-          raw_motion_error = highbd_get_prediction_error(
-              bsize, &x->plane[0].src, &unscaled_last_source_buf_2d, xd->bd);
-        } else {
-          raw_motion_error = get_prediction_error(bsize, &x->plane[0].src,
-                                                  &unscaled_last_source_buf_2d);
-        }
-#else
+      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+        raw_motion_error = highbd_get_prediction_error(
+            bsize, &x->plane[0].src, &unscaled_last_source_buf_2d, xd->bd);
+      } else {
         raw_motion_error = get_prediction_error(bsize, &x->plane[0].src,
                                                 &unscaled_last_source_buf_2d);
+      }
+#else
+      raw_motion_error = get_prediction_error(bsize, &x->plane[0].src,
+                                              &unscaled_last_source_buf_2d);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-        // TODO(pengchong): Replace the hard-coded threshold
-        if (raw_motion_error > 25 || lc != NULL) {
-          // Test last reference frame using the previous best mv as the
-          // starting point (best reference) for the search.
-          first_pass_motion_search(cpi, x, &best_ref_mv, &mv, &motion_error);
-
-          // If the current best reference mv is not centered on 0,0 then do a
-          // 0,0 based search as well.
-          if (!is_zero_mv(&best_ref_mv)) {
-            tmp_err = INT_MAX;
-            first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &tmp_err);
-
-            if (tmp_err < motion_error) {
-              motion_error = tmp_err;
-              mv = tmp_mv;
-            }
+      // TODO(pengchong): Replace the hard-coded threshold
+      if (raw_motion_error > 25 || lc != NULL) {
+        // Test last reference frame using the previous best mv as the
+        // starting point (best reference) for the search.
+        first_pass_motion_search(cpi, x, best_ref_mv, &mv, &motion_error);
+
+        // If the current best reference mv is not centered on 0,0 then do a
+        // 0,0 based search as well.
+        if (!is_zero_mv(best_ref_mv)) {
+          tmp_err = INT_MAX;
+          first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &tmp_err);
+
+          if (tmp_err < motion_error) {
+            motion_error = tmp_err;
+            mv = tmp_mv;
           }
+        }
 
-          // Search in an older reference frame.
-          if (((lc == NULL && cm->current_video_frame > 1) ||
-               (lc != NULL && lc->current_video_frame_in_layer > 1)) &&
-              gld_yv12 != NULL) {
-            // Assume 0,0 motion with no mv overhead.
-            int gf_motion_error;
+        // Search in an older reference frame.
+        if (((lc == NULL && cm->current_video_frame > 1) ||
+             (lc != NULL && lc->current_video_frame_in_layer > 1)) &&
+            gld_yv12 != NULL) {
+          // Assume 0,0 motion with no mv overhead.
+          int gf_motion_error;
 
-            xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset;
+          xd->plane[0].pre[0].buf = gld_yv12->y_buffer + recon_yoffset;
 #if CONFIG_VP9_HIGHBITDEPTH
-            if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-              gf_motion_error = highbd_get_prediction_error(
-                  bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd);
-            } else {
-              gf_motion_error = get_prediction_error(bsize, &x->plane[0].src,
-                                                     &xd->plane[0].pre[0]);
-            }
-#else
+          if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+            gf_motion_error = highbd_get_prediction_error(
+                bsize, &x->plane[0].src, &xd->plane[0].pre[0], xd->bd);
+          } else {
             gf_motion_error = get_prediction_error(bsize, &x->plane[0].src,
                                                    &xd->plane[0].pre[0]);
+          }
+#else
+          gf_motion_error = get_prediction_error(bsize, &x->plane[0].src,
+                                                 &xd->plane[0].pre[0]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-            first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv,
-                                     &gf_motion_error);
-
-            if (gf_motion_error < motion_error && gf_motion_error < this_error)
-              ++second_ref_count;
-
-            // Reset to last frame as reference buffer.
-            xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
-            xd->plane[1].pre[0].buf = first_ref_buf->u_buffer + recon_uvoffset;
-            xd->plane[2].pre[0].buf = first_ref_buf->v_buffer + recon_uvoffset;
-
-            // In accumulating a score for the older reference frame take the
-            // best of the motion predicted score and the intra coded error
-            // (just as will be done for) accumulation of "coded_error" for
-            // the last frame.
-            if (gf_motion_error < this_error)
-              sr_coded_error += gf_motion_error;
-            else
-              sr_coded_error += this_error;
-          } else {
-            sr_coded_error += motion_error;
-          }
+          first_pass_motion_search(cpi, x, &zero_mv, &tmp_mv, &gf_motion_error);
+
+          if (gf_motion_error < motion_error && gf_motion_error < this_error)
+            ++(fp_acc_data->second_ref_count);
+
+          // Reset to last frame as reference buffer.
+          xd->plane[0].pre[0].buf = first_ref_buf->y_buffer + recon_yoffset;
+          xd->plane[1].pre[0].buf = first_ref_buf->u_buffer + recon_uvoffset;
+          xd->plane[2].pre[0].buf = first_ref_buf->v_buffer + recon_uvoffset;
+
+          // In accumulating a score for the older reference frame take the
+          // best of the motion predicted score and the intra coded error
+          // (just as will be done for) accumulation of "coded_error" for
+          // the last frame.
+          if (gf_motion_error < this_error)
+            fp_acc_data->sr_coded_error += gf_motion_error;
+          else
+            fp_acc_data->sr_coded_error += this_error;
         } else {
-          sr_coded_error += motion_error;
+          fp_acc_data->sr_coded_error += motion_error;
         }
+      } else {
+        fp_acc_data->sr_coded_error += motion_error;
+      }
 
-        // Start by assuming that intra mode is best.
-        best_ref_mv.row = 0;
-        best_ref_mv.col = 0;
+      // Start by assuming that intra mode is best.
+      best_ref_mv->row = 0;
+      best_ref_mv->col = 0;
+
+#if CONFIG_FP_MB_STATS
+      if (cpi->use_fp_mb_stats) {
+        // intra prediction statistics
+        cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
+        cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_DCINTRA_MASK;
+        cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK;
+        if (this_error > FPMB_ERROR_LARGE_TH) {
+          cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LARGE_MASK;
+        } else if (this_error < FPMB_ERROR_SMALL_TH) {
+          cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_SMALL_MASK;
+        }
+      }
+#endif
+
+      if (motion_error <= this_error) {
+        vpx_clear_system_state();
+
+        // Keep a count of cases where the inter and intra were very close
+        // and very low. This helps with scene cut detection for example in
+        // cropped clips with black bars at the sides or top and bottom.
+        if (((this_error - intrapenalty) * 9 <= motion_error * 10) &&
+            (this_error < (2 * intrapenalty))) {
+          fp_acc_data->neutral_count += 1.0;
+#if ENABLE_MT_BIT_MATCH
+          cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_neutral_count = 1.0;
+#endif
+          // Also track cases where the intra is not much worse than the inter
+          // and use this in limiting the GF/arf group length.
+        } else if ((this_error > NCOUNT_INTRA_THRESH) &&
+                   (this_error < (NCOUNT_INTRA_FACTOR * motion_error))) {
+          fp_acc_data->neutral_count +=
+              (double)motion_error / DOUBLE_DIVIDE_CHECK((double)this_error);
+#if ENABLE_MT_BIT_MATCH
+          cpi->twopass.fp_mb_float_stats[mb_index].frame_mb_neutral_count =
+              (double)motion_error / DOUBLE_DIVIDE_CHECK((double)this_error);
+#endif
+        }
+
+        mv.row *= 8;
+        mv.col *= 8;
+        this_error = motion_error;
+        xd->mi[0]->mode = NEWMV;
+        xd->mi[0]->mv[0].as_mv = mv;
+        xd->mi[0]->tx_size = TX_4X4;
+        xd->mi[0]->ref_frame[0] = LAST_FRAME;
+        xd->mi[0]->ref_frame[1] = NONE;
+        vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1, bsize);
+        vp9_encode_sby_pass1(x, bsize);
+        fp_acc_data->sum_mvr += mv.row;
+        fp_acc_data->sum_mvr_abs += abs(mv.row);
+        fp_acc_data->sum_mvc += mv.col;
+        fp_acc_data->sum_mvc_abs += abs(mv.col);
+        fp_acc_data->sum_mvrs += mv.row * mv.row;
+        fp_acc_data->sum_mvcs += mv.col * mv.col;
+        ++(fp_acc_data->intercount);
+
+        *best_ref_mv = mv;
 
 #if CONFIG_FP_MB_STATS
         if (cpi->use_fp_mb_stats) {
-          // intra prediction statistics
+          // inter prediction statistics
           cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
-          cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_DCINTRA_MASK;
+          cpi->twopass.frame_mb_stats_buf[mb_index] &= ~FPMB_DCINTRA_MASK;
           cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK;
           if (this_error > FPMB_ERROR_LARGE_TH) {
             cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LARGE_MASK;
@@ -1055,214 +1202,229 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
         }
 #endif
 
-        if (motion_error <= this_error) {
-          vpx_clear_system_state();
-
-          // Keep a count of cases where the inter and intra were very close
-          // and very low. This helps with scene cut detection for example in
-          // cropped clips with black bars at the sides or top and bottom.
-          if (((this_error - intrapenalty) * 9 <= motion_error * 10) &&
-              (this_error < (2 * intrapenalty))) {
-            neutral_count += 1.0;
-            // Also track cases where the intra is not much worse than the inter
-            // and use this in limiting the GF/arf group length.
-          } else if ((this_error > NCOUNT_INTRA_THRESH) &&
-                     (this_error < (NCOUNT_INTRA_FACTOR * motion_error))) {
-            neutral_count +=
-                (double)motion_error / DOUBLE_DIVIDE_CHECK((double)this_error);
-          }
-
-          mv.row *= 8;
-          mv.col *= 8;
-          this_error = motion_error;
-          xd->mi[0]->mode = NEWMV;
-          xd->mi[0]->mv[0].as_mv = mv;
-          xd->mi[0]->tx_size = TX_4X4;
-          xd->mi[0]->ref_frame[0] = LAST_FRAME;
-          xd->mi[0]->ref_frame[1] = NONE;
-          vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1, bsize);
-          vp9_encode_sby_pass1(x, bsize);
-          sum_mvr += mv.row;
-          sum_mvr_abs += abs(mv.row);
-          sum_mvc += mv.col;
-          sum_mvc_abs += abs(mv.col);
-          sum_mvrs += mv.row * mv.row;
-          sum_mvcs += mv.col * mv.col;
-          ++intercount;
-
-          best_ref_mv = mv;
+        if (!is_zero_mv(&mv)) {
+          ++(fp_acc_data->mvcount);
 
 #if CONFIG_FP_MB_STATS
           if (cpi->use_fp_mb_stats) {
-            // inter prediction statistics
-            cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
-            cpi->twopass.frame_mb_stats_buf[mb_index] &= ~FPMB_DCINTRA_MASK;
-            cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_ZERO_MASK;
-            if (this_error > FPMB_ERROR_LARGE_TH) {
+            cpi->twopass.frame_mb_stats_buf[mb_index] &= ~FPMB_MOTION_ZERO_MASK;
+            // check estimated motion direction
+            if (mv.as_mv.col > 0 && mv.as_mv.col >= abs(mv.as_mv.row)) {
+              // right direction
               cpi->twopass.frame_mb_stats_buf[mb_index] |=
-                  FPMB_ERROR_LARGE_MASK;
-            } else if (this_error < FPMB_ERROR_SMALL_TH) {
+                  FPMB_MOTION_RIGHT_MASK;
+            } else if (mv.as_mv.row < 0 &&
+                       abs(mv.as_mv.row) >= abs(mv.as_mv.col)) {
+              // up direction
+              cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_MOTION_UP_MASK;
+            } else if (mv.as_mv.col < 0 &&
+                       abs(mv.as_mv.col) >= abs(mv.as_mv.row)) {
+              // left direction
               cpi->twopass.frame_mb_stats_buf[mb_index] |=
-                  FPMB_ERROR_SMALL_MASK;
+                  FPMB_MOTION_LEFT_MASK;
+            } else {
+              // down direction
+              cpi->twopass.frame_mb_stats_buf[mb_index] |=
+                  FPMB_MOTION_DOWN_MASK;
             }
           }
 #endif
 
-          if (!is_zero_mv(&mv)) {
-            ++mvcount;
+          // Does the row vector point inwards or outwards?
+          if (mb_row < cm->mb_rows / 2) {
+            if (mv.row > 0)
+              --(fp_acc_data->sum_in_vectors);
+            else if (mv.row < 0)
+              ++(fp_acc_data->sum_in_vectors);
+          } else if (mb_row > cm->mb_rows / 2) {
+            if (mv.row > 0)
+              ++(fp_acc_data->sum_in_vectors);
+            else if (mv.row < 0)
+              --(fp_acc_data->sum_in_vectors);
+          }
+
+          // Does the col vector point inwards or outwards?
+          if (mb_col < cm->mb_cols / 2) {
+            if (mv.col > 0)
+              --(fp_acc_data->sum_in_vectors);
+            else if (mv.col < 0)
+              ++(fp_acc_data->sum_in_vectors);
+          } else if (mb_col > cm->mb_cols / 2) {
+            if (mv.col > 0)
+              ++(fp_acc_data->sum_in_vectors);
+            else if (mv.col < 0)
+              --(fp_acc_data->sum_in_vectors);
+          }
+          fp_acc_data->frame_noise_energy += (int64_t)SECTION_NOISE_DEF;
+        } else if (this_intra_error < scale_sse_threshold(cm, LOW_I_THRESH)) {
+          fp_acc_data->frame_noise_energy += fp_estimate_block_noise(x, bsize);
+        } else {  // 0,0 mv but high error
+          fp_acc_data->frame_noise_energy += (int64_t)SECTION_NOISE_DEF;
+        }
+      } else {  // Intra < inter error
+        if (this_intra_error < scale_sse_threshold(cm, LOW_I_THRESH))
+          fp_acc_data->frame_noise_energy += fp_estimate_block_noise(x, bsize);
+        else
+          fp_acc_data->frame_noise_energy += (int64_t)SECTION_NOISE_DEF;
+      }
+    } else {
+      fp_acc_data->sr_coded_error += (int64_t)this_error;
+    }
+    fp_acc_data->coded_error += (int64_t)this_error;
+
+    recon_yoffset += 16;
+    recon_uvoffset += uv_mb_height;
+
+    // Accumulate row level stats to the corresponding tile stats
+    if (cpi->new_mt && mb_col == (tile.mi_col_end >> 1) - 1)
+      accumulate_fp_mb_row_stat(tile_data, fp_acc_data);
+
+    (*(cpi->row_mt_sync_write_ptr))(&tile_data->row_mt_sync, mb_row, c,
+                                    num_mb_cols);
+  }
+  vpx_clear_system_state();
+}
+
+static void first_pass_encode(VP9_COMP *cpi, FIRSTPASS_DATA *fp_acc_data) {
+  VP9_COMMON *const cm = &cpi->common;
+  int mb_row;
+  TileDataEnc tile_data;
+  TileInfo *tile = &tile_data.tile_info;
+  MV zero_mv = { 0, 0 };
+  MV best_ref_mv;
+  // Tiling is ignored in the first pass.
+  vp9_tile_init(tile, cm, 0, 0);
+
+  for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) {
+    best_ref_mv = zero_mv;
+    vp9_first_pass_encode_tile_mb_row(cpi, &cpi->td, fp_acc_data, &tile_data,
+                                      &best_ref_mv, mb_row);
+  }
+}
+
+void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) {
+  MACROBLOCK *const x = &cpi->td.mb;
+  VP9_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  TWO_PASS *twopass = &cpi->twopass;
+
+  YV12_BUFFER_CONFIG *const lst_yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
+  YV12_BUFFER_CONFIG *gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+  YV12_BUFFER_CONFIG *const new_yv12 = get_frame_new_buffer(cm);
+  const YV12_BUFFER_CONFIG *first_ref_buf = lst_yv12;
+
+  LAYER_CONTEXT *const lc =
+      is_two_pass_svc(cpi) ? &cpi->svc.layer_context[cpi->svc.spatial_layer_id]
+                           : NULL;
+  BufferPool *const pool = cm->buffer_pool;
+
+  FIRSTPASS_DATA fp_temp_data;
+  FIRSTPASS_DATA *fp_acc_data = &fp_temp_data;
+
+  vpx_clear_system_state();
+  vp9_zero(fp_temp_data);
+  fp_acc_data->image_data_start_row = INVALID_ROW;
+
+  // First pass code requires valid last and new frame buffers.
+  assert(new_yv12 != NULL);
+  assert((lc != NULL) || frame_is_intra_only(cm) || (lst_yv12 != NULL));
 
 #if CONFIG_FP_MB_STATS
-            if (cpi->use_fp_mb_stats) {
-              cpi->twopass.frame_mb_stats_buf[mb_index] &=
-                  ~FPMB_MOTION_ZERO_MASK;
-              // check estimated motion direction
-              if (mv.as_mv.col > 0 && mv.as_mv.col >= abs(mv.as_mv.row)) {
-                // right direction
-                cpi->twopass.frame_mb_stats_buf[mb_index] |=
-                    FPMB_MOTION_RIGHT_MASK;
-              } else if (mv.as_mv.row < 0 &&
-                         abs(mv.as_mv.row) >= abs(mv.as_mv.col)) {
-                // up direction
-                cpi->twopass.frame_mb_stats_buf[mb_index] |=
-                    FPMB_MOTION_UP_MASK;
-              } else if (mv.as_mv.col < 0 &&
-                         abs(mv.as_mv.col) >= abs(mv.as_mv.row)) {
-                // left direction
-                cpi->twopass.frame_mb_stats_buf[mb_index] |=
-                    FPMB_MOTION_LEFT_MASK;
-              } else {
-                // down direction
-                cpi->twopass.frame_mb_stats_buf[mb_index] |=
-                    FPMB_MOTION_DOWN_MASK;
-              }
-            }
+  if (cpi->use_fp_mb_stats) {
+    vp9_zero_array(cpi->twopass.frame_mb_stats_buf, cm->initial_mbs);
+  }
 #endif
 
-            // Does the row vector point inwards or outwards?
-            if (mb_row < cm->mb_rows / 2) {
-              if (mv.row > 0)
-                --sum_in_vectors;
-              else if (mv.row < 0)
-                ++sum_in_vectors;
-            } else if (mb_row > cm->mb_rows / 2) {
-              if (mv.row > 0)
-                ++sum_in_vectors;
-              else if (mv.row < 0)
-                --sum_in_vectors;
-            }
+  set_first_pass_params(cpi);
+  vp9_set_quantizer(cm, find_fp_qindex(cm->bit_depth));
 
-            // Does the col vector point inwards or outwards?
-            if (mb_col < cm->mb_cols / 2) {
-              if (mv.col > 0)
-                --sum_in_vectors;
-              else if (mv.col < 0)
-                ++sum_in_vectors;
-            } else if (mb_col > cm->mb_cols / 2) {
-              if (mv.col > 0)
-                ++sum_in_vectors;
-              else if (mv.col < 0)
-                --sum_in_vectors;
-            }
-            frame_noise_energy += (int64_t)SECTION_NOISE_DEF;
-          } else if (this_intra_error < scale_sse_threshold(cm, LOW_I_THRESH)) {
-            frame_noise_energy += fp_estimate_block_noise(x, bsize);
-          } else {  // 0,0 mv but high error
-            frame_noise_energy += (int64_t)SECTION_NOISE_DEF;
-          }
-        } else {  // Intra < inter error
-          if (this_intra_error < scale_sse_threshold(cm, LOW_I_THRESH))
-            frame_noise_energy += fp_estimate_block_noise(x, bsize);
-          else
-            frame_noise_energy += (int64_t)SECTION_NOISE_DEF;
-        }
-      } else {
-        sr_coded_error += (int64_t)this_error;
-      }
-      coded_error += (int64_t)this_error;
+  if (lc != NULL) {
+    twopass = &lc->twopass;
 
-      // Adjust to the next column of MBs.
-      x->plane[0].src.buf += 16;
-      x->plane[1].src.buf += uv_mb_height;
-      x->plane[2].src.buf += uv_mb_height;
+    cpi->lst_fb_idx = cpi->svc.spatial_layer_id;
+    cpi->ref_frame_flags = VP9_LAST_FLAG;
 
-      recon_yoffset += 16;
-      recon_uvoffset += uv_mb_height;
+    if (cpi->svc.number_spatial_layers + cpi->svc.spatial_layer_id <
+        REF_FRAMES) {
+      cpi->gld_fb_idx =
+          cpi->svc.number_spatial_layers + cpi->svc.spatial_layer_id;
+      cpi->ref_frame_flags |= VP9_GOLD_FLAG;
+      cpi->refresh_golden_frame = (lc->current_video_frame_in_layer == 0);
+    } else {
+      cpi->refresh_golden_frame = 0;
     }
 
-    // Adjust to the next row of MBs.
-    x->plane[0].src.buf += 16 * x->plane[0].src.stride - 16 * cm->mb_cols;
-    x->plane[1].src.buf +=
-        uv_mb_height * x->plane[1].src.stride - uv_mb_height * cm->mb_cols;
-    x->plane[2].src.buf +=
-        uv_mb_height * x->plane[1].src.stride - uv_mb_height * cm->mb_cols;
+    if (lc->current_video_frame_in_layer == 0) cpi->ref_frame_flags = 0;
 
-    vpx_clear_system_state();
-  }
+    vp9_scale_references(cpi);
 
-  // Clamp the image start to rows/2. This number of rows is discarded top
-  // and bottom as dead data so rows / 2 means the frame is blank.
-  if ((image_data_start_row > cm->mb_rows / 2) ||
-      (image_data_start_row == INVALID_ROW)) {
-    image_data_start_row = cm->mb_rows / 2;
+    // Use either last frame or alt frame for motion search.
+    if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
+      first_ref_buf = vp9_get_scaled_ref_frame(cpi, LAST_FRAME);
+      if (first_ref_buf == NULL)
+        first_ref_buf = get_ref_frame_buffer(cpi, LAST_FRAME);
+    }
+
+    if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
+      gld_yv12 = vp9_get_scaled_ref_frame(cpi, GOLDEN_FRAME);
+      if (gld_yv12 == NULL) {
+        gld_yv12 = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
+      }
+    } else {
+      gld_yv12 = NULL;
+    }
+
+    set_ref_ptrs(cm, xd,
+                 (cpi->ref_frame_flags & VP9_LAST_FLAG) ? LAST_FRAME : NONE,
+                 (cpi->ref_frame_flags & VP9_GOLD_FLAG) ? GOLDEN_FRAME : NONE);
+
+    cpi->Source = vp9_scale_if_required(cm, cpi->un_scaled_source,
+                                        &cpi->scaled_source, 0);
   }
-  // Exclude any image dead zone
-  if (image_data_start_row > 0) {
-    intra_skip_count =
-        VPXMAX(0, intra_skip_count - (image_data_start_row * cm->mb_cols * 2));
+
+  vp9_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
+
+  vp9_setup_src_planes(x, cpi->Source, 0, 0);
+  vp9_setup_dst_planes(xd->plane, new_yv12, 0, 0);
+
+  if (!frame_is_intra_only(cm)) {
+    vp9_setup_pre_planes(xd, 0, first_ref_buf, 0, 0, NULL);
   }
 
+  xd->mi = cm->mi_grid_visible;
+  xd->mi[0] = cm->mi;
+
+  vp9_frame_init_quantizer(cpi);
+
+  x->skip_recode = 0;
+
+  vp9_init_mv_probs(cm);
+  vp9_initialize_rd_consts(cpi);
+
+  cm->log2_tile_rows = 0;
+
   {
     FIRSTPASS_STATS fps;
-    // The minimum error here insures some bit allocation to frames even
-    // in static regions. The allocation per MB declines for larger formats
-    // where the typical "real" energy per MB also falls.
-    // Initial estimate here uses sqrt(mbs) to define the min_err, where the
-    // number of mbs is proportional to the image area.
-    const int num_mbs = (cpi->oxcf.resize_mode != RESIZE_NONE)
-                            ? cpi->initial_mbs
-                            : cpi->common.MBs;
-    const double min_err = 200 * sqrt(num_mbs);
-
-    intra_factor = intra_factor / (double)num_mbs;
-    brightness_factor = brightness_factor / (double)num_mbs;
-    fps.weight = intra_factor * brightness_factor;
-
-    fps.frame = cm->current_video_frame;
-    fps.spatial_layer_id = cpi->svc.spatial_layer_id;
-    fps.coded_error = (double)(coded_error >> 8) + min_err;
-    fps.sr_coded_error = (double)(sr_coded_error >> 8) + min_err;
-    fps.intra_error = (double)(intra_error >> 8) + min_err;
-    fps.frame_noise_energy = (double)frame_noise_energy / (double)num_mbs;
-    fps.count = 1.0;
-    fps.pcnt_inter = (double)intercount / num_mbs;
-    fps.pcnt_second_ref = (double)second_ref_count / num_mbs;
-    fps.pcnt_neutral = (double)neutral_count / num_mbs;
-    fps.intra_skip_pct = (double)intra_skip_count / num_mbs;
-    fps.intra_smooth_pct = (double)intra_smooth_count / num_mbs;
-    fps.inactive_zone_rows = (double)image_data_start_row;
-    // Currently set to 0 as most issues relate to letter boxing.
-    fps.inactive_zone_cols = (double)0;
-
-    if (mvcount > 0) {
-      fps.MVr = (double)sum_mvr / mvcount;
-      fps.mvr_abs = (double)sum_mvr_abs / mvcount;
-      fps.MVc = (double)sum_mvc / mvcount;
-      fps.mvc_abs = (double)sum_mvc_abs / mvcount;
-      fps.MVrv =
-          ((double)sum_mvrs - ((double)sum_mvr * sum_mvr / mvcount)) / mvcount;
-      fps.MVcv =
-          ((double)sum_mvcs - ((double)sum_mvc * sum_mvc / mvcount)) / mvcount;
-      fps.mv_in_out_count = (double)sum_in_vectors / (mvcount * 2);
-      fps.pcnt_motion = (double)mvcount / num_mbs;
+    TileDataEnc *first_tile_col;
+    if (!cpi->new_mt) {
+      cm->log2_tile_cols = 0;
+      cpi->row_mt_sync_read_ptr = vp9_row_mt_sync_read_dummy;
+      cpi->row_mt_sync_write_ptr = vp9_row_mt_sync_write_dummy;
+      first_pass_encode(cpi, fp_acc_data);
+      first_pass_stat_calc(cpi, &fps, fp_acc_data);
     } else {
-      fps.MVr = 0.0;
-      fps.mvr_abs = 0.0;
-      fps.MVc = 0.0;
-      fps.mvc_abs = 0.0;
-      fps.MVrv = 0.0;
-      fps.MVcv = 0.0;
-      fps.mv_in_out_count = 0.0;
-      fps.pcnt_motion = 0.0;
+      cpi->row_mt_sync_read_ptr = vp9_row_mt_sync_read;
+      cpi->row_mt_sync_write_ptr = vp9_row_mt_sync_write;
+#if ENABLE_MT_BIT_MATCH
+      cm->log2_tile_cols = 0;
+      vp9_zero_array(cpi->twopass.fp_mb_float_stats, cm->MBs);
+#endif
+      vp9_encode_fp_row_mt(cpi);
+      first_tile_col = &cpi->tile_data[0];
+#if ENABLE_MT_BIT_MATCH
+      accumulate_floating_point_stats(cpi, first_tile_col);
+#endif
+      first_pass_stat_calc(cpi, &fps, &(first_tile_col->fp_data));
     }
 
     // Dont allow a value of 0 for duration.
diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h
index 5541893..ee6d5f3 100644
--- a/vp9/encoder/vp9_firstpass.h
+++ b/vp9/encoder/vp9_firstpass.h
@@ -39,6 +39,40 @@ typedef struct {
 } FIRSTPASS_MB_STATS;
 #endif
 
+#define INVALID_ROW -1
+
+#define ENABLE_MT_BIT_MATCH 0
+#if ENABLE_MT_BIT_MATCH
+typedef struct {
+  double frame_mb_intra_factor;
+  double frame_mb_brightness_factor;
+  double frame_mb_neutral_count;
+} FP_MB_FLOAT_STATS;
+#endif
+
+typedef struct {
+  double intra_factor;
+  double brightness_factor;
+  int64_t coded_error;
+  int64_t sr_coded_error;
+  int64_t frame_noise_energy;
+  int64_t intra_error;
+  int intercount;
+  int second_ref_count;
+  double neutral_count;
+  int intra_skip_count;
+  int image_data_start_row;
+  int mvcount;
+  int sum_mvr;
+  int sum_mvr_abs;
+  int sum_mvc;
+  int sum_mvc_abs;
+  int64_t sum_mvrs;
+  int64_t sum_mvcs;
+  int sum_in_vectors;
+  int intra_smooth_count;
+} FIRSTPASS_DATA;
+
 typedef struct {
   double frame;
   double weight;
@@ -114,6 +148,11 @@ typedef struct {
   uint8_t *this_frame_mb_stats;
   FIRSTPASS_MB_STATS firstpass_mb_stats;
 #endif
+
+#if ENABLE_MT_BIT_MATCH
+  FP_MB_FLOAT_STATS *fp_mb_float_stats;
+#endif
+
   // An indication of the content type of the current frame
   FRAME_CONTENT_TYPE fr_content_type;
 
@@ -141,12 +180,20 @@ typedef struct {
 } TWO_PASS;
 
 struct VP9_COMP;
+struct ThreadData;
+struct TileDataEnc;
 
 void vp9_init_first_pass(struct VP9_COMP *cpi);
 void vp9_rc_get_first_pass_params(struct VP9_COMP *cpi);
 void vp9_first_pass(struct VP9_COMP *cpi, const struct lookahead_entry *source);
 void vp9_end_first_pass(struct VP9_COMP *cpi);
 
+void vp9_first_pass_encode_tile_mb_row(struct VP9_COMP *cpi,
+                                       struct ThreadData *td,
+                                       FIRSTPASS_DATA *fp_acc_data,
+                                       struct TileDataEnc *tile_data,
+                                       MV *best_ref_mv, int mb_row);
+
 void vp9_init_second_pass(struct VP9_COMP *cpi);
 void vp9_rc_get_second_pass_params(struct VP9_COMP *cpi);
 void vp9_twopass_postencode_update(struct VP9_COMP *cpi);
diff --git a/vp9/encoder/vp9_job_queue.h b/vp9/encoder/vp9_job_queue.h
new file mode 100644
index 0000000..89c08f2
--- /dev/null
+++ b/vp9/encoder/vp9_job_queue.h
@@ -0,0 +1,46 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_VP9_JOB_QUEUE_H_
+#define VP9_ENCODER_VP9_JOB_QUEUE_H_
+
+typedef enum {
+  FIRST_PASS_JOB,
+  ENCODE_JOB,
+  ARNR_JOB,
+  NUM_JOB_TYPES,
+} JOB_TYPE;
+
+// Encode job parameters
+typedef struct {
+  int vert_unit_row_num;  // Index of the vertical unit row
+  int tile_col_id;        // tile col id within a tile
+  int tile_row_id;        // tile col id within a tile
+} JobNode;
+
+// Job queue element parameters
+typedef struct {
+  // Pointer to the next link in the job queue
+  void *next;
+
+  // Job information context of the module
+  JobNode job_info;
+} JobQueue;
+
+// Job queue handle
+typedef struct {
+  // Pointer to the next link in the job queue
+  void *next;
+
+  // Counter to store the number of jobs picked up for processing
+  int num_jobs_acquired;
+} JobQueueHandle;
+
+#endif  // VP9_ENCODER_VP9_JOB_QUEUE_H_
diff --git a/vp9/encoder/vp9_multi_thread.c b/vp9/encoder/vp9_multi_thread.c
new file mode 100644
index 0000000..23b0b42
--- /dev/null
+++ b/vp9/encoder/vp9_multi_thread.c
@@ -0,0 +1,282 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_ethread.h"
+#include "vp9/encoder/vp9_multi_thread.h"
+
+void *vp9_enc_grp_get_next_job(MultiThreadHandle *multi_thread_ctxt,
+                               int tile_id) {
+  RowMTInfo *row_mt_info;
+  JobQueueHandle *job_queue_hdl = NULL;
+  void *next = NULL;
+  JobNode *job_info = NULL;
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *mutex_handle = NULL;
+#endif
+
+  row_mt_info = (RowMTInfo *)(&multi_thread_ctxt->row_mt_info[tile_id]);
+  job_queue_hdl = (JobQueueHandle *)&row_mt_info->job_queue_hdl;
+#if CONFIG_MULTITHREAD
+  mutex_handle = &row_mt_info->job_mutex;
+#endif
+
+// lock the mutex for queue access
+#if CONFIG_MULTITHREAD
+  pthread_mutex_lock(mutex_handle);
+#endif
+  next = job_queue_hdl->next;
+  if (NULL != next) {
+    JobQueue *job_queue = (JobQueue *)next;
+    job_info = &job_queue->job_info;
+    // Update the next job in the queue
+    job_queue_hdl->next = job_queue->next;
+    job_queue_hdl->num_jobs_acquired++;
+  }
+
+#if CONFIG_MULTITHREAD
+  pthread_mutex_unlock(mutex_handle);
+#endif
+
+  return job_info;
+}
+
+void vp9_row_mt_mem_alloc(VP9_COMP *cpi) {
+  struct VP9Common *cm = &cpi->common;
+  MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt;
+  int tile_row, tile_col;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  const int tile_rows = 1 << cm->log2_tile_rows;
+  const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
+  int jobs_per_tile_col, total_jobs;
+
+  jobs_per_tile_col = VPXMAX(cm->mb_rows, sb_rows);
+  // Calculate the total number of jobs
+  total_jobs = jobs_per_tile_col * tile_cols;
+
+  multi_thread_ctxt->allocated_tile_cols = tile_cols;
+  multi_thread_ctxt->allocated_tile_rows = tile_rows;
+  multi_thread_ctxt->allocated_vert_unit_rows = jobs_per_tile_col;
+
+  multi_thread_ctxt->job_queue =
+      (JobQueue *)vpx_memalign(32, total_jobs * sizeof(JobQueue));
+
+#if CONFIG_MULTITHREAD
+  // Create mutex for each tile
+  for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+    RowMTInfo *row_mt_info = &multi_thread_ctxt->row_mt_info[tile_col];
+    pthread_mutex_init(&row_mt_info->job_mutex, NULL);
+  }
+#endif
+
+  // Allocate memory for row based multi-threading
+  for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+    TileDataEnc *this_tile = &cpi->tile_data[tile_col];
+    vp9_row_mt_sync_mem_alloc(&this_tile->row_mt_sync, cm, jobs_per_tile_col);
+  }
+
+  // Assign the sync pointer of tile row zero for every tile row > 0
+  for (tile_row = 1; tile_row < tile_rows; tile_row++) {
+    for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+      TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols + tile_col];
+      TileDataEnc *this_col_tile = &cpi->tile_data[tile_col];
+      this_tile->row_mt_sync = this_col_tile->row_mt_sync;
+    }
+  }
+
+  // Calculate the number of vertical units in the given tile row
+  for (tile_row = 0; tile_row < tile_rows; tile_row++) {
+    TileDataEnc *this_tile = &cpi->tile_data[tile_row * tile_cols];
+    TileInfo *tile_info = &this_tile->tile_info;
+    multi_thread_ctxt->num_tile_vert_sbs[tile_row] =
+        get_num_vert_units(*tile_info, MI_BLOCK_SIZE_LOG2);
+  }
+}
+
+void vp9_row_mt_mem_dealloc(VP9_COMP *cpi) {
+  MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt;
+  int tile_col;
+
+  // Deallocate memory for job queue
+  if (multi_thread_ctxt->job_queue) vpx_free(multi_thread_ctxt->job_queue);
+
+#if CONFIG_MULTITHREAD
+  // Destroy mutex for each tile
+  for (tile_col = 0; tile_col < multi_thread_ctxt->allocated_tile_cols;
+       tile_col++) {
+    RowMTInfo *row_mt_info = &multi_thread_ctxt->row_mt_info[tile_col];
+    if (row_mt_info) pthread_mutex_destroy(&row_mt_info->job_mutex);
+  }
+#endif
+
+  // Free row based multi-threading sync memory
+  for (tile_col = 0; tile_col < multi_thread_ctxt->allocated_tile_cols;
+       tile_col++) {
+    TileDataEnc *this_tile = &cpi->tile_data[tile_col];
+    vp9_row_mt_sync_mem_dealloc(&this_tile->row_mt_sync);
+  }
+}
+
+void vp9_multi_thread_tile_init(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
+  int i;
+
+  for (i = 0; i < tile_cols; i++) {
+    TileDataEnc *this_tile = &cpi->tile_data[i];
+    int jobs_per_tile_col = cpi->oxcf.pass == 1 ? cm->mb_rows : sb_rows;
+
+    // Initialize cur_col to -1 for all rows.
+    memset(this_tile->row_mt_sync.cur_col, -1,
+           sizeof(*this_tile->row_mt_sync.cur_col) * jobs_per_tile_col);
+    vp9_zero(this_tile->fp_data);
+    this_tile->fp_data.image_data_start_row = INVALID_ROW;
+  }
+}
+
+void vp9_assign_tile_to_thread(MultiThreadHandle *multi_thread_ctxt,
+                               int tile_cols, int num_workers) {
+  int tile_id = 0;
+  int i;
+
+  // Allocating the threads for the tiles
+  for (i = 0; i < num_workers; i++) {
+    multi_thread_ctxt->thread_id_to_tile_id[i] = tile_id++;
+    if (tile_id == tile_cols) tile_id = 0;
+  }
+}
+
+int vp9_get_job_queue_status(MultiThreadHandle *multi_thread_ctxt,
+                             int cur_tile_id) {
+  RowMTInfo *row_mt_info;
+  JobQueueHandle *job_queue_hndl;
+#if CONFIG_MULTITHREAD
+  pthread_mutex_t *mutex;
+#endif
+  int num_jobs_remaining;
+
+  row_mt_info = &multi_thread_ctxt->row_mt_info[cur_tile_id];
+  job_queue_hndl = &row_mt_info->job_queue_hdl;
+#if CONFIG_MULTITHREAD
+  mutex = &row_mt_info->job_mutex;
+#endif
+
+#if CONFIG_MULTITHREAD
+  pthread_mutex_lock(mutex);
+#endif
+  num_jobs_remaining =
+      multi_thread_ctxt->jobs_per_tile_col - job_queue_hndl->num_jobs_acquired;
+#if CONFIG_MULTITHREAD
+  pthread_mutex_unlock(mutex);
+#endif
+
+  return (num_jobs_remaining);
+}
+
+void vp9_prepare_job_queue(VP9_COMP *cpi, JOB_TYPE job_type) {
+  VP9_COMMON *const cm = &cpi->common;
+  MultiThreadHandle *multi_thread_ctxt = &cpi->multi_thread_ctxt;
+  JobQueue *job_queue = multi_thread_ctxt->job_queue;
+  const int tile_cols = 1 << cm->log2_tile_cols;
+  int job_row_num, jobs_per_tile, jobs_per_tile_col, total_jobs;
+  const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
+  int tile_col, i;
+
+  jobs_per_tile_col = (job_type != ENCODE_JOB) ? cm->mb_rows : sb_rows;
+  total_jobs = jobs_per_tile_col * tile_cols;
+
+  multi_thread_ctxt->jobs_per_tile_col = jobs_per_tile_col;
+  // memset the entire job queue buffer to zero
+  memset(job_queue, 0, total_jobs * sizeof(JobQueue));
+
+  // Job queue preparation
+  for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+    RowMTInfo *tile_ctxt = &multi_thread_ctxt->row_mt_info[tile_col];
+    JobQueue *job_queue_curr, *job_queue_temp;
+    int tile_row = 0;
+
+    tile_ctxt->job_queue_hdl.next = (void *)job_queue;
+    tile_ctxt->job_queue_hdl.num_jobs_acquired = 0;
+
+    job_queue_curr = job_queue;
+    job_queue_temp = job_queue;
+
+    // loop over all the vertical rows
+    for (job_row_num = 0, jobs_per_tile = 0; job_row_num < jobs_per_tile_col;
+         job_row_num++, jobs_per_tile++) {
+      job_queue_curr->job_info.vert_unit_row_num = job_row_num;
+      job_queue_curr->job_info.tile_col_id = tile_col;
+      job_queue_curr->job_info.tile_row_id = tile_row;
+      job_queue_curr->next = (void *)(job_queue_temp + 1);
+      job_queue_curr = ++job_queue_temp;
+
+      if (ENCODE_JOB == job_type) {
+        if (jobs_per_tile >=
+            multi_thread_ctxt->num_tile_vert_sbs[tile_row] - 1) {
+          tile_row++;
+          jobs_per_tile = -1;
+        }
+      }
+    }
+
+    // Set the last pointer to NULL
+    job_queue_curr += -1;
+    job_queue_curr->next = (void *)NULL;
+
+    // Move to the next tile
+    job_queue += jobs_per_tile_col;
+  }
+
+  for (i = 0; i < cpi->num_workers; i++) {
+    EncWorkerData *thread_data;
+    thread_data = &cpi->tile_thr_data[i];
+    thread_data->thread_id = i;
+
+    for (tile_col = 0; tile_col < tile_cols; tile_col++)
+      thread_data->tile_completion_status[tile_col] = 0;
+  }
+}
+
+int vp9_get_tiles_proc_status(MultiThreadHandle *multi_thread_ctxt,
+                              int *tile_completion_status, int *cur_tile_id,
+                              int tile_cols) {
+  int tile_col;
+  int tile_id = -1;  // Stores the tile ID with minimum proc done
+  int max_num_jobs_remaining = 0;
+  int num_jobs_remaining;
+
+  // Mark the completion to avoid check in the loop
+  tile_completion_status[*cur_tile_id] = 1;
+  // Check for the status of all the tiles
+  for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+    if (tile_completion_status[tile_col] == 0) {
+      num_jobs_remaining =
+          vp9_get_job_queue_status(multi_thread_ctxt, tile_col);
+      // Mark the completion to avoid checks during future switches across tiles
+      if (num_jobs_remaining == 0) tile_completion_status[tile_col] = 1;
+      if (num_jobs_remaining > max_num_jobs_remaining) {
+        max_num_jobs_remaining = num_jobs_remaining;
+        tile_id = tile_col;
+      }
+    }
+  }
+
+  if (-1 == tile_id) {
+    return 1;
+  } else {
+    // Update the cur ID to the next tile ID that will be processed,
+    // which will be the least processed tile
+    *cur_tile_id = tile_id;
+    return 0;
+  }
+}
diff --git a/vp9/encoder/vp9_multi_thread.h b/vp9/encoder/vp9_multi_thread.h
new file mode 100644
index 0000000..bfc0c0a
--- /dev/null
+++ b/vp9/encoder/vp9_multi_thread.h
@@ -0,0 +1,38 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_ENCODER_VP9_MULTI_THREAD_H
+#define VP9_ENCODER_VP9_MULTI_THREAD_H
+
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_job_queue.h"
+
+void *vp9_enc_grp_get_next_job(MultiThreadHandle *multi_thread_ctxt,
+                               int tile_id);
+
+void vp9_prepare_job_queue(VP9_COMP *cpi, JOB_TYPE job_type);
+
+int vp9_get_job_queue_status(MultiThreadHandle *multi_thread_ctxt,
+                             int cur_tile_id);
+
+void vp9_assign_tile_to_thread(MultiThreadHandle *multi_thread_ctxt,
+                               int tile_cols, int num_workers);
+
+void vp9_multi_thread_tile_init(VP9_COMP *cpi);
+
+void vp9_row_mt_mem_alloc(VP9_COMP *cpi);
+
+void vp9_row_mt_mem_dealloc(VP9_COMP *cpi);
+
+int vp9_get_tiles_proc_status(MultiThreadHandle *multi_thread_ctxt,
+                              int *tile_completion_status, int *cur_tile_id,
+                              int tile_cols);
+
+#endif  // VP9_ENCODER_VP9_MULTI_THREAD_H
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index 1d5ed7c..c239ca6 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -51,6 +51,7 @@ struct vp9_extracfg {
   vpx_color_range_t color_range;
   int render_width;
   int render_height;
+  unsigned int new_mt;
 };
 
 static struct vp9_extracfg default_extra_cfg = {
@@ -82,6 +83,7 @@ static struct vp9_extracfg default_extra_cfg = {
   0,                     // color range
   0,                     // render width
   0,                     // render height
+  1,                     // new_mt
 };
 
 struct vpx_codec_alg_priv {
@@ -245,6 +247,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx,
         "kf_min_dist not supported in auto mode, use 0 "
         "or kf_max_dist instead.");
 
+  RANGE_CHECK(extra_cfg, new_mt, 0, 1);
   RANGE_CHECK(extra_cfg, enable_auto_alt_ref, 0, 2);
   RANGE_CHECK(extra_cfg, cpu_used, -8, 8);
   RANGE_CHECK_HI(extra_cfg, noise_sensitivity, 6);
@@ -554,6 +557,8 @@ static vpx_codec_err_t set_encoder_config(
 
   oxcf->target_level = extra_cfg->target_level;
 
+  oxcf->new_mt = extra_cfg->new_mt;
+
   for (sl = 0; sl < oxcf->ss_number_layers; ++sl) {
 #if CONFIG_SPATIAL_SVC
     oxcf->ss_enable_auto_arf[sl] = cfg->ss_enable_auto_alt_ref[sl];
@@ -842,6 +847,13 @@ static vpx_codec_err_t ctrl_set_target_level(vpx_codec_alg_priv_t *ctx,
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static vpx_codec_err_t ctrl_set_new_mt(vpx_codec_alg_priv_t *ctx,
+                                       va_list args) {
+  struct vp9_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.new_mt = CAST(VP9E_SET_NEW_MT, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+
 static vpx_codec_err_t ctrl_get_level(vpx_codec_alg_priv_t *ctx, va_list args) {
   int *const arg = va_arg(args, int *);
   if (arg == NULL) return VPX_CODEC_INVALID_PARAM;
@@ -1594,6 +1606,7 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
   { VP9E_SET_SVC_REF_FRAME_CONFIG, ctrl_set_svc_ref_frame_config },
   { VP9E_SET_RENDER_SIZE, ctrl_set_render_size },
   { VP9E_SET_TARGET_LEVEL, ctrl_set_target_level },
+  { VP9E_SET_NEW_MT, ctrl_set_new_mt },
 
   // Getters
   { VP8E_GET_LAST_QUANTIZER, ctrl_get_quantizer },
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index a8ca0d5..87d9a77 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -39,9 +39,12 @@ VP9_CX_SRCS-yes += encoder/vp9_encodemb.h
 VP9_CX_SRCS-yes += encoder/vp9_encodemv.h
 VP9_CX_SRCS-yes += encoder/vp9_extend.h
 VP9_CX_SRCS-yes += encoder/vp9_firstpass.h
+VP9_CX_SRCS-yes += encoder/vp9_job_queue.h
 VP9_CX_SRCS-yes += encoder/vp9_lookahead.c
 VP9_CX_SRCS-yes += encoder/vp9_lookahead.h
 VP9_CX_SRCS-yes += encoder/vp9_mcomp.h
+VP9_CX_SRCS-yes += encoder/vp9_multi_thread.c
+VP9_CX_SRCS-yes += encoder/vp9_multi_thread.h
 VP9_CX_SRCS-yes += encoder/vp9_encoder.h
 VP9_CX_SRCS-yes += encoder/vp9_quantize.h
 VP9_CX_SRCS-yes += encoder/vp9_ratectrl.h
diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h
index cc90159..a04d7dd 100644
--- a/vpx/vp8cx.h
+++ b/vpx/vp8cx.h
@@ -547,6 +547,14 @@ enum vp8e_enc_control_id {
    */
   VP9E_SET_TARGET_LEVEL,
 
+  /*!\brief Codec control function to set row level multi-threading.
+  *
+  * 0 : off, 1 : on
+  *
+  * Supported in codecs: VP9
+  */
+  VP9E_SET_NEW_MT,
+
   /*!\brief Codec control function to get bitstream level.
    *
    * Supported in codecs: VP9
@@ -838,6 +846,9 @@ VPX_CTRL_USE_TYPE(VP9E_SET_RENDER_SIZE, int *)
 VPX_CTRL_USE_TYPE(VP9E_SET_TARGET_LEVEL, unsigned int)
 #define VPX_CTRL_VP9E_SET_TARGET_LEVEL
 
+VPX_CTRL_USE_TYPE(VP9E_SET_NEW_MT, unsigned int)
+#define VPX_CTRL_VP9E_SET_NEW_MT
+
 VPX_CTRL_USE_TYPE(VP9E_GET_LEVEL, int *)
 #define VPX_CTRL_VP9E_GET_LEVEL
 
diff --git a/vpxenc.c b/vpxenc.c
index 9cd10ab..abb3bae 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -470,6 +470,9 @@ static const arg_def_t target_level = ARG_DEF(
     NULL, "target-level", 1,
     "Target level (255: off (default); 0: only keep level stats; 10: level 1.0;"
     " 11: level 1.1; ... 62: level 6.2)");
+
+static const arg_def_t new_mt =
+    ARG_DEF(NULL, "new-mt", 1, "Enable row based multi-threading in VP9");
 #endif
 
 #if CONFIG_VP9_ENCODER
@@ -498,6 +501,7 @@ static const arg_def_t *vp9_args[] = { &cpu_used_vp9,
                                        &min_gf_interval,
                                        &max_gf_interval,
                                        &target_level,
+                                       &new_mt,
 #if CONFIG_VP9_HIGHBITDEPTH
                                        &bitdeptharg,
                                        &inbitdeptharg,
@@ -528,6 +532,7 @@ static const int vp9_arg_ctrl_map[] = { VP8E_SET_CPUUSED,
                                         VP9E_SET_MIN_GF_INTERVAL,
                                         VP9E_SET_MAX_GF_INTERVAL,
                                         VP9E_SET_TARGET_LEVEL,
+                                        VP9E_SET_NEW_MT,
                                         0 };
 #endif
 
-- 
2.7.4