Populate q_index in multi-thread encoding
[profile/ivi/libvpx.git] / vp8 / encoder / ethreading.c
index 5c607a0..8c49668 100644 (file)
@@ -20,12 +20,10 @@ extern int vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x,
                                          int recon_uvoffset);
 extern int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x,
                                           TOKENEXTRA **t);
-extern void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x);
+extern void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x, int ok_to_skip);
 extern void vp8_build_block_offsets(MACROBLOCK *x);
 extern void vp8_setup_block_ptrs(MACROBLOCK *x);
 
-#if CONFIG_MULTITHREAD
-
 extern void loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm);
 
 static THREAD_FUNCTION loopfilter_thread(void *p_data)
@@ -51,7 +49,6 @@ static THREAD_FUNCTION loopfilter_thread(void *p_data)
 
     return 0;
 }
-#endif
 
 static
 THREAD_FUNCTION thread_encoding_proc(void *p_data)
@@ -87,15 +84,14 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
             for (mb_row = ithread + 1; mb_row < cm->mb_rows; mb_row += (cpi->encoding_thread_count + 1))
             {
 
-                int i;
                 int recon_yoffset, recon_uvoffset;
                 int mb_col;
                 int ref_fb_idx = cm->lst_fb_idx;
                 int dst_fb_idx = cm->new_fb_idx;
                 int recon_y_stride = cm->yv12_fb[ref_fb_idx].y_stride;
                 int recon_uv_stride = cm->yv12_fb[ref_fb_idx].uv_stride;
+                int map_index = (mb_row * cm->mb_cols);
                 volatile int *last_row_current_mb_col;
-                INT64 activity_sum = 0;
 
                 tp = cpi->tok + (mb_row * (cm->mb_cols * 16 * 24));
 
@@ -115,11 +111,12 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
 
                 //printf("Thread mb_row = %d\n", mb_row);
 
+                // Set the mb activity pointer to the start of the row.
+                x->mb_activity_ptr = &cpi->mb_activity_map[map_index];
+
                 // for each macroblock col in image
                 for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
                 {
-                    int seg_map_index = (mb_row * cm->mb_cols);
-
                     if ((mb_col & (nsync - 1)) == 0)
                     {
                         while (mb_col > (*last_row_current_mb_col - nsync) && *last_row_current_mb_col != cm->mb_cols - 1)
@@ -150,25 +147,28 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
                     x->rddiv = cpi->RDDIV;
                     x->rdmult = cpi->RDMULT;
 
+                    //Copy current mb to a buffer
+                    RECON_INVOKE(&xd->rtcd->recon, copy16x16)(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
+
                     if (cpi->oxcf.tuning == VP8_TUNE_SSIM)
-                        activity_sum += vp8_activity_masking(cpi, x);
+                        vp8_activity_masking(cpi, x);
 
                     // Is segmentation enabled
                     // MB level adjutment to quantizer
                     if (xd->segmentation_enabled)
                     {
                         // Code to set segment id in xd->mbmi.segment_id for current MB (with range checking)
-                        if (cpi->segmentation_map[seg_map_index + mb_col] <= 3)
-                            xd->mode_info_context->mbmi.segment_id = cpi->segmentation_map[seg_map_index + mb_col];
+                        if (cpi->segmentation_map[map_index + mb_col] <= 3)
+                            xd->mode_info_context->mbmi.segment_id = cpi->segmentation_map[map_index + mb_col];
                         else
                             xd->mode_info_context->mbmi.segment_id = 0;
 
-                        vp8cx_mb_init_quantizer(cpi, x);
+                        vp8cx_mb_init_quantizer(cpi, x, 1);
                     }
                     else
                         xd->mode_info_context->mbmi.segment_id = 0; // Set to Segment 0 by default
 
-                    x->active_ptr = cpi->active_map + seg_map_index + mb_col;
+                    x->active_ptr = cpi->active_map + map_index + mb_col;
 
                     if (cm->frame_type == KEY_FRAME)
                     {
@@ -206,29 +206,30 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
                         if (cpi->cyclic_refresh_mode_enabled && xd->segmentation_enabled)
                         {
                             const MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
-                            cpi->segmentation_map[seg_map_index + mb_col] = mbmi->segment_id;
+                            cpi->segmentation_map[map_index + mb_col] = mbmi->segment_id;
 
                             // If the block has been refreshed mark it as clean (the magnitude of the -ve influences how long it will be before we consider another refresh):
                             // Else if it was coded (last frame 0,0) and has not already been refreshed then mark it as a candidate for cleanup next time (marked 0)
                             // else mark it as dirty (1).
                             if (mbmi->segment_id)
-                                cpi->cyclic_refresh_map[seg_map_index + mb_col] = -1;
+                                cpi->cyclic_refresh_map[map_index + mb_col] = -1;
                             else if ((mbmi->mode == ZEROMV) && (mbmi->ref_frame == LAST_FRAME))
                             {
-                                if (cpi->cyclic_refresh_map[seg_map_index + mb_col] == 1)
-                                    cpi->cyclic_refresh_map[seg_map_index + mb_col] = 0;
+                                if (cpi->cyclic_refresh_map[map_index + mb_col] == 1)
+                                    cpi->cyclic_refresh_map[map_index + mb_col] = 0;
                             }
                             else
-                                cpi->cyclic_refresh_map[seg_map_index + mb_col] = 1;
+                                cpi->cyclic_refresh_map[map_index + mb_col] = 1;
 
                         }
                     }
                     cpi->tplist[mb_row].stop = tp;
 
-                    x->gf_active_ptr++; // Increment pointer into gf useage flags structure for next mb
+                    // Increment pointer into gf usage flags structure.
+                    x->gf_active_ptr++;
 
-                    for (i = 0; i < 16; i++)
-                        vpx_memcpy(&xd->mode_info_context->bmi[i], &xd->block[i].bmi, sizeof(xd->block[i].bmi));
+                    // Increment the activity mask pointers.
+                    x->mb_activity_ptr++;
 
                     // adjust to the next column of macroblocks
                     x->src.y_buffer += 16;
@@ -259,7 +260,6 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
                 // this is to account for the border
                 xd->mode_info_context++;
                 x->partition_info++;
-                x->activity_sum += activity_sum;
 
                 x->src.y_buffer += 16 * x->src.y_stride * (cpi->encoding_thread_count + 1) - 16 * cm->mb_cols;
                 x->src.u_buffer += 8 * x->src.uv_stride * (cpi->encoding_thread_count + 1) - 8 * cm->mb_cols;
@@ -267,6 +267,7 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
 
                 xd->mode_info_context += xd->mode_info_stride * cpi->encoding_thread_count;
                 x->partition_info += xd->mode_info_stride * cpi->encoding_thread_count;
+                x->gf_active_ptr   += cm->mb_cols * cpi->encoding_thread_count;
 
                 if (mb_row == cm->mb_rows - 1)
                 {
@@ -295,7 +296,6 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
 
     z->sadperbit16      = x->sadperbit16;
     z->sadperbit4       = x->sadperbit4;
-    z->errthresh        = x->errthresh;
 
     /*
     z->mv_col_min    = x->mv_col_min;
@@ -309,6 +309,7 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
     z->vp8_short_fdct8x4     = x->vp8_short_fdct8x4;
     z->short_walsh4x4    = x->short_walsh4x4;
     z->quantize_b        = x->quantize_b;
+    z->quantize_b_pair   = x->quantize_b_pair;
     z->optimize          = x->optimize;
 
     /*
@@ -322,8 +323,8 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
     vpx_memcpy(z->mvcosts,          x->mvcosts,         sizeof(x->mvcosts));
     z->mvcost[0] = &z->mvcosts[0][mv_max+1];
     z->mvcost[1] = &z->mvcosts[1][mv_max+1];
-    z->mvsadcost[0] = &z->mvsadcosts[0][mv_max+1];
-    z->mvsadcost[1] = &z->mvsadcosts[1][mv_max+1];
+    z->mvsadcost[0] = &z->mvsadcosts[0][mvfp_max+1];
+    z->mvsadcost[1] = &z->mvsadcosts[1][mvfp_max+1];
 
 
     vpx_memcpy(z->token_costs,       x->token_costs,      sizeof(x->token_costs));
@@ -342,6 +343,9 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
         z->block[i].zbin            = x->block[i].zbin;
         z->block[i].zrun_zbin_boost   = x->block[i].zrun_zbin_boost;
         z->block[i].round           = x->block[i].round;
+        z->q_index                  = x->q_index;
+        z->act_zbin_adj             = x->act_zbin_adj;
+        z->last_act_zbin_adj        = x->last_act_zbin_adj;
         /*
         z->block[i].src             = x->block[i].src;
         */
@@ -446,65 +450,71 @@ void vp8cx_init_mbrthread_data(VP8_COMP *cpi,
 
         vp8_setup_block_ptrs(mb);
 
-        mb->activity_sum = 0;
-
         mbd->left_context = &cm->left_context;
         mb->mvc = cm->fc.mvc;
 
         setup_mbby_copy(&mbr_ei[i].mb, x);
 
+        mbd->fullpixel_mask = 0xffffffff;
+        if(cm->full_pixel)
+            mbd->fullpixel_mask = 0xfffffff8;
     }
 }
 
 void vp8cx_create_encoder_threads(VP8_COMP *cpi)
 {
-    cpi->b_multi_threaded = 0;
+    const VP8_COMMON * cm = &cpi->common;
 
-    cpi->processor_core_count = 32; //vp8_get_proc_core_count();
+    cpi->b_multi_threaded = 0;
+    cpi->encoding_thread_count = 0;
 
-    if (cpi->processor_core_count > 1 && cpi->oxcf.multi_threaded > 1)
+    if (cm->processor_core_count > 1 && cpi->oxcf.multi_threaded > 1)
     {
         int ithread;
+        int th_count = cpi->oxcf.multi_threaded - 1;
 
-        if (cpi->oxcf.multi_threaded > cpi->processor_core_count)
-            cpi->encoding_thread_count = cpi->processor_core_count - 1;
-        else
-            cpi->encoding_thread_count = cpi->oxcf.multi_threaded - 1;
+        /* don't allocate more threads than cores available */
+        if (cpi->oxcf.multi_threaded > cm->processor_core_count)
+            th_count = cm->processor_core_count - 1;
 
-        CHECK_MEM_ERROR(cpi->h_encoding_thread, vpx_malloc(sizeof(pthread_t) * cpi->encoding_thread_count));
-        CHECK_MEM_ERROR(cpi->h_event_start_encoding, vpx_malloc(sizeof(sem_t) * cpi->encoding_thread_count));
-        CHECK_MEM_ERROR(cpi->mb_row_ei, vpx_memalign(32, sizeof(MB_ROW_COMP) * cpi->encoding_thread_count));
-        vpx_memset(cpi->mb_row_ei, 0, sizeof(MB_ROW_COMP) * cpi->encoding_thread_count);
-        CHECK_MEM_ERROR(cpi->en_thread_data, vpx_malloc(sizeof(ENCODETHREAD_DATA) * cpi->encoding_thread_count));
-        CHECK_MEM_ERROR(cpi->mt_current_mb_col, vpx_malloc(sizeof(*cpi->mt_current_mb_col) * cpi->common.mb_rows));
+        /* we have th_count + 1 (main) threads processing one row each */
+        /* no point to have more threads than the sync range allows */
+        if(th_count > ((cm->mb_cols / cpi->mt_sync_range) - 1))
+        {
+            th_count = (cm->mb_cols / cpi->mt_sync_range) - 1;
+        }
+
+        if(th_count == 0)
+            return;
+
+        CHECK_MEM_ERROR(cpi->h_encoding_thread, vpx_malloc(sizeof(pthread_t) * th_count));
+        CHECK_MEM_ERROR(cpi->h_event_start_encoding, vpx_malloc(sizeof(sem_t) * th_count));
+        CHECK_MEM_ERROR(cpi->mb_row_ei, vpx_memalign(32, sizeof(MB_ROW_COMP) * th_count));
+        vpx_memset(cpi->mb_row_ei, 0, sizeof(MB_ROW_COMP) * th_count);
+        CHECK_MEM_ERROR(cpi->en_thread_data,
+                        vpx_malloc(sizeof(ENCODETHREAD_DATA) * th_count));
+        CHECK_MEM_ERROR(cpi->mt_current_mb_col,
+                        vpx_malloc(sizeof(*cpi->mt_current_mb_col) * cm->mb_rows));
 
-        //cpi->h_event_main = CreateEvent(NULL, FALSE, FALSE, NULL);
         sem_init(&cpi->h_event_end_encoding, 0, 0);
 
         cpi->b_multi_threaded = 1;
+        cpi->encoding_thread_count = th_count;
 
-        //printf("[VP8:] multi_threaded encoding is enabled with %d threads\n\n", (cpi->encoding_thread_count +1));
+        /*
+        printf("[VP8:] multi_threaded encoding is enabled with %d threads\n\n",
+               (cpi->encoding_thread_count +1));
+        */
 
-        for (ithread = 0; ithread < cpi->encoding_thread_count; ithread++)
+        for (ithread = 0; ithread < th_count; ithread++)
         {
             ENCODETHREAD_DATA * ethd = &cpi->en_thread_data[ithread];
 
-            //cpi->h_event_mbrencoding[ithread] = CreateEvent(NULL, FALSE, FALSE, NULL);
             sem_init(&cpi->h_event_start_encoding[ithread], 0, 0);
             ethd->ithread = ithread;
             ethd->ptr1 = (void *)cpi;
             ethd->ptr2 = (void *)&cpi->mb_row_ei[ithread];
 
-            //printf(" call begin thread %d \n", ithread);
-
-            //cpi->h_encoding_thread[ithread] =   (HANDLE)_beginthreadex(
-            //  NULL,           // security
-            //  0,              // stksize
-            //  thread_encoding_proc,
-            //  (&cpi->en_thread_data[ithread]),          // Thread data
-            //  0,
-            //  NULL);
-
             pthread_create(&cpi->h_encoding_thread[ithread], 0, thread_encoding_proc, ethd);
         }