Swap alt/gold/new/last frame buffer ptrs instead of copying.
[profile/ivi/libvpx.git] / vp8 / decoder / threading.c
/*
 *  Copyright (c) 2010 The VP8 project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */
10
11
12 #ifndef WIN32
13 # include <unistd.h>
14 #endif
15 #include "onyxd_int.h"
16 #include "vpx_mem/vpx_mem.h"
17 #include "threading.h"
18
19 #include "loopfilter.h"
20 #include "extend.h"
21 #include "vpx_ports/vpx_timer.h"
22
23 extern void vp8_decode_mb_row(VP8D_COMP *pbi,
24                               VP8_COMMON *pc,
25                               int mb_row,
26                               MACROBLOCKD *xd);
27
28 extern void vp8_build_uvmvs(MACROBLOCKD *x, int fullpixel);
29 extern void vp8_decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd);
30
31 void vp8_setup_decoding_thread_data(VP8D_COMP *pbi, MACROBLOCKD *xd, MB_ROW_DEC *mbrd, int count)
32 {
33
34
35
36 #if CONFIG_MULTITHREAD
37     VP8_COMMON *const pc = & pbi->common;
38     int i, j;
39
40     for (i = 0; i < count; i++)
41     {
42         MACROBLOCKD *mbd = &mbrd[i].mbd;
43 #if CONFIG_RUNTIME_CPU_DETECT
44         mbd->rtcd = xd->rtcd;
45 #endif
46
47
48         mbd->subpixel_predict        = xd->subpixel_predict;
49         mbd->subpixel_predict8x4     = xd->subpixel_predict8x4;
50         mbd->subpixel_predict8x8     = xd->subpixel_predict8x8;
51         mbd->subpixel_predict16x16   = xd->subpixel_predict16x16;
52         mbd->gf_active_ptr            = xd->gf_active_ptr;
53
54         mbd->mode_info        = pc->mi - 1;
55         mbd->mode_info_context = pc->mi   + pc->mode_info_stride * (i + 1);
56         mbd->mode_info_stride  = pc->mode_info_stride;
57
58         mbd->frame_type = pc->frame_type;
59         mbd->frames_since_golden      = pc->frames_since_golden;
60         mbd->frames_till_alt_ref_frame  = pc->frames_till_alt_ref_frame;
61
62         mbd->pre = pc->yv12_fb[pc->lst_fb_idx];
63         mbd->dst = pc->yv12_fb[pc->new_fb_idx];
64
65         vp8_setup_block_dptrs(mbd);
66         vp8_build_block_doffsets(mbd);
67         mbd->segmentation_enabled    = xd->segmentation_enabled;
68         mbd->mb_segement_abs_delta     = xd->mb_segement_abs_delta;
69         vpx_memcpy(mbd->segment_feature_data, xd->segment_feature_data, sizeof(xd->segment_feature_data));
70
71         mbd->mbmi.mode = DC_PRED;
72         mbd->mbmi.uv_mode = DC_PRED;
73
74         mbd->current_bc = &pbi->bc2;
75
76         for (j = 0; j < 25; j++)
77         {
78             mbd->block[j].dequant = xd->block[j].dequant;
79         }
80     }
81
82 #else
83     (void) pbi;
84     (void) xd;
85     (void) mbrd;
86     (void) count;
87 #endif
88 }
89
90
91 THREAD_FUNCTION vp8_thread_decoding_proc(void *p_data)
92 {
93 #if CONFIG_MULTITHREAD
94     int ithread = ((DECODETHREAD_DATA *)p_data)->ithread;
95     VP8D_COMP *pbi = (VP8D_COMP *)(((DECODETHREAD_DATA *)p_data)->ptr1);
96     MB_ROW_DEC *mbrd = (MB_ROW_DEC *)(((DECODETHREAD_DATA *)p_data)->ptr2);
97     ENTROPY_CONTEXT mb_row_left_context[4][4];
98
99     while (1)
100     {
101         if (pbi->b_multithreaded_rd == 0)
102             break;
103
104         //if(WaitForSingleObject(pbi->h_event_mbrdecoding[ithread], INFINITE) == WAIT_OBJECT_0)
105         if (sem_wait(&pbi->h_event_mbrdecoding[ithread]) == 0)
106         {
107             if (pbi->b_multithreaded_rd == 0)
108                 break;
109             else
110             {
111                 VP8_COMMON *pc = &pbi->common;
112                 int mb_row       = mbrd->mb_row;
113                 MACROBLOCKD *xd = &mbrd->mbd;
114
115                 //printf("ithread:%d mb_row %d\n", ithread, mb_row);
116                 int i;
117                 int recon_yoffset, recon_uvoffset;
118                 int mb_col;
119                 int ref_fb_idx = pc->lst_fb_idx;
120                 int dst_fb_idx = pc->new_fb_idx;
121                 int recon_y_stride = pc->yv12_fb[ref_fb_idx].y_stride;
122                 int recon_uv_stride = pc->yv12_fb[ref_fb_idx].uv_stride;
123
124                 volatile int *last_row_current_mb_col;
125
126                 if (ithread > 0)
127                     last_row_current_mb_col = &pbi->mb_row_di[ithread-1].current_mb_col;
128                 else
129                     last_row_current_mb_col = &pbi->current_mb_col_main;
130
131                 recon_yoffset = mb_row * recon_y_stride * 16;
132                 recon_uvoffset = mb_row * recon_uv_stride * 8;
133                 // reset above block coeffs
134
135                 xd->above_context[Y1CONTEXT] = pc->above_context[Y1CONTEXT];
136                 xd->above_context[UCONTEXT ] = pc->above_context[UCONTEXT];
137                 xd->above_context[VCONTEXT ] = pc->above_context[VCONTEXT];
138                 xd->above_context[Y2CONTEXT] = pc->above_context[Y2CONTEXT];
139                 xd->left_context = mb_row_left_context;
140                 vpx_memset(mb_row_left_context, 0, sizeof(mb_row_left_context));
141                 xd->up_available = (mb_row != 0);
142
143                 xd->mb_to_top_edge = -((mb_row * 16)) << 3;
144                 xd->mb_to_bottom_edge = ((pc->mb_rows - 1 - mb_row) * 16) << 3;
145
146                 for (mb_col = 0; mb_col < pc->mb_cols; mb_col++)
147                 {
148
149                     while (mb_col > (*last_row_current_mb_col - 1) && *last_row_current_mb_col != pc->mb_cols - 1)
150                     {
151                         x86_pause_hint();
152                         thread_sleep(0);
153                     }
154
155                     // Take a copy of the mode and Mv information for this macroblock into the xd->mbmi
156                     // the partition_bmi array is unused in the decoder, so don't copy it.
157                     vpx_memcpy(&xd->mbmi, &xd->mode_info_context->mbmi,
158                                sizeof(MB_MODE_INFO) - sizeof(xd->mbmi.partition_bmi));
159
160                     if (xd->mbmi.mode == SPLITMV || xd->mbmi.mode == B_PRED)
161                     {
162                         for (i = 0; i < 16; i++)
163                         {
164                             BLOCKD *d = &xd->block[i];
165                             vpx_memcpy(&d->bmi, &xd->mode_info_context->bmi[i], sizeof(B_MODE_INFO));
166                         }
167                     }
168
169                     // Distance of Mb to the various image edges.
170                     // These specified to 8th pel as they are always compared to values that are in 1/8th pel units
171                     xd->mb_to_left_edge = -((mb_col * 16) << 3);
172                     xd->mb_to_right_edge = ((pc->mb_cols - 1 - mb_col) * 16) << 3;
173
174                     xd->dst.y_buffer = pc->yv12_fb[dst_fb_idx].y_buffer + recon_yoffset;
175                     xd->dst.u_buffer = pc->yv12_fb[dst_fb_idx].u_buffer + recon_uvoffset;
176                     xd->dst.v_buffer = pc->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset;
177
178                     xd->left_available = (mb_col != 0);
179
180                     // Select the appropriate reference frame for this MB
181                     if (xd->mbmi.ref_frame == LAST_FRAME)
182                         ref_fb_idx = pc->lst_fb_idx;
183                     else if (xd->mbmi.ref_frame == GOLDEN_FRAME)
184                         ref_fb_idx = pc->gld_fb_idx;
185                     else
186                         ref_fb_idx = pc->alt_fb_idx;
187
188                     xd->pre.y_buffer = pc->yv12_fb[ref_fb_idx].y_buffer + recon_yoffset;
189                     xd->pre.u_buffer = pc->yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
190                     xd->pre.v_buffer = pc->yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
191
192                     vp8_build_uvmvs(xd, pc->full_pixel);
193
194                     vp8_decode_macroblock(pbi, xd);
195
196
197                     recon_yoffset += 16;
198                     recon_uvoffset += 8;
199
200                     ++xd->mode_info_context;  /* next mb */
201
202                     xd->gf_active_ptr++;      // GF useage flag for next MB
203
204                     xd->above_context[Y1CONTEXT] += 4;
205                     xd->above_context[UCONTEXT ] += 2;
206                     xd->above_context[VCONTEXT ] += 2;
207                     xd->above_context[Y2CONTEXT] ++;
208                     pbi->mb_row_di[ithread].current_mb_col = mb_col;
209
210                 }
211
212                 // adjust to the next row of mbs
213                 vp8_extend_mb_row(
214                     &pc->yv12_fb[dst_fb_idx],
215                     xd->dst.y_buffer + 16, xd->dst.u_buffer + 8, xd->dst.v_buffer + 8
216                 );
217
218                 ++xd->mode_info_context;      /* skip prediction column */
219
220                 // since we have multithread
221                 xd->mode_info_context += xd->mode_info_stride * pbi->decoding_thread_count;
222
223                 //memcpy(&pbi->lpfmb, &pbi->mb, sizeof(pbi->mb));
224                 if ((mb_row & 1) == 1)
225                 {
226                     pbi->last_mb_row_decoded = mb_row;
227                     //printf("S%d", pbi->last_mb_row_decoded);
228                 }
229
230                 if (ithread == (pbi->decoding_thread_count - 1) || mb_row == pc->mb_rows - 1)
231                 {
232                     //SetEvent(pbi->h_event_main);
233                     sem_post(&pbi->h_event_main);
234
235                 }
236             }
237         }
238     }
239
240 #else
241     (void) p_data;
242 #endif
243
244     return 0 ;
245 }
246
247 THREAD_FUNCTION vp8_thread_loop_filter(void *p_data)
248 {
249 #if CONFIG_MULTITHREAD
250     VP8D_COMP *pbi = (VP8D_COMP *)p_data;
251
252     while (1)
253     {
254         if (pbi->b_multithreaded_lf == 0)
255             break;
256
257         //printf("before waiting for start_lpf\n");
258
259         //if(WaitForSingleObject(pbi->h_event_start_lpf, INFINITE) == WAIT_OBJECT_0)
260         if (sem_wait(&pbi->h_event_start_lpf) == 0)
261         {
262             if (pbi->b_multithreaded_lf == 0) // we're shutting down
263                 break;
264             else
265             {
266
267                 VP8_COMMON *cm  = &pbi->common;
268                 MACROBLOCKD *mbd = &pbi->lpfmb;
269                 int default_filt_lvl = pbi->common.filter_level;
270
271                 YV12_BUFFER_CONFIG *post = &cm->yv12_fb[cm->new_fb_idx];
272                 loop_filter_info *lfi = cm->lf_info;
273                 int frame_type = cm->frame_type;
274
275                 int mb_row;
276                 int mb_col;
277
278                 int baseline_filter_level[MAX_MB_SEGMENTS];
279                 int filter_level;
280                 int alt_flt_enabled = mbd->segmentation_enabled;
281
282                 int i;
283                 unsigned char *y_ptr, *u_ptr, *v_ptr;
284
285                 volatile int *last_mb_row_decoded = &pbi->last_mb_row_decoded;
286
287                 //MODE_INFO * this_mb_mode_info = cm->mi;
288                 mbd->mode_info_context = cm->mi;          // Point at base of Mb MODE_INFO list
289
290                 // Note the baseline filter values for each segment
291                 if (alt_flt_enabled)
292                 {
293                     for (i = 0; i < MAX_MB_SEGMENTS; i++)
294                     {
295                         if (mbd->mb_segement_abs_delta == SEGMENT_ABSDATA)
296                             baseline_filter_level[i] = mbd->segment_feature_data[MB_LVL_ALT_LF][i];
297                         else
298                         {
299                             baseline_filter_level[i] = default_filt_lvl + mbd->segment_feature_data[MB_LVL_ALT_LF][i];
300                             baseline_filter_level[i] = (baseline_filter_level[i] >= 0) ? ((baseline_filter_level[i] <= MAX_LOOP_FILTER) ? baseline_filter_level[i] : MAX_LOOP_FILTER) : 0;  // Clamp to valid range
301                         }
302                     }
303                 }
304                 else
305                 {
306                     for (i = 0; i < MAX_MB_SEGMENTS; i++)
307                         baseline_filter_level[i] = default_filt_lvl;
308                 }
309
310                 // Initialize the loop filter for this frame.
311                 if ((cm->last_filter_type != cm->filter_type) || (cm->last_sharpness_level != cm->sharpness_level))
312                     vp8_init_loop_filter(cm);
313                 else if (frame_type != cm->last_frame_type)
314                     vp8_frame_init_loop_filter(lfi, frame_type);
315
316                 // Set up the buffer pointers
317                 y_ptr = post->y_buffer;
318                 u_ptr = post->u_buffer;
319                 v_ptr = post->v_buffer;
320
321                 // vp8_filter each macro block
322                 for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
323                 {
324
325                     while (mb_row >= *last_mb_row_decoded)
326                     {
327                         x86_pause_hint();
328                         thread_sleep(0);
329                     }
330
331                     //printf("R%d", mb_row);
332                     for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
333                     {
334                         int Segment = (alt_flt_enabled) ? mbd->mode_info_context->mbmi.segment_id : 0;
335
336                         filter_level = baseline_filter_level[Segment];
337
338                         // Apply any context driven MB level adjustment
339                         vp8_adjust_mb_lf_value(mbd, &filter_level);
340
341                         if (filter_level)
342                         {
343                             if (mb_col > 0)
344                                 cm->lf_mbv(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
345
346                             if (mbd->mode_info_context->mbmi.dc_diff > 0)
347                                 cm->lf_bv(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
348
349                             // don't apply across umv border
350                             if (mb_row > 0)
351                                 cm->lf_mbh(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
352
353                             if (mbd->mode_info_context->mbmi.dc_diff > 0)
354                                 cm->lf_bh(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf);
355                         }
356
357                         y_ptr += 16;
358                         u_ptr += 8;
359                         v_ptr += 8;
360
361                         mbd->mode_info_context++;     // step to next MB
362
363                     }
364
365                     y_ptr += post->y_stride  * 16 - post->y_width;
366                     u_ptr += post->uv_stride *  8 - post->uv_width;
367                     v_ptr += post->uv_stride *  8 - post->uv_width;
368
369                     mbd->mode_info_context++;         // Skip border mb
370                 }
371
372                 //printf("R%d\n", mb_row);
373                 // When done, signal main thread that ME is finished
374                 //SetEvent(pbi->h_event_lpf);
375                 sem_post(&pbi->h_event_lpf);
376             }
377
378         }
379     }
380
381 #else
382     (void) p_data;
383 #endif
384     return 0;
385 }
386
387 void vp8_decoder_create_threads(VP8D_COMP *pbi)
388 {
389 #if CONFIG_MULTITHREAD
390     int core_count = 0;
391     int ithread;
392
393     pbi->b_multithreaded_rd = 0;
394     pbi->b_multithreaded_lf = 0;
395     pbi->allocated_decoding_thread_count = 0;
396     core_count = (pbi->max_threads > 16) ? 16 : pbi->max_threads; //vp8_get_proc_core_count();
397     if (core_count > 1)
398     {
399         sem_init(&pbi->h_event_lpf, 0, 0);
400         sem_init(&pbi->h_event_start_lpf, 0, 0);
401         pbi->b_multithreaded_lf = 1;
402         pthread_create(&pbi->h_thread_lpf, 0, vp8_thread_loop_filter, (pbi));
403     }
404
405     if (core_count > 1)
406     {
407         pbi->b_multithreaded_rd = 1;
408         pbi->decoding_thread_count = core_count - 1;
409
410         CHECK_MEM_ERROR(pbi->h_decoding_thread, vpx_malloc(sizeof(pthread_t) * pbi->decoding_thread_count));
411         CHECK_MEM_ERROR(pbi->h_event_mbrdecoding, vpx_malloc(sizeof(sem_t) * pbi->decoding_thread_count));
412         CHECK_MEM_ERROR(pbi->mb_row_di, vpx_memalign(32, sizeof(MB_ROW_DEC) * pbi->decoding_thread_count));
413         vpx_memset(pbi->mb_row_di, 0, sizeof(MB_ROW_DEC) * pbi->decoding_thread_count);
414         CHECK_MEM_ERROR(pbi->de_thread_data, vpx_malloc(sizeof(DECODETHREAD_DATA) * pbi->decoding_thread_count));
415
416         for (ithread = 0; ithread < pbi->decoding_thread_count; ithread++)
417         {
418             sem_init(&pbi->h_event_mbrdecoding[ithread], 0, 0);
419
420             pbi->de_thread_data[ithread].ithread  = ithread;
421             pbi->de_thread_data[ithread].ptr1     = (void *)pbi;
422             pbi->de_thread_data[ithread].ptr2     = (void *) &pbi->mb_row_di[ithread];
423
424             pthread_create(&pbi->h_decoding_thread[ithread], 0, vp8_thread_decoding_proc, (&pbi->de_thread_data[ithread]));
425
426         }
427
428         sem_init(&pbi->h_event_main, 0, 0);
429         pbi->allocated_decoding_thread_count = pbi->decoding_thread_count;
430     }
431
432 #else
433     (void) pbi;
434 #endif
435 }
436
437 void vp8_decoder_remove_threads(VP8D_COMP *pbi)
438 {
439 #if CONFIG_MULTITHREAD
440
441     if (pbi->b_multithreaded_lf)
442     {
443         pbi->b_multithreaded_lf = 0;
444         sem_post(&pbi->h_event_start_lpf);
445         pthread_join(pbi->h_thread_lpf, 0);
446         sem_destroy(&pbi->h_event_start_lpf);
447     }
448
449     //shutdown MB Decoding thread;
450     if (pbi->b_multithreaded_rd)
451     {
452         pbi->b_multithreaded_rd = 0;
453         // allow all threads to exit
454         {
455             int i;
456
457             for (i = 0; i < pbi->allocated_decoding_thread_count; i++)
458             {
459
460                 sem_post(&pbi->h_event_mbrdecoding[i]);
461                 pthread_join(pbi->h_decoding_thread[i], NULL);
462             }
463         }
464         {
465
466             int i;
467             for (i = 0; i < pbi->allocated_decoding_thread_count; i++)
468             {
469                 sem_destroy(&pbi->h_event_mbrdecoding[i]);
470             }
471
472
473         }
474
475         sem_destroy(&pbi->h_event_main);
476
477         if (pbi->h_decoding_thread)
478         {
479             vpx_free(pbi->h_decoding_thread);
480             pbi->h_decoding_thread = NULL;
481         }
482
483         if (pbi->h_event_mbrdecoding)
484         {
485             vpx_free(pbi->h_event_mbrdecoding);
486             pbi->h_event_mbrdecoding = NULL;
487         }
488
489         if (pbi->mb_row_di)
490         {
491             vpx_free(pbi->mb_row_di);
492             pbi->mb_row_di = NULL ;
493         }
494
495         if (pbi->de_thread_data)
496         {
497             vpx_free(pbi->de_thread_data);
498             pbi->de_thread_data = NULL;
499         }
500     }
501
502 #else
503     (void) pbi;
504 #endif
505 }
506
507
508 void vp8_start_lfthread(VP8D_COMP *pbi)
509 {
510 #if CONFIG_MULTITHREAD
511     memcpy(&pbi->lpfmb, &pbi->mb, sizeof(pbi->mb));
512     pbi->last_mb_row_decoded = 0;
513     sem_post(&pbi->h_event_start_lpf);
514 #else
515     (void) pbi;
516 #endif
517 }
518
519 void vp8_stop_lfthread(VP8D_COMP *pbi)
520 {
521 #if CONFIG_MULTITHREAD
522     struct vpx_usec_timer timer;
523
524     vpx_usec_timer_start(&timer);
525
526     sem_wait(&pbi->h_event_lpf);
527
528     vpx_usec_timer_mark(&timer);
529     pbi->time_loop_filtering += vpx_usec_timer_elapsed(&timer);
530 #else
531     (void) pbi;
532 #endif
533 }
534
535
536 void vp8_mtdecode_mb_rows(VP8D_COMP *pbi,
537                           MACROBLOCKD *xd)
538 {
539 #if CONFIG_MULTITHREAD
540     int mb_row;
541     VP8_COMMON *pc = &pbi->common;
542
543     int ibc = 0;
544     int num_part = 1 << pbi->common.multi_token_partition;
545
546     vp8_setup_decoding_thread_data(pbi, xd, pbi->mb_row_di, pbi->decoding_thread_count);
547
548     for (mb_row = 0; mb_row < pc->mb_rows; mb_row += (pbi->decoding_thread_count + 1))
549     {
550         int i;
551         pbi->current_mb_col_main = -1;
552
553         xd->current_bc = &pbi->mbc[ibc];
554         ibc++ ;
555
556         if (ibc == num_part)
557             ibc = 0;
558
559         for (i = 0; i < pbi->decoding_thread_count; i++)
560         {
561             if ((mb_row + i + 1) >= pc->mb_rows)
562                 break;
563
564             pbi->mb_row_di[i].mb_row = mb_row + i + 1;
565             pbi->mb_row_di[i].mbd.current_bc =  &pbi->mbc[ibc];
566             ibc++;
567
568             if (ibc == num_part)
569                 ibc = 0;
570
571             pbi->mb_row_di[i].current_mb_col = -1;
572             sem_post(&pbi->h_event_mbrdecoding[i]);
573         }
574
575         vp8_decode_mb_row(pbi, pc, mb_row, xd);
576
577         xd->mode_info_context += xd->mode_info_stride * pbi->decoding_thread_count;
578
579         if (mb_row < pc->mb_rows - 1)
580         {
581             sem_wait(&pbi->h_event_main);
582         }
583     }
584
585     pbi->last_mb_row_decoded = mb_row;
586 #else
587     (void) pbi;
588     (void) xd;
589 #endif
590 }