src/third_party/ffmpeg/libavcodec/vp8.c

   1 /*
   2  * VP7/VP8 compatible video decoder
   3  *
   4  * Copyright (C) 2010 David Conrad
   5  * Copyright (C) 2010 Ronald S. Bultje
   6  * Copyright (C) 2010 Fiona Glaser
   7  * Copyright (C) 2012 Daniel Kang
   8  * Copyright (C) 2014 Peter Ross
   9  *
  10  * This file is part of FFmpeg.
  11  *
  12  * FFmpeg is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public
  14  * License as published by the Free Software Foundation; either
  15  * version 2.1 of the License, or (at your option) any later version.
  16  *
  17  * FFmpeg is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with FFmpeg; if not, write to the Free Software
  24  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  25  */
  26
  27 #include "libavutil/imgutils.h"
  28
  29 #include "avcodec.h"
  30 #include "internal.h"
  31 #include "rectangle.h"
  32 #include "thread.h"
  33 #include "vp8.h"
  34 #include "vp8data.h"
  35
  36 #if ARCH_ARM
  37 #   include "arm/vp8.h"
  38 #endif
  39
/*
 * VPX(vp7, f): name the codec-specific variant of helper `f` by token pasting.
 * When both decoders are compiled in, the expression picks vp7_f or vp8_f
 * from the `vp7` argument; when only one decoder is configured the macro
 * expands directly to that decoder's symbol and `vp7` is ignored.
 */
#if CONFIG_VP7_DECODER && CONFIG_VP8_DECODER
#define VPX(vp7, f) (vp7 ? vp7_ ## f : vp8_ ## f)
#elif CONFIG_VP7_DECODER
#define VPX(vp7, f) vp7_ ## f
#else // CONFIG_VP8_DECODER
#define VPX(vp7, f) vp8_ ## f
#endif
  47
  48 static void free_buffers(VP8Context *s)
  49 {
  50     int i;
  51     if (s->thread_data)
  52         for (i = 0; i < MAX_THREADS; i++) {
  53 #if HAVE_THREADS
  54             pthread_cond_destroy(&s->thread_data[i].cond);
  55             pthread_mutex_destroy(&s->thread_data[i].lock);
  56 #endif
  57             av_freep(&s->thread_data[i].filter_strength);
  58         }
  59     av_freep(&s->thread_data);
  60     av_freep(&s->macroblocks_base);
  61     av_freep(&s->intra4x4_pred_mode_top);
  62     av_freep(&s->top_nnz);
  63     av_freep(&s->top_border);
  64
  65     s->macroblocks = NULL;
  66 }
  67
  68 static int vp8_alloc_frame(VP8Context *s, VP8Frame *f, int ref)
  69 {
  70     int ret;
  71     if ((ret = ff_thread_get_buffer(s->avctx, &f->tf,
  72                                     ref ? AV_GET_BUFFER_FLAG_REF : 0)) < 0)
  73         return ret;
  74     if (!(f->seg_map = av_buffer_allocz(s->mb_width * s->mb_height))) {
  75         ff_thread_release_buffer(s->avctx, &f->tf);
  76         return AVERROR(ENOMEM);
  77     }
  78     return 0;
  79 }
  80
  81 static void vp8_release_frame(VP8Context *s, VP8Frame *f)
  82 {
  83     av_buffer_unref(&f->seg_map);
  84     ff_thread_release_buffer(s->avctx, &f->tf);
  85 }
  86
  87 #if CONFIG_VP8_DECODER
  88 static int vp8_ref_frame(VP8Context *s, VP8Frame *dst, VP8Frame *src)
  89 {
  90     int ret;
  91
  92     vp8_release_frame(s, dst);
  93
  94     if ((ret = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0)
  95         return ret;
  96     if (src->seg_map &&
  97         !(dst->seg_map = av_buffer_ref(src->seg_map))) {
  98         vp8_release_frame(s, dst);
  99         return AVERROR(ENOMEM);
 100     }
 101
 102     return 0;
 103 }
 104 #endif /* CONFIG_VP8_DECODER */
 105
 106 static void vp8_decode_flush_impl(AVCodecContext *avctx, int free_mem)
 107 {
 108     VP8Context *s = avctx->priv_data;
 109     int i;
 110
 111     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
 112         vp8_release_frame(s, &s->frames[i]);
 113     memset(s->framep, 0, sizeof(s->framep));
 114
 115     if (free_mem)
 116         free_buffers(s);
 117 }
 118
 119 static void vp8_decode_flush(AVCodecContext *avctx)
 120 {
 121     vp8_decode_flush_impl(avctx, 0);
 122 }
 123
 124 static VP8Frame *vp8_find_free_buffer(VP8Context *s)
 125 {
 126     VP8Frame *frame = NULL;
 127     int i;
 128
 129     // find a free buffer
 130     for (i = 0; i < 5; i++)
 131         if (&s->frames[i] != s->framep[VP56_FRAME_CURRENT]  &&
 132             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
 133             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN]   &&
 134             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2]) {
 135             frame = &s->frames[i];
 136             break;
 137         }
 138     if (i == 5) {
 139         av_log(s->avctx, AV_LOG_FATAL, "Ran out of free frames!\n");
 140         abort();
 141     }
 142     if (frame->tf.f->data[0])
 143         vp8_release_frame(s, frame);
 144
 145     return frame;
 146 }
 147
 148 static av_always_inline
 149 int update_dimensions(VP8Context *s, int width, int height, int is_vp7)
 150 {
 151     AVCodecContext *avctx = s->avctx;
 152     int i, ret;
 153
 154     if (width  != s->avctx->width || ((width+15)/16 != s->mb_width || (height+15)/16 != s->mb_height) && s->macroblocks_base ||
 155         height != s->avctx->height) {
 156         vp8_decode_flush_impl(s->avctx, 1);
 157
 158         ret = ff_set_dimensions(s->avctx, width, height);
 159         if (ret < 0)
 160             return ret;
 161     }
 162
 163     s->mb_width  = (s->avctx->coded_width  + 15) / 16;
 164     s->mb_height = (s->avctx->coded_height + 15) / 16;
 165
 166     s->mb_layout = is_vp7 || avctx->active_thread_type == FF_THREAD_SLICE &&
 167                    FFMIN(s->num_coeff_partitions, avctx->thread_count) > 1;
 168     if (!s->mb_layout) { // Frame threading and one thread
 169         s->macroblocks_base       = av_mallocz((s->mb_width + s->mb_height * 2 + 1) *
 170                                                sizeof(*s->macroblocks));
 171         s->intra4x4_pred_mode_top = av_mallocz(s->mb_width * 4);
 172     } else // Sliced threading
 173         s->macroblocks_base = av_mallocz((s->mb_width + 2) * (s->mb_height + 2) *
 174                                          sizeof(*s->macroblocks));
 175     s->top_nnz     = av_mallocz(s->mb_width * sizeof(*s->top_nnz));
 176     s->top_border  = av_mallocz((s->mb_width + 1) * sizeof(*s->top_border));
 177     s->thread_data = av_mallocz(MAX_THREADS * sizeof(VP8ThreadData));
 178
 179     for (i = 0; i < MAX_THREADS; i++) {
 180         s->thread_data[i].filter_strength =
 181             av_mallocz(s->mb_width * sizeof(*s->thread_data[0].filter_strength));
 182 #if HAVE_THREADS
 183         pthread_mutex_init(&s->thread_data[i].lock, NULL);
 184         pthread_cond_init(&s->thread_data[i].cond, NULL);
 185 #endif
 186     }
 187
 188     if (!s->macroblocks_base || !s->top_nnz || !s->top_border ||
 189         (!s->intra4x4_pred_mode_top && !s->mb_layout))
 190         return AVERROR(ENOMEM);
 191
 192     s->macroblocks = s->macroblocks_base + 1;
 193
 194     return 0;
 195 }
 196
 197 static int vp7_update_dimensions(VP8Context *s, int width, int height)
 198 {
 199     return update_dimensions(s, width, height, IS_VP7);
 200 }
 201
 202 static int vp8_update_dimensions(VP8Context *s, int width, int height)
 203 {
 204     return update_dimensions(s, width, height, IS_VP8);
 205 }
 206
 207
 208 static void parse_segment_info(VP8Context *s)
 209 {
 210     VP56RangeCoder *c = &s->c;
 211     int i;
 212
 213     s->segmentation.update_map = vp8_rac_get(c);
 214
 215     if (vp8_rac_get(c)) { // update segment feature data
 216         s->segmentation.absolute_vals = vp8_rac_get(c);
 217
 218         for (i = 0; i < 4; i++)
 219             s->segmentation.base_quant[i]   = vp8_rac_get_sint(c, 7);
 220
 221         for (i = 0; i < 4; i++)
 222             s->segmentation.filter_level[i] = vp8_rac_get_sint(c, 6);
 223     }
 224     if (s->segmentation.update_map)
 225         for (i = 0; i < 3; i++)
 226             s->prob->segmentid[i] = vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
 227 }
 228
 229 static void update_lf_deltas(VP8Context *s)
 230 {
 231     VP56RangeCoder *c = &s->c;
 232     int i;
 233
 234     for (i = 0; i < 4; i++) {
 235         if (vp8_rac_get(c)) {
 236             s->lf_delta.ref[i] = vp8_rac_get_uint(c, 6);
 237
 238             if (vp8_rac_get(c))
 239                 s->lf_delta.ref[i] = -s->lf_delta.ref[i];
 240         }
 241     }
 242
 243     for (i = MODE_I4x4; i <= VP8_MVMODE_SPLIT; i++) {
 244         if (vp8_rac_get(c)) {
 245             s->lf_delta.mode[i] = vp8_rac_get_uint(c, 6);
 246
 247             if (vp8_rac_get(c))
 248                 s->lf_delta.mode[i] = -s->lf_delta.mode[i];
 249         }
 250     }
 251 }
 252
 253 static int setup_partitions(VP8Context *s, const uint8_t *buf, int buf_size)
 254 {
 255     const uint8_t *sizes = buf;
 256     int i;
 257
 258     s->num_coeff_partitions = 1 << vp8_rac_get_uint(&s->c, 2);
 259
 260     buf      += 3 * (s->num_coeff_partitions - 1);
 261     buf_size -= 3 * (s->num_coeff_partitions - 1);
 262     if (buf_size < 0)
 263         return -1;
 264
 265     for (i = 0; i < s->num_coeff_partitions - 1; i++) {
 266         int size = AV_RL24(sizes + 3 * i);
 267         if (buf_size - size < 0)
 268             return -1;
 269
 270         ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, size);
 271         buf      += size;
 272         buf_size -= size;
 273     }
 274     ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, buf_size);
 275
 276     return 0;
 277 }
 278
 279 static void vp7_get_quants(VP8Context *s)
 280 {
 281     VP56RangeCoder *c = &s->c;
 282
 283     int yac_qi  = vp8_rac_get_uint(c, 7);
 284     int ydc_qi  = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 285     int y2dc_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 286     int y2ac_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 287     int uvdc_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 288     int uvac_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 289
 290     s->qmat[0].luma_qmul[0]    =       vp7_ydc_qlookup[ydc_qi];
 291     s->qmat[0].luma_qmul[1]    =       vp7_yac_qlookup[yac_qi];
 292     s->qmat[0].luma_dc_qmul[0] =       vp7_y2dc_qlookup[y2dc_qi];
 293     s->qmat[0].luma_dc_qmul[1] =       vp7_y2ac_qlookup[y2ac_qi];
 294     s->qmat[0].chroma_qmul[0]  = FFMIN(vp7_ydc_qlookup[uvdc_qi], 132);
 295     s->qmat[0].chroma_qmul[1]  =       vp7_yac_qlookup[uvac_qi];
 296 }
 297
 298 static void vp8_get_quants(VP8Context *s)
 299 {
 300     VP56RangeCoder *c = &s->c;
 301     int i, base_qi;
 302
 303     int yac_qi     = vp8_rac_get_uint(c, 7);
 304     int ydc_delta  = vp8_rac_get_sint(c, 4);
 305     int y2dc_delta = vp8_rac_get_sint(c, 4);
 306     int y2ac_delta = vp8_rac_get_sint(c, 4);
 307     int uvdc_delta = vp8_rac_get_sint(c, 4);
 308     int uvac_delta = vp8_rac_get_sint(c, 4);
 309
 310     for (i = 0; i < 4; i++) {
 311         if (s->segmentation.enabled) {
 312             base_qi = s->segmentation.base_quant[i];
 313             if (!s->segmentation.absolute_vals)
 314                 base_qi += yac_qi;
 315         } else
 316             base_qi = yac_qi;
 317
 318         s->qmat[i].luma_qmul[0]    = vp8_dc_qlookup[av_clip_uintp2(base_qi + ydc_delta,  7)];
 319         s->qmat[i].luma_qmul[1]    = vp8_ac_qlookup[av_clip_uintp2(base_qi,              7)];
 320         s->qmat[i].luma_dc_qmul[0] = vp8_dc_qlookup[av_clip_uintp2(base_qi + y2dc_delta, 7)] * 2;
 321         /* 101581>>16 is equivalent to 155/100 */
 322         s->qmat[i].luma_dc_qmul[1] = vp8_ac_qlookup[av_clip_uintp2(base_qi + y2ac_delta, 7)] * 101581 >> 16;
 323         s->qmat[i].chroma_qmul[0]  = vp8_dc_qlookup[av_clip_uintp2(base_qi + uvdc_delta, 7)];
 324         s->qmat[i].chroma_qmul[1]  = vp8_ac_qlookup[av_clip_uintp2(base_qi + uvac_delta, 7)];
 325
 326         s->qmat[i].luma_dc_qmul[1] = FFMAX(s->qmat[i].luma_dc_qmul[1], 8);
 327         s->qmat[i].chroma_qmul[0]  = FFMIN(s->qmat[i].chroma_qmul[0], 132);
 328     }
 329 }
 330
 331 /**
 332  * Determine which buffers golden and altref should be updated with after this frame.
 333  * The spec isn't clear here, so I'm going by my understanding of what libvpx does
 334  *
 335  * Intra frames update all 3 references
 336  * Inter frames update VP56_FRAME_PREVIOUS if the update_last flag is set
 337  * If the update (golden|altref) flag is set, it's updated with the current frame
 338  *      if update_last is set, and VP56_FRAME_PREVIOUS otherwise.
 339  * If the flag is not set, the number read means:
 340  *      0: no update
 341  *      1: VP56_FRAME_PREVIOUS
 342  *      2: update golden with altref, or update altref with golden
 343  */
 344 static VP56Frame ref_to_update(VP8Context *s, int update, VP56Frame ref)
 345 {
 346     VP56RangeCoder *c = &s->c;
 347
 348     if (update)
 349         return VP56_FRAME_CURRENT;
 350
 351     switch (vp8_rac_get_uint(c, 2)) {
 352     case 1:
 353         return VP56_FRAME_PREVIOUS;
 354     case 2:
 355         return (ref == VP56_FRAME_GOLDEN) ? VP56_FRAME_GOLDEN2 : VP56_FRAME_GOLDEN;
 356     }
 357     return VP56_FRAME_NONE;
 358 }
 359
 360 static void vp78_reset_probability_tables(VP8Context *s)
 361 {
 362     int i, j;
 363     for (i = 0; i < 4; i++)
 364         for (j = 0; j < 16; j++)
 365             memcpy(s->prob->token[i][j], vp8_token_default_probs[i][vp8_coeff_band[j]],
 366                    sizeof(s->prob->token[i][j]));
 367 }
 368
 369 static void vp78_update_probability_tables(VP8Context *s)
 370 {
 371     VP56RangeCoder *c = &s->c;
 372     int i, j, k, l, m;
 373
 374     for (i = 0; i < 4; i++)
 375         for (j = 0; j < 8; j++)
 376             for (k = 0; k < 3; k++)
 377                 for (l = 0; l < NUM_DCT_TOKENS-1; l++)
 378                     if (vp56_rac_get_prob_branchy(c, vp8_token_update_probs[i][j][k][l])) {
 379                         int prob = vp8_rac_get_uint(c, 8);
 380                         for (m = 0; vp8_coeff_band_indexes[j][m] >= 0; m++)
 381                             s->prob->token[i][vp8_coeff_band_indexes[j][m]][k][l] = prob;
 382                     }
 383 }
 384
 385 #define VP7_MVC_SIZE 17
 386 #define VP8_MVC_SIZE 19
 387
 388 static void vp78_update_pred16x16_pred8x8_mvc_probabilities(VP8Context *s,
 389                                                             int mvc_size)
 390 {
 391     VP56RangeCoder *c = &s->c;
 392     int i, j;
 393
 394     if (vp8_rac_get(c))
 395         for (i = 0; i < 4; i++)
 396             s->prob->pred16x16[i] = vp8_rac_get_uint(c, 8);
 397     if (vp8_rac_get(c))
 398         for (i = 0; i < 3; i++)
 399             s->prob->pred8x8c[i]  = vp8_rac_get_uint(c, 8);
 400
 401     // 17.2 MV probability update
 402     for (i = 0; i < 2; i++)
 403         for (j = 0; j < mvc_size; j++)
 404             if (vp56_rac_get_prob_branchy(c, vp8_mv_update_prob[i][j]))
 405                 s->prob->mvc[i][j] = vp8_rac_get_nn(c);
 406 }
 407
 408 static void update_refs(VP8Context *s)
 409 {
 410     VP56RangeCoder *c = &s->c;
 411
 412     int update_golden = vp8_rac_get(c);
 413     int update_altref = vp8_rac_get(c);
 414
 415     s->update_golden = ref_to_update(s, update_golden, VP56_FRAME_GOLDEN);
 416     s->update_altref = ref_to_update(s, update_altref, VP56_FRAME_GOLDEN2);
 417 }
 418
 419 static void copy_chroma(AVFrame *dst, AVFrame *src, int width, int height)
 420 {
 421     int i, j;
 422
 423     for (j = 1; j < 3; j++) {
 424         for (i = 0; i < height / 2; i++)
 425             memcpy(dst->data[j] + i * dst->linesize[j],
 426                    src->data[j] + i * src->linesize[j], width / 2);
 427     }
 428 }
 429
 430 static void fade(uint8_t *dst, int dst_linesize,
 431                  const uint8_t *src, int src_linesize,
 432                  int width, int height,
 433                  int alpha, int beta)
 434 {
 435     int i, j;
 436     for (j = 0; j < height; j++) {
 437         for (i = 0; i < width; i++) {
 438             uint8_t y = src[j * src_linesize + i];
 439             dst[j * dst_linesize + i] = av_clip_uint8(y + ((y * beta) >> 8) + alpha);
 440         }
 441     }
 442 }
 443
 444 static int vp7_fade_frame(VP8Context *s, VP56RangeCoder *c)
 445 {
 446     int alpha = (int8_t) vp8_rac_get_uint(c, 8);
 447     int beta  = (int8_t) vp8_rac_get_uint(c, 8);
 448     int ret;
 449
 450     if (!s->keyframe && (alpha || beta)) {
 451         int width  = s->mb_width * 16;
 452         int height = s->mb_height * 16;
 453         AVFrame *src, *dst;
 454
 455         if (!s->framep[VP56_FRAME_PREVIOUS] ||
 456             !s->framep[VP56_FRAME_GOLDEN]) {
 457             av_log(s->avctx, AV_LOG_WARNING, "Discarding interframe without a prior keyframe!\n");
 458             return AVERROR_INVALIDDATA;
 459         }
 460
 461         dst =
 462         src = s->framep[VP56_FRAME_PREVIOUS]->tf.f;
 463
 464         /* preserve the golden frame, write a new previous frame */
 465         if (s->framep[VP56_FRAME_GOLDEN] == s->framep[VP56_FRAME_PREVIOUS]) {
 466             s->framep[VP56_FRAME_PREVIOUS] = vp8_find_free_buffer(s);
 467             if ((ret = vp8_alloc_frame(s, s->framep[VP56_FRAME_PREVIOUS], 1)) < 0)
 468                 return ret;
 469
 470             dst = s->framep[VP56_FRAME_PREVIOUS]->tf.f;
 471
 472             copy_chroma(dst, src, width, height);
 473         }
 474
 475         fade(dst->data[0], dst->linesize[0],
 476              src->data[0], src->linesize[0],
 477              width, height, alpha, beta);
 478     }
 479
 480     return 0;
 481 }
 482
 483 static int vp7_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
 484 {
 485     VP56RangeCoder *c = &s->c;
 486     int part1_size, hscale, vscale, i, j, ret;
 487     int width  = s->avctx->width;
 488     int height = s->avctx->height;
 489
 490     s->profile = (buf[0] >> 1) & 7;
 491     if (s->profile > 1) {
 492         avpriv_request_sample(s->avctx, "Unknown profile %d", s->profile);
 493         return AVERROR_INVALIDDATA;
 494     }
 495
 496     s->keyframe  = !(buf[0] & 1);
 497     s->invisible = 0;
 498     part1_size   = AV_RL24(buf) >> 4;
 499
 500     if (buf_size < 4 - s->profile + part1_size) {
 501         av_log(s->avctx, AV_LOG_ERROR, "Buffer size %d is too small, needed : %d\n", buf_size, 4 - s->profile + part1_size);
 502         return AVERROR_INVALIDDATA;
 503     }
 504
 505     buf      += 4 - s->profile;
 506     buf_size -= 4 - s->profile;
 507
 508     memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab, sizeof(s->put_pixels_tab));
 509
 510     ff_vp56_init_range_decoder(c, buf, part1_size);
 511     buf      += part1_size;
 512     buf_size -= part1_size;
 513
 514     /* A. Dimension information (keyframes only) */
 515     if (s->keyframe) {
 516         width  = vp8_rac_get_uint(c, 12);
 517         height = vp8_rac_get_uint(c, 12);
 518         hscale = vp8_rac_get_uint(c, 2);
 519         vscale = vp8_rac_get_uint(c, 2);
 520         if (hscale || vscale)
 521             avpriv_request_sample(s->avctx, "Upscaling");
 522
 523         s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
 524         vp78_reset_probability_tables(s);
 525         memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter,
 526                sizeof(s->prob->pred16x16));
 527         memcpy(s->prob->pred8x8c, vp8_pred8x8c_prob_inter,
 528                sizeof(s->prob->pred8x8c));
 529         for (i = 0; i < 2; i++)
 530             memcpy(s->prob->mvc[i], vp7_mv_default_prob[i],
 531                    sizeof(vp7_mv_default_prob[i]));
 532         memset(&s->segmentation, 0, sizeof(s->segmentation));
 533         memset(&s->lf_delta, 0, sizeof(s->lf_delta));
 534         memcpy(s->prob[0].scan, zigzag_scan, sizeof(s->prob[0].scan));
 535     }
 536
 537     if (s->keyframe || s->profile > 0)
 538         memset(s->inter_dc_pred, 0 , sizeof(s->inter_dc_pred));
 539
 540     /* B. Decoding information for all four macroblock-level features */
 541     for (i = 0; i < 4; i++) {
 542         s->feature_enabled[i] = vp8_rac_get(c);
 543         if (s->feature_enabled[i]) {
 544              s->feature_present_prob[i] = vp8_rac_get_uint(c, 8);
 545
 546              for (j = 0; j < 3; j++)
 547                  s->feature_index_prob[i][j] =
 548                      vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
 549
 550              if (vp7_feature_value_size[s->profile][i])
 551                  for (j = 0; j < 4; j++)
 552                      s->feature_value[i][j] =
 553                         vp8_rac_get(c) ? vp8_rac_get_uint(c, vp7_feature_value_size[s->profile][i]) : 0;
 554         }
 555     }
 556
 557     s->segmentation.enabled    = 0;
 558     s->segmentation.update_map = 0;
 559     s->lf_delta.enabled        = 0;
 560
 561     s->num_coeff_partitions = 1;
 562     ff_vp56_init_range_decoder(&s->coeff_partition[0], buf, buf_size);
 563
 564     if (!s->macroblocks_base || /* first frame */
 565         width != s->avctx->width || height != s->avctx->height ||
 566         (width + 15) / 16 != s->mb_width || (height + 15) / 16 != s->mb_height) {
 567         if ((ret = vp7_update_dimensions(s, width, height)) < 0)
 568             return ret;
 569     }
 570
 571     /* C. Dequantization indices */
 572     vp7_get_quants(s);
 573
 574     /* D. Golden frame update flag (a Flag) for interframes only */
 575     if (!s->keyframe) {
 576         s->update_golden = vp8_rac_get(c) ? VP56_FRAME_CURRENT : VP56_FRAME_NONE;
 577         s->sign_bias[VP56_FRAME_GOLDEN] = 0;
 578     }
 579
 580     s->update_last          = 1;
 581     s->update_probabilities = 1;
 582     s->fade_present         = 1;
 583
 584     if (s->profile > 0) {
 585         s->update_probabilities = vp8_rac_get(c);
 586         if (!s->update_probabilities)
 587             s->prob[1] = s->prob[0];
 588
 589         if (!s->keyframe)
 590             s->fade_present = vp8_rac_get(c);
 591     }
 592
 593     /* E. Fading information for previous frame */
 594     if (s->fade_present && vp8_rac_get(c)) {
 595         if ((ret = vp7_fade_frame(s ,c)) < 0)
 596             return ret;
 597     }
 598
 599     /* F. Loop filter type */
 600     if (!s->profile)
 601         s->filter.simple = vp8_rac_get(c);
 602
 603     /* G. DCT coefficient ordering specification */
 604     if (vp8_rac_get(c))
 605         for (i = 1; i < 16; i++)
 606             s->prob[0].scan[i] = zigzag_scan[vp8_rac_get_uint(c, 4)];
 607
 608     /* H. Loop filter levels  */
 609     if (s->profile > 0)
 610         s->filter.simple = vp8_rac_get(c);
 611     s->filter.level     = vp8_rac_get_uint(c, 6);
 612     s->filter.sharpness = vp8_rac_get_uint(c, 3);
 613
 614     /* I. DCT coefficient probability update; 13.3 Token Probability Updates */
 615     vp78_update_probability_tables(s);
 616
 617     s->mbskip_enabled = 0;
 618
 619     /* J. The remaining frame header data occurs ONLY FOR INTERFRAMES */
 620     if (!s->keyframe) {
 621         s->prob->intra  = vp8_rac_get_uint(c, 8);
 622         s->prob->last   = vp8_rac_get_uint(c, 8);
 623         vp78_update_pred16x16_pred8x8_mvc_probabilities(s, VP7_MVC_SIZE);
 624     }
 625
 626     return 0;
 627 }
 628
 629 static int vp8_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
 630 {
 631     VP56RangeCoder *c = &s->c;
 632     int header_size, hscale, vscale, ret;
 633     int width  = s->avctx->width;
 634     int height = s->avctx->height;
 635
 636     s->keyframe  = !(buf[0] & 1);
 637     s->profile   =  (buf[0]>>1) & 7;
 638     s->invisible = !(buf[0] & 0x10);
 639     header_size  = AV_RL24(buf) >> 5;
 640     buf      += 3;
 641     buf_size -= 3;
 642
 643     if (s->profile > 3)
 644         av_log(s->avctx, AV_LOG_WARNING, "Unknown profile %d\n", s->profile);
 645
 646     if (!s->profile)
 647         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab,
 648                sizeof(s->put_pixels_tab));
 649     else    // profile 1-3 use bilinear, 4+ aren't defined so whatever
 650         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_bilinear_pixels_tab,
 651                sizeof(s->put_pixels_tab));
 652
 653     if (header_size > buf_size - 7 * s->keyframe) {
 654         av_log(s->avctx, AV_LOG_ERROR, "Header size larger than data provided\n");
 655         return AVERROR_INVALIDDATA;
 656     }
 657
 658     if (s->keyframe) {
 659         if (AV_RL24(buf) != 0x2a019d) {
 660             av_log(s->avctx, AV_LOG_ERROR,
 661                    "Invalid start code 0x%x\n", AV_RL24(buf));
 662             return AVERROR_INVALIDDATA;
 663         }
 664         width     = AV_RL16(buf + 3) & 0x3fff;
 665         height    = AV_RL16(buf + 5) & 0x3fff;
 666         hscale    = buf[4] >> 6;
 667         vscale    = buf[6] >> 6;
 668         buf      += 7;
 669         buf_size -= 7;
 670
 671         if (hscale || vscale)
 672             avpriv_request_sample(s->avctx, "Upscaling");
 673
 674         s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
 675         vp78_reset_probability_tables(s);
 676         memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter,
 677                sizeof(s->prob->pred16x16));
 678         memcpy(s->prob->pred8x8c, vp8_pred8x8c_prob_inter,
 679                sizeof(s->prob->pred8x8c));
 680         memcpy(s->prob->mvc, vp8_mv_default_prob,
 681                sizeof(s->prob->mvc));
 682         memset(&s->segmentation, 0, sizeof(s->segmentation));
 683         memset(&s->lf_delta, 0, sizeof(s->lf_delta));
 684     }
 685
 686     ff_vp56_init_range_decoder(c, buf, header_size);
 687     buf      += header_size;
 688     buf_size -= header_size;
 689
 690     if (s->keyframe) {
 691         if (vp8_rac_get(c))
 692             av_log(s->avctx, AV_LOG_WARNING, "Unspecified colorspace\n");
 693         vp8_rac_get(c); // whether we can skip clamping in dsp functions
 694     }
 695
 696     if ((s->segmentation.enabled = vp8_rac_get(c)))
 697         parse_segment_info(s);
 698     else
 699         s->segmentation.update_map = 0; // FIXME: move this to some init function?
 700
 701     s->filter.simple    = vp8_rac_get(c);
 702     s->filter.level     = vp8_rac_get_uint(c, 6);
 703     s->filter.sharpness = vp8_rac_get_uint(c, 3);
 704
 705     if ((s->lf_delta.enabled = vp8_rac_get(c)))
 706         if (vp8_rac_get(c))
 707             update_lf_deltas(s);
 708
 709     if (setup_partitions(s, buf, buf_size)) {
 710         av_log(s->avctx, AV_LOG_ERROR, "Invalid partitions\n");
 711         return AVERROR_INVALIDDATA;
 712     }
 713
 714     if (!s->macroblocks_base || /* first frame */
 715         width != s->avctx->width || height != s->avctx->height ||
 716         (width+15)/16 != s->mb_width || (height+15)/16 != s->mb_height)
 717         if ((ret = vp8_update_dimensions(s, width, height)) < 0)
 718             return ret;
 719
 720     vp8_get_quants(s);
 721
 722     if (!s->keyframe) {
 723         update_refs(s);
 724         s->sign_bias[VP56_FRAME_GOLDEN]               = vp8_rac_get(c);
 725         s->sign_bias[VP56_FRAME_GOLDEN2 /* altref */] = vp8_rac_get(c);
 726     }
 727
 728     // if we aren't saving this frame's probabilities for future frames,
 729     // make a copy of the current probabilities
 730     if (!(s->update_probabilities = vp8_rac_get(c)))
 731         s->prob[1] = s->prob[0];
 732
 733     s->update_last = s->keyframe || vp8_rac_get(c);
 734
 735     vp78_update_probability_tables(s);
 736
 737     if ((s->mbskip_enabled = vp8_rac_get(c)))
 738         s->prob->mbskip = vp8_rac_get_uint(c, 8);
 739
 740     if (!s->keyframe) {
 741         s->prob->intra  = vp8_rac_get_uint(c, 8);
 742         s->prob->last   = vp8_rac_get_uint(c, 8);
 743         s->prob->golden = vp8_rac_get_uint(c, 8);
 744         vp78_update_pred16x16_pred8x8_mvc_probabilities(s, VP8_MVC_SIZE);
 745     }
 746
 747     return 0;
 748 }
 749
 750 static av_always_inline
 751 void clamp_mv(VP8Context *s, VP56mv *dst, const VP56mv *src)
 752 {
 753     dst->x = av_clip(src->x, s->mv_min.x, s->mv_max.x);
 754     dst->y = av_clip(src->y, s->mv_min.y, s->mv_max.y);
 755 }
 756
 757 /**
 758  * Motion vector coding, 17.1.
 759  */
 760 static av_always_inline int read_mv_component(VP56RangeCoder *c, const uint8_t *p, int vp7)
 761 {
 762     int bit, x = 0;
 763
 764     if (vp56_rac_get_prob_branchy(c, p[0])) {
 765         int i;
 766
 767         for (i = 0; i < 3; i++)
 768             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 769         for (i = (vp7 ? 7 : 9); i > 3; i--)
 770             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 771         if (!(x & (vp7 ? 0xF0 : 0xFFF0)) || vp56_rac_get_prob(c, p[12]))
 772             x += 8;
 773     } else {
 774         // small_mvtree
 775         const uint8_t *ps = p + 2;
 776         bit = vp56_rac_get_prob(c, *ps);
 777         ps += 1 + 3 * bit;
 778         x  += 4 * bit;
 779         bit = vp56_rac_get_prob(c, *ps);
 780         ps += 1 + bit;
 781         x  += 2 * bit;
 782         x  += vp56_rac_get_prob(c, *ps);
 783     }
 784
 785     return (x && vp56_rac_get_prob(c, p[1])) ? -x : x;
 786 }
 787
 788 static int vp7_read_mv_component(VP56RangeCoder *c, const uint8_t *p)
 789 {
 790     return read_mv_component(c, p, 1);
 791 }
 792
 793 static int vp8_read_mv_component(VP56RangeCoder *c, const uint8_t *p)
 794 {
 795     return read_mv_component(c, p, 0);
 796 }
 797
 798 static av_always_inline
 799 const uint8_t *get_submv_prob(uint32_t left, uint32_t top, int is_vp7)
 800 {
 801     if (is_vp7)
 802         return vp7_submv_prob;
 803
 804     if (left == top)
 805         return vp8_submv_prob[4 - !!left];
 806     if (!top)
 807         return vp8_submv_prob[2];
 808     return vp8_submv_prob[1 - !!left];
 809 }
 810
 811 /**
 812  * Split motion vector prediction, 16.4.
 813  * @returns the number of motion vectors parsed (2, 4 or 16)
 814  */
 815 static av_always_inline
 816 int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
 817                     int layout, int is_vp7)
 818 {
 819     int part_idx;
 820     int n, num;
 821     VP8Macroblock *top_mb;
 822     VP8Macroblock *left_mb = &mb[-1];
 823     const uint8_t *mbsplits_left = vp8_mbsplits[left_mb->partitioning];
 824     const uint8_t *mbsplits_top, *mbsplits_cur, *firstidx;
 825     VP56mv *top_mv;
 826     VP56mv *left_mv = left_mb->bmv;
 827     VP56mv *cur_mv  = mb->bmv;
 828
 829     if (!layout) // layout is inlined, s->mb_layout is not
 830         top_mb = &mb[2];
 831     else
 832         top_mb = &mb[-s->mb_width - 1];
 833     mbsplits_top = vp8_mbsplits[top_mb->partitioning];
 834     top_mv       = top_mb->bmv;
 835
 836     if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[0])) {
 837         if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[1]))
 838             part_idx = VP8_SPLITMVMODE_16x8 + vp56_rac_get_prob(c, vp8_mbsplit_prob[2]);
 839         else
 840             part_idx = VP8_SPLITMVMODE_8x8;
 841     } else {
 842         part_idx = VP8_SPLITMVMODE_4x4;
 843     }
 844
 845     num              = vp8_mbsplit_count[part_idx];
 846     mbsplits_cur     = vp8_mbsplits[part_idx],
 847     firstidx         = vp8_mbfirstidx[part_idx];
 848     mb->partitioning = part_idx;
 849
 850     for (n = 0; n < num; n++) {
 851         int k = firstidx[n];
 852         uint32_t left, above;
 853         const uint8_t *submv_prob;
 854
 855         if (!(k & 3))
 856             left = AV_RN32A(&left_mv[mbsplits_left[k + 3]]);
 857         else
 858             left = AV_RN32A(&cur_mv[mbsplits_cur[k - 1]]);
 859         if (k <= 3)
 860             above = AV_RN32A(&top_mv[mbsplits_top[k + 12]]);
 861         else
 862             above = AV_RN32A(&cur_mv[mbsplits_cur[k - 4]]);
 863
 864         submv_prob = get_submv_prob(left, above, is_vp7);
 865
 866         if (vp56_rac_get_prob_branchy(c, submv_prob[0])) {
 867             if (vp56_rac_get_prob_branchy(c, submv_prob[1])) {
 868                 if (vp56_rac_get_prob_branchy(c, submv_prob[2])) {
 869                     mb->bmv[n].y = mb->mv.y +
 870                                    read_mv_component(c, s->prob->mvc[0], is_vp7);
 871                     mb->bmv[n].x = mb->mv.x +
 872                                    read_mv_component(c, s->prob->mvc[1], is_vp7);
 873                 } else {
 874                     AV_ZERO32(&mb->bmv[n]);
 875                 }
 876             } else {
 877                 AV_WN32A(&mb->bmv[n], above);
 878             }
 879         } else {
 880             AV_WN32A(&mb->bmv[n], left);
 881         }
 882     }
 883
 884     return num;
 885 }
 886
 887 /**
 888  * The vp7 reference decoder uses a padding macroblock column (added to right
 889  * edge of the frame) to guard against illegal macroblock offsets. The
 890  * algorithm has bugs that permit offsets to straddle the padding column.
 891  * This function replicates those bugs.
 892  *
 893  * @param[out] edge_x macroblock x address
 894  * @param[out] edge_y macroblock y address
 895  *
 896  * @return macroblock offset legal (boolean)
 897  */
 898 static int vp7_calculate_mb_offset(int mb_x, int mb_y, int mb_width,
 899                                    int xoffset, int yoffset, int boundary,
 900                                    int *edge_x, int *edge_y)
 901 {
 902     int vwidth = mb_width + 1;
 903     int new = (mb_y + yoffset) * vwidth + mb_x + xoffset;
 904     if (new < boundary || new % vwidth == vwidth - 1)
 905         return 0;
 906     *edge_y = new / vwidth;
 907     *edge_x = new % vwidth;
 908     return 1;
 909 }
 910
 911 static const VP56mv *get_bmv_ptr(const VP8Macroblock *mb, int subblock)
 912 {
 913     return &mb->bmv[mb->mode == VP8_MVMODE_SPLIT ? vp8_mbsplits[mb->partitioning][subblock] : 0];
 914 }
 915
 916 static av_always_inline
 917 void vp7_decode_mvs(VP8Context *s, VP8Macroblock *mb,
 918                     int mb_x, int mb_y, int layout)
 919 {
 920     VP8Macroblock *mb_edge[12];
 921     enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR };
 922     enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
 923     int idx = CNT_ZERO;
 924     VP56mv near_mv[3];
 925     uint8_t cnt[3] = { 0 };
 926     VP56RangeCoder *c = &s->c;
 927     int i;
 928
 929     AV_ZERO32(&near_mv[0]);
 930     AV_ZERO32(&near_mv[1]);
 931     AV_ZERO32(&near_mv[2]);
 932
 933     for (i = 0; i < VP7_MV_PRED_COUNT; i++) {
 934         const VP7MVPred * pred = &vp7_mv_pred[i];
 935         int edge_x, edge_y;
 936
 937         if (vp7_calculate_mb_offset(mb_x, mb_y, s->mb_width, pred->xoffset,
 938                                     pred->yoffset, !s->profile, &edge_x, &edge_y)) {
 939             VP8Macroblock *edge = mb_edge[i] = (s->mb_layout == 1)
 940                                              ? s->macroblocks_base + 1 + edge_x +
 941                                                (s->mb_width + 1) * (edge_y + 1)
 942                                              : s->macroblocks + edge_x +
 943                                                (s->mb_height - edge_y - 1) * 2;
 944             uint32_t mv = AV_RN32A(get_bmv_ptr(edge, vp7_mv_pred[i].subblock));
 945             if (mv) {
 946                 if (AV_RN32A(&near_mv[CNT_NEAREST])) {
 947                     if (mv == AV_RN32A(&near_mv[CNT_NEAREST])) {
 948                         idx = CNT_NEAREST;
 949                     } else if (AV_RN32A(&near_mv[CNT_NEAR])) {
 950                         if (mv != AV_RN32A(&near_mv[CNT_NEAR]))
 951                             continue;
 952                         idx = CNT_NEAR;
 953                     } else {
 954                         AV_WN32A(&near_mv[CNT_NEAR], mv);
 955                         idx = CNT_NEAR;
 956                     }
 957                 } else {
 958                     AV_WN32A(&near_mv[CNT_NEAREST], mv);
 959                     idx = CNT_NEAREST;
 960                 }
 961             } else {
 962                 idx = CNT_ZERO;
 963             }
 964         } else {
 965             idx = CNT_ZERO;
 966         }
 967         cnt[idx] += vp7_mv_pred[i].score;
 968     }
 969
 970     mb->partitioning = VP8_SPLITMVMODE_NONE;
 971
 972     if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_ZERO]][0])) {
 973         mb->mode = VP8_MVMODE_MV;
 974
 975         if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAREST]][1])) {
 976
 977             if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAR]][2])) {
 978
 979                 if (cnt[CNT_NEAREST] > cnt[CNT_NEAR])
 980                     AV_WN32A(&mb->mv, cnt[CNT_ZERO] > cnt[CNT_NEAREST] ? 0 : AV_RN32A(&near_mv[CNT_NEAREST]));
 981                 else
 982                     AV_WN32A(&mb->mv, cnt[CNT_ZERO] > cnt[CNT_NEAR]    ? 0 : AV_RN32A(&near_mv[CNT_NEAR]));
 983
 984                 if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAR]][3])) {
 985                     mb->mode = VP8_MVMODE_SPLIT;
 986                     mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout, IS_VP7) - 1];
 987                 } else {
 988                     mb->mv.y += vp7_read_mv_component(c, s->prob->mvc[0]);
 989                     mb->mv.x += vp7_read_mv_component(c, s->prob->mvc[1]);
 990                     mb->bmv[0] = mb->mv;
 991                 }
 992             } else {
 993                 mb->mv = near_mv[CNT_NEAR];
 994                 mb->bmv[0] = mb->mv;
 995             }
 996         } else {
 997             mb->mv = near_mv[CNT_NEAREST];
 998             mb->bmv[0] = mb->mv;
 999         }
1000     } else {
1001         mb->mode = VP8_MVMODE_ZERO;
1002         AV_ZERO32(&mb->mv);
1003         mb->bmv[0] = mb->mv;
1004     }
1005 }
1006
1007 static av_always_inline
1008 void vp8_decode_mvs(VP8Context *s, VP8Macroblock *mb,
1009                     int mb_x, int mb_y, int layout)
1010 {
1011     VP8Macroblock *mb_edge[3] = { 0      /* top */,
1012                                   mb - 1 /* left */,
1013                                   0      /* top-left */ };
1014     enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV };
1015     enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
1016     int idx = CNT_ZERO;
1017     int cur_sign_bias = s->sign_bias[mb->ref_frame];
1018     int8_t *sign_bias = s->sign_bias;
1019     VP56mv near_mv[4];
1020     uint8_t cnt[4] = { 0 };
1021     VP56RangeCoder *c = &s->c;
1022
1023     if (!layout) { // layout is inlined (s->mb_layout is not)
1024         mb_edge[0] = mb + 2;
1025         mb_edge[2] = mb + 1;
1026     } else {
1027         mb_edge[0] = mb - s->mb_width - 1;
1028         mb_edge[2] = mb - s->mb_width - 2;
1029     }
1030
1031     AV_ZERO32(&near_mv[0]);
1032     AV_ZERO32(&near_mv[1]);
1033     AV_ZERO32(&near_mv[2]);
1034
1035     /* Process MB on top, left and top-left */
1036 #define MV_EDGE_CHECK(n)                                                      \
1037     {                                                                         \
1038         VP8Macroblock *edge = mb_edge[n];                                     \
1039         int edge_ref = edge->ref_frame;                                       \
1040         if (edge_ref != VP56_FRAME_CURRENT) {                                 \
1041             uint32_t mv = AV_RN32A(&edge->mv);                                \
1042             if (mv) {                                                         \
1043                 if (cur_sign_bias != sign_bias[edge_ref]) {                   \
1044                     /* SWAR negate of the values in mv. */                    \
1045                     mv = ~mv;                                                 \
1046                     mv = ((mv & 0x7fff7fff) +                                 \
1047                           0x00010001) ^ (mv & 0x80008000);                    \
1048                 }                                                             \
1049                 if (!n || mv != AV_RN32A(&near_mv[idx]))                      \
1050                     AV_WN32A(&near_mv[++idx], mv);                            \
1051                 cnt[idx] += 1 + (n != 2);                                     \
1052             } else                                                            \
1053                 cnt[CNT_ZERO] += 1 + (n != 2);                                \
1054         }                                                                     \
1055     }
1056
1057     MV_EDGE_CHECK(0)
1058     MV_EDGE_CHECK(1)
1059     MV_EDGE_CHECK(2)
1060
1061     mb->partitioning = VP8_SPLITMVMODE_NONE;
1062     if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_ZERO]][0])) {
1063         mb->mode = VP8_MVMODE_MV;
1064
1065         /* If we have three distinct MVs, merge first and last if they're the same */
1066         if (cnt[CNT_SPLITMV] &&
1067             AV_RN32A(&near_mv[1 + VP8_EDGE_TOP]) == AV_RN32A(&near_mv[1 + VP8_EDGE_TOPLEFT]))
1068             cnt[CNT_NEAREST] += 1;
1069
1070         /* Swap near and nearest if necessary */
1071         if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
1072             FFSWAP(uint8_t,     cnt[CNT_NEAREST],     cnt[CNT_NEAR]);
1073             FFSWAP( VP56mv, near_mv[CNT_NEAREST], near_mv[CNT_NEAR]);
1074         }
1075
1076         if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) {
1077             if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAR]][2])) {
1078                 /* Choose the best mv out of 0,0 and the nearest mv */
1079                 clamp_mv(s, &mb->mv, &near_mv[CNT_ZERO + (cnt[CNT_NEAREST] >= cnt[CNT_ZERO])]);
1080                 cnt[CNT_SPLITMV] = ((mb_edge[VP8_EDGE_LEFT]->mode    == VP8_MVMODE_SPLIT) +
1081                                     (mb_edge[VP8_EDGE_TOP]->mode     == VP8_MVMODE_SPLIT)) * 2 +
1082                                     (mb_edge[VP8_EDGE_TOPLEFT]->mode == VP8_MVMODE_SPLIT);
1083
1084                 if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_SPLITMV]][3])) {
1085                     mb->mode = VP8_MVMODE_SPLIT;
1086                     mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout, IS_VP8) - 1];
1087                 } else {
1088                     mb->mv.y  += vp8_read_mv_component(c, s->prob->mvc[0]);
1089                     mb->mv.x  += vp8_read_mv_component(c, s->prob->mvc[1]);
1090                     mb->bmv[0] = mb->mv;
1091                 }
1092             } else {
1093                 clamp_mv(s, &mb->mv, &near_mv[CNT_NEAR]);
1094                 mb->bmv[0] = mb->mv;
1095             }
1096         } else {
1097             clamp_mv(s, &mb->mv, &near_mv[CNT_NEAREST]);
1098             mb->bmv[0] = mb->mv;
1099         }
1100     } else {
1101         mb->mode = VP8_MVMODE_ZERO;
1102         AV_ZERO32(&mb->mv);
1103         mb->bmv[0] = mb->mv;
1104     }
1105 }
1106
1107 static av_always_inline
1108 void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
1109                            int mb_x, int keyframe, int layout)
1110 {
1111     uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
1112
1113     if (layout) {
1114         VP8Macroblock *mb_top = mb - s->mb_width - 1;
1115         memcpy(mb->intra4x4_pred_mode_top, mb_top->intra4x4_pred_mode_top, 4);
1116     }
1117     if (keyframe) {
1118         int x, y;
1119         uint8_t *top;
1120         uint8_t *const left = s->intra4x4_pred_mode_left;
1121         if (layout)
1122             top = mb->intra4x4_pred_mode_top;
1123         else
1124             top = s->intra4x4_pred_mode_top + 4 * mb_x;
1125         for (y = 0; y < 4; y++) {
1126             for (x = 0; x < 4; x++) {
1127                 const uint8_t *ctx;
1128                 ctx       = vp8_pred4x4_prob_intra[top[x]][left[y]];
1129                 *intra4x4 = vp8_rac_get_tree(c, vp8_pred4x4_tree, ctx);
1130                 left[y]   = top[x] = *intra4x4;
1131                 intra4x4++;
1132             }
1133         }
1134     } else {
1135         int i;
1136         for (i = 0; i < 16; i++)
1137             intra4x4[i] = vp8_rac_get_tree(c, vp8_pred4x4_tree,
1138                                            vp8_pred4x4_prob_inter);
1139     }
1140 }
1141
1142 static av_always_inline
1143 void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
1144                     uint8_t *segment, uint8_t *ref, int layout, int is_vp7)
1145 {
1146     VP56RangeCoder *c = &s->c;
1147     const char *vp7_feature_name[] = { "q-index",
1148                                        "lf-delta",
1149                                        "partial-golden-update",
1150                                        "blit-pitch" };
1151     if (is_vp7) {
1152         int i;
1153         *segment = 0;
1154         for (i = 0; i < 4; i++) {
1155             if (s->feature_enabled[i]) {
1156                 if (vp56_rac_get_prob_branchy(c, s->feature_present_prob[i])) {
1157                       int index = vp8_rac_get_tree(c, vp7_feature_index_tree,
1158                                                    s->feature_index_prob[i]);
1159                       av_log(s->avctx, AV_LOG_WARNING,
1160                              "Feature %s present in macroblock (value 0x%x)\n",
1161                              vp7_feature_name[i], s->feature_value[i][index]);
1162                 }
1163            }
1164         }
1165     } else if (s->segmentation.update_map) {
1166         int bit  = vp56_rac_get_prob(c, s->prob->segmentid[0]);
1167         *segment = vp56_rac_get_prob(c, s->prob->segmentid[1+bit]) + 2*bit;
1168     } else if (s->segmentation.enabled)
1169         *segment = ref ? *ref : *segment;
1170     mb->segment = *segment;
1171
1172     mb->skip = s->mbskip_enabled ? vp56_rac_get_prob(c, s->prob->mbskip) : 0;
1173
1174     if (s->keyframe) {
1175         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra,
1176                                     vp8_pred16x16_prob_intra);
1177
1178         if (mb->mode == MODE_I4x4) {
1179             decode_intra4x4_modes(s, c, mb, mb_x, 1, layout);
1180         } else {
1181             const uint32_t modes = (is_vp7 ? vp7_pred4x4_mode
1182                                            : vp8_pred4x4_mode)[mb->mode] * 0x01010101u;
1183             if (s->mb_layout)
1184                 AV_WN32A(mb->intra4x4_pred_mode_top, modes);
1185             else
1186                 AV_WN32A(s->intra4x4_pred_mode_top + 4 * mb_x, modes);
1187             AV_WN32A(s->intra4x4_pred_mode_left, modes);
1188         }
1189
1190         mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree,
1191                                                 vp8_pred8x8c_prob_intra);
1192         mb->ref_frame        = VP56_FRAME_CURRENT;
1193     } else if (vp56_rac_get_prob_branchy(c, s->prob->intra)) {
1194         // inter MB, 16.2
1195         if (vp56_rac_get_prob_branchy(c, s->prob->last))
1196             mb->ref_frame =
1197                 (!is_vp7 && vp56_rac_get_prob(c, s->prob->golden)) ? VP56_FRAME_GOLDEN2 /* altref */
1198                                                                    : VP56_FRAME_GOLDEN;
1199         else
1200             mb->ref_frame = VP56_FRAME_PREVIOUS;
1201         s->ref_count[mb->ref_frame - 1]++;
1202
1203         // motion vectors, 16.3
1204         if (is_vp7)
1205             vp7_decode_mvs(s, mb, mb_x, mb_y, layout);
1206         else
1207             vp8_decode_mvs(s, mb, mb_x, mb_y, layout);
1208     } else {
1209         // intra MB, 16.1
1210         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, s->prob->pred16x16);
1211
1212         if (mb->mode == MODE_I4x4)
1213             decode_intra4x4_modes(s, c, mb, mb_x, 0, layout);
1214
1215         mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree,
1216                                                 s->prob->pred8x8c);
1217         mb->ref_frame        = VP56_FRAME_CURRENT;
1218         mb->partitioning     = VP8_SPLITMVMODE_NONE;
1219         AV_ZERO32(&mb->bmv[0]);
1220     }
1221 }
1222
1223 /**
1224  * @param r     arithmetic bitstream reader context
1225  * @param block destination for block coefficients
1226  * @param probs probabilities to use when reading trees from the bitstream
1227  * @param i     initial coeff index, 0 unless a separate DC block is coded
1228  * @param qmul  array holding the dc/ac dequant factor at position 0/1
1229  *
1230  * @return 0 if no coeffs were decoded
1231  *         otherwise, the index of the last coeff decoded plus one
1232  */
1233 static av_always_inline
1234 int decode_block_coeffs_internal(VP56RangeCoder *r, int16_t block[16],
1235                                  uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1236                                  int i, uint8_t *token_prob, int16_t qmul[2],
1237                                  const uint8_t scan[16], int vp7)
1238 {
1239     VP56RangeCoder c = *r;
1240     goto skip_eob;
1241     do {
1242         int coeff;
1243 restart:
1244         if (!vp56_rac_get_prob_branchy(&c, token_prob[0]))   // DCT_EOB
1245             break;
1246
1247 skip_eob:
1248         if (!vp56_rac_get_prob_branchy(&c, token_prob[1])) { // DCT_0
1249             if (++i == 16)
1250                 break; // invalid input; blocks should end with EOB
1251             token_prob = probs[i][0];
1252             if (vp7)
1253                 goto restart;
1254             goto skip_eob;
1255         }
1256
1257         if (!vp56_rac_get_prob_branchy(&c, token_prob[2])) { // DCT_1
1258             coeff = 1;
1259             token_prob = probs[i + 1][1];
1260         } else {
1261             if (!vp56_rac_get_prob_branchy(&c, token_prob[3])) { // DCT 2,3,4
1262                 coeff = vp56_rac_get_prob_branchy(&c, token_prob[4]);
1263                 if (coeff)
1264                     coeff += vp56_rac_get_prob(&c, token_prob[5]);
1265                 coeff += 2;
1266             } else {
1267                 // DCT_CAT*
1268                 if (!vp56_rac_get_prob_branchy(&c, token_prob[6])) {
1269                     if (!vp56_rac_get_prob_branchy(&c, token_prob[7])) { // DCT_CAT1
1270                         coeff = 5 + vp56_rac_get_prob(&c, vp8_dct_cat1_prob[0]);
1271                     } else {                                    // DCT_CAT2
1272                         coeff  = 7;
1273                         coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[0]) << 1;
1274                         coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[1]);
1275                     }
1276                 } else {    // DCT_CAT3 and up
1277                     int a   = vp56_rac_get_prob(&c, token_prob[8]);
1278                     int b   = vp56_rac_get_prob(&c, token_prob[9 + a]);
1279                     int cat = (a << 1) + b;
1280                     coeff  = 3 + (8 << cat);
1281                     coeff += vp8_rac_get_coeff(&c, ff_vp8_dct_cat_prob[cat]);
1282                 }
1283             }
1284             token_prob = probs[i + 1][2];
1285         }
1286         block[scan[i]] = (vp8_rac_get(&c) ? -coeff : coeff) * qmul[!!i];
1287     } while (++i < 16);
1288
1289     *r = c;
1290     return i;
1291 }
1292
/**
 * Apply inter-frame DC prediction to a block and update the predictor.
 *
 * pred[0] holds the last DC value and pred[1] counts how often that value
 * was repeated. Once the count exceeds 3, pred[0] is added to the incoming
 * DC. The count restarts whenever the old or the new DC is zero or their
 * signs differ, and grows by one when both are equal.
 *
 * @param block coefficient block; block[0] is read and replaced by the
 *              (possibly predicted) DC
 * @param pred  predictor state as described above, updated in place
 *
 * @return 1 if the prediction was added to the DC, 0 otherwise
 */
static av_always_inline
int inter_predict_dc(int16_t block[16], int16_t pred[2])
{
    const int add_pred = pred[1] > 3;
    int16_t dc = block[0];
    int restart_run;

    if (add_pred)
        dc += pred[0];

    restart_run = pred[0] == 0 || dc == 0 || (pred[0] ^ dc) < 0;

    if (!restart_run && pred[0] == dc)
        pred[1]++;
    block[0] = pred[0] = dc;
    if (restart_run)
        pred[1] = 0;

    return add_pred;
}
1315
1316 static int vp7_decode_block_coeffs_internal(VP56RangeCoder *r,
1317                                             int16_t block[16],
1318                                             uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1319                                             int i, uint8_t *token_prob,
1320                                             int16_t qmul[2],
1321                                             const uint8_t scan[16])
1322 {
1323     return decode_block_coeffs_internal(r, block, probs, i,
1324                                         token_prob, qmul, scan, IS_VP7);
1325 }
1326
1327 #ifndef vp8_decode_block_coeffs_internal
1328 static int vp8_decode_block_coeffs_internal(VP56RangeCoder *r,
1329                                             int16_t block[16],
1330                                             uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1331                                             int i, uint8_t *token_prob,
1332                                             int16_t qmul[2])
1333 {
1334     return decode_block_coeffs_internal(r, block, probs, i,
1335                                         token_prob, qmul, zigzag_scan, IS_VP8);
1336 }
1337 #endif
1338
1339 /**
1340  * @param c          arithmetic bitstream reader context
1341  * @param block      destination for block coefficients
1342  * @param probs      probabilities to use when reading trees from the bitstream
1343  * @param i          initial coeff index, 0 unless a separate DC block is coded
1344  * @param zero_nhood the initial prediction context for number of surrounding
1345  *                   all-zero blocks (only left/top, so 0-2)
1346  * @param qmul       array holding the dc/ac dequant factor at position 0/1
1347  * @param scan       scan pattern (VP7 only)
1348  *
1349  * @return 0 if no coeffs were decoded
1350  *         otherwise, the index of the last coeff decoded plus one
1351  */
1352 static av_always_inline
1353 int decode_block_coeffs(VP56RangeCoder *c, int16_t block[16],
1354                         uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1355                         int i, int zero_nhood, int16_t qmul[2],
1356                         const uint8_t scan[16], int vp7)
1357 {
1358     uint8_t *token_prob = probs[i][zero_nhood];
1359     if (!vp56_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
1360         return 0;
1361     return vp7 ? vp7_decode_block_coeffs_internal(c, block, probs, i,
1362                                                   token_prob, qmul, scan)
1363                : vp8_decode_block_coeffs_internal(c, block, probs, i,
1364                                                   token_prob, qmul);
1365 }
1366
1367 static av_always_inline
1368 void decode_mb_coeffs(VP8Context *s, VP8ThreadData *td, VP56RangeCoder *c,
1369                       VP8Macroblock *mb, uint8_t t_nnz[9], uint8_t l_nnz[9],
1370                       int is_vp7)
1371 {
1372     int i, x, y, luma_start = 0, luma_ctx = 3;
1373     int nnz_pred, nnz, nnz_total = 0;
1374     int segment = mb->segment;
1375     int block_dc = 0;
1376
1377     if (mb->mode != MODE_I4x4 && (is_vp7 || mb->mode != VP8_MVMODE_SPLIT)) {
1378         nnz_pred = t_nnz[8] + l_nnz[8];
1379
1380         // decode DC values and do hadamard
1381         nnz = decode_block_coeffs(c, td->block_dc, s->prob->token[1], 0,
1382                                   nnz_pred, s->qmat[segment].luma_dc_qmul,
1383                                   zigzag_scan, is_vp7);
1384         l_nnz[8] = t_nnz[8] = !!nnz;
1385
1386         if (is_vp7 && mb->mode > MODE_I4x4) {
1387             nnz |=  inter_predict_dc(td->block_dc,
1388                                      s->inter_dc_pred[mb->ref_frame - 1]);
1389         }
1390
1391         if (nnz) {
1392             nnz_total += nnz;
1393             block_dc   = 1;
1394             if (nnz == 1)
1395                 s->vp8dsp.vp8_luma_dc_wht_dc(td->block, td->block_dc);
1396             else
1397                 s->vp8dsp.vp8_luma_dc_wht(td->block, td->block_dc);
1398         }
1399         luma_start = 1;
1400         luma_ctx   = 0;
1401     }
1402
1403     // luma blocks
1404     for (y = 0; y < 4; y++)
1405         for (x = 0; x < 4; x++) {
1406             nnz_pred = l_nnz[y] + t_nnz[x];
1407             nnz = decode_block_coeffs(c, td->block[y][x],
1408                                       s->prob->token[luma_ctx],
1409                                       luma_start, nnz_pred,
1410                                       s->qmat[segment].luma_qmul,
1411                                       s->prob[0].scan, is_vp7);
1412             /* nnz+block_dc may be one more than the actual last index,
1413              * but we don't care */
1414             td->non_zero_count_cache[y][x] = nnz + block_dc;
1415             t_nnz[x] = l_nnz[y] = !!nnz;
1416             nnz_total += nnz;
1417         }
1418
1419     // chroma blocks
1420     // TODO: what to do about dimensions? 2nd dim for luma is x,
1421     // but for chroma it's (y<<1)|x
1422     for (i = 4; i < 6; i++)
1423         for (y = 0; y < 2; y++)
1424             for (x = 0; x < 2; x++) {
1425                 nnz_pred = l_nnz[i + 2 * y] + t_nnz[i + 2 * x];
1426                 nnz = decode_block_coeffs(c, td->block[i][(y << 1) + x],
1427                                           s->prob->token[2], 0, nnz_pred,
1428                                           s->qmat[segment].chroma_qmul,
1429                                           s->prob[0].scan, is_vp7);
1430                 td->non_zero_count_cache[i][(y << 1) + x] = nnz;
1431                 t_nnz[i + 2 * x] = l_nnz[i + 2 * y] = !!nnz;
1432                 nnz_total += nnz;
1433             }
1434
1435     // if there were no coded coeffs despite the macroblock not being marked skip,
1436     // we MUST not do the inner loop filter and should not do IDCT
1437     // Since skip isn't used for bitstream prediction, just manually set it.
1438     if (!nnz_total)
1439         mb->skip = 1;
1440 }
1441
1442 static av_always_inline
1443 void backup_mb_border(uint8_t *top_border, uint8_t *src_y,
1444                       uint8_t *src_cb, uint8_t *src_cr,
1445                       int linesize, int uvlinesize, int simple)
1446 {
1447     AV_COPY128(top_border, src_y + 15 * linesize);
1448     if (!simple) {
1449         AV_COPY64(top_border + 16, src_cb + 7 * uvlinesize);
1450         AV_COPY64(top_border + 24, src_cr + 7 * uvlinesize);
1451     }
1452 }
1453
1454 static av_always_inline
1455 void xchg_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb,
1456                     uint8_t *src_cr, int linesize, int uvlinesize, int mb_x,
1457                     int mb_y, int mb_width, int simple, int xchg)
1458 {
1459     uint8_t *top_border_m1 = top_border - 32;     // for TL prediction
1460     src_y  -= linesize;
1461     src_cb -= uvlinesize;
1462     src_cr -= uvlinesize;
1463
1464 #define XCHG(a, b, xchg)                                                      \
1465     do {                                                                      \
1466         if (xchg)                                                             \
1467             AV_SWAP64(b, a);                                                  \
1468         else                                                                  \
1469             AV_COPY64(b, a);                                                  \
1470     } while (0)
1471
1472     XCHG(top_border_m1 + 8, src_y - 8, xchg);
1473     XCHG(top_border, src_y, xchg);
1474     XCHG(top_border + 8, src_y + 8, 1);
1475     if (mb_x < mb_width - 1)
1476         XCHG(top_border + 32, src_y + 16, 1);
1477
1478     // only copy chroma for normal loop filter
1479     // or to initialize the top row to 127
1480     if (!simple || !mb_y) {
1481         XCHG(top_border_m1 + 16, src_cb - 8, xchg);
1482         XCHG(top_border_m1 + 24, src_cr - 8, xchg);
1483         XCHG(top_border + 16, src_cb, 1);
1484         XCHG(top_border + 24, src_cr, 1);
1485     }
1486 }
1487
1488 static av_always_inline
1489 int check_dc_pred8x8_mode(int mode, int mb_x, int mb_y)
1490 {
1491     if (!mb_x)
1492         return mb_y ? TOP_DC_PRED8x8 : DC_128_PRED8x8;
1493     else
1494         return mb_y ? mode : LEFT_DC_PRED8x8;
1495 }
1496
1497 static av_always_inline
1498 int check_tm_pred8x8_mode(int mode, int mb_x, int mb_y, int vp7)
1499 {
1500     if (!mb_x)
1501         return mb_y ? VERT_PRED8x8 : (vp7 ? DC_128_PRED8x8 : DC_129_PRED8x8);
1502     else
1503         return mb_y ? mode : HOR_PRED8x8;
1504 }
1505
1506 static av_always_inline
1507 int check_intra_pred8x8_mode_emuedge(int mode, int mb_x, int mb_y, int vp7)
1508 {
1509     switch (mode) {
1510     case DC_PRED8x8:
1511         return check_dc_pred8x8_mode(mode, mb_x, mb_y);
1512     case VERT_PRED8x8:
1513         return !mb_y ? (vp7 ? DC_128_PRED8x8 : DC_127_PRED8x8) : mode;
1514     case HOR_PRED8x8:
1515         return !mb_x ? (vp7 ? DC_128_PRED8x8 : DC_129_PRED8x8) : mode;
1516     case PLANE_PRED8x8: /* TM */
1517         return check_tm_pred8x8_mode(mode, mb_x, mb_y, vp7);
1518     }
1519     return mode;
1520 }
1521
1522 static av_always_inline
1523 int check_tm_pred4x4_mode(int mode, int mb_x, int mb_y, int vp7)
1524 {
1525     if (!mb_x) {
1526         return mb_y ? VERT_VP8_PRED : (vp7 ? DC_128_PRED : DC_129_PRED);
1527     } else {
1528         return mb_y ? mode : HOR_VP8_PRED;
1529     }
1530 }
1531
1532 static av_always_inline
1533 int check_intra_pred4x4_mode_emuedge(int mode, int mb_x, int mb_y,
1534                                      int *copy_buf, int vp7)
1535 {
1536     switch (mode) {
1537     case VERT_PRED:
1538         if (!mb_x && mb_y) {
1539             *copy_buf = 1;
1540             return mode;
1541         }
1542         /* fall-through */
1543     case DIAG_DOWN_LEFT_PRED:
1544     case VERT_LEFT_PRED:
1545         return !mb_y ? (vp7 ? DC_128_PRED : DC_127_PRED) : mode;
1546     case HOR_PRED:
1547         if (!mb_y) {
1548             *copy_buf = 1;
1549             return mode;
1550         }
1551         /* fall-through */
1552     case HOR_UP_PRED:
1553         return !mb_x ? (vp7 ? DC_128_PRED : DC_129_PRED) : mode;
1554     case TM_VP8_PRED:
1555         return check_tm_pred4x4_mode(mode, mb_x, mb_y, vp7);
1556     case DC_PRED: /* 4x4 DC doesn't use the same "H.264-style" exceptions
1557                    * as 16x16/8x8 DC */
1558     case DIAG_DOWN_RIGHT_PRED:
1559     case VERT_RIGHT_PRED:
1560     case HOR_DOWN_PRED:
1561         if (!mb_y || !mb_x)
1562             *copy_buf = 1;
1563         return mode;
1564     }
1565     return mode;
1566 }
1567
1568 static av_always_inline
1569 void intra_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1570                    VP8Macroblock *mb, int mb_x, int mb_y, int is_vp7)
1571 {
1572     int x, y, mode, nnz;
1573     uint32_t tr;
1574
1575     /* for the first row, we need to run xchg_mb_border to init the top edge
1576      * to 127 otherwise, skip it if we aren't going to deblock */
1577     if (mb_y && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
1578         xchg_mb_border(s->top_border[mb_x + 1], dst[0], dst[1], dst[2],
1579                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1580                        s->filter.simple, 1);
1581
1582     if (mb->mode < MODE_I4x4) {
1583         mode = check_intra_pred8x8_mode_emuedge(mb->mode, mb_x, mb_y, is_vp7);
1584         s->hpc.pred16x16[mode](dst[0], s->linesize);
1585     } else {
1586         uint8_t *ptr = dst[0];
1587         uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
1588         const uint8_t lo = is_vp7 ? 128 : 127;
1589         const uint8_t hi = is_vp7 ? 128 : 129;
1590         uint8_t tr_top[4] = { lo, lo, lo, lo };
1591
1592         // all blocks on the right edge of the macroblock use bottom edge
1593         // the top macroblock for their topright edge
1594         uint8_t *tr_right = ptr - s->linesize + 16;
1595
1596         // if we're on the right edge of the frame, said edge is extended
1597         // from the top macroblock
1598         if (mb_y && mb_x == s->mb_width - 1) {
1599             tr       = tr_right[-1] * 0x01010101u;
1600             tr_right = (uint8_t *) &tr;
1601         }
1602
1603         if (mb->skip)
1604             AV_ZERO128(td->non_zero_count_cache);
1605
1606         for (y = 0; y < 4; y++) {
1607             uint8_t *topright = ptr + 4 - s->linesize;
1608             for (x = 0; x < 4; x++) {
1609                 int copy = 0, linesize = s->linesize;
1610                 uint8_t *dst = ptr + 4 * x;
1611                 DECLARE_ALIGNED(4, uint8_t, copy_dst)[5 * 8];
1612
1613                 if ((y == 0 || x == 3) && mb_y == 0) {
1614                     topright = tr_top;
1615                 } else if (x == 3)
1616                     topright = tr_right;
1617
1618                 mode = check_intra_pred4x4_mode_emuedge(intra4x4[x], mb_x + x,
1619                                                         mb_y + y, &copy, is_vp7);
1620                 if (copy) {
1621                     dst      = copy_dst + 12;
1622                     linesize = 8;
1623                     if (!(mb_y + y)) {
1624                         copy_dst[3] = lo;
1625                         AV_WN32A(copy_dst + 4, lo * 0x01010101U);
1626                     } else {
1627                         AV_COPY32(copy_dst + 4, ptr + 4 * x - s->linesize);
1628                         if (!(mb_x + x)) {
1629                             copy_dst[3] = hi;
1630                         } else {
1631                             copy_dst[3] = ptr[4 * x - s->linesize - 1];
1632                         }
1633                     }
1634                     if (!(mb_x + x)) {
1635                         copy_dst[11] =
1636                         copy_dst[19] =
1637                         copy_dst[27] =
1638                         copy_dst[35] = hi;
1639                     } else {
1640                         copy_dst[11] = ptr[4 * x                   - 1];
1641                         copy_dst[19] = ptr[4 * x + s->linesize     - 1];
1642                         copy_dst[27] = ptr[4 * x + s->linesize * 2 - 1];
1643                         copy_dst[35] = ptr[4 * x + s->linesize * 3 - 1];
1644                     }
1645                 }
1646                 s->hpc.pred4x4[mode](dst, topright, linesize);
1647                 if (copy) {
1648                     AV_COPY32(ptr + 4 * x,                   copy_dst + 12);
1649                     AV_COPY32(ptr + 4 * x + s->linesize,     copy_dst + 20);
1650                     AV_COPY32(ptr + 4 * x + s->linesize * 2, copy_dst + 28);
1651                     AV_COPY32(ptr + 4 * x + s->linesize * 3, copy_dst + 36);
1652                 }
1653
1654                 nnz = td->non_zero_count_cache[y][x];
1655                 if (nnz) {
1656                     if (nnz == 1)
1657                         s->vp8dsp.vp8_idct_dc_add(ptr + 4 * x,
1658                                                   td->block[y][x], s->linesize);
1659                     else
1660                         s->vp8dsp.vp8_idct_add(ptr + 4 * x,
1661                                                td->block[y][x], s->linesize);
1662                 }
1663                 topright += 4;
1664             }
1665
1666             ptr      += 4 * s->linesize;
1667             intra4x4 += 4;
1668         }
1669     }
1670
1671     mode = check_intra_pred8x8_mode_emuedge(mb->chroma_pred_mode,
1672                                             mb_x, mb_y, is_vp7);
1673     s->hpc.pred8x8[mode](dst[1], s->uvlinesize);
1674     s->hpc.pred8x8[mode](dst[2], s->uvlinesize);
1675
1676     if (mb_y && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
1677         xchg_mb_border(s->top_border[mb_x + 1], dst[0], dst[1], dst[2],
1678                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1679                        s->filter.simple, 0);
1680 }
1681
/* Per sub-pixel phase (0..7) lookup used by the motion compensation code:
 *   row 0: number of extra pixels needed on the left/top side, also used
 *          as the index into the MC function pointer table
 *   row 1: total number of extra pixels required (row 0 + row 2)
 *   row 2: number of extra pixels needed on the right/bottom side */
static const uint8_t subpel_idx[3][8] = {
    [0] = { 0, 1, 2, 1, 2, 1, 2, 1 },
    [1] = { 0, 3, 5, 3, 5, 3, 5, 3 },
    [2] = { 0, 2, 3, 2, 3, 2, 3, 2 },
};
1688
1689 /**
1690  * luma MC function
1691  *
1692  * @param s        VP8 decoding context
1693  * @param dst      target buffer for block data at block position
1694  * @param ref      reference picture buffer at origin (0, 0)
1695  * @param mv       motion vector (relative to block position) to get pixel data from
1696  * @param x_off    horizontal position of block from origin (0, 0)
1697  * @param y_off    vertical position of block from origin (0, 0)
1698  * @param block_w  width of block (16, 8 or 4)
1699  * @param block_h  height of block (always same as block_w)
1700  * @param width    width of src/dst plane data
1701  * @param height   height of src/dst plane data
1702  * @param linesize size of a single line of plane data, including padding
1703  * @param mc_func  motion compensation function pointers (bilinear or sixtap MC)
1704  */
1705 static av_always_inline
1706 void vp8_mc_luma(VP8Context *s, VP8ThreadData *td, uint8_t *dst,
1707                  ThreadFrame *ref, const VP56mv *mv,
1708                  int x_off, int y_off, int block_w, int block_h,
1709                  int width, int height, ptrdiff_t linesize,
1710                  vp8_mc_func mc_func[3][3])
1711 {
1712     uint8_t *src = ref->f->data[0];
1713
1714     if (AV_RN32A(mv)) {
1715         int src_linesize = linesize;
1716
1717         int mx = (mv->x << 1) & 7, mx_idx = subpel_idx[0][mx];
1718         int my = (mv->y << 1) & 7, my_idx = subpel_idx[0][my];
1719
1720         x_off += mv->x >> 2;
1721         y_off += mv->y >> 2;
1722
1723         // edge emulation
1724         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 4, 0);
1725         src += y_off * linesize + x_off;
1726         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1727             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1728             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1729                                      src - my_idx * linesize - mx_idx,
1730                                      EDGE_EMU_LINESIZE, linesize,
1731                                      block_w + subpel_idx[1][mx],
1732                                      block_h + subpel_idx[1][my],
1733                                      x_off - mx_idx, y_off - my_idx,
1734                                      width, height);
1735             src = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1736             src_linesize = EDGE_EMU_LINESIZE;
1737         }
1738         mc_func[my_idx][mx_idx](dst, linesize, src, src_linesize, block_h, mx, my);
1739     } else {
1740         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 4, 0);
1741         mc_func[0][0](dst, linesize, src + y_off * linesize + x_off,
1742                       linesize, block_h, 0, 0);
1743     }
1744 }
1745
1746 /**
1747  * chroma MC function
1748  *
1749  * @param s        VP8 decoding context
1750  * @param dst1     target buffer for block data at block position (U plane)
1751  * @param dst2     target buffer for block data at block position (V plane)
1752  * @param ref      reference picture buffer at origin (0, 0)
1753  * @param mv       motion vector (relative to block position) to get pixel data from
1754  * @param x_off    horizontal position of block from origin (0, 0)
1755  * @param y_off    vertical position of block from origin (0, 0)
1756  * @param block_w  width of block (16, 8 or 4)
1757  * @param block_h  height of block (always same as block_w)
1758  * @param width    width of src/dst plane data
1759  * @param height   height of src/dst plane data
1760  * @param linesize size of a single line of plane data, including padding
1761  * @param mc_func  motion compensation function pointers (bilinear or sixtap MC)
1762  */
1763 static av_always_inline
1764 void vp8_mc_chroma(VP8Context *s, VP8ThreadData *td, uint8_t *dst1,
1765                    uint8_t *dst2, ThreadFrame *ref, const VP56mv *mv,
1766                    int x_off, int y_off, int block_w, int block_h,
1767                    int width, int height, ptrdiff_t linesize,
1768                    vp8_mc_func mc_func[3][3])
1769 {
1770     uint8_t *src1 = ref->f->data[1], *src2 = ref->f->data[2];
1771
1772     if (AV_RN32A(mv)) {
1773         int mx = mv->x & 7, mx_idx = subpel_idx[0][mx];
1774         int my = mv->y & 7, my_idx = subpel_idx[0][my];
1775
1776         x_off += mv->x >> 3;
1777         y_off += mv->y >> 3;
1778
1779         // edge emulation
1780         src1 += y_off * linesize + x_off;
1781         src2 += y_off * linesize + x_off;
1782         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 3, 0);
1783         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1784             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1785             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1786                                      src1 - my_idx * linesize - mx_idx,
1787                                      EDGE_EMU_LINESIZE, linesize,
1788                                      block_w + subpel_idx[1][mx],
1789                                      block_h + subpel_idx[1][my],
1790                                      x_off - mx_idx, y_off - my_idx, width, height);
1791             src1 = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1792             mc_func[my_idx][mx_idx](dst1, linesize, src1, EDGE_EMU_LINESIZE, block_h, mx, my);
1793
1794             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1795                                      src2 - my_idx * linesize - mx_idx,
1796                                      EDGE_EMU_LINESIZE, linesize,
1797                                      block_w + subpel_idx[1][mx],
1798                                      block_h + subpel_idx[1][my],
1799                                      x_off - mx_idx, y_off - my_idx, width, height);
1800             src2 = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1801             mc_func[my_idx][mx_idx](dst2, linesize, src2, EDGE_EMU_LINESIZE, block_h, mx, my);
1802         } else {
1803             mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
1804             mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
1805         }
1806     } else {
1807         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 3, 0);
1808         mc_func[0][0](dst1, linesize, src1 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1809         mc_func[0][0](dst2, linesize, src2 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1810     }
1811 }
1812
1813 static av_always_inline
1814 void vp8_mc_part(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1815                  ThreadFrame *ref_frame, int x_off, int y_off,
1816                  int bx_off, int by_off, int block_w, int block_h,
1817                  int width, int height, VP56mv *mv)
1818 {
1819     VP56mv uvmv = *mv;
1820
1821     /* Y */
1822     vp8_mc_luma(s, td, dst[0] + by_off * s->linesize + bx_off,
1823                 ref_frame, mv, x_off + bx_off, y_off + by_off,
1824                 block_w, block_h, width, height, s->linesize,
1825                 s->put_pixels_tab[block_w == 8]);
1826
1827     /* U/V */
1828     if (s->profile == 3) {
1829         /* this block only applies VP8; it is safe to check
1830          * only the profile, as VP7 profile <= 1 */
1831         uvmv.x &= ~7;
1832         uvmv.y &= ~7;
1833     }
1834     x_off   >>= 1;
1835     y_off   >>= 1;
1836     bx_off  >>= 1;
1837     by_off  >>= 1;
1838     width   >>= 1;
1839     height  >>= 1;
1840     block_w >>= 1;
1841     block_h >>= 1;
1842     vp8_mc_chroma(s, td, dst[1] + by_off * s->uvlinesize + bx_off,
1843                   dst[2] + by_off * s->uvlinesize + bx_off, ref_frame,
1844                   &uvmv, x_off + bx_off, y_off + by_off,
1845                   block_w, block_h, width, height, s->uvlinesize,
1846                   s->put_pixels_tab[1 + (block_w == 4)]);
1847 }
1848
1849 /* Fetch pixels for estimated mv 4 macroblocks ahead.
1850  * Optimized for 64-byte cache lines. Inspired by ffh264 prefetch_motion. */
1851 static av_always_inline
1852 void prefetch_motion(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
1853                      int mb_xy, int ref)
1854 {
1855     /* Don't prefetch refs that haven't been used very often this frame. */
1856     if (s->ref_count[ref - 1] > (mb_xy >> 5)) {
1857         int x_off = mb_x << 4, y_off = mb_y << 4;
1858         int mx = (mb->mv.x >> 2) + x_off + 8;
1859         int my = (mb->mv.y >> 2) + y_off;
1860         uint8_t **src = s->framep[ref]->tf.f->data;
1861         int off = mx + (my + (mb_x & 3) * 4) * s->linesize + 64;
1862         /* For threading, a ff_thread_await_progress here might be useful, but
1863          * it actually slows down the decoder. Since a bad prefetch doesn't
1864          * generate bad decoder output, we don't run it here. */
1865         s->vdsp.prefetch(src[0] + off, s->linesize, 4);
1866         off = (mx >> 1) + ((my >> 1) + (mb_x & 7)) * s->uvlinesize + 64;
1867         s->vdsp.prefetch(src[1] + off, src[2] - src[1], 2);
1868     }
1869 }
1870
1871 /**
1872  * Apply motion vectors to prediction buffer, chapter 18.
1873  */
1874 static av_always_inline
1875 void inter_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1876                    VP8Macroblock *mb, int mb_x, int mb_y)
1877 {
1878     int x_off = mb_x << 4, y_off = mb_y << 4;
1879     int width = 16 * s->mb_width, height = 16 * s->mb_height;
1880     ThreadFrame *ref = &s->framep[mb->ref_frame]->tf;
1881     VP56mv *bmv = mb->bmv;
1882
1883     switch (mb->partitioning) {
1884     case VP8_SPLITMVMODE_NONE:
1885         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1886                     0, 0, 16, 16, width, height, &mb->mv);
1887         break;
1888     case VP8_SPLITMVMODE_4x4: {
1889         int x, y;
1890         VP56mv uvmv;
1891
1892         /* Y */
1893         for (y = 0; y < 4; y++) {
1894             for (x = 0; x < 4; x++) {
1895                 vp8_mc_luma(s, td, dst[0] + 4 * y * s->linesize + x * 4,
1896                             ref, &bmv[4 * y + x],
1897                             4 * x + x_off, 4 * y + y_off, 4, 4,
1898                             width, height, s->linesize,
1899                             s->put_pixels_tab[2]);
1900             }
1901         }
1902
1903         /* U/V */
1904         x_off  >>= 1;
1905         y_off  >>= 1;
1906         width  >>= 1;
1907         height >>= 1;
1908         for (y = 0; y < 2; y++) {
1909             for (x = 0; x < 2; x++) {
1910                 uvmv.x = mb->bmv[2 * y       * 4 + 2 * x    ].x +
1911                          mb->bmv[2 * y       * 4 + 2 * x + 1].x +
1912                          mb->bmv[(2 * y + 1) * 4 + 2 * x    ].x +
1913                          mb->bmv[(2 * y + 1) * 4 + 2 * x + 1].x;
1914                 uvmv.y = mb->bmv[2 * y       * 4 + 2 * x    ].y +
1915                          mb->bmv[2 * y       * 4 + 2 * x + 1].y +
1916                          mb->bmv[(2 * y + 1) * 4 + 2 * x    ].y +
1917                          mb->bmv[(2 * y + 1) * 4 + 2 * x + 1].y;
1918                 uvmv.x = (uvmv.x + 2 + (uvmv.x >> (INT_BIT - 1))) >> 2;
1919                 uvmv.y = (uvmv.y + 2 + (uvmv.y >> (INT_BIT - 1))) >> 2;
1920                 if (s->profile == 3) {
1921                     uvmv.x &= ~7;
1922                     uvmv.y &= ~7;
1923                 }
1924                 vp8_mc_chroma(s, td, dst[1] + 4 * y * s->uvlinesize + x * 4,
1925                               dst[2] + 4 * y * s->uvlinesize + x * 4, ref,
1926                               &uvmv, 4 * x + x_off, 4 * y + y_off, 4, 4,
1927                               width, height, s->uvlinesize,
1928                               s->put_pixels_tab[2]);
1929             }
1930         }
1931         break;
1932     }
1933     case VP8_SPLITMVMODE_16x8:
1934         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1935                     0, 0, 16, 8, width, height, &bmv[0]);
1936         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1937                     0, 8, 16, 8, width, height, &bmv[1]);
1938         break;
1939     case VP8_SPLITMVMODE_8x16:
1940         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1941                     0, 0, 8, 16, width, height, &bmv[0]);
1942         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1943                     8, 0, 8, 16, width, height, &bmv[1]);
1944         break;
1945     case VP8_SPLITMVMODE_8x8:
1946         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1947                     0, 0, 8, 8, width, height, &bmv[0]);
1948         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1949                     8, 0, 8, 8, width, height, &bmv[1]);
1950         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1951                     0, 8, 8, 8, width, height, &bmv[2]);
1952         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1953                     8, 8, 8, 8, width, height, &bmv[3]);
1954         break;
1955     }
1956 }
1957
1958 static av_always_inline
1959 void idct_mb(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3], VP8Macroblock *mb)
1960 {
1961     int x, y, ch;
1962
1963     if (mb->mode != MODE_I4x4) {
1964         uint8_t *y_dst = dst[0];
1965         for (y = 0; y < 4; y++) {
1966             uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[y]);
1967             if (nnz4) {
1968                 if (nnz4 & ~0x01010101) {
1969                     for (x = 0; x < 4; x++) {
1970                         if ((uint8_t) nnz4 == 1)
1971                             s->vp8dsp.vp8_idct_dc_add(y_dst + 4 * x,
1972                                                       td->block[y][x],
1973                                                       s->linesize);
1974                         else if ((uint8_t) nnz4 > 1)
1975                             s->vp8dsp.vp8_idct_add(y_dst + 4 * x,
1976                                                    td->block[y][x],
1977                                                    s->linesize);
1978                         nnz4 >>= 8;
1979                         if (!nnz4)
1980                             break;
1981                     }
1982                 } else {
1983                     s->vp8dsp.vp8_idct_dc_add4y(y_dst, td->block[y], s->linesize);
1984                 }
1985             }
1986             y_dst += 4 * s->linesize;
1987         }
1988     }
1989
1990     for (ch = 0; ch < 2; ch++) {
1991         uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[4 + ch]);
1992         if (nnz4) {
1993             uint8_t *ch_dst = dst[1 + ch];
1994             if (nnz4 & ~0x01010101) {
1995                 for (y = 0; y < 2; y++) {
1996                     for (x = 0; x < 2; x++) {
1997                         if ((uint8_t) nnz4 == 1)
1998                             s->vp8dsp.vp8_idct_dc_add(ch_dst + 4 * x,
1999                                                       td->block[4 + ch][(y << 1) + x],
2000                                                       s->uvlinesize);
2001                         else if ((uint8_t) nnz4 > 1)
2002                             s->vp8dsp.vp8_idct_add(ch_dst + 4 * x,
2003                                                    td->block[4 + ch][(y << 1) + x],
2004                                                    s->uvlinesize);
2005                         nnz4 >>= 8;
2006                         if (!nnz4)
2007                             goto chroma_idct_end;
2008                     }
2009                     ch_dst += 4 * s->uvlinesize;
2010                 }
2011             } else {
2012                 s->vp8dsp.vp8_idct_dc_add4uv(ch_dst, td->block[4 + ch], s->uvlinesize);
2013             }
2014         }
2015 chroma_idct_end:
2016         ;
2017     }
2018 }
2019
2020 static av_always_inline
2021 void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb,
2022                          VP8FilterStrength *f, int is_vp7)
2023 {
2024     int interior_limit, filter_level;
2025
2026     if (s->segmentation.enabled) {
2027         filter_level = s->segmentation.filter_level[mb->segment];
2028         if (!s->segmentation.absolute_vals)
2029             filter_level += s->filter.level;
2030     } else
2031         filter_level = s->filter.level;
2032
2033     if (s->lf_delta.enabled) {
2034         filter_level += s->lf_delta.ref[mb->ref_frame];
2035         filter_level += s->lf_delta.mode[mb->mode];
2036     }
2037
2038     filter_level = av_clip_uintp2(filter_level, 6);
2039
2040     interior_limit = filter_level;
2041     if (s->filter.sharpness) {
2042         interior_limit >>= (s->filter.sharpness + 3) >> 2;
2043         interior_limit = FFMIN(interior_limit, 9 - s->filter.sharpness);
2044     }
2045     interior_limit = FFMAX(interior_limit, 1);
2046
2047     f->filter_level = filter_level;
2048     f->inner_limit = interior_limit;
2049     f->inner_filter = is_vp7 || !mb->skip || mb->mode == MODE_I4x4 ||
2050                       mb->mode == VP8_MVMODE_SPLIT;
2051 }
2052
2053 static av_always_inline
2054 void filter_mb(VP8Context *s, uint8_t *dst[3], VP8FilterStrength *f,
2055                int mb_x, int mb_y, int is_vp7)
2056 {
2057     int mbedge_lim, bedge_lim_y, bedge_lim_uv, hev_thresh;
2058     int filter_level = f->filter_level;
2059     int inner_limit = f->inner_limit;
2060     int inner_filter = f->inner_filter;
2061     int linesize = s->linesize;
2062     int uvlinesize = s->uvlinesize;
2063     static const uint8_t hev_thresh_lut[2][64] = {
2064         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
2065           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2066           3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
2067           3, 3, 3, 3 },
2068         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
2069           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2070           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2071           2, 2, 2, 2 }
2072     };
2073
2074     if (!filter_level)
2075         return;
2076
2077     if (is_vp7) {
2078         bedge_lim_y  = filter_level;
2079         bedge_lim_uv = filter_level * 2;
2080         mbedge_lim   = filter_level + 2;
2081     } else {
2082         bedge_lim_y  =
2083         bedge_lim_uv = filter_level * 2 + inner_limit;
2084         mbedge_lim   = bedge_lim_y + 4;
2085     }
2086
2087     hev_thresh = hev_thresh_lut[s->keyframe][filter_level];
2088
2089     if (mb_x) {
2090         s->vp8dsp.vp8_h_loop_filter16y(dst[0], linesize,
2091                                        mbedge_lim, inner_limit, hev_thresh);
2092         s->vp8dsp.vp8_h_loop_filter8uv(dst[1], dst[2], uvlinesize,
2093                                        mbedge_lim, inner_limit, hev_thresh);
2094     }
2095
2096 #define H_LOOP_FILTER_16Y_INNER(cond)                                         \
2097     if (cond && inner_filter) {                                               \
2098         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] +  4, linesize,           \
2099                                              bedge_lim_y, inner_limit,        \
2100                                              hev_thresh);                     \
2101         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] +  8, linesize,           \
2102                                              bedge_lim_y, inner_limit,        \
2103                                              hev_thresh);                     \
2104         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] + 12, linesize,           \
2105                                              bedge_lim_y, inner_limit,        \
2106                                              hev_thresh);                     \
2107         s->vp8dsp.vp8_h_loop_filter8uv_inner(dst[1] +  4, dst[2] + 4,         \
2108                                              uvlinesize,  bedge_lim_uv,       \
2109                                              inner_limit, hev_thresh);        \
2110     }
2111
2112     H_LOOP_FILTER_16Y_INNER(!is_vp7)
2113
2114     if (mb_y) {
2115         s->vp8dsp.vp8_v_loop_filter16y(dst[0], linesize,
2116                                        mbedge_lim, inner_limit, hev_thresh);
2117         s->vp8dsp.vp8_v_loop_filter8uv(dst[1], dst[2], uvlinesize,
2118                                        mbedge_lim, inner_limit, hev_thresh);
2119     }
2120
2121     if (inner_filter) {
2122         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] +  4 * linesize,
2123                                              linesize, bedge_lim_y,
2124                                              inner_limit, hev_thresh);
2125         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] +  8 * linesize,
2126                                              linesize, bedge_lim_y,
2127                                              inner_limit, hev_thresh);
2128         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] + 12 * linesize,
2129                                              linesize, bedge_lim_y,
2130                                              inner_limit, hev_thresh);
2131         s->vp8dsp.vp8_v_loop_filter8uv_inner(dst[1] +  4 * uvlinesize,
2132                                              dst[2] +  4 * uvlinesize,
2133                                              uvlinesize, bedge_lim_uv,
2134                                              inner_limit, hev_thresh);
2135     }
2136
2137     H_LOOP_FILTER_16Y_INNER(is_vp7)
2138 }
2139
2140 static av_always_inline
2141 void filter_mb_simple(VP8Context *s, uint8_t *dst, VP8FilterStrength *f,
2142                       int mb_x, int mb_y)
2143 {
2144     int mbedge_lim, bedge_lim;
2145     int filter_level = f->filter_level;
2146     int inner_limit  = f->inner_limit;
2147     int inner_filter = f->inner_filter;
2148     int linesize     = s->linesize;
2149
2150     if (!filter_level)
2151         return;
2152
2153     bedge_lim  = 2 * filter_level + inner_limit;
2154     mbedge_lim = bedge_lim + 4;
2155
2156     if (mb_x)
2157         s->vp8dsp.vp8_h_loop_filter_simple(dst, linesize, mbedge_lim);
2158     if (inner_filter) {
2159         s->vp8dsp.vp8_h_loop_filter_simple(dst +  4, linesize, bedge_lim);
2160         s->vp8dsp.vp8_h_loop_filter_simple(dst +  8, linesize, bedge_lim);
2161         s->vp8dsp.vp8_h_loop_filter_simple(dst + 12, linesize, bedge_lim);
2162     }
2163
2164     if (mb_y)
2165         s->vp8dsp.vp8_v_loop_filter_simple(dst, linesize, mbedge_lim);
2166     if (inner_filter) {
2167         s->vp8dsp.vp8_v_loop_filter_simple(dst +  4 * linesize, linesize, bedge_lim);
2168         s->vp8dsp.vp8_v_loop_filter_simple(dst +  8 * linesize, linesize, bedge_lim);
2169         s->vp8dsp.vp8_v_loop_filter_simple(dst + 12 * linesize, linesize, bedge_lim);
2170     }
2171 }
2172
2173 #define MARGIN (16 << 2)
2174 static av_always_inline
2175 void vp78_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *curframe,
2176                                     VP8Frame *prev_frame, int is_vp7)
2177 {
2178     VP8Context *s = avctx->priv_data;
2179     int mb_x, mb_y;
2180
2181     s->mv_min.y = -MARGIN;
2182     s->mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;
2183     for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
2184         VP8Macroblock *mb = s->macroblocks_base +
2185                             ((s->mb_width + 1) * (mb_y + 1) + 1);
2186         int mb_xy = mb_y * s->mb_width;
2187
2188         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED * 0x01010101);
2189
2190         s->mv_min.x = -MARGIN;
2191         s->mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;
2192         for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
2193             if (mb_y == 0)
2194                 AV_WN32A((mb - s->mb_width - 1)->intra4x4_pred_mode_top,
2195                          DC_PRED * 0x01010101);
2196             decode_mb_mode(s, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
2197                            prev_frame && prev_frame->seg_map ?
2198                            prev_frame->seg_map->data + mb_xy : NULL, 1, is_vp7);
2199             s->mv_min.x -= 64;
2200             s->mv_max.x -= 64;
2201         }
2202         s->mv_min.y -= 64;
2203         s->mv_max.y -= 64;
2204     }
2205 }
2206
2207 static void vp7_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame,
2208                                    VP8Frame *prev_frame)
2209 {
2210     vp78_decode_mv_mb_modes(avctx, cur_frame, prev_frame, IS_VP7);
2211 }
2212
2213 static void vp8_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame,
2214                                    VP8Frame *prev_frame)
2215 {
2216     vp78_decode_mv_mb_modes(avctx, cur_frame, prev_frame, IS_VP8);
2217 }
2218
#if HAVE_THREADS
/*
 * Row-thread synchronisation helpers. A position is packed as
 * (mb_y << 16) | (mb_x & 0xFFFF), so comparing two packed positions orders
 * them by row first and column second.
 *
 * Every macro argument is parenthesised in the expansion so that an
 * argument containing low-precedence operators still expands as intended.
 *
 * NOTE(review): both macros keep their historical trailing ';' after
 * "while (0)" because not every call site is visible here; as a result they
 * must not be used as the unbraced body of an if that has an else.
 */

/* Block the calling thread (td) until the other thread (otd) has reached
 * at least the given macroblock position. While waiting, td->wait_mb_pos
 * holds the awaited position; it is set back to INT_MAX afterwards. */
#define check_thread_pos(td, otd, mb_x_check, mb_y_check)                     \
    do {                                                                      \
        int tmp = ((mb_y_check) << 16) | ((mb_x_check) & 0xFFFF);             \
        if ((otd)->thread_mb_pos < tmp) {                                     \
            pthread_mutex_lock(&(otd)->lock);                                 \
            (td)->wait_mb_pos = tmp;                                          \
            do {                                                              \
                if ((otd)->thread_mb_pos >= tmp)                              \
                    break;                                                    \
                pthread_cond_wait(&(otd)->cond, &(otd)->lock);                \
            } while (1);                                                      \
            (td)->wait_mb_pos = INT_MAX;                                      \
            pthread_mutex_unlock(&(otd)->lock);                               \
        }                                                                     \
    } while (0);

/* Publish the position td has reached and, under slice threading with more
 * than one job, wake waiters when a neighbouring thread waits for a
 * position that has now been reached (or when a neighbour pointer is NULL).
 * Uses avctx, num_jobs, prev_td and next_td from the calling scope. */
#define update_pos(td, mb_y, mb_x)                                            \
    do {                                                                      \
        int pos              = ((mb_y) << 16) | ((mb_x) & 0xFFFF);            \
        int sliced_threading = (avctx->active_thread_type == FF_THREAD_SLICE) && \
                               (num_jobs > 1);                                \
        int is_null          = !next_td || !prev_td;                          \
        int pos_check        = (is_null) ? 1                                  \
                                         : (next_td != (td) &&                \
                                            pos >= next_td->wait_mb_pos) ||   \
                                           (prev_td != (td) &&                \
                                            pos >= prev_td->wait_mb_pos);     \
        (td)->thread_mb_pos = pos;                                            \
        if (sliced_threading && pos_check) {                                  \
            pthread_mutex_lock(&(td)->lock);                                  \
            pthread_cond_broadcast(&(td)->cond);                              \
            pthread_mutex_unlock(&(td)->lock);                                \
        }                                                                     \
    } while (0);
#else
/* without thread support both helpers expand to nothing */
#define check_thread_pos(td, otd, mb_x_check, mb_y_check)
#define update_pos(td, mb_y, mb_x)
#endif
2258
2259 static av_always_inline void decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
2260                                         int jobnr, int threadnr, int is_vp7)
2261 {
2262     VP8Context *s = avctx->priv_data;
2263     VP8ThreadData *prev_td, *next_td, *td = &s->thread_data[threadnr];
2264     int mb_y = td->thread_mb_pos >> 16;
2265     int mb_x, mb_xy = mb_y * s->mb_width;
2266     int num_jobs = s->num_jobs;
2267     VP8Frame *curframe = s->curframe, *prev_frame = s->prev_frame;
2268     VP56RangeCoder *c  = &s->coeff_partition[mb_y & (s->num_coeff_partitions - 1)];
2269     VP8Macroblock *mb;
2270     uint8_t *dst[3] = {
2271         curframe->tf.f->data[0] + 16 * mb_y * s->linesize,
2272         curframe->tf.f->data[1] +  8 * mb_y * s->uvlinesize,
2273         curframe->tf.f->data[2] +  8 * mb_y * s->uvlinesize
2274     };
2275     if (mb_y == 0)
2276         prev_td = td;
2277     else
2278         prev_td = &s->thread_data[(jobnr + num_jobs - 1) % num_jobs];
2279     if (mb_y == s->mb_height - 1)
2280         next_td = td;
2281     else
2282         next_td = &s->thread_data[(jobnr + 1) % num_jobs];
2283     if (s->mb_layout == 1)
2284         mb = s->macroblocks_base + ((s->mb_width + 1) * (mb_y + 1) + 1);
2285     else {
2286         // Make sure the previous frame has read its segmentation map,
2287         // if we re-use the same map.
2288         if (prev_frame && s->segmentation.enabled &&
2289             !s->segmentation.update_map)
2290             ff_thread_await_progress(&prev_frame->tf, mb_y, 0);
2291         mb = s->macroblocks + (s->mb_height - mb_y - 1) * 2;
2292         memset(mb - 1, 0, sizeof(*mb)); // zero left macroblock
2293         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED * 0x01010101);
2294     }
2295
2296     if (!is_vp7 || mb_y == 0)
2297         memset(td->left_nnz, 0, sizeof(td->left_nnz));
2298
2299     s->mv_min.x = -MARGIN;
2300     s->mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;
2301
2302     for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
2303         // Wait for previous thread to read mb_x+2, and reach mb_y-1.
2304         if (prev_td != td) {
2305             if (threadnr != 0) {
2306                 check_thread_pos(td, prev_td,
2307                                  mb_x + (is_vp7 ? 2 : 1),
2308                                  mb_y - (is_vp7 ? 2 : 1));
2309             } else {
2310                 check_thread_pos(td, prev_td,
2311                                  mb_x + (is_vp7 ? 2 : 1) + s->mb_width + 3,
2312                                  mb_y - (is_vp7 ? 2 : 1));
2313             }
2314         }
2315
2316         s->vdsp.prefetch(dst[0] + (mb_x & 3) * 4 * s->linesize + 64,
2317                          s->linesize, 4);
2318         s->vdsp.prefetch(dst[1] + (mb_x & 7) * s->uvlinesize + 64,
2319                          dst[2] - dst[1], 2);
2320
2321         if (!s->mb_layout)
2322             decode_mb_mode(s, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
2323                            prev_frame && prev_frame->seg_map ?
2324                            prev_frame->seg_map->data + mb_xy : NULL, 0, is_vp7);
2325
2326         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);
2327
2328         if (!mb->skip)
2329             decode_mb_coeffs(s, td, c, mb, s->top_nnz[mb_x], td->left_nnz, is_vp7);
2330
2331         if (mb->mode <= MODE_I4x4)
2332             intra_predict(s, td, dst, mb, mb_x, mb_y, is_vp7);
2333         else
2334             inter_predict(s, td, dst, mb, mb_x, mb_y);
2335
2336         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);
2337
2338         if (!mb->skip) {
2339             idct_mb(s, td, dst, mb);
2340         } else {
2341             AV_ZERO64(td->left_nnz);
2342             AV_WN64(s->top_nnz[mb_x], 0);   // array of 9, so unaligned
2343
2344             /* Reset DC block predictors if they would exist
2345              * if the mb had coefficients */
2346             if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
2347                 td->left_nnz[8]     = 0;
2348                 s->top_nnz[mb_x][8] = 0;
2349             }
2350         }
2351
2352         if (s->deblock_filter)
2353             filter_level_for_mb(s, mb, &td->filter_strength[mb_x], is_vp7);
2354
2355         if (s->deblock_filter && num_jobs != 1 && threadnr == num_jobs - 1) {
2356             if (s->filter.simple)
2357                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2358                                  NULL, NULL, s->linesize, 0, 1);
2359             else
2360                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2361                                  dst[1], dst[2], s->linesize, s->uvlinesize, 0);
2362         }
2363
2364         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN2);
2365
2366         dst[0]      += 16;
2367         dst[1]      += 8;
2368         dst[2]      += 8;
2369         s->mv_min.x -= 64;
2370         s->mv_max.x -= 64;
2371
2372         if (mb_x == s->mb_width + 1) {
2373             update_pos(td, mb_y, s->mb_width + 3);
2374         } else {
2375             update_pos(td, mb_y, mb_x);
2376         }
2377     }
2378 }
2379
2380 static void vp7_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
2381                                         int jobnr, int threadnr)
2382 {
2383     decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr, 1);
2384 }
2385
2386 static void vp8_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
2387                                         int jobnr, int threadnr)
2388 {
2389     decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr, 0);
2390 }
2391
2392 static av_always_inline void filter_mb_row(AVCodecContext *avctx, void *tdata,
2393                               int jobnr, int threadnr, int is_vp7)
2394 {
2395     VP8Context *s = avctx->priv_data;
2396     VP8ThreadData *td = &s->thread_data[threadnr];
2397     int mb_x, mb_y = td->thread_mb_pos >> 16, num_jobs = s->num_jobs;
2398     AVFrame *curframe = s->curframe->tf.f;
2399     VP8Macroblock *mb;
2400     VP8ThreadData *prev_td, *next_td;
2401     uint8_t *dst[3] = {
2402         curframe->data[0] + 16 * mb_y * s->linesize,
2403         curframe->data[1] +  8 * mb_y * s->uvlinesize,
2404         curframe->data[2] +  8 * mb_y * s->uvlinesize
2405     };
2406
2407     if (s->mb_layout == 1)
2408         mb = s->macroblocks_base + ((s->mb_width + 1) * (mb_y + 1) + 1);
2409     else
2410         mb = s->macroblocks + (s->mb_height - mb_y - 1) * 2;
2411
2412     if (mb_y == 0)
2413         prev_td = td;
2414     else
2415         prev_td = &s->thread_data[(jobnr + num_jobs - 1) % num_jobs];
2416     if (mb_y == s->mb_height - 1)
2417         next_td = td;
2418     else
2419         next_td = &s->thread_data[(jobnr + 1) % num_jobs];
2420
2421     for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb++) {
2422         VP8FilterStrength *f = &td->filter_strength[mb_x];
2423         if (prev_td != td)
2424             check_thread_pos(td, prev_td,
2425                              (mb_x + 1) + (s->mb_width + 3), mb_y - 1);
2426         if (next_td != td)
2427             if (next_td != &s->thread_data[0])
2428                 check_thread_pos(td, next_td, mb_x + 1, mb_y + 1);
2429
2430         if (num_jobs == 1) {
2431             if (s->filter.simple)
2432                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2433                                  NULL, NULL, s->linesize, 0, 1);
2434             else
2435                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2436                                  dst[1], dst[2], s->linesize, s->uvlinesize, 0);
2437         }
2438
2439         if (s->filter.simple)
2440             filter_mb_simple(s, dst[0], f, mb_x, mb_y);
2441         else
2442             filter_mb(s, dst, f, mb_x, mb_y, is_vp7);
2443         dst[0] += 16;
2444         dst[1] += 8;
2445         dst[2] += 8;
2446
2447         update_pos(td, mb_y, (s->mb_width + 3) + mb_x);
2448     }
2449 }
2450
2451 static void vp7_filter_mb_row(AVCodecContext *avctx, void *tdata,
2452                               int jobnr, int threadnr)
2453 {
2454     filter_mb_row(avctx, tdata, jobnr, threadnr, 1);
2455 }
2456
2457 static void vp8_filter_mb_row(AVCodecContext *avctx, void *tdata,
2458                               int jobnr, int threadnr)
2459 {
2460     filter_mb_row(avctx, tdata, jobnr, threadnr, 0);
2461 }
2462
2463 static av_always_inline
2464 int vp78_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata, int jobnr,
2465                               int threadnr, int is_vp7)
2466 {
2467     VP8Context *s = avctx->priv_data;
2468     VP8ThreadData *td = &s->thread_data[jobnr];
2469     VP8ThreadData *next_td = NULL, *prev_td = NULL;
2470     VP8Frame *curframe = s->curframe;
2471     int mb_y, num_jobs = s->num_jobs;
2472
2473     td->thread_nr = threadnr;
2474     for (mb_y = jobnr; mb_y < s->mb_height; mb_y += num_jobs) {
2475         if (mb_y >= s->mb_height)
2476             break;
2477         td->thread_mb_pos = mb_y << 16;
2478         s->decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr);
2479         if (s->deblock_filter)
2480             s->filter_mb_row(avctx, tdata, jobnr, threadnr);
2481         update_pos(td, mb_y, INT_MAX & 0xFFFF);
2482
2483         s->mv_min.y -= 64;
2484         s->mv_max.y -= 64;
2485
2486         if (avctx->active_thread_type == FF_THREAD_FRAME)
2487             ff_thread_report_progress(&curframe->tf, mb_y, 0);
2488     }
2489
2490     return 0;
2491 }
2492
2493 static int vp7_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
2494                                     int jobnr, int threadnr)
2495 {
2496     return vp78_decode_mb_row_sliced(avctx, tdata, jobnr, threadnr, IS_VP7);
2497 }
2498
2499 static int vp8_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
2500                                     int jobnr, int threadnr)
2501 {
2502     return vp78_decode_mb_row_sliced(avctx, tdata, jobnr, threadnr, IS_VP8);
2503 }
2504
2505
2506 static av_always_inline
2507 int vp78_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
2508                       AVPacket *avpkt, int is_vp7)
2509 {
2510     VP8Context *s = avctx->priv_data;
2511     int ret, i, referenced, num_jobs;
2512     enum AVDiscard skip_thresh;
2513     VP8Frame *av_uninit(curframe), *prev_frame;
2514
2515     if (is_vp7)
2516         ret = vp7_decode_frame_header(s, avpkt->data, avpkt->size);
2517     else
2518         ret = vp8_decode_frame_header(s, avpkt->data, avpkt->size);
2519
2520     if (ret < 0)
2521         goto err;
2522
2523     prev_frame = s->framep[VP56_FRAME_CURRENT];
2524
2525     referenced = s->update_last || s->update_golden == VP56_FRAME_CURRENT ||
2526                  s->update_altref == VP56_FRAME_CURRENT;
2527
2528     skip_thresh = !referenced ? AVDISCARD_NONREF
2529                               : !s->keyframe ? AVDISCARD_NONKEY
2530                                              : AVDISCARD_ALL;
2531
2532     if (avctx->skip_frame >= skip_thresh) {
2533         s->invisible = 1;
2534         memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
2535         goto skip_decode;
2536     }
2537     s->deblock_filter = s->filter.level && avctx->skip_loop_filter < skip_thresh;
2538
2539     // release no longer referenced frames
2540     for (i = 0; i < 5; i++)
2541         if (s->frames[i].tf.f->data[0] &&
2542             &s->frames[i] != prev_frame &&
2543             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
2544             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN]   &&
2545             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2])
2546             vp8_release_frame(s, &s->frames[i]);
2547
2548     curframe = s->framep[VP56_FRAME_CURRENT] = vp8_find_free_buffer(s);
2549
2550     /* Given that arithmetic probabilities are updated every frame, it's quite
2551      * likely that the values we have on a random interframe are complete
2552      * junk if we didn't start decode on a keyframe. So just don't display
2553      * anything rather than junk. */
2554     if (!s->keyframe && (!s->framep[VP56_FRAME_PREVIOUS] ||
2555                          !s->framep[VP56_FRAME_GOLDEN]   ||
2556                          !s->framep[VP56_FRAME_GOLDEN2])) {
2557         av_log(avctx, AV_LOG_WARNING,
2558                "Discarding interframe without a prior keyframe!\n");
2559         ret = AVERROR_INVALIDDATA;
2560         goto err;
2561     }
2562
2563     curframe->tf.f->key_frame = s->keyframe;
2564     curframe->tf.f->pict_type = s->keyframe ? AV_PICTURE_TYPE_I
2565                                             : AV_PICTURE_TYPE_P;
2566     if ((ret = vp8_alloc_frame(s, curframe, referenced)) < 0)
2567         goto err;
2568
2569     // check if golden and altref are swapped
2570     if (s->update_altref != VP56_FRAME_NONE)
2571         s->next_framep[VP56_FRAME_GOLDEN2] = s->framep[s->update_altref];
2572     else
2573         s->next_framep[VP56_FRAME_GOLDEN2] = s->framep[VP56_FRAME_GOLDEN2];
2574
2575     if (s->update_golden != VP56_FRAME_NONE)
2576         s->next_framep[VP56_FRAME_GOLDEN] = s->framep[s->update_golden];
2577     else
2578         s->next_framep[VP56_FRAME_GOLDEN] = s->framep[VP56_FRAME_GOLDEN];
2579
2580     if (s->update_last)
2581         s->next_framep[VP56_FRAME_PREVIOUS] = curframe;
2582     else
2583         s->next_framep[VP56_FRAME_PREVIOUS] = s->framep[VP56_FRAME_PREVIOUS];
2584
2585     s->next_framep[VP56_FRAME_CURRENT] = curframe;
2586
2587     if (avctx->codec->update_thread_context)
2588         ff_thread_finish_setup(avctx);
2589
2590     s->linesize   = curframe->tf.f->linesize[0];
2591     s->uvlinesize = curframe->tf.f->linesize[1];
2592
2593     memset(s->top_nnz, 0, s->mb_width * sizeof(*s->top_nnz));
2594     /* Zero macroblock structures for top/top-left prediction
2595      * from outside the frame. */
2596     if (!s->mb_layout)
2597         memset(s->macroblocks + s->mb_height * 2 - 1, 0,
2598                (s->mb_width + 1) * sizeof(*s->macroblocks));
2599     if (!s->mb_layout && s->keyframe)
2600         memset(s->intra4x4_pred_mode_top, DC_PRED, s->mb_width * 4);
2601
2602     memset(s->ref_count, 0, sizeof(s->ref_count));
2603
2604     if (s->mb_layout == 1) {
2605         // Make sure the previous frame has read its segmentation map,
2606         // if we re-use the same map.
2607         if (prev_frame && s->segmentation.enabled &&
2608             !s->segmentation.update_map)
2609             ff_thread_await_progress(&prev_frame->tf, 1, 0);
2610         if (is_vp7)
2611             vp7_decode_mv_mb_modes(avctx, curframe, prev_frame);
2612         else
2613             vp8_decode_mv_mb_modes(avctx, curframe, prev_frame);
2614     }
2615
2616     if (avctx->active_thread_type == FF_THREAD_FRAME)
2617         num_jobs = 1;
2618     else
2619         num_jobs = FFMIN(s->num_coeff_partitions, avctx->thread_count);
2620     s->num_jobs   = num_jobs;
2621     s->curframe   = curframe;
2622     s->prev_frame = prev_frame;
2623     s->mv_min.y   = -MARGIN;
2624     s->mv_max.y   = ((s->mb_height - 1) << 6) + MARGIN;
2625     for (i = 0; i < MAX_THREADS; i++) {
2626         s->thread_data[i].thread_mb_pos = 0;
2627         s->thread_data[i].wait_mb_pos   = INT_MAX;
2628     }
2629     if (is_vp7)
2630         avctx->execute2(avctx, vp7_decode_mb_row_sliced, s->thread_data, NULL,
2631                         num_jobs);
2632     else
2633         avctx->execute2(avctx, vp8_decode_mb_row_sliced, s->thread_data, NULL,
2634                         num_jobs);
2635
2636     ff_thread_report_progress(&curframe->tf, INT_MAX, 0);
2637     memcpy(&s->framep[0], &s->next_framep[0], sizeof(s->framep[0]) * 4);
2638
2639 skip_decode:
2640     // if future frames don't use the updated probabilities,
2641     // reset them to the values we saved
2642     if (!s->update_probabilities)
2643         s->prob[0] = s->prob[1];
2644
2645     if (!s->invisible) {
2646         if ((ret = av_frame_ref(data, curframe->tf.f)) < 0)
2647             return ret;
2648         *got_frame = 1;
2649     }
2650
2651     return avpkt->size;
2652 err:
2653     memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
2654     return ret;
2655 }
2656
2657 int ff_vp8_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
2658                         AVPacket *avpkt)
2659 {
2660     return vp78_decode_frame(avctx, data, got_frame, avpkt, IS_VP8);
2661 }
2662
2663 #if CONFIG_VP7_DECODER
2664 static int vp7_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
2665                             AVPacket *avpkt)
2666 {
2667     return vp78_decode_frame(avctx, data, got_frame, avpkt, IS_VP7);
2668 }
2669 #endif /* CONFIG_VP7_DECODER */
2670
2671 av_cold int ff_vp8_decode_free(AVCodecContext *avctx)
2672 {
2673     VP8Context *s = avctx->priv_data;
2674     int i;
2675
2676     vp8_decode_flush_impl(avctx, 1);
2677     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
2678         av_frame_free(&s->frames[i].tf.f);
2679
2680     return 0;
2681 }
2682
2683 static av_cold int vp8_init_frames(VP8Context *s)
2684 {
2685     int i;
2686     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++) {
2687         s->frames[i].tf.f = av_frame_alloc();
2688         if (!s->frames[i].tf.f)
2689             return AVERROR(ENOMEM);
2690     }
2691     return 0;
2692 }
2693
2694 static av_always_inline
2695 int vp78_decode_init(AVCodecContext *avctx, int is_vp7)
2696 {
2697     VP8Context *s = avctx->priv_data;
2698     int ret;
2699
2700     s->avctx = avctx;
2701     s->vp7   = avctx->codec->id == AV_CODEC_ID_VP7;
2702     avctx->pix_fmt = AV_PIX_FMT_YUV420P;
2703     avctx->internal->allocate_progress = 1;
2704
2705     // TODO(dalecurtis): w32pthreads.h includes static variables which result
2706     // in multiple copies for each includer.  Hack around our version not being
2707     // initialized by calling initialize again.
2708 #if HAVE_W32THREADS
2709     w32thread_init();
2710 #endif
2711
2712     ff_videodsp_init(&s->vdsp, 8);
2713
2714     ff_vp78dsp_init(&s->vp8dsp);
2715     if (CONFIG_VP7_DECODER && is_vp7) {
2716         ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP7, 8, 1);
2717         ff_vp7dsp_init(&s->vp8dsp);
2718         s->decode_mb_row_no_filter = vp7_decode_mb_row_no_filter;
2719         s->filter_mb_row           = vp7_filter_mb_row;
2720     } else if (CONFIG_VP8_DECODER && !is_vp7) {
2721         ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP8, 8, 1);
2722         ff_vp8dsp_init(&s->vp8dsp);
2723         s->decode_mb_row_no_filter = vp8_decode_mb_row_no_filter;
2724         s->filter_mb_row           = vp8_filter_mb_row;
2725     }
2726
2727     /* does not change for VP8 */
2728     memcpy(s->prob[0].scan, zigzag_scan, sizeof(s->prob[0].scan));
2729
2730     if ((ret = vp8_init_frames(s)) < 0) {
2731         ff_vp8_decode_free(avctx);
2732         return ret;
2733     }
2734
2735     return 0;
2736 }
2737
2738 #if CONFIG_VP7_DECODER
2739 static int vp7_decode_init(AVCodecContext *avctx)
2740 {
2741     return vp78_decode_init(avctx, IS_VP7);
2742 }
2743 #endif /* CONFIG_VP7_DECODER */
2744
2745 av_cold int ff_vp8_decode_init(AVCodecContext *avctx)
2746 {
2747     return vp78_decode_init(avctx, IS_VP8);
2748 }
2749
2750 #if CONFIG_VP8_DECODER
2751 static av_cold int vp8_decode_init_thread_copy(AVCodecContext *avctx)
2752 {
2753     VP8Context *s = avctx->priv_data;
2754     int ret;
2755
2756     s->avctx = avctx;
2757
2758     if ((ret = vp8_init_frames(s)) < 0) {
2759         ff_vp8_decode_free(avctx);
2760         return ret;
2761     }
2762
2763     return 0;
2764 }
2765
/* Map a frame pointer into s_src->frames onto the entry with the same index
 * in s->frames; NULL stays NULL. Uses s and s_src from the enclosing scope. */
#define REBASE(pic) ((pic) ? &s->frames[(pic) - &s_src->frames[0]] : NULL)
2767
2768 static int vp8_decode_update_thread_context(AVCodecContext *dst,
2769                                             const AVCodecContext *src)
2770 {
2771     VP8Context *s = dst->priv_data, *s_src = src->priv_data;
2772     int i;
2773
2774     if (s->macroblocks_base &&
2775         (s_src->mb_width != s->mb_width || s_src->mb_height != s->mb_height)) {
2776         free_buffers(s);
2777         s->mb_width  = s_src->mb_width;
2778         s->mb_height = s_src->mb_height;
2779     }
2780
2781     s->prob[0]      = s_src->prob[!s_src->update_probabilities];
2782     s->segmentation = s_src->segmentation;
2783     s->lf_delta     = s_src->lf_delta;
2784     memcpy(s->sign_bias, s_src->sign_bias, sizeof(s->sign_bias));
2785
2786     for (i = 0; i < FF_ARRAY_ELEMS(s_src->frames); i++) {
2787         if (s_src->frames[i].tf.f->data[0]) {
2788             int ret = vp8_ref_frame(s, &s->frames[i], &s_src->frames[i]);
2789             if (ret < 0)
2790                 return ret;
2791         }
2792     }
2793
2794     s->framep[0] = REBASE(s_src->next_framep[0]);
2795     s->framep[1] = REBASE(s_src->next_framep[1]);
2796     s->framep[2] = REBASE(s_src->next_framep[2]);
2797     s->framep[3] = REBASE(s_src->next_framep[3]);
2798
2799     return 0;
2800 }
2801 #endif /* CONFIG_VP8_DECODER */
2802
2803 #if CONFIG_VP7_DECODER
2804 AVCodec ff_vp7_decoder = {
2805     .name                  = "vp7",
2806     .long_name             = NULL_IF_CONFIG_SMALL("On2 VP7"),
2807     .type                  = AVMEDIA_TYPE_VIDEO,
2808     .id                    = AV_CODEC_ID_VP7,
2809     .priv_data_size        = sizeof(VP8Context),
2810     .init                  = vp7_decode_init,
2811     .close                 = ff_vp8_decode_free,
2812     .decode                = vp7_decode_frame,
2813     .capabilities          = CODEC_CAP_DR1,
2814     .flush                 = vp8_decode_flush,
2815 };
2816 #endif /* CONFIG_VP7_DECODER */
2817
2818 #if CONFIG_VP8_DECODER
2819 AVCodec ff_vp8_decoder = {
2820     .name                  = "vp8",
2821     .long_name             = NULL_IF_CONFIG_SMALL("On2 VP8"),
2822     .type                  = AVMEDIA_TYPE_VIDEO,
2823     .id                    = AV_CODEC_ID_VP8,
2824     .priv_data_size        = sizeof(VP8Context),
2825     .init                  = ff_vp8_decode_init,
2826     .close                 = ff_vp8_decode_free,
2827     .decode                = ff_vp8_decode_frame,
2828     .capabilities          = CODEC_CAP_DR1 | CODEC_CAP_FRAME_THREADS | CODEC_CAP_SLICE_THREADS,
2829     .flush                 = vp8_decode_flush,
2830     .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp8_decode_init_thread_copy),
2831     .update_thread_context = ONLY_IF_THREADS_ENABLED(vp8_decode_update_thread_context),
2832 };
#endif /* CONFIG_VP8_DECODER */