libavcodec/vp8.c

   1 /*
   2  * VP7/VP8 compatible video decoder
   3  *
   4  * Copyright (C) 2010 David Conrad
   5  * Copyright (C) 2010 Ronald S. Bultje
   6  * Copyright (C) 2010 Fiona Glaser
   7  * Copyright (C) 2012 Daniel Kang
   8  * Copyright (C) 2014 Peter Ross
   9  *
  10  * This file is part of Libav.
  11  *
  12  * Libav is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public
  14  * License as published by the Free Software Foundation; either
  15  * version 2.1 of the License, or (at your option) any later version.
  16  *
  17  * Libav is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with Libav; if not, write to the Free Software
  24  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  25  */
  26
  27 #include "libavutil/imgutils.h"
  28
  29 #include "avcodec.h"
  30 #include "internal.h"
  31 #include "mathops.h"
  32 #include "rectangle.h"
  33 #include "thread.h"
  34 #include "vp8.h"
  35 #include "vp8data.h"
  36
  37 #if ARCH_ARM
  38 #   include "arm/vp8.h"
  39 #endif
  40
  41 static void free_buffers(VP8Context *s)
  42 {
  43     int i;
  44     if (s->thread_data)
  45         for (i = 0; i < MAX_THREADS; i++) {
  46 #if HAVE_THREADS
  47             pthread_cond_destroy(&s->thread_data[i].cond);
  48             pthread_mutex_destroy(&s->thread_data[i].lock);
  49 #endif
  50             av_freep(&s->thread_data[i].filter_strength);
  51         }
  52     av_freep(&s->thread_data);
  53     av_freep(&s->macroblocks_base);
  54     av_freep(&s->intra4x4_pred_mode_top);
  55     av_freep(&s->top_nnz);
  56     av_freep(&s->top_border);
  57
  58     s->macroblocks = NULL;
  59 }
  60
  61 static int vp8_alloc_frame(VP8Context *s, VP8Frame *f, int ref)
  62 {
  63     int ret;
  64     if ((ret = ff_thread_get_buffer(s->avctx, &f->tf,
  65                                     ref ? AV_GET_BUFFER_FLAG_REF : 0)) < 0)
  66         return ret;
  67     if (!(f->seg_map = av_buffer_allocz(s->mb_width * s->mb_height))) {
  68         ff_thread_release_buffer(s->avctx, &f->tf);
  69         return AVERROR(ENOMEM);
  70     }
  71     return 0;
  72 }
  73
  74 static void vp8_release_frame(VP8Context *s, VP8Frame *f)
  75 {
  76     av_buffer_unref(&f->seg_map);
  77     ff_thread_release_buffer(s->avctx, &f->tf);
  78 }
  79
  80 #if CONFIG_VP8_DECODER
  81 static int vp8_ref_frame(VP8Context *s, VP8Frame *dst, VP8Frame *src)
  82 {
  83     int ret;
  84
  85     vp8_release_frame(s, dst);
  86
  87     if ((ret = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0)
  88         return ret;
  89     if (src->seg_map &&
  90         !(dst->seg_map = av_buffer_ref(src->seg_map))) {
  91         vp8_release_frame(s, dst);
  92         return AVERROR(ENOMEM);
  93     }
  94
  95     return 0;
  96 }
  97 #endif /* CONFIG_VP8_DECODER */
  98
  99 static void vp8_decode_flush_impl(AVCodecContext *avctx, int free_mem)
 100 {
 101     VP8Context *s = avctx->priv_data;
 102     int i;
 103
 104     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
 105         vp8_release_frame(s, &s->frames[i]);
 106     memset(s->framep, 0, sizeof(s->framep));
 107
 108     if (free_mem)
 109         free_buffers(s);
 110 }
 111
 112 static void vp8_decode_flush(AVCodecContext *avctx)
 113 {
 114     vp8_decode_flush_impl(avctx, 0);
 115 }
 116
 117 static VP8Frame *vp8_find_free_buffer(VP8Context *s)
 118 {
 119     VP8Frame *frame = NULL;
 120     int i;
 121
 122     // find a free buffer
 123     for (i = 0; i < 5; i++)
 124         if (&s->frames[i] != s->framep[VP56_FRAME_CURRENT]  &&
 125             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
 126             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN]   &&
 127             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2]) {
 128             frame = &s->frames[i];
 129             break;
 130         }
 131     if (i == 5) {
 132         av_log(s->avctx, AV_LOG_FATAL, "Ran out of free frames!\n");
 133         abort();
 134     }
 135     if (frame->tf.f->data[0])
 136         vp8_release_frame(s, frame);
 137
 138     return frame;
 139 }
 140
 141 static av_always_inline
 142 int update_dimensions(VP8Context *s, int width, int height, int is_vp7)
 143 {
 144     AVCodecContext *avctx = s->avctx;
 145     int i, ret;
 146
 147     if (width  != s->avctx->width ||
 148         height != s->avctx->height) {
 149         vp8_decode_flush_impl(s->avctx, 1);
 150
 151         ret = ff_set_dimensions(s->avctx, width, height);
 152         if (ret < 0)
 153             return ret;
 154     }
 155
 156     s->mb_width  = (s->avctx->coded_width  + 15) / 16;
 157     s->mb_height = (s->avctx->coded_height + 15) / 16;
 158
 159     s->mb_layout = is_vp7 || avctx->active_thread_type == FF_THREAD_SLICE &&
 160                    FFMIN(s->num_coeff_partitions, avctx->thread_count) > 1;
 161     if (!s->mb_layout) { // Frame threading and one thread
 162         s->macroblocks_base       = av_mallocz((s->mb_width + s->mb_height * 2 + 1) *
 163                                                sizeof(*s->macroblocks));
 164         s->intra4x4_pred_mode_top = av_mallocz(s->mb_width * 4);
 165     } else // Sliced threading
 166         s->macroblocks_base = av_mallocz((s->mb_width + 2) * (s->mb_height + 2) *
 167                                          sizeof(*s->macroblocks));
 168     s->top_nnz     = av_mallocz(s->mb_width * sizeof(*s->top_nnz));
 169     s->top_border  = av_mallocz((s->mb_width + 1) * sizeof(*s->top_border));
 170     s->thread_data = av_mallocz(MAX_THREADS * sizeof(VP8ThreadData));
 171
 172     if (!s->macroblocks_base || !s->top_nnz || !s->top_border ||
 173         !s->thread_data || (!s->intra4x4_pred_mode_top && !s->mb_layout)) {
 174         free_buffers(s);
 175         return AVERROR(ENOMEM);
 176     }
 177
 178     for (i = 0; i < MAX_THREADS; i++) {
 179         s->thread_data[i].filter_strength =
 180             av_mallocz(s->mb_width * sizeof(*s->thread_data[0].filter_strength));
 181         if (!s->thread_data[i].filter_strength) {
 182             free_buffers(s);
 183             return AVERROR(ENOMEM);
 184         }
 185 #if HAVE_THREADS
 186         pthread_mutex_init(&s->thread_data[i].lock, NULL);
 187         pthread_cond_init(&s->thread_data[i].cond, NULL);
 188 #endif
 189     }
 190
 191     s->macroblocks = s->macroblocks_base + 1;
 192
 193     return 0;
 194 }
 195
 196 static int vp7_update_dimensions(VP8Context *s, int width, int height)
 197 {
 198     return update_dimensions(s, width, height, IS_VP7);
 199 }
 200
 201 static int vp8_update_dimensions(VP8Context *s, int width, int height)
 202 {
 203     return update_dimensions(s, width, height, IS_VP8);
 204 }
 205
 206 static void parse_segment_info(VP8Context *s)
 207 {
 208     VP56RangeCoder *c = &s->c;
 209     int i;
 210
 211     s->segmentation.update_map = vp8_rac_get(c);
 212
 213     if (vp8_rac_get(c)) { // update segment feature data
 214         s->segmentation.absolute_vals = vp8_rac_get(c);
 215
 216         for (i = 0; i < 4; i++)
 217             s->segmentation.base_quant[i]   = vp8_rac_get_sint(c, 7);
 218
 219         for (i = 0; i < 4; i++)
 220             s->segmentation.filter_level[i] = vp8_rac_get_sint(c, 6);
 221     }
 222     if (s->segmentation.update_map)
 223         for (i = 0; i < 3; i++)
 224             s->prob->segmentid[i] = vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
 225 }
 226
 227 static void update_lf_deltas(VP8Context *s)
 228 {
 229     VP56RangeCoder *c = &s->c;
 230     int i;
 231
 232     for (i = 0; i < 4; i++) {
 233         if (vp8_rac_get(c)) {
 234             s->lf_delta.ref[i] = vp8_rac_get_uint(c, 6);
 235
 236             if (vp8_rac_get(c))
 237                 s->lf_delta.ref[i] = -s->lf_delta.ref[i];
 238         }
 239     }
 240
 241     for (i = MODE_I4x4; i <= VP8_MVMODE_SPLIT; i++) {
 242         if (vp8_rac_get(c)) {
 243             s->lf_delta.mode[i] = vp8_rac_get_uint(c, 6);
 244
 245             if (vp8_rac_get(c))
 246                 s->lf_delta.mode[i] = -s->lf_delta.mode[i];
 247         }
 248     }
 249 }
 250
 251 static int setup_partitions(VP8Context *s, const uint8_t *buf, int buf_size)
 252 {
 253     const uint8_t *sizes = buf;
 254     int i;
 255
 256     s->num_coeff_partitions = 1 << vp8_rac_get_uint(&s->c, 2);
 257
 258     buf      += 3 * (s->num_coeff_partitions - 1);
 259     buf_size -= 3 * (s->num_coeff_partitions - 1);
 260     if (buf_size < 0)
 261         return -1;
 262
 263     for (i = 0; i < s->num_coeff_partitions - 1; i++) {
 264         int size = AV_RL24(sizes + 3 * i);
 265         if (buf_size - size < 0)
 266             return -1;
 267
 268         ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, size);
 269         buf      += size;
 270         buf_size -= size;
 271     }
 272     ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, buf_size);
 273
 274     return 0;
 275 }
 276
 277 static void vp7_get_quants(VP8Context *s)
 278 {
 279     VP56RangeCoder *c = &s->c;
 280
 281     int yac_qi  = vp8_rac_get_uint(c, 7);
 282     int ydc_qi  = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 283     int y2dc_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 284     int y2ac_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 285     int uvdc_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 286     int uvac_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 287
 288     s->qmat[0].luma_qmul[0]    =       vp7_ydc_qlookup[ydc_qi];
 289     s->qmat[0].luma_qmul[1]    =       vp7_yac_qlookup[yac_qi];
 290     s->qmat[0].luma_dc_qmul[0] =       vp7_y2dc_qlookup[y2dc_qi];
 291     s->qmat[0].luma_dc_qmul[1] =       vp7_y2ac_qlookup[y2ac_qi];
 292     s->qmat[0].chroma_qmul[0]  = FFMIN(vp7_ydc_qlookup[uvdc_qi], 132);
 293     s->qmat[0].chroma_qmul[1]  =       vp7_yac_qlookup[uvac_qi];
 294 }
 295
 296 static void get_quants(VP8Context *s)
 297 {
 298     VP56RangeCoder *c = &s->c;
 299     int i, base_qi;
 300
 301     int yac_qi     = vp8_rac_get_uint(c, 7);
 302     int ydc_delta  = vp8_rac_get_sint(c, 4);
 303     int y2dc_delta = vp8_rac_get_sint(c, 4);
 304     int y2ac_delta = vp8_rac_get_sint(c, 4);
 305     int uvdc_delta = vp8_rac_get_sint(c, 4);
 306     int uvac_delta = vp8_rac_get_sint(c, 4);
 307
 308     for (i = 0; i < 4; i++) {
 309         if (s->segmentation.enabled) {
 310             base_qi = s->segmentation.base_quant[i];
 311             if (!s->segmentation.absolute_vals)
 312                 base_qi += yac_qi;
 313         } else
 314             base_qi = yac_qi;
 315
 316         s->qmat[i].luma_qmul[0]    = vp8_dc_qlookup[av_clip_uintp2(base_qi + ydc_delta,  7)];
 317         s->qmat[i].luma_qmul[1]    = vp8_ac_qlookup[av_clip_uintp2(base_qi,              7)];
 318         s->qmat[i].luma_dc_qmul[0] = vp8_dc_qlookup[av_clip_uintp2(base_qi + y2dc_delta, 7)] * 2;
 319         /* 101581>>16 is equivalent to 155/100 */
 320         s->qmat[i].luma_dc_qmul[1] = vp8_ac_qlookup[av_clip_uintp2(base_qi + y2ac_delta, 7)] * 101581 >> 16;
 321         s->qmat[i].chroma_qmul[0]  = vp8_dc_qlookup[av_clip_uintp2(base_qi + uvdc_delta, 7)];
 322         s->qmat[i].chroma_qmul[1]  = vp8_ac_qlookup[av_clip_uintp2(base_qi + uvac_delta, 7)];
 323
 324         s->qmat[i].luma_dc_qmul[1] = FFMAX(s->qmat[i].luma_dc_qmul[1], 8);
 325         s->qmat[i].chroma_qmul[0]  = FFMIN(s->qmat[i].chroma_qmul[0], 132);
 326     }
 327 }
 328
 329 /**
 330  * Determine which buffers golden and altref should be updated with after this frame.
 331  * The spec isn't clear here, so I'm going by my understanding of what libvpx does
 332  *
 333  * Intra frames update all 3 references
 334  * Inter frames update VP56_FRAME_PREVIOUS if the update_last flag is set
 335  * If the update (golden|altref) flag is set, it's updated with the current frame
 336  *      if update_last is set, and VP56_FRAME_PREVIOUS otherwise.
 337  * If the flag is not set, the number read means:
 338  *      0: no update
 339  *      1: VP56_FRAME_PREVIOUS
 340  *      2: update golden with altref, or update altref with golden
 341  */
 342 static VP56Frame ref_to_update(VP8Context *s, int update, VP56Frame ref)
 343 {
 344     VP56RangeCoder *c = &s->c;
 345
 346     if (update)
 347         return VP56_FRAME_CURRENT;
 348
 349     switch (vp8_rac_get_uint(c, 2)) {
 350     case 1:
 351         return VP56_FRAME_PREVIOUS;
 352     case 2:
 353         return (ref == VP56_FRAME_GOLDEN) ? VP56_FRAME_GOLDEN2 : VP56_FRAME_GOLDEN;
 354     }
 355     return VP56_FRAME_NONE;
 356 }
 357
 358 static void vp78_reset_probability_tables(VP8Context *s)
 359 {
 360     int i, j;
 361     for (i = 0; i < 4; i++)
 362         for (j = 0; j < 16; j++)
 363             memcpy(s->prob->token[i][j], vp8_token_default_probs[i][vp8_coeff_band[j]],
 364                    sizeof(s->prob->token[i][j]));
 365 }
 366
 367 static void vp78_update_probability_tables(VP8Context *s)
 368 {
 369     VP56RangeCoder *c = &s->c;
 370     int i, j, k, l, m;
 371
 372     for (i = 0; i < 4; i++)
 373         for (j = 0; j < 8; j++)
 374             for (k = 0; k < 3; k++)
 375                 for (l = 0; l < NUM_DCT_TOKENS-1; l++)
 376                     if (vp56_rac_get_prob_branchy(c, vp8_token_update_probs[i][j][k][l])) {
 377                         int prob = vp8_rac_get_uint(c, 8);
 378                         for (m = 0; vp8_coeff_band_indexes[j][m] >= 0; m++)
 379                             s->prob->token[i][vp8_coeff_band_indexes[j][m]][k][l] = prob;
 380                     }
 381 }
 382
 383 #define VP7_MVC_SIZE 17
 384 #define VP8_MVC_SIZE 19
 385
 386 static void vp78_update_pred16x16_pred8x8_mvc_probabilities(VP8Context *s,
 387                                                             int mvc_size)
 388 {
 389     VP56RangeCoder *c = &s->c;
 390     int i, j;
 391
 392     if (vp8_rac_get(c))
 393         for (i = 0; i < 4; i++)
 394             s->prob->pred16x16[i] = vp8_rac_get_uint(c, 8);
 395     if (vp8_rac_get(c))
 396         for (i = 0; i < 3; i++)
 397             s->prob->pred8x8c[i]  = vp8_rac_get_uint(c, 8);
 398
 399     // 17.2 MV probability update
 400     for (i = 0; i < 2; i++)
 401         for (j = 0; j < mvc_size; j++)
 402             if (vp56_rac_get_prob_branchy(c, vp8_mv_update_prob[i][j]))
 403                 s->prob->mvc[i][j] = vp8_rac_get_nn(c);
 404 }
 405
 406 static void update_refs(VP8Context *s)
 407 {
 408     VP56RangeCoder *c = &s->c;
 409
 410     int update_golden = vp8_rac_get(c);
 411     int update_altref = vp8_rac_get(c);
 412
 413     s->update_golden = ref_to_update(s, update_golden, VP56_FRAME_GOLDEN);
 414     s->update_altref = ref_to_update(s, update_altref, VP56_FRAME_GOLDEN2);
 415 }
 416
 417 static void copy_luma(AVFrame *dst, AVFrame *src, int width, int height)
 418 {
 419     int i, j;
 420
 421     for (j = 1; j < 3; j++) {
 422         for (i = 0; i < height / 2; i++)
 423             memcpy(dst->data[j] + i * dst->linesize[j],
 424                    src->data[j] + i * src->linesize[j], width / 2);
 425     }
 426 }
 427
 428 static void fade(uint8_t *dst, uint8_t *src,
 429                  int width, int height, ptrdiff_t linesize,
 430                  int alpha, int beta)
 431 {
 432     int i, j;
 433
 434     for (j = 0; j < height; j++) {
 435         for (i = 0; i < width; i++) {
 436             uint8_t y = src[j * linesize + i];
 437             dst[j * linesize + i] = av_clip_uint8(y + ((y * beta) >> 8) + alpha);
 438         }
 439     }
 440 }
 441
 442 static int vp7_fade_frame(VP8Context *s, VP56RangeCoder *c)
 443 {
 444     int alpha = (int8_t) vp8_rac_get_uint(c, 8);
 445     int beta  = (int8_t) vp8_rac_get_uint(c, 8);
 446     int ret;
 447
 448     if (!s->keyframe && (alpha || beta)) {
 449         int width  = s->mb_width * 16;
 450         int height = s->mb_height * 16;
 451         AVFrame *src, *dst;
 452
 453         if (!s->framep[VP56_FRAME_PREVIOUS])
 454             return AVERROR_INVALIDDATA;
 455
 456         dst =
 457         src = s->framep[VP56_FRAME_PREVIOUS]->tf.f;
 458
 459         /* preserve the golden frame, write a new previous frame */
 460         if (s->framep[VP56_FRAME_GOLDEN] == s->framep[VP56_FRAME_PREVIOUS]) {
 461             s->framep[VP56_FRAME_PREVIOUS] = vp8_find_free_buffer(s);
 462             if ((ret = vp8_alloc_frame(s, s->framep[VP56_FRAME_PREVIOUS], 1)) < 0)
 463                return ret;
 464
 465             dst = s->framep[VP56_FRAME_PREVIOUS]->tf.f;
 466
 467             copy_luma(dst, src, width, height);
 468         }
 469
 470         fade(dst->data[0], src->data[0],
 471              width, height, dst->linesize[0], alpha, beta);
 472     }
 473
 474     return 0;
 475 }
 476
 477 static int vp7_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
 478 {
 479     VP56RangeCoder *c = &s->c;
 480     int part1_size, hscale, vscale, i, j, ret;
 481     int width  = s->avctx->width;
 482     int height = s->avctx->height;
 483
 484     if (buf_size < 4) {
 485         return AVERROR_INVALIDDATA;
 486     }
 487
 488     s->profile = (buf[0] >> 1) & 7;
 489     if (s->profile > 1) {
 490         avpriv_request_sample(s->avctx, "Unknown profile %d", s->profile);
 491         return AVERROR_INVALIDDATA;
 492     }
 493
 494     s->keyframe  = !(buf[0] & 1);
 495     s->invisible = 0;
 496     part1_size   = AV_RL24(buf) >> 4;
 497
 498     buf      += 4 - s->profile;
 499     buf_size -= 4 - s->profile;
 500
 501     if (buf_size < part1_size) {
 502         return AVERROR_INVALIDDATA;
 503     }
 504
 505     memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab, sizeof(s->put_pixels_tab));
 506
 507     ff_vp56_init_range_decoder(c, buf, part1_size);
 508     buf      += part1_size;
 509     buf_size -= part1_size;
 510
 511     /* A. Dimension information (keyframes only) */
 512     if (s->keyframe) {
 513         width  = vp8_rac_get_uint(c, 12);
 514         height = vp8_rac_get_uint(c, 12);
 515         hscale = vp8_rac_get_uint(c, 2);
 516         vscale = vp8_rac_get_uint(c, 2);
 517         if (hscale || vscale)
 518             avpriv_request_sample(s->avctx, "Upscaling");
 519
 520         s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
 521         vp78_reset_probability_tables(s);
 522         memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter,
 523                sizeof(s->prob->pred16x16));
 524         memcpy(s->prob->pred8x8c, vp8_pred8x8c_prob_inter,
 525                sizeof(s->prob->pred8x8c));
 526         for (i = 0; i < 2; i++)
 527             memcpy(s->prob->mvc[i], vp7_mv_default_prob[i],
 528                    sizeof(vp7_mv_default_prob[i]));
 529         memset(&s->segmentation, 0, sizeof(s->segmentation));
 530         memset(&s->lf_delta, 0, sizeof(s->lf_delta));
 531         memcpy(s->prob[0].scan, ff_zigzag_scan, sizeof(s->prob[0].scan));
 532     }
 533
 534     if (s->keyframe || s->profile > 0)
 535         memset(s->inter_dc_pred, 0 , sizeof(s->inter_dc_pred));
 536
 537     /* B. Decoding information for all four macroblock-level features */
 538     for (i = 0; i < 4; i++) {
 539         s->feature_enabled[i] = vp8_rac_get(c);
 540         if (s->feature_enabled[i]) {
 541              s->feature_present_prob[i] = vp8_rac_get_uint(c, 8);
 542
 543              for (j = 0; j < 3; j++)
 544                  s->feature_index_prob[i][j] =
 545                      vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
 546
 547              if (vp7_feature_value_size[s->profile][i])
 548                  for (j = 0; j < 4; j++)
 549                      s->feature_value[i][j] =
 550                          vp8_rac_get(c) ? vp8_rac_get_uint(c, vp7_feature_value_size[s->profile][i]) : 0;
 551         }
 552     }
 553
 554     s->segmentation.enabled    = 0;
 555     s->segmentation.update_map = 0;
 556     s->lf_delta.enabled        = 0;
 557
 558     s->num_coeff_partitions = 1;
 559     ff_vp56_init_range_decoder(&s->coeff_partition[0], buf, buf_size);
 560
 561     if (!s->macroblocks_base || /* first frame */
 562         width != s->avctx->width || height != s->avctx->height ||
 563         (width + 15) / 16 != s->mb_width || (height + 15) / 16 != s->mb_height) {
 564         if ((ret = vp7_update_dimensions(s, width, height)) < 0)
 565             return ret;
 566     }
 567
 568     /* C. Dequantization indices */
 569     vp7_get_quants(s);
 570
 571     /* D. Golden frame update flag (a Flag) for interframes only */
 572     if (!s->keyframe) {
 573         s->update_golden = vp8_rac_get(c) ? VP56_FRAME_CURRENT : VP56_FRAME_NONE;
 574         s->sign_bias[VP56_FRAME_GOLDEN] = 0;
 575     }
 576
 577     s->update_last          = 1;
 578     s->update_probabilities = 1;
 579     s->fade_present         = 1;
 580
 581     if (s->profile > 0) {
 582         s->update_probabilities = vp8_rac_get(c);
 583         if (!s->update_probabilities)
 584             s->prob[1] = s->prob[0];
 585
 586         if (!s->keyframe)
 587             s->fade_present = vp8_rac_get(c);
 588     }
 589
 590     /* E. Fading information for previous frame */
 591     if (s->fade_present && vp8_rac_get(c)) {
 592         if ((ret = vp7_fade_frame(s ,c)) < 0)
 593             return ret;
 594     }
 595
 596     /* F. Loop filter type */
 597     if (!s->profile)
 598         s->filter.simple = vp8_rac_get(c);
 599
 600     /* G. DCT coefficient ordering specification */
 601     if (vp8_rac_get(c))
 602         for (i = 1; i < 16; i++)
 603             s->prob[0].scan[i] = ff_zigzag_scan[vp8_rac_get_uint(c, 4)];
 604
 605     /* H. Loop filter levels  */
 606     if (s->profile > 0)
 607         s->filter.simple = vp8_rac_get(c);
 608     s->filter.level     = vp8_rac_get_uint(c, 6);
 609     s->filter.sharpness = vp8_rac_get_uint(c, 3);
 610
 611     /* I. DCT coefficient probability update; 13.3 Token Probability Updates */
 612     vp78_update_probability_tables(s);
 613
 614     s->mbskip_enabled = 0;
 615
 616     /* J. The remaining frame header data occurs ONLY FOR INTERFRAMES */
 617     if (!s->keyframe) {
 618         s->prob->intra  = vp8_rac_get_uint(c, 8);
 619         s->prob->last   = vp8_rac_get_uint(c, 8);
 620         vp78_update_pred16x16_pred8x8_mvc_probabilities(s, VP7_MVC_SIZE);
 621     }
 622
 623     return 0;
 624 }
 625
 626 static int vp8_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
 627 {
 628     VP56RangeCoder *c = &s->c;
 629     int header_size, hscale, vscale, ret;
 630     int width  = s->avctx->width;
 631     int height = s->avctx->height;
 632
 633     s->keyframe  = !(buf[0] & 1);
 634     s->profile   =  (buf[0]>>1) & 7;
 635     s->invisible = !(buf[0] & 0x10);
 636     header_size  = AV_RL24(buf) >> 5;
 637     buf      += 3;
 638     buf_size -= 3;
 639
 640     if (s->profile > 3)
 641         av_log(s->avctx, AV_LOG_WARNING, "Unknown profile %d\n", s->profile);
 642
 643     if (!s->profile)
 644         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab,
 645                sizeof(s->put_pixels_tab));
 646     else    // profile 1-3 use bilinear, 4+ aren't defined so whatever
 647         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_bilinear_pixels_tab,
 648                sizeof(s->put_pixels_tab));
 649
 650     if (header_size > buf_size - 7 * s->keyframe) {
 651         av_log(s->avctx, AV_LOG_ERROR, "Header size larger than data provided\n");
 652         return AVERROR_INVALIDDATA;
 653     }
 654
 655     if (s->keyframe) {
 656         if (AV_RL24(buf) != 0x2a019d) {
 657             av_log(s->avctx, AV_LOG_ERROR,
 658                    "Invalid start code 0x%x\n", AV_RL24(buf));
 659             return AVERROR_INVALIDDATA;
 660         }
 661         width     = AV_RL16(buf + 3) & 0x3fff;
 662         height    = AV_RL16(buf + 5) & 0x3fff;
 663         hscale    = buf[4] >> 6;
 664         vscale    = buf[6] >> 6;
 665         buf      += 7;
 666         buf_size -= 7;
 667
 668         if (hscale || vscale)
 669             avpriv_request_sample(s->avctx, "Upscaling");
 670
 671         s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
 672         vp78_reset_probability_tables(s);
 673         memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter,
 674                sizeof(s->prob->pred16x16));
 675         memcpy(s->prob->pred8x8c, vp8_pred8x8c_prob_inter,
 676                sizeof(s->prob->pred8x8c));
 677         memcpy(s->prob->mvc, vp8_mv_default_prob,
 678                sizeof(s->prob->mvc));
 679         memset(&s->segmentation, 0, sizeof(s->segmentation));
 680         memset(&s->lf_delta, 0, sizeof(s->lf_delta));
 681     }
 682
 683     ff_vp56_init_range_decoder(c, buf, header_size);
 684     buf      += header_size;
 685     buf_size -= header_size;
 686
 687     if (s->keyframe) {
 688         s->colorspace = vp8_rac_get(c);
 689         if (s->colorspace)
 690             av_log(s->avctx, AV_LOG_WARNING, "Unspecified colorspace\n");
 691         s->fullrange = vp8_rac_get(c);
 692     }
 693
 694     if ((s->segmentation.enabled = vp8_rac_get(c)))
 695         parse_segment_info(s);
 696     else
 697         s->segmentation.update_map = 0; // FIXME: move this to some init function?
 698
 699     s->filter.simple    = vp8_rac_get(c);
 700     s->filter.level     = vp8_rac_get_uint(c, 6);
 701     s->filter.sharpness = vp8_rac_get_uint(c, 3);
 702
 703     if ((s->lf_delta.enabled = vp8_rac_get(c)))
 704         if (vp8_rac_get(c))
 705             update_lf_deltas(s);
 706
 707     if (setup_partitions(s, buf, buf_size)) {
 708         av_log(s->avctx, AV_LOG_ERROR, "Invalid partitions\n");
 709         return AVERROR_INVALIDDATA;
 710     }
 711
 712     if (!s->macroblocks_base || /* first frame */
 713         width != s->avctx->width || height != s->avctx->height)
 714         if ((ret = vp8_update_dimensions(s, width, height)) < 0)
 715             return ret;
 716
 717     get_quants(s);
 718
 719     if (!s->keyframe) {
 720         update_refs(s);
 721         s->sign_bias[VP56_FRAME_GOLDEN]               = vp8_rac_get(c);
 722         s->sign_bias[VP56_FRAME_GOLDEN2 /* altref */] = vp8_rac_get(c);
 723     }
 724
 725     // if we aren't saving this frame's probabilities for future frames,
 726     // make a copy of the current probabilities
 727     if (!(s->update_probabilities = vp8_rac_get(c)))
 728         s->prob[1] = s->prob[0];
 729
 730     s->update_last = s->keyframe || vp8_rac_get(c);
 731
 732     vp78_update_probability_tables(s);
 733
 734     if ((s->mbskip_enabled = vp8_rac_get(c)))
 735         s->prob->mbskip = vp8_rac_get_uint(c, 8);
 736
 737     if (!s->keyframe) {
 738         s->prob->intra  = vp8_rac_get_uint(c, 8);
 739         s->prob->last   = vp8_rac_get_uint(c, 8);
 740         s->prob->golden = vp8_rac_get_uint(c, 8);
 741         vp78_update_pred16x16_pred8x8_mvc_probabilities(s, VP8_MVC_SIZE);
 742     }
 743
 744     return 0;
 745 }
 746
 747 static av_always_inline
 748 void clamp_mv(VP8Context *s, VP56mv *dst, const VP56mv *src)
 749 {
 750     dst->x = av_clip(src->x, s->mv_min.x, s->mv_max.x);
 751     dst->y = av_clip(src->y, s->mv_min.y, s->mv_max.y);
 752 }
 753
 754 /**
 755  * Motion vector coding, 17.1.
 756  */
 757 static int read_mv_component(VP56RangeCoder *c, const uint8_t *p, int vp7)
 758 {
 759     int bit, x = 0;
 760
 761     if (vp56_rac_get_prob_branchy(c, p[0])) {
 762         int i;
 763
 764         for (i = 0; i < 3; i++)
 765             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 766         for (i = (vp7 ? 7 : 9); i > 3; i--)
 767             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 768         if (!(x & (vp7 ? 0xF0 : 0xFFF0)) || vp56_rac_get_prob(c, p[12]))
 769             x += 8;
 770     } else {
 771         // small_mvtree
 772         const uint8_t *ps = p + 2;
 773         bit = vp56_rac_get_prob(c, *ps);
 774         ps += 1 + 3 * bit;
 775         x  += 4 * bit;
 776         bit = vp56_rac_get_prob(c, *ps);
 777         ps += 1 + bit;
 778         x  += 2 * bit;
 779         x  += vp56_rac_get_prob(c, *ps);
 780     }
 781
 782     return (x && vp56_rac_get_prob(c, p[1])) ? -x : x;
 783 }
 784
 785 static av_always_inline
 786 const uint8_t *get_submv_prob(uint32_t left, uint32_t top, int is_vp7)
 787 {
 788     if (is_vp7)
 789         return vp7_submv_prob;
 790
 791     if (left == top)
 792         return vp8_submv_prob[4 - !!left];
 793     if (!top)
 794         return vp8_submv_prob[2];
 795     return vp8_submv_prob[1 - !!left];
 796 }
 797
 798 /**
 799  * Split motion vector prediction, 16.4.
 800  * @returns the number of motion vectors parsed (2, 4 or 16)
 801  */
 802 static av_always_inline
 803 int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
 804                     int layout, int is_vp7)
 805 {
 806     int part_idx;
 807     int n, num;
 808     VP8Macroblock *top_mb;
 809     VP8Macroblock *left_mb = &mb[-1];
 810     const uint8_t *mbsplits_left = vp8_mbsplits[left_mb->partitioning];
 811     const uint8_t *mbsplits_top, *mbsplits_cur, *firstidx;
 812     VP56mv *top_mv;
 813     VP56mv *left_mv = left_mb->bmv;
 814     VP56mv *cur_mv  = mb->bmv;
 815
 816     if (!layout) // layout is inlined, s->mb_layout is not
 817         top_mb = &mb[2];
 818     else
 819         top_mb = &mb[-s->mb_width - 1];
 820     mbsplits_top = vp8_mbsplits[top_mb->partitioning];
 821     top_mv       = top_mb->bmv;
 822
 823     if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[0])) {
 824         if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[1]))
 825             part_idx = VP8_SPLITMVMODE_16x8 + vp56_rac_get_prob(c, vp8_mbsplit_prob[2]);
 826         else
 827             part_idx = VP8_SPLITMVMODE_8x8;
 828     } else {
 829         part_idx = VP8_SPLITMVMODE_4x4;
 830     }
 831
 832     num              = vp8_mbsplit_count[part_idx];
 833     mbsplits_cur     = vp8_mbsplits[part_idx],
 834     firstidx         = vp8_mbfirstidx[part_idx];
 835     mb->partitioning = part_idx;
 836
 837     for (n = 0; n < num; n++) {
 838         int k = firstidx[n];
 839         uint32_t left, above;
 840         const uint8_t *submv_prob;
 841
 842         if (!(k & 3))
 843             left = AV_RN32A(&left_mv[mbsplits_left[k + 3]]);
 844         else
 845             left = AV_RN32A(&cur_mv[mbsplits_cur[k - 1]]);
 846         if (k <= 3)
 847             above = AV_RN32A(&top_mv[mbsplits_top[k + 12]]);
 848         else
 849             above = AV_RN32A(&cur_mv[mbsplits_cur[k - 4]]);
 850
 851         submv_prob = get_submv_prob(left, above, is_vp7);
 852
 853         if (vp56_rac_get_prob_branchy(c, submv_prob[0])) {
 854             if (vp56_rac_get_prob_branchy(c, submv_prob[1])) {
 855                 if (vp56_rac_get_prob_branchy(c, submv_prob[2])) {
 856                     mb->bmv[n].y = mb->mv.y +
 857                                    read_mv_component(c, s->prob->mvc[0], is_vp7);
 858                     mb->bmv[n].x = mb->mv.x +
 859                                    read_mv_component(c, s->prob->mvc[1], is_vp7);
 860                 } else {
 861                     AV_ZERO32(&mb->bmv[n]);
 862                 }
 863             } else {
 864                 AV_WN32A(&mb->bmv[n], above);
 865             }
 866         } else {
 867             AV_WN32A(&mb->bmv[n], left);
 868         }
 869     }
 870
 871     return num;
 872 }
 873
 874 /**
 875  * The vp7 reference decoder uses a padding macroblock column (added to right
 876  * edge of the frame) to guard against illegal macroblock offsets. The
 877  * algorithm has bugs that permit offsets to straddle the padding column.
 878  * This function replicates those bugs.
 879  *
 880  * @param[out] edge_x macroblock x address
 881  * @param[out] edge_y macroblock y address
 882  *
 883  * @return macroblock offset legal (boolean)
 884  */
 885 static int vp7_calculate_mb_offset(int mb_x, int mb_y, int mb_width,
 886                                    int xoffset, int yoffset, int boundary,
 887                                    int *edge_x, int *edge_y)
 888 {
 889     int vwidth = mb_width + 1;
 890     int new = (mb_y + yoffset) * vwidth + mb_x + xoffset;
 891     if (new < boundary || new % vwidth == vwidth - 1)
 892         return 0;
 893     *edge_y = new / vwidth;
 894     *edge_x = new % vwidth;
 895     return 1;
 896 }
 897
 898 static const VP56mv *get_bmv_ptr(const VP8Macroblock *mb, int subblock)
 899 {
 900     return &mb->bmv[mb->mode == VP8_MVMODE_SPLIT ? vp8_mbsplits[mb->partitioning][subblock] : 0];
 901 }
 902
 903 static av_always_inline
 904 void vp7_decode_mvs(VP8Context *s, VP8Macroblock *mb,
 905                     int mb_x, int mb_y, int layout)
 906 {
 907     VP8Macroblock *mb_edge[12];
 908     enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR };
 909     enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
 910     int idx = CNT_ZERO;
 911     VP56mv near_mv[3];
 912     uint8_t cnt[3] = { 0 };
 913     VP56RangeCoder *c = &s->c;
 914     int i;
 915
 916     AV_ZERO32(&near_mv[0]);
 917     AV_ZERO32(&near_mv[1]);
 918     AV_ZERO32(&near_mv[2]);
 919
 920     for (i = 0; i < VP7_MV_PRED_COUNT; i++) {
 921         const VP7MVPred * pred = &vp7_mv_pred[i];
 922         int edge_x, edge_y;
 923
 924         if (vp7_calculate_mb_offset(mb_x, mb_y, s->mb_width, pred->xoffset,
 925                                     pred->yoffset, !s->profile, &edge_x, &edge_y)) {
 926             VP8Macroblock *edge = mb_edge[i] = (s->mb_layout == 1)
 927                                              ? s->macroblocks_base + 1 + edge_x +
 928                                                (s->mb_width + 1) * (edge_y + 1)
 929                                              : s->macroblocks + edge_x +
 930                                                (s->mb_height - edge_y - 1) * 2;
 931             uint32_t mv = AV_RN32A(get_bmv_ptr(edge, vp7_mv_pred[i].subblock));
 932             if (mv) {
 933                 if (AV_RN32A(&near_mv[CNT_NEAREST])) {
 934                     if (mv == AV_RN32A(&near_mv[CNT_NEAREST])) {
 935                         idx = CNT_NEAREST;
 936                     } else if (AV_RN32A(&near_mv[CNT_NEAR])) {
 937                         if (mv != AV_RN32A(&near_mv[CNT_NEAR]))
 938                             continue;
 939                         idx = CNT_NEAR;
 940                     } else {
 941                         AV_WN32A(&near_mv[CNT_NEAR], mv);
 942                         idx = CNT_NEAR;
 943                     }
 944                 } else {
 945                     AV_WN32A(&near_mv[CNT_NEAREST], mv);
 946                     idx = CNT_NEAREST;
 947                 }
 948             } else {
 949                 idx = CNT_ZERO;
 950             }
 951         } else {
 952             idx = CNT_ZERO;
 953         }
 954         cnt[idx] += vp7_mv_pred[i].score;
 955     }
 956
 957     mb->partitioning = VP8_SPLITMVMODE_NONE;
 958
 959     if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_ZERO]][0])) {
 960         mb->mode = VP8_MVMODE_MV;
 961
 962         if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAREST]][1])) {
 963
 964             if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAR]][2])) {
 965
 966                 if (cnt[CNT_NEAREST] > cnt[CNT_NEAR])
 967                     AV_WN32A(&mb->mv, cnt[CNT_ZERO] > cnt[CNT_NEAREST] ? 0 : AV_RN32A(&near_mv[CNT_NEAREST]));
 968                 else
 969                     AV_WN32A(&mb->mv, cnt[CNT_ZERO] > cnt[CNT_NEAR]    ? 0 : AV_RN32A(&near_mv[CNT_NEAR]));
 970
 971                 if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAR]][3])) {
 972                     mb->mode = VP8_MVMODE_SPLIT;
 973                     mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout, IS_VP7) - 1];
 974                 } else {
 975                     mb->mv.y += read_mv_component(c, s->prob->mvc[0], IS_VP7);
 976                     mb->mv.x += read_mv_component(c, s->prob->mvc[1], IS_VP7);
 977                     mb->bmv[0] = mb->mv;
 978                 }
 979             } else {
 980                 mb->mv = near_mv[CNT_NEAR];
 981                 mb->bmv[0] = mb->mv;
 982             }
 983         } else {
 984             mb->mv = near_mv[CNT_NEAREST];
 985             mb->bmv[0] = mb->mv;
 986         }
 987     } else {
 988         mb->mode = VP8_MVMODE_ZERO;
 989         AV_ZERO32(&mb->mv);
 990         mb->bmv[0] = mb->mv;
 991     }
 992 }
 993
 994 static av_always_inline
 995 void vp8_decode_mvs(VP8Context *s, VP8Macroblock *mb,
 996                     int mb_x, int mb_y, int layout)
 997 {
 998     VP8Macroblock *mb_edge[3] = { 0      /* top */,
 999                                   mb - 1 /* left */,
1000                                   0      /* top-left */ };
1001     enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV };
1002     enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
1003     int idx = CNT_ZERO;
1004     int cur_sign_bias = s->sign_bias[mb->ref_frame];
1005     int8_t *sign_bias = s->sign_bias;
1006     VP56mv near_mv[4];
1007     uint8_t cnt[4] = { 0 };
1008     VP56RangeCoder *c = &s->c;
1009
1010     if (!layout) { // layout is inlined (s->mb_layout is not)
1011         mb_edge[0] = mb + 2;
1012         mb_edge[2] = mb + 1;
1013     } else {
1014         mb_edge[0] = mb - s->mb_width - 1;
1015         mb_edge[2] = mb - s->mb_width - 2;
1016     }
1017
1018     AV_ZERO32(&near_mv[0]);
1019     AV_ZERO32(&near_mv[1]);
1020     AV_ZERO32(&near_mv[2]);
1021
1022     /* Process MB on top, left and top-left */
1023 #define MV_EDGE_CHECK(n)                                                      \
1024     {                                                                         \
1025         VP8Macroblock *edge = mb_edge[n];                                     \
1026         int edge_ref = edge->ref_frame;                                       \
1027         if (edge_ref != VP56_FRAME_CURRENT) {                                 \
1028             uint32_t mv = AV_RN32A(&edge->mv);                                \
1029             if (mv) {                                                         \
1030                 if (cur_sign_bias != sign_bias[edge_ref]) {                   \
1031                     /* SWAR negate of the values in mv. */                    \
1032                     mv = ~mv;                                                 \
1033                     mv = ((mv & 0x7fff7fff) +                                 \
1034                           0x00010001) ^ (mv & 0x80008000);                    \
1035                 }                                                             \
1036                 if (!n || mv != AV_RN32A(&near_mv[idx]))                      \
1037                     AV_WN32A(&near_mv[++idx], mv);                            \
1038                 cnt[idx] += 1 + (n != 2);                                     \
1039             } else                                                            \
1040                 cnt[CNT_ZERO] += 1 + (n != 2);                                \
1041         }                                                                     \
1042     }
1043
1044     MV_EDGE_CHECK(0)
1045     MV_EDGE_CHECK(1)
1046     MV_EDGE_CHECK(2)
1047
1048     mb->partitioning = VP8_SPLITMVMODE_NONE;
1049     if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_ZERO]][0])) {
1050         mb->mode = VP8_MVMODE_MV;
1051
1052         /* If we have three distinct MVs, merge first and last if they're the same */
1053         if (cnt[CNT_SPLITMV] &&
1054             AV_RN32A(&near_mv[1 + VP8_EDGE_TOP]) == AV_RN32A(&near_mv[1 + VP8_EDGE_TOPLEFT]))
1055             cnt[CNT_NEAREST] += 1;
1056
1057         /* Swap near and nearest if necessary */
1058         if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
1059             FFSWAP(uint8_t,     cnt[CNT_NEAREST],     cnt[CNT_NEAR]);
1060             FFSWAP( VP56mv, near_mv[CNT_NEAREST], near_mv[CNT_NEAR]);
1061         }
1062
1063         if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) {
1064             if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAR]][2])) {
1065                 /* Choose the best mv out of 0,0 and the nearest mv */
1066                 clamp_mv(s, &mb->mv, &near_mv[CNT_ZERO + (cnt[CNT_NEAREST] >= cnt[CNT_ZERO])]);
1067                 cnt[CNT_SPLITMV] = ((mb_edge[VP8_EDGE_LEFT]->mode    == VP8_MVMODE_SPLIT) +
1068                                     (mb_edge[VP8_EDGE_TOP]->mode     == VP8_MVMODE_SPLIT)) * 2 +
1069                                     (mb_edge[VP8_EDGE_TOPLEFT]->mode == VP8_MVMODE_SPLIT);
1070
1071                 if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_SPLITMV]][3])) {
1072                     mb->mode = VP8_MVMODE_SPLIT;
1073                     mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout, IS_VP8) - 1];
1074                 } else {
1075                     mb->mv.y  += read_mv_component(c, s->prob->mvc[0], IS_VP8);
1076                     mb->mv.x  += read_mv_component(c, s->prob->mvc[1], IS_VP8);
1077                     mb->bmv[0] = mb->mv;
1078                 }
1079             } else {
1080                 clamp_mv(s, &mb->mv, &near_mv[CNT_NEAR]);
1081                 mb->bmv[0] = mb->mv;
1082             }
1083         } else {
1084             clamp_mv(s, &mb->mv, &near_mv[CNT_NEAREST]);
1085             mb->bmv[0] = mb->mv;
1086         }
1087     } else {
1088         mb->mode = VP8_MVMODE_ZERO;
1089         AV_ZERO32(&mb->mv);
1090         mb->bmv[0] = mb->mv;
1091     }
1092 }
1093
1094 static av_always_inline
1095 void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
1096                            int mb_x, int keyframe, int layout)
1097 {
1098     uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
1099
1100     if (layout == 1) {
1101         VP8Macroblock *mb_top = mb - s->mb_width - 1;
1102         memcpy(mb->intra4x4_pred_mode_top, mb_top->intra4x4_pred_mode_top, 4);
1103     }
1104     if (keyframe) {
1105         int x, y;
1106         uint8_t *top;
1107         uint8_t *const left = s->intra4x4_pred_mode_left;
1108         if (layout == 1)
1109             top = mb->intra4x4_pred_mode_top;
1110         else
1111             top = s->intra4x4_pred_mode_top + 4 * mb_x;
1112         for (y = 0; y < 4; y++) {
1113             for (x = 0; x < 4; x++) {
1114                 const uint8_t *ctx;
1115                 ctx       = vp8_pred4x4_prob_intra[top[x]][left[y]];
1116                 *intra4x4 = vp8_rac_get_tree(c, vp8_pred4x4_tree, ctx);
1117                 left[y]   = top[x] = *intra4x4;
1118                 intra4x4++;
1119             }
1120         }
1121     } else {
1122         int i;
1123         for (i = 0; i < 16; i++)
1124             intra4x4[i] = vp8_rac_get_tree(c, vp8_pred4x4_tree,
1125                                            vp8_pred4x4_prob_inter);
1126     }
1127 }
1128
1129 static av_always_inline
1130 void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
1131                     uint8_t *segment, uint8_t *ref, int layout, int is_vp7)
1132 {
1133     VP56RangeCoder *c = &s->c;
1134     static const char *vp7_feature_name[] = { "q-index",
1135                                               "lf-delta",
1136                                               "partial-golden-update",
1137                                               "blit-pitch" };
1138     if (is_vp7) {
1139         int i;
1140         *segment = 0;
1141         for (i = 0; i < 4; i++) {
1142             if (s->feature_enabled[i]) {
1143                 if (vp56_rac_get_prob(c, s->feature_present_prob[i])) {
1144                       int index = vp8_rac_get_tree(c, vp7_feature_index_tree,
1145                                                    s->feature_index_prob[i]);
1146                       av_log(s->avctx, AV_LOG_WARNING,
1147                              "Feature %s present in macroblock (value 0x%x)\n",
1148                              vp7_feature_name[i], s->feature_value[i][index]);
1149                 }
1150            }
1151         }
1152     } else if (s->segmentation.update_map)
1153         *segment = vp8_rac_get_tree(c, vp8_segmentid_tree, s->prob->segmentid);
1154     else if (s->segmentation.enabled)
1155         *segment = ref ? *ref : *segment;
1156     mb->segment = *segment;
1157
1158     mb->skip = s->mbskip_enabled ? vp56_rac_get_prob(c, s->prob->mbskip) : 0;
1159
1160     if (s->keyframe) {
1161         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra,
1162                                     vp8_pred16x16_prob_intra);
1163
1164         if (mb->mode == MODE_I4x4) {
1165             decode_intra4x4_modes(s, c, mb, mb_x, 1, layout);
1166         } else {
1167             const uint32_t modes = (is_vp7 ? vp7_pred4x4_mode
1168                                            : vp8_pred4x4_mode)[mb->mode] * 0x01010101u;
1169             if (s->mb_layout == 1)
1170                 AV_WN32A(mb->intra4x4_pred_mode_top, modes);
1171             else
1172                 AV_WN32A(s->intra4x4_pred_mode_top + 4 * mb_x, modes);
1173             AV_WN32A(s->intra4x4_pred_mode_left, modes);
1174         }
1175
1176         mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree,
1177                                                 vp8_pred8x8c_prob_intra);
1178         mb->ref_frame        = VP56_FRAME_CURRENT;
1179     } else if (vp56_rac_get_prob_branchy(c, s->prob->intra)) {
1180         // inter MB, 16.2
1181         if (vp56_rac_get_prob_branchy(c, s->prob->last))
1182             mb->ref_frame =
1183                 (!is_vp7 && vp56_rac_get_prob(c, s->prob->golden)) ? VP56_FRAME_GOLDEN2 /* altref */
1184                                                                    : VP56_FRAME_GOLDEN;
1185         else
1186             mb->ref_frame = VP56_FRAME_PREVIOUS;
1187         s->ref_count[mb->ref_frame - 1]++;
1188
1189         // motion vectors, 16.3
1190         if (is_vp7)
1191             vp7_decode_mvs(s, mb, mb_x, mb_y, layout);
1192         else
1193             vp8_decode_mvs(s, mb, mb_x, mb_y, layout);
1194     } else {
1195         // intra MB, 16.1
1196         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, s->prob->pred16x16);
1197
1198         if (mb->mode == MODE_I4x4)
1199             decode_intra4x4_modes(s, c, mb, mb_x, 0, layout);
1200
1201         mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree,
1202                                                 s->prob->pred8x8c);
1203         mb->ref_frame        = VP56_FRAME_CURRENT;
1204         mb->partitioning     = VP8_SPLITMVMODE_NONE;
1205         AV_ZERO32(&mb->bmv[0]);
1206     }
1207 }
1208
1209 /**
1210  * @param r     arithmetic bitstream reader context
1211  * @param block destination for block coefficients
1212  * @param probs probabilities to use when reading trees from the bitstream
1213  * @param i     initial coeff index, 0 unless a separate DC block is coded
1214  * @param qmul  array holding the dc/ac dequant factor at position 0/1
1215  *
1216  * @return 0 if no coeffs were decoded
1217  *         otherwise, the index of the last coeff decoded plus one
1218  */
1219 static av_always_inline
1220 int decode_block_coeffs_internal(VP56RangeCoder *r, int16_t block[16],
1221                                  uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1222                                  int i, uint8_t *token_prob, int16_t qmul[2],
1223                                  const uint8_t scan[16], int vp7)
1224 {
1225     VP56RangeCoder c = *r;
1226     goto skip_eob;
1227     do {
1228         int coeff;
1229 restart:
1230         if (!vp56_rac_get_prob_branchy(&c, token_prob[0]))   // DCT_EOB
1231             break;
1232
1233 skip_eob:
1234         if (!vp56_rac_get_prob_branchy(&c, token_prob[1])) { // DCT_0
1235             if (++i == 16)
1236                 break; // invalid input; blocks should end with EOB
1237             token_prob = probs[i][0];
1238             if (vp7)
1239                 goto restart;
1240             goto skip_eob;
1241         }
1242
1243         if (!vp56_rac_get_prob_branchy(&c, token_prob[2])) { // DCT_1
1244             coeff = 1;
1245             token_prob = probs[i + 1][1];
1246         } else {
1247             if (!vp56_rac_get_prob_branchy(&c, token_prob[3])) { // DCT 2,3,4
1248                 coeff = vp56_rac_get_prob_branchy(&c, token_prob[4]);
1249                 if (coeff)
1250                     coeff += vp56_rac_get_prob(&c, token_prob[5]);
1251                 coeff += 2;
1252             } else {
1253                 // DCT_CAT*
1254                 if (!vp56_rac_get_prob_branchy(&c, token_prob[6])) {
1255                     if (!vp56_rac_get_prob_branchy(&c, token_prob[7])) { // DCT_CAT1
1256                         coeff = 5 + vp56_rac_get_prob(&c, vp8_dct_cat1_prob[0]);
1257                     } else {                                    // DCT_CAT2
1258                         coeff  = 7;
1259                         coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[0]) << 1;
1260                         coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[1]);
1261                     }
1262                 } else {    // DCT_CAT3 and up
1263                     int a   = vp56_rac_get_prob(&c, token_prob[8]);
1264                     int b   = vp56_rac_get_prob(&c, token_prob[9 + a]);
1265                     int cat = (a << 1) + b;
1266                     coeff  = 3 + (8 << cat);
1267                     coeff += vp8_rac_get_coeff(&c, ff_vp8_dct_cat_prob[cat]);
1268                 }
1269             }
1270             token_prob = probs[i + 1][2];
1271         }
1272         block[scan[i]] = (vp8_rac_get(&c) ? -coeff : coeff) * qmul[!!i];
1273     } while (++i < 16);
1274
1275     *r = c;
1276     return i;
1277 }
1278
1279 static av_always_inline
1280 int inter_predict_dc(int16_t block[16], int16_t pred[2])
1281 {
1282     int16_t dc = block[0];
1283     int ret = 0;
1284
1285     if (pred[1] > 3) {
1286         dc += pred[0];
1287         ret = 1;
1288     }
1289
1290     if (!pred[0] | !dc | ((int32_t)pred[0] ^ (int32_t)dc) >> 31) {
1291         block[0] = pred[0] = dc;
1292         pred[1] = 0;
1293     } else {
1294         if (pred[0] == dc)
1295             pred[1]++;
1296         block[0] = pred[0] = dc;
1297     }
1298
1299     return ret;
1300 }
1301
1302 static int vp7_decode_block_coeffs_internal(VP56RangeCoder *r,
1303                                             int16_t block[16],
1304                                             uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1305                                             int i, uint8_t *token_prob,
1306                                             int16_t qmul[2],
1307                                             const uint8_t scan[16])
1308 {
1309     return decode_block_coeffs_internal(r, block, probs, i,
1310                                         token_prob, qmul, scan, IS_VP7);
1311 }
1312
1313 #ifndef vp8_decode_block_coeffs_internal
1314 static int vp8_decode_block_coeffs_internal(VP56RangeCoder *r,
1315                                             int16_t block[16],
1316                                             uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1317                                             int i, uint8_t *token_prob,
1318                                             int16_t qmul[2])
1319 {
1320     return decode_block_coeffs_internal(r, block, probs, i,
1321                                         token_prob, qmul, ff_zigzag_scan, IS_VP8);
1322 }
1323 #endif
1324
1325 /**
1326  * @param c          arithmetic bitstream reader context
1327  * @param block      destination for block coefficients
1328  * @param probs      probabilities to use when reading trees from the bitstream
1329  * @param i          initial coeff index, 0 unless a separate DC block is coded
1330  * @param zero_nhood the initial prediction context for number of surrounding
1331  *                   all-zero blocks (only left/top, so 0-2)
1332  * @param qmul       array holding the dc/ac dequant factor at position 0/1
1333  *
1334  * @return 0 if no coeffs were decoded
1335  *         otherwise, the index of the last coeff decoded plus one
1336  */
1337 static av_always_inline
1338 int decode_block_coeffs(VP56RangeCoder *c, int16_t block[16],
1339                         uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1340                         int i, int zero_nhood, int16_t qmul[2],
1341                         const uint8_t scan[16], int vp7)
1342 {
1343     uint8_t *token_prob = probs[i][zero_nhood];
1344     if (!vp56_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
1345         return 0;
1346     return vp7 ? vp7_decode_block_coeffs_internal(c, block, probs, i,
1347                                                   token_prob, qmul, scan)
1348                : vp8_decode_block_coeffs_internal(c, block, probs, i,
1349                                                   token_prob, qmul);
1350 }
1351
1352 static av_always_inline
1353 void decode_mb_coeffs(VP8Context *s, VP8ThreadData *td, VP56RangeCoder *c,
1354                       VP8Macroblock *mb, uint8_t t_nnz[9], uint8_t l_nnz[9],
1355                       int is_vp7)
1356 {
1357     int i, x, y, luma_start = 0, luma_ctx = 3;
1358     int nnz_pred, nnz, nnz_total = 0;
1359     int segment = mb->segment;
1360     int block_dc = 0;
1361
1362     if (mb->mode != MODE_I4x4 && (is_vp7 || mb->mode != VP8_MVMODE_SPLIT)) {
1363         nnz_pred = t_nnz[8] + l_nnz[8];
1364
1365         // decode DC values and do hadamard
1366         nnz = decode_block_coeffs(c, td->block_dc, s->prob->token[1], 0,
1367                                   nnz_pred, s->qmat[segment].luma_dc_qmul,
1368                                   ff_zigzag_scan, is_vp7);
1369         l_nnz[8] = t_nnz[8] = !!nnz;
1370
1371         if (is_vp7 && mb->mode > MODE_I4x4) {
1372             nnz |=  inter_predict_dc(td->block_dc,
1373                                      s->inter_dc_pred[mb->ref_frame - 1]);
1374         }
1375
1376         if (nnz) {
1377             nnz_total += nnz;
1378             block_dc   = 1;
1379             if (nnz == 1)
1380                 s->vp8dsp.vp8_luma_dc_wht_dc(td->block, td->block_dc);
1381             else
1382                 s->vp8dsp.vp8_luma_dc_wht(td->block, td->block_dc);
1383         }
1384         luma_start = 1;
1385         luma_ctx   = 0;
1386     }
1387
1388     // luma blocks
1389     for (y = 0; y < 4; y++)
1390         for (x = 0; x < 4; x++) {
1391             nnz_pred = l_nnz[y] + t_nnz[x];
1392             nnz = decode_block_coeffs(c, td->block[y][x],
1393                                       s->prob->token[luma_ctx],
1394                                       luma_start, nnz_pred,
1395                                       s->qmat[segment].luma_qmul,
1396                                       s->prob[0].scan, is_vp7);
1397             /* nnz+block_dc may be one more than the actual last index,
1398              * but we don't care */
1399             td->non_zero_count_cache[y][x] = nnz + block_dc;
1400             t_nnz[x] = l_nnz[y] = !!nnz;
1401             nnz_total += nnz;
1402         }
1403
1404     // chroma blocks
1405     // TODO: what to do about dimensions? 2nd dim for luma is x,
1406     // but for chroma it's (y<<1)|x
1407     for (i = 4; i < 6; i++)
1408         for (y = 0; y < 2; y++)
1409             for (x = 0; x < 2; x++) {
1410                 nnz_pred = l_nnz[i + 2 * y] + t_nnz[i + 2 * x];
1411                 nnz = decode_block_coeffs(c, td->block[i][(y << 1) + x],
1412                                           s->prob->token[2], 0, nnz_pred,
1413                                           s->qmat[segment].chroma_qmul,
1414                                           s->prob[0].scan, is_vp7);
1415                 td->non_zero_count_cache[i][(y << 1) + x] = nnz;
1416                 t_nnz[i + 2 * x] = l_nnz[i + 2 * y] = !!nnz;
1417                 nnz_total += nnz;
1418             }
1419
1420     // if there were no coded coeffs despite the macroblock not being marked skip,
1421     // we MUST not do the inner loop filter and should not do IDCT
1422     // Since skip isn't used for bitstream prediction, just manually set it.
1423     if (!nnz_total)
1424         mb->skip = 1;
1425 }
1426
1427 static av_always_inline
1428 void backup_mb_border(uint8_t *top_border, uint8_t *src_y,
1429                       uint8_t *src_cb, uint8_t *src_cr,
1430                       ptrdiff_t linesize, ptrdiff_t uvlinesize, int simple)
1431 {
1432     AV_COPY128(top_border, src_y + 15 * linesize);
1433     if (!simple) {
1434         AV_COPY64(top_border + 16, src_cb + 7 * uvlinesize);
1435         AV_COPY64(top_border + 24, src_cr + 7 * uvlinesize);
1436     }
1437 }
1438
1439 static av_always_inline
1440 void xchg_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb,
1441                     uint8_t *src_cr, ptrdiff_t linesize, ptrdiff_t uvlinesize, int mb_x,
1442                     int mb_y, int mb_width, int simple, int xchg)
1443 {
1444     uint8_t *top_border_m1 = top_border - 32;     // for TL prediction
1445     src_y  -= linesize;
1446     src_cb -= uvlinesize;
1447     src_cr -= uvlinesize;
1448
1449 #define XCHG(a, b, xchg)                                                      \
1450     do {                                                                      \
1451         if (xchg)                                                             \
1452             AV_SWAP64(b, a);                                                  \
1453         else                                                                  \
1454             AV_COPY64(b, a);                                                  \
1455     } while (0)
1456
1457     XCHG(top_border_m1 + 8, src_y - 8, xchg);
1458     XCHG(top_border, src_y, xchg);
1459     XCHG(top_border + 8, src_y + 8, 1);
1460     if (mb_x < mb_width - 1)
1461         XCHG(top_border + 32, src_y + 16, 1);
1462
1463     // only copy chroma for normal loop filter
1464     // or to initialize the top row to 127
1465     if (!simple || !mb_y) {
1466         XCHG(top_border_m1 + 16, src_cb - 8, xchg);
1467         XCHG(top_border_m1 + 24, src_cr - 8, xchg);
1468         XCHG(top_border + 16, src_cb, 1);
1469         XCHG(top_border + 24, src_cr, 1);
1470     }
1471 }
1472
1473 static av_always_inline
1474 int check_dc_pred8x8_mode(int mode, int mb_x, int mb_y)
1475 {
1476     if (!mb_x)
1477         return mb_y ? TOP_DC_PRED8x8 : DC_128_PRED8x8;
1478     else
1479         return mb_y ? mode : LEFT_DC_PRED8x8;
1480 }
1481
1482 static av_always_inline
1483 int check_tm_pred8x8_mode(int mode, int mb_x, int mb_y, int vp7)
1484 {
1485     if (!mb_x)
1486         return mb_y ? VERT_PRED8x8 : (vp7 ? DC_128_PRED8x8 : DC_129_PRED8x8);
1487     else
1488         return mb_y ? mode : HOR_PRED8x8;
1489 }
1490
1491 static av_always_inline
1492 int check_intra_pred8x8_mode_emuedge(int mode, int mb_x, int mb_y, int vp7)
1493 {
1494     switch (mode) {
1495     case DC_PRED8x8:
1496         return check_dc_pred8x8_mode(mode, mb_x, mb_y);
1497     case VERT_PRED8x8:
1498         return !mb_y ? (vp7 ? DC_128_PRED8x8 : DC_127_PRED8x8) : mode;
1499     case HOR_PRED8x8:
1500         return !mb_x ? (vp7 ? DC_128_PRED8x8 : DC_129_PRED8x8) : mode;
1501     case PLANE_PRED8x8: /* TM */
1502         return check_tm_pred8x8_mode(mode, mb_x, mb_y, vp7);
1503     }
1504     return mode;
1505 }
1506
1507 static av_always_inline
1508 int check_tm_pred4x4_mode(int mode, int mb_x, int mb_y, int vp7)
1509 {
1510     if (!mb_x) {
1511         return mb_y ? VERT_VP8_PRED : (vp7 ? DC_128_PRED : DC_129_PRED);
1512     } else {
1513         return mb_y ? mode : HOR_VP8_PRED;
1514     }
1515 }
1516
1517 static av_always_inline
1518 int check_intra_pred4x4_mode_emuedge(int mode, int mb_x, int mb_y,
1519                                      int *copy_buf, int vp7)
1520 {
1521     switch (mode) {
1522     case VERT_PRED:
1523         if (!mb_x && mb_y) {
1524             *copy_buf = 1;
1525             return mode;
1526         }
1527         /* fall-through */
1528     case DIAG_DOWN_LEFT_PRED:
1529     case VERT_LEFT_PRED:
1530         return !mb_y ? (vp7 ? DC_128_PRED : DC_127_PRED) : mode;
1531     case HOR_PRED:
1532         if (!mb_y) {
1533             *copy_buf = 1;
1534             return mode;
1535         }
1536         /* fall-through */
1537     case HOR_UP_PRED:
1538         return !mb_x ? (vp7 ? DC_128_PRED : DC_129_PRED) : mode;
1539     case TM_VP8_PRED:
1540         return check_tm_pred4x4_mode(mode, mb_x, mb_y, vp7);
1541     case DC_PRED: /* 4x4 DC doesn't use the same "H.264-style" exceptions
1542                    * as 16x16/8x8 DC */
1543     case DIAG_DOWN_RIGHT_PRED:
1544     case VERT_RIGHT_PRED:
1545     case HOR_DOWN_PRED:
1546         if (!mb_y || !mb_x)
1547             *copy_buf = 1;
1548         return mode;
1549     }
1550     return mode;
1551 }
1552
1553 static av_always_inline
1554 void intra_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1555                    VP8Macroblock *mb, int mb_x, int mb_y, int is_vp7)
1556 {
1557     int x, y, mode, nnz;
1558     uint32_t tr;
1559
1560     /* for the first row, we need to run xchg_mb_border to init the top edge
1561      * to 127 otherwise, skip it if we aren't going to deblock */
1562     if (mb_y && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
1563         xchg_mb_border(s->top_border[mb_x + 1], dst[0], dst[1], dst[2],
1564                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1565                        s->filter.simple, 1);
1566
1567     if (mb->mode < MODE_I4x4) {
1568         mode = check_intra_pred8x8_mode_emuedge(mb->mode, mb_x, mb_y, is_vp7);
1569         s->hpc.pred16x16[mode](dst[0], s->linesize);
1570     } else {
1571         uint8_t *ptr = dst[0];
1572         uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
1573         const uint8_t lo = is_vp7 ? 128 : 127;
1574         const uint8_t hi = is_vp7 ? 128 : 129;
1575         uint8_t tr_top[4] = { lo, lo, lo, lo };
1576
1577         // all blocks on the right edge of the macroblock use bottom edge
1578         // the top macroblock for their topright edge
1579         uint8_t *tr_right = ptr - s->linesize + 16;
1580
1581         // if we're on the right edge of the frame, said edge is extended
1582         // from the top macroblock
1583         if (mb_y && mb_x == s->mb_width - 1) {
1584             tr       = tr_right[-1] * 0x01010101u;
1585             tr_right = (uint8_t *) &tr;
1586         }
1587
1588         if (mb->skip)
1589             AV_ZERO128(td->non_zero_count_cache);
1590
1591         for (y = 0; y < 4; y++) {
1592             uint8_t *topright = ptr + 4 - s->linesize;
1593             for (x = 0; x < 4; x++) {
1594                 int copy = 0;
1595                 ptrdiff_t linesize = s->linesize;
1596                 uint8_t *dst = ptr + 4 * x;
1597                 DECLARE_ALIGNED(4, uint8_t, copy_dst)[5 * 8];
1598
1599                 if ((y == 0 || x == 3) && mb_y == 0) {
1600                     topright = tr_top;
1601                 } else if (x == 3)
1602                     topright = tr_right;
1603
1604                 mode = check_intra_pred4x4_mode_emuedge(intra4x4[x], mb_x + x,
1605                                                         mb_y + y, &copy, is_vp7);
1606                 if (copy) {
1607                     dst      = copy_dst + 12;
1608                     linesize = 8;
1609                     if (!(mb_y + y)) {
1610                         copy_dst[3] = lo;
1611                         AV_WN32A(copy_dst + 4, lo * 0x01010101U);
1612                     } else {
1613                         AV_COPY32(copy_dst + 4, ptr + 4 * x - s->linesize);
1614                         if (!(mb_x + x)) {
1615                             copy_dst[3] = hi;
1616                         } else {
1617                             copy_dst[3] = ptr[4 * x - s->linesize - 1];
1618                         }
1619                     }
1620                     if (!(mb_x + x)) {
1621                         copy_dst[11] =
1622                         copy_dst[19] =
1623                         copy_dst[27] =
1624                         copy_dst[35] = hi;
1625                     } else {
1626                         copy_dst[11] = ptr[4 * x                   - 1];
1627                         copy_dst[19] = ptr[4 * x + s->linesize     - 1];
1628                         copy_dst[27] = ptr[4 * x + s->linesize * 2 - 1];
1629                         copy_dst[35] = ptr[4 * x + s->linesize * 3 - 1];
1630                     }
1631                 }
1632                 s->hpc.pred4x4[mode](dst, topright, linesize);
1633                 if (copy) {
1634                     AV_COPY32(ptr + 4 * x,                   copy_dst + 12);
1635                     AV_COPY32(ptr + 4 * x + s->linesize,     copy_dst + 20);
1636                     AV_COPY32(ptr + 4 * x + s->linesize * 2, copy_dst + 28);
1637                     AV_COPY32(ptr + 4 * x + s->linesize * 3, copy_dst + 36);
1638                 }
1639
1640                 nnz = td->non_zero_count_cache[y][x];
1641                 if (nnz) {
1642                     if (nnz == 1)
1643                         s->vp8dsp.vp8_idct_dc_add(ptr + 4 * x,
1644                                                   td->block[y][x], s->linesize);
1645                     else
1646                         s->vp8dsp.vp8_idct_add(ptr + 4 * x,
1647                                                td->block[y][x], s->linesize);
1648                 }
1649                 topright += 4;
1650             }
1651
1652             ptr      += 4 * s->linesize;
1653             intra4x4 += 4;
1654         }
1655     }
1656
1657     mode = check_intra_pred8x8_mode_emuedge(mb->chroma_pred_mode,
1658                                             mb_x, mb_y, is_vp7);
1659     s->hpc.pred8x8[mode](dst[1], s->uvlinesize);
1660     s->hpc.pred8x8[mode](dst[2], s->uvlinesize);
1661
1662     if (mb_y && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
1663         xchg_mb_border(s->top_border[mb_x + 1], dst[0], dst[1], dst[2],
1664                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1665                        s->filter.simple, 0);
1666 }
1667
1668 static const uint8_t subpel_idx[3][8] = {
1669     { 0, 1, 2, 1, 2, 1, 2, 1 }, // nr. of left extra pixels,
1670                                 // also function pointer index
1671     { 0, 3, 5, 3, 5, 3, 5, 3 }, // nr. of extra pixels required
1672     { 0, 2, 3, 2, 3, 2, 3, 2 }, // nr. of right extra pixels
1673 };
1674
1675 /**
1676  * luma MC function
1677  *
1678  * @param s        VP8 decoding context
1679  * @param dst      target buffer for block data at block position
1680  * @param ref      reference picture buffer at origin (0, 0)
1681  * @param mv       motion vector (relative to block position) to get pixel data from
1682  * @param x_off    horizontal position of block from origin (0, 0)
1683  * @param y_off    vertical position of block from origin (0, 0)
1684  * @param block_w  width of block (16, 8 or 4)
1685  * @param block_h  height of block (always same as block_w)
1686  * @param width    width of src/dst plane data
1687  * @param height   height of src/dst plane data
1688  * @param linesize size of a single line of plane data, including padding
1689  * @param mc_func  motion compensation function pointers (bilinear or sixtap MC)
1690  */
1691 static av_always_inline
1692 void vp8_mc_luma(VP8Context *s, VP8ThreadData *td, uint8_t *dst,
1693                  ThreadFrame *ref, const VP56mv *mv,
1694                  int x_off, int y_off, int block_w, int block_h,
1695                  int width, int height, ptrdiff_t linesize,
1696                  vp8_mc_func mc_func[3][3])
1697 {
1698     uint8_t *src = ref->f->data[0];
1699
1700     if (AV_RN32A(mv)) {
1701         ptrdiff_t src_linesize = linesize;
1702
1703         int mx = (mv->x << 1) & 7, mx_idx = subpel_idx[0][mx];
1704         int my = (mv->y << 1) & 7, my_idx = subpel_idx[0][my];
1705
1706         x_off += mv->x >> 2;
1707         y_off += mv->y >> 2;
1708
1709         // edge emulation
1710         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 4, 0);
1711         src += y_off * linesize + x_off;
1712         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1713             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1714             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1715                                      src - my_idx * linesize - mx_idx,
1716                                      EDGE_EMU_LINESIZE, linesize,
1717                                      block_w + subpel_idx[1][mx],
1718                                      block_h + subpel_idx[1][my],
1719                                      x_off - mx_idx, y_off - my_idx,
1720                                      width, height);
1721             src = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1722             src_linesize = EDGE_EMU_LINESIZE;
1723         }
1724         mc_func[my_idx][mx_idx](dst, linesize, src, src_linesize, block_h, mx, my);
1725     } else {
1726         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 4, 0);
1727         mc_func[0][0](dst, linesize, src + y_off * linesize + x_off,
1728                       linesize, block_h, 0, 0);
1729     }
1730 }
1731
1732 /**
1733  * chroma MC function
1734  *
1735  * @param s        VP8 decoding context
1736  * @param dst1     target buffer for block data at block position (U plane)
1737  * @param dst2     target buffer for block data at block position (V plane)
1738  * @param ref      reference picture buffer at origin (0, 0)
1739  * @param mv       motion vector (relative to block position) to get pixel data from
1740  * @param x_off    horizontal position of block from origin (0, 0)
1741  * @param y_off    vertical position of block from origin (0, 0)
1742  * @param block_w  width of block (16, 8 or 4)
1743  * @param block_h  height of block (always same as block_w)
1744  * @param width    width of src/dst plane data
1745  * @param height   height of src/dst plane data
1746  * @param linesize size of a single line of plane data, including padding
1747  * @param mc_func  motion compensation function pointers (bilinear or sixtap MC)
1748  */
1749 static av_always_inline
1750 void vp8_mc_chroma(VP8Context *s, VP8ThreadData *td, uint8_t *dst1,
1751                    uint8_t *dst2, ThreadFrame *ref, const VP56mv *mv,
1752                    int x_off, int y_off, int block_w, int block_h,
1753                    int width, int height, ptrdiff_t linesize,
1754                    vp8_mc_func mc_func[3][3])
1755 {
1756     uint8_t *src1 = ref->f->data[1], *src2 = ref->f->data[2];
1757
1758     if (AV_RN32A(mv)) {
1759         int mx = mv->x & 7, mx_idx = subpel_idx[0][mx];
1760         int my = mv->y & 7, my_idx = subpel_idx[0][my];
1761
1762         x_off += mv->x >> 3;
1763         y_off += mv->y >> 3;
1764
1765         // edge emulation
1766         src1 += y_off * linesize + x_off;
1767         src2 += y_off * linesize + x_off;
1768         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 3, 0);
1769         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1770             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1771             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1772                                      src1 - my_idx * linesize - mx_idx,
1773                                      EDGE_EMU_LINESIZE, linesize,
1774                                      block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1775                                      x_off - mx_idx, y_off - my_idx, width, height);
1776             src1 = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1777             mc_func[my_idx][mx_idx](dst1, linesize, src1, EDGE_EMU_LINESIZE, block_h, mx, my);
1778
1779             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1780                                      src2 - my_idx * linesize - mx_idx,
1781                                      EDGE_EMU_LINESIZE, linesize,
1782                                      block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1783                                      x_off - mx_idx, y_off - my_idx, width, height);
1784             src2 = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1785             mc_func[my_idx][mx_idx](dst2, linesize, src2, EDGE_EMU_LINESIZE, block_h, mx, my);
1786         } else {
1787             mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
1788             mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
1789         }
1790     } else {
1791         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 3, 0);
1792         mc_func[0][0](dst1, linesize, src1 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1793         mc_func[0][0](dst2, linesize, src2 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1794     }
1795 }
1796
1797 static av_always_inline
1798 void vp8_mc_part(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1799                  ThreadFrame *ref_frame, int x_off, int y_off,
1800                  int bx_off, int by_off, int block_w, int block_h,
1801                  int width, int height, VP56mv *mv)
1802 {
1803     VP56mv uvmv = *mv;
1804
1805     /* Y */
1806     vp8_mc_luma(s, td, dst[0] + by_off * s->linesize + bx_off,
1807                 ref_frame, mv, x_off + bx_off, y_off + by_off,
1808                 block_w, block_h, width, height, s->linesize,
1809                 s->put_pixels_tab[block_w == 8]);
1810
1811     /* U/V */
1812     if (s->profile == 3) {
1813         /* this block only applies VP8; it is safe to check
1814          * only the profile, as VP7 profile <= 1 */
1815         uvmv.x &= ~7;
1816         uvmv.y &= ~7;
1817     }
1818     x_off   >>= 1;
1819     y_off   >>= 1;
1820     bx_off  >>= 1;
1821     by_off  >>= 1;
1822     width   >>= 1;
1823     height  >>= 1;
1824     block_w >>= 1;
1825     block_h >>= 1;
1826     vp8_mc_chroma(s, td, dst[1] + by_off * s->uvlinesize + bx_off,
1827                   dst[2] + by_off * s->uvlinesize + bx_off, ref_frame,
1828                   &uvmv, x_off + bx_off, y_off + by_off,
1829                   block_w, block_h, width, height, s->uvlinesize,
1830                   s->put_pixels_tab[1 + (block_w == 4)]);
1831 }
1832
1833 /* Fetch pixels for estimated mv 4 macroblocks ahead.
1834  * Optimized for 64-byte cache lines. Inspired by ffh264 prefetch_motion. */
1835 static av_always_inline
1836 void prefetch_motion(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
1837                      int mb_xy, int ref)
1838 {
1839     /* Don't prefetch refs that haven't been used very often this frame. */
1840     if (s->ref_count[ref - 1] > (mb_xy >> 5)) {
1841         int x_off = mb_x << 4, y_off = mb_y << 4;
1842         int mx = (mb->mv.x >> 2) + x_off + 8;
1843         int my = (mb->mv.y >> 2) + y_off;
1844         uint8_t **src = s->framep[ref]->tf.f->data;
1845         int off = mx + (my + (mb_x & 3) * 4) * s->linesize + 64;
1846         /* For threading, a ff_thread_await_progress here might be useful, but
1847          * it actually slows down the decoder. Since a bad prefetch doesn't
1848          * generate bad decoder output, we don't run it here. */
1849         s->vdsp.prefetch(src[0] + off, s->linesize, 4);
1850         off = (mx >> 1) + ((my >> 1) + (mb_x & 7)) * s->uvlinesize + 64;
1851         s->vdsp.prefetch(src[1] + off, src[2] - src[1], 2);
1852     }
1853 }
1854
1855 /**
1856  * Apply motion vectors to prediction buffer, chapter 18.
1857  */
1858 static av_always_inline
1859 void inter_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1860                    VP8Macroblock *mb, int mb_x, int mb_y)
1861 {
1862     int x_off = mb_x << 4, y_off = mb_y << 4;
1863     int width = 16 * s->mb_width, height = 16 * s->mb_height;
1864     ThreadFrame *ref = &s->framep[mb->ref_frame]->tf;
1865     VP56mv *bmv = mb->bmv;
1866
1867     switch (mb->partitioning) {
1868     case VP8_SPLITMVMODE_NONE:
1869         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1870                     0, 0, 16, 16, width, height, &mb->mv);
1871         break;
1872     case VP8_SPLITMVMODE_4x4: {
1873         int x, y;
1874         VP56mv uvmv;
1875
1876         /* Y */
1877         for (y = 0; y < 4; y++) {
1878             for (x = 0; x < 4; x++) {
1879                 vp8_mc_luma(s, td, dst[0] + 4 * y * s->linesize + x * 4,
1880                             ref, &bmv[4 * y + x],
1881                             4 * x + x_off, 4 * y + y_off, 4, 4,
1882                             width, height, s->linesize,
1883                             s->put_pixels_tab[2]);
1884             }
1885         }
1886
1887         /* U/V */
1888         x_off  >>= 1;
1889         y_off  >>= 1;
1890         width  >>= 1;
1891         height >>= 1;
1892         for (y = 0; y < 2; y++) {
1893             for (x = 0; x < 2; x++) {
1894                 uvmv.x = mb->bmv[2 * y       * 4 + 2 * x    ].x +
1895                          mb->bmv[2 * y       * 4 + 2 * x + 1].x +
1896                          mb->bmv[(2 * y + 1) * 4 + 2 * x    ].x +
1897                          mb->bmv[(2 * y + 1) * 4 + 2 * x + 1].x;
1898                 uvmv.y = mb->bmv[2 * y       * 4 + 2 * x    ].y +
1899                          mb->bmv[2 * y       * 4 + 2 * x + 1].y +
1900                          mb->bmv[(2 * y + 1) * 4 + 2 * x    ].y +
1901                          mb->bmv[(2 * y + 1) * 4 + 2 * x + 1].y;
1902                 uvmv.x = (uvmv.x + 2 + FF_SIGNBIT(uvmv.x)) >> 2;
1903                 uvmv.y = (uvmv.y + 2 + FF_SIGNBIT(uvmv.y)) >> 2;
1904                 if (s->profile == 3) {
1905                     uvmv.x &= ~7;
1906                     uvmv.y &= ~7;
1907                 }
1908                 vp8_mc_chroma(s, td, dst[1] + 4 * y * s->uvlinesize + x * 4,
1909                               dst[2] + 4 * y * s->uvlinesize + x * 4, ref,
1910                               &uvmv, 4 * x + x_off, 4 * y + y_off, 4, 4,
1911                               width, height, s->uvlinesize,
1912                               s->put_pixels_tab[2]);
1913             }
1914         }
1915         break;
1916     }
1917     case VP8_SPLITMVMODE_16x8:
1918         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1919                     0, 0, 16, 8, width, height, &bmv[0]);
1920         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1921                     0, 8, 16, 8, width, height, &bmv[1]);
1922         break;
1923     case VP8_SPLITMVMODE_8x16:
1924         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1925                     0, 0, 8, 16, width, height, &bmv[0]);
1926         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1927                     8, 0, 8, 16, width, height, &bmv[1]);
1928         break;
1929     case VP8_SPLITMVMODE_8x8:
1930         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1931                     0, 0, 8, 8, width, height, &bmv[0]);
1932         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1933                     8, 0, 8, 8, width, height, &bmv[1]);
1934         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1935                     0, 8, 8, 8, width, height, &bmv[2]);
1936         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1937                     8, 8, 8, 8, width, height, &bmv[3]);
1938         break;
1939     }
1940 }
1941
1942 static av_always_inline
1943 void idct_mb(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3], VP8Macroblock *mb)
1944 {
1945     int x, y, ch;
1946
1947     if (mb->mode != MODE_I4x4) {
1948         uint8_t *y_dst = dst[0];
1949         for (y = 0; y < 4; y++) {
1950             uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[y]);
1951             if (nnz4) {
1952                 if (nnz4 & ~0x01010101) {
1953                     for (x = 0; x < 4; x++) {
1954                         if ((uint8_t) nnz4 == 1)
1955                             s->vp8dsp.vp8_idct_dc_add(y_dst + 4 * x,
1956                                                       td->block[y][x],
1957                                                       s->linesize);
1958                         else if ((uint8_t) nnz4 > 1)
1959                             s->vp8dsp.vp8_idct_add(y_dst + 4 * x,
1960                                                    td->block[y][x],
1961                                                    s->linesize);
1962                         nnz4 >>= 8;
1963                         if (!nnz4)
1964                             break;
1965                     }
1966                 } else {
1967                     s->vp8dsp.vp8_idct_dc_add4y(y_dst, td->block[y], s->linesize);
1968                 }
1969             }
1970             y_dst += 4 * s->linesize;
1971         }
1972     }
1973
1974     for (ch = 0; ch < 2; ch++) {
1975         uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[4 + ch]);
1976         if (nnz4) {
1977             uint8_t *ch_dst = dst[1 + ch];
1978             if (nnz4 & ~0x01010101) {
1979                 for (y = 0; y < 2; y++) {
1980                     for (x = 0; x < 2; x++) {
1981                         if ((uint8_t) nnz4 == 1)
1982                             s->vp8dsp.vp8_idct_dc_add(ch_dst + 4 * x,
1983                                                       td->block[4 + ch][(y << 1) + x],
1984                                                       s->uvlinesize);
1985                         else if ((uint8_t) nnz4 > 1)
1986                             s->vp8dsp.vp8_idct_add(ch_dst + 4 * x,
1987                                                    td->block[4 + ch][(y << 1) + x],
1988                                                    s->uvlinesize);
1989                         nnz4 >>= 8;
1990                         if (!nnz4)
1991                             goto chroma_idct_end;
1992                     }
1993                     ch_dst += 4 * s->uvlinesize;
1994                 }
1995             } else {
1996                 s->vp8dsp.vp8_idct_dc_add4uv(ch_dst, td->block[4 + ch], s->uvlinesize);
1997             }
1998         }
1999 chroma_idct_end:
2000         ;
2001     }
2002 }
2003
2004 static av_always_inline
2005 void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb,
2006                          VP8FilterStrength *f, int is_vp7)
2007 {
2008     int interior_limit, filter_level;
2009
2010     if (s->segmentation.enabled) {
2011         filter_level = s->segmentation.filter_level[mb->segment];
2012         if (!s->segmentation.absolute_vals)
2013             filter_level += s->filter.level;
2014     } else
2015         filter_level = s->filter.level;
2016
2017     if (s->lf_delta.enabled) {
2018         filter_level += s->lf_delta.ref[mb->ref_frame];
2019         filter_level += s->lf_delta.mode[mb->mode];
2020     }
2021
2022     filter_level = av_clip_uintp2(filter_level, 6);
2023
2024     interior_limit = filter_level;
2025     if (s->filter.sharpness) {
2026         interior_limit >>= (s->filter.sharpness + 3) >> 2;
2027         interior_limit = FFMIN(interior_limit, 9 - s->filter.sharpness);
2028     }
2029     interior_limit = FFMAX(interior_limit, 1);
2030
2031     f->filter_level = filter_level;
2032     f->inner_limit = interior_limit;
2033     f->inner_filter = is_vp7 || !mb->skip || mb->mode == MODE_I4x4 ||
2034                       mb->mode == VP8_MVMODE_SPLIT;
2035 }
2036
2037 static av_always_inline
2038 void filter_mb(VP8Context *s, uint8_t *dst[3], VP8FilterStrength *f,
2039                int mb_x, int mb_y, int is_vp7)
2040 {
2041     int mbedge_lim, bedge_lim_y, bedge_lim_uv, hev_thresh;
2042     int filter_level = f->filter_level;
2043     int inner_limit = f->inner_limit;
2044     int inner_filter = f->inner_filter;
2045     ptrdiff_t linesize   = s->linesize;
2046     ptrdiff_t uvlinesize = s->uvlinesize;
2047     static const uint8_t hev_thresh_lut[2][64] = {
2048         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
2049           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2050           3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
2051           3, 3, 3, 3 },
2052         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
2053           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2054           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2055           2, 2, 2, 2 }
2056     };
2057
2058     if (!filter_level)
2059         return;
2060
2061     if (is_vp7) {
2062         bedge_lim_y  = filter_level;
2063         bedge_lim_uv = filter_level * 2;
2064         mbedge_lim   = filter_level + 2;
2065     } else {
2066         bedge_lim_y  =
2067         bedge_lim_uv = filter_level * 2 + inner_limit;
2068         mbedge_lim   = bedge_lim_y + 4;
2069     }
2070
2071     hev_thresh = hev_thresh_lut[s->keyframe][filter_level];
2072
2073     if (mb_x) {
2074         s->vp8dsp.vp8_h_loop_filter16y(dst[0], linesize,
2075                                        mbedge_lim, inner_limit, hev_thresh);
2076         s->vp8dsp.vp8_h_loop_filter8uv(dst[1], dst[2], uvlinesize,
2077                                        mbedge_lim, inner_limit, hev_thresh);
2078     }
2079
2080 #define H_LOOP_FILTER_16Y_INNER(cond)                                         \
2081     if (cond && inner_filter) {                                               \
2082         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] +  4, linesize,           \
2083                                              bedge_lim_y, inner_limit,        \
2084                                              hev_thresh);                     \
2085         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] +  8, linesize,           \
2086                                              bedge_lim_y, inner_limit,        \
2087                                              hev_thresh);                     \
2088         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] + 12, linesize,           \
2089                                              bedge_lim_y, inner_limit,        \
2090                                              hev_thresh);                     \
2091         s->vp8dsp.vp8_h_loop_filter8uv_inner(dst[1] +  4, dst[2] + 4,         \
2092                                              uvlinesize,  bedge_lim_uv,       \
2093                                              inner_limit, hev_thresh);        \
2094     }
2095
2096     H_LOOP_FILTER_16Y_INNER(!is_vp7)
2097
2098     if (mb_y) {
2099         s->vp8dsp.vp8_v_loop_filter16y(dst[0], linesize,
2100                                        mbedge_lim, inner_limit, hev_thresh);
2101         s->vp8dsp.vp8_v_loop_filter8uv(dst[1], dst[2], uvlinesize,
2102                                        mbedge_lim, inner_limit, hev_thresh);
2103     }
2104
2105     if (inner_filter) {
2106         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] +  4 * linesize,
2107                                              linesize, bedge_lim_y,
2108                                              inner_limit, hev_thresh);
2109         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] +  8 * linesize,
2110                                              linesize, bedge_lim_y,
2111                                              inner_limit, hev_thresh);
2112         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] + 12 * linesize,
2113                                              linesize, bedge_lim_y,
2114                                              inner_limit, hev_thresh);
2115         s->vp8dsp.vp8_v_loop_filter8uv_inner(dst[1] +  4 * uvlinesize,
2116                                              dst[2] +  4 * uvlinesize,
2117                                              uvlinesize, bedge_lim_uv,
2118                                              inner_limit, hev_thresh);
2119     }
2120
2121     H_LOOP_FILTER_16Y_INNER(is_vp7)
2122 }
2123
2124 static av_always_inline
2125 void filter_mb_simple(VP8Context *s, uint8_t *dst, VP8FilterStrength *f,
2126                       int mb_x, int mb_y)
2127 {
2128     int mbedge_lim, bedge_lim;
2129     int filter_level = f->filter_level;
2130     int inner_limit  = f->inner_limit;
2131     int inner_filter = f->inner_filter;
2132     ptrdiff_t linesize = s->linesize;
2133
2134     if (!filter_level)
2135         return;
2136
2137     bedge_lim  = 2 * filter_level + inner_limit;
2138     mbedge_lim = bedge_lim + 4;
2139
2140     if (mb_x)
2141         s->vp8dsp.vp8_h_loop_filter_simple(dst, linesize, mbedge_lim);
2142     if (inner_filter) {
2143         s->vp8dsp.vp8_h_loop_filter_simple(dst +  4, linesize, bedge_lim);
2144         s->vp8dsp.vp8_h_loop_filter_simple(dst +  8, linesize, bedge_lim);
2145         s->vp8dsp.vp8_h_loop_filter_simple(dst + 12, linesize, bedge_lim);
2146     }
2147
2148     if (mb_y)
2149         s->vp8dsp.vp8_v_loop_filter_simple(dst, linesize, mbedge_lim);
2150     if (inner_filter) {
2151         s->vp8dsp.vp8_v_loop_filter_simple(dst +  4 * linesize, linesize, bedge_lim);
2152         s->vp8dsp.vp8_v_loop_filter_simple(dst +  8 * linesize, linesize, bedge_lim);
2153         s->vp8dsp.vp8_v_loop_filter_simple(dst + 12 * linesize, linesize, bedge_lim);
2154     }
2155 }
2156
2157 #define MARGIN (16 << 2)
2158 static av_always_inline
2159 void vp78_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *curframe,
2160                                     VP8Frame *prev_frame, int is_vp7)
2161 {
2162     VP8Context *s = avctx->priv_data;
2163     int mb_x, mb_y;
2164
2165     s->mv_min.y = -MARGIN;
2166     s->mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;
2167     for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
2168         VP8Macroblock *mb = s->macroblocks_base +
2169                             ((s->mb_width + 1) * (mb_y + 1) + 1);
2170         int mb_xy = mb_y * s->mb_width;
2171
2172         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED * 0x01010101);
2173
2174         s->mv_min.x = -MARGIN;
2175         s->mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;
2176         for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
2177             if (mb_y == 0)
2178                 AV_WN32A((mb - s->mb_width - 1)->intra4x4_pred_mode_top,
2179                          DC_PRED * 0x01010101);
2180             decode_mb_mode(s, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
2181                            prev_frame && prev_frame->seg_map ?
2182                            prev_frame->seg_map->data + mb_xy : NULL, 1, is_vp7);
2183             s->mv_min.x -= 64;
2184             s->mv_max.x -= 64;
2185         }
2186         s->mv_min.y -= 64;
2187         s->mv_max.y -= 64;
2188     }
2189 }
2190
2191 static void vp7_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame,
2192                                    VP8Frame *prev_frame)
2193 {
2194     vp78_decode_mv_mb_modes(avctx, cur_frame, prev_frame, IS_VP7);
2195 }
2196
2197 static void vp8_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame,
2198                                    VP8Frame *prev_frame)
2199 {
2200     vp78_decode_mv_mb_modes(avctx, cur_frame, prev_frame, IS_VP8);
2201 }
2202
2203 #if HAVE_THREADS
2204 #define check_thread_pos(td, otd, mb_x_check, mb_y_check)                     \
2205     do {                                                                      \
2206         int tmp = (mb_y_check << 16) | (mb_x_check & 0xFFFF);                 \
2207         if (otd->thread_mb_pos < tmp) {                                       \
2208             pthread_mutex_lock(&otd->lock);                                   \
2209             td->wait_mb_pos = tmp;                                            \
2210             do {                                                              \
2211                 if (otd->thread_mb_pos >= tmp)                                \
2212                     break;                                                    \
2213                 pthread_cond_wait(&otd->cond, &otd->lock);                    \
2214             } while (1);                                                      \
2215             td->wait_mb_pos = INT_MAX;                                        \
2216             pthread_mutex_unlock(&otd->lock);                                 \
2217         }                                                                     \
2218     } while (0);
2219
2220 #define update_pos(td, mb_y, mb_x)                                            \
2221     do {                                                                      \
2222         int pos              = (mb_y << 16) | (mb_x & 0xFFFF);                \
2223         int sliced_threading = (avctx->active_thread_type == FF_THREAD_SLICE) && \
2224                                (num_jobs > 1);                                \
2225         int is_null          = !next_td || !prev_td;                          \
2226         int pos_check        = (is_null) ? 1                                  \
2227                                          : (next_td != td &&                  \
2228                                             pos >= next_td->wait_mb_pos) ||   \
2229                                            (prev_td != td &&                  \
2230                                             pos >= prev_td->wait_mb_pos);     \
2231         td->thread_mb_pos = pos;                                              \
2232         if (sliced_threading && pos_check) {                                  \
2233             pthread_mutex_lock(&td->lock);                                    \
2234             pthread_cond_broadcast(&td->cond);                                \
2235             pthread_mutex_unlock(&td->lock);                                  \
2236         }                                                                     \
2237     } while (0);
2238 #else
2239 #define check_thread_pos(td, otd, mb_x_check, mb_y_check)
2240 #define update_pos(td, mb_y, mb_x)
2241 #endif
2242
2243 static void vp8_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
2244                                         int jobnr, int threadnr, int is_vp7)
2245 {
2246     VP8Context *s = avctx->priv_data;
2247     VP8ThreadData *prev_td, *next_td, *td = &s->thread_data[threadnr];
2248     int mb_y = td->thread_mb_pos >> 16;
2249     int mb_x, mb_xy = mb_y * s->mb_width;
2250     int num_jobs = s->num_jobs;
2251     VP8Frame *curframe = s->curframe, *prev_frame = s->prev_frame;
2252     VP56RangeCoder *c  = &s->coeff_partition[mb_y & (s->num_coeff_partitions - 1)];
2253     VP8Macroblock *mb;
2254     uint8_t *dst[3] = {
2255         curframe->tf.f->data[0] + 16 * mb_y * s->linesize,
2256         curframe->tf.f->data[1] +  8 * mb_y * s->uvlinesize,
2257         curframe->tf.f->data[2] +  8 * mb_y * s->uvlinesize
2258     };
2259     if (mb_y == 0)
2260         prev_td = td;
2261     else
2262         prev_td = &s->thread_data[(jobnr + num_jobs - 1) % num_jobs];
2263     if (mb_y == s->mb_height - 1)
2264         next_td = td;
2265     else
2266         next_td = &s->thread_data[(jobnr + 1) % num_jobs];
2267     if (s->mb_layout == 1)
2268         mb = s->macroblocks_base + ((s->mb_width + 1) * (mb_y + 1) + 1);
2269     else {
2270         // Make sure the previous frame has read its segmentation map,
2271         // if we re-use the same map.
2272         if (prev_frame && s->segmentation.enabled &&
2273             !s->segmentation.update_map)
2274             ff_thread_await_progress(&prev_frame->tf, mb_y, 0);
2275         mb = s->macroblocks + (s->mb_height - mb_y - 1) * 2;
2276         memset(mb - 1, 0, sizeof(*mb)); // zero left macroblock
2277         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED * 0x01010101);
2278     }
2279
2280     if (!is_vp7 || mb_y == 0)
2281         memset(td->left_nnz, 0, sizeof(td->left_nnz));
2282
2283     s->mv_min.x = -MARGIN;
2284     s->mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;
2285
2286     for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
2287         // Wait for previous thread to read mb_x+2, and reach mb_y-1.
2288         if (prev_td != td) {
2289             if (threadnr != 0) {
2290                 check_thread_pos(td, prev_td,
2291                                  mb_x + (is_vp7 ? 2 : 1),
2292                                  mb_y - (is_vp7 ? 2 : 1));
2293             } else {
2294                 check_thread_pos(td, prev_td,
2295                                  mb_x + (is_vp7 ? 2 : 1) + s->mb_width + 3,
2296                                  mb_y - (is_vp7 ? 2 : 1));
2297             }
2298         }
2299
2300         s->vdsp.prefetch(dst[0] + (mb_x & 3) * 4 * s->linesize + 64,
2301                          s->linesize, 4);
2302         s->vdsp.prefetch(dst[1] + (mb_x & 7) * s->uvlinesize + 64,
2303                          dst[2] - dst[1], 2);
2304
2305         if (!s->mb_layout)
2306             decode_mb_mode(s, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
2307                            prev_frame && prev_frame->seg_map ?
2308                            prev_frame->seg_map->data + mb_xy : NULL, 0, is_vp7);
2309
2310         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);
2311
2312         if (!mb->skip)
2313             decode_mb_coeffs(s, td, c, mb, s->top_nnz[mb_x], td->left_nnz, is_vp7);
2314
2315         if (mb->mode <= MODE_I4x4)
2316             intra_predict(s, td, dst, mb, mb_x, mb_y, is_vp7);
2317         else
2318             inter_predict(s, td, dst, mb, mb_x, mb_y);
2319
2320         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);
2321
2322         if (!mb->skip) {
2323             idct_mb(s, td, dst, mb);
2324         } else {
2325             AV_ZERO64(td->left_nnz);
2326             AV_WN64(s->top_nnz[mb_x], 0);   // array of 9, so unaligned
2327
2328             /* Reset DC block predictors if they would exist
2329              * if the mb had coefficients */
2330             if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
2331                 td->left_nnz[8]     = 0;
2332                 s->top_nnz[mb_x][8] = 0;
2333             }
2334         }
2335
2336         if (s->deblock_filter)
2337             filter_level_for_mb(s, mb, &td->filter_strength[mb_x], is_vp7);
2338
2339         if (s->deblock_filter && num_jobs != 1 && threadnr == num_jobs - 1) {
2340             if (s->filter.simple)
2341                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2342                                  NULL, NULL, s->linesize, 0, 1);
2343             else
2344                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2345                                  dst[1], dst[2], s->linesize, s->uvlinesize, 0);
2346         }
2347
2348         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN2);
2349
2350         dst[0]      += 16;
2351         dst[1]      += 8;
2352         dst[2]      += 8;
2353         s->mv_min.x -= 64;
2354         s->mv_max.x -= 64;
2355
2356         if (mb_x == s->mb_width + 1) {
2357             update_pos(td, mb_y, s->mb_width + 3);
2358         } else {
2359             update_pos(td, mb_y, mb_x);
2360         }
2361     }
2362 }
2363
2364 static void vp8_filter_mb_row(AVCodecContext *avctx, void *tdata,
2365                               int jobnr, int threadnr, int is_vp7)
2366 {
2367     VP8Context *s = avctx->priv_data;
2368     VP8ThreadData *td = &s->thread_data[threadnr];
2369     int mb_x, mb_y = td->thread_mb_pos >> 16, num_jobs = s->num_jobs;
2370     AVFrame *curframe = s->curframe->tf.f;
2371     VP8Macroblock *mb;
2372     VP8ThreadData *prev_td, *next_td;
2373     uint8_t *dst[3] = {
2374         curframe->data[0] + 16 * mb_y * s->linesize,
2375         curframe->data[1] +  8 * mb_y * s->uvlinesize,
2376         curframe->data[2] +  8 * mb_y * s->uvlinesize
2377     };
2378
2379     if (s->mb_layout == 1)
2380         mb = s->macroblocks_base + ((s->mb_width + 1) * (mb_y + 1) + 1);
2381     else
2382         mb = s->macroblocks + (s->mb_height - mb_y - 1) * 2;
2383
2384     if (mb_y == 0)
2385         prev_td = td;
2386     else
2387         prev_td = &s->thread_data[(jobnr + num_jobs - 1) % num_jobs];
2388     if (mb_y == s->mb_height - 1)
2389         next_td = td;
2390     else
2391         next_td = &s->thread_data[(jobnr + 1) % num_jobs];
2392
2393     for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb++) {
2394         VP8FilterStrength *f = &td->filter_strength[mb_x];
2395         if (prev_td != td)
2396             check_thread_pos(td, prev_td,
2397                              (mb_x + 1) + (s->mb_width + 3), mb_y - 1);
2398         if (next_td != td)
2399             if (next_td != &s->thread_data[0])
2400                 check_thread_pos(td, next_td, mb_x + 1, mb_y + 1);
2401
2402         if (num_jobs == 1) {
2403             if (s->filter.simple)
2404                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2405                                  NULL, NULL, s->linesize, 0, 1);
2406             else
2407                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2408                                  dst[1], dst[2], s->linesize, s->uvlinesize, 0);
2409         }
2410
2411         if (s->filter.simple)
2412             filter_mb_simple(s, dst[0], f, mb_x, mb_y);
2413         else
2414             filter_mb(s, dst, f, mb_x, mb_y, is_vp7);
2415         dst[0] += 16;
2416         dst[1] += 8;
2417         dst[2] += 8;
2418
2419         update_pos(td, mb_y, (s->mb_width + 3) + mb_x);
2420     }
2421 }
2422
2423 static av_always_inline
2424 int vp78_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata, int jobnr,
2425                               int threadnr, int is_vp7)
2426 {
2427     VP8Context *s = avctx->priv_data;
2428     VP8ThreadData *td = &s->thread_data[jobnr];
2429     VP8ThreadData *next_td = NULL, *prev_td = NULL;
2430     VP8Frame *curframe = s->curframe;
2431     int mb_y, num_jobs = s->num_jobs;
2432
2433     td->thread_nr = threadnr;
2434     for (mb_y = jobnr; mb_y < s->mb_height; mb_y += num_jobs) {
2435         if (mb_y >= s->mb_height)
2436             break;
2437         td->thread_mb_pos = mb_y << 16;
2438         vp8_decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr, is_vp7);
2439         if (s->deblock_filter)
2440             vp8_filter_mb_row(avctx, tdata, jobnr, threadnr, is_vp7);
2441         update_pos(td, mb_y, INT_MAX & 0xFFFF);
2442
2443         s->mv_min.y -= 64;
2444         s->mv_max.y -= 64;
2445
2446         if (avctx->active_thread_type == FF_THREAD_FRAME)
2447             ff_thread_report_progress(&curframe->tf, mb_y, 0);
2448     }
2449
2450     return 0;
2451 }
2452
2453 static int vp7_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
2454                                     int jobnr, int threadnr)
2455 {
2456     return vp78_decode_mb_row_sliced(avctx, tdata, jobnr, threadnr, IS_VP7);
2457 }
2458
2459 static int vp8_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
2460                                     int jobnr, int threadnr)
2461 {
2462     return vp78_decode_mb_row_sliced(avctx, tdata, jobnr, threadnr, IS_VP8);
2463 }
2464
2465
2466 static av_always_inline
2467 int vp78_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
2468                       AVPacket *avpkt, int is_vp7)
2469 {
2470     VP8Context *s = avctx->priv_data;
2471     int ret, i, referenced, num_jobs;
2472     enum AVDiscard skip_thresh;
2473     VP8Frame *av_uninit(curframe), *prev_frame;
2474
2475     if (is_vp7)
2476         ret = vp7_decode_frame_header(s, avpkt->data, avpkt->size);
2477     else
2478         ret = vp8_decode_frame_header(s, avpkt->data, avpkt->size);
2479
2480     if (ret < 0)
2481         goto err;
2482
2483     prev_frame = s->framep[VP56_FRAME_CURRENT];
2484
2485     referenced = s->update_last || s->update_golden == VP56_FRAME_CURRENT ||
2486                  s->update_altref == VP56_FRAME_CURRENT;
2487
2488     skip_thresh = !referenced ? AVDISCARD_NONREF
2489                               : !s->keyframe ? AVDISCARD_NONKEY
2490                                              : AVDISCARD_ALL;
2491
2492     if (avctx->skip_frame >= skip_thresh) {
2493         s->invisible = 1;
2494         memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
2495         goto skip_decode;
2496     }
2497     s->deblock_filter = s->filter.level && avctx->skip_loop_filter < skip_thresh;
2498
2499     // release no longer referenced frames
2500     for (i = 0; i < 5; i++)
2501         if (s->frames[i].tf.f->data[0] &&
2502             &s->frames[i] != prev_frame &&
2503             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
2504             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN]   &&
2505             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2])
2506             vp8_release_frame(s, &s->frames[i]);
2507
2508     curframe = s->framep[VP56_FRAME_CURRENT] = vp8_find_free_buffer(s);
2509
2510     if (!s->colorspace)
2511         avctx->colorspace = AVCOL_SPC_BT470BG;
2512     if (s->fullrange)
2513         avctx->color_range = AVCOL_RANGE_JPEG;
2514     else
2515         avctx->color_range = AVCOL_RANGE_MPEG;
2516
2517     /* Given that arithmetic probabilities are updated every frame, it's quite
2518      * likely that the values we have on a random interframe are complete
2519      * junk if we didn't start decode on a keyframe. So just don't display
2520      * anything rather than junk. */
2521     if (!s->keyframe && (!s->framep[VP56_FRAME_PREVIOUS] ||
2522                          !s->framep[VP56_FRAME_GOLDEN]   ||
2523                          !s->framep[VP56_FRAME_GOLDEN2])) {
2524         av_log(avctx, AV_LOG_WARNING,
2525                "Discarding interframe without a prior keyframe!\n");
2526         ret = AVERROR_INVALIDDATA;
2527         goto err;
2528     }
2529
2530     curframe->tf.f->key_frame = s->keyframe;
2531     curframe->tf.f->pict_type = s->keyframe ? AV_PICTURE_TYPE_I
2532                                             : AV_PICTURE_TYPE_P;
2533     if ((ret = vp8_alloc_frame(s, curframe, referenced))) {
2534         av_log(avctx, AV_LOG_ERROR, "get_buffer() failed!\n");
2535         goto err;
2536     }
2537
2538     // check if golden and altref are swapped
2539     if (s->update_altref != VP56_FRAME_NONE)
2540         s->next_framep[VP56_FRAME_GOLDEN2] = s->framep[s->update_altref];
2541     else
2542         s->next_framep[VP56_FRAME_GOLDEN2] = s->framep[VP56_FRAME_GOLDEN2];
2543
2544     if (s->update_golden != VP56_FRAME_NONE)
2545         s->next_framep[VP56_FRAME_GOLDEN] = s->framep[s->update_golden];
2546     else
2547         s->next_framep[VP56_FRAME_GOLDEN] = s->framep[VP56_FRAME_GOLDEN];
2548
2549     if (s->update_last)
2550         s->next_framep[VP56_FRAME_PREVIOUS] = curframe;
2551     else
2552         s->next_framep[VP56_FRAME_PREVIOUS] = s->framep[VP56_FRAME_PREVIOUS];
2553
2554     s->next_framep[VP56_FRAME_CURRENT] = curframe;
2555
2556     ff_thread_finish_setup(avctx);
2557
2558     s->linesize   = curframe->tf.f->linesize[0];
2559     s->uvlinesize = curframe->tf.f->linesize[1];
2560
2561     memset(s->top_nnz, 0, s->mb_width * sizeof(*s->top_nnz));
2562     /* Zero macroblock structures for top/top-left prediction
2563      * from outside the frame. */
2564     if (!s->mb_layout)
2565         memset(s->macroblocks + s->mb_height * 2 - 1, 0,
2566                (s->mb_width + 1) * sizeof(*s->macroblocks));
2567     if (!s->mb_layout && s->keyframe)
2568         memset(s->intra4x4_pred_mode_top, DC_PRED, s->mb_width * 4);
2569
2570     memset(s->ref_count, 0, sizeof(s->ref_count));
2571
2572     if (s->mb_layout == 1) {
2573         // Make sure the previous frame has read its segmentation map,
2574         // if we re-use the same map.
2575         if (prev_frame && s->segmentation.enabled &&
2576             !s->segmentation.update_map)
2577             ff_thread_await_progress(&prev_frame->tf, 1, 0);
2578         if (is_vp7)
2579             vp7_decode_mv_mb_modes(avctx, curframe, prev_frame);
2580         else
2581             vp8_decode_mv_mb_modes(avctx, curframe, prev_frame);
2582     }
2583
2584     if (avctx->active_thread_type == FF_THREAD_FRAME)
2585         num_jobs = 1;
2586     else
2587         num_jobs = FFMIN(s->num_coeff_partitions, avctx->thread_count);
2588     s->num_jobs   = num_jobs;
2589     s->curframe   = curframe;
2590     s->prev_frame = prev_frame;
2591     s->mv_min.y   = -MARGIN;
2592     s->mv_max.y   = ((s->mb_height - 1) << 6) + MARGIN;
2593     for (i = 0; i < MAX_THREADS; i++) {
2594         s->thread_data[i].thread_mb_pos = 0;
2595         s->thread_data[i].wait_mb_pos   = INT_MAX;
2596     }
2597     if (is_vp7)
2598         avctx->execute2(avctx, vp7_decode_mb_row_sliced, s->thread_data, NULL,
2599                         num_jobs);
2600     else
2601         avctx->execute2(avctx, vp8_decode_mb_row_sliced, s->thread_data, NULL,
2602                         num_jobs);
2603
2604     ff_thread_report_progress(&curframe->tf, INT_MAX, 0);
2605     memcpy(&s->framep[0], &s->next_framep[0], sizeof(s->framep[0]) * 4);
2606
2607 skip_decode:
2608     // if future frames don't use the updated probabilities,
2609     // reset them to the values we saved
2610     if (!s->update_probabilities)
2611         s->prob[0] = s->prob[1];
2612
2613     if (!s->invisible) {
2614         if ((ret = av_frame_ref(data, curframe->tf.f)) < 0)
2615             return ret;
2616         *got_frame = 1;
2617     }
2618
2619     return avpkt->size;
2620 err:
2621     memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
2622     return ret;
2623 }
2624
2625 int ff_vp8_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
2626                         AVPacket *avpkt)
2627 {
2628     return vp78_decode_frame(avctx, data, got_frame, avpkt, IS_VP8);
2629 }
2630
2631 #if CONFIG_VP7_DECODER
2632 static int vp7_decode_frame(AVCodecContext *avctx, void *data, int *got_frame,
2633                             AVPacket *avpkt)
2634 {
2635     return vp78_decode_frame(avctx, data, got_frame, avpkt, IS_VP7);
2636 }
2637 #endif /* CONFIG_VP7_DECODER */
2638
2639 av_cold int ff_vp8_decode_free(AVCodecContext *avctx)
2640 {
2641     VP8Context *s = avctx->priv_data;
2642     int i;
2643
2644     vp8_decode_flush_impl(avctx, 1);
2645     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
2646         av_frame_free(&s->frames[i].tf.f);
2647
2648     return 0;
2649 }
2650
2651 static av_cold int vp8_init_frames(VP8Context *s)
2652 {
2653     int i;
2654     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++) {
2655         s->frames[i].tf.f = av_frame_alloc();
2656         if (!s->frames[i].tf.f)
2657             return AVERROR(ENOMEM);
2658     }
2659     return 0;
2660 }
2661
2662 static av_always_inline
2663 int vp78_decode_init(AVCodecContext *avctx, int is_vp7)
2664 {
2665     VP8Context *s = avctx->priv_data;
2666     int ret;
2667
2668     s->avctx = avctx;
2669     avctx->pix_fmt = AV_PIX_FMT_YUV420P;
2670     avctx->internal->allocate_progress = 1;
2671
2672     ff_videodsp_init(&s->vdsp, 8);
2673
2674     ff_vp78dsp_init(&s->vp8dsp);
2675     if (CONFIG_VP7_DECODER && is_vp7) {
2676         ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP7, 8, 1);
2677         ff_vp7dsp_init(&s->vp8dsp);
2678     } else if (CONFIG_VP8_DECODER && !is_vp7) {
2679         ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP8, 8, 1);
2680         ff_vp8dsp_init(&s->vp8dsp);
2681     }
2682
2683     /* does not change for VP8 */
2684     memcpy(s->prob[0].scan, ff_zigzag_scan, sizeof(s->prob[0].scan));
2685
2686     if ((ret = vp8_init_frames(s)) < 0) {
2687         ff_vp8_decode_free(avctx);
2688         return ret;
2689     }
2690
2691     return 0;
2692 }
2693
2694 #if CONFIG_VP7_DECODER
2695 static int vp7_decode_init(AVCodecContext *avctx)
2696 {
2697     return vp78_decode_init(avctx, IS_VP7);
2698 }
2699 #endif /* CONFIG_VP7_DECODER */
2700
2701 av_cold int ff_vp8_decode_init(AVCodecContext *avctx)
2702 {
2703     return vp78_decode_init(avctx, IS_VP8);
2704 }
2705
2706 #if CONFIG_VP8_DECODER
2707 static av_cold int vp8_decode_init_thread_copy(AVCodecContext *avctx)
2708 {
2709     VP8Context *s = avctx->priv_data;
2710     int ret;
2711
2712     s->avctx = avctx;
2713
2714     if ((ret = vp8_init_frames(s)) < 0) {
2715         ff_vp8_decode_free(avctx);
2716         return ret;
2717     }
2718
2719     return 0;
2720 }
2721
2722 #define REBASE(pic) pic ? pic - &s_src->frames[0] + &s->frames[0] : NULL
2723
2724 static int vp8_decode_update_thread_context(AVCodecContext *dst,
2725                                             const AVCodecContext *src)
2726 {
2727     VP8Context *s = dst->priv_data, *s_src = src->priv_data;
2728     int i;
2729
2730     if (s->macroblocks_base &&
2731         (s_src->mb_width != s->mb_width || s_src->mb_height != s->mb_height)) {
2732         free_buffers(s);
2733         s->mb_width  = s_src->mb_width;
2734         s->mb_height = s_src->mb_height;
2735     }
2736
2737     s->prob[0]      = s_src->prob[!s_src->update_probabilities];
2738     s->segmentation = s_src->segmentation;
2739     s->lf_delta     = s_src->lf_delta;
2740     memcpy(s->sign_bias, s_src->sign_bias, sizeof(s->sign_bias));
2741
2742     for (i = 0; i < FF_ARRAY_ELEMS(s_src->frames); i++) {
2743         if (s_src->frames[i].tf.f->data[0]) {
2744             int ret = vp8_ref_frame(s, &s->frames[i], &s_src->frames[i]);
2745             if (ret < 0)
2746                 return ret;
2747         }
2748     }
2749
2750     s->framep[0] = REBASE(s_src->next_framep[0]);
2751     s->framep[1] = REBASE(s_src->next_framep[1]);
2752     s->framep[2] = REBASE(s_src->next_framep[2]);
2753     s->framep[3] = REBASE(s_src->next_framep[3]);
2754
2755     return 0;
2756 }
2757 #endif /* CONFIG_VP8_DECODER */
2758
2759 #if CONFIG_VP7_DECODER
2760 AVCodec ff_vp7_decoder = {
2761     .name                  = "vp7",
2762     .long_name             = NULL_IF_CONFIG_SMALL("On2 VP7"),
2763     .type                  = AVMEDIA_TYPE_VIDEO,
2764     .id                    = AV_CODEC_ID_VP7,
2765     .priv_data_size        = sizeof(VP8Context),
2766     .init                  = vp7_decode_init,
2767     .close                 = ff_vp8_decode_free,
2768     .decode                = vp7_decode_frame,
2769     .capabilities          = AV_CODEC_CAP_DR1,
2770     .flush                 = vp8_decode_flush,
2771 };
2772 #endif /* CONFIG_VP7_DECODER */
2773
2774 #if CONFIG_VP8_DECODER
2775 AVCodec ff_vp8_decoder = {
2776     .name                  = "vp8",
2777     .long_name             = NULL_IF_CONFIG_SMALL("On2 VP8"),
2778     .type                  = AVMEDIA_TYPE_VIDEO,
2779     .id                    = AV_CODEC_ID_VP8,
2780     .priv_data_size        = sizeof(VP8Context),
2781     .init                  = ff_vp8_decode_init,
2782     .close                 = ff_vp8_decode_free,
2783     .decode                = ff_vp8_decode_frame,
2784     .capabilities          = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS |
2785                              AV_CODEC_CAP_SLICE_THREADS,
2786     .flush                 = vp8_decode_flush,
2787     .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp8_decode_init_thread_copy),
2788     .update_thread_context = ONLY_IF_THREADS_ENABLED(vp8_decode_update_thread_context),
2789 };
2790 #endif /* CONFIG_VP7_DECODER */