libavcodec/vp8.c

   1 /*
   2  * VP7/VP8 compatible video decoder
   3  *
   4  * Copyright (C) 2010 David Conrad
   5  * Copyright (C) 2010 Ronald S. Bultje
   6  * Copyright (C) 2010 Fiona Glaser
   7  * Copyright (C) 2012 Daniel Kang
   8  * Copyright (C) 2014 Peter Ross
   9  *
  10  * This file is part of FFmpeg.
  11  *
  12  * FFmpeg is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public
  14  * License as published by the Free Software Foundation; either
  15  * version 2.1 of the License, or (at your option) any later version.
  16  *
  17  * FFmpeg is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with FFmpeg; if not, write to the Free Software
  24  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  25  */
  26
  27 #include "config_components.h"
  28
  29 #include "libavutil/mem_internal.h"
  30
  31 #include "avcodec.h"
  32 #include "codec_internal.h"
  33 #include "decode.h"
  34 #include "hwaccel_internal.h"
  35 #include "hwconfig.h"
  36 #include "mathops.h"
  37 #include "refstruct.h"
  38 #include "thread.h"
  39 #include "threadframe.h"
  40 #include "vp8.h"
  41 #include "vp89_rac.h"
  42 #include "vp8data.h"
  43 #include "vpx_rac.h"
  44
  45 #if ARCH_ARM
  46 #   include "arm/vp8.h"
  47 #endif
  48
  49 // fixme: add 1 bit to all the calls to this?
  50 static int vp8_rac_get_sint(VPXRangeCoder *c, int bits)
  51 {
  52     int v;
  53
  54     if (!vp89_rac_get(c))
  55         return 0;
  56
  57     v = vp89_rac_get_uint(c, bits);
  58
  59     if (vp89_rac_get(c))
  60         v = -v;
  61
  62     return v;
  63 }
  64
  65 static int vp8_rac_get_nn(VPXRangeCoder *c)
  66 {
  67     int v = vp89_rac_get_uint(c, 7) << 1;
  68     return v + !v;
  69 }
  70
  71 // DCTextra
  72 static int vp8_rac_get_coeff(VPXRangeCoder *c, const uint8_t *prob)
  73 {
  74     int v = 0;
  75
  76     do {
  77         v = (v<<1) + vpx_rac_get_prob(c, *prob++);
  78     } while (*prob);
  79
  80     return v;
  81 }
  82
  83 static void free_buffers(VP8Context *s)
  84 {
  85     int i;
  86     if (s->thread_data)
  87         for (i = 0; i < MAX_THREADS; i++) {
  88 #if HAVE_THREADS
  89             pthread_cond_destroy(&s->thread_data[i].cond);
  90             pthread_mutex_destroy(&s->thread_data[i].lock);
  91 #endif
  92             av_freep(&s->thread_data[i].filter_strength);
  93         }
  94     av_freep(&s->thread_data);
  95     av_freep(&s->macroblocks_base);
  96     av_freep(&s->intra4x4_pred_mode_top);
  97     av_freep(&s->top_nnz);
  98     av_freep(&s->top_border);
  99
 100     s->macroblocks = NULL;
 101 }
 102
 103 static int vp8_alloc_frame(VP8Context *s, VP8Frame *f, int ref)
 104 {
 105     int ret;
 106     if ((ret = ff_thread_get_ext_buffer(s->avctx, &f->tf,
 107                                         ref ? AV_GET_BUFFER_FLAG_REF : 0)) < 0)
 108         return ret;
 109     if (!(f->seg_map = ff_refstruct_allocz(s->mb_width * s->mb_height)))
 110         goto fail;
 111     ret = ff_hwaccel_frame_priv_alloc(s->avctx, &f->hwaccel_picture_private);
 112     if (ret < 0)
 113         goto fail;
 114
 115     return 0;
 116
 117 fail:
 118     ff_refstruct_unref(&f->seg_map);
 119     ff_thread_release_ext_buffer(&f->tf);
 120     return ret;
 121 }
 122
/**
 * Drop everything a frame holds: its segmentation map reference, its hwaccel
 * private data reference, and its picture buffer.
 */
static void vp8_release_frame(VP8Frame *f)
{
    ff_refstruct_unref(&f->seg_map);
    ff_refstruct_unref(&f->hwaccel_picture_private);
    ff_thread_release_ext_buffer(&f->tf);
}
 129
 130 #if CONFIG_VP8_DECODER
 131 static int vp8_ref_frame(VP8Frame *dst, const VP8Frame *src)
 132 {
 133     int ret;
 134
 135     vp8_release_frame(dst);
 136
 137     if ((ret = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0)
 138         return ret;
 139     ff_refstruct_replace(&dst->seg_map, src->seg_map);
 140     ff_refstruct_replace(&dst->hwaccel_picture_private,
 141                           src->hwaccel_picture_private);
 142
 143     return 0;
 144 }
 145 #endif /* CONFIG_VP8_DECODER */
 146
 147 static void vp8_decode_flush_impl(AVCodecContext *avctx, int free_mem)
 148 {
 149     VP8Context *s = avctx->priv_data;
 150     int i;
 151
 152     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
 153         vp8_release_frame(&s->frames[i]);
 154     memset(s->framep, 0, sizeof(s->framep));
 155
 156     if (free_mem)
 157         free_buffers(s);
 158
 159     if (FF_HW_HAS_CB(avctx, flush))
 160         FF_HW_SIMPLE_CALL(avctx, flush);
 161 }
 162
/**
 * Flush entry point: releases all frames but keeps the working buffers
 * (calls vp8_decode_flush_impl() with free_mem = 0).
 */
static void vp8_decode_flush(AVCodecContext *avctx)
{
    vp8_decode_flush_impl(avctx, 0);
}
 167
 168 static VP8Frame *vp8_find_free_buffer(VP8Context *s)
 169 {
 170     VP8Frame *frame = NULL;
 171     int i;
 172
 173     // find a free buffer
 174     for (i = 0; i < 5; i++)
 175         if (&s->frames[i] != s->framep[VP8_FRAME_CURRENT]  &&
 176             &s->frames[i] != s->framep[VP8_FRAME_PREVIOUS] &&
 177             &s->frames[i] != s->framep[VP8_FRAME_GOLDEN]   &&
 178             &s->frames[i] != s->framep[VP8_FRAME_ALTREF]) {
 179             frame = &s->frames[i];
 180             break;
 181         }
 182     if (i == 5) {
 183         av_log(s->avctx, AV_LOG_FATAL, "Ran out of free frames!\n");
 184         abort();
 185     }
 186     if (frame->tf.f->buf[0])
 187         vp8_release_frame(frame);
 188
 189     return frame;
 190 }
 191
/**
 * Negotiate the output pixel format.
 *
 * Builds the candidate list -- the VAAPI and CUDA hardware formats when the
 * corresponding hwaccels are compiled in, followed by YUV420P -- and returns
 * whatever ff_get_format() selects from it.
 */
static enum AVPixelFormat get_pixel_format(VP8Context *s)
{
    enum AVPixelFormat pix_fmts[] = {
#if CONFIG_VP8_VAAPI_HWACCEL
        AV_PIX_FMT_VAAPI,
#endif
#if CONFIG_VP8_NVDEC_HWACCEL
        AV_PIX_FMT_CUDA,
#endif
        AV_PIX_FMT_YUV420P,
        AV_PIX_FMT_NONE, /* list terminator */
    };

    return ff_get_format(s->avctx, pix_fmts);
}
 207
/**
 * Apply new frame dimensions and (re)allocate the per-dimension buffers.
 *
 * @param s      decoder context
 * @param width  new picture width
 * @param height new picture height
 * @param is_vp7 non-zero when decoding VP7; skips pixel-format negotiation
 *               and forces the mb_layout used for sliced threading
 * @return 0 on success, a negative AVERROR code on failure
 */
static av_always_inline
int update_dimensions(VP8Context *s, int width, int height, int is_vp7)
{
    AVCodecContext *avctx = s->avctx;
    int i, ret, dim_reset = 0;

    /* Flush and free everything when the picture size changes, or when the
     * macroblock grid changes while buffers are already allocated. */
    if (width  != s->avctx->width || ((width+15)/16 != s->mb_width || (height+15)/16 != s->mb_height) && s->macroblocks_base ||
        height != s->avctx->height) {
        vp8_decode_flush_impl(s->avctx, 1);

        ret = ff_set_dimensions(s->avctx, width, height);
        if (ret < 0)
            return ret;

        /* NOTE(review): the flush above calls free_buffers(), which
         * av_freep()s macroblocks_base; if av_freep() NULLs the pointer this
         * is always 0 here -- confirm the intended behavior. */
        dim_reset = (s->macroblocks_base != NULL);
    }

    /* (Re)negotiate the pixel format for VP8 only, and not for WebP. */
    if ((s->pix_fmt == AV_PIX_FMT_NONE || dim_reset) &&
         !s->actually_webp && !is_vp7) {
        s->pix_fmt = get_pixel_format(s);
        if (s->pix_fmt < 0)
            return AVERROR(EINVAL);
        avctx->pix_fmt = s->pix_fmt;
    }

    /* Macroblock grid: coded dimensions rounded up to multiples of 16. */
    s->mb_width  = (s->avctx->coded_width  + 15) / 16;
    s->mb_height = (s->avctx->coded_height + 15) / 16;

    s->mb_layout = is_vp7 || avctx->active_thread_type == FF_THREAD_SLICE &&
                   avctx->thread_count > 1;
    if (!s->mb_layout) { // Frame threading and one thread
        s->macroblocks_base       = av_mallocz((s->mb_width + s->mb_height * 2 + 1) *
                                               sizeof(*s->macroblocks));
        s->intra4x4_pred_mode_top = av_mallocz(s->mb_width * 4);
    } else // Sliced threading
        s->macroblocks_base = av_mallocz((s->mb_width + 2) * (s->mb_height + 2) *
                                         sizeof(*s->macroblocks));
    s->top_nnz     = av_mallocz(s->mb_width * sizeof(*s->top_nnz));
    s->top_border  = av_mallocz((s->mb_width + 1) * sizeof(*s->top_border));
    s->thread_data = av_mallocz(MAX_THREADS * sizeof(VP8ThreadData));

    /* intra4x4_pred_mode_top is only allocated (and required) when
     * mb_layout is 0. */
    if (!s->macroblocks_base || !s->top_nnz || !s->top_border ||
        !s->thread_data || (!s->intra4x4_pred_mode_top && !s->mb_layout)) {
        free_buffers(s);
        return AVERROR(ENOMEM);
    }

    for (i = 0; i < MAX_THREADS; i++) {
        s->thread_data[i].filter_strength =
            av_mallocz(s->mb_width * sizeof(*s->thread_data[0].filter_strength));
        if (!s->thread_data[i].filter_strength) {
            free_buffers(s);
            return AVERROR(ENOMEM);
        }
#if HAVE_THREADS
        pthread_mutex_init(&s->thread_data[i].lock, NULL);
        pthread_cond_init(&s->thread_data[i].cond, NULL);
#endif
    }

    /* The usable macroblock array starts one element into the allocation. */
    s->macroblocks = s->macroblocks_base + 1;

    return 0;
}
 272
/** VP7 flavor of update_dimensions(); returns its result unchanged. */
static int vp7_update_dimensions(VP8Context *s, int width, int height)
{
    return update_dimensions(s, width, height, IS_VP7);
}
 277
/** VP8 flavor of update_dimensions(); returns its result unchanged. */
static int vp8_update_dimensions(VP8Context *s, int width, int height)
{
    return update_dimensions(s, width, height, IS_VP8);
}
 282
 283
 284 static void parse_segment_info(VP8Context *s)
 285 {
 286     VPXRangeCoder *c = &s->c;
 287     int i;
 288
 289     s->segmentation.update_map = vp89_rac_get(c);
 290     s->segmentation.update_feature_data = vp89_rac_get(c);
 291
 292     if (s->segmentation.update_feature_data) {
 293         s->segmentation.absolute_vals = vp89_rac_get(c);
 294
 295         for (i = 0; i < 4; i++)
 296             s->segmentation.base_quant[i]   = vp8_rac_get_sint(c, 7);
 297
 298         for (i = 0; i < 4; i++)
 299             s->segmentation.filter_level[i] = vp8_rac_get_sint(c, 6);
 300     }
 301     if (s->segmentation.update_map)
 302         for (i = 0; i < 3; i++)
 303             s->prob->segmentid[i] = vp89_rac_get(c) ? vp89_rac_get_uint(c, 8) : 255;
 304 }
 305
 306 static void update_lf_deltas(VP8Context *s)
 307 {
 308     VPXRangeCoder *c = &s->c;
 309     int i;
 310
 311     for (i = 0; i < 4; i++) {
 312         if (vp89_rac_get(c)) {
 313             s->lf_delta.ref[i] = vp89_rac_get_uint(c, 6);
 314
 315             if (vp89_rac_get(c))
 316                 s->lf_delta.ref[i] = -s->lf_delta.ref[i];
 317         }
 318     }
 319
 320     for (i = MODE_I4x4; i <= VP8_MVMODE_SPLIT; i++) {
 321         if (vp89_rac_get(c)) {
 322             s->lf_delta.mode[i] = vp89_rac_get_uint(c, 6);
 323
 324             if (vp89_rac_get(c))
 325                 s->lf_delta.mode[i] = -s->lf_delta.mode[i];
 326         }
 327     }
 328 }
 329
 330 static int setup_partitions(VP8Context *s, const uint8_t *buf, int buf_size)
 331 {
 332     const uint8_t *sizes = buf;
 333     int i;
 334     int ret;
 335
 336     s->num_coeff_partitions = 1 << vp89_rac_get_uint(&s->c, 2);
 337
 338     buf      += 3 * (s->num_coeff_partitions - 1);
 339     buf_size -= 3 * (s->num_coeff_partitions - 1);
 340     if (buf_size < 0)
 341         return -1;
 342
 343     for (i = 0; i < s->num_coeff_partitions - 1; i++) {
 344         int size = AV_RL24(sizes + 3 * i);
 345         if (buf_size - size < 0)
 346             return -1;
 347         s->coeff_partition_size[i] = size;
 348
 349         ret = ff_vpx_init_range_decoder(&s->coeff_partition[i], buf, size);
 350         if (ret < 0)
 351             return ret;
 352         buf      += size;
 353         buf_size -= size;
 354     }
 355
 356     s->coeff_partition_size[i] = buf_size;
 357     ff_vpx_init_range_decoder(&s->coeff_partition[i], buf, buf_size);
 358
 359     return 0;
 360 }
 361
 362 static void vp7_get_quants(VP8Context *s)
 363 {
 364     VPXRangeCoder *c = &s->c;
 365
 366     int yac_qi  = vp89_rac_get_uint(c, 7);
 367     int ydc_qi  = vp89_rac_get(c) ? vp89_rac_get_uint(c, 7) : yac_qi;
 368     int y2dc_qi = vp89_rac_get(c) ? vp89_rac_get_uint(c, 7) : yac_qi;
 369     int y2ac_qi = vp89_rac_get(c) ? vp89_rac_get_uint(c, 7) : yac_qi;
 370     int uvdc_qi = vp89_rac_get(c) ? vp89_rac_get_uint(c, 7) : yac_qi;
 371     int uvac_qi = vp89_rac_get(c) ? vp89_rac_get_uint(c, 7) : yac_qi;
 372
 373     s->qmat[0].luma_qmul[0]    =       vp7_ydc_qlookup[ydc_qi];
 374     s->qmat[0].luma_qmul[1]    =       vp7_yac_qlookup[yac_qi];
 375     s->qmat[0].luma_dc_qmul[0] =       vp7_y2dc_qlookup[y2dc_qi];
 376     s->qmat[0].luma_dc_qmul[1] =       vp7_y2ac_qlookup[y2ac_qi];
 377     s->qmat[0].chroma_qmul[0]  = FFMIN(vp7_ydc_qlookup[uvdc_qi], 132);
 378     s->qmat[0].chroma_qmul[1]  =       vp7_yac_qlookup[uvac_qi];
 379 }
 380
 381 static void vp8_get_quants(VP8Context *s)
 382 {
 383     VPXRangeCoder *c = &s->c;
 384     int i, base_qi;
 385
 386     s->quant.yac_qi     = vp89_rac_get_uint(c, 7);
 387     s->quant.ydc_delta  = vp8_rac_get_sint(c, 4);
 388     s->quant.y2dc_delta = vp8_rac_get_sint(c, 4);
 389     s->quant.y2ac_delta = vp8_rac_get_sint(c, 4);
 390     s->quant.uvdc_delta = vp8_rac_get_sint(c, 4);
 391     s->quant.uvac_delta = vp8_rac_get_sint(c, 4);
 392
 393     for (i = 0; i < 4; i++) {
 394         if (s->segmentation.enabled) {
 395             base_qi = s->segmentation.base_quant[i];
 396             if (!s->segmentation.absolute_vals)
 397                 base_qi += s->quant.yac_qi;
 398         } else
 399             base_qi = s->quant.yac_qi;
 400
 401         s->qmat[i].luma_qmul[0]    = vp8_dc_qlookup[av_clip_uintp2(base_qi + s->quant.ydc_delta,  7)];
 402         s->qmat[i].luma_qmul[1]    = vp8_ac_qlookup[av_clip_uintp2(base_qi,              7)];
 403         s->qmat[i].luma_dc_qmul[0] = vp8_dc_qlookup[av_clip_uintp2(base_qi + s->quant.y2dc_delta, 7)] * 2;
 404         /* 101581>>16 is equivalent to 155/100 */
 405         s->qmat[i].luma_dc_qmul[1] = vp8_ac_qlookup[av_clip_uintp2(base_qi + s->quant.y2ac_delta, 7)] * 101581 >> 16;
 406         s->qmat[i].chroma_qmul[0]  = vp8_dc_qlookup[av_clip_uintp2(base_qi + s->quant.uvdc_delta, 7)];
 407         s->qmat[i].chroma_qmul[1]  = vp8_ac_qlookup[av_clip_uintp2(base_qi + s->quant.uvac_delta, 7)];
 408
 409         s->qmat[i].luma_dc_qmul[1] = FFMAX(s->qmat[i].luma_dc_qmul[1], 8);
 410         s->qmat[i].chroma_qmul[0]  = FFMIN(s->qmat[i].chroma_qmul[0], 132);
 411     }
 412 }
 413
 414 /**
 415  * Determine which buffers golden and altref should be updated with after this frame.
 416  * The spec isn't clear here, so I'm going by my understanding of what libvpx does
 417  *
 418  * Intra frames update all 3 references
 419  * Inter frames update VP8_FRAME_PREVIOUS if the update_last flag is set
 420  * If the update (golden|altref) flag is set, it's updated with the current frame
 421  *      if update_last is set, and VP8_FRAME_PREVIOUS otherwise.
 422  * If the flag is not set, the number read means:
 423  *      0: no update
 424  *      1: VP8_FRAME_PREVIOUS
 425  *      2: update golden with altref, or update altref with golden
 426  */
 427 static VP8FrameType ref_to_update(VP8Context *s, int update, VP8FrameType ref)
 428 {
 429     VPXRangeCoder *c = &s->c;
 430
 431     if (update)
 432         return VP8_FRAME_CURRENT;
 433
 434     switch (vp89_rac_get_uint(c, 2)) {
 435     case 1:
 436         return VP8_FRAME_PREVIOUS;
 437     case 2:
 438         return (ref == VP8_FRAME_GOLDEN) ? VP8_FRAME_ALTREF : VP8_FRAME_GOLDEN;
 439     }
 440     return VP8_FRAME_NONE;
 441 }
 442
 443 static void vp78_reset_probability_tables(VP8Context *s)
 444 {
 445     int i, j;
 446     for (i = 0; i < 4; i++)
 447         for (j = 0; j < 16; j++)
 448             memcpy(s->prob->token[i][j], vp8_token_default_probs[i][vp8_coeff_band[j]],
 449                    sizeof(s->prob->token[i][j]));
 450 }
 451
 452 static void vp78_update_probability_tables(VP8Context *s)
 453 {
 454     VPXRangeCoder *c = &s->c;
 455     int i, j, k, l, m;
 456
 457     for (i = 0; i < 4; i++)
 458         for (j = 0; j < 8; j++)
 459             for (k = 0; k < 3; k++)
 460                 for (l = 0; l < NUM_DCT_TOKENS-1; l++)
 461                     if (vpx_rac_get_prob_branchy(c, vp8_token_update_probs[i][j][k][l])) {
 462                         int prob = vp89_rac_get_uint(c, 8);
 463                         for (m = 0; vp8_coeff_band_indexes[j][m] >= 0; m++)
 464                             s->prob->token[i][vp8_coeff_band_indexes[j][m]][k][l] = prob;
 465                     }
 466 }
 467
 468 #define VP7_MVC_SIZE 17
 469 #define VP8_MVC_SIZE 19
 470
 471 static void vp78_update_pred16x16_pred8x8_mvc_probabilities(VP8Context *s,
 472                                                             int mvc_size)
 473 {
 474     VPXRangeCoder *c = &s->c;
 475     int i, j;
 476
 477     if (vp89_rac_get(c))
 478         for (i = 0; i < 4; i++)
 479             s->prob->pred16x16[i] = vp89_rac_get_uint(c, 8);
 480     if (vp89_rac_get(c))
 481         for (i = 0; i < 3; i++)
 482             s->prob->pred8x8c[i]  = vp89_rac_get_uint(c, 8);
 483
 484     // 17.2 MV probability update
 485     for (i = 0; i < 2; i++)
 486         for (j = 0; j < mvc_size; j++)
 487             if (vpx_rac_get_prob_branchy(c, vp8_mv_update_prob[i][j]))
 488                 s->prob->mvc[i][j] = vp8_rac_get_nn(c);
 489 }
 490
/**
 * Parse the golden/altref update information of an inter frame and store the
 * resulting source frame types in s->update_golden and s->update_altref.
 */
static void update_refs(VP8Context *s)
{
    VPXRangeCoder *c = &s->c;

    /* Both flags are read before ref_to_update() is called, because
     * ref_to_update() may itself read further bits from the same coder. */
    int update_golden = vp89_rac_get(c);
    int update_altref = vp89_rac_get(c);

    s->update_golden = ref_to_update(s, update_golden, VP8_FRAME_GOLDEN);
    s->update_altref = ref_to_update(s, update_altref, VP8_FRAME_ALTREF);
}
 501
 502 static void copy_chroma(AVFrame *dst, const AVFrame *src, int width, int height)
 503 {
 504     int i, j;
 505
 506     for (j = 1; j < 3; j++) {
 507         for (i = 0; i < height / 2; i++)
 508             memcpy(dst->data[j] + i * dst->linesize[j],
 509                    src->data[j] + i * src->linesize[j], width / 2);
 510     }
 511 }
 512
 513 static void fade(uint8_t *dst, ptrdiff_t dst_linesize,
 514                  const uint8_t *src, ptrdiff_t src_linesize,
 515                  int width, int height,
 516                  int alpha, int beta)
 517 {
 518     int i, j;
 519     for (j = 0; j < height; j++) {
 520         const uint8_t *src2 = src + j * src_linesize;
 521         uint8_t *dst2 = dst + j * dst_linesize;
 522         for (i = 0; i < width; i++) {
 523             uint8_t y = src2[i];
 524             dst2[i] = av_clip_uint8(y + ((y * beta) >> 8) + alpha);
 525         }
 526     }
 527 }
 528
 529 static int vp7_fade_frame(VP8Context *s, int alpha, int beta)
 530 {
 531     int ret;
 532
 533     if (!s->keyframe && (alpha || beta)) {
 534         int width  = s->mb_width * 16;
 535         int height = s->mb_height * 16;
 536         const AVFrame *src;
 537         AVFrame *dst;
 538
 539         if (!s->framep[VP8_FRAME_PREVIOUS] ||
 540             !s->framep[VP8_FRAME_GOLDEN]) {
 541             av_log(s->avctx, AV_LOG_WARNING, "Discarding interframe without a prior keyframe!\n");
 542             return AVERROR_INVALIDDATA;
 543         }
 544
 545         src =
 546         dst = s->framep[VP8_FRAME_PREVIOUS]->tf.f;
 547
 548         /* preserve the golden frame, write a new previous frame */
 549         if (s->framep[VP8_FRAME_GOLDEN] == s->framep[VP8_FRAME_PREVIOUS]) {
 550             s->framep[VP8_FRAME_PREVIOUS] = vp8_find_free_buffer(s);
 551             if ((ret = vp8_alloc_frame(s, s->framep[VP8_FRAME_PREVIOUS], 1)) < 0)
 552                 return ret;
 553
 554             dst = s->framep[VP8_FRAME_PREVIOUS]->tf.f;
 555
 556             copy_chroma(dst, src, width, height);
 557         }
 558
 559         fade(dst->data[0], dst->linesize[0],
 560              src->data[0], src->linesize[0],
 561              width, height, alpha, beta);
 562     }
 563
 564     return 0;
 565 }
 566
/**
 * Parse a VP7 frame header.
 *
 * Reads the uncompressed leading bytes, sets up the header range coder on the
 * first partition and the coefficient range coder on the remainder, parses
 * header sections A-J (marked below), updates the decoder dimensions when
 * needed, and finally applies the fade to the previous frame.
 *
 * @param s        decoder context
 * @param buf      start of the frame data
 * @param buf_size number of bytes available at @p buf
 * @return 0 on success, a negative AVERROR code on failure
 */
static int vp7_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
{
    VPXRangeCoder *c = &s->c;
    int part1_size, hscale, vscale, i, j, ret;
    /* Inter frames carry no dimensions; keep the current ones by default. */
    int width  = s->avctx->width;
    int height = s->avctx->height;
    int alpha = 0;
    int beta  = 0;
    int fade_present = 1;

    if (buf_size < 4) {
        return AVERROR_INVALIDDATA;
    }

    /* First byte: bit 0 clear = keyframe, bits 1-3 = profile. */
    s->profile = (buf[0] >> 1) & 7;
    if (s->profile > 1) {
        avpriv_request_sample(s->avctx, "Unknown profile %d", s->profile);
        return AVERROR_INVALIDDATA;
    }

    s->keyframe  = !(buf[0] & 1);
    s->invisible = 0;
    part1_size   = AV_RL24(buf) >> 4;

    /* The fixed-size prefix is 4 bytes for profile 0 and 3 for profile 1. */
    if (buf_size < 4 - s->profile + part1_size) {
        av_log(s->avctx, AV_LOG_ERROR, "Buffer size %d is too small, needed : %d\n", buf_size, 4 - s->profile + part1_size);
        return AVERROR_INVALIDDATA;
    }

    buf      += 4 - s->profile;
    buf_size -= 4 - s->profile;

    memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab, sizeof(s->put_pixels_tab));

    ret = ff_vpx_init_range_decoder(c, buf, part1_size);
    if (ret < 0)
        return ret;
    /* Whatever follows the first partition is coefficient data. */
    buf      += part1_size;
    buf_size -= part1_size;

    /* A. Dimension information (keyframes only) */
    if (s->keyframe) {
        width  = vp89_rac_get_uint(c, 12);
        height = vp89_rac_get_uint(c, 12);
        hscale = vp89_rac_get_uint(c, 2);
        vscale = vp89_rac_get_uint(c, 2);
        if (hscale || vscale)
            avpriv_request_sample(s->avctx, "Upscaling");

        /* A keyframe resets all persistent probability and feature state. */
        s->update_golden = s->update_altref = VP8_FRAME_CURRENT;
        vp78_reset_probability_tables(s);
        memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter,
               sizeof(s->prob->pred16x16));
        memcpy(s->prob->pred8x8c, vp8_pred8x8c_prob_inter,
               sizeof(s->prob->pred8x8c));
        for (i = 0; i < 2; i++)
            memcpy(s->prob->mvc[i], vp7_mv_default_prob[i],
                   sizeof(vp7_mv_default_prob[i]));
        memset(&s->segmentation, 0, sizeof(s->segmentation));
        memset(&s->lf_delta, 0, sizeof(s->lf_delta));
        memcpy(s->prob[0].scan, ff_zigzag_scan, sizeof(s->prob[0].scan));
    }

    if (s->keyframe || s->profile > 0)
        memset(s->inter_dc_pred, 0 , sizeof(s->inter_dc_pred));

    /* B. Decoding information for all four macroblock-level features */
    for (i = 0; i < 4; i++) {
        s->feature_enabled[i] = vp89_rac_get(c);
        if (s->feature_enabled[i]) {
             s->feature_present_prob[i] = vp89_rac_get_uint(c, 8);

             for (j = 0; j < 3; j++)
                 s->feature_index_prob[i][j] =
                     vp89_rac_get(c) ? vp89_rac_get_uint(c, 8) : 255;

             /* A value width of 0 means this feature carries no values. */
             if (vp7_feature_value_size[s->profile][i])
                 for (j = 0; j < 4; j++)
                     s->feature_value[i][j] =
                        vp89_rac_get(c) ? vp89_rac_get_uint(c, vp7_feature_value_size[s->profile][i]) : 0;
        }
    }

    /* These are forced off here for VP7. */
    s->segmentation.enabled    = 0;
    s->segmentation.update_map = 0;
    s->lf_delta.enabled        = 0;

    /* VP7 uses a single coefficient partition: the rest of the buffer. */
    s->num_coeff_partitions = 1;
    ret = ff_vpx_init_range_decoder(&s->coeff_partition[0], buf, buf_size);
    if (ret < 0)
        return ret;

    if (!s->macroblocks_base || /* first frame */
        width != s->avctx->width || height != s->avctx->height ||
        (width + 15) / 16 != s->mb_width || (height + 15) / 16 != s->mb_height) {
        if ((ret = vp7_update_dimensions(s, width, height)) < 0)
            return ret;
    }

    /* C. Dequantization indices */
    vp7_get_quants(s);

    /* D. Golden frame update flag (a Flag) for interframes only */
    if (!s->keyframe) {
        s->update_golden = vp89_rac_get(c) ? VP8_FRAME_CURRENT : VP8_FRAME_NONE;
        s->sign_bias[VP8_FRAME_GOLDEN] = 0;
    }

    s->update_last          = 1;
    s->update_probabilities = 1;

    if (s->profile > 0) {
        s->update_probabilities = vp89_rac_get(c);
        /* Back up the probabilities when this frame's updates are not kept. */
        if (!s->update_probabilities)
            s->prob[1] = s->prob[0];

        if (!s->keyframe)
            fade_present = vp89_rac_get(c);
    }

    if (vpx_rac_is_end(c))
        return AVERROR_INVALIDDATA;
    /* E. Fading information for previous frame */
    if (fade_present && vp89_rac_get(c)) {
        /* Both values are signed 8-bit quantities. */
        alpha = (int8_t) vp89_rac_get_uint(c, 8);
        beta  = (int8_t) vp89_rac_get_uint(c, 8);
    }

    /* F. Loop filter type */
    if (!s->profile)
        s->filter.simple = vp89_rac_get(c);

    /* G. DCT coefficient ordering specification */
    if (vp89_rac_get(c))
        for (i = 1; i < 16; i++)
            s->prob[0].scan[i] = ff_zigzag_scan[vp89_rac_get_uint(c, 4)];

    /* H. Loop filter levels  */
    if (s->profile > 0)
        s->filter.simple = vp89_rac_get(c);
    s->filter.level     = vp89_rac_get_uint(c, 6);
    s->filter.sharpness = vp89_rac_get_uint(c, 3);

    /* I. DCT coefficient probability update; 13.3 Token Probability Updates */
    vp78_update_probability_tables(s);

    s->mbskip_enabled = 0;

    /* J. The remaining frame header data occurs ONLY FOR INTERFRAMES */
    if (!s->keyframe) {
        s->prob->intra  = vp89_rac_get_uint(c, 8);
        s->prob->last   = vp89_rac_get_uint(c, 8);
        vp78_update_pred16x16_pred8x8_mvc_probabilities(s, VP7_MVC_SIZE);
    }

    /* Reject headers that ran past the end of the first partition. */
    if (vpx_rac_is_end(c))
        return AVERROR_INVALIDDATA;

    if ((ret = vp7_fade_frame(s, alpha, beta)) < 0)
        return ret;

    return 0;
}
 730
/**
 * Parse a VP8 frame header.
 *
 * Reads the 3-byte frame tag (plus the 7-byte start code / dimension block on
 * keyframes), sets up the header range coder, parses segmentation, loop
 * filter, partition, quantizer, reference-update and probability information,
 * and records the range coder state reached at the end of the header.
 *
 * @param s        decoder context
 * @param buf      start of the frame data
 * @param buf_size number of bytes available at @p buf
 * @return 0 on success, a negative AVERROR code on failure
 */
static int vp8_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
{
    VPXRangeCoder *c = &s->c;
    int header_size, hscale, vscale, ret;
    /* Inter frames carry no dimensions; keep the current ones by default. */
    int width  = s->avctx->width;
    int height = s->avctx->height;

    if (buf_size < 3) {
        av_log(s->avctx, AV_LOG_ERROR, "Insufficent data (%d) for header\n", buf_size);
        return AVERROR_INVALIDDATA;
    }

    /* Frame tag: bit 0 clear = keyframe, bits 1-3 = profile, bit 4 clear =
     * invisible, remaining bits of the 24-bit value = header partition size. */
    s->keyframe  = !(buf[0] & 1);
    s->profile   =  (buf[0]>>1) & 7;
    s->invisible = !(buf[0] & 0x10);
    header_size  = AV_RL24(buf) >> 5;
    buf      += 3;
    buf_size -= 3;

    s->header_partition_size = header_size;

    if (s->profile > 3)
        av_log(s->avctx, AV_LOG_WARNING, "Unknown profile %d\n", s->profile);

    if (!s->profile)
        memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab,
               sizeof(s->put_pixels_tab));
    else    // profile 1-3 use bilinear, 4+ aren't defined so whatever
        memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_bilinear_pixels_tab,
               sizeof(s->put_pixels_tab));

    /* Keyframes carry 7 extra bytes before the header partition. */
    if (header_size > buf_size - 7 * s->keyframe) {
        av_log(s->avctx, AV_LOG_ERROR, "Header size larger than data provided\n");
        return AVERROR_INVALIDDATA;
    }

    if (s->keyframe) {
        if (AV_RL24(buf) != 0x2a019d) {
            av_log(s->avctx, AV_LOG_ERROR,
                   "Invalid start code 0x%x\n", AV_RL24(buf));
            return AVERROR_INVALIDDATA;
        }
        /* 14-bit dimensions; the top two bits of each 16-bit field hold the
         * scaling value. */
        width     = AV_RL16(buf + 3) & 0x3fff;
        height    = AV_RL16(buf + 5) & 0x3fff;
        hscale    = buf[4] >> 6;
        vscale    = buf[6] >> 6;
        buf      += 7;
        buf_size -= 7;

        if (hscale || vscale)
            avpriv_request_sample(s->avctx, "Upscaling");

        /* A keyframe resets all persistent probability and delta state. */
        s->update_golden = s->update_altref = VP8_FRAME_CURRENT;
        vp78_reset_probability_tables(s);
        memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter,
               sizeof(s->prob->pred16x16));
        memcpy(s->prob->pred8x8c, vp8_pred8x8c_prob_inter,
               sizeof(s->prob->pred8x8c));
        memcpy(s->prob->mvc, vp8_mv_default_prob,
               sizeof(s->prob->mvc));
        memset(&s->segmentation, 0, sizeof(s->segmentation));
        memset(&s->lf_delta, 0, sizeof(s->lf_delta));
    }

    ret = ff_vpx_init_range_decoder(c, buf, header_size);
    if (ret < 0)
        return ret;
    /* buf now points at the partition size table / coefficient data. */
    buf      += header_size;
    buf_size -= header_size;

    if (s->keyframe) {
        s->colorspace = vp89_rac_get(c);
        if (s->colorspace)
            av_log(s->avctx, AV_LOG_WARNING, "Unspecified colorspace\n");
        s->fullrange = vp89_rac_get(c);
    }

    if ((s->segmentation.enabled = vp89_rac_get(c)))
        parse_segment_info(s);
    else
        s->segmentation.update_map = 0; // FIXME: move this to some init function?

    s->filter.simple    = vp89_rac_get(c);
    s->filter.level     = vp89_rac_get_uint(c, 6);
    s->filter.sharpness = vp89_rac_get_uint(c, 3);

    if ((s->lf_delta.enabled = vp89_rac_get(c))) {
        s->lf_delta.update = vp89_rac_get(c);
        if (s->lf_delta.update)
            update_lf_deltas(s);
    }

    /* Any non-zero result from setup_partitions() is reported as invalid
     * data. */
    if (setup_partitions(s, buf, buf_size)) {
        av_log(s->avctx, AV_LOG_ERROR, "Invalid partitions\n");
        return AVERROR_INVALIDDATA;
    }

    if (!s->macroblocks_base || /* first frame */
        width != s->avctx->width || height != s->avctx->height ||
        (width+15)/16 != s->mb_width || (height+15)/16 != s->mb_height)
        if ((ret = vp8_update_dimensions(s, width, height)) < 0)
            return ret;

    vp8_get_quants(s);

    if (!s->keyframe) {
        update_refs(s);
        s->sign_bias[VP8_FRAME_GOLDEN] = vp89_rac_get(c);
        s->sign_bias[VP8_FRAME_ALTREF] = vp89_rac_get(c);
    }

    // if we aren't saving this frame's probabilities for future frames,
    // make a copy of the current probabilities
    if (!(s->update_probabilities = vp89_rac_get(c)))
        s->prob[1] = s->prob[0];

    /* Keyframes always update the last frame; no bit is read for them. */
    s->update_last = s->keyframe || vp89_rac_get(c);

    vp78_update_probability_tables(s);

    if ((s->mbskip_enabled = vp89_rac_get(c)))
        s->prob->mbskip = vp89_rac_get_uint(c, 8);

    if (!s->keyframe) {
        s->prob->intra  = vp89_rac_get_uint(c, 8);
        s->prob->last   = vp89_rac_get_uint(c, 8);
        s->prob->golden = vp89_rac_get_uint(c, 8);
        vp78_update_pred16x16_pred8x8_mvc_probabilities(s, VP8_MVC_SIZE);
    }

    // Record the entropy coder state here so that hwaccels can use it.
    s->c.code_word = vpx_rac_renorm(&s->c);
    s->coder_state_at_header_end.input     = s->c.buffer - (-s->c.bits / 8);
    s->coder_state_at_header_end.range     = s->c.high;
    s->coder_state_at_header_end.value     = s->c.code_word >> 16;
    s->coder_state_at_header_end.bit_count = -s->c.bits % 8;

    return 0;
}
 870
 871 static av_always_inline
 872 void clamp_mv(const VP8mvbounds *s, VP8mv *dst, const VP8mv *src)
 873 {
 874     dst->x = av_clip(src->x, av_clip(s->mv_min.x, INT16_MIN, INT16_MAX),
 875                              av_clip(s->mv_max.x, INT16_MIN, INT16_MAX));
 876     dst->y = av_clip(src->y, av_clip(s->mv_min.y, INT16_MIN, INT16_MAX),
 877                              av_clip(s->mv_max.y, INT16_MIN, INT16_MAX));
 878 }
 879
/**
 * Motion vector coding, 17.1.
 *
 * Decodes one motion vector component.
 *
 * @param c   range coder to read from
 * @param p   probability table for this component: p[0] selects the long
 *            form, p[1] is the sign, p[2..] the small-value tree, and
 *            p[9 + i] the bit i of the long form
 * @param vp7 non-zero for VP7 (long form uses bits 0-7), zero for VP8
 *            (bits 0-9)
 * @return the signed component value
 */
static av_always_inline int read_mv_component(VPXRangeCoder *c, const uint8_t *p, int vp7)
{
    int bit, x = 0;

    if (vpx_rac_get_prob_branchy(c, p[0])) {
        int i;

        /* Long form: bits 0-2 first, lowest bit first ... */
        for (i = 0; i < 3; i++)
            x += vpx_rac_get_prob(c, p[9 + i]) << i;
        /* ... then the top bit down to bit 4 ... */
        for (i = (vp7 ? 7 : 9); i > 3; i--)
            x += vpx_rac_get_prob(c, p[9 + i]) << i;
        /* ... and bit 3 last: it is only read when some bit above it is
         * set; otherwise it is taken as set without reading. */
        if (!(x & (vp7 ? 0xF0 : 0xFFF0)) || vpx_rac_get_prob(c, p[12]))
            x += 8;
    } else {
        // small_mvtree
        /* Three tree levels yielding a 3-bit value; the probability used at
         * each level depends on the bits decoded so far. */
        const uint8_t *ps = p + 2;
        bit = vpx_rac_get_prob(c, *ps);
        ps += 1 + 3 * bit;
        x  += 4 * bit;
        bit = vpx_rac_get_prob(c, *ps);
        ps += 1 + bit;
        x  += 2 * bit;
        x  += vpx_rac_get_prob(c, *ps);
    }

    /* The sign is only coded for non-zero magnitudes. */
    return (x && vpx_rac_get_prob(c, p[1])) ? -x : x;
}
 910
/**
 * Read one motion vector component using the VP7 variant of
 * read_mv_component() (vp7 argument fixed to 1).
 *
 * @param c range coder to read from
 * @param p probabilities for this component
 * @return the signed component value
 */
static int vp7_read_mv_component(VPXRangeCoder *c, const uint8_t *p)
{
    return read_mv_component(c, p, 1);
}
 915
/**
 * Read one motion vector component using the VP8 variant of
 * read_mv_component() (vp7 argument fixed to 0).
 *
 * @param c range coder to read from
 * @param p probabilities for this component
 * @return the signed component value
 */
static int vp8_read_mv_component(VPXRangeCoder *c, const uint8_t *p)
{
    return read_mv_component(c, p, 0);
}
 920
 921 static av_always_inline
 922 const uint8_t *get_submv_prob(uint32_t left, uint32_t top, int is_vp7)
 923 {
 924     if (is_vp7)
 925         return vp7_submv_prob;
 926
 927     if (left == top)
 928         return vp8_submv_prob[4 - !!left];
 929     if (!top)
 930         return vp8_submv_prob[2];
 931     return vp8_submv_prob[1 - !!left];
 932 }
 933
 934 /**
 935  * Split motion vector prediction, 16.4.
 936  * @returns the number of motion vectors parsed (2, 4 or 16)
 937  */
 938 static av_always_inline
 939 int decode_splitmvs(const VP8Context *s, VPXRangeCoder *c, VP8Macroblock *mb,
 940                     int layout, int is_vp7)
 941 {
 942     int part_idx;
 943     int n, num;
 944     const VP8Macroblock *top_mb;
 945     const VP8Macroblock *left_mb = &mb[-1];
 946     const uint8_t *mbsplits_left = vp8_mbsplits[left_mb->partitioning];
 947     const uint8_t *mbsplits_top, *mbsplits_cur, *firstidx;
 948     const VP8mv *top_mv;
 949     const VP8mv *left_mv = left_mb->bmv;
 950     const VP8mv *cur_mv  = mb->bmv;
 951
 952     if (!layout) // layout is inlined, s->mb_layout is not
 953         top_mb = &mb[2];
 954     else
 955         top_mb = &mb[-s->mb_width - 1];
 956     mbsplits_top = vp8_mbsplits[top_mb->partitioning];
 957     top_mv       = top_mb->bmv;
 958
 959     if (vpx_rac_get_prob_branchy(c, vp8_mbsplit_prob[0])) {
 960         if (vpx_rac_get_prob_branchy(c, vp8_mbsplit_prob[1]))
 961             part_idx = VP8_SPLITMVMODE_16x8 + vpx_rac_get_prob(c, vp8_mbsplit_prob[2]);
 962         else
 963             part_idx = VP8_SPLITMVMODE_8x8;
 964     } else {
 965         part_idx = VP8_SPLITMVMODE_4x4;
 966     }
 967
 968     num              = vp8_mbsplit_count[part_idx];
 969     mbsplits_cur     = vp8_mbsplits[part_idx],
 970     firstidx         = vp8_mbfirstidx[part_idx];
 971     mb->partitioning = part_idx;
 972
 973     for (n = 0; n < num; n++) {
 974         int k = firstidx[n];
 975         uint32_t left, above;
 976         const uint8_t *submv_prob;
 977
 978         if (!(k & 3))
 979             left = AV_RN32A(&left_mv[mbsplits_left[k + 3]]);
 980         else
 981             left = AV_RN32A(&cur_mv[mbsplits_cur[k - 1]]);
 982         if (k <= 3)
 983             above = AV_RN32A(&top_mv[mbsplits_top[k + 12]]);
 984         else
 985             above = AV_RN32A(&cur_mv[mbsplits_cur[k - 4]]);
 986
 987         submv_prob = get_submv_prob(left, above, is_vp7);
 988
 989         if (vpx_rac_get_prob_branchy(c, submv_prob[0])) {
 990             if (vpx_rac_get_prob_branchy(c, submv_prob[1])) {
 991                 if (vpx_rac_get_prob_branchy(c, submv_prob[2])) {
 992                     mb->bmv[n].y = mb->mv.y +
 993                                    read_mv_component(c, s->prob->mvc[0], is_vp7);
 994                     mb->bmv[n].x = mb->mv.x +
 995                                    read_mv_component(c, s->prob->mvc[1], is_vp7);
 996                 } else {
 997                     AV_ZERO32(&mb->bmv[n]);
 998                 }
 999             } else {
1000                 AV_WN32A(&mb->bmv[n], above);
1001             }
1002         } else {
1003             AV_WN32A(&mb->bmv[n], left);
1004         }
1005     }
1006
1007     return num;
1008 }
1009
/**
 * The vp7 reference decoder uses a padding macroblock column (added to right
 * edge of the frame) to guard against illegal macroblock offsets. The
 * algorithm has bugs that permit offsets to straddle the padding column.
 * This function replicates those bugs.
 *
 * The target position is computed as a linear index into a grid that is
 * mb_width + 1 macroblocks wide. It is rejected when the index is below
 * boundary or lands in the last (padding) column of that grid.
 *
 * @param mb_x     x address of the current macroblock
 * @param mb_y     y address of the current macroblock
 * @param mb_width frame width in macroblocks (without the padding column)
 * @param xoffset  horizontal offset to apply, in macroblocks
 * @param yoffset  vertical offset to apply, in macroblocks
 * @param boundary smallest linear index accepted as legal
 * @param[out] edge_x macroblock x address
 * @param[out] edge_y macroblock y address
 *
 * @return macroblock offset legal (boolean); the outputs are only written
 *         when the offset is legal
 */
static int vp7_calculate_mb_offset(int mb_x, int mb_y, int mb_width,
                                   int xoffset, int yoffset, int boundary,
                                   int *edge_x, int *edge_y)
{
    const int stride = mb_width + 1;
    const int pos    = (mb_y + yoffset) * stride + mb_x + xoffset;
    int column;

    if (pos < boundary)
        return 0;

    column = pos % stride;
    if (column == stride - 1)
        return 0;

    *edge_y = pos / stride;
    *edge_x = column;
    return 1;
}
1033
1034 static const VP8mv *get_bmv_ptr(const VP8Macroblock *mb, int subblock)
1035 {
1036     return &mb->bmv[mb->mode == VP8_MVMODE_SPLIT ? vp8_mbsplits[mb->partitioning][subblock] : 0];
1037 }
1038
/**
 * Decode the motion vector mode and vector(s) of a VP7 inter macroblock.
 *
 * Candidate vectors are collected from the neighbour positions listed in
 * vp7_mv_pred[] and scored; the scores then select the probabilities used to
 * read the mode from the range coder. On return mb->mode, mb->partitioning,
 * mb->mv and mb->bmv[] (bmv[0] only, unless split mode is chosen) are set.
 *
 * @param s      decoder context
 * @param mb     macroblock being decoded
 * @param mb_x   macroblock x address
 * @param mb_y   macroblock y address
 * @param layout macroblock array layout, forwarded to decode_splitmvs()
 */
static av_always_inline
void vp7_decode_mvs(VP8Context *s, VP8Macroblock *mb,
                    int mb_x, int mb_y, int layout)
{
    enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR };
    /* NOTE(review): the VP8_EDGE_* constants are not referenced in this
     * function. */
    enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
    int idx = CNT_ZERO;
    VP8mv near_mv[3];
    uint8_t cnt[3] = { 0 };
    VPXRangeCoder *c = &s->c;
    int i;

    AV_ZERO32(&near_mv[0]);
    AV_ZERO32(&near_mv[1]);
    AV_ZERO32(&near_mv[2]);

    /* Walk the predictor positions. Each one adds its score to the zero,
     * nearest or near bucket; illegal positions and zero vectors count
     * towards the zero bucket. */
    for (i = 0; i < VP7_MV_PRED_COUNT; i++) {
        const VP7MVPred * pred = &vp7_mv_pred[i];
        int edge_x, edge_y;

        if (vp7_calculate_mb_offset(mb_x, mb_y, s->mb_width, pred->xoffset,
                                    pred->yoffset, !s->profile, &edge_x, &edge_y)) {
            /* Locate the neighbouring macroblock; the addressing depends
             * on s->mb_layout. */
            const VP8Macroblock *edge = (s->mb_layout == 1)
                                      ? s->macroblocks_base + 1 + edge_x +
                                        (s->mb_width + 1) * (edge_y + 1)
                                      : s->macroblocks + edge_x +
                                        (s->mb_height - edge_y - 1) * 2;
            uint32_t mv = AV_RN32A(get_bmv_ptr(edge, vp7_mv_pred[i].subblock));
            if (mv) {
                if (AV_RN32A(&near_mv[CNT_NEAREST])) {
                    if (mv == AV_RN32A(&near_mv[CNT_NEAREST])) {
                        idx = CNT_NEAREST;
                    } else if (AV_RN32A(&near_mv[CNT_NEAR])) {
                        /* A third distinct vector is ignored entirely:
                         * the continue also skips the score update. */
                        if (mv != AV_RN32A(&near_mv[CNT_NEAR]))
                            continue;
                        idx = CNT_NEAR;
                    } else {
                        /* second distinct nonzero vector becomes "near" */
                        AV_WN32A(&near_mv[CNT_NEAR], mv);
                        idx = CNT_NEAR;
                    }
                } else {
                    /* first nonzero vector becomes "nearest" */
                    AV_WN32A(&near_mv[CNT_NEAREST], mv);
                    idx = CNT_NEAREST;
                }
            } else {
                idx = CNT_ZERO;
            }
        } else {
            idx = CNT_ZERO;
        }
        cnt[idx] += vp7_mv_pred[i].score;
    }

    mb->partitioning = VP8_SPLITMVMODE_NONE;

    /* Mode tree: zero / nearest / near / (new or split). */
    if (vpx_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_ZERO]][0])) {
        mb->mode = VP8_MVMODE_MV;

        if (vpx_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAREST]][1])) {

            if (vpx_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAR]][2])) {

                /* Base vector for new/split MVs: the better scored of
                 * nearest and near, or zero if the zero bucket beats it. */
                if (cnt[CNT_NEAREST] > cnt[CNT_NEAR])
                    AV_WN32A(&mb->mv, cnt[CNT_ZERO] > cnt[CNT_NEAREST] ? 0 : AV_RN32A(&near_mv[CNT_NEAREST]));
                else
                    AV_WN32A(&mb->mv, cnt[CNT_ZERO] > cnt[CNT_NEAR]    ? 0 : AV_RN32A(&near_mv[CNT_NEAR]));

                if (vpx_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAR]][3])) {
                    mb->mode = VP8_MVMODE_SPLIT;
                    /* the macroblock vector becomes the last partition's */
                    mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout, IS_VP7) - 1];
                } else {
                    /* new vector: coded as a delta on top of the base */
                    mb->mv.y += vp7_read_mv_component(c, s->prob->mvc[0]);
                    mb->mv.x += vp7_read_mv_component(c, s->prob->mvc[1]);
                    mb->bmv[0] = mb->mv;
                }
            } else {
                mb->mv = near_mv[CNT_NEAR];
                mb->bmv[0] = mb->mv;
            }
        } else {
            mb->mv = near_mv[CNT_NEAREST];
            mb->bmv[0] = mb->mv;
        }
    } else {
        mb->mode = VP8_MVMODE_ZERO;
        AV_ZERO32(&mb->mv);
        mb->bmv[0] = mb->mv;
    }
}
1128
/**
 * Decode the motion vector mode and vector(s) of a VP8 inter macroblock.
 *
 * The top, left and top-left neighbours are examined to build a list of
 * candidate vectors and their weights; the weights select the probabilities
 * used to read the mode from the range coder. On return mb->mode,
 * mb->partitioning, mb->mv and mb->bmv[] (bmv[0] only, unless split mode is
 * chosen) are set.
 *
 * @param s         decoder context
 * @param mv_bounds limits used when clamping predicted vectors
 * @param mb        macroblock being decoded; mb->ref_frame must already be set
 * @param mb_x      macroblock x address (not referenced in the body)
 * @param mb_y      macroblock y address (not referenced in the body)
 * @param layout    macroblock array layout; selects where the top and
 *                  top-left neighbours are found relative to mb
 */
static av_always_inline
void vp8_decode_mvs(VP8Context *s, const VP8mvbounds *mv_bounds, VP8Macroblock *mb,
                    int mb_x, int mb_y, int layout)
{
    VP8Macroblock *mb_edge[3] = { 0      /* top */,
                                  mb - 1 /* left */,
                                  0      /* top-left */ };
    enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV };
    enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
    int idx = CNT_ZERO;
    int cur_sign_bias = s->sign_bias[mb->ref_frame];
    const int8_t *sign_bias = s->sign_bias;
    /* near_mv[3] is left uninitialized here; it is only read below when
     * cnt[CNT_SPLITMV] is nonzero, which requires idx to have reached 3 and
     * therefore near_mv[3] to have been written by MV_EDGE_CHECK. */
    VP8mv near_mv[4];
    uint8_t cnt[4] = { 0 };
    VPXRangeCoder *c = &s->c;

    if (!layout) { // layout is inlined (s->mb_layout is not)
        mb_edge[0] = mb + 2;
        mb_edge[2] = mb + 1;
    } else {
        mb_edge[0] = mb - s->mb_width - 1;
        mb_edge[2] = mb - s->mb_width - 2;
    }

    AV_ZERO32(&near_mv[0]);
    AV_ZERO32(&near_mv[1]);
    AV_ZERO32(&near_mv[2]);

    /* Process MB on top, left and top-left */
    /* For neighbour n (ignored when it is intra, i.e. its reference is
     * VP8_FRAME_CURRENT): a zero vector adds to cnt[CNT_ZERO]; a nonzero
     * vector is negated if its reference has a different sign bias, then
     * either merged with the most recent candidate (if equal) or appended
     * as a new one. Top and left weigh 2, top-left weighs 1. */
#define MV_EDGE_CHECK(n)                                                      \
    {                                                                         \
        const VP8Macroblock *edge = mb_edge[n];                               \
        int edge_ref = edge->ref_frame;                                       \
        if (edge_ref != VP8_FRAME_CURRENT) {                                 \
            uint32_t mv = AV_RN32A(&edge->mv);                                \
            if (mv) {                                                         \
                if (cur_sign_bias != sign_bias[edge_ref]) {                   \
                    /* SWAR negate of the values in mv. */                    \
                    mv = ~mv;                                                 \
                    mv = ((mv & 0x7fff7fff) +                                 \
                          0x00010001) ^ (mv & 0x80008000);                    \
                }                                                             \
                if (!n || mv != AV_RN32A(&near_mv[idx]))                      \
                    AV_WN32A(&near_mv[++idx], mv);                            \
                cnt[idx] += 1 + (n != 2);                                     \
            } else                                                            \
                cnt[CNT_ZERO] += 1 + (n != 2);                                \
        }                                                                     \
    }

    MV_EDGE_CHECK(0)
    MV_EDGE_CHECK(1)
    MV_EDGE_CHECK(2)

    mb->partitioning = VP8_SPLITMVMODE_NONE;
    /* Mode tree: zero / nearest / near / (new or split). */
    if (vpx_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_ZERO]][0])) {
        mb->mode = VP8_MVMODE_MV;

        /* If we have three distinct MVs, merge first and last if they're the same */
        if (cnt[CNT_SPLITMV] &&
            AV_RN32A(&near_mv[1 + VP8_EDGE_TOP]) == AV_RN32A(&near_mv[1 + VP8_EDGE_TOPLEFT]))
            cnt[CNT_NEAREST] += 1;

        /* Swap near and nearest if necessary */
        if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
            FFSWAP(uint8_t,     cnt[CNT_NEAREST],     cnt[CNT_NEAR]);
            FFSWAP(VP8mv,   near_mv[CNT_NEAREST], near_mv[CNT_NEAR]);
        }

        if (vpx_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) {
            if (vpx_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAR]][2])) {
                /* Choose the best mv out of 0,0 and the nearest mv */
                clamp_mv(mv_bounds, &mb->mv, &near_mv[CNT_ZERO + (cnt[CNT_NEAREST] >= cnt[CNT_ZERO])]);
                /* From here on cnt[CNT_SPLITMV] is reused as the context for
                 * the split decision: left and top split neighbours count
                 * 2 each, a split top-left neighbour counts 1. */
                cnt[CNT_SPLITMV] = ((mb_edge[VP8_EDGE_LEFT]->mode    == VP8_MVMODE_SPLIT) +
                                    (mb_edge[VP8_EDGE_TOP]->mode     == VP8_MVMODE_SPLIT)) * 2 +
                                    (mb_edge[VP8_EDGE_TOPLEFT]->mode == VP8_MVMODE_SPLIT);

                if (vpx_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_SPLITMV]][3])) {
                    mb->mode = VP8_MVMODE_SPLIT;
                    /* the macroblock vector becomes the last partition's */
                    mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout, IS_VP8) - 1];
                } else {
                    /* new vector: coded as a delta on top of the base */
                    mb->mv.y  += vp8_read_mv_component(c, s->prob->mvc[0]);
                    mb->mv.x  += vp8_read_mv_component(c, s->prob->mvc[1]);
                    mb->bmv[0] = mb->mv;
                }
            } else {
                clamp_mv(mv_bounds, &mb->mv, &near_mv[CNT_NEAR]);
                mb->bmv[0] = mb->mv;
            }
        } else {
            clamp_mv(mv_bounds, &mb->mv, &near_mv[CNT_NEAREST]);
            mb->bmv[0] = mb->mv;
        }
    } else {
        mb->mode = VP8_MVMODE_ZERO;
        AV_ZERO32(&mb->mv);
        mb->bmv[0] = mb->mv;
    }
}
1228
1229 static av_always_inline
1230 void decode_intra4x4_modes(VP8Context *s, VPXRangeCoder *c, VP8Macroblock *mb,
1231                            int mb_x, int keyframe, int layout)
1232 {
1233     uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
1234
1235     if (layout) {
1236         VP8Macroblock *mb_top = mb - s->mb_width - 1;
1237         memcpy(mb->intra4x4_pred_mode_top, mb_top->intra4x4_pred_mode_top, 4);
1238     }
1239     if (keyframe) {
1240         int x, y;
1241         uint8_t *top;
1242         uint8_t *const left = s->intra4x4_pred_mode_left;
1243         if (layout)
1244             top = mb->intra4x4_pred_mode_top;
1245         else
1246             top = s->intra4x4_pred_mode_top + 4 * mb_x;
1247         for (y = 0; y < 4; y++) {
1248             for (x = 0; x < 4; x++) {
1249                 const uint8_t *ctx;
1250                 ctx       = vp8_pred4x4_prob_intra[top[x]][left[y]];
1251                 *intra4x4 = vp89_rac_get_tree(c, vp8_pred4x4_tree, ctx);
1252                 left[y]   = top[x] = *intra4x4;
1253                 intra4x4++;
1254             }
1255         }
1256     } else {
1257         int i;
1258         for (i = 0; i < 16; i++)
1259             intra4x4[i] = vp89_rac_get_tree(c, vp8_pred4x4_tree,
1260                                             vp8_pred4x4_prob_inter);
1261     }
1262 }
1263
/**
 * Decode the per-macroblock header: segment, skip flag, prediction mode,
 * reference frame and (for inter macroblocks) motion vectors.
 *
 * @param s         decoder context
 * @param mv_bounds motion vector limits, forwarded to vp8_decode_mvs()
 * @param mb        macroblock to fill in
 * @param mb_x      macroblock x address
 * @param mb_y      macroblock y address
 * @param segment   in/out: segment id of this macroblock position; rewritten
 *                  for VP7 (always 0), when the segment map is updated, or
 *                  when segmentation is enabled and ref is given
 * @param ref       optional (may be NULL) segment id to inherit when
 *                  segmentation is enabled but the map is not updated
 * @param layout    macroblock array layout, forwarded to the helpers
 * @param is_vp7    nonzero when decoding VP7
 */
static av_always_inline
void decode_mb_mode(VP8Context *s, const VP8mvbounds *mv_bounds,
                    VP8Macroblock *mb, int mb_x, int mb_y,
                    uint8_t *segment, const uint8_t *ref, int layout, int is_vp7)
{
    VPXRangeCoder *c = &s->c;
    static const char * const vp7_feature_name[] = { "q-index",
                                                     "lf-delta",
                                                     "partial-golden-update",
                                                     "blit-pitch" };
    if (is_vp7) {
        int i;
        *segment = 0;
        /* VP7 per-macroblock features are parsed (to keep the bitstream
         * position right) but only reported, not applied. */
        for (i = 0; i < 4; i++) {
            if (s->feature_enabled[i]) {
                if (vpx_rac_get_prob_branchy(c, s->feature_present_prob[i])) {
                      int index = vp89_rac_get_tree(c, vp7_feature_index_tree,
                                                    s->feature_index_prob[i]);
                      av_log(s->avctx, AV_LOG_WARNING,
                             "Feature %s present in macroblock (value 0x%x)\n",
                             vp7_feature_name[i], s->feature_value[i][index]);
                }
           }
        }
    } else if (s->segmentation.update_map) {
        /* two-level tree giving a segment id in 0..3 */
        int bit  = vpx_rac_get_prob(c, s->prob->segmentid[0]);
        *segment = vpx_rac_get_prob(c, s->prob->segmentid[1+bit]) + 2*bit;
    } else if (s->segmentation.enabled)
        *segment = ref ? *ref : *segment;
    mb->segment = *segment;

    mb->skip = s->mbskip_enabled ? vpx_rac_get_prob(c, s->prob->mbskip) : 0;

    if (s->keyframe) {
        mb->mode = vp89_rac_get_tree(c, vp8_pred16x16_tree_intra,
                                     vp8_pred16x16_prob_intra);

        if (mb->mode == MODE_I4x4) {
            decode_intra4x4_modes(s, c, mb, mb_x, 1, layout);
        } else {
            /* Not 4x4-coded: fill the top/left 4x4 mode contexts with the
             * 4x4 mode that corresponds to the 16x16 mode, replicated into
             * all four bytes. */
            const uint32_t modes = (is_vp7 ? vp7_pred4x4_mode
                                           : vp8_pred4x4_mode)[mb->mode] * 0x01010101u;
            if (s->mb_layout)
                AV_WN32A(mb->intra4x4_pred_mode_top, modes);
            else
                AV_WN32A(s->intra4x4_pred_mode_top + 4 * mb_x, modes);
            AV_WN32A(s->intra4x4_pred_mode_left, modes);
        }

        mb->chroma_pred_mode = vp89_rac_get_tree(c, vp8_pred8x8c_tree,
                                                 vp8_pred8x8c_prob_intra);
        mb->ref_frame        = VP8_FRAME_CURRENT;
    } else if (vpx_rac_get_prob_branchy(c, s->prob->intra)) {
        // inter MB, 16.2
        if (vpx_rac_get_prob_branchy(c, s->prob->last))
            /* the golden/altref choice is only read for VP8 */
            mb->ref_frame =
                (!is_vp7 && vpx_rac_get_prob(c, s->prob->golden)) ? VP8_FRAME_ALTREF
                                                                  : VP8_FRAME_GOLDEN;
        else
            mb->ref_frame = VP8_FRAME_PREVIOUS;
        s->ref_count[mb->ref_frame - 1]++;

        // motion vectors, 16.3
        if (is_vp7)
            vp7_decode_mvs(s, mb, mb_x, mb_y, layout);
        else
            vp8_decode_mvs(s, mv_bounds, mb, mb_x, mb_y, layout);
    } else {
        // intra MB, 16.1
        mb->mode = vp89_rac_get_tree(c, vp8_pred16x16_tree_inter,
                                     s->prob->pred16x16);

        if (mb->mode == MODE_I4x4)
            decode_intra4x4_modes(s, c, mb, mb_x, 0, layout);

        mb->chroma_pred_mode = vp89_rac_get_tree(c, vp8_pred8x8c_tree,
                                                 s->prob->pred8x8c);
        mb->ref_frame        = VP8_FRAME_CURRENT;
        mb->partitioning     = VP8_SPLITMVMODE_NONE;
        /* an intra macroblock presents a zero vector to later MV prediction */
        AV_ZERO32(&mb->bmv[0]);
    }
}
1346
/**
 * Decode the DCT tokens of one block, starting after the first end-of-block
 * decision (which the caller has already consumed), and store the
 * dequantized coefficients.
 *
 * @param r     arithmetic bitstream reader context
 * @param block destination for block coefficients
 * @param probs probabilities to use when reading trees from the bitstream
 * @param i     initial coeff index, 0 unless a separate DC block is coded
 * @param token_prob probabilities for the first token to be read
 * @param qmul  array holding the dc/ac dequant factor at position 0/1
 * @param scan  maps the coefficient index to its position in block
 * @param vp7   nonzero: an end-of-block decision is read after every zero
 *              token; zero: it is skipped after a zero token
 *
 * @return 0 if no coeffs were decoded
 *         otherwise, the index of the last coeff decoded plus one
 */
static av_always_inline
int decode_block_coeffs_internal(VPXRangeCoder *r, int16_t block[16],
                                 uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
                                 int i, const uint8_t *token_prob, const int16_t qmul[2],
                                 const uint8_t scan[16], int vp7)
{
    /* work on a local copy of the coder state; written back on exit */
    VPXRangeCoder c = *r;
    /* the caller already tested EOB for the first token */
    goto skip_eob;
    do {
        int coeff;
restart:
        if (!vpx_rac_get_prob_branchy(&c, token_prob[0]))   // DCT_EOB
            break;

skip_eob:
        if (!vpx_rac_get_prob_branchy(&c, token_prob[1])) { // DCT_0
            if (++i == 16)
                break; // invalid input; blocks should end with EOB
            /* context 0: the previous token was a zero */
            token_prob = probs[i][0];
            if (vp7)
                goto restart;
            goto skip_eob;
        }

        if (!vpx_rac_get_prob_branchy(&c, token_prob[2])) { // DCT_1
            coeff = 1;
            /* context 1: the previous token was a one */
            token_prob = probs[i + 1][1];
        } else {
            if (!vpx_rac_get_prob_branchy(&c, token_prob[3])) { // DCT 2,3,4
                coeff = vpx_rac_get_prob_branchy(&c, token_prob[4]);
                if (coeff)
                    coeff += vpx_rac_get_prob(&c, token_prob[5]);
                coeff += 2;
            } else {
                // DCT_CAT*
                if (!vpx_rac_get_prob_branchy(&c, token_prob[6])) {
                    if (!vpx_rac_get_prob_branchy(&c, token_prob[7])) { // DCT_CAT1
                        coeff = 5 + vpx_rac_get_prob(&c, vp8_dct_cat1_prob[0]);
                    } else {                                    // DCT_CAT2
                        coeff  = 7;
                        coeff += vpx_rac_get_prob(&c, vp8_dct_cat2_prob[0]) << 1;
                        coeff += vpx_rac_get_prob(&c, vp8_dct_cat2_prob[1]);
                    }
                } else {    // DCT_CAT3 and up
                    int a   = vpx_rac_get_prob(&c, token_prob[8]);
                    int b   = vpx_rac_get_prob(&c, token_prob[9 + a]);
                    int cat = (a << 1) + b;
                    /* category base value: 11, 19, 35 or 67 */
                    coeff  = 3 + (8 << cat);
                    coeff += vp8_rac_get_coeff(&c, ff_vp8_dct_cat_prob[cat]);
                }
            }
            /* context 2: the previous token was larger than one */
            token_prob = probs[i + 1][2];
        }
        /* sign bit, then dequantize: qmul[0] for index 0, qmul[1] otherwise */
        block[scan[i]] = (vp89_rac_get(&c) ? -coeff : coeff) * qmul[!!i];
    } while (++i < 16);

    *r = c;
    return i;
}
1416
1417 static av_always_inline
1418 int inter_predict_dc(int16_t block[16], int16_t pred[2])
1419 {
1420     int16_t dc = block[0];
1421     int ret = 0;
1422
1423     if (pred[1] > 3) {
1424         dc += pred[0];
1425         ret = 1;
1426     }
1427
1428     if (!pred[0] | !dc | ((int32_t)pred[0] ^ (int32_t)dc) >> 31) {
1429         block[0] = pred[0] = dc;
1430         pred[1] = 0;
1431     } else {
1432         if (pred[0] == dc)
1433             pred[1]++;
1434         block[0] = pred[0] = dc;
1435     }
1436
1437     return ret;
1438 }
1439
/**
 * VP7 instance of decode_block_coeffs_internal(): forwards all arguments and
 * passes IS_VP7 as the codec flag.
 *
 * @return the value returned by decode_block_coeffs_internal()
 */
static int vp7_decode_block_coeffs_internal(VPXRangeCoder *r,
                                            int16_t block[16],
                                            uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
                                            int i, const uint8_t *token_prob,
                                            const int16_t qmul[2],
                                            const uint8_t scan[16])
{
    return decode_block_coeffs_internal(r, block, probs, i,
                                        token_prob, qmul, scan, IS_VP7);
}
1450
/* Only compiled when no macro named vp8_decode_block_coeffs_internal has been
 * defined before this point. */
#ifndef vp8_decode_block_coeffs_internal
/**
 * VP8 instance of decode_block_coeffs_internal(): forwards its arguments,
 * always uses ff_zigzag_scan as the scan order and passes IS_VP8 as the
 * codec flag.
 *
 * @return the value returned by decode_block_coeffs_internal()
 */
static int vp8_decode_block_coeffs_internal(VPXRangeCoder *r,
                                            int16_t block[16],
                                            uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
                                            int i, const uint8_t *token_prob,
                                            const int16_t qmul[2])
{
    return decode_block_coeffs_internal(r, block, probs, i,
                                        token_prob, qmul, ff_zigzag_scan, IS_VP8);
}
#endif
1462
1463 /**
1464  * @param c          arithmetic bitstream reader context
1465  * @param block      destination for block coefficients
1466  * @param probs      probabilities to use when reading trees from the bitstream
1467  * @param i          initial coeff index, 0 unless a separate DC block is coded
1468  * @param zero_nhood the initial prediction context for number of surrounding
1469  *                   all-zero blocks (only left/top, so 0-2)
1470  * @param qmul       array holding the dc/ac dequant factor at position 0/1
1471  * @param scan       scan pattern (VP7 only)
1472  *
1473  * @return 0 if no coeffs were decoded
1474  *         otherwise, the index of the last coeff decoded plus one
1475  */
1476 static av_always_inline
1477 int decode_block_coeffs(VPXRangeCoder *c, int16_t block[16],
1478                         uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1479                         int i, int zero_nhood, const int16_t qmul[2],
1480                         const uint8_t scan[16], int vp7)
1481 {
1482     const uint8_t *token_prob = probs[i][zero_nhood];
1483     if (!vpx_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
1484         return 0;
1485     return vp7 ? vp7_decode_block_coeffs_internal(c, block, probs, i,
1486                                                   token_prob, qmul, scan)
1487                : vp8_decode_block_coeffs_internal(c, block, probs, i,
1488                                                   token_prob, qmul);
1489 }
1490
/**
 * Decode all coefficient blocks of one macroblock: an optional separate luma
 * DC block, the 16 luma blocks and the 8 chroma blocks.
 *
 * Coefficients go to td->block / td->block_dc, per-block counts to
 * td->non_zero_count_cache, and the top/left "non-zero" contexts are updated
 * as blocks are read. If nothing at all was coded, mb->skip is set.
 *
 * @param s      decoder context
 * @param td     per-thread data receiving the coefficients
 * @param c      range coder to read from
 * @param mb     macroblock being decoded
 * @param t_nnz  top non-zero context: entries 0-3 luma, 4-7 chroma,
 *               8 the separate DC block
 * @param l_nnz  left non-zero context, same layout as t_nnz
 * @param is_vp7 nonzero when decoding VP7
 */
static av_always_inline
void decode_mb_coeffs(VP8Context *s, VP8ThreadData *td, VPXRangeCoder *c,
                      VP8Macroblock *mb, uint8_t t_nnz[9], uint8_t l_nnz[9],
                      int is_vp7)
{
    int i, x, y, luma_start = 0, luma_ctx = 3;
    int nnz_pred, nnz, nnz_total = 0;
    int segment = mb->segment;
    int block_dc = 0;

    /* A separate DC block is coded unless the macroblock uses 4x4 intra
     * prediction or (VP8 only) split motion vectors. */
    if (mb->mode != MODE_I4x4 && (is_vp7 || mb->mode != VP8_MVMODE_SPLIT)) {
        nnz_pred = t_nnz[8] + l_nnz[8];

        // decode DC values and do hadamard
        nnz = decode_block_coeffs(c, td->block_dc, s->prob->token[1], 0,
                                  nnz_pred, s->qmat[segment].luma_dc_qmul,
                                  ff_zigzag_scan, is_vp7);
        l_nnz[8] = t_nnz[8] = !!nnz;

        /* VP7: DC of inter macroblocks is additionally predicted from the
         * previous one using the same reference frame. */
        if (is_vp7 && mb->mode > MODE_I4x4) {
            nnz |=  inter_predict_dc(td->block_dc,
                                     s->inter_dc_pred[mb->ref_frame - 1]);
        }

        if (nnz) {
            nnz_total += nnz;
            block_dc   = 1;
            if (nnz == 1)
                s->vp8dsp.vp8_luma_dc_wht_dc(td->block, td->block_dc);
            else
                s->vp8dsp.vp8_luma_dc_wht(td->block, td->block_dc);
        }
        /* luma blocks then start at coefficient 1 and use token set 0 */
        luma_start = 1;
        luma_ctx   = 0;
    }

    // luma blocks
    for (y = 0; y < 4; y++)
        for (x = 0; x < 4; x++) {
            nnz_pred = l_nnz[y] + t_nnz[x];
            nnz = decode_block_coeffs(c, td->block[y][x],
                                      s->prob->token[luma_ctx],
                                      luma_start, nnz_pred,
                                      s->qmat[segment].luma_qmul,
                                      s->prob[0].scan, is_vp7);
            /* nnz+block_dc may be one more than the actual last index,
             * but we don't care */
            td->non_zero_count_cache[y][x] = nnz + block_dc;
            t_nnz[x] = l_nnz[y] = !!nnz;
            nnz_total += nnz;
        }

    // chroma blocks
    // TODO: what to do about dimensions? 2nd dim for luma is x,
    // but for chroma it's (y<<1)|x
    /* i = 4 and i = 5 are the two chroma planes, four blocks each */
    for (i = 4; i < 6; i++)
        for (y = 0; y < 2; y++)
            for (x = 0; x < 2; x++) {
                nnz_pred = l_nnz[i + 2 * y] + t_nnz[i + 2 * x];
                nnz = decode_block_coeffs(c, td->block[i][(y << 1) + x],
                                          s->prob->token[2], 0, nnz_pred,
                                          s->qmat[segment].chroma_qmul,
                                          s->prob[0].scan, is_vp7);
                td->non_zero_count_cache[i][(y << 1) + x] = nnz;
                t_nnz[i + 2 * x] = l_nnz[i + 2 * y] = !!nnz;
                nnz_total += nnz;
            }

    // if there were no coded coeffs despite the macroblock not being marked skip,
    // we MUST not do the inner loop filter and should not do IDCT
    // Since skip isn't used for bitstream prediction, just manually set it.
    if (!nnz_total)
        mb->skip = 1;
}
1565
1566 static av_always_inline
1567 void backup_mb_border(uint8_t *top_border, const uint8_t *src_y,
1568                       const uint8_t *src_cb, const uint8_t *src_cr,
1569                       ptrdiff_t linesize, ptrdiff_t uvlinesize, int simple)
1570 {
1571     AV_COPY128(top_border, src_y + 15 * linesize);
1572     if (!simple) {
1573         AV_COPY64(top_border + 16, src_cb + 7 * uvlinesize);
1574         AV_COPY64(top_border + 24, src_cr + 7 * uvlinesize);
1575     }
1576 }
1577
/**
 * Exchange (or copy) saved top-border pixels with the pixel row directly
 * above a macroblock.
 *
 * The border buffer uses 32 bytes per macroblock: 16 luma bytes, then 8 cb
 * and 8 cr bytes. The slot 32 bytes before top_border and the slot 32 bytes
 * after it are accessed as well.
 *
 * @param top_border border slot of the current macroblock
 * @param src_y      first luma row of the macroblock
 * @param src_cb     first cb row of the macroblock
 * @param src_cr     first cr row of the macroblock
 * @param linesize   luma stride in bytes
 * @param uvlinesize chroma stride in bytes
 * @param mb_x       macroblock x address
 * @param mb_y       macroblock y address
 * @param mb_width   frame width in macroblocks
 * @param simple     nonzero to skip chroma (except when mb_y is 0)
 * @param xchg       nonzero: swap the entries left of and at the macroblock
 *                   origin; zero: copy them from the border buffer into the
 *                   picture. The remaining entries are always swapped.
 */
static av_always_inline
void xchg_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb,
                    uint8_t *src_cr, ptrdiff_t linesize, ptrdiff_t uvlinesize, int mb_x,
                    int mb_y, int mb_width, int simple, int xchg)
{
    uint8_t *top_border_m1 = top_border - 32;     // for TL prediction
    /* step up to the row just above the macroblock */
    src_y  -= linesize;
    src_cb -= uvlinesize;
    src_cr -= uvlinesize;

    /* Swap 8 bytes between a and b, or (xchg == 0) copy a into b. */
#define XCHG(a, b, xchg)                                                      \
    do {                                                                      \
        if (xchg)                                                             \
            AV_SWAP64(b, a);                                                  \
        else                                                                  \
            AV_COPY64(b, a);                                                  \
    } while (0)

    /* luma: 8 bytes to the left, the 16 bytes above, and (except in the last
     * column) the first 8 bytes above the next macroblock */
    XCHG(top_border_m1 + 8, src_y - 8, xchg);
    XCHG(top_border, src_y, xchg);
    XCHG(top_border + 8, src_y + 8, 1);
    if (mb_x < mb_width - 1)
        XCHG(top_border + 32, src_y + 16, 1);

    // only copy chroma for normal loop filter
    // or to initialize the top row to 127
    if (!simple || !mb_y) {
        XCHG(top_border_m1 + 16, src_cb - 8, xchg);
        XCHG(top_border_m1 + 24, src_cr - 8, xchg);
        XCHG(top_border + 16, src_cb, 1);
        XCHG(top_border + 24, src_cr, 1);
    }
}
1611
1612 static av_always_inline
1613 int check_dc_pred8x8_mode(int mode, int mb_x, int mb_y)
1614 {
1615     if (!mb_x)
1616         return mb_y ? TOP_DC_PRED8x8 : DC_128_PRED8x8;
1617     else
1618         return mb_y ? mode : LEFT_DC_PRED8x8;
1619 }
1620
1621 static av_always_inline
1622 int check_tm_pred8x8_mode(int mode, int mb_x, int mb_y, int vp7)
1623 {
1624     if (!mb_x)
1625         return mb_y ? VERT_PRED8x8 : (vp7 ? DC_128_PRED8x8 : DC_129_PRED8x8);
1626     else
1627         return mb_y ? mode : HOR_PRED8x8;
1628 }
1629
1630 static av_always_inline
1631 int check_intra_pred8x8_mode_emuedge(int mode, int mb_x, int mb_y, int vp7)
1632 {
1633     switch (mode) {
1634     case DC_PRED8x8:
1635         return check_dc_pred8x8_mode(mode, mb_x, mb_y);
1636     case VERT_PRED8x8:
1637         return !mb_y ? (vp7 ? DC_128_PRED8x8 : DC_127_PRED8x8) : mode;
1638     case HOR_PRED8x8:
1639         return !mb_x ? (vp7 ? DC_128_PRED8x8 : DC_129_PRED8x8) : mode;
1640     case PLANE_PRED8x8: /* TM */
1641         return check_tm_pred8x8_mode(mode, mb_x, mb_y, vp7);
1642     }
1643     return mode;
1644 }
1645
1646 static av_always_inline
1647 int check_tm_pred4x4_mode(int mode, int mb_x, int mb_y, int vp7)
1648 {
1649     if (!mb_x) {
1650         return mb_y ? VERT_VP8_PRED : (vp7 ? DC_128_PRED : DC_129_PRED);
1651     } else {
1652         return mb_y ? mode : HOR_VP8_PRED;
1653     }
1654 }
1655
1656 static av_always_inline
1657 int check_intra_pred4x4_mode_emuedge(int mode, int mb_x, int mb_y,
1658                                      int *copy_buf, int vp7)
1659 {
1660     switch (mode) {
1661     case VERT_PRED:
1662         if (!mb_x && mb_y) {
1663             *copy_buf = 1;
1664             return mode;
1665         }
1666         /* fall-through */
1667     case DIAG_DOWN_LEFT_PRED:
1668     case VERT_LEFT_PRED:
1669         return !mb_y ? (vp7 ? DC_128_PRED : DC_127_PRED) : mode;
1670     case HOR_PRED:
1671         if (!mb_y) {
1672             *copy_buf = 1;
1673             return mode;
1674         }
1675         /* fall-through */
1676     case HOR_UP_PRED:
1677         return !mb_x ? (vp7 ? DC_128_PRED : DC_129_PRED) : mode;
1678     case TM_VP8_PRED:
1679         return check_tm_pred4x4_mode(mode, mb_x, mb_y, vp7);
1680     case DC_PRED: /* 4x4 DC doesn't use the same "H.264-style" exceptions
1681                    * as 16x16/8x8 DC */
1682     case DIAG_DOWN_RIGHT_PRED:
1683     case VERT_RIGHT_PRED:
1684     case HOR_DOWN_PRED:
1685         if (!mb_y || !mb_x)
1686             *copy_buf = 1;
1687         return mode;
1688     }
1689     return mode;
1690 }
1691
1692 static av_always_inline
1693 void intra_predict(VP8Context *s, VP8ThreadData *td, uint8_t *const dst[3],
1694                    VP8Macroblock *mb, int mb_x, int mb_y, int is_vp7)
1695 {
1696     int x, y, mode, nnz;
1697     uint32_t tr;
1698
1699     /* for the first row, we need to run xchg_mb_border to init the top edge
1700      * to 127 otherwise, skip it if we aren't going to deblock */
1701     if (mb_y && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
1702         xchg_mb_border(s->top_border[mb_x + 1], dst[0], dst[1], dst[2],
1703                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1704                        s->filter.simple, 1);
1705
1706     if (mb->mode < MODE_I4x4) {
1707         mode = check_intra_pred8x8_mode_emuedge(mb->mode, mb_x, mb_y, is_vp7);
1708         s->hpc.pred16x16[mode](dst[0], s->linesize);
1709     } else {
1710         uint8_t *ptr = dst[0];
1711         const uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
1712         const uint8_t lo = is_vp7 ? 128 : 127;
1713         const uint8_t hi = is_vp7 ? 128 : 129;
1714         const uint8_t tr_top[4] = { lo, lo, lo, lo };
1715
1716         // all blocks on the right edge of the macroblock use bottom edge
1717         // the top macroblock for their topright edge
1718         const uint8_t *tr_right = ptr - s->linesize + 16;
1719
1720         // if we're on the right edge of the frame, said edge is extended
1721         // from the top macroblock
1722         if (mb_y && mb_x == s->mb_width - 1) {
1723             tr       = tr_right[-1] * 0x01010101u;
1724             tr_right = (uint8_t *) &tr;
1725         }
1726
1727         if (mb->skip)
1728             AV_ZERO128(td->non_zero_count_cache);
1729
1730         for (y = 0; y < 4; y++) {
1731             const uint8_t *topright = ptr + 4 - s->linesize;
1732             for (x = 0; x < 4; x++) {
1733                 int copy = 0;
1734                 ptrdiff_t linesize = s->linesize;
1735                 uint8_t *dst = ptr + 4 * x;
1736                 LOCAL_ALIGNED(4, uint8_t, copy_dst, [5 * 8]);
1737
1738                 if ((y == 0 || x == 3) && mb_y == 0) {
1739                     topright = tr_top;
1740                 } else if (x == 3)
1741                     topright = tr_right;
1742
1743                 mode = check_intra_pred4x4_mode_emuedge(intra4x4[x], mb_x + x,
1744                                                         mb_y + y, &copy, is_vp7);
1745                 if (copy) {
1746                     dst      = copy_dst + 12;
1747                     linesize = 8;
1748                     if (!(mb_y + y)) {
1749                         copy_dst[3] = lo;
1750                         AV_WN32A(copy_dst + 4, lo * 0x01010101U);
1751                     } else {
1752                         AV_COPY32(copy_dst + 4, ptr + 4 * x - s->linesize);
1753                         if (!(mb_x + x)) {
1754                             copy_dst[3] = hi;
1755                         } else {
1756                             copy_dst[3] = ptr[4 * x - s->linesize - 1];
1757                         }
1758                     }
1759                     if (!(mb_x + x)) {
1760                         copy_dst[11] =
1761                         copy_dst[19] =
1762                         copy_dst[27] =
1763                         copy_dst[35] = hi;
1764                     } else {
1765                         copy_dst[11] = ptr[4 * x                   - 1];
1766                         copy_dst[19] = ptr[4 * x + s->linesize     - 1];
1767                         copy_dst[27] = ptr[4 * x + s->linesize * 2 - 1];
1768                         copy_dst[35] = ptr[4 * x + s->linesize * 3 - 1];
1769                     }
1770                 }
1771                 s->hpc.pred4x4[mode](dst, topright, linesize);
1772                 if (copy) {
1773                     AV_COPY32(ptr + 4 * x,                   copy_dst + 12);
1774                     AV_COPY32(ptr + 4 * x + s->linesize,     copy_dst + 20);
1775                     AV_COPY32(ptr + 4 * x + s->linesize * 2, copy_dst + 28);
1776                     AV_COPY32(ptr + 4 * x + s->linesize * 3, copy_dst + 36);
1777                 }
1778
1779                 nnz = td->non_zero_count_cache[y][x];
1780                 if (nnz) {
1781                     if (nnz == 1)
1782                         s->vp8dsp.vp8_idct_dc_add(ptr + 4 * x,
1783                                                   td->block[y][x], s->linesize);
1784                     else
1785                         s->vp8dsp.vp8_idct_add(ptr + 4 * x,
1786                                                td->block[y][x], s->linesize);
1787                 }
1788                 topright += 4;
1789             }
1790
1791             ptr      += 4 * s->linesize;
1792             intra4x4 += 4;
1793         }
1794     }
1795
1796     mode = check_intra_pred8x8_mode_emuedge(mb->chroma_pred_mode,
1797                                             mb_x, mb_y, is_vp7);
1798     s->hpc.pred8x8[mode](dst[1], s->uvlinesize);
1799     s->hpc.pred8x8[mode](dst[2], s->uvlinesize);
1800
1801     if (mb_y && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
1802         xchg_mb_border(s->top_border[mb_x + 1], dst[0], dst[1], dst[2],
1803                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1804                        s->filter.simple, 0);
1805 }
1806
/* Per-subpel-position lookup used by the motion compensation code; the
 * second index is the fractional position (0..7). */
static const uint8_t subpel_idx[3][8] = {
    /* number of extra pixels needed on the left/top side; also used as
     * index into the MC function pointer tables */
    [0] = { 0, 1, 2, 1, 2, 1, 2, 1 },
    /* total number of extra pixels required */
    [1] = { 0, 3, 5, 3, 5, 3, 5, 3 },
    /* number of extra pixels needed on the right/bottom side */
    [2] = { 0, 2, 3, 2, 3, 2, 3, 2 },
};
1813
1814 /**
1815  * luma MC function
1816  *
1817  * @param s        VP8 decoding context
1818  * @param dst      target buffer for block data at block position
1819  * @param ref      reference picture buffer at origin (0, 0)
1820  * @param mv       motion vector (relative to block position) to get pixel data from
1821  * @param x_off    horizontal position of block from origin (0, 0)
1822  * @param y_off    vertical position of block from origin (0, 0)
1823  * @param block_w  width of block (16, 8 or 4)
1824  * @param block_h  height of block (always same as block_w)
1825  * @param width    width of src/dst plane data
1826  * @param height   height of src/dst plane data
1827  * @param linesize size of a single line of plane data, including padding
1828  * @param mc_func  motion compensation function pointers (bilinear or sixtap MC)
1829  */
1830 static av_always_inline
1831 void vp8_mc_luma(VP8Context *s, VP8ThreadData *td, uint8_t *dst,
1832                  const ThreadFrame *ref, const VP8mv *mv,
1833                  int x_off, int y_off, int block_w, int block_h,
1834                  int width, int height, ptrdiff_t linesize,
1835                  vp8_mc_func mc_func[3][3])
1836 {
1837     const uint8_t *src = ref->f->data[0];
1838
1839     if (AV_RN32A(mv)) {
1840         ptrdiff_t src_linesize = linesize;
1841
1842         int mx = (mv->x * 2) & 7, mx_idx = subpel_idx[0][mx];
1843         int my = (mv->y * 2) & 7, my_idx = subpel_idx[0][my];
1844
1845         x_off += mv->x >> 2;
1846         y_off += mv->y >> 2;
1847
1848         // edge emulation
1849         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 4, 0);
1850         src += y_off * linesize + x_off;
1851         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1852             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1853             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1854                                      src - my_idx * linesize - mx_idx,
1855                                      EDGE_EMU_LINESIZE, linesize,
1856                                      block_w + subpel_idx[1][mx],
1857                                      block_h + subpel_idx[1][my],
1858                                      x_off - mx_idx, y_off - my_idx,
1859                                      width, height);
1860             src = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1861             src_linesize = EDGE_EMU_LINESIZE;
1862         }
1863         mc_func[my_idx][mx_idx](dst, linesize, src, src_linesize, block_h, mx, my);
1864     } else {
1865         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 4, 0);
1866         mc_func[0][0](dst, linesize, src + y_off * linesize + x_off,
1867                       linesize, block_h, 0, 0);
1868     }
1869 }
1870
1871 /**
1872  * chroma MC function
1873  *
1874  * @param s        VP8 decoding context
1875  * @param dst1     target buffer for block data at block position (U plane)
1876  * @param dst2     target buffer for block data at block position (V plane)
1877  * @param ref      reference picture buffer at origin (0, 0)
1878  * @param mv       motion vector (relative to block position) to get pixel data from
1879  * @param x_off    horizontal position of block from origin (0, 0)
1880  * @param y_off    vertical position of block from origin (0, 0)
1881  * @param block_w  width of block (16, 8 or 4)
1882  * @param block_h  height of block (always same as block_w)
1883  * @param width    width of src/dst plane data
1884  * @param height   height of src/dst plane data
1885  * @param linesize size of a single line of plane data, including padding
1886  * @param mc_func  motion compensation function pointers (bilinear or sixtap MC)
1887  */
1888 static av_always_inline
1889 void vp8_mc_chroma(VP8Context *s, VP8ThreadData *td, uint8_t *dst1,
1890                    uint8_t *dst2, const ThreadFrame *ref, const VP8mv *mv,
1891                    int x_off, int y_off, int block_w, int block_h,
1892                    int width, int height, ptrdiff_t linesize,
1893                    vp8_mc_func mc_func[3][3])
1894 {
1895     const uint8_t *src1 = ref->f->data[1], *src2 = ref->f->data[2];
1896
1897     if (AV_RN32A(mv)) {
1898         int mx = mv->x & 7, mx_idx = subpel_idx[0][mx];
1899         int my = mv->y & 7, my_idx = subpel_idx[0][my];
1900
1901         x_off += mv->x >> 3;
1902         y_off += mv->y >> 3;
1903
1904         // edge emulation
1905         src1 += y_off * linesize + x_off;
1906         src2 += y_off * linesize + x_off;
1907         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 3, 0);
1908         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1909             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1910             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1911                                      src1 - my_idx * linesize - mx_idx,
1912                                      EDGE_EMU_LINESIZE, linesize,
1913                                      block_w + subpel_idx[1][mx],
1914                                      block_h + subpel_idx[1][my],
1915                                      x_off - mx_idx, y_off - my_idx, width, height);
1916             src1 = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1917             mc_func[my_idx][mx_idx](dst1, linesize, src1, EDGE_EMU_LINESIZE, block_h, mx, my);
1918
1919             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1920                                      src2 - my_idx * linesize - mx_idx,
1921                                      EDGE_EMU_LINESIZE, linesize,
1922                                      block_w + subpel_idx[1][mx],
1923                                      block_h + subpel_idx[1][my],
1924                                      x_off - mx_idx, y_off - my_idx, width, height);
1925             src2 = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1926             mc_func[my_idx][mx_idx](dst2, linesize, src2, EDGE_EMU_LINESIZE, block_h, mx, my);
1927         } else {
1928             mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
1929             mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
1930         }
1931     } else {
1932         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 3, 0);
1933         mc_func[0][0](dst1, linesize, src1 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1934         mc_func[0][0](dst2, linesize, src2 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1935     }
1936 }
1937
1938 static av_always_inline
1939 void vp8_mc_part(VP8Context *s, VP8ThreadData *td, uint8_t *const dst[3],
1940                  const ThreadFrame *ref_frame, int x_off, int y_off,
1941                  int bx_off, int by_off, int block_w, int block_h,
1942                  int width, int height, const VP8mv *mv)
1943 {
1944     VP8mv uvmv = *mv;
1945
1946     /* Y */
1947     vp8_mc_luma(s, td, dst[0] + by_off * s->linesize + bx_off,
1948                 ref_frame, mv, x_off + bx_off, y_off + by_off,
1949                 block_w, block_h, width, height, s->linesize,
1950                 s->put_pixels_tab[block_w == 8]);
1951
1952     /* U/V */
1953     if (s->profile == 3) {
1954         /* this block only applies VP8; it is safe to check
1955          * only the profile, as VP7 profile <= 1 */
1956         uvmv.x &= ~7;
1957         uvmv.y &= ~7;
1958     }
1959     x_off   >>= 1;
1960     y_off   >>= 1;
1961     bx_off  >>= 1;
1962     by_off  >>= 1;
1963     width   >>= 1;
1964     height  >>= 1;
1965     block_w >>= 1;
1966     block_h >>= 1;
1967     vp8_mc_chroma(s, td, dst[1] + by_off * s->uvlinesize + bx_off,
1968                   dst[2] + by_off * s->uvlinesize + bx_off, ref_frame,
1969                   &uvmv, x_off + bx_off, y_off + by_off,
1970                   block_w, block_h, width, height, s->uvlinesize,
1971                   s->put_pixels_tab[1 + (block_w == 4)]);
1972 }
1973
1974 /* Fetch pixels for estimated mv 4 macroblocks ahead.
1975  * Optimized for 64-byte cache lines. Inspired by ffh264 prefetch_motion. */
1976 static av_always_inline
1977 void prefetch_motion(const VP8Context *s, const VP8Macroblock *mb,
1978                      int mb_x, int mb_y, int mb_xy, int ref)
1979 {
1980     /* Don't prefetch refs that haven't been used very often this frame. */
1981     if (s->ref_count[ref - 1] > (mb_xy >> 5)) {
1982         int x_off = mb_x << 4, y_off = mb_y << 4;
1983         int mx = (mb->mv.x >> 2) + x_off + 8;
1984         int my = (mb->mv.y >> 2) + y_off;
1985         uint8_t **src = s->framep[ref]->tf.f->data;
1986         int off = mx + (my + (mb_x & 3) * 4) * s->linesize + 64;
1987         /* For threading, a ff_thread_await_progress here might be useful, but
1988          * it actually slows down the decoder. Since a bad prefetch doesn't
1989          * generate bad decoder output, we don't run it here. */
1990         s->vdsp.prefetch(src[0] + off, s->linesize, 4);
1991         off = (mx >> 1) + ((my >> 1) + (mb_x & 7)) * s->uvlinesize + 64;
1992         s->vdsp.prefetch(src[1] + off, src[2] - src[1], 2);
1993     }
1994 }
1995
1996 /**
1997  * Apply motion vectors to prediction buffer, chapter 18.
1998  */
1999 static av_always_inline
2000 void inter_predict(VP8Context *s, VP8ThreadData *td, uint8_t *const dst[3],
2001                    VP8Macroblock *mb, int mb_x, int mb_y)
2002 {
2003     int x_off = mb_x << 4, y_off = mb_y << 4;
2004     int width = 16 * s->mb_width, height = 16 * s->mb_height;
2005     const ThreadFrame *ref = &s->framep[mb->ref_frame]->tf;
2006     const VP8mv *bmv = mb->bmv;
2007
2008     switch (mb->partitioning) {
2009     case VP8_SPLITMVMODE_NONE:
2010         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2011                     0, 0, 16, 16, width, height, &mb->mv);
2012         break;
2013     case VP8_SPLITMVMODE_4x4: {
2014         int x, y;
2015         VP8mv uvmv;
2016
2017         /* Y */
2018         for (y = 0; y < 4; y++) {
2019             for (x = 0; x < 4; x++) {
2020                 vp8_mc_luma(s, td, dst[0] + 4 * y * s->linesize + x * 4,
2021                             ref, &bmv[4 * y + x],
2022                             4 * x + x_off, 4 * y + y_off, 4, 4,
2023                             width, height, s->linesize,
2024                             s->put_pixels_tab[2]);
2025             }
2026         }
2027
2028         /* U/V */
2029         x_off  >>= 1;
2030         y_off  >>= 1;
2031         width  >>= 1;
2032         height >>= 1;
2033         for (y = 0; y < 2; y++) {
2034             for (x = 0; x < 2; x++) {
2035                 uvmv.x = mb->bmv[2 * y       * 4 + 2 * x    ].x +
2036                          mb->bmv[2 * y       * 4 + 2 * x + 1].x +
2037                          mb->bmv[(2 * y + 1) * 4 + 2 * x    ].x +
2038                          mb->bmv[(2 * y + 1) * 4 + 2 * x + 1].x;
2039                 uvmv.y = mb->bmv[2 * y       * 4 + 2 * x    ].y +
2040                          mb->bmv[2 * y       * 4 + 2 * x + 1].y +
2041                          mb->bmv[(2 * y + 1) * 4 + 2 * x    ].y +
2042                          mb->bmv[(2 * y + 1) * 4 + 2 * x + 1].y;
2043                 uvmv.x = (uvmv.x + 2 + FF_SIGNBIT(uvmv.x)) >> 2;
2044                 uvmv.y = (uvmv.y + 2 + FF_SIGNBIT(uvmv.y)) >> 2;
2045                 if (s->profile == 3) {
2046                     uvmv.x &= ~7;
2047                     uvmv.y &= ~7;
2048                 }
2049                 vp8_mc_chroma(s, td, dst[1] + 4 * y * s->uvlinesize + x * 4,
2050                               dst[2] + 4 * y * s->uvlinesize + x * 4, ref,
2051                               &uvmv, 4 * x + x_off, 4 * y + y_off, 4, 4,
2052                               width, height, s->uvlinesize,
2053                               s->put_pixels_tab[2]);
2054             }
2055         }
2056         break;
2057     }
2058     case VP8_SPLITMVMODE_16x8:
2059         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2060                     0, 0, 16, 8, width, height, &bmv[0]);
2061         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2062                     0, 8, 16, 8, width, height, &bmv[1]);
2063         break;
2064     case VP8_SPLITMVMODE_8x16:
2065         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2066                     0, 0, 8, 16, width, height, &bmv[0]);
2067         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2068                     8, 0, 8, 16, width, height, &bmv[1]);
2069         break;
2070     case VP8_SPLITMVMODE_8x8:
2071         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2072                     0, 0, 8, 8, width, height, &bmv[0]);
2073         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2074                     8, 0, 8, 8, width, height, &bmv[1]);
2075         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2076                     0, 8, 8, 8, width, height, &bmv[2]);
2077         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2078                     8, 8, 8, 8, width, height, &bmv[3]);
2079         break;
2080     }
2081 }
2082
2083 static av_always_inline
2084 void idct_mb(VP8Context *s, VP8ThreadData *td, uint8_t *const dst[3],
2085              const VP8Macroblock *mb)
2086 {
2087     int x, y, ch;
2088
2089     if (mb->mode != MODE_I4x4) {
2090         uint8_t *y_dst = dst[0];
2091         for (y = 0; y < 4; y++) {
2092             uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[y]);
2093             if (nnz4) {
2094                 if (nnz4 & ~0x01010101) {
2095                     for (x = 0; x < 4; x++) {
2096                         if ((uint8_t) nnz4 == 1)
2097                             s->vp8dsp.vp8_idct_dc_add(y_dst + 4 * x,
2098                                                       td->block[y][x],
2099                                                       s->linesize);
2100                         else if ((uint8_t) nnz4 > 1)
2101                             s->vp8dsp.vp8_idct_add(y_dst + 4 * x,
2102                                                    td->block[y][x],
2103                                                    s->linesize);
2104                         nnz4 >>= 8;
2105                         if (!nnz4)
2106                             break;
2107                     }
2108                 } else {
2109                     s->vp8dsp.vp8_idct_dc_add4y(y_dst, td->block[y], s->linesize);
2110                 }
2111             }
2112             y_dst += 4 * s->linesize;
2113         }
2114     }
2115
2116     for (ch = 0; ch < 2; ch++) {
2117         uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[4 + ch]);
2118         if (nnz4) {
2119             uint8_t *ch_dst = dst[1 + ch];
2120             if (nnz4 & ~0x01010101) {
2121                 for (y = 0; y < 2; y++) {
2122                     for (x = 0; x < 2; x++) {
2123                         if ((uint8_t) nnz4 == 1)
2124                             s->vp8dsp.vp8_idct_dc_add(ch_dst + 4 * x,
2125                                                       td->block[4 + ch][(y << 1) + x],
2126                                                       s->uvlinesize);
2127                         else if ((uint8_t) nnz4 > 1)
2128                             s->vp8dsp.vp8_idct_add(ch_dst + 4 * x,
2129                                                    td->block[4 + ch][(y << 1) + x],
2130                                                    s->uvlinesize);
2131                         nnz4 >>= 8;
2132                         if (!nnz4)
2133                             goto chroma_idct_end;
2134                     }
2135                     ch_dst += 4 * s->uvlinesize;
2136                 }
2137             } else {
2138                 s->vp8dsp.vp8_idct_dc_add4uv(ch_dst, td->block[4 + ch], s->uvlinesize);
2139             }
2140         }
2141 chroma_idct_end:
2142         ;
2143     }
2144 }
2145
2146 static av_always_inline
2147 void filter_level_for_mb(const VP8Context *s, const VP8Macroblock *mb,
2148                          VP8FilterStrength *f, int is_vp7)
2149 {
2150     int interior_limit, filter_level;
2151
2152     if (s->segmentation.enabled) {
2153         filter_level = s->segmentation.filter_level[mb->segment];
2154         if (!s->segmentation.absolute_vals)
2155             filter_level += s->filter.level;
2156     } else
2157         filter_level = s->filter.level;
2158
2159     if (s->lf_delta.enabled) {
2160         filter_level += s->lf_delta.ref[mb->ref_frame];
2161         filter_level += s->lf_delta.mode[mb->mode];
2162     }
2163
2164     filter_level = av_clip_uintp2(filter_level, 6);
2165
2166     interior_limit = filter_level;
2167     if (s->filter.sharpness) {
2168         interior_limit >>= (s->filter.sharpness + 3) >> 2;
2169         interior_limit = FFMIN(interior_limit, 9 - s->filter.sharpness);
2170     }
2171     interior_limit = FFMAX(interior_limit, 1);
2172
2173     f->filter_level = filter_level;
2174     f->inner_limit = interior_limit;
2175     f->inner_filter = is_vp7 || !mb->skip || mb->mode == MODE_I4x4 ||
2176                       mb->mode == VP8_MVMODE_SPLIT;
2177 }
2178
2179 static av_always_inline
2180 void filter_mb(const VP8Context *s, uint8_t *const dst[3], const VP8FilterStrength *f,
2181                int mb_x, int mb_y, int is_vp7)
2182 {
2183     int mbedge_lim, bedge_lim_y, bedge_lim_uv, hev_thresh;
2184     int filter_level = f->filter_level;
2185     int inner_limit = f->inner_limit;
2186     int inner_filter = f->inner_filter;
2187     ptrdiff_t linesize   = s->linesize;
2188     ptrdiff_t uvlinesize = s->uvlinesize;
2189     static const uint8_t hev_thresh_lut[2][64] = {
2190         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
2191           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2192           3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
2193           3, 3, 3, 3 },
2194         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
2195           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2196           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2197           2, 2, 2, 2 }
2198     };
2199
2200     if (!filter_level)
2201         return;
2202
2203     if (is_vp7) {
2204         bedge_lim_y  = filter_level;
2205         bedge_lim_uv = filter_level * 2;
2206         mbedge_lim   = filter_level + 2;
2207     } else {
2208         bedge_lim_y  =
2209         bedge_lim_uv = filter_level * 2 + inner_limit;
2210         mbedge_lim   = bedge_lim_y + 4;
2211     }
2212
2213     hev_thresh = hev_thresh_lut[s->keyframe][filter_level];
2214
2215     if (mb_x) {
2216         s->vp8dsp.vp8_h_loop_filter16y(dst[0], linesize,
2217                                        mbedge_lim, inner_limit, hev_thresh);
2218         s->vp8dsp.vp8_h_loop_filter8uv(dst[1], dst[2], uvlinesize,
2219                                        mbedge_lim, inner_limit, hev_thresh);
2220     }
2221
2222 #define H_LOOP_FILTER_16Y_INNER(cond)                                         \
2223     if (cond && inner_filter) {                                               \
2224         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] +  4, linesize,           \
2225                                              bedge_lim_y, inner_limit,        \
2226                                              hev_thresh);                     \
2227         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] +  8, linesize,           \
2228                                              bedge_lim_y, inner_limit,        \
2229                                              hev_thresh);                     \
2230         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] + 12, linesize,           \
2231                                              bedge_lim_y, inner_limit,        \
2232                                              hev_thresh);                     \
2233         s->vp8dsp.vp8_h_loop_filter8uv_inner(dst[1] +  4, dst[2] + 4,         \
2234                                              uvlinesize,  bedge_lim_uv,       \
2235                                              inner_limit, hev_thresh);        \
2236     }
2237
2238     H_LOOP_FILTER_16Y_INNER(!is_vp7)
2239
2240     if (mb_y) {
2241         s->vp8dsp.vp8_v_loop_filter16y(dst[0], linesize,
2242                                        mbedge_lim, inner_limit, hev_thresh);
2243         s->vp8dsp.vp8_v_loop_filter8uv(dst[1], dst[2], uvlinesize,
2244                                        mbedge_lim, inner_limit, hev_thresh);
2245     }
2246
2247     if (inner_filter) {
2248         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] +  4 * linesize,
2249                                              linesize, bedge_lim_y,
2250                                              inner_limit, hev_thresh);
2251         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] +  8 * linesize,
2252                                              linesize, bedge_lim_y,
2253                                              inner_limit, hev_thresh);
2254         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] + 12 * linesize,
2255                                              linesize, bedge_lim_y,
2256                                              inner_limit, hev_thresh);
2257         s->vp8dsp.vp8_v_loop_filter8uv_inner(dst[1] +  4 * uvlinesize,
2258                                              dst[2] +  4 * uvlinesize,
2259                                              uvlinesize, bedge_lim_uv,
2260                                              inner_limit, hev_thresh);
2261     }
2262
2263     H_LOOP_FILTER_16Y_INNER(is_vp7)
2264 }
2265
2266 static av_always_inline
2267 void filter_mb_simple(const VP8Context *s, uint8_t *dst, const VP8FilterStrength *f,
2268                       int mb_x, int mb_y)
2269 {
2270     int mbedge_lim, bedge_lim;
2271     int filter_level = f->filter_level;
2272     int inner_limit  = f->inner_limit;
2273     int inner_filter = f->inner_filter;
2274     ptrdiff_t linesize = s->linesize;
2275
2276     if (!filter_level)
2277         return;
2278
2279     bedge_lim  = 2 * filter_level + inner_limit;
2280     mbedge_lim = bedge_lim + 4;
2281
2282     if (mb_x)
2283         s->vp8dsp.vp8_h_loop_filter_simple(dst, linesize, mbedge_lim);
2284     if (inner_filter) {
2285         s->vp8dsp.vp8_h_loop_filter_simple(dst +  4, linesize, bedge_lim);
2286         s->vp8dsp.vp8_h_loop_filter_simple(dst +  8, linesize, bedge_lim);
2287         s->vp8dsp.vp8_h_loop_filter_simple(dst + 12, linesize, bedge_lim);
2288     }
2289
2290     if (mb_y)
2291         s->vp8dsp.vp8_v_loop_filter_simple(dst, linesize, mbedge_lim);
2292     if (inner_filter) {
2293         s->vp8dsp.vp8_v_loop_filter_simple(dst +  4 * linesize, linesize, bedge_lim);
2294         s->vp8dsp.vp8_v_loop_filter_simple(dst +  8 * linesize, linesize, bedge_lim);
2295         s->vp8dsp.vp8_v_loop_filter_simple(dst + 12 * linesize, linesize, bedge_lim);
2296     }
2297 }
2298
2299 #define MARGIN (16 << 2)
2300 static av_always_inline
2301 int vp78_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *curframe,
2302                             const VP8Frame *prev_frame, int is_vp7)
2303 {
2304     VP8Context *s = avctx->priv_data;
2305     int mb_x, mb_y;
2306
2307     s->mv_bounds.mv_min.y = -MARGIN;
2308     s->mv_bounds.mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;
2309     for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
2310         VP8Macroblock *mb = s->macroblocks_base +
2311                             ((s->mb_width + 1) * (mb_y + 1) + 1);
2312         int mb_xy = mb_y * s->mb_width;
2313
2314         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED * 0x01010101);
2315
2316         s->mv_bounds.mv_min.x = -MARGIN;
2317         s->mv_bounds.mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;
2318
2319         for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
2320             if (vpx_rac_is_end(&s->c)) {
2321                 return AVERROR_INVALIDDATA;
2322             }
2323             if (mb_y == 0)
2324                 AV_WN32A((mb - s->mb_width - 1)->intra4x4_pred_mode_top,
2325                          DC_PRED * 0x01010101);
2326             decode_mb_mode(s, &s->mv_bounds, mb, mb_x, mb_y, curframe->seg_map + mb_xy,
2327                            prev_frame && prev_frame->seg_map ?
2328                            prev_frame->seg_map + mb_xy : NULL, 1, is_vp7);
2329             s->mv_bounds.mv_min.x -= 64;
2330             s->mv_bounds.mv_max.x -= 64;
2331         }
2332         s->mv_bounds.mv_min.y -= 64;
2333         s->mv_bounds.mv_max.y -= 64;
2334     }
2335     return 0;
2336 }
2337
2338 static int vp7_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame,
2339                                   const VP8Frame *prev_frame)
2340 {
2341     return vp78_decode_mv_mb_modes(avctx, cur_frame, prev_frame, IS_VP7);
2342 }
2343
2344 static int vp8_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame,
2345                                   const VP8Frame *prev_frame)
2346 {
2347     return vp78_decode_mv_mb_modes(avctx, cur_frame, prev_frame, IS_VP8);
2348 }
2349
#if HAVE_THREADS
/**
 * Block until the row worker `otd` has published a position of at least
 * (mb_y_check, mb_x_check).
 *
 * Positions are packed as (row << 16) | (column & 0xFFFF). If `otd` has not
 * reached the target yet, the target is stored in td->wait_mb_pos and the
 * caller sleeps on otd->cond (under otd->lock) until otd->thread_mb_pos
 * catches up; td->wait_mb_pos is then reset to INT_MAX.
 *
 * Every macro argument is parenthesized, so arbitrary expressions may be
 * passed for the pointers and coordinates.
 */
#define check_thread_pos(td, otd, mb_x_check, mb_y_check)                     \
    do {                                                                      \
        int tmp = ((mb_y_check) << 16) | ((mb_x_check) & 0xFFFF);             \
        if (atomic_load(&(otd)->thread_mb_pos) < tmp) {                       \
            pthread_mutex_lock(&(otd)->lock);                                 \
            atomic_store(&(td)->wait_mb_pos, tmp);                            \
            do {                                                              \
                if (atomic_load(&(otd)->thread_mb_pos) >= tmp)                \
                    break;                                                    \
                pthread_cond_wait(&(otd)->cond, &(otd)->lock);                \
            } while (1);                                                      \
            atomic_store(&(td)->wait_mb_pos, INT_MAX);                        \
            pthread_mutex_unlock(&(otd)->lock);                               \
        }                                                                     \
    } while (0)

/**
 * Publish (mb_y, mb_x) as the current position of `td` and wake waiters.
 *
 * Besides its arguments the macro reads `avctx`, `num_jobs`, `next_td` and
 * `prev_td` from the enclosing scope. td->cond is broadcast only when slice
 * threading is active with more than one job and either one of the
 * neighbour pointers is NULL or a distinct neighbour is waiting for a
 * position that has now been reached.
 */
#define update_pos(td, mb_y, mb_x)                                            \
    do {                                                                      \
        int pos              = ((mb_y) << 16) | ((mb_x) & 0xFFFF);            \
        int sliced_threading = (avctx->active_thread_type == FF_THREAD_SLICE) && \
                               (num_jobs > 1);                                \
        int is_null          = !next_td || !prev_td;                          \
        int pos_check        = (is_null) ? 1 :                                \
            (next_td != (td) && pos >= atomic_load(&next_td->wait_mb_pos)) || \
            (prev_td != (td) && pos >= atomic_load(&prev_td->wait_mb_pos));   \
        atomic_store(&(td)->thread_mb_pos, pos);                              \
        if (sliced_threading && pos_check) {                                  \
            pthread_mutex_lock(&(td)->lock);                                  \
            pthread_cond_broadcast(&(td)->cond);                              \
            pthread_mutex_unlock(&(td)->lock);                                \
        }                                                                     \
    } while (0)
#else
/* Without thread support both macros expand to an empty statement and do
 * not evaluate their arguments. */
#define check_thread_pos(td, otd, mb_x_check, mb_y_check) do { } while (0)
#define update_pos(td, mb_y, mb_x) do { } while (0)
#endif
2387
2388 static av_always_inline int decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
2389                                         int jobnr, int threadnr, int is_vp7)
2390 {
2391     VP8Context *s = avctx->priv_data;
2392     VP8ThreadData *prev_td, *next_td, *td = &s->thread_data[threadnr];
2393     int mb_y = atomic_load(&td->thread_mb_pos) >> 16;
2394     int mb_x, mb_xy = mb_y * s->mb_width;
2395     int num_jobs = s->num_jobs;
2396     const VP8Frame *prev_frame = s->prev_frame;
2397     VP8Frame *curframe = s->curframe;
2398     VPXRangeCoder *coeff_c  = &s->coeff_partition[mb_y & (s->num_coeff_partitions - 1)];
2399
2400     VP8Macroblock *mb;
2401     uint8_t *dst[3] = {
2402         curframe->tf.f->data[0] + 16 * mb_y * s->linesize,
2403         curframe->tf.f->data[1] +  8 * mb_y * s->uvlinesize,
2404         curframe->tf.f->data[2] +  8 * mb_y * s->uvlinesize
2405     };
2406
2407     if (vpx_rac_is_end(&s->c))
2408          return AVERROR_INVALIDDATA;
2409
2410     if (mb_y == 0)
2411         prev_td = td;
2412     else
2413         prev_td = &s->thread_data[(jobnr + num_jobs - 1) % num_jobs];
2414     if (mb_y == s->mb_height - 1)
2415         next_td = td;
2416     else
2417         next_td = &s->thread_data[(jobnr + 1) % num_jobs];
2418     if (s->mb_layout == 1)
2419         mb = s->macroblocks_base + ((s->mb_width + 1) * (mb_y + 1) + 1);
2420     else {
2421         // Make sure the previous frame has read its segmentation map,
2422         // if we re-use the same map.
2423         if (prev_frame && s->segmentation.enabled &&
2424             !s->segmentation.update_map)
2425             ff_thread_await_progress(&prev_frame->tf, mb_y, 0);
2426         mb = s->macroblocks + (s->mb_height - mb_y - 1) * 2;
2427         memset(mb - 1, 0, sizeof(*mb)); // zero left macroblock
2428         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED * 0x01010101);
2429     }
2430
2431     if (!is_vp7 || mb_y == 0)
2432         memset(td->left_nnz, 0, sizeof(td->left_nnz));
2433
2434     td->mv_bounds.mv_min.x = -MARGIN;
2435     td->mv_bounds.mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;
2436
2437     for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
2438         if (vpx_rac_is_end(&s->c))
2439             return AVERROR_INVALIDDATA;
2440         // Wait for previous thread to read mb_x+2, and reach mb_y-1.
2441         if (prev_td != td) {
2442             if (threadnr != 0) {
2443                 check_thread_pos(td, prev_td,
2444                                  mb_x + (is_vp7 ? 2 : 1),
2445                                  mb_y - (is_vp7 ? 2 : 1));
2446             } else {
2447                 check_thread_pos(td, prev_td,
2448                                  mb_x + (is_vp7 ? 2 : 1) + s->mb_width + 3,
2449                                  mb_y - (is_vp7 ? 2 : 1));
2450             }
2451         }
2452
2453         s->vdsp.prefetch(dst[0] + (mb_x & 3) * 4 * s->linesize + 64,
2454                          s->linesize, 4);
2455         s->vdsp.prefetch(dst[1] + (mb_x & 7) * s->uvlinesize + 64,
2456                          dst[2] - dst[1], 2);
2457
2458         if (!s->mb_layout)
2459             decode_mb_mode(s, &td->mv_bounds, mb, mb_x, mb_y, curframe->seg_map + mb_xy,
2460                            prev_frame && prev_frame->seg_map ?
2461                            prev_frame->seg_map + mb_xy : NULL, 0, is_vp7);
2462
2463         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP8_FRAME_PREVIOUS);
2464
2465         if (!mb->skip) {
2466             if (vpx_rac_is_end(coeff_c))
2467                 return AVERROR_INVALIDDATA;
2468             decode_mb_coeffs(s, td, coeff_c, mb, s->top_nnz[mb_x], td->left_nnz, is_vp7);
2469         }
2470
2471         if (mb->mode <= MODE_I4x4)
2472             intra_predict(s, td, dst, mb, mb_x, mb_y, is_vp7);
2473         else
2474             inter_predict(s, td, dst, mb, mb_x, mb_y);
2475
2476         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP8_FRAME_GOLDEN);
2477
2478         if (!mb->skip) {
2479             idct_mb(s, td, dst, mb);
2480         } else {
2481             AV_ZERO64(td->left_nnz);
2482             AV_WN64(s->top_nnz[mb_x], 0);   // array of 9, so unaligned
2483
2484             /* Reset DC block predictors if they would exist
2485              * if the mb had coefficients */
2486             if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
2487                 td->left_nnz[8]     = 0;
2488                 s->top_nnz[mb_x][8] = 0;
2489             }
2490         }
2491
2492         if (s->deblock_filter)
2493             filter_level_for_mb(s, mb, &td->filter_strength[mb_x], is_vp7);
2494
2495         if (s->deblock_filter && num_jobs != 1 && threadnr == num_jobs - 1) {
2496             if (s->filter.simple)
2497                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2498                                  NULL, NULL, s->linesize, 0, 1);
2499             else
2500                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2501                                  dst[1], dst[2], s->linesize, s->uvlinesize, 0);
2502         }
2503
2504         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP8_FRAME_ALTREF);
2505
2506         dst[0]      += 16;
2507         dst[1]      += 8;
2508         dst[2]      += 8;
2509         td->mv_bounds.mv_min.x -= 64;
2510         td->mv_bounds.mv_max.x -= 64;
2511
2512         if (mb_x == s->mb_width + 1) {
2513             update_pos(td, mb_y, s->mb_width + 3);
2514         } else {
2515             update_pos(td, mb_y, mb_x);
2516         }
2517     }
2518     return 0;
2519 }
2520
2521 static int vp7_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
2522                                         int jobnr, int threadnr)
2523 {
2524     return decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr, 1);
2525 }
2526
2527 static int vp8_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
2528                                         int jobnr, int threadnr)
2529 {
2530     return decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr, 0);
2531 }
2532
2533 static av_always_inline void filter_mb_row(AVCodecContext *avctx, void *tdata,
2534                               int jobnr, int threadnr, int is_vp7)
2535 {
2536     VP8Context *s = avctx->priv_data;
2537     VP8ThreadData *td = &s->thread_data[threadnr];
2538     int mb_x, mb_y = atomic_load(&td->thread_mb_pos) >> 16, num_jobs = s->num_jobs;
2539     AVFrame *curframe = s->curframe->tf.f;
2540     VP8Macroblock *mb;
2541     VP8ThreadData *prev_td, *next_td;
2542     uint8_t *dst[3] = {
2543         curframe->data[0] + 16 * mb_y * s->linesize,
2544         curframe->data[1] +  8 * mb_y * s->uvlinesize,
2545         curframe->data[2] +  8 * mb_y * s->uvlinesize
2546     };
2547
2548     if (s->mb_layout == 1)
2549         mb = s->macroblocks_base + ((s->mb_width + 1) * (mb_y + 1) + 1);
2550     else
2551         mb = s->macroblocks + (s->mb_height - mb_y - 1) * 2;
2552
2553     if (mb_y == 0)
2554         prev_td = td;
2555     else
2556         prev_td = &s->thread_data[(jobnr + num_jobs - 1) % num_jobs];
2557     if (mb_y == s->mb_height - 1)
2558         next_td = td;
2559     else
2560         next_td = &s->thread_data[(jobnr + 1) % num_jobs];
2561
2562     for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb++) {
2563         const VP8FilterStrength *f = &td->filter_strength[mb_x];
2564         if (prev_td != td)
2565             check_thread_pos(td, prev_td,
2566                              (mb_x + 1) + (s->mb_width + 3), mb_y - 1);
2567         if (next_td != td)
2568             if (next_td != &s->thread_data[0])
2569                 check_thread_pos(td, next_td, mb_x + 1, mb_y + 1);
2570
2571         if (num_jobs == 1) {
2572             if (s->filter.simple)
2573                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2574                                  NULL, NULL, s->linesize, 0, 1);
2575             else
2576                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2577                                  dst[1], dst[2], s->linesize, s->uvlinesize, 0);
2578         }
2579
2580         if (s->filter.simple)
2581             filter_mb_simple(s, dst[0], f, mb_x, mb_y);
2582         else
2583             filter_mb(s, dst, f, mb_x, mb_y, is_vp7);
2584         dst[0] += 16;
2585         dst[1] += 8;
2586         dst[2] += 8;
2587
2588         update_pos(td, mb_y, (s->mb_width + 3) + mb_x);
2589     }
2590 }
2591
2592 static void vp7_filter_mb_row(AVCodecContext *avctx, void *tdata,
2593                               int jobnr, int threadnr)
2594 {
2595     filter_mb_row(avctx, tdata, jobnr, threadnr, 1);
2596 }
2597
2598 static void vp8_filter_mb_row(AVCodecContext *avctx, void *tdata,
2599                               int jobnr, int threadnr)
2600 {
2601     filter_mb_row(avctx, tdata, jobnr, threadnr, 0);
2602 }
2603
2604 static av_always_inline
2605 int vp78_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata, int jobnr,
2606                               int threadnr, int is_vp7)
2607 {
2608     const VP8Context *s = avctx->priv_data;
2609     VP8ThreadData *td = &s->thread_data[jobnr];
2610     VP8ThreadData *next_td = NULL, *prev_td = NULL;
2611     VP8Frame *curframe = s->curframe;
2612     int mb_y, num_jobs = s->num_jobs;
2613     int ret;
2614
2615     td->thread_nr = threadnr;
2616     td->mv_bounds.mv_min.y   = -MARGIN - 64 * threadnr;
2617     td->mv_bounds.mv_max.y   = ((s->mb_height - 1) << 6) + MARGIN - 64 * threadnr;
2618     for (mb_y = jobnr; mb_y < s->mb_height; mb_y += num_jobs) {
2619         atomic_store(&td->thread_mb_pos, mb_y << 16);
2620         ret = s->decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr);
2621         if (ret < 0) {
2622             update_pos(td, s->mb_height, INT_MAX & 0xFFFF);
2623             return ret;
2624         }
2625         if (s->deblock_filter)
2626             s->filter_mb_row(avctx, tdata, jobnr, threadnr);
2627         update_pos(td, mb_y, INT_MAX & 0xFFFF);
2628
2629         td->mv_bounds.mv_min.y -= 64 * num_jobs;
2630         td->mv_bounds.mv_max.y -= 64 * num_jobs;
2631
2632         if (avctx->active_thread_type == FF_THREAD_FRAME)
2633             ff_thread_report_progress(&curframe->tf, mb_y, 0);
2634     }
2635
2636     return 0;
2637 }
2638
2639 static int vp7_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
2640                                     int jobnr, int threadnr)
2641 {
2642     return vp78_decode_mb_row_sliced(avctx, tdata, jobnr, threadnr, IS_VP7);
2643 }
2644
2645 static int vp8_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
2646                                     int jobnr, int threadnr)
2647 {
2648     return vp78_decode_mb_row_sliced(avctx, tdata, jobnr, threadnr, IS_VP8);
2649 }
2650
2651 static av_always_inline
2652 int vp78_decode_frame(AVCodecContext *avctx, AVFrame *rframe, int *got_frame,
2653                       const AVPacket *avpkt, int is_vp7)
2654 {
2655     VP8Context *s = avctx->priv_data;
2656     int ret, i, referenced, num_jobs;
2657     enum AVDiscard skip_thresh;
2658     VP8Frame *av_uninit(curframe), *prev_frame;
2659
2660     if (is_vp7)
2661         ret = vp7_decode_frame_header(s, avpkt->data, avpkt->size);
2662     else
2663         ret = vp8_decode_frame_header(s, avpkt->data, avpkt->size);
2664
2665     if (ret < 0)
2666         goto err;
2667
2668     if (s->actually_webp) {
2669         // avctx->pix_fmt already set in caller.
2670     } else if (!is_vp7 && s->pix_fmt == AV_PIX_FMT_NONE) {
2671         s->pix_fmt = get_pixel_format(s);
2672         if (s->pix_fmt < 0) {
2673             ret = AVERROR(EINVAL);
2674             goto err;
2675         }
2676         avctx->pix_fmt = s->pix_fmt;
2677     }
2678
2679     prev_frame = s->framep[VP8_FRAME_CURRENT];
2680
2681     referenced = s->update_last || s->update_golden == VP8_FRAME_CURRENT ||
2682                  s->update_altref == VP8_FRAME_CURRENT;
2683
2684     skip_thresh = !referenced ? AVDISCARD_NONREF
2685                               : !s->keyframe ? AVDISCARD_NONKEY
2686                                              : AVDISCARD_ALL;
2687
2688     if (avctx->skip_frame >= skip_thresh) {
2689         s->invisible = 1;
2690         memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
2691         goto skip_decode;
2692     }
2693     s->deblock_filter = s->filter.level && avctx->skip_loop_filter < skip_thresh;
2694
2695     // release no longer referenced frames
2696     for (i = 0; i < 5; i++)
2697         if (s->frames[i].tf.f->buf[0] &&
2698             &s->frames[i] != prev_frame &&
2699             &s->frames[i] != s->framep[VP8_FRAME_PREVIOUS] &&
2700             &s->frames[i] != s->framep[VP8_FRAME_GOLDEN]   &&
2701             &s->frames[i] != s->framep[VP8_FRAME_ALTREF])
2702             vp8_release_frame(&s->frames[i]);
2703
2704     curframe = s->framep[VP8_FRAME_CURRENT] = vp8_find_free_buffer(s);
2705
2706     if (!s->colorspace)
2707         avctx->colorspace = AVCOL_SPC_BT470BG;
2708     if (s->fullrange)
2709         avctx->color_range = AVCOL_RANGE_JPEG;
2710     else
2711         avctx->color_range = AVCOL_RANGE_MPEG;
2712
2713     /* Given that arithmetic probabilities are updated every frame, it's quite
2714      * likely that the values we have on a random interframe are complete
2715      * junk if we didn't start decode on a keyframe. So just don't display
2716      * anything rather than junk. */
2717     if (!s->keyframe && (!s->framep[VP8_FRAME_PREVIOUS] ||
2718                          !s->framep[VP8_FRAME_GOLDEN]   ||
2719                          !s->framep[VP8_FRAME_ALTREF])) {
2720         av_log(avctx, AV_LOG_WARNING,
2721                "Discarding interframe without a prior keyframe!\n");
2722         ret = AVERROR_INVALIDDATA;
2723         goto err;
2724     }
2725
2726     if (s->keyframe)
2727         curframe->tf.f->flags |= AV_FRAME_FLAG_KEY;
2728     else
2729         curframe->tf.f->flags &= ~AV_FRAME_FLAG_KEY;
2730     curframe->tf.f->pict_type = s->keyframe ? AV_PICTURE_TYPE_I
2731                                             : AV_PICTURE_TYPE_P;
2732     if ((ret = vp8_alloc_frame(s, curframe, referenced)) < 0)
2733         goto err;
2734
2735     // check if golden and altref are swapped
2736     if (s->update_altref != VP8_FRAME_NONE)
2737         s->next_framep[VP8_FRAME_ALTREF] = s->framep[s->update_altref];
2738     else
2739         s->next_framep[VP8_FRAME_ALTREF] = s->framep[VP8_FRAME_ALTREF];
2740
2741     if (s->update_golden != VP8_FRAME_NONE)
2742         s->next_framep[VP8_FRAME_GOLDEN] = s->framep[s->update_golden];
2743     else
2744         s->next_framep[VP8_FRAME_GOLDEN] = s->framep[VP8_FRAME_GOLDEN];
2745
2746     if (s->update_last)
2747         s->next_framep[VP8_FRAME_PREVIOUS] = curframe;
2748     else
2749         s->next_framep[VP8_FRAME_PREVIOUS] = s->framep[VP8_FRAME_PREVIOUS];
2750
2751     s->next_framep[VP8_FRAME_CURRENT] = curframe;
2752
2753     if (ffcodec(avctx->codec)->update_thread_context)
2754         ff_thread_finish_setup(avctx);
2755
2756     if (avctx->hwaccel) {
2757         const FFHWAccel *hwaccel = ffhwaccel(avctx->hwaccel);
2758         ret = hwaccel->start_frame(avctx, avpkt->data, avpkt->size);
2759         if (ret < 0)
2760             goto err;
2761
2762         ret = hwaccel->decode_slice(avctx, avpkt->data, avpkt->size);
2763         if (ret < 0)
2764             goto err;
2765
2766         ret = hwaccel->end_frame(avctx);
2767         if (ret < 0)
2768             goto err;
2769
2770     } else {
2771         s->linesize   = curframe->tf.f->linesize[0];
2772         s->uvlinesize = curframe->tf.f->linesize[1];
2773
2774         memset(s->top_nnz, 0, s->mb_width * sizeof(*s->top_nnz));
2775         /* Zero macroblock structures for top/top-left prediction
2776          * from outside the frame. */
2777         if (!s->mb_layout)
2778             memset(s->macroblocks + s->mb_height * 2 - 1, 0,
2779                    (s->mb_width + 1) * sizeof(*s->macroblocks));
2780         if (!s->mb_layout && s->keyframe)
2781             memset(s->intra4x4_pred_mode_top, DC_PRED, s->mb_width * 4);
2782
2783         memset(s->ref_count, 0, sizeof(s->ref_count));
2784
2785         if (s->mb_layout == 1) {
2786             // Make sure the previous frame has read its segmentation map,
2787             // if we re-use the same map.
2788             if (prev_frame && s->segmentation.enabled &&
2789                 !s->segmentation.update_map)
2790                 ff_thread_await_progress(&prev_frame->tf, 1, 0);
2791             if (is_vp7)
2792                 ret = vp7_decode_mv_mb_modes(avctx, curframe, prev_frame);
2793             else
2794                 ret = vp8_decode_mv_mb_modes(avctx, curframe, prev_frame);
2795             if (ret < 0)
2796                 goto err;
2797         }
2798
2799         if (avctx->active_thread_type == FF_THREAD_FRAME)
2800             num_jobs = 1;
2801         else
2802             num_jobs = FFMIN(s->num_coeff_partitions, avctx->thread_count);
2803         s->num_jobs   = num_jobs;
2804         s->curframe   = curframe;
2805         s->prev_frame = prev_frame;
2806         s->mv_bounds.mv_min.y   = -MARGIN;
2807         s->mv_bounds.mv_max.y   = ((s->mb_height - 1) << 6) + MARGIN;
2808         for (i = 0; i < MAX_THREADS; i++) {
2809             VP8ThreadData *td = &s->thread_data[i];
2810             atomic_init(&td->thread_mb_pos, 0);
2811             atomic_init(&td->wait_mb_pos, INT_MAX);
2812         }
2813         if (is_vp7)
2814             avctx->execute2(avctx, vp7_decode_mb_row_sliced, s->thread_data, NULL,
2815                             num_jobs);
2816         else
2817             avctx->execute2(avctx, vp8_decode_mb_row_sliced, s->thread_data, NULL,
2818                             num_jobs);
2819     }
2820
2821     ff_thread_report_progress(&curframe->tf, INT_MAX, 0);
2822     memcpy(&s->framep[0], &s->next_framep[0], sizeof(s->framep[0]) * 4);
2823
2824 skip_decode:
2825     // if future frames don't use the updated probabilities,
2826     // reset them to the values we saved
2827     if (!s->update_probabilities)
2828         s->prob[0] = s->prob[1];
2829
2830     if (!s->invisible) {
2831         if ((ret = av_frame_ref(rframe, curframe->tf.f)) < 0)
2832             return ret;
2833         *got_frame = 1;
2834     }
2835
2836     return avpkt->size;
2837 err:
2838     memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
2839     return ret;
2840 }
2841
2842 int ff_vp8_decode_frame(AVCodecContext *avctx, AVFrame *frame,
2843                         int *got_frame, AVPacket *avpkt)
2844 {
2845     return vp78_decode_frame(avctx, frame, got_frame, avpkt, IS_VP8);
2846 }
2847
#if CONFIG_VP7_DECODER
/** VP7 decode callback: runs vp78_decode_frame() with IS_VP7 and returns its result. */
static int vp7_decode_frame(AVCodecContext *avctx, AVFrame *frame,
                            int *got_frame, AVPacket *avpkt)
{
    const int is_vp7 = IS_VP7;

    return vp78_decode_frame(avctx, frame, got_frame, avpkt, is_vp7);
}
#endif /* CONFIG_VP7_DECODER */
2855
2856 av_cold int ff_vp8_decode_free(AVCodecContext *avctx)
2857 {
2858     VP8Context *s = avctx->priv_data;
2859     int i;
2860
2861     vp8_decode_flush_impl(avctx, 1);
2862     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
2863         av_frame_free(&s->frames[i].tf.f);
2864
2865     return 0;
2866 }
2867
2868 static av_cold int vp8_init_frames(VP8Context *s)
2869 {
2870     int i;
2871     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++) {
2872         s->frames[i].tf.f = av_frame_alloc();
2873         if (!s->frames[i].tf.f)
2874             return AVERROR(ENOMEM);
2875     }
2876     return 0;
2877 }
2878
2879 static av_always_inline
2880 int vp78_decode_init(AVCodecContext *avctx, int is_vp7)
2881 {
2882     VP8Context *s = avctx->priv_data;
2883     int ret;
2884
2885     s->avctx = avctx;
2886     s->vp7   = avctx->codec->id == AV_CODEC_ID_VP7;
2887     s->pix_fmt = AV_PIX_FMT_NONE;
2888     avctx->pix_fmt = AV_PIX_FMT_YUV420P;
2889
2890     ff_videodsp_init(&s->vdsp, 8);
2891
2892     ff_vp78dsp_init(&s->vp8dsp);
2893     if (CONFIG_VP7_DECODER && is_vp7) {
2894         ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP7, 8, 1);
2895         ff_vp7dsp_init(&s->vp8dsp);
2896         s->decode_mb_row_no_filter = vp7_decode_mb_row_no_filter;
2897         s->filter_mb_row           = vp7_filter_mb_row;
2898     } else if (CONFIG_VP8_DECODER && !is_vp7) {
2899         ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP8, 8, 1);
2900         ff_vp8dsp_init(&s->vp8dsp);
2901         s->decode_mb_row_no_filter = vp8_decode_mb_row_no_filter;
2902         s->filter_mb_row           = vp8_filter_mb_row;
2903     }
2904
2905     /* does not change for VP8 */
2906     memcpy(s->prob[0].scan, ff_zigzag_scan, sizeof(s->prob[0].scan));
2907
2908     if ((ret = vp8_init_frames(s)) < 0) {
2909         ff_vp8_decode_free(avctx);
2910         return ret;
2911     }
2912
2913     return 0;
2914 }
2915
#if CONFIG_VP7_DECODER
/** VP7 init callback: runs vp78_decode_init() with IS_VP7 and returns its result. */
static int vp7_decode_init(AVCodecContext *avctx)
{
    const int is_vp7 = IS_VP7;

    return vp78_decode_init(avctx, is_vp7);
}
#endif /* CONFIG_VP7_DECODER */
2922
2923 av_cold int ff_vp8_decode_init(AVCodecContext *avctx)
2924 {
2925     return vp78_decode_init(avctx, IS_VP8);
2926 }
2927
#if CONFIG_VP8_DECODER
#if HAVE_THREADS
/* Translate a pointer into s_src->frames[] into the pointer to the entry
 * with the same index in s->frames[]; NULL stays NULL. Reads `s` and
 * `s_src` from the enclosing scope. */
#define REBASE(pic) ((pic) ? &s->frames[(pic) - &s_src->frames[0]] : NULL)

/**
 * Frame-threading callback: bring the destination thread's context in line
 * with the source thread's state.
 *
 * Copies the pixel format, probabilities, segmentation, loop filter deltas
 * and sign biases, takes a reference on every allocated source frame, and
 * points framep[] at the local counterparts of the source's next_framep[].
 * If the macroblock dimensions differ and buffers exist, they are freed and
 * the new dimensions adopted.
 *
 * @return 0 on success, or the negative error from vp8_ref_frame()
 */
static int vp8_decode_update_thread_context(AVCodecContext *dst,
                                            const AVCodecContext *src)
{
    VP8Context *s = dst->priv_data, *s_src = src->priv_data;
    int i;

    if (s->macroblocks_base &&
        (s_src->mb_width != s->mb_width || s_src->mb_height != s->mb_height)) {
        free_buffers(s);
        s->mb_width  = s_src->mb_width;
        s->mb_height = s_src->mb_height;
    }

    s->pix_fmt      = s_src->pix_fmt;
    /* Take prob[1] when the source frame did not update the probabilities,
     * prob[0] otherwise. */
    s->prob[0]      = s_src->prob[!s_src->update_probabilities];
    s->segmentation = s_src->segmentation;
    s->lf_delta     = s_src->lf_delta;
    memcpy(s->sign_bias, s_src->sign_bias, sizeof(s->sign_bias));

    for (i = 0; i < FF_ARRAY_ELEMS(s_src->frames); i++) {
        int ret;

        if (!s_src->frames[i].tf.f->buf[0])
            continue;

        ret = vp8_ref_frame(&s->frames[i], &s_src->frames[i]);
        if (ret < 0)
            return ret;
    }

    for (i = 0; i < 4; i++)
        s->framep[i] = REBASE(s_src->next_framep[i]);

    return 0;
}
#endif /* HAVE_THREADS */
#endif /* CONFIG_VP8_DECODER */
2968
#if CONFIG_VP7_DECODER
/* Codec table entry for the VP7 decoder. It shares the private context,
 * close and flush callbacks with VP8. */
const FFCodec ff_vp7_decoder = {
    /* identification */
    .p.name         = "vp7",
    CODEC_LONG_NAME("On2 VP7"),
    .p.type         = AVMEDIA_TYPE_VIDEO,
    .p.id           = AV_CODEC_ID_VP7,
    .p.capabilities = AV_CODEC_CAP_DR1,
    /* private state and callbacks */
    .priv_data_size = sizeof(VP8Context),
    .init           = vp7_decode_init,
    FF_CODEC_DECODE_CB(vp7_decode_frame),
    .flush          = vp8_decode_flush,
    .close          = ff_vp8_decode_free,
};
#endif /* CONFIG_VP7_DECODER */
2983
#if CONFIG_VP8_DECODER
/* Codec table entry for the VP8 decoder. It advertises frame and slice
 * threading and lists the hardware configurations that were compiled in. */
const FFCodec ff_vp8_decoder = {
    .p.name                = "vp8",
    CODEC_LONG_NAME("On2 VP8"),
    .p.type                = AVMEDIA_TYPE_VIDEO,
    .p.id                  = AV_CODEC_ID_VP8,
    .priv_data_size        = sizeof(VP8Context),
    .init                  = ff_vp8_decode_init,
    .close                 = ff_vp8_decode_free,
    FF_CODEC_DECODE_CB(ff_vp8_decode_frame),
    .p.capabilities        = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS |
                             AV_CODEC_CAP_SLICE_THREADS,
    .caps_internal         = FF_CODEC_CAP_ALLOCATE_PROGRESS,
    .flush                 = vp8_decode_flush,
    UPDATE_THREAD_CONTEXT(vp8_decode_update_thread_context),
    /* NULL-terminated list of hardware configurations. */
    .hw_configs            = (const AVCodecHWConfigInternal *const []) {
#if CONFIG_VP8_VAAPI_HWACCEL
                               HWACCEL_VAAPI(vp8),
#endif
#if CONFIG_VP8_NVDEC_HWACCEL
                               HWACCEL_NVDEC(vp8),
#endif
                               NULL
                           },
};
#endif /* CONFIG_VP8_DECODER */