libavcodec/vp8.c

   1 /*
   2  * VP7/VP8 compatible video decoder
   3  *
   4  * Copyright (C) 2010 David Conrad
   5  * Copyright (C) 2010 Ronald S. Bultje
   6  * Copyright (C) 2010 Fiona Glaser
   7  * Copyright (C) 2012 Daniel Kang
   8  * Copyright (C) 2014 Peter Ross
   9  *
  10  * This file is part of FFmpeg.
  11  *
  12  * FFmpeg is free software; you can redistribute it and/or
  13  * modify it under the terms of the GNU Lesser General Public
  14  * License as published by the Free Software Foundation; either
  15  * version 2.1 of the License, or (at your option) any later version.
  16  *
  17  * FFmpeg is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  20  * Lesser General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU Lesser General Public
  23  * License along with FFmpeg; if not, write to the Free Software
  24  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  25  */
  26
  27 #include "config_components.h"
  28
  29 #include "libavutil/imgutils.h"
  30 #include "libavutil/mem_internal.h"
  31
  32 #include "avcodec.h"
  33 #include "codec_internal.h"
  34 #include "hwconfig.h"
  35 #include "internal.h"
  36 #include "mathops.h"
  37 #include "rectangle.h"
  38 #include "thread.h"
  39 #include "threadframe.h"
  40 #include "vp8.h"
  41 #include "vp8data.h"
  42
  43 #if ARCH_ARM
  44 #   include "arm/vp8.h"
  45 #endif
  46
  47 #if CONFIG_VP7_DECODER && CONFIG_VP8_DECODER
  48 #define VPX(vp7, f) (vp7 ? vp7_ ## f : vp8_ ## f)
  49 #elif CONFIG_VP7_DECODER
  50 #define VPX(vp7, f) vp7_ ## f
  51 #else // CONFIG_VP8_DECODER
  52 #define VPX(vp7, f) vp8_ ## f
  53 #endif
  54
  55 static void free_buffers(VP8Context *s)
  56 {
  57     int i;
  58     if (s->thread_data)
  59         for (i = 0; i < MAX_THREADS; i++) {
  60 #if HAVE_THREADS
  61             pthread_cond_destroy(&s->thread_data[i].cond);
  62             pthread_mutex_destroy(&s->thread_data[i].lock);
  63 #endif
  64             av_freep(&s->thread_data[i].filter_strength);
  65         }
  66     av_freep(&s->thread_data);
  67     av_freep(&s->macroblocks_base);
  68     av_freep(&s->intra4x4_pred_mode_top);
  69     av_freep(&s->top_nnz);
  70     av_freep(&s->top_border);
  71
  72     s->macroblocks = NULL;
  73 }
  74
  75 static int vp8_alloc_frame(VP8Context *s, VP8Frame *f, int ref)
  76 {
  77     int ret;
  78     if ((ret = ff_thread_get_ext_buffer(s->avctx, &f->tf,
  79                                         ref ? AV_GET_BUFFER_FLAG_REF : 0)) < 0)
  80         return ret;
  81     if (!(f->seg_map = av_buffer_allocz(s->mb_width * s->mb_height)))
  82         goto fail;
  83     if (s->avctx->hwaccel) {
  84         const AVHWAccel *hwaccel = s->avctx->hwaccel;
  85         if (hwaccel->frame_priv_data_size) {
  86             f->hwaccel_priv_buf = av_buffer_allocz(hwaccel->frame_priv_data_size);
  87             if (!f->hwaccel_priv_buf)
  88                 goto fail;
  89             f->hwaccel_picture_private = f->hwaccel_priv_buf->data;
  90         }
  91     }
  92     return 0;
  93
  94 fail:
  95     av_buffer_unref(&f->seg_map);
  96     ff_thread_release_ext_buffer(s->avctx, &f->tf);
  97     return AVERROR(ENOMEM);
  98 }
  99
 100 static void vp8_release_frame(VP8Context *s, VP8Frame *f)
 101 {
 102     av_buffer_unref(&f->seg_map);
 103     av_buffer_unref(&f->hwaccel_priv_buf);
 104     f->hwaccel_picture_private = NULL;
 105     ff_thread_release_ext_buffer(s->avctx, &f->tf);
 106 }
 107
 108 #if CONFIG_VP8_DECODER
 109 static int vp8_ref_frame(VP8Context *s, VP8Frame *dst, VP8Frame *src)
 110 {
 111     int ret;
 112
 113     vp8_release_frame(s, dst);
 114
 115     if ((ret = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0)
 116         return ret;
 117     if (src->seg_map &&
 118         !(dst->seg_map = av_buffer_ref(src->seg_map))) {
 119         vp8_release_frame(s, dst);
 120         return AVERROR(ENOMEM);
 121     }
 122     if (src->hwaccel_picture_private) {
 123         dst->hwaccel_priv_buf = av_buffer_ref(src->hwaccel_priv_buf);
 124         if (!dst->hwaccel_priv_buf)
 125             return AVERROR(ENOMEM);
 126         dst->hwaccel_picture_private = dst->hwaccel_priv_buf->data;
 127     }
 128
 129     return 0;
 130 }
 131 #endif /* CONFIG_VP8_DECODER */
 132
 133 static void vp8_decode_flush_impl(AVCodecContext *avctx, int free_mem)
 134 {
 135     VP8Context *s = avctx->priv_data;
 136     int i;
 137
 138     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
 139         vp8_release_frame(s, &s->frames[i]);
 140     memset(s->framep, 0, sizeof(s->framep));
 141
 142     if (free_mem)
 143         free_buffers(s);
 144 }
 145
 146 static void vp8_decode_flush(AVCodecContext *avctx)
 147 {
 148     vp8_decode_flush_impl(avctx, 0);
 149 }
 150
 151 static VP8Frame *vp8_find_free_buffer(VP8Context *s)
 152 {
 153     VP8Frame *frame = NULL;
 154     int i;
 155
 156     // find a free buffer
 157     for (i = 0; i < 5; i++)
 158         if (&s->frames[i] != s->framep[VP56_FRAME_CURRENT]  &&
 159             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
 160             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN]   &&
 161             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2]) {
 162             frame = &s->frames[i];
 163             break;
 164         }
 165     if (i == 5) {
 166         av_log(s->avctx, AV_LOG_FATAL, "Ran out of free frames!\n");
 167         abort();
 168     }
 169     if (frame->tf.f->buf[0])
 170         vp8_release_frame(s, frame);
 171
 172     return frame;
 173 }
 174
 175 static enum AVPixelFormat get_pixel_format(VP8Context *s)
 176 {
 177     enum AVPixelFormat pix_fmts[] = {
 178 #if CONFIG_VP8_VAAPI_HWACCEL
 179         AV_PIX_FMT_VAAPI,
 180 #endif
 181 #if CONFIG_VP8_NVDEC_HWACCEL
 182         AV_PIX_FMT_CUDA,
 183 #endif
 184         AV_PIX_FMT_YUV420P,
 185         AV_PIX_FMT_NONE,
 186     };
 187
 188     return ff_get_format(s->avctx, pix_fmts);
 189 }
 190
 191 static av_always_inline
 192 int update_dimensions(VP8Context *s, int width, int height, int is_vp7)
 193 {
 194     AVCodecContext *avctx = s->avctx;
 195     int i, ret, dim_reset = 0;
 196
 197     if (width  != s->avctx->width || ((width+15)/16 != s->mb_width || (height+15)/16 != s->mb_height) && s->macroblocks_base ||
 198         height != s->avctx->height) {
 199         vp8_decode_flush_impl(s->avctx, 1);
 200
 201         ret = ff_set_dimensions(s->avctx, width, height);
 202         if (ret < 0)
 203             return ret;
 204
 205         dim_reset = (s->macroblocks_base != NULL);
 206     }
 207
 208     if ((s->pix_fmt == AV_PIX_FMT_NONE || dim_reset) &&
 209          !s->actually_webp && !is_vp7) {
 210         s->pix_fmt = get_pixel_format(s);
 211         if (s->pix_fmt < 0)
 212             return AVERROR(EINVAL);
 213         avctx->pix_fmt = s->pix_fmt;
 214     }
 215
 216     s->mb_width  = (s->avctx->coded_width  + 15) / 16;
 217     s->mb_height = (s->avctx->coded_height + 15) / 16;
 218
 219     s->mb_layout = is_vp7 || avctx->active_thread_type == FF_THREAD_SLICE &&
 220                    avctx->thread_count > 1;
 221     if (!s->mb_layout) { // Frame threading and one thread
 222         s->macroblocks_base       = av_mallocz((s->mb_width + s->mb_height * 2 + 1) *
 223                                                sizeof(*s->macroblocks));
 224         s->intra4x4_pred_mode_top = av_mallocz(s->mb_width * 4);
 225     } else // Sliced threading
 226         s->macroblocks_base = av_mallocz((s->mb_width + 2) * (s->mb_height + 2) *
 227                                          sizeof(*s->macroblocks));
 228     s->top_nnz     = av_mallocz(s->mb_width * sizeof(*s->top_nnz));
 229     s->top_border  = av_mallocz((s->mb_width + 1) * sizeof(*s->top_border));
 230     s->thread_data = av_mallocz(MAX_THREADS * sizeof(VP8ThreadData));
 231
 232     if (!s->macroblocks_base || !s->top_nnz || !s->top_border ||
 233         !s->thread_data || (!s->intra4x4_pred_mode_top && !s->mb_layout)) {
 234         free_buffers(s);
 235         return AVERROR(ENOMEM);
 236     }
 237
 238     for (i = 0; i < MAX_THREADS; i++) {
 239         s->thread_data[i].filter_strength =
 240             av_mallocz(s->mb_width * sizeof(*s->thread_data[0].filter_strength));
 241         if (!s->thread_data[i].filter_strength) {
 242             free_buffers(s);
 243             return AVERROR(ENOMEM);
 244         }
 245 #if HAVE_THREADS
 246         pthread_mutex_init(&s->thread_data[i].lock, NULL);
 247         pthread_cond_init(&s->thread_data[i].cond, NULL);
 248 #endif
 249     }
 250
 251     s->macroblocks = s->macroblocks_base + 1;
 252
 253     return 0;
 254 }
 255
 256 static int vp7_update_dimensions(VP8Context *s, int width, int height)
 257 {
 258     return update_dimensions(s, width, height, IS_VP7);
 259 }
 260
 261 static int vp8_update_dimensions(VP8Context *s, int width, int height)
 262 {
 263     return update_dimensions(s, width, height, IS_VP8);
 264 }
 265
 266
 267 static void parse_segment_info(VP8Context *s)
 268 {
 269     VP56RangeCoder *c = &s->c;
 270     int i;
 271
 272     s->segmentation.update_map = vp8_rac_get(c);
 273     s->segmentation.update_feature_data = vp8_rac_get(c);
 274
 275     if (s->segmentation.update_feature_data) {
 276         s->segmentation.absolute_vals = vp8_rac_get(c);
 277
 278         for (i = 0; i < 4; i++)
 279             s->segmentation.base_quant[i]   = vp8_rac_get_sint(c, 7);
 280
 281         for (i = 0; i < 4; i++)
 282             s->segmentation.filter_level[i] = vp8_rac_get_sint(c, 6);
 283     }
 284     if (s->segmentation.update_map)
 285         for (i = 0; i < 3; i++)
 286             s->prob->segmentid[i] = vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
 287 }
 288
 289 static void update_lf_deltas(VP8Context *s)
 290 {
 291     VP56RangeCoder *c = &s->c;
 292     int i;
 293
 294     for (i = 0; i < 4; i++) {
 295         if (vp8_rac_get(c)) {
 296             s->lf_delta.ref[i] = vp8_rac_get_uint(c, 6);
 297
 298             if (vp8_rac_get(c))
 299                 s->lf_delta.ref[i] = -s->lf_delta.ref[i];
 300         }
 301     }
 302
 303     for (i = MODE_I4x4; i <= VP8_MVMODE_SPLIT; i++) {
 304         if (vp8_rac_get(c)) {
 305             s->lf_delta.mode[i] = vp8_rac_get_uint(c, 6);
 306
 307             if (vp8_rac_get(c))
 308                 s->lf_delta.mode[i] = -s->lf_delta.mode[i];
 309         }
 310     }
 311 }
 312
 313 static int setup_partitions(VP8Context *s, const uint8_t *buf, int buf_size)
 314 {
 315     const uint8_t *sizes = buf;
 316     int i;
 317     int ret;
 318
 319     s->num_coeff_partitions = 1 << vp8_rac_get_uint(&s->c, 2);
 320
 321     buf      += 3 * (s->num_coeff_partitions - 1);
 322     buf_size -= 3 * (s->num_coeff_partitions - 1);
 323     if (buf_size < 0)
 324         return -1;
 325
 326     for (i = 0; i < s->num_coeff_partitions - 1; i++) {
 327         int size = AV_RL24(sizes + 3 * i);
 328         if (buf_size - size < 0)
 329             return -1;
 330         s->coeff_partition_size[i] = size;
 331
 332         ret = ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, size);
 333         if (ret < 0)
 334             return ret;
 335         buf      += size;
 336         buf_size -= size;
 337     }
 338
 339     s->coeff_partition_size[i] = buf_size;
 340     ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, buf_size);
 341
 342     return 0;
 343 }
 344
 345 static void vp7_get_quants(VP8Context *s)
 346 {
 347     VP56RangeCoder *c = &s->c;
 348
 349     int yac_qi  = vp8_rac_get_uint(c, 7);
 350     int ydc_qi  = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 351     int y2dc_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 352     int y2ac_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 353     int uvdc_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 354     int uvac_qi = vp8_rac_get(c) ? vp8_rac_get_uint(c, 7) : yac_qi;
 355
 356     s->qmat[0].luma_qmul[0]    =       vp7_ydc_qlookup[ydc_qi];
 357     s->qmat[0].luma_qmul[1]    =       vp7_yac_qlookup[yac_qi];
 358     s->qmat[0].luma_dc_qmul[0] =       vp7_y2dc_qlookup[y2dc_qi];
 359     s->qmat[0].luma_dc_qmul[1] =       vp7_y2ac_qlookup[y2ac_qi];
 360     s->qmat[0].chroma_qmul[0]  = FFMIN(vp7_ydc_qlookup[uvdc_qi], 132);
 361     s->qmat[0].chroma_qmul[1]  =       vp7_yac_qlookup[uvac_qi];
 362 }
 363
 364 static void vp8_get_quants(VP8Context *s)
 365 {
 366     VP56RangeCoder *c = &s->c;
 367     int i, base_qi;
 368
 369     s->quant.yac_qi     = vp8_rac_get_uint(c, 7);
 370     s->quant.ydc_delta  = vp8_rac_get_sint(c, 4);
 371     s->quant.y2dc_delta = vp8_rac_get_sint(c, 4);
 372     s->quant.y2ac_delta = vp8_rac_get_sint(c, 4);
 373     s->quant.uvdc_delta = vp8_rac_get_sint(c, 4);
 374     s->quant.uvac_delta = vp8_rac_get_sint(c, 4);
 375
 376     for (i = 0; i < 4; i++) {
 377         if (s->segmentation.enabled) {
 378             base_qi = s->segmentation.base_quant[i];
 379             if (!s->segmentation.absolute_vals)
 380                 base_qi += s->quant.yac_qi;
 381         } else
 382             base_qi = s->quant.yac_qi;
 383
 384         s->qmat[i].luma_qmul[0]    = vp8_dc_qlookup[av_clip_uintp2(base_qi + s->quant.ydc_delta,  7)];
 385         s->qmat[i].luma_qmul[1]    = vp8_ac_qlookup[av_clip_uintp2(base_qi,              7)];
 386         s->qmat[i].luma_dc_qmul[0] = vp8_dc_qlookup[av_clip_uintp2(base_qi + s->quant.y2dc_delta, 7)] * 2;
 387         /* 101581>>16 is equivalent to 155/100 */
 388         s->qmat[i].luma_dc_qmul[1] = vp8_ac_qlookup[av_clip_uintp2(base_qi + s->quant.y2ac_delta, 7)] * 101581 >> 16;
 389         s->qmat[i].chroma_qmul[0]  = vp8_dc_qlookup[av_clip_uintp2(base_qi + s->quant.uvdc_delta, 7)];
 390         s->qmat[i].chroma_qmul[1]  = vp8_ac_qlookup[av_clip_uintp2(base_qi + s->quant.uvac_delta, 7)];
 391
 392         s->qmat[i].luma_dc_qmul[1] = FFMAX(s->qmat[i].luma_dc_qmul[1], 8);
 393         s->qmat[i].chroma_qmul[0]  = FFMIN(s->qmat[i].chroma_qmul[0], 132);
 394     }
 395 }
 396
 397 /**
 398  * Determine which buffers golden and altref should be updated with after this frame.
 399  * The spec isn't clear here, so I'm going by my understanding of what libvpx does
 400  *
 401  * Intra frames update all 3 references
 402  * Inter frames update VP56_FRAME_PREVIOUS if the update_last flag is set
 403  * If the update (golden|altref) flag is set, it's updated with the current frame
 404  *      if update_last is set, and VP56_FRAME_PREVIOUS otherwise.
 405  * If the flag is not set, the number read means:
 406  *      0: no update
 407  *      1: VP56_FRAME_PREVIOUS
 408  *      2: update golden with altref, or update altref with golden
 409  */
 410 static VP56Frame ref_to_update(VP8Context *s, int update, VP56Frame ref)
 411 {
 412     VP56RangeCoder *c = &s->c;
 413
 414     if (update)
 415         return VP56_FRAME_CURRENT;
 416
 417     switch (vp8_rac_get_uint(c, 2)) {
 418     case 1:
 419         return VP56_FRAME_PREVIOUS;
 420     case 2:
 421         return (ref == VP56_FRAME_GOLDEN) ? VP56_FRAME_GOLDEN2 : VP56_FRAME_GOLDEN;
 422     }
 423     return VP56_FRAME_NONE;
 424 }
 425
 426 static void vp78_reset_probability_tables(VP8Context *s)
 427 {
 428     int i, j;
 429     for (i = 0; i < 4; i++)
 430         for (j = 0; j < 16; j++)
 431             memcpy(s->prob->token[i][j], vp8_token_default_probs[i][vp8_coeff_band[j]],
 432                    sizeof(s->prob->token[i][j]));
 433 }
 434
 435 static void vp78_update_probability_tables(VP8Context *s)
 436 {
 437     VP56RangeCoder *c = &s->c;
 438     int i, j, k, l, m;
 439
 440     for (i = 0; i < 4; i++)
 441         for (j = 0; j < 8; j++)
 442             for (k = 0; k < 3; k++)
 443                 for (l = 0; l < NUM_DCT_TOKENS-1; l++)
 444                     if (vp56_rac_get_prob_branchy(c, vp8_token_update_probs[i][j][k][l])) {
 445                         int prob = vp8_rac_get_uint(c, 8);
 446                         for (m = 0; vp8_coeff_band_indexes[j][m] >= 0; m++)
 447                             s->prob->token[i][vp8_coeff_band_indexes[j][m]][k][l] = prob;
 448                     }
 449 }
 450
 451 #define VP7_MVC_SIZE 17
 452 #define VP8_MVC_SIZE 19
 453
 454 static void vp78_update_pred16x16_pred8x8_mvc_probabilities(VP8Context *s,
 455                                                             int mvc_size)
 456 {
 457     VP56RangeCoder *c = &s->c;
 458     int i, j;
 459
 460     if (vp8_rac_get(c))
 461         for (i = 0; i < 4; i++)
 462             s->prob->pred16x16[i] = vp8_rac_get_uint(c, 8);
 463     if (vp8_rac_get(c))
 464         for (i = 0; i < 3; i++)
 465             s->prob->pred8x8c[i]  = vp8_rac_get_uint(c, 8);
 466
 467     // 17.2 MV probability update
 468     for (i = 0; i < 2; i++)
 469         for (j = 0; j < mvc_size; j++)
 470             if (vp56_rac_get_prob_branchy(c, vp8_mv_update_prob[i][j]))
 471                 s->prob->mvc[i][j] = vp8_rac_get_nn(c);
 472 }
 473
 474 static void update_refs(VP8Context *s)
 475 {
 476     VP56RangeCoder *c = &s->c;
 477
 478     int update_golden = vp8_rac_get(c);
 479     int update_altref = vp8_rac_get(c);
 480
 481     s->update_golden = ref_to_update(s, update_golden, VP56_FRAME_GOLDEN);
 482     s->update_altref = ref_to_update(s, update_altref, VP56_FRAME_GOLDEN2);
 483 }
 484
 485 static void copy_chroma(AVFrame *dst, AVFrame *src, int width, int height)
 486 {
 487     int i, j;
 488
 489     for (j = 1; j < 3; j++) {
 490         for (i = 0; i < height / 2; i++)
 491             memcpy(dst->data[j] + i * dst->linesize[j],
 492                    src->data[j] + i * src->linesize[j], width / 2);
 493     }
 494 }
 495
 496 static void fade(uint8_t *dst, ptrdiff_t dst_linesize,
 497                  const uint8_t *src, ptrdiff_t src_linesize,
 498                  int width, int height,
 499                  int alpha, int beta)
 500 {
 501     int i, j;
 502     for (j = 0; j < height; j++) {
 503         const uint8_t *src2 = src + j * src_linesize;
 504         uint8_t *dst2 = dst + j * dst_linesize;
 505         for (i = 0; i < width; i++) {
 506             uint8_t y = src2[i];
 507             dst2[i] = av_clip_uint8(y + ((y * beta) >> 8) + alpha);
 508         }
 509     }
 510 }
 511
 512 static int vp7_fade_frame(VP8Context *s, int alpha, int beta)
 513 {
 514     int ret;
 515
 516     if (!s->keyframe && (alpha || beta)) {
 517         int width  = s->mb_width * 16;
 518         int height = s->mb_height * 16;
 519         AVFrame *src, *dst;
 520
 521         if (!s->framep[VP56_FRAME_PREVIOUS] ||
 522             !s->framep[VP56_FRAME_GOLDEN]) {
 523             av_log(s->avctx, AV_LOG_WARNING, "Discarding interframe without a prior keyframe!\n");
 524             return AVERROR_INVALIDDATA;
 525         }
 526
 527         dst =
 528         src = s->framep[VP56_FRAME_PREVIOUS]->tf.f;
 529
 530         /* preserve the golden frame, write a new previous frame */
 531         if (s->framep[VP56_FRAME_GOLDEN] == s->framep[VP56_FRAME_PREVIOUS]) {
 532             s->framep[VP56_FRAME_PREVIOUS] = vp8_find_free_buffer(s);
 533             if ((ret = vp8_alloc_frame(s, s->framep[VP56_FRAME_PREVIOUS], 1)) < 0)
 534                 return ret;
 535
 536             dst = s->framep[VP56_FRAME_PREVIOUS]->tf.f;
 537
 538             copy_chroma(dst, src, width, height);
 539         }
 540
 541         fade(dst->data[0], dst->linesize[0],
 542              src->data[0], src->linesize[0],
 543              width, height, alpha, beta);
 544     }
 545
 546     return 0;
 547 }
 548
 549 static int vp7_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
 550 {
 551     VP56RangeCoder *c = &s->c;
 552     int part1_size, hscale, vscale, i, j, ret;
 553     int width  = s->avctx->width;
 554     int height = s->avctx->height;
 555     int alpha = 0;
 556     int beta  = 0;
 557
 558     if (buf_size < 4) {
 559         return AVERROR_INVALIDDATA;
 560     }
 561
 562     s->profile = (buf[0] >> 1) & 7;
 563     if (s->profile > 1) {
 564         avpriv_request_sample(s->avctx, "Unknown profile %d", s->profile);
 565         return AVERROR_INVALIDDATA;
 566     }
 567
 568     s->keyframe  = !(buf[0] & 1);
 569     s->invisible = 0;
 570     part1_size   = AV_RL24(buf) >> 4;
 571
 572     if (buf_size < 4 - s->profile + part1_size) {
 573         av_log(s->avctx, AV_LOG_ERROR, "Buffer size %d is too small, needed : %d\n", buf_size, 4 - s->profile + part1_size);
 574         return AVERROR_INVALIDDATA;
 575     }
 576
 577     buf      += 4 - s->profile;
 578     buf_size -= 4 - s->profile;
 579
 580     memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab, sizeof(s->put_pixels_tab));
 581
 582     ret = ff_vp56_init_range_decoder(c, buf, part1_size);
 583     if (ret < 0)
 584         return ret;
 585     buf      += part1_size;
 586     buf_size -= part1_size;
 587
 588     /* A. Dimension information (keyframes only) */
 589     if (s->keyframe) {
 590         width  = vp8_rac_get_uint(c, 12);
 591         height = vp8_rac_get_uint(c, 12);
 592         hscale = vp8_rac_get_uint(c, 2);
 593         vscale = vp8_rac_get_uint(c, 2);
 594         if (hscale || vscale)
 595             avpriv_request_sample(s->avctx, "Upscaling");
 596
 597         s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
 598         vp78_reset_probability_tables(s);
 599         memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter,
 600                sizeof(s->prob->pred16x16));
 601         memcpy(s->prob->pred8x8c, vp8_pred8x8c_prob_inter,
 602                sizeof(s->prob->pred8x8c));
 603         for (i = 0; i < 2; i++)
 604             memcpy(s->prob->mvc[i], vp7_mv_default_prob[i],
 605                    sizeof(vp7_mv_default_prob[i]));
 606         memset(&s->segmentation, 0, sizeof(s->segmentation));
 607         memset(&s->lf_delta, 0, sizeof(s->lf_delta));
 608         memcpy(s->prob[0].scan, ff_zigzag_scan, sizeof(s->prob[0].scan));
 609     }
 610
 611     if (s->keyframe || s->profile > 0)
 612         memset(s->inter_dc_pred, 0 , sizeof(s->inter_dc_pred));
 613
 614     /* B. Decoding information for all four macroblock-level features */
 615     for (i = 0; i < 4; i++) {
 616         s->feature_enabled[i] = vp8_rac_get(c);
 617         if (s->feature_enabled[i]) {
 618              s->feature_present_prob[i] = vp8_rac_get_uint(c, 8);
 619
 620              for (j = 0; j < 3; j++)
 621                  s->feature_index_prob[i][j] =
 622                      vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
 623
 624              if (vp7_feature_value_size[s->profile][i])
 625                  for (j = 0; j < 4; j++)
 626                      s->feature_value[i][j] =
 627                         vp8_rac_get(c) ? vp8_rac_get_uint(c, vp7_feature_value_size[s->profile][i]) : 0;
 628         }
 629     }
 630
 631     s->segmentation.enabled    = 0;
 632     s->segmentation.update_map = 0;
 633     s->lf_delta.enabled        = 0;
 634
 635     s->num_coeff_partitions = 1;
 636     ret = ff_vp56_init_range_decoder(&s->coeff_partition[0], buf, buf_size);
 637     if (ret < 0)
 638         return ret;
 639
 640     if (!s->macroblocks_base || /* first frame */
 641         width != s->avctx->width || height != s->avctx->height ||
 642         (width + 15) / 16 != s->mb_width || (height + 15) / 16 != s->mb_height) {
 643         if ((ret = vp7_update_dimensions(s, width, height)) < 0)
 644             return ret;
 645     }
 646
 647     /* C. Dequantization indices */
 648     vp7_get_quants(s);
 649
 650     /* D. Golden frame update flag (a Flag) for interframes only */
 651     if (!s->keyframe) {
 652         s->update_golden = vp8_rac_get(c) ? VP56_FRAME_CURRENT : VP56_FRAME_NONE;
 653         s->sign_bias[VP56_FRAME_GOLDEN] = 0;
 654     }
 655
 656     s->update_last          = 1;
 657     s->update_probabilities = 1;
 658     s->fade_present         = 1;
 659
 660     if (s->profile > 0) {
 661         s->update_probabilities = vp8_rac_get(c);
 662         if (!s->update_probabilities)
 663             s->prob[1] = s->prob[0];
 664
 665         if (!s->keyframe)
 666             s->fade_present = vp8_rac_get(c);
 667     }
 668
 669     if (vpX_rac_is_end(c))
 670         return AVERROR_INVALIDDATA;
 671     /* E. Fading information for previous frame */
 672     if (s->fade_present && vp8_rac_get(c)) {
 673         alpha = (int8_t) vp8_rac_get_uint(c, 8);
 674         beta  = (int8_t) vp8_rac_get_uint(c, 8);
 675     }
 676
 677     /* F. Loop filter type */
 678     if (!s->profile)
 679         s->filter.simple = vp8_rac_get(c);
 680
 681     /* G. DCT coefficient ordering specification */
 682     if (vp8_rac_get(c))
 683         for (i = 1; i < 16; i++)
 684             s->prob[0].scan[i] = ff_zigzag_scan[vp8_rac_get_uint(c, 4)];
 685
 686     /* H. Loop filter levels  */
 687     if (s->profile > 0)
 688         s->filter.simple = vp8_rac_get(c);
 689     s->filter.level     = vp8_rac_get_uint(c, 6);
 690     s->filter.sharpness = vp8_rac_get_uint(c, 3);
 691
 692     /* I. DCT coefficient probability update; 13.3 Token Probability Updates */
 693     vp78_update_probability_tables(s);
 694
 695     s->mbskip_enabled = 0;
 696
 697     /* J. The remaining frame header data occurs ONLY FOR INTERFRAMES */
 698     if (!s->keyframe) {
 699         s->prob->intra  = vp8_rac_get_uint(c, 8);
 700         s->prob->last   = vp8_rac_get_uint(c, 8);
 701         vp78_update_pred16x16_pred8x8_mvc_probabilities(s, VP7_MVC_SIZE);
 702     }
 703
 704     if (vpX_rac_is_end(c))
 705         return AVERROR_INVALIDDATA;
 706
 707     if ((ret = vp7_fade_frame(s, alpha, beta)) < 0)
 708         return ret;
 709
 710     return 0;
 711 }
 712
 713 static int vp8_decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
 714 {
 715     VP56RangeCoder *c = &s->c;
 716     int header_size, hscale, vscale, ret;
 717     int width  = s->avctx->width;
 718     int height = s->avctx->height;
 719
 720     if (buf_size < 3) {
 721         av_log(s->avctx, AV_LOG_ERROR, "Insufficent data (%d) for header\n", buf_size);
 722         return AVERROR_INVALIDDATA;
 723     }
 724
 725     s->keyframe  = !(buf[0] & 1);
 726     s->profile   =  (buf[0]>>1) & 7;
 727     s->invisible = !(buf[0] & 0x10);
 728     header_size  = AV_RL24(buf) >> 5;
 729     buf      += 3;
 730     buf_size -= 3;
 731
 732     s->header_partition_size = header_size;
 733
 734     if (s->profile > 3)
 735         av_log(s->avctx, AV_LOG_WARNING, "Unknown profile %d\n", s->profile);
 736
 737     if (!s->profile)
 738         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab,
 739                sizeof(s->put_pixels_tab));
 740     else    // profile 1-3 use bilinear, 4+ aren't defined so whatever
 741         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_bilinear_pixels_tab,
 742                sizeof(s->put_pixels_tab));
 743
 744     if (header_size > buf_size - 7 * s->keyframe) {
 745         av_log(s->avctx, AV_LOG_ERROR, "Header size larger than data provided\n");
 746         return AVERROR_INVALIDDATA;
 747     }
 748
 749     if (s->keyframe) {
 750         if (AV_RL24(buf) != 0x2a019d) {
 751             av_log(s->avctx, AV_LOG_ERROR,
 752                    "Invalid start code 0x%x\n", AV_RL24(buf));
 753             return AVERROR_INVALIDDATA;
 754         }
 755         width     = AV_RL16(buf + 3) & 0x3fff;
 756         height    = AV_RL16(buf + 5) & 0x3fff;
 757         hscale    = buf[4] >> 6;
 758         vscale    = buf[6] >> 6;
 759         buf      += 7;
 760         buf_size -= 7;
 761
 762         if (hscale || vscale)
 763             avpriv_request_sample(s->avctx, "Upscaling");
 764
 765         s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
 766         vp78_reset_probability_tables(s);
 767         memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter,
 768                sizeof(s->prob->pred16x16));
 769         memcpy(s->prob->pred8x8c, vp8_pred8x8c_prob_inter,
 770                sizeof(s->prob->pred8x8c));
 771         memcpy(s->prob->mvc, vp8_mv_default_prob,
 772                sizeof(s->prob->mvc));
 773         memset(&s->segmentation, 0, sizeof(s->segmentation));
 774         memset(&s->lf_delta, 0, sizeof(s->lf_delta));
 775     }
 776
 777     ret = ff_vp56_init_range_decoder(c, buf, header_size);
 778     if (ret < 0)
 779         return ret;
 780     buf      += header_size;
 781     buf_size -= header_size;
 782
 783     if (s->keyframe) {
 784         s->colorspace = vp8_rac_get(c);
 785         if (s->colorspace)
 786             av_log(s->avctx, AV_LOG_WARNING, "Unspecified colorspace\n");
 787         s->fullrange = vp8_rac_get(c);
 788     }
 789
 790     if ((s->segmentation.enabled = vp8_rac_get(c)))
 791         parse_segment_info(s);
 792     else
 793         s->segmentation.update_map = 0; // FIXME: move this to some init function?
 794
 795     s->filter.simple    = vp8_rac_get(c);
 796     s->filter.level     = vp8_rac_get_uint(c, 6);
 797     s->filter.sharpness = vp8_rac_get_uint(c, 3);
 798
 799     if ((s->lf_delta.enabled = vp8_rac_get(c))) {
 800         s->lf_delta.update = vp8_rac_get(c);
 801         if (s->lf_delta.update)
 802             update_lf_deltas(s);
 803     }
 804
 805     if (setup_partitions(s, buf, buf_size)) {
 806         av_log(s->avctx, AV_LOG_ERROR, "Invalid partitions\n");
 807         return AVERROR_INVALIDDATA;
 808     }
 809
 810     if (!s->macroblocks_base || /* first frame */
 811         width != s->avctx->width || height != s->avctx->height ||
 812         (width+15)/16 != s->mb_width || (height+15)/16 != s->mb_height)
 813         if ((ret = vp8_update_dimensions(s, width, height)) < 0)
 814             return ret;
 815
 816     vp8_get_quants(s);
 817
 818     if (!s->keyframe) {
 819         update_refs(s);
 820         s->sign_bias[VP56_FRAME_GOLDEN]               = vp8_rac_get(c);
 821         s->sign_bias[VP56_FRAME_GOLDEN2 /* altref */] = vp8_rac_get(c);
 822     }
 823
 824     // if we aren't saving this frame's probabilities for future frames,
 825     // make a copy of the current probabilities
 826     if (!(s->update_probabilities = vp8_rac_get(c)))
 827         s->prob[1] = s->prob[0];
 828
 829     s->update_last = s->keyframe || vp8_rac_get(c);
 830
 831     vp78_update_probability_tables(s);
 832
 833     if ((s->mbskip_enabled = vp8_rac_get(c)))
 834         s->prob->mbskip = vp8_rac_get_uint(c, 8);
 835
 836     if (!s->keyframe) {
 837         s->prob->intra  = vp8_rac_get_uint(c, 8);
 838         s->prob->last   = vp8_rac_get_uint(c, 8);
 839         s->prob->golden = vp8_rac_get_uint(c, 8);
 840         vp78_update_pred16x16_pred8x8_mvc_probabilities(s, VP8_MVC_SIZE);
 841     }
 842
 843     // Record the entropy coder state here so that hwaccels can use it.
 844     s->c.code_word = vp56_rac_renorm(&s->c);
 845     s->coder_state_at_header_end.input     = s->c.buffer - (-s->c.bits / 8);
 846     s->coder_state_at_header_end.range     = s->c.high;
 847     s->coder_state_at_header_end.value     = s->c.code_word >> 16;
 848     s->coder_state_at_header_end.bit_count = -s->c.bits % 8;
 849
 850     return 0;
 851 }
 852
 853 static av_always_inline
 854 void clamp_mv(VP8mvbounds *s, VP56mv *dst, const VP56mv *src)
 855 {
 856     dst->x = av_clip(src->x, av_clip(s->mv_min.x, INT16_MIN, INT16_MAX),
 857                              av_clip(s->mv_max.x, INT16_MIN, INT16_MAX));
 858     dst->y = av_clip(src->y, av_clip(s->mv_min.y, INT16_MIN, INT16_MAX),
 859                              av_clip(s->mv_max.y, INT16_MIN, INT16_MAX));
 860 }
 861
 862 /**
 863  * Motion vector coding, 17.1.
 864  */
 865 static av_always_inline int read_mv_component(VP56RangeCoder *c, const uint8_t *p, int vp7)
 866 {
 867     int bit, x = 0;
 868
 869     if (vp56_rac_get_prob_branchy(c, p[0])) {
 870         int i;
 871
 872         for (i = 0; i < 3; i++)
 873             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 874         for (i = (vp7 ? 7 : 9); i > 3; i--)
 875             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 876         if (!(x & (vp7 ? 0xF0 : 0xFFF0)) || vp56_rac_get_prob(c, p[12]))
 877             x += 8;
 878     } else {
 879         // small_mvtree
 880         const uint8_t *ps = p + 2;
 881         bit = vp56_rac_get_prob(c, *ps);
 882         ps += 1 + 3 * bit;
 883         x  += 4 * bit;
 884         bit = vp56_rac_get_prob(c, *ps);
 885         ps += 1 + bit;
 886         x  += 2 * bit;
 887         x  += vp56_rac_get_prob(c, *ps);
 888     }
 889
 890     return (x && vp56_rac_get_prob(c, p[1])) ? -x : x;
 891 }
 892
 893 static int vp7_read_mv_component(VP56RangeCoder *c, const uint8_t *p)
 894 {
 895     return read_mv_component(c, p, 1);
 896 }
 897
 898 static int vp8_read_mv_component(VP56RangeCoder *c, const uint8_t *p)
 899 {
 900     return read_mv_component(c, p, 0);
 901 }
 902
 903 static av_always_inline
 904 const uint8_t *get_submv_prob(uint32_t left, uint32_t top, int is_vp7)
 905 {
 906     if (is_vp7)
 907         return vp7_submv_prob;
 908
 909     if (left == top)
 910         return vp8_submv_prob[4 - !!left];
 911     if (!top)
 912         return vp8_submv_prob[2];
 913     return vp8_submv_prob[1 - !!left];
 914 }
 915
 916 /**
 917  * Split motion vector prediction, 16.4.
 918  * @returns the number of motion vectors parsed (2, 4 or 16)
 919  */
 920 static av_always_inline
 921 int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
 922                     int layout, int is_vp7)
 923 {
 924     int part_idx;
 925     int n, num;
 926     VP8Macroblock *top_mb;
 927     VP8Macroblock *left_mb = &mb[-1];
 928     const uint8_t *mbsplits_left = vp8_mbsplits[left_mb->partitioning];
 929     const uint8_t *mbsplits_top, *mbsplits_cur, *firstidx;
 930     VP56mv *top_mv;
 931     VP56mv *left_mv = left_mb->bmv;
 932     VP56mv *cur_mv  = mb->bmv;
 933
 934     if (!layout) // layout is inlined, s->mb_layout is not
 935         top_mb = &mb[2];
 936     else
 937         top_mb = &mb[-s->mb_width - 1];
 938     mbsplits_top = vp8_mbsplits[top_mb->partitioning];
 939     top_mv       = top_mb->bmv;
 940
 941     if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[0])) {
 942         if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[1]))
 943             part_idx = VP8_SPLITMVMODE_16x8 + vp56_rac_get_prob(c, vp8_mbsplit_prob[2]);
 944         else
 945             part_idx = VP8_SPLITMVMODE_8x8;
 946     } else {
 947         part_idx = VP8_SPLITMVMODE_4x4;
 948     }
 949
 950     num              = vp8_mbsplit_count[part_idx];
 951     mbsplits_cur     = vp8_mbsplits[part_idx],
 952     firstidx         = vp8_mbfirstidx[part_idx];
 953     mb->partitioning = part_idx;
 954
 955     for (n = 0; n < num; n++) {
 956         int k = firstidx[n];
 957         uint32_t left, above;
 958         const uint8_t *submv_prob;
 959
 960         if (!(k & 3))
 961             left = AV_RN32A(&left_mv[mbsplits_left[k + 3]]);
 962         else
 963             left = AV_RN32A(&cur_mv[mbsplits_cur[k - 1]]);
 964         if (k <= 3)
 965             above = AV_RN32A(&top_mv[mbsplits_top[k + 12]]);
 966         else
 967             above = AV_RN32A(&cur_mv[mbsplits_cur[k - 4]]);
 968
 969         submv_prob = get_submv_prob(left, above, is_vp7);
 970
 971         if (vp56_rac_get_prob_branchy(c, submv_prob[0])) {
 972             if (vp56_rac_get_prob_branchy(c, submv_prob[1])) {
 973                 if (vp56_rac_get_prob_branchy(c, submv_prob[2])) {
 974                     mb->bmv[n].y = mb->mv.y +
 975                                    read_mv_component(c, s->prob->mvc[0], is_vp7);
 976                     mb->bmv[n].x = mb->mv.x +
 977                                    read_mv_component(c, s->prob->mvc[1], is_vp7);
 978                 } else {
 979                     AV_ZERO32(&mb->bmv[n]);
 980                 }
 981             } else {
 982                 AV_WN32A(&mb->bmv[n], above);
 983             }
 984         } else {
 985             AV_WN32A(&mb->bmv[n], left);
 986         }
 987     }
 988
 989     return num;
 990 }
 991
 992 /**
 993  * The vp7 reference decoder uses a padding macroblock column (added to right
 994  * edge of the frame) to guard against illegal macroblock offsets. The
 995  * algorithm has bugs that permit offsets to straddle the padding column.
 996  * This function replicates those bugs.
 997  *
 998  * @param[out] edge_x macroblock x address
 999  * @param[out] edge_y macroblock y address
1000  *
1001  * @return macroblock offset legal (boolean)
1002  */
1003 static int vp7_calculate_mb_offset(int mb_x, int mb_y, int mb_width,
1004                                    int xoffset, int yoffset, int boundary,
1005                                    int *edge_x, int *edge_y)
1006 {
1007     int vwidth = mb_width + 1;
1008     int new = (mb_y + yoffset) * vwidth + mb_x + xoffset;
1009     if (new < boundary || new % vwidth == vwidth - 1)
1010         return 0;
1011     *edge_y = new / vwidth;
1012     *edge_x = new % vwidth;
1013     return 1;
1014 }
1015
1016 static const VP56mv *get_bmv_ptr(const VP8Macroblock *mb, int subblock)
1017 {
1018     return &mb->bmv[mb->mode == VP8_MVMODE_SPLIT ? vp8_mbsplits[mb->partitioning][subblock] : 0];
1019 }
1020
1021 static av_always_inline
1022 void vp7_decode_mvs(VP8Context *s, VP8Macroblock *mb,
1023                     int mb_x, int mb_y, int layout)
1024 {
1025     VP8Macroblock *mb_edge[12];
1026     enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR };
1027     enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
1028     int idx = CNT_ZERO;
1029     VP56mv near_mv[3];
1030     uint8_t cnt[3] = { 0 };
1031     VP56RangeCoder *c = &s->c;
1032     int i;
1033
1034     AV_ZERO32(&near_mv[0]);
1035     AV_ZERO32(&near_mv[1]);
1036     AV_ZERO32(&near_mv[2]);
1037
1038     for (i = 0; i < VP7_MV_PRED_COUNT; i++) {
1039         const VP7MVPred * pred = &vp7_mv_pred[i];
1040         int edge_x, edge_y;
1041
1042         if (vp7_calculate_mb_offset(mb_x, mb_y, s->mb_width, pred->xoffset,
1043                                     pred->yoffset, !s->profile, &edge_x, &edge_y)) {
1044             VP8Macroblock *edge = mb_edge[i] = (s->mb_layout == 1)
1045                                              ? s->macroblocks_base + 1 + edge_x +
1046                                                (s->mb_width + 1) * (edge_y + 1)
1047                                              : s->macroblocks + edge_x +
1048                                                (s->mb_height - edge_y - 1) * 2;
1049             uint32_t mv = AV_RN32A(get_bmv_ptr(edge, vp7_mv_pred[i].subblock));
1050             if (mv) {
1051                 if (AV_RN32A(&near_mv[CNT_NEAREST])) {
1052                     if (mv == AV_RN32A(&near_mv[CNT_NEAREST])) {
1053                         idx = CNT_NEAREST;
1054                     } else if (AV_RN32A(&near_mv[CNT_NEAR])) {
1055                         if (mv != AV_RN32A(&near_mv[CNT_NEAR]))
1056                             continue;
1057                         idx = CNT_NEAR;
1058                     } else {
1059                         AV_WN32A(&near_mv[CNT_NEAR], mv);
1060                         idx = CNT_NEAR;
1061                     }
1062                 } else {
1063                     AV_WN32A(&near_mv[CNT_NEAREST], mv);
1064                     idx = CNT_NEAREST;
1065                 }
1066             } else {
1067                 idx = CNT_ZERO;
1068             }
1069         } else {
1070             idx = CNT_ZERO;
1071         }
1072         cnt[idx] += vp7_mv_pred[i].score;
1073     }
1074
1075     mb->partitioning = VP8_SPLITMVMODE_NONE;
1076
1077     if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_ZERO]][0])) {
1078         mb->mode = VP8_MVMODE_MV;
1079
1080         if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAREST]][1])) {
1081
1082             if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAR]][2])) {
1083
1084                 if (cnt[CNT_NEAREST] > cnt[CNT_NEAR])
1085                     AV_WN32A(&mb->mv, cnt[CNT_ZERO] > cnt[CNT_NEAREST] ? 0 : AV_RN32A(&near_mv[CNT_NEAREST]));
1086                 else
1087                     AV_WN32A(&mb->mv, cnt[CNT_ZERO] > cnt[CNT_NEAR]    ? 0 : AV_RN32A(&near_mv[CNT_NEAR]));
1088
1089                 if (vp56_rac_get_prob_branchy(c, vp7_mode_contexts[cnt[CNT_NEAR]][3])) {
1090                     mb->mode = VP8_MVMODE_SPLIT;
1091                     mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout, IS_VP7) - 1];
1092                 } else {
1093                     mb->mv.y += vp7_read_mv_component(c, s->prob->mvc[0]);
1094                     mb->mv.x += vp7_read_mv_component(c, s->prob->mvc[1]);
1095                     mb->bmv[0] = mb->mv;
1096                 }
1097             } else {
1098                 mb->mv = near_mv[CNT_NEAR];
1099                 mb->bmv[0] = mb->mv;
1100             }
1101         } else {
1102             mb->mv = near_mv[CNT_NEAREST];
1103             mb->bmv[0] = mb->mv;
1104         }
1105     } else {
1106         mb->mode = VP8_MVMODE_ZERO;
1107         AV_ZERO32(&mb->mv);
1108         mb->bmv[0] = mb->mv;
1109     }
1110 }
1111
1112 static av_always_inline
1113 void vp8_decode_mvs(VP8Context *s, VP8mvbounds *mv_bounds, VP8Macroblock *mb,
1114                     int mb_x, int mb_y, int layout)
1115 {
1116     VP8Macroblock *mb_edge[3] = { 0      /* top */,
1117                                   mb - 1 /* left */,
1118                                   0      /* top-left */ };
1119     enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV };
1120     enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
1121     int idx = CNT_ZERO;
1122     int cur_sign_bias = s->sign_bias[mb->ref_frame];
1123     int8_t *sign_bias = s->sign_bias;
1124     VP56mv near_mv[4];
1125     uint8_t cnt[4] = { 0 };
1126     VP56RangeCoder *c = &s->c;
1127
1128     if (!layout) { // layout is inlined (s->mb_layout is not)
1129         mb_edge[0] = mb + 2;
1130         mb_edge[2] = mb + 1;
1131     } else {
1132         mb_edge[0] = mb - s->mb_width - 1;
1133         mb_edge[2] = mb - s->mb_width - 2;
1134     }
1135
1136     AV_ZERO32(&near_mv[0]);
1137     AV_ZERO32(&near_mv[1]);
1138     AV_ZERO32(&near_mv[2]);
1139
1140     /* Process MB on top, left and top-left */
1141 #define MV_EDGE_CHECK(n)                                                      \
1142     {                                                                         \
1143         VP8Macroblock *edge = mb_edge[n];                                     \
1144         int edge_ref = edge->ref_frame;                                       \
1145         if (edge_ref != VP56_FRAME_CURRENT) {                                 \
1146             uint32_t mv = AV_RN32A(&edge->mv);                                \
1147             if (mv) {                                                         \
1148                 if (cur_sign_bias != sign_bias[edge_ref]) {                   \
1149                     /* SWAR negate of the values in mv. */                    \
1150                     mv = ~mv;                                                 \
1151                     mv = ((mv & 0x7fff7fff) +                                 \
1152                           0x00010001) ^ (mv & 0x80008000);                    \
1153                 }                                                             \
1154                 if (!n || mv != AV_RN32A(&near_mv[idx]))                      \
1155                     AV_WN32A(&near_mv[++idx], mv);                            \
1156                 cnt[idx] += 1 + (n != 2);                                     \
1157             } else                                                            \
1158                 cnt[CNT_ZERO] += 1 + (n != 2);                                \
1159         }                                                                     \
1160     }
1161
1162     MV_EDGE_CHECK(0)
1163     MV_EDGE_CHECK(1)
1164     MV_EDGE_CHECK(2)
1165
1166     mb->partitioning = VP8_SPLITMVMODE_NONE;
1167     if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_ZERO]][0])) {
1168         mb->mode = VP8_MVMODE_MV;
1169
1170         /* If we have three distinct MVs, merge first and last if they're the same */
1171         if (cnt[CNT_SPLITMV] &&
1172             AV_RN32A(&near_mv[1 + VP8_EDGE_TOP]) == AV_RN32A(&near_mv[1 + VP8_EDGE_TOPLEFT]))
1173             cnt[CNT_NEAREST] += 1;
1174
1175         /* Swap near and nearest if necessary */
1176         if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
1177             FFSWAP(uint8_t,     cnt[CNT_NEAREST],     cnt[CNT_NEAR]);
1178             FFSWAP( VP56mv, near_mv[CNT_NEAREST], near_mv[CNT_NEAR]);
1179         }
1180
1181         if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) {
1182             if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAR]][2])) {
1183                 /* Choose the best mv out of 0,0 and the nearest mv */
1184                 clamp_mv(mv_bounds, &mb->mv, &near_mv[CNT_ZERO + (cnt[CNT_NEAREST] >= cnt[CNT_ZERO])]);
1185                 cnt[CNT_SPLITMV] = ((mb_edge[VP8_EDGE_LEFT]->mode    == VP8_MVMODE_SPLIT) +
1186                                     (mb_edge[VP8_EDGE_TOP]->mode     == VP8_MVMODE_SPLIT)) * 2 +
1187                                     (mb_edge[VP8_EDGE_TOPLEFT]->mode == VP8_MVMODE_SPLIT);
1188
1189                 if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_SPLITMV]][3])) {
1190                     mb->mode = VP8_MVMODE_SPLIT;
1191                     mb->mv = mb->bmv[decode_splitmvs(s, c, mb, layout, IS_VP8) - 1];
1192                 } else {
1193                     mb->mv.y  += vp8_read_mv_component(c, s->prob->mvc[0]);
1194                     mb->mv.x  += vp8_read_mv_component(c, s->prob->mvc[1]);
1195                     mb->bmv[0] = mb->mv;
1196                 }
1197             } else {
1198                 clamp_mv(mv_bounds, &mb->mv, &near_mv[CNT_NEAR]);
1199                 mb->bmv[0] = mb->mv;
1200             }
1201         } else {
1202             clamp_mv(mv_bounds, &mb->mv, &near_mv[CNT_NEAREST]);
1203             mb->bmv[0] = mb->mv;
1204         }
1205     } else {
1206         mb->mode = VP8_MVMODE_ZERO;
1207         AV_ZERO32(&mb->mv);
1208         mb->bmv[0] = mb->mv;
1209     }
1210 }
1211
1212 static av_always_inline
1213 void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
1214                            int mb_x, int keyframe, int layout)
1215 {
1216     uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
1217
1218     if (layout) {
1219         VP8Macroblock *mb_top = mb - s->mb_width - 1;
1220         memcpy(mb->intra4x4_pred_mode_top, mb_top->intra4x4_pred_mode_top, 4);
1221     }
1222     if (keyframe) {
1223         int x, y;
1224         uint8_t *top;
1225         uint8_t *const left = s->intra4x4_pred_mode_left;
1226         if (layout)
1227             top = mb->intra4x4_pred_mode_top;
1228         else
1229             top = s->intra4x4_pred_mode_top + 4 * mb_x;
1230         for (y = 0; y < 4; y++) {
1231             for (x = 0; x < 4; x++) {
1232                 const uint8_t *ctx;
1233                 ctx       = vp8_pred4x4_prob_intra[top[x]][left[y]];
1234                 *intra4x4 = vp8_rac_get_tree(c, vp8_pred4x4_tree, ctx);
1235                 left[y]   = top[x] = *intra4x4;
1236                 intra4x4++;
1237             }
1238         }
1239     } else {
1240         int i;
1241         for (i = 0; i < 16; i++)
1242             intra4x4[i] = vp8_rac_get_tree(c, vp8_pred4x4_tree,
1243                                            vp8_pred4x4_prob_inter);
1244     }
1245 }
1246
1247 static av_always_inline
1248 void decode_mb_mode(VP8Context *s, VP8mvbounds *mv_bounds,
1249                     VP8Macroblock *mb, int mb_x, int mb_y,
1250                     uint8_t *segment, uint8_t *ref, int layout, int is_vp7)
1251 {
1252     VP56RangeCoder *c = &s->c;
1253     static const char * const vp7_feature_name[] = { "q-index",
1254                                                      "lf-delta",
1255                                                      "partial-golden-update",
1256                                                      "blit-pitch" };
1257     if (is_vp7) {
1258         int i;
1259         *segment = 0;
1260         for (i = 0; i < 4; i++) {
1261             if (s->feature_enabled[i]) {
1262                 if (vp56_rac_get_prob_branchy(c, s->feature_present_prob[i])) {
1263                       int index = vp8_rac_get_tree(c, vp7_feature_index_tree,
1264                                                    s->feature_index_prob[i]);
1265                       av_log(s->avctx, AV_LOG_WARNING,
1266                              "Feature %s present in macroblock (value 0x%x)\n",
1267                              vp7_feature_name[i], s->feature_value[i][index]);
1268                 }
1269            }
1270         }
1271     } else if (s->segmentation.update_map) {
1272         int bit  = vp56_rac_get_prob(c, s->prob->segmentid[0]);
1273         *segment = vp56_rac_get_prob(c, s->prob->segmentid[1+bit]) + 2*bit;
1274     } else if (s->segmentation.enabled)
1275         *segment = ref ? *ref : *segment;
1276     mb->segment = *segment;
1277
1278     mb->skip = s->mbskip_enabled ? vp56_rac_get_prob(c, s->prob->mbskip) : 0;
1279
1280     if (s->keyframe) {
1281         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra,
1282                                     vp8_pred16x16_prob_intra);
1283
1284         if (mb->mode == MODE_I4x4) {
1285             decode_intra4x4_modes(s, c, mb, mb_x, 1, layout);
1286         } else {
1287             const uint32_t modes = (is_vp7 ? vp7_pred4x4_mode
1288                                            : vp8_pred4x4_mode)[mb->mode] * 0x01010101u;
1289             if (s->mb_layout)
1290                 AV_WN32A(mb->intra4x4_pred_mode_top, modes);
1291             else
1292                 AV_WN32A(s->intra4x4_pred_mode_top + 4 * mb_x, modes);
1293             AV_WN32A(s->intra4x4_pred_mode_left, modes);
1294         }
1295
1296         mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree,
1297                                                 vp8_pred8x8c_prob_intra);
1298         mb->ref_frame        = VP56_FRAME_CURRENT;
1299     } else if (vp56_rac_get_prob_branchy(c, s->prob->intra)) {
1300         // inter MB, 16.2
1301         if (vp56_rac_get_prob_branchy(c, s->prob->last))
1302             mb->ref_frame =
1303                 (!is_vp7 && vp56_rac_get_prob(c, s->prob->golden)) ? VP56_FRAME_GOLDEN2 /* altref */
1304                                                                    : VP56_FRAME_GOLDEN;
1305         else
1306             mb->ref_frame = VP56_FRAME_PREVIOUS;
1307         s->ref_count[mb->ref_frame - 1]++;
1308
1309         // motion vectors, 16.3
1310         if (is_vp7)
1311             vp7_decode_mvs(s, mb, mb_x, mb_y, layout);
1312         else
1313             vp8_decode_mvs(s, mv_bounds, mb, mb_x, mb_y, layout);
1314     } else {
1315         // intra MB, 16.1
1316         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, s->prob->pred16x16);
1317
1318         if (mb->mode == MODE_I4x4)
1319             decode_intra4x4_modes(s, c, mb, mb_x, 0, layout);
1320
1321         mb->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree,
1322                                                 s->prob->pred8x8c);
1323         mb->ref_frame        = VP56_FRAME_CURRENT;
1324         mb->partitioning     = VP8_SPLITMVMODE_NONE;
1325         AV_ZERO32(&mb->bmv[0]);
1326     }
1327 }
1328
1329 /**
1330  * @param r     arithmetic bitstream reader context
1331  * @param block destination for block coefficients
1332  * @param probs probabilities to use when reading trees from the bitstream
1333  * @param i     initial coeff index, 0 unless a separate DC block is coded
1334  * @param qmul  array holding the dc/ac dequant factor at position 0/1
1335  *
1336  * @return 0 if no coeffs were decoded
1337  *         otherwise, the index of the last coeff decoded plus one
1338  */
1339 static av_always_inline
1340 int decode_block_coeffs_internal(VP56RangeCoder *r, int16_t block[16],
1341                                  uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1342                                  int i, uint8_t *token_prob, int16_t qmul[2],
1343                                  const uint8_t scan[16], int vp7)
1344 {
1345     VP56RangeCoder c = *r;
1346     goto skip_eob;
1347     do {
1348         int coeff;
1349 restart:
1350         if (!vp56_rac_get_prob_branchy(&c, token_prob[0]))   // DCT_EOB
1351             break;
1352
1353 skip_eob:
1354         if (!vp56_rac_get_prob_branchy(&c, token_prob[1])) { // DCT_0
1355             if (++i == 16)
1356                 break; // invalid input; blocks should end with EOB
1357             token_prob = probs[i][0];
1358             if (vp7)
1359                 goto restart;
1360             goto skip_eob;
1361         }
1362
1363         if (!vp56_rac_get_prob_branchy(&c, token_prob[2])) { // DCT_1
1364             coeff = 1;
1365             token_prob = probs[i + 1][1];
1366         } else {
1367             if (!vp56_rac_get_prob_branchy(&c, token_prob[3])) { // DCT 2,3,4
1368                 coeff = vp56_rac_get_prob_branchy(&c, token_prob[4]);
1369                 if (coeff)
1370                     coeff += vp56_rac_get_prob(&c, token_prob[5]);
1371                 coeff += 2;
1372             } else {
1373                 // DCT_CAT*
1374                 if (!vp56_rac_get_prob_branchy(&c, token_prob[6])) {
1375                     if (!vp56_rac_get_prob_branchy(&c, token_prob[7])) { // DCT_CAT1
1376                         coeff = 5 + vp56_rac_get_prob(&c, vp8_dct_cat1_prob[0]);
1377                     } else {                                    // DCT_CAT2
1378                         coeff  = 7;
1379                         coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[0]) << 1;
1380                         coeff += vp56_rac_get_prob(&c, vp8_dct_cat2_prob[1]);
1381                     }
1382                 } else {    // DCT_CAT3 and up
1383                     int a   = vp56_rac_get_prob(&c, token_prob[8]);
1384                     int b   = vp56_rac_get_prob(&c, token_prob[9 + a]);
1385                     int cat = (a << 1) + b;
1386                     coeff  = 3 + (8 << cat);
1387                     coeff += vp8_rac_get_coeff(&c, ff_vp8_dct_cat_prob[cat]);
1388                 }
1389             }
1390             token_prob = probs[i + 1][2];
1391         }
1392         block[scan[i]] = (vp8_rac_get(&c) ? -coeff : coeff) * qmul[!!i];
1393     } while (++i < 16);
1394
1395     *r = c;
1396     return i;
1397 }
1398
/**
 * Apply and update the inter-frame DC predictor.
 *
 * @param block coefficient block; block[0] is read as the coded DC and
 *              replaced by the resulting DC
 * @param pred  predictor state: pred[0] is the last DC value and pred[1] a
 *              counter of consecutive identical values. pred[0] is added to
 *              the coded DC only once the counter exceeds 3.
 *
 * @return 1 if the prediction was added to the DC, 0 otherwise
 */
static av_always_inline
int inter_predict_dc(int16_t block[16], int16_t pred[2])
{
    const int predicted = pred[1] > 3;
    int16_t dc = block[0];

    if (predicted)
        dc += pred[0];

    /* the run is reset when either value is zero or the sign changed,
     * and extended when the value repeats */
    if (!pred[0] || !dc || (pred[0] < 0) != (dc < 0))
        pred[1] = 0;
    else if (pred[0] == dc)
        pred[1]++;

    pred[0]  = dc;
    block[0] = dc;

    return predicted;
}
1421
1422 static int vp7_decode_block_coeffs_internal(VP56RangeCoder *r,
1423                                             int16_t block[16],
1424                                             uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1425                                             int i, uint8_t *token_prob,
1426                                             int16_t qmul[2],
1427                                             const uint8_t scan[16])
1428 {
1429     return decode_block_coeffs_internal(r, block, probs, i,
1430                                         token_prob, qmul, scan, IS_VP7);
1431 }
1432
1433 #ifndef vp8_decode_block_coeffs_internal
1434 static int vp8_decode_block_coeffs_internal(VP56RangeCoder *r,
1435                                             int16_t block[16],
1436                                             uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1437                                             int i, uint8_t *token_prob,
1438                                             int16_t qmul[2])
1439 {
1440     return decode_block_coeffs_internal(r, block, probs, i,
1441                                         token_prob, qmul, ff_zigzag_scan, IS_VP8);
1442 }
1443 #endif
1444
1445 /**
1446  * @param c          arithmetic bitstream reader context
1447  * @param block      destination for block coefficients
1448  * @param probs      probabilities to use when reading trees from the bitstream
1449  * @param i          initial coeff index, 0 unless a separate DC block is coded
1450  * @param zero_nhood the initial prediction context for number of surrounding
1451  *                   all-zero blocks (only left/top, so 0-2)
1452  * @param qmul       array holding the dc/ac dequant factor at position 0/1
1453  * @param scan       scan pattern (VP7 only)
1454  *
1455  * @return 0 if no coeffs were decoded
1456  *         otherwise, the index of the last coeff decoded plus one
1457  */
1458 static av_always_inline
1459 int decode_block_coeffs(VP56RangeCoder *c, int16_t block[16],
1460                         uint8_t probs[16][3][NUM_DCT_TOKENS - 1],
1461                         int i, int zero_nhood, int16_t qmul[2],
1462                         const uint8_t scan[16], int vp7)
1463 {
1464     uint8_t *token_prob = probs[i][zero_nhood];
1465     if (!vp56_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
1466         return 0;
1467     return vp7 ? vp7_decode_block_coeffs_internal(c, block, probs, i,
1468                                                   token_prob, qmul, scan)
1469                : vp8_decode_block_coeffs_internal(c, block, probs, i,
1470                                                   token_prob, qmul);
1471 }
1472
1473 static av_always_inline
1474 void decode_mb_coeffs(VP8Context *s, VP8ThreadData *td, VP56RangeCoder *c,
1475                       VP8Macroblock *mb, uint8_t t_nnz[9], uint8_t l_nnz[9],
1476                       int is_vp7)
1477 {
1478     int i, x, y, luma_start = 0, luma_ctx = 3;
1479     int nnz_pred, nnz, nnz_total = 0;
1480     int segment = mb->segment;
1481     int block_dc = 0;
1482
1483     if (mb->mode != MODE_I4x4 && (is_vp7 || mb->mode != VP8_MVMODE_SPLIT)) {
1484         nnz_pred = t_nnz[8] + l_nnz[8];
1485
1486         // decode DC values and do hadamard
1487         nnz = decode_block_coeffs(c, td->block_dc, s->prob->token[1], 0,
1488                                   nnz_pred, s->qmat[segment].luma_dc_qmul,
1489                                   ff_zigzag_scan, is_vp7);
1490         l_nnz[8] = t_nnz[8] = !!nnz;
1491
1492         if (is_vp7 && mb->mode > MODE_I4x4) {
1493             nnz |=  inter_predict_dc(td->block_dc,
1494                                      s->inter_dc_pred[mb->ref_frame - 1]);
1495         }
1496
1497         if (nnz) {
1498             nnz_total += nnz;
1499             block_dc   = 1;
1500             if (nnz == 1)
1501                 s->vp8dsp.vp8_luma_dc_wht_dc(td->block, td->block_dc);
1502             else
1503                 s->vp8dsp.vp8_luma_dc_wht(td->block, td->block_dc);
1504         }
1505         luma_start = 1;
1506         luma_ctx   = 0;
1507     }
1508
1509     // luma blocks
1510     for (y = 0; y < 4; y++)
1511         for (x = 0; x < 4; x++) {
1512             nnz_pred = l_nnz[y] + t_nnz[x];
1513             nnz = decode_block_coeffs(c, td->block[y][x],
1514                                       s->prob->token[luma_ctx],
1515                                       luma_start, nnz_pred,
1516                                       s->qmat[segment].luma_qmul,
1517                                       s->prob[0].scan, is_vp7);
1518             /* nnz+block_dc may be one more than the actual last index,
1519              * but we don't care */
1520             td->non_zero_count_cache[y][x] = nnz + block_dc;
1521             t_nnz[x] = l_nnz[y] = !!nnz;
1522             nnz_total += nnz;
1523         }
1524
1525     // chroma blocks
1526     // TODO: what to do about dimensions? 2nd dim for luma is x,
1527     // but for chroma it's (y<<1)|x
1528     for (i = 4; i < 6; i++)
1529         for (y = 0; y < 2; y++)
1530             for (x = 0; x < 2; x++) {
1531                 nnz_pred = l_nnz[i + 2 * y] + t_nnz[i + 2 * x];
1532                 nnz = decode_block_coeffs(c, td->block[i][(y << 1) + x],
1533                                           s->prob->token[2], 0, nnz_pred,
1534                                           s->qmat[segment].chroma_qmul,
1535                                           s->prob[0].scan, is_vp7);
1536                 td->non_zero_count_cache[i][(y << 1) + x] = nnz;
1537                 t_nnz[i + 2 * x] = l_nnz[i + 2 * y] = !!nnz;
1538                 nnz_total += nnz;
1539             }
1540
1541     // if there were no coded coeffs despite the macroblock not being marked skip,
1542     // we MUST not do the inner loop filter and should not do IDCT
1543     // Since skip isn't used for bitstream prediction, just manually set it.
1544     if (!nnz_total)
1545         mb->skip = 1;
1546 }
1547
1548 static av_always_inline
1549 void backup_mb_border(uint8_t *top_border, uint8_t *src_y,
1550                       uint8_t *src_cb, uint8_t *src_cr,
1551                       ptrdiff_t linesize, ptrdiff_t uvlinesize, int simple)
1552 {
1553     AV_COPY128(top_border, src_y + 15 * linesize);
1554     if (!simple) {
1555         AV_COPY64(top_border + 16, src_cb + 7 * uvlinesize);
1556         AV_COPY64(top_border + 24, src_cr + 7 * uvlinesize);
1557     }
1558 }
1559
1560 static av_always_inline
1561 void xchg_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb,
1562                     uint8_t *src_cr, ptrdiff_t linesize, ptrdiff_t uvlinesize, int mb_x,
1563                     int mb_y, int mb_width, int simple, int xchg)
1564 {
1565     uint8_t *top_border_m1 = top_border - 32;     // for TL prediction
1566     src_y  -= linesize;
1567     src_cb -= uvlinesize;
1568     src_cr -= uvlinesize;
1569
1570 #define XCHG(a, b, xchg)                                                      \
1571     do {                                                                      \
1572         if (xchg)                                                             \
1573             AV_SWAP64(b, a);                                                  \
1574         else                                                                  \
1575             AV_COPY64(b, a);                                                  \
1576     } while (0)
1577
1578     XCHG(top_border_m1 + 8, src_y - 8, xchg);
1579     XCHG(top_border, src_y, xchg);
1580     XCHG(top_border + 8, src_y + 8, 1);
1581     if (mb_x < mb_width - 1)
1582         XCHG(top_border + 32, src_y + 16, 1);
1583
1584     // only copy chroma for normal loop filter
1585     // or to initialize the top row to 127
1586     if (!simple || !mb_y) {
1587         XCHG(top_border_m1 + 16, src_cb - 8, xchg);
1588         XCHG(top_border_m1 + 24, src_cr - 8, xchg);
1589         XCHG(top_border + 16, src_cb, 1);
1590         XCHG(top_border + 24, src_cr, 1);
1591     }
1592 }
1593
1594 static av_always_inline
1595 int check_dc_pred8x8_mode(int mode, int mb_x, int mb_y)
1596 {
1597     if (!mb_x)
1598         return mb_y ? TOP_DC_PRED8x8 : DC_128_PRED8x8;
1599     else
1600         return mb_y ? mode : LEFT_DC_PRED8x8;
1601 }
1602
1603 static av_always_inline
1604 int check_tm_pred8x8_mode(int mode, int mb_x, int mb_y, int vp7)
1605 {
1606     if (!mb_x)
1607         return mb_y ? VERT_PRED8x8 : (vp7 ? DC_128_PRED8x8 : DC_129_PRED8x8);
1608     else
1609         return mb_y ? mode : HOR_PRED8x8;
1610 }
1611
1612 static av_always_inline
1613 int check_intra_pred8x8_mode_emuedge(int mode, int mb_x, int mb_y, int vp7)
1614 {
1615     switch (mode) {
1616     case DC_PRED8x8:
1617         return check_dc_pred8x8_mode(mode, mb_x, mb_y);
1618     case VERT_PRED8x8:
1619         return !mb_y ? (vp7 ? DC_128_PRED8x8 : DC_127_PRED8x8) : mode;
1620     case HOR_PRED8x8:
1621         return !mb_x ? (vp7 ? DC_128_PRED8x8 : DC_129_PRED8x8) : mode;
1622     case PLANE_PRED8x8: /* TM */
1623         return check_tm_pred8x8_mode(mode, mb_x, mb_y, vp7);
1624     }
1625     return mode;
1626 }
1627
1628 static av_always_inline
1629 int check_tm_pred4x4_mode(int mode, int mb_x, int mb_y, int vp7)
1630 {
1631     if (!mb_x) {
1632         return mb_y ? VERT_VP8_PRED : (vp7 ? DC_128_PRED : DC_129_PRED);
1633     } else {
1634         return mb_y ? mode : HOR_VP8_PRED;
1635     }
1636 }
1637
1638 static av_always_inline
1639 int check_intra_pred4x4_mode_emuedge(int mode, int mb_x, int mb_y,
1640                                      int *copy_buf, int vp7)
1641 {
1642     switch (mode) {
1643     case VERT_PRED:
1644         if (!mb_x && mb_y) {
1645             *copy_buf = 1;
1646             return mode;
1647         }
1648         /* fall-through */
1649     case DIAG_DOWN_LEFT_PRED:
1650     case VERT_LEFT_PRED:
1651         return !mb_y ? (vp7 ? DC_128_PRED : DC_127_PRED) : mode;
1652     case HOR_PRED:
1653         if (!mb_y) {
1654             *copy_buf = 1;
1655             return mode;
1656         }
1657         /* fall-through */
1658     case HOR_UP_PRED:
1659         return !mb_x ? (vp7 ? DC_128_PRED : DC_129_PRED) : mode;
1660     case TM_VP8_PRED:
1661         return check_tm_pred4x4_mode(mode, mb_x, mb_y, vp7);
1662     case DC_PRED: /* 4x4 DC doesn't use the same "H.264-style" exceptions
1663                    * as 16x16/8x8 DC */
1664     case DIAG_DOWN_RIGHT_PRED:
1665     case VERT_RIGHT_PRED:
1666     case HOR_DOWN_PRED:
1667         if (!mb_y || !mb_x)
1668             *copy_buf = 1;
1669         return mode;
1670     }
1671     return mode;
1672 }
1673
1674 static av_always_inline
1675 void intra_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1676                    VP8Macroblock *mb, int mb_x, int mb_y, int is_vp7)
1677 {
1678     int x, y, mode, nnz;
1679     uint32_t tr;
1680
1681     /* for the first row, we need to run xchg_mb_border to init the top edge
1682      * to 127 otherwise, skip it if we aren't going to deblock */
1683     if (mb_y && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
1684         xchg_mb_border(s->top_border[mb_x + 1], dst[0], dst[1], dst[2],
1685                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1686                        s->filter.simple, 1);
1687
1688     if (mb->mode < MODE_I4x4) {
1689         mode = check_intra_pred8x8_mode_emuedge(mb->mode, mb_x, mb_y, is_vp7);
1690         s->hpc.pred16x16[mode](dst[0], s->linesize);
1691     } else {
1692         uint8_t *ptr = dst[0];
1693         uint8_t *intra4x4 = mb->intra4x4_pred_mode_mb;
1694         const uint8_t lo = is_vp7 ? 128 : 127;
1695         const uint8_t hi = is_vp7 ? 128 : 129;
1696         uint8_t tr_top[4] = { lo, lo, lo, lo };
1697
1698         // all blocks on the right edge of the macroblock use bottom edge
1699         // the top macroblock for their topright edge
1700         uint8_t *tr_right = ptr - s->linesize + 16;
1701
1702         // if we're on the right edge of the frame, said edge is extended
1703         // from the top macroblock
1704         if (mb_y && mb_x == s->mb_width - 1) {
1705             tr       = tr_right[-1] * 0x01010101u;
1706             tr_right = (uint8_t *) &tr;
1707         }
1708
1709         if (mb->skip)
1710             AV_ZERO128(td->non_zero_count_cache);
1711
1712         for (y = 0; y < 4; y++) {
1713             uint8_t *topright = ptr + 4 - s->linesize;
1714             for (x = 0; x < 4; x++) {
1715                 int copy = 0;
1716                 ptrdiff_t linesize = s->linesize;
1717                 uint8_t *dst = ptr + 4 * x;
1718                 LOCAL_ALIGNED(4, uint8_t, copy_dst, [5 * 8]);
1719
1720                 if ((y == 0 || x == 3) && mb_y == 0) {
1721                     topright = tr_top;
1722                 } else if (x == 3)
1723                     topright = tr_right;
1724
1725                 mode = check_intra_pred4x4_mode_emuedge(intra4x4[x], mb_x + x,
1726                                                         mb_y + y, &copy, is_vp7);
1727                 if (copy) {
1728                     dst      = copy_dst + 12;
1729                     linesize = 8;
1730                     if (!(mb_y + y)) {
1731                         copy_dst[3] = lo;
1732                         AV_WN32A(copy_dst + 4, lo * 0x01010101U);
1733                     } else {
1734                         AV_COPY32(copy_dst + 4, ptr + 4 * x - s->linesize);
1735                         if (!(mb_x + x)) {
1736                             copy_dst[3] = hi;
1737                         } else {
1738                             copy_dst[3] = ptr[4 * x - s->linesize - 1];
1739                         }
1740                     }
1741                     if (!(mb_x + x)) {
1742                         copy_dst[11] =
1743                         copy_dst[19] =
1744                         copy_dst[27] =
1745                         copy_dst[35] = hi;
1746                     } else {
1747                         copy_dst[11] = ptr[4 * x                   - 1];
1748                         copy_dst[19] = ptr[4 * x + s->linesize     - 1];
1749                         copy_dst[27] = ptr[4 * x + s->linesize * 2 - 1];
1750                         copy_dst[35] = ptr[4 * x + s->linesize * 3 - 1];
1751                     }
1752                 }
1753                 s->hpc.pred4x4[mode](dst, topright, linesize);
1754                 if (copy) {
1755                     AV_COPY32(ptr + 4 * x,                   copy_dst + 12);
1756                     AV_COPY32(ptr + 4 * x + s->linesize,     copy_dst + 20);
1757                     AV_COPY32(ptr + 4 * x + s->linesize * 2, copy_dst + 28);
1758                     AV_COPY32(ptr + 4 * x + s->linesize * 3, copy_dst + 36);
1759                 }
1760
1761                 nnz = td->non_zero_count_cache[y][x];
1762                 if (nnz) {
1763                     if (nnz == 1)
1764                         s->vp8dsp.vp8_idct_dc_add(ptr + 4 * x,
1765                                                   td->block[y][x], s->linesize);
1766                     else
1767                         s->vp8dsp.vp8_idct_add(ptr + 4 * x,
1768                                                td->block[y][x], s->linesize);
1769                 }
1770                 topright += 4;
1771             }
1772
1773             ptr      += 4 * s->linesize;
1774             intra4x4 += 4;
1775         }
1776     }
1777
1778     mode = check_intra_pred8x8_mode_emuedge(mb->chroma_pred_mode,
1779                                             mb_x, mb_y, is_vp7);
1780     s->hpc.pred8x8[mode](dst[1], s->uvlinesize);
1781     s->hpc.pred8x8[mode](dst[2], s->uvlinesize);
1782
1783     if (mb_y && (s->deblock_filter || !mb_y) && td->thread_nr == 0)
1784         xchg_mb_border(s->top_border[mb_x + 1], dst[0], dst[1], dst[2],
1785                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1786                        s->filter.simple, 0);
1787 }
1788
/* Lookup rows indexed by the 3-bit subpel position of a motion vector:
 *   [0] nr. of left extra pixels, also function pointer index
 *   [1] nr. of extra pixels required (row [0] plus row [2])
 *   [2] nr. of right extra pixels */
static const uint8_t subpel_idx[3][8] = {
    { 0, 1, 2, 1, 2, 1, 2, 1 },
    { 0, 3, 5, 3, 5, 3, 5, 3 },
    { 0, 2, 3, 2, 3, 2, 3, 2 },
};
1795
1796 /**
1797  * luma MC function
1798  *
1799  * @param s        VP8 decoding context
1800  * @param dst      target buffer for block data at block position
1801  * @param ref      reference picture buffer at origin (0, 0)
1802  * @param mv       motion vector (relative to block position) to get pixel data from
1803  * @param x_off    horizontal position of block from origin (0, 0)
1804  * @param y_off    vertical position of block from origin (0, 0)
1805  * @param block_w  width of block (16, 8 or 4)
1806  * @param block_h  height of block (always same as block_w)
1807  * @param width    width of src/dst plane data
1808  * @param height   height of src/dst plane data
1809  * @param linesize size of a single line of plane data, including padding
1810  * @param mc_func  motion compensation function pointers (bilinear or sixtap MC)
1811  */
1812 static av_always_inline
1813 void vp8_mc_luma(VP8Context *s, VP8ThreadData *td, uint8_t *dst,
1814                  ThreadFrame *ref, const VP56mv *mv,
1815                  int x_off, int y_off, int block_w, int block_h,
1816                  int width, int height, ptrdiff_t linesize,
1817                  vp8_mc_func mc_func[3][3])
1818 {
1819     uint8_t *src = ref->f->data[0];
1820
1821     if (AV_RN32A(mv)) {
1822         ptrdiff_t src_linesize = linesize;
1823
1824         int mx = (mv->x * 2) & 7, mx_idx = subpel_idx[0][mx];
1825         int my = (mv->y * 2) & 7, my_idx = subpel_idx[0][my];
1826
1827         x_off += mv->x >> 2;
1828         y_off += mv->y >> 2;
1829
1830         // edge emulation
1831         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 4, 0);
1832         src += y_off * linesize + x_off;
1833         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1834             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1835             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1836                                      src - my_idx * linesize - mx_idx,
1837                                      EDGE_EMU_LINESIZE, linesize,
1838                                      block_w + subpel_idx[1][mx],
1839                                      block_h + subpel_idx[1][my],
1840                                      x_off - mx_idx, y_off - my_idx,
1841                                      width, height);
1842             src = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1843             src_linesize = EDGE_EMU_LINESIZE;
1844         }
1845         mc_func[my_idx][mx_idx](dst, linesize, src, src_linesize, block_h, mx, my);
1846     } else {
1847         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 4, 0);
1848         mc_func[0][0](dst, linesize, src + y_off * linesize + x_off,
1849                       linesize, block_h, 0, 0);
1850     }
1851 }
1852
1853 /**
1854  * chroma MC function
1855  *
1856  * @param s        VP8 decoding context
1857  * @param dst1     target buffer for block data at block position (U plane)
1858  * @param dst2     target buffer for block data at block position (V plane)
1859  * @param ref      reference picture buffer at origin (0, 0)
1860  * @param mv       motion vector (relative to block position) to get pixel data from
1861  * @param x_off    horizontal position of block from origin (0, 0)
1862  * @param y_off    vertical position of block from origin (0, 0)
1863  * @param block_w  width of block (16, 8 or 4)
1864  * @param block_h  height of block (always same as block_w)
1865  * @param width    width of src/dst plane data
1866  * @param height   height of src/dst plane data
1867  * @param linesize size of a single line of plane data, including padding
1868  * @param mc_func  motion compensation function pointers (bilinear or sixtap MC)
1869  */
1870 static av_always_inline
1871 void vp8_mc_chroma(VP8Context *s, VP8ThreadData *td, uint8_t *dst1,
1872                    uint8_t *dst2, ThreadFrame *ref, const VP56mv *mv,
1873                    int x_off, int y_off, int block_w, int block_h,
1874                    int width, int height, ptrdiff_t linesize,
1875                    vp8_mc_func mc_func[3][3])
1876 {
1877     uint8_t *src1 = ref->f->data[1], *src2 = ref->f->data[2];
1878
1879     if (AV_RN32A(mv)) {
1880         int mx = mv->x & 7, mx_idx = subpel_idx[0][mx];
1881         int my = mv->y & 7, my_idx = subpel_idx[0][my];
1882
1883         x_off += mv->x >> 3;
1884         y_off += mv->y >> 3;
1885
1886         // edge emulation
1887         src1 += y_off * linesize + x_off;
1888         src2 += y_off * linesize + x_off;
1889         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 3, 0);
1890         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1891             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1892             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1893                                      src1 - my_idx * linesize - mx_idx,
1894                                      EDGE_EMU_LINESIZE, linesize,
1895                                      block_w + subpel_idx[1][mx],
1896                                      block_h + subpel_idx[1][my],
1897                                      x_off - mx_idx, y_off - my_idx, width, height);
1898             src1 = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1899             mc_func[my_idx][mx_idx](dst1, linesize, src1, EDGE_EMU_LINESIZE, block_h, mx, my);
1900
1901             s->vdsp.emulated_edge_mc(td->edge_emu_buffer,
1902                                      src2 - my_idx * linesize - mx_idx,
1903                                      EDGE_EMU_LINESIZE, linesize,
1904                                      block_w + subpel_idx[1][mx],
1905                                      block_h + subpel_idx[1][my],
1906                                      x_off - mx_idx, y_off - my_idx, width, height);
1907             src2 = td->edge_emu_buffer + mx_idx + EDGE_EMU_LINESIZE * my_idx;
1908             mc_func[my_idx][mx_idx](dst2, linesize, src2, EDGE_EMU_LINESIZE, block_h, mx, my);
1909         } else {
1910             mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
1911             mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
1912         }
1913     } else {
1914         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 3, 0);
1915         mc_func[0][0](dst1, linesize, src1 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1916         mc_func[0][0](dst2, linesize, src2 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1917     }
1918 }
1919
1920 static av_always_inline
1921 void vp8_mc_part(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1922                  ThreadFrame *ref_frame, int x_off, int y_off,
1923                  int bx_off, int by_off, int block_w, int block_h,
1924                  int width, int height, VP56mv *mv)
1925 {
1926     VP56mv uvmv = *mv;
1927
1928     /* Y */
1929     vp8_mc_luma(s, td, dst[0] + by_off * s->linesize + bx_off,
1930                 ref_frame, mv, x_off + bx_off, y_off + by_off,
1931                 block_w, block_h, width, height, s->linesize,
1932                 s->put_pixels_tab[block_w == 8]);
1933
1934     /* U/V */
1935     if (s->profile == 3) {
1936         /* this block only applies VP8; it is safe to check
1937          * only the profile, as VP7 profile <= 1 */
1938         uvmv.x &= ~7;
1939         uvmv.y &= ~7;
1940     }
1941     x_off   >>= 1;
1942     y_off   >>= 1;
1943     bx_off  >>= 1;
1944     by_off  >>= 1;
1945     width   >>= 1;
1946     height  >>= 1;
1947     block_w >>= 1;
1948     block_h >>= 1;
1949     vp8_mc_chroma(s, td, dst[1] + by_off * s->uvlinesize + bx_off,
1950                   dst[2] + by_off * s->uvlinesize + bx_off, ref_frame,
1951                   &uvmv, x_off + bx_off, y_off + by_off,
1952                   block_w, block_h, width, height, s->uvlinesize,
1953                   s->put_pixels_tab[1 + (block_w == 4)]);
1954 }
1955
1956 /* Fetch pixels for estimated mv 4 macroblocks ahead.
1957  * Optimized for 64-byte cache lines. Inspired by ffh264 prefetch_motion. */
1958 static av_always_inline
1959 void prefetch_motion(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y,
1960                      int mb_xy, int ref)
1961 {
1962     /* Don't prefetch refs that haven't been used very often this frame. */
1963     if (s->ref_count[ref - 1] > (mb_xy >> 5)) {
1964         int x_off = mb_x << 4, y_off = mb_y << 4;
1965         int mx = (mb->mv.x >> 2) + x_off + 8;
1966         int my = (mb->mv.y >> 2) + y_off;
1967         uint8_t **src = s->framep[ref]->tf.f->data;
1968         int off = mx + (my + (mb_x & 3) * 4) * s->linesize + 64;
1969         /* For threading, a ff_thread_await_progress here might be useful, but
1970          * it actually slows down the decoder. Since a bad prefetch doesn't
1971          * generate bad decoder output, we don't run it here. */
1972         s->vdsp.prefetch(src[0] + off, s->linesize, 4);
1973         off = (mx >> 1) + ((my >> 1) + (mb_x & 7)) * s->uvlinesize + 64;
1974         s->vdsp.prefetch(src[1] + off, src[2] - src[1], 2);
1975     }
1976 }
1977
1978 /**
1979  * Apply motion vectors to prediction buffer, chapter 18.
1980  */
1981 static av_always_inline
1982 void inter_predict(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3],
1983                    VP8Macroblock *mb, int mb_x, int mb_y)
1984 {
1985     int x_off = mb_x << 4, y_off = mb_y << 4;
1986     int width = 16 * s->mb_width, height = 16 * s->mb_height;
1987     ThreadFrame *ref = &s->framep[mb->ref_frame]->tf;
1988     VP56mv *bmv = mb->bmv;
1989
1990     switch (mb->partitioning) {
1991     case VP8_SPLITMVMODE_NONE:
1992         vp8_mc_part(s, td, dst, ref, x_off, y_off,
1993                     0, 0, 16, 16, width, height, &mb->mv);
1994         break;
1995     case VP8_SPLITMVMODE_4x4: {
1996         int x, y;
1997         VP56mv uvmv;
1998
1999         /* Y */
2000         for (y = 0; y < 4; y++) {
2001             for (x = 0; x < 4; x++) {
2002                 vp8_mc_luma(s, td, dst[0] + 4 * y * s->linesize + x * 4,
2003                             ref, &bmv[4 * y + x],
2004                             4 * x + x_off, 4 * y + y_off, 4, 4,
2005                             width, height, s->linesize,
2006                             s->put_pixels_tab[2]);
2007             }
2008         }
2009
2010         /* U/V */
2011         x_off  >>= 1;
2012         y_off  >>= 1;
2013         width  >>= 1;
2014         height >>= 1;
2015         for (y = 0; y < 2; y++) {
2016             for (x = 0; x < 2; x++) {
2017                 uvmv.x = mb->bmv[2 * y       * 4 + 2 * x    ].x +
2018                          mb->bmv[2 * y       * 4 + 2 * x + 1].x +
2019                          mb->bmv[(2 * y + 1) * 4 + 2 * x    ].x +
2020                          mb->bmv[(2 * y + 1) * 4 + 2 * x + 1].x;
2021                 uvmv.y = mb->bmv[2 * y       * 4 + 2 * x    ].y +
2022                          mb->bmv[2 * y       * 4 + 2 * x + 1].y +
2023                          mb->bmv[(2 * y + 1) * 4 + 2 * x    ].y +
2024                          mb->bmv[(2 * y + 1) * 4 + 2 * x + 1].y;
2025                 uvmv.x = (uvmv.x + 2 + FF_SIGNBIT(uvmv.x)) >> 2;
2026                 uvmv.y = (uvmv.y + 2 + FF_SIGNBIT(uvmv.y)) >> 2;
2027                 if (s->profile == 3) {
2028                     uvmv.x &= ~7;
2029                     uvmv.y &= ~7;
2030                 }
2031                 vp8_mc_chroma(s, td, dst[1] + 4 * y * s->uvlinesize + x * 4,
2032                               dst[2] + 4 * y * s->uvlinesize + x * 4, ref,
2033                               &uvmv, 4 * x + x_off, 4 * y + y_off, 4, 4,
2034                               width, height, s->uvlinesize,
2035                               s->put_pixels_tab[2]);
2036             }
2037         }
2038         break;
2039     }
2040     case VP8_SPLITMVMODE_16x8:
2041         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2042                     0, 0, 16, 8, width, height, &bmv[0]);
2043         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2044                     0, 8, 16, 8, width, height, &bmv[1]);
2045         break;
2046     case VP8_SPLITMVMODE_8x16:
2047         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2048                     0, 0, 8, 16, width, height, &bmv[0]);
2049         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2050                     8, 0, 8, 16, width, height, &bmv[1]);
2051         break;
2052     case VP8_SPLITMVMODE_8x8:
2053         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2054                     0, 0, 8, 8, width, height, &bmv[0]);
2055         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2056                     8, 0, 8, 8, width, height, &bmv[1]);
2057         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2058                     0, 8, 8, 8, width, height, &bmv[2]);
2059         vp8_mc_part(s, td, dst, ref, x_off, y_off,
2060                     8, 8, 8, 8, width, height, &bmv[3]);
2061         break;
2062     }
2063 }
2064
2065 static av_always_inline
2066 void idct_mb(VP8Context *s, VP8ThreadData *td, uint8_t *dst[3], VP8Macroblock *mb)
2067 {
2068     int x, y, ch;
2069
2070     if (mb->mode != MODE_I4x4) {
2071         uint8_t *y_dst = dst[0];
2072         for (y = 0; y < 4; y++) {
2073             uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[y]);
2074             if (nnz4) {
2075                 if (nnz4 & ~0x01010101) {
2076                     for (x = 0; x < 4; x++) {
2077                         if ((uint8_t) nnz4 == 1)
2078                             s->vp8dsp.vp8_idct_dc_add(y_dst + 4 * x,
2079                                                       td->block[y][x],
2080                                                       s->linesize);
2081                         else if ((uint8_t) nnz4 > 1)
2082                             s->vp8dsp.vp8_idct_add(y_dst + 4 * x,
2083                                                    td->block[y][x],
2084                                                    s->linesize);
2085                         nnz4 >>= 8;
2086                         if (!nnz4)
2087                             break;
2088                     }
2089                 } else {
2090                     s->vp8dsp.vp8_idct_dc_add4y(y_dst, td->block[y], s->linesize);
2091                 }
2092             }
2093             y_dst += 4 * s->linesize;
2094         }
2095     }
2096
2097     for (ch = 0; ch < 2; ch++) {
2098         uint32_t nnz4 = AV_RL32(td->non_zero_count_cache[4 + ch]);
2099         if (nnz4) {
2100             uint8_t *ch_dst = dst[1 + ch];
2101             if (nnz4 & ~0x01010101) {
2102                 for (y = 0; y < 2; y++) {
2103                     for (x = 0; x < 2; x++) {
2104                         if ((uint8_t) nnz4 == 1)
2105                             s->vp8dsp.vp8_idct_dc_add(ch_dst + 4 * x,
2106                                                       td->block[4 + ch][(y << 1) + x],
2107                                                       s->uvlinesize);
2108                         else if ((uint8_t) nnz4 > 1)
2109                             s->vp8dsp.vp8_idct_add(ch_dst + 4 * x,
2110                                                    td->block[4 + ch][(y << 1) + x],
2111                                                    s->uvlinesize);
2112                         nnz4 >>= 8;
2113                         if (!nnz4)
2114                             goto chroma_idct_end;
2115                     }
2116                     ch_dst += 4 * s->uvlinesize;
2117                 }
2118             } else {
2119                 s->vp8dsp.vp8_idct_dc_add4uv(ch_dst, td->block[4 + ch], s->uvlinesize);
2120             }
2121         }
2122 chroma_idct_end:
2123         ;
2124     }
2125 }
2126
2127 static av_always_inline
2128 void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb,
2129                          VP8FilterStrength *f, int is_vp7)
2130 {
2131     int interior_limit, filter_level;
2132
2133     if (s->segmentation.enabled) {
2134         filter_level = s->segmentation.filter_level[mb->segment];
2135         if (!s->segmentation.absolute_vals)
2136             filter_level += s->filter.level;
2137     } else
2138         filter_level = s->filter.level;
2139
2140     if (s->lf_delta.enabled) {
2141         filter_level += s->lf_delta.ref[mb->ref_frame];
2142         filter_level += s->lf_delta.mode[mb->mode];
2143     }
2144
2145     filter_level = av_clip_uintp2(filter_level, 6);
2146
2147     interior_limit = filter_level;
2148     if (s->filter.sharpness) {
2149         interior_limit >>= (s->filter.sharpness + 3) >> 2;
2150         interior_limit = FFMIN(interior_limit, 9 - s->filter.sharpness);
2151     }
2152     interior_limit = FFMAX(interior_limit, 1);
2153
2154     f->filter_level = filter_level;
2155     f->inner_limit = interior_limit;
2156     f->inner_filter = is_vp7 || !mb->skip || mb->mode == MODE_I4x4 ||
2157                       mb->mode == VP8_MVMODE_SPLIT;
2158 }
2159
2160 static av_always_inline
2161 void filter_mb(VP8Context *s, uint8_t *dst[3], VP8FilterStrength *f,
2162                int mb_x, int mb_y, int is_vp7)
2163 {
2164     int mbedge_lim, bedge_lim_y, bedge_lim_uv, hev_thresh;
2165     int filter_level = f->filter_level;
2166     int inner_limit = f->inner_limit;
2167     int inner_filter = f->inner_filter;
2168     ptrdiff_t linesize   = s->linesize;
2169     ptrdiff_t uvlinesize = s->uvlinesize;
2170     static const uint8_t hev_thresh_lut[2][64] = {
2171         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
2172           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2173           3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
2174           3, 3, 3, 3 },
2175         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
2176           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
2177           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2178           2, 2, 2, 2 }
2179     };
2180
2181     if (!filter_level)
2182         return;
2183
2184     if (is_vp7) {
2185         bedge_lim_y  = filter_level;
2186         bedge_lim_uv = filter_level * 2;
2187         mbedge_lim   = filter_level + 2;
2188     } else {
2189         bedge_lim_y  =
2190         bedge_lim_uv = filter_level * 2 + inner_limit;
2191         mbedge_lim   = bedge_lim_y + 4;
2192     }
2193
2194     hev_thresh = hev_thresh_lut[s->keyframe][filter_level];
2195
2196     if (mb_x) {
2197         s->vp8dsp.vp8_h_loop_filter16y(dst[0], linesize,
2198                                        mbedge_lim, inner_limit, hev_thresh);
2199         s->vp8dsp.vp8_h_loop_filter8uv(dst[1], dst[2], uvlinesize,
2200                                        mbedge_lim, inner_limit, hev_thresh);
2201     }
2202
2203 #define H_LOOP_FILTER_16Y_INNER(cond)                                         \
2204     if (cond && inner_filter) {                                               \
2205         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] +  4, linesize,           \
2206                                              bedge_lim_y, inner_limit,        \
2207                                              hev_thresh);                     \
2208         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] +  8, linesize,           \
2209                                              bedge_lim_y, inner_limit,        \
2210                                              hev_thresh);                     \
2211         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0] + 12, linesize,           \
2212                                              bedge_lim_y, inner_limit,        \
2213                                              hev_thresh);                     \
2214         s->vp8dsp.vp8_h_loop_filter8uv_inner(dst[1] +  4, dst[2] + 4,         \
2215                                              uvlinesize,  bedge_lim_uv,       \
2216                                              inner_limit, hev_thresh);        \
2217     }
2218
2219     H_LOOP_FILTER_16Y_INNER(!is_vp7)
2220
2221     if (mb_y) {
2222         s->vp8dsp.vp8_v_loop_filter16y(dst[0], linesize,
2223                                        mbedge_lim, inner_limit, hev_thresh);
2224         s->vp8dsp.vp8_v_loop_filter8uv(dst[1], dst[2], uvlinesize,
2225                                        mbedge_lim, inner_limit, hev_thresh);
2226     }
2227
2228     if (inner_filter) {
2229         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] +  4 * linesize,
2230                                              linesize, bedge_lim_y,
2231                                              inner_limit, hev_thresh);
2232         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] +  8 * linesize,
2233                                              linesize, bedge_lim_y,
2234                                              inner_limit, hev_thresh);
2235         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0] + 12 * linesize,
2236                                              linesize, bedge_lim_y,
2237                                              inner_limit, hev_thresh);
2238         s->vp8dsp.vp8_v_loop_filter8uv_inner(dst[1] +  4 * uvlinesize,
2239                                              dst[2] +  4 * uvlinesize,
2240                                              uvlinesize, bedge_lim_uv,
2241                                              inner_limit, hev_thresh);
2242     }
2243
2244     H_LOOP_FILTER_16Y_INNER(is_vp7)
2245 }
2246
2247 static av_always_inline
2248 void filter_mb_simple(VP8Context *s, uint8_t *dst, VP8FilterStrength *f,
2249                       int mb_x, int mb_y)
2250 {
2251     int mbedge_lim, bedge_lim;
2252     int filter_level = f->filter_level;
2253     int inner_limit  = f->inner_limit;
2254     int inner_filter = f->inner_filter;
2255     ptrdiff_t linesize = s->linesize;
2256
2257     if (!filter_level)
2258         return;
2259
2260     bedge_lim  = 2 * filter_level + inner_limit;
2261     mbedge_lim = bedge_lim + 4;
2262
2263     if (mb_x)
2264         s->vp8dsp.vp8_h_loop_filter_simple(dst, linesize, mbedge_lim);
2265     if (inner_filter) {
2266         s->vp8dsp.vp8_h_loop_filter_simple(dst +  4, linesize, bedge_lim);
2267         s->vp8dsp.vp8_h_loop_filter_simple(dst +  8, linesize, bedge_lim);
2268         s->vp8dsp.vp8_h_loop_filter_simple(dst + 12, linesize, bedge_lim);
2269     }
2270
2271     if (mb_y)
2272         s->vp8dsp.vp8_v_loop_filter_simple(dst, linesize, mbedge_lim);
2273     if (inner_filter) {
2274         s->vp8dsp.vp8_v_loop_filter_simple(dst +  4 * linesize, linesize, bedge_lim);
2275         s->vp8dsp.vp8_v_loop_filter_simple(dst +  8 * linesize, linesize, bedge_lim);
2276         s->vp8dsp.vp8_v_loop_filter_simple(dst + 12 * linesize, linesize, bedge_lim);
2277     }
2278 }
2279
2280 #define MARGIN (16 << 2)
2281 static av_always_inline
2282 int vp78_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *curframe,
2283                                     VP8Frame *prev_frame, int is_vp7)
2284 {
2285     VP8Context *s = avctx->priv_data;
2286     int mb_x, mb_y;
2287
2288     s->mv_bounds.mv_min.y = -MARGIN;
2289     s->mv_bounds.mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;
2290     for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
2291         VP8Macroblock *mb = s->macroblocks_base +
2292                             ((s->mb_width + 1) * (mb_y + 1) + 1);
2293         int mb_xy = mb_y * s->mb_width;
2294
2295         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED * 0x01010101);
2296
2297         s->mv_bounds.mv_min.x = -MARGIN;
2298         s->mv_bounds.mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;
2299
2300         for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
2301             if (vpX_rac_is_end(&s->c)) {
2302                 return AVERROR_INVALIDDATA;
2303             }
2304             if (mb_y == 0)
2305                 AV_WN32A((mb - s->mb_width - 1)->intra4x4_pred_mode_top,
2306                          DC_PRED * 0x01010101);
2307             decode_mb_mode(s, &s->mv_bounds, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
2308                            prev_frame && prev_frame->seg_map ?
2309                            prev_frame->seg_map->data + mb_xy : NULL, 1, is_vp7);
2310             s->mv_bounds.mv_min.x -= 64;
2311             s->mv_bounds.mv_max.x -= 64;
2312         }
2313         s->mv_bounds.mv_min.y -= 64;
2314         s->mv_bounds.mv_max.y -= 64;
2315     }
2316     return 0;
2317 }
2318
/* VP7 entry point: runs vp78_decode_mv_mb_modes() with is_vp7 fixed to IS_VP7
 * and returns its result unchanged. */
static int vp7_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame,
                                   VP8Frame *prev_frame)
{
    return vp78_decode_mv_mb_modes(avctx, cur_frame, prev_frame, IS_VP7);
}
2324
/* VP8 entry point: runs vp78_decode_mv_mb_modes() with is_vp7 fixed to IS_VP8
 * and returns its result unchanged. */
static int vp8_decode_mv_mb_modes(AVCodecContext *avctx, VP8Frame *cur_frame,
                                   VP8Frame *prev_frame)
{
    return vp78_decode_mv_mb_modes(avctx, cur_frame, prev_frame, IS_VP8);
}
2330
#if HAVE_THREADS
/*
 * Block the calling row thread (td) until the other row thread (otd) has
 * published a position of at least
 *     (mb_y_check << 16) | (mb_x_check & 0xFFFF)
 * in otd->thread_mb_pos.
 *
 * While waiting, td advertises the awaited position in td->wait_mb_pos, which
 * is the value update_pos() compares against on the other side; the field is
 * set back to INT_MAX once the wait is over. The wait sleeps on otd->cond
 * under otd->lock and re-checks the position after every wakeup.
 *
 * The arguments are substituted without additional parentheses.
 */
#define check_thread_pos(td, otd, mb_x_check, mb_y_check)                     \
    do {                                                                      \
        int tmp = (mb_y_check << 16) | (mb_x_check & 0xFFFF);                 \
        if (atomic_load(&otd->thread_mb_pos) < tmp) {                         \
            pthread_mutex_lock(&otd->lock);                                   \
            atomic_store(&td->wait_mb_pos, tmp);                              \
            do {                                                              \
                if (atomic_load(&otd->thread_mb_pos) >= tmp)                  \
                    break;                                                    \
                pthread_cond_wait(&otd->cond, &otd->lock);                    \
            } while (1);                                                      \
            atomic_store(&td->wait_mb_pos, INT_MAX);                          \
            pthread_mutex_unlock(&otd->lock);                                 \
        }                                                                     \
    } while (0)

/*
 * Publish (mb_y << 16) | (mb_x & 0xFFFF) as td's current position in
 * td->thread_mb_pos. When slice threading is active with more than one job,
 * additionally broadcast on td->cond if a neighbouring thread (next_td or
 * prev_td, other than td itself) is waiting for a position that this update
 * has reached. If either neighbour pointer is NULL the broadcast is sent
 * without consulting the wait positions.
 *
 * Besides its parameters, the macro reads avctx, num_jobs, next_td and
 * prev_td from the scope in which it is expanded.
 */
#define update_pos(td, mb_y, mb_x)                                            \
    do {                                                                      \
        int pos              = (mb_y << 16) | (mb_x & 0xFFFF);                \
        int sliced_threading = (avctx->active_thread_type == FF_THREAD_SLICE) && \
                               (num_jobs > 1);                                \
        int is_null          = !next_td || !prev_td;                          \
        int pos_check        = (is_null) ? 1 :                                \
            (next_td != td && pos >= atomic_load(&next_td->wait_mb_pos)) ||   \
            (prev_td != td && pos >= atomic_load(&prev_td->wait_mb_pos));     \
        atomic_store(&td->thread_mb_pos, pos);                                \
        if (sliced_threading && pos_check) {                                  \
            pthread_mutex_lock(&td->lock);                                    \
            pthread_cond_broadcast(&td->cond);                                \
            pthread_mutex_unlock(&td->lock);                                  \
        }                                                                     \
    } while (0)
#else
/* Without thread support both helpers expand to an empty loop statement. */
#define check_thread_pos(td, otd, mb_x_check, mb_y_check) while(0)
#define update_pos(td, mb_y, mb_x) while(0)
#endif
2368
2369 static av_always_inline int decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
2370                                         int jobnr, int threadnr, int is_vp7)
2371 {
2372     VP8Context *s = avctx->priv_data;
2373     VP8ThreadData *prev_td, *next_td, *td = &s->thread_data[threadnr];
2374     int mb_y = atomic_load(&td->thread_mb_pos) >> 16;
2375     int mb_x, mb_xy = mb_y * s->mb_width;
2376     int num_jobs = s->num_jobs;
2377     VP8Frame *curframe = s->curframe, *prev_frame = s->prev_frame;
2378     VP56RangeCoder *c  = &s->coeff_partition[mb_y & (s->num_coeff_partitions - 1)];
2379     VP8Macroblock *mb;
2380     uint8_t *dst[3] = {
2381         curframe->tf.f->data[0] + 16 * mb_y * s->linesize,
2382         curframe->tf.f->data[1] +  8 * mb_y * s->uvlinesize,
2383         curframe->tf.f->data[2] +  8 * mb_y * s->uvlinesize
2384     };
2385
2386     if (vpX_rac_is_end(c))
2387          return AVERROR_INVALIDDATA;
2388
2389     if (mb_y == 0)
2390         prev_td = td;
2391     else
2392         prev_td = &s->thread_data[(jobnr + num_jobs - 1) % num_jobs];
2393     if (mb_y == s->mb_height - 1)
2394         next_td = td;
2395     else
2396         next_td = &s->thread_data[(jobnr + 1) % num_jobs];
2397     if (s->mb_layout == 1)
2398         mb = s->macroblocks_base + ((s->mb_width + 1) * (mb_y + 1) + 1);
2399     else {
2400         // Make sure the previous frame has read its segmentation map,
2401         // if we re-use the same map.
2402         if (prev_frame && s->segmentation.enabled &&
2403             !s->segmentation.update_map)
2404             ff_thread_await_progress(&prev_frame->tf, mb_y, 0);
2405         mb = s->macroblocks + (s->mb_height - mb_y - 1) * 2;
2406         memset(mb - 1, 0, sizeof(*mb)); // zero left macroblock
2407         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED * 0x01010101);
2408     }
2409
2410     if (!is_vp7 || mb_y == 0)
2411         memset(td->left_nnz, 0, sizeof(td->left_nnz));
2412
2413     td->mv_bounds.mv_min.x = -MARGIN;
2414     td->mv_bounds.mv_max.x = ((s->mb_width - 1) << 6) + MARGIN;
2415
2416     for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
2417         if (vpX_rac_is_end(c))
2418             return AVERROR_INVALIDDATA;
2419         // Wait for previous thread to read mb_x+2, and reach mb_y-1.
2420         if (prev_td != td) {
2421             if (threadnr != 0) {
2422                 check_thread_pos(td, prev_td,
2423                                  mb_x + (is_vp7 ? 2 : 1),
2424                                  mb_y - (is_vp7 ? 2 : 1));
2425             } else {
2426                 check_thread_pos(td, prev_td,
2427                                  mb_x + (is_vp7 ? 2 : 1) + s->mb_width + 3,
2428                                  mb_y - (is_vp7 ? 2 : 1));
2429             }
2430         }
2431
2432         s->vdsp.prefetch(dst[0] + (mb_x & 3) * 4 * s->linesize + 64,
2433                          s->linesize, 4);
2434         s->vdsp.prefetch(dst[1] + (mb_x & 7) * s->uvlinesize + 64,
2435                          dst[2] - dst[1], 2);
2436
2437         if (!s->mb_layout)
2438             decode_mb_mode(s, &td->mv_bounds, mb, mb_x, mb_y, curframe->seg_map->data + mb_xy,
2439                            prev_frame && prev_frame->seg_map ?
2440                            prev_frame->seg_map->data + mb_xy : NULL, 0, is_vp7);
2441
2442         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);
2443
2444         if (!mb->skip)
2445             decode_mb_coeffs(s, td, c, mb, s->top_nnz[mb_x], td->left_nnz, is_vp7);
2446
2447         if (mb->mode <= MODE_I4x4)
2448             intra_predict(s, td, dst, mb, mb_x, mb_y, is_vp7);
2449         else
2450             inter_predict(s, td, dst, mb, mb_x, mb_y);
2451
2452         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);
2453
2454         if (!mb->skip) {
2455             idct_mb(s, td, dst, mb);
2456         } else {
2457             AV_ZERO64(td->left_nnz);
2458             AV_WN64(s->top_nnz[mb_x], 0);   // array of 9, so unaligned
2459
2460             /* Reset DC block predictors if they would exist
2461              * if the mb had coefficients */
2462             if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
2463                 td->left_nnz[8]     = 0;
2464                 s->top_nnz[mb_x][8] = 0;
2465             }
2466         }
2467
2468         if (s->deblock_filter)
2469             filter_level_for_mb(s, mb, &td->filter_strength[mb_x], is_vp7);
2470
2471         if (s->deblock_filter && num_jobs != 1 && threadnr == num_jobs - 1) {
2472             if (s->filter.simple)
2473                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2474                                  NULL, NULL, s->linesize, 0, 1);
2475             else
2476                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2477                                  dst[1], dst[2], s->linesize, s->uvlinesize, 0);
2478         }
2479
2480         prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN2);
2481
2482         dst[0]      += 16;
2483         dst[1]      += 8;
2484         dst[2]      += 8;
2485         td->mv_bounds.mv_min.x -= 64;
2486         td->mv_bounds.mv_max.x -= 64;
2487
2488         if (mb_x == s->mb_width + 1) {
2489             update_pos(td, mb_y, s->mb_width + 3);
2490         } else {
2491             update_pos(td, mb_y, mb_x);
2492         }
2493     }
2494     return 0;
2495 }
2496
2497 static int vp7_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
2498                                         int jobnr, int threadnr)
2499 {
2500     return decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr, 1);
2501 }
2502
2503 static int vp8_decode_mb_row_no_filter(AVCodecContext *avctx, void *tdata,
2504                                         int jobnr, int threadnr)
2505 {
2506     return decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr, 0);
2507 }
2508
2509 static av_always_inline void filter_mb_row(AVCodecContext *avctx, void *tdata,
2510                               int jobnr, int threadnr, int is_vp7)
2511 {
2512     VP8Context *s = avctx->priv_data;
2513     VP8ThreadData *td = &s->thread_data[threadnr];
2514     int mb_x, mb_y = atomic_load(&td->thread_mb_pos) >> 16, num_jobs = s->num_jobs;
2515     AVFrame *curframe = s->curframe->tf.f;
2516     VP8Macroblock *mb;
2517     VP8ThreadData *prev_td, *next_td;
2518     uint8_t *dst[3] = {
2519         curframe->data[0] + 16 * mb_y * s->linesize,
2520         curframe->data[1] +  8 * mb_y * s->uvlinesize,
2521         curframe->data[2] +  8 * mb_y * s->uvlinesize
2522     };
2523
2524     if (s->mb_layout == 1)
2525         mb = s->macroblocks_base + ((s->mb_width + 1) * (mb_y + 1) + 1);
2526     else
2527         mb = s->macroblocks + (s->mb_height - mb_y - 1) * 2;
2528
2529     if (mb_y == 0)
2530         prev_td = td;
2531     else
2532         prev_td = &s->thread_data[(jobnr + num_jobs - 1) % num_jobs];
2533     if (mb_y == s->mb_height - 1)
2534         next_td = td;
2535     else
2536         next_td = &s->thread_data[(jobnr + 1) % num_jobs];
2537
2538     for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb++) {
2539         VP8FilterStrength *f = &td->filter_strength[mb_x];
2540         if (prev_td != td)
2541             check_thread_pos(td, prev_td,
2542                              (mb_x + 1) + (s->mb_width + 3), mb_y - 1);
2543         if (next_td != td)
2544             if (next_td != &s->thread_data[0])
2545                 check_thread_pos(td, next_td, mb_x + 1, mb_y + 1);
2546
2547         if (num_jobs == 1) {
2548             if (s->filter.simple)
2549                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2550                                  NULL, NULL, s->linesize, 0, 1);
2551             else
2552                 backup_mb_border(s->top_border[mb_x + 1], dst[0],
2553                                  dst[1], dst[2], s->linesize, s->uvlinesize, 0);
2554         }
2555
2556         if (s->filter.simple)
2557             filter_mb_simple(s, dst[0], f, mb_x, mb_y);
2558         else
2559             filter_mb(s, dst, f, mb_x, mb_y, is_vp7);
2560         dst[0] += 16;
2561         dst[1] += 8;
2562         dst[2] += 8;
2563
2564         update_pos(td, mb_y, (s->mb_width + 3) + mb_x);
2565     }
2566 }
2567
2568 static void vp7_filter_mb_row(AVCodecContext *avctx, void *tdata,
2569                               int jobnr, int threadnr)
2570 {
2571     filter_mb_row(avctx, tdata, jobnr, threadnr, 1);
2572 }
2573
2574 static void vp8_filter_mb_row(AVCodecContext *avctx, void *tdata,
2575                               int jobnr, int threadnr)
2576 {
2577     filter_mb_row(avctx, tdata, jobnr, threadnr, 0);
2578 }
2579
2580 static av_always_inline
2581 int vp78_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata, int jobnr,
2582                               int threadnr, int is_vp7)
2583 {
2584     VP8Context *s = avctx->priv_data;
2585     VP8ThreadData *td = &s->thread_data[jobnr];
2586     VP8ThreadData *next_td = NULL, *prev_td = NULL;
2587     VP8Frame *curframe = s->curframe;
2588     int mb_y, num_jobs = s->num_jobs;
2589     int ret;
2590
2591     td->thread_nr = threadnr;
2592     td->mv_bounds.mv_min.y   = -MARGIN - 64 * threadnr;
2593     td->mv_bounds.mv_max.y   = ((s->mb_height - 1) << 6) + MARGIN - 64 * threadnr;
2594     for (mb_y = jobnr; mb_y < s->mb_height; mb_y += num_jobs) {
2595         atomic_store(&td->thread_mb_pos, mb_y << 16);
2596         ret = s->decode_mb_row_no_filter(avctx, tdata, jobnr, threadnr);
2597         if (ret < 0) {
2598             update_pos(td, s->mb_height, INT_MAX & 0xFFFF);
2599             return ret;
2600         }
2601         if (s->deblock_filter)
2602             s->filter_mb_row(avctx, tdata, jobnr, threadnr);
2603         update_pos(td, mb_y, INT_MAX & 0xFFFF);
2604
2605         td->mv_bounds.mv_min.y -= 64 * num_jobs;
2606         td->mv_bounds.mv_max.y -= 64 * num_jobs;
2607
2608         if (avctx->active_thread_type == FF_THREAD_FRAME)
2609             ff_thread_report_progress(&curframe->tf, mb_y, 0);
2610     }
2611
2612     return 0;
2613 }
2614
/* VP7 job function passed to avctx->execute2(): vp78_decode_mb_row_sliced()
 * with is_vp7 fixed to IS_VP7. */
static int vp7_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
                                    int jobnr, int threadnr)
{
    return vp78_decode_mb_row_sliced(avctx, tdata, jobnr, threadnr, IS_VP7);
}
2620
/* VP8 job function passed to avctx->execute2(): vp78_decode_mb_row_sliced()
 * with is_vp7 fixed to IS_VP8. */
static int vp8_decode_mb_row_sliced(AVCodecContext *avctx, void *tdata,
                                    int jobnr, int threadnr)
{
    return vp78_decode_mb_row_sliced(avctx, tdata, jobnr, threadnr, IS_VP8);
}
2626
2627 static av_always_inline
2628 int vp78_decode_frame(AVCodecContext *avctx, AVFrame *rframe, int *got_frame,
2629                       const AVPacket *avpkt, int is_vp7)
2630 {
2631     VP8Context *s = avctx->priv_data;
2632     int ret, i, referenced, num_jobs;
2633     enum AVDiscard skip_thresh;
2634     VP8Frame *av_uninit(curframe), *prev_frame;
2635
2636     if (is_vp7)
2637         ret = vp7_decode_frame_header(s, avpkt->data, avpkt->size);
2638     else
2639         ret = vp8_decode_frame_header(s, avpkt->data, avpkt->size);
2640
2641     if (ret < 0)
2642         goto err;
2643
2644     if (s->actually_webp) {
2645         // avctx->pix_fmt already set in caller.
2646     } else if (!is_vp7 && s->pix_fmt == AV_PIX_FMT_NONE) {
2647         s->pix_fmt = get_pixel_format(s);
2648         if (s->pix_fmt < 0) {
2649             ret = AVERROR(EINVAL);
2650             goto err;
2651         }
2652         avctx->pix_fmt = s->pix_fmt;
2653     }
2654
2655     prev_frame = s->framep[VP56_FRAME_CURRENT];
2656
2657     referenced = s->update_last || s->update_golden == VP56_FRAME_CURRENT ||
2658                  s->update_altref == VP56_FRAME_CURRENT;
2659
2660     skip_thresh = !referenced ? AVDISCARD_NONREF
2661                               : !s->keyframe ? AVDISCARD_NONKEY
2662                                              : AVDISCARD_ALL;
2663
2664     if (avctx->skip_frame >= skip_thresh) {
2665         s->invisible = 1;
2666         memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
2667         goto skip_decode;
2668     }
2669     s->deblock_filter = s->filter.level && avctx->skip_loop_filter < skip_thresh;
2670
2671     // release no longer referenced frames
2672     for (i = 0; i < 5; i++)
2673         if (s->frames[i].tf.f->buf[0] &&
2674             &s->frames[i] != prev_frame &&
2675             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
2676             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN]   &&
2677             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2])
2678             vp8_release_frame(s, &s->frames[i]);
2679
2680     curframe = s->framep[VP56_FRAME_CURRENT] = vp8_find_free_buffer(s);
2681
2682     if (!s->colorspace)
2683         avctx->colorspace = AVCOL_SPC_BT470BG;
2684     if (s->fullrange)
2685         avctx->color_range = AVCOL_RANGE_JPEG;
2686     else
2687         avctx->color_range = AVCOL_RANGE_MPEG;
2688
2689     /* Given that arithmetic probabilities are updated every frame, it's quite
2690      * likely that the values we have on a random interframe are complete
2691      * junk if we didn't start decode on a keyframe. So just don't display
2692      * anything rather than junk. */
2693     if (!s->keyframe && (!s->framep[VP56_FRAME_PREVIOUS] ||
2694                          !s->framep[VP56_FRAME_GOLDEN]   ||
2695                          !s->framep[VP56_FRAME_GOLDEN2])) {
2696         av_log(avctx, AV_LOG_WARNING,
2697                "Discarding interframe without a prior keyframe!\n");
2698         ret = AVERROR_INVALIDDATA;
2699         goto err;
2700     }
2701
2702     curframe->tf.f->key_frame = s->keyframe;
2703     curframe->tf.f->pict_type = s->keyframe ? AV_PICTURE_TYPE_I
2704                                             : AV_PICTURE_TYPE_P;
2705     if ((ret = vp8_alloc_frame(s, curframe, referenced)) < 0)
2706         goto err;
2707
2708     // check if golden and altref are swapped
2709     if (s->update_altref != VP56_FRAME_NONE)
2710         s->next_framep[VP56_FRAME_GOLDEN2] = s->framep[s->update_altref];
2711     else
2712         s->next_framep[VP56_FRAME_GOLDEN2] = s->framep[VP56_FRAME_GOLDEN2];
2713
2714     if (s->update_golden != VP56_FRAME_NONE)
2715         s->next_framep[VP56_FRAME_GOLDEN] = s->framep[s->update_golden];
2716     else
2717         s->next_framep[VP56_FRAME_GOLDEN] = s->framep[VP56_FRAME_GOLDEN];
2718
2719     if (s->update_last)
2720         s->next_framep[VP56_FRAME_PREVIOUS] = curframe;
2721     else
2722         s->next_framep[VP56_FRAME_PREVIOUS] = s->framep[VP56_FRAME_PREVIOUS];
2723
2724     s->next_framep[VP56_FRAME_CURRENT] = curframe;
2725
2726     if (ffcodec(avctx->codec)->update_thread_context)
2727         ff_thread_finish_setup(avctx);
2728
2729     if (avctx->hwaccel) {
2730         ret = avctx->hwaccel->start_frame(avctx, avpkt->data, avpkt->size);
2731         if (ret < 0)
2732             goto err;
2733
2734         ret = avctx->hwaccel->decode_slice(avctx, avpkt->data, avpkt->size);
2735         if (ret < 0)
2736             goto err;
2737
2738         ret = avctx->hwaccel->end_frame(avctx);
2739         if (ret < 0)
2740             goto err;
2741
2742     } else {
2743         s->linesize   = curframe->tf.f->linesize[0];
2744         s->uvlinesize = curframe->tf.f->linesize[1];
2745
2746         memset(s->top_nnz, 0, s->mb_width * sizeof(*s->top_nnz));
2747         /* Zero macroblock structures for top/top-left prediction
2748          * from outside the frame. */
2749         if (!s->mb_layout)
2750             memset(s->macroblocks + s->mb_height * 2 - 1, 0,
2751                    (s->mb_width + 1) * sizeof(*s->macroblocks));
2752         if (!s->mb_layout && s->keyframe)
2753             memset(s->intra4x4_pred_mode_top, DC_PRED, s->mb_width * 4);
2754
2755         memset(s->ref_count, 0, sizeof(s->ref_count));
2756
2757         if (s->mb_layout == 1) {
2758             // Make sure the previous frame has read its segmentation map,
2759             // if we re-use the same map.
2760             if (prev_frame && s->segmentation.enabled &&
2761                 !s->segmentation.update_map)
2762                 ff_thread_await_progress(&prev_frame->tf, 1, 0);
2763             if (is_vp7)
2764                 ret = vp7_decode_mv_mb_modes(avctx, curframe, prev_frame);
2765             else
2766                 ret = vp8_decode_mv_mb_modes(avctx, curframe, prev_frame);
2767             if (ret < 0)
2768                 goto err;
2769         }
2770
2771         if (avctx->active_thread_type == FF_THREAD_FRAME)
2772             num_jobs = 1;
2773         else
2774             num_jobs = FFMIN(s->num_coeff_partitions, avctx->thread_count);
2775         s->num_jobs   = num_jobs;
2776         s->curframe   = curframe;
2777         s->prev_frame = prev_frame;
2778         s->mv_bounds.mv_min.y   = -MARGIN;
2779         s->mv_bounds.mv_max.y   = ((s->mb_height - 1) << 6) + MARGIN;
2780         for (i = 0; i < MAX_THREADS; i++) {
2781             VP8ThreadData *td = &s->thread_data[i];
2782             atomic_init(&td->thread_mb_pos, 0);
2783             atomic_init(&td->wait_mb_pos, INT_MAX);
2784         }
2785         if (is_vp7)
2786             avctx->execute2(avctx, vp7_decode_mb_row_sliced, s->thread_data, NULL,
2787                             num_jobs);
2788         else
2789             avctx->execute2(avctx, vp8_decode_mb_row_sliced, s->thread_data, NULL,
2790                             num_jobs);
2791     }
2792
2793     ff_thread_report_progress(&curframe->tf, INT_MAX, 0);
2794     memcpy(&s->framep[0], &s->next_framep[0], sizeof(s->framep[0]) * 4);
2795
2796 skip_decode:
2797     // if future frames don't use the updated probabilities,
2798     // reset them to the values we saved
2799     if (!s->update_probabilities)
2800         s->prob[0] = s->prob[1];
2801
2802     if (!s->invisible) {
2803         if ((ret = av_frame_ref(rframe, curframe->tf.f)) < 0)
2804             return ret;
2805         *got_frame = 1;
2806     }
2807
2808     return avpkt->size;
2809 err:
2810     memcpy(&s->next_framep[0], &s->framep[0], sizeof(s->framep[0]) * 4);
2811     return ret;
2812 }
2813
/* Non-static VP8 decode callback: runs vp78_decode_frame() with is_vp7 fixed
 * to IS_VP8 and returns its result unchanged. */
int ff_vp8_decode_frame(AVCodecContext *avctx, AVFrame *frame,
                        int *got_frame, AVPacket *avpkt)
{
    return vp78_decode_frame(avctx, frame, got_frame, avpkt, IS_VP8);
}
2819
#if CONFIG_VP7_DECODER
/* VP7 decode callback: runs vp78_decode_frame() with is_vp7 fixed to IS_VP7
 * and returns its result unchanged. */
static int vp7_decode_frame(AVCodecContext *avctx, AVFrame *frame,
                            int *got_frame, AVPacket *avpkt)
{
    return vp78_decode_frame(avctx, frame, got_frame, avpkt, IS_VP7);
}
#endif /* CONFIG_VP7_DECODER */
2827
2828 av_cold int ff_vp8_decode_free(AVCodecContext *avctx)
2829 {
2830     VP8Context *s = avctx->priv_data;
2831     int i;
2832
2833     vp8_decode_flush_impl(avctx, 1);
2834     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++)
2835         av_frame_free(&s->frames[i].tf.f);
2836
2837     return 0;
2838 }
2839
2840 static av_cold int vp8_init_frames(VP8Context *s)
2841 {
2842     int i;
2843     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++) {
2844         s->frames[i].tf.f = av_frame_alloc();
2845         if (!s->frames[i].tf.f)
2846             return AVERROR(ENOMEM);
2847     }
2848     return 0;
2849 }
2850
2851 static av_always_inline
2852 int vp78_decode_init(AVCodecContext *avctx, int is_vp7)
2853 {
2854     VP8Context *s = avctx->priv_data;
2855     int ret;
2856
2857     s->avctx = avctx;
2858     s->vp7   = avctx->codec->id == AV_CODEC_ID_VP7;
2859     s->pix_fmt = AV_PIX_FMT_NONE;
2860     avctx->pix_fmt = AV_PIX_FMT_YUV420P;
2861
2862     ff_videodsp_init(&s->vdsp, 8);
2863
2864     ff_vp78dsp_init(&s->vp8dsp);
2865     if (CONFIG_VP7_DECODER && is_vp7) {
2866         ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP7, 8, 1);
2867         ff_vp7dsp_init(&s->vp8dsp);
2868         s->decode_mb_row_no_filter = vp7_decode_mb_row_no_filter;
2869         s->filter_mb_row           = vp7_filter_mb_row;
2870     } else if (CONFIG_VP8_DECODER && !is_vp7) {
2871         ff_h264_pred_init(&s->hpc, AV_CODEC_ID_VP8, 8, 1);
2872         ff_vp8dsp_init(&s->vp8dsp);
2873         s->decode_mb_row_no_filter = vp8_decode_mb_row_no_filter;
2874         s->filter_mb_row           = vp8_filter_mb_row;
2875     }
2876
2877     /* does not change for VP8 */
2878     memcpy(s->prob[0].scan, ff_zigzag_scan, sizeof(s->prob[0].scan));
2879
2880     if ((ret = vp8_init_frames(s)) < 0) {
2881         ff_vp8_decode_free(avctx);
2882         return ret;
2883     }
2884
2885     return 0;
2886 }
2887
#if CONFIG_VP7_DECODER
/* VP7 init callback: runs vp78_decode_init() with is_vp7 fixed to IS_VP7. */
static int vp7_decode_init(AVCodecContext *avctx)
{
    return vp78_decode_init(avctx, IS_VP7);
}
#endif /* CONFIG_VP7_DECODER */
2894
/* Non-static VP8 init callback: runs vp78_decode_init() with is_vp7 fixed to
 * IS_VP8. */
av_cold int ff_vp8_decode_init(AVCodecContext *avctx)
{
    return vp78_decode_init(avctx, IS_VP8);
}
2899
#if CONFIG_VP8_DECODER
#if HAVE_THREADS
/* Translate a pointer into s_src->frames[] to the entry with the same index
 * in s->frames[]; NULL stays NULL. Uses s and s_src from the caller's scope. */
#define REBASE(pic) ((pic) ? (pic) - &s_src->frames[0] + &s->frames[0] : NULL)

/**
 * Frame-threading hook: copy the decoder state that the next frame depends
 * on from the source thread's context into the destination's.
 *
 * @param dst context to update
 * @param src context to copy from
 * @return 0 on success, or the negative error returned by vp8_ref_frame()
 */
static int vp8_decode_update_thread_context(AVCodecContext *dst,
                                            const AVCodecContext *src)
{
    VP8Context *s = dst->priv_data, *s_src = src->priv_data;
    const int same_size = s_src->mb_width  == s->mb_width &&
                          s_src->mb_height == s->mb_height;
    int i;

    /* Buffers sized for other dimensions are dropped before the new
     * dimensions are adopted. */
    if (s->macroblocks_base && !same_size) {
        free_buffers(s);
        s->mb_width  = s_src->mb_width;
        s->mb_height = s_src->mb_height;
    }

    s->pix_fmt      = s_src->pix_fmt;
    /* Take prob[0] if the source keeps its updated probabilities,
     * prob[1] otherwise. */
    s->prob[0]      = s_src->prob[!s_src->update_probabilities];
    s->segmentation = s_src->segmentation;
    s->lf_delta     = s_src->lf_delta;
    memcpy(s->sign_bias, s_src->sign_bias, sizeof(s->sign_bias));

    for (i = 0; i < FF_ARRAY_ELEMS(s_src->frames); i++) {
        int ret;

        /* Only frames that actually hold a buffer are referenced. */
        if (!s_src->frames[i].tf.f->buf[0])
            continue;

        ret = vp8_ref_frame(s, &s->frames[i], &s_src->frames[i]);
        if (ret < 0)
            return ret;
    }

    /* The source's pending reference set becomes this context's current
     * one. */
    s->framep[0] = REBASE(s_src->next_framep[0]);
    s->framep[1] = REBASE(s_src->next_framep[1]);
    s->framep[2] = REBASE(s_src->next_framep[2]);
    s->framep[3] = REBASE(s_src->next_framep[3]);

    return 0;
}
#endif /* HAVE_THREADS */
#endif /* CONFIG_VP8_DECODER */
2940
#if CONFIG_VP7_DECODER
/* Codec registration entry for the VP7 decoder. It shares the close and
 * flush callbacks with VP8 and declares no threading capabilities, no
 * update_thread_context hook and no hardware configurations. */
const FFCodec ff_vp7_decoder = {
    .p.name                = "vp7",
    .p.long_name           = NULL_IF_CONFIG_SMALL("On2 VP7"),
    .p.type                = AVMEDIA_TYPE_VIDEO,
    .p.id                  = AV_CODEC_ID_VP7,
    .priv_data_size        = sizeof(VP8Context),
    .init                  = vp7_decode_init,
    .close                 = ff_vp8_decode_free,
    FF_CODEC_DECODE_CB(vp7_decode_frame),
    .p.capabilities        = AV_CODEC_CAP_DR1,
    .caps_internal         = FF_CODEC_CAP_INIT_THREADSAFE,
    .flush                 = vp8_decode_flush,
};
#endif /* CONFIG_VP7_DECODER */
2956
#if CONFIG_VP8_DECODER
/* Codec registration entry for the VP8 decoder. It declares frame and slice
 * threading, installs the update_thread_context hook when threads are
 * enabled, and lists the VAAPI / NVDEC hardware configurations that were
 * compiled in. */
const FFCodec ff_vp8_decoder = {
    .p.name                = "vp8",
    .p.long_name           = NULL_IF_CONFIG_SMALL("On2 VP8"),
    .p.type                = AVMEDIA_TYPE_VIDEO,
    .p.id                  = AV_CODEC_ID_VP8,
    .priv_data_size        = sizeof(VP8Context),
    .init                  = ff_vp8_decode_init,
    .close                 = ff_vp8_decode_free,
    FF_CODEC_DECODE_CB(ff_vp8_decode_frame),
    .p.capabilities        = AV_CODEC_CAP_DR1 | AV_CODEC_CAP_FRAME_THREADS |
                             AV_CODEC_CAP_SLICE_THREADS,
    .caps_internal         = FF_CODEC_CAP_INIT_THREADSAFE |
                             FF_CODEC_CAP_ALLOCATE_PROGRESS,
    .flush                 = vp8_decode_flush,
    .update_thread_context = ONLY_IF_THREADS_ENABLED(vp8_decode_update_thread_context),
    /* NULL-terminated list of supported hardware configurations. */
    .hw_configs            = (const AVCodecHWConfigInternal *const []) {
#if CONFIG_VP8_VAAPI_HWACCEL
                               HWACCEL_VAAPI(vp8),
#endif
#if CONFIG_VP8_NVDEC_HWACCEL
                               HWACCEL_NVDEC(vp8),
#endif
                               NULL
                           },
};
#endif /* CONFIG_VP8_DECODER */