gst-libs/ext/libav/libavcodec/vp8.c

   1 /**
   2  * VP8 compatible video decoder
   3  *
   4  * Copyright (C) 2010 David Conrad
   5  * Copyright (C) 2010 Ronald S. Bultje
   6  * Copyright (C) 2010 Jason Garrett-Glaser
   7  *
   8  * This file is part of Libav.
   9  *
  10  * Libav is free software; you can redistribute it and/or
  11  * modify it under the terms of the GNU Lesser General Public
  12  * License as published by the Free Software Foundation; either
  13  * version 2.1 of the License, or (at your option) any later version.
  14  *
  15  * Libav is distributed in the hope that it will be useful,
  16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  18  * Lesser General Public License for more details.
  19  *
  20  * You should have received a copy of the GNU Lesser General Public
  21  * License along with Libav; if not, write to the Free Software
  22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  23  */
  24
  25 #include "libavutil/imgutils.h"
  26 #include "avcodec.h"
  27 #include "vp8.h"
  28 #include "vp8data.h"
  29 #include "rectangle.h"
  30 #include "thread.h"
  31
  32 #if ARCH_ARM
  33 #   include "arm/vp8.h"
  34 #endif
  35
  36 static void vp8_decode_flush(AVCodecContext *avctx)
  37 {
  38     VP8Context *s = avctx->priv_data;
  39     int i;
  40
  41     if (!avctx->is_copy) {
  42         for (i = 0; i < 5; i++)
  43             if (s->frames[i].data[0])
  44                 ff_thread_release_buffer(avctx, &s->frames[i]);
  45     }
  46     memset(s->framep, 0, sizeof(s->framep));
  47
  48     av_freep(&s->macroblocks_base);
  49     av_freep(&s->filter_strength);
  50     av_freep(&s->intra4x4_pred_mode_top);
  51     av_freep(&s->top_nnz);
  52     av_freep(&s->edge_emu_buffer);
  53     av_freep(&s->top_border);
  54     av_freep(&s->segmentation_map);
  55
  56     s->macroblocks        = NULL;
  57 }
  58
  59 static int update_dimensions(VP8Context *s, int width, int height)
  60 {
  61     if (width  != s->avctx->width ||
  62         height != s->avctx->height) {
  63         if (av_image_check_size(width, height, 0, s->avctx))
  64             return AVERROR_INVALIDDATA;
  65
  66         vp8_decode_flush(s->avctx);
  67
  68         avcodec_set_dimensions(s->avctx, width, height);
  69     }
  70
  71     s->mb_width  = (s->avctx->coded_width +15) / 16;
  72     s->mb_height = (s->avctx->coded_height+15) / 16;
  73
  74     s->macroblocks_base        = av_mallocz((s->mb_width+s->mb_height*2+1)*sizeof(*s->macroblocks));
  75     s->filter_strength         = av_mallocz(s->mb_width*sizeof(*s->filter_strength));
  76     s->intra4x4_pred_mode_top  = av_mallocz(s->mb_width*4);
  77     s->top_nnz                 = av_mallocz(s->mb_width*sizeof(*s->top_nnz));
  78     s->top_border              = av_mallocz((s->mb_width+1)*sizeof(*s->top_border));
  79     s->segmentation_map        = av_mallocz(s->mb_width*s->mb_height);
  80
  81     if (!s->macroblocks_base || !s->filter_strength || !s->intra4x4_pred_mode_top ||
  82         !s->top_nnz || !s->top_border || !s->segmentation_map)
  83         return AVERROR(ENOMEM);
  84
  85     s->macroblocks        = s->macroblocks_base + 1;
  86
  87     return 0;
  88 }
  89
  90 static void parse_segment_info(VP8Context *s)
  91 {
  92     VP56RangeCoder *c = &s->c;
  93     int i;
  94
  95     s->segmentation.update_map = vp8_rac_get(c);
  96
  97     if (vp8_rac_get(c)) { // update segment feature data
  98         s->segmentation.absolute_vals = vp8_rac_get(c);
  99
 100         for (i = 0; i < 4; i++)
 101             s->segmentation.base_quant[i]   = vp8_rac_get_sint(c, 7);
 102
 103         for (i = 0; i < 4; i++)
 104             s->segmentation.filter_level[i] = vp8_rac_get_sint(c, 6);
 105     }
 106     if (s->segmentation.update_map)
 107         for (i = 0; i < 3; i++)
 108             s->prob->segmentid[i] = vp8_rac_get(c) ? vp8_rac_get_uint(c, 8) : 255;
 109 }
 110
 111 static void update_lf_deltas(VP8Context *s)
 112 {
 113     VP56RangeCoder *c = &s->c;
 114     int i;
 115
 116     for (i = 0; i < 4; i++)
 117         s->lf_delta.ref[i]  = vp8_rac_get_sint(c, 6);
 118
 119     for (i = MODE_I4x4; i <= VP8_MVMODE_SPLIT; i++)
 120         s->lf_delta.mode[i] = vp8_rac_get_sint(c, 6);
 121 }
 122
 123 static int setup_partitions(VP8Context *s, const uint8_t *buf, int buf_size)
 124 {
 125     const uint8_t *sizes = buf;
 126     int i;
 127
 128     s->num_coeff_partitions = 1 << vp8_rac_get_uint(&s->c, 2);
 129
 130     buf      += 3*(s->num_coeff_partitions-1);
 131     buf_size -= 3*(s->num_coeff_partitions-1);
 132     if (buf_size < 0)
 133         return -1;
 134
 135     for (i = 0; i < s->num_coeff_partitions-1; i++) {
 136         int size = AV_RL24(sizes + 3*i);
 137         if (buf_size - size < 0)
 138             return -1;
 139
 140         ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, size);
 141         buf      += size;
 142         buf_size -= size;
 143     }
 144     ff_vp56_init_range_decoder(&s->coeff_partition[i], buf, buf_size);
 145
 146     return 0;
 147 }
 148
 149 static void get_quants(VP8Context *s)
 150 {
 151     VP56RangeCoder *c = &s->c;
 152     int i, base_qi;
 153
 154     int yac_qi     = vp8_rac_get_uint(c, 7);
 155     int ydc_delta  = vp8_rac_get_sint(c, 4);
 156     int y2dc_delta = vp8_rac_get_sint(c, 4);
 157     int y2ac_delta = vp8_rac_get_sint(c, 4);
 158     int uvdc_delta = vp8_rac_get_sint(c, 4);
 159     int uvac_delta = vp8_rac_get_sint(c, 4);
 160
 161     for (i = 0; i < 4; i++) {
 162         if (s->segmentation.enabled) {
 163             base_qi = s->segmentation.base_quant[i];
 164             if (!s->segmentation.absolute_vals)
 165                 base_qi += yac_qi;
 166         } else
 167             base_qi = yac_qi;
 168
 169         s->qmat[i].luma_qmul[0]    =       vp8_dc_qlookup[av_clip_uintp2(base_qi + ydc_delta , 7)];
 170         s->qmat[i].luma_qmul[1]    =       vp8_ac_qlookup[av_clip_uintp2(base_qi             , 7)];
 171         s->qmat[i].luma_dc_qmul[0] =   2 * vp8_dc_qlookup[av_clip_uintp2(base_qi + y2dc_delta, 7)];
 172         s->qmat[i].luma_dc_qmul[1] = 155 * vp8_ac_qlookup[av_clip_uintp2(base_qi + y2ac_delta, 7)] / 100;
 173         s->qmat[i].chroma_qmul[0]  =       vp8_dc_qlookup[av_clip_uintp2(base_qi + uvdc_delta, 7)];
 174         s->qmat[i].chroma_qmul[1]  =       vp8_ac_qlookup[av_clip_uintp2(base_qi + uvac_delta, 7)];
 175
 176         s->qmat[i].luma_dc_qmul[1] = FFMAX(s->qmat[i].luma_dc_qmul[1], 8);
 177         s->qmat[i].chroma_qmul[0]  = FFMIN(s->qmat[i].chroma_qmul[0], 132);
 178     }
 179 }
 180
 181 /**
 182  * Determine which buffers golden and altref should be updated with after this frame.
 183  * The spec isn't clear here, so I'm going by my understanding of what libvpx does
 184  *
 185  * Intra frames update all 3 references
 186  * Inter frames update VP56_FRAME_PREVIOUS if the update_last flag is set
 187  * If the update (golden|altref) flag is set, it's updated with the current frame
 188  *      if update_last is set, and VP56_FRAME_PREVIOUS otherwise.
 189  * If the flag is not set, the number read means:
 190  *      0: no update
 191  *      1: VP56_FRAME_PREVIOUS
 192  *      2: update golden with altref, or update altref with golden
 193  */
 194 static VP56Frame ref_to_update(VP8Context *s, int update, VP56Frame ref)
 195 {
 196     VP56RangeCoder *c = &s->c;
 197
 198     if (update)
 199         return VP56_FRAME_CURRENT;
 200
 201     switch (vp8_rac_get_uint(c, 2)) {
 202     case 1:
 203         return VP56_FRAME_PREVIOUS;
 204     case 2:
 205         return (ref == VP56_FRAME_GOLDEN) ? VP56_FRAME_GOLDEN2 : VP56_FRAME_GOLDEN;
 206     }
 207     return VP56_FRAME_NONE;
 208 }
 209
 210 static void update_refs(VP8Context *s)
 211 {
 212     VP56RangeCoder *c = &s->c;
 213
 214     int update_golden = vp8_rac_get(c);
 215     int update_altref = vp8_rac_get(c);
 216
 217     s->update_golden = ref_to_update(s, update_golden, VP56_FRAME_GOLDEN);
 218     s->update_altref = ref_to_update(s, update_altref, VP56_FRAME_GOLDEN2);
 219 }
 220
 221 static int decode_frame_header(VP8Context *s, const uint8_t *buf, int buf_size)
 222 {
 223     VP56RangeCoder *c = &s->c;
 224     int header_size, hscale, vscale, i, j, k, l, m, ret;
 225     int width  = s->avctx->width;
 226     int height = s->avctx->height;
 227
 228     s->keyframe  = !(buf[0] & 1);
 229     s->profile   =  (buf[0]>>1) & 7;
 230     s->invisible = !(buf[0] & 0x10);
 231     header_size  = AV_RL24(buf) >> 5;
 232     buf      += 3;
 233     buf_size -= 3;
 234
 235     if (s->profile > 3)
 236         av_log(s->avctx, AV_LOG_WARNING, "Unknown profile %d\n", s->profile);
 237
 238     if (!s->profile)
 239         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_epel_pixels_tab, sizeof(s->put_pixels_tab));
 240     else    // profile 1-3 use bilinear, 4+ aren't defined so whatever
 241         memcpy(s->put_pixels_tab, s->vp8dsp.put_vp8_bilinear_pixels_tab, sizeof(s->put_pixels_tab));
 242
 243     if (header_size > buf_size - 7*s->keyframe) {
 244         av_log(s->avctx, AV_LOG_ERROR, "Header size larger than data provided\n");
 245         return AVERROR_INVALIDDATA;
 246     }
 247
 248     if (s->keyframe) {
 249         if (AV_RL24(buf) != 0x2a019d) {
 250             av_log(s->avctx, AV_LOG_ERROR, "Invalid start code 0x%x\n", AV_RL24(buf));
 251             return AVERROR_INVALIDDATA;
 252         }
 253         width  = AV_RL16(buf+3) & 0x3fff;
 254         height = AV_RL16(buf+5) & 0x3fff;
 255         hscale = buf[4] >> 6;
 256         vscale = buf[6] >> 6;
 257         buf      += 7;
 258         buf_size -= 7;
 259
 260         if (hscale || vscale)
 261             av_log_missing_feature(s->avctx, "Upscaling", 1);
 262
 263         s->update_golden = s->update_altref = VP56_FRAME_CURRENT;
 264         for (i = 0; i < 4; i++)
 265             for (j = 0; j < 16; j++)
 266                 memcpy(s->prob->token[i][j], vp8_token_default_probs[i][vp8_coeff_band[j]],
 267                        sizeof(s->prob->token[i][j]));
 268         memcpy(s->prob->pred16x16, vp8_pred16x16_prob_inter, sizeof(s->prob->pred16x16));
 269         memcpy(s->prob->pred8x8c , vp8_pred8x8c_prob_inter , sizeof(s->prob->pred8x8c));
 270         memcpy(s->prob->mvc      , vp8_mv_default_prob     , sizeof(s->prob->mvc));
 271         memset(&s->segmentation, 0, sizeof(s->segmentation));
 272     }
 273
 274     if (!s->macroblocks_base || /* first frame */
 275         width != s->avctx->width || height != s->avctx->height) {
 276         if ((ret = update_dimensions(s, width, height) < 0))
 277             return ret;
 278     }
 279
 280     ff_vp56_init_range_decoder(c, buf, header_size);
 281     buf      += header_size;
 282     buf_size -= header_size;
 283
 284     if (s->keyframe) {
 285         if (vp8_rac_get(c))
 286             av_log(s->avctx, AV_LOG_WARNING, "Unspecified colorspace\n");
 287         vp8_rac_get(c); // whether we can skip clamping in dsp functions
 288     }
 289
 290     if ((s->segmentation.enabled = vp8_rac_get(c)))
 291         parse_segment_info(s);
 292     else
 293         s->segmentation.update_map = 0; // FIXME: move this to some init function?
 294
 295     s->filter.simple    = vp8_rac_get(c);
 296     s->filter.level     = vp8_rac_get_uint(c, 6);
 297     s->filter.sharpness = vp8_rac_get_uint(c, 3);
 298
 299     if ((s->lf_delta.enabled = vp8_rac_get(c)))
 300         if (vp8_rac_get(c))
 301             update_lf_deltas(s);
 302
 303     if (setup_partitions(s, buf, buf_size)) {
 304         av_log(s->avctx, AV_LOG_ERROR, "Invalid partitions\n");
 305         return AVERROR_INVALIDDATA;
 306     }
 307
 308     get_quants(s);
 309
 310     if (!s->keyframe) {
 311         update_refs(s);
 312         s->sign_bias[VP56_FRAME_GOLDEN]               = vp8_rac_get(c);
 313         s->sign_bias[VP56_FRAME_GOLDEN2 /* altref */] = vp8_rac_get(c);
 314     }
 315
 316     // if we aren't saving this frame's probabilities for future frames,
 317     // make a copy of the current probabilities
 318     if (!(s->update_probabilities = vp8_rac_get(c)))
 319         s->prob[1] = s->prob[0];
 320
 321     s->update_last = s->keyframe || vp8_rac_get(c);
 322
 323     for (i = 0; i < 4; i++)
 324         for (j = 0; j < 8; j++)
 325             for (k = 0; k < 3; k++)
 326                 for (l = 0; l < NUM_DCT_TOKENS-1; l++)
 327                     if (vp56_rac_get_prob_branchy(c, vp8_token_update_probs[i][j][k][l])) {
 328                         int prob = vp8_rac_get_uint(c, 8);
 329                         for (m = 0; vp8_coeff_band_indexes[j][m] >= 0; m++)
 330                             s->prob->token[i][vp8_coeff_band_indexes[j][m]][k][l] = prob;
 331                     }
 332
 333     if ((s->mbskip_enabled = vp8_rac_get(c)))
 334         s->prob->mbskip = vp8_rac_get_uint(c, 8);
 335
 336     if (!s->keyframe) {
 337         s->prob->intra  = vp8_rac_get_uint(c, 8);
 338         s->prob->last   = vp8_rac_get_uint(c, 8);
 339         s->prob->golden = vp8_rac_get_uint(c, 8);
 340
 341         if (vp8_rac_get(c))
 342             for (i = 0; i < 4; i++)
 343                 s->prob->pred16x16[i] = vp8_rac_get_uint(c, 8);
 344         if (vp8_rac_get(c))
 345             for (i = 0; i < 3; i++)
 346                 s->prob->pred8x8c[i]  = vp8_rac_get_uint(c, 8);
 347
 348         // 17.2 MV probability update
 349         for (i = 0; i < 2; i++)
 350             for (j = 0; j < 19; j++)
 351                 if (vp56_rac_get_prob_branchy(c, vp8_mv_update_prob[i][j]))
 352                     s->prob->mvc[i][j] = vp8_rac_get_nn(c);
 353     }
 354
 355     return 0;
 356 }
 357
 358 static av_always_inline void clamp_mv(VP8Context *s, VP56mv *dst, const VP56mv *src)
 359 {
 360     dst->x = av_clip(src->x, s->mv_min.x, s->mv_max.x);
 361     dst->y = av_clip(src->y, s->mv_min.y, s->mv_max.y);
 362 }
 363
 364 /**
 365  * Motion vector coding, 17.1.
 366  */
 367 static int read_mv_component(VP56RangeCoder *c, const uint8_t *p)
 368 {
 369     int bit, x = 0;
 370
 371     if (vp56_rac_get_prob_branchy(c, p[0])) {
 372         int i;
 373
 374         for (i = 0; i < 3; i++)
 375             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 376         for (i = 9; i > 3; i--)
 377             x += vp56_rac_get_prob(c, p[9 + i]) << i;
 378         if (!(x & 0xFFF0) || vp56_rac_get_prob(c, p[12]))
 379             x += 8;
 380     } else {
 381         // small_mvtree
 382         const uint8_t *ps = p+2;
 383         bit = vp56_rac_get_prob(c, *ps);
 384         ps += 1 + 3*bit;
 385         x  += 4*bit;
 386         bit = vp56_rac_get_prob(c, *ps);
 387         ps += 1 + bit;
 388         x  += 2*bit;
 389         x  += vp56_rac_get_prob(c, *ps);
 390     }
 391
 392     return (x && vp56_rac_get_prob(c, p[1])) ? -x : x;
 393 }
 394
 395 static av_always_inline
 396 const uint8_t *get_submv_prob(uint32_t left, uint32_t top)
 397 {
 398     if (left == top)
 399         return vp8_submv_prob[4-!!left];
 400     if (!top)
 401         return vp8_submv_prob[2];
 402     return vp8_submv_prob[1-!!left];
 403 }
 404
 405 /**
 406  * Split motion vector prediction, 16.4.
 407  * @returns the number of motion vectors parsed (2, 4 or 16)
 408  */
 409 static av_always_inline
 410 int decode_splitmvs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb)
 411 {
 412     int part_idx;
 413     int n, num;
 414     VP8Macroblock *top_mb  = &mb[2];
 415     VP8Macroblock *left_mb = &mb[-1];
 416     const uint8_t *mbsplits_left = vp8_mbsplits[left_mb->partitioning],
 417                   *mbsplits_top = vp8_mbsplits[top_mb->partitioning],
 418                   *mbsplits_cur, *firstidx;
 419     VP56mv *top_mv  = top_mb->bmv;
 420     VP56mv *left_mv = left_mb->bmv;
 421     VP56mv *cur_mv  = mb->bmv;
 422
 423     if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[0])) {
 424         if (vp56_rac_get_prob_branchy(c, vp8_mbsplit_prob[1])) {
 425             part_idx = VP8_SPLITMVMODE_16x8 + vp56_rac_get_prob(c, vp8_mbsplit_prob[2]);
 426         } else {
 427             part_idx = VP8_SPLITMVMODE_8x8;
 428         }
 429     } else {
 430         part_idx = VP8_SPLITMVMODE_4x4;
 431     }
 432
 433     num = vp8_mbsplit_count[part_idx];
 434     mbsplits_cur = vp8_mbsplits[part_idx],
 435     firstidx = vp8_mbfirstidx[part_idx];
 436     mb->partitioning = part_idx;
 437
 438     for (n = 0; n < num; n++) {
 439         int k = firstidx[n];
 440         uint32_t left, above;
 441         const uint8_t *submv_prob;
 442
 443         if (!(k & 3))
 444             left = AV_RN32A(&left_mv[mbsplits_left[k + 3]]);
 445         else
 446             left  = AV_RN32A(&cur_mv[mbsplits_cur[k - 1]]);
 447         if (k <= 3)
 448             above = AV_RN32A(&top_mv[mbsplits_top[k + 12]]);
 449         else
 450             above = AV_RN32A(&cur_mv[mbsplits_cur[k - 4]]);
 451
 452         submv_prob = get_submv_prob(left, above);
 453
 454         if (vp56_rac_get_prob_branchy(c, submv_prob[0])) {
 455             if (vp56_rac_get_prob_branchy(c, submv_prob[1])) {
 456                 if (vp56_rac_get_prob_branchy(c, submv_prob[2])) {
 457                     mb->bmv[n].y = mb->mv.y + read_mv_component(c, s->prob->mvc[0]);
 458                     mb->bmv[n].x = mb->mv.x + read_mv_component(c, s->prob->mvc[1]);
 459                 } else {
 460                     AV_ZERO32(&mb->bmv[n]);
 461                 }
 462             } else {
 463                 AV_WN32A(&mb->bmv[n], above);
 464             }
 465         } else {
 466             AV_WN32A(&mb->bmv[n], left);
 467         }
 468     }
 469
 470     return num;
 471 }
 472
 473 static av_always_inline
 474 void decode_mvs(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y)
 475 {
 476     VP8Macroblock *mb_edge[3] = { mb + 2 /* top */,
 477                                   mb - 1 /* left */,
 478                                   mb + 1 /* top-left */ };
 479     enum { CNT_ZERO, CNT_NEAREST, CNT_NEAR, CNT_SPLITMV };
 480     enum { VP8_EDGE_TOP, VP8_EDGE_LEFT, VP8_EDGE_TOPLEFT };
 481     int idx = CNT_ZERO;
 482     int cur_sign_bias = s->sign_bias[mb->ref_frame];
 483     int8_t *sign_bias = s->sign_bias;
 484     VP56mv near_mv[4];
 485     uint8_t cnt[4] = { 0 };
 486     VP56RangeCoder *c = &s->c;
 487
 488     AV_ZERO32(&near_mv[0]);
 489     AV_ZERO32(&near_mv[1]);
 490
 491     /* Process MB on top, left and top-left */
 492     #define MV_EDGE_CHECK(n)\
 493     {\
 494         VP8Macroblock *edge = mb_edge[n];\
 495         int edge_ref = edge->ref_frame;\
 496         if (edge_ref != VP56_FRAME_CURRENT) {\
 497             uint32_t mv = AV_RN32A(&edge->mv);\
 498             if (mv) {\
 499                 if (cur_sign_bias != sign_bias[edge_ref]) {\
 500                     /* SWAR negate of the values in mv. */\
 501                     mv = ~mv;\
 502                     mv = ((mv&0x7fff7fff) + 0x00010001) ^ (mv&0x80008000);\
 503                 }\
 504                 if (!n || mv != AV_RN32A(&near_mv[idx]))\
 505                     AV_WN32A(&near_mv[++idx], mv);\
 506                 cnt[idx]      += 1 + (n != 2);\
 507             } else\
 508                 cnt[CNT_ZERO] += 1 + (n != 2);\
 509         }\
 510     }
 511
 512     MV_EDGE_CHECK(0)
 513     MV_EDGE_CHECK(1)
 514     MV_EDGE_CHECK(2)
 515
 516     mb->partitioning = VP8_SPLITMVMODE_NONE;
 517     if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_ZERO]][0])) {
 518         mb->mode = VP8_MVMODE_MV;
 519
 520         /* If we have three distinct MVs, merge first and last if they're the same */
 521         if (cnt[CNT_SPLITMV] && AV_RN32A(&near_mv[1 + VP8_EDGE_TOP]) == AV_RN32A(&near_mv[1 + VP8_EDGE_TOPLEFT]))
 522             cnt[CNT_NEAREST] += 1;
 523
 524         /* Swap near and nearest if necessary */
 525         if (cnt[CNT_NEAR] > cnt[CNT_NEAREST]) {
 526             FFSWAP(uint8_t,     cnt[CNT_NEAREST],     cnt[CNT_NEAR]);
 527             FFSWAP( VP56mv, near_mv[CNT_NEAREST], near_mv[CNT_NEAR]);
 528         }
 529
 530         if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAREST]][1])) {
 531             if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_NEAR]][2])) {
 532
 533                 /* Choose the best mv out of 0,0 and the nearest mv */
 534                 clamp_mv(s, &mb->mv, &near_mv[CNT_ZERO + (cnt[CNT_NEAREST] >= cnt[CNT_ZERO])]);
 535                 cnt[CNT_SPLITMV] = ((mb_edge[VP8_EDGE_LEFT]->mode    == VP8_MVMODE_SPLIT) +
 536                                     (mb_edge[VP8_EDGE_TOP]->mode     == VP8_MVMODE_SPLIT)) * 2 +
 537                                     (mb_edge[VP8_EDGE_TOPLEFT]->mode == VP8_MVMODE_SPLIT);
 538
 539                 if (vp56_rac_get_prob_branchy(c, vp8_mode_contexts[cnt[CNT_SPLITMV]][3])) {
 540                     mb->mode = VP8_MVMODE_SPLIT;
 541                     mb->mv = mb->bmv[decode_splitmvs(s, c, mb) - 1];
 542                 } else {
 543                     mb->mv.y += read_mv_component(c, s->prob->mvc[0]);
 544                     mb->mv.x += read_mv_component(c, s->prob->mvc[1]);
 545                     mb->bmv[0] = mb->mv;
 546                 }
 547             } else {
 548                 clamp_mv(s, &mb->mv, &near_mv[CNT_NEAR]);
 549                 mb->bmv[0] = mb->mv;
 550             }
 551         } else {
 552             clamp_mv(s, &mb->mv, &near_mv[CNT_NEAREST]);
 553             mb->bmv[0] = mb->mv;
 554         }
 555     } else {
 556         mb->mode = VP8_MVMODE_ZERO;
 557         AV_ZERO32(&mb->mv);
 558         mb->bmv[0] = mb->mv;
 559     }
 560 }
 561
 562 static av_always_inline
 563 void decode_intra4x4_modes(VP8Context *s, VP56RangeCoder *c,
 564                            int mb_x, int keyframe)
 565 {
 566     uint8_t *intra4x4 = s->intra4x4_pred_mode_mb;
 567     if (keyframe) {
 568         int x, y;
 569         uint8_t* const top = s->intra4x4_pred_mode_top + 4 * mb_x;
 570         uint8_t* const left = s->intra4x4_pred_mode_left;
 571         for (y = 0; y < 4; y++) {
 572             for (x = 0; x < 4; x++) {
 573                 const uint8_t *ctx;
 574                 ctx = vp8_pred4x4_prob_intra[top[x]][left[y]];
 575                 *intra4x4 = vp8_rac_get_tree(c, vp8_pred4x4_tree, ctx);
 576                 left[y] = top[x] = *intra4x4;
 577                 intra4x4++;
 578             }
 579         }
 580     } else {
 581         int i;
 582         for (i = 0; i < 16; i++)
 583             intra4x4[i] = vp8_rac_get_tree(c, vp8_pred4x4_tree, vp8_pred4x4_prob_inter);
 584     }
 585 }
 586
 587 static av_always_inline
 588 void decode_mb_mode(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, uint8_t *segment, uint8_t *ref)
 589 {
 590     VP56RangeCoder *c = &s->c;
 591
 592     if (s->segmentation.update_map)
 593         *segment = vp8_rac_get_tree(c, vp8_segmentid_tree, s->prob->segmentid);
 594     else
 595         *segment = ref ? *ref : *segment;
 596     s->segment = *segment;
 597
 598     mb->skip = s->mbskip_enabled ? vp56_rac_get_prob(c, s->prob->mbskip) : 0;
 599
 600     if (s->keyframe) {
 601         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_intra, vp8_pred16x16_prob_intra);
 602
 603         if (mb->mode == MODE_I4x4) {
 604             decode_intra4x4_modes(s, c, mb_x, 1);
 605         } else {
 606             const uint32_t modes = vp8_pred4x4_mode[mb->mode] * 0x01010101u;
 607             AV_WN32A(s->intra4x4_pred_mode_top + 4 * mb_x, modes);
 608             AV_WN32A(s->intra4x4_pred_mode_left, modes);
 609         }
 610
 611         s->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, vp8_pred8x8c_prob_intra);
 612         mb->ref_frame = VP56_FRAME_CURRENT;
 613     } else if (vp56_rac_get_prob_branchy(c, s->prob->intra)) {
 614         // inter MB, 16.2
 615         if (vp56_rac_get_prob_branchy(c, s->prob->last))
 616             mb->ref_frame = vp56_rac_get_prob(c, s->prob->golden) ?
 617                 VP56_FRAME_GOLDEN2 /* altref */ : VP56_FRAME_GOLDEN;
 618         else
 619             mb->ref_frame = VP56_FRAME_PREVIOUS;
 620         s->ref_count[mb->ref_frame-1]++;
 621
 622         // motion vectors, 16.3
 623         decode_mvs(s, mb, mb_x, mb_y);
 624     } else {
 625         // intra MB, 16.1
 626         mb->mode = vp8_rac_get_tree(c, vp8_pred16x16_tree_inter, s->prob->pred16x16);
 627
 628         if (mb->mode == MODE_I4x4)
 629             decode_intra4x4_modes(s, c, mb_x, 0);
 630
 631         s->chroma_pred_mode = vp8_rac_get_tree(c, vp8_pred8x8c_tree, s->prob->pred8x8c);
 632         mb->ref_frame = VP56_FRAME_CURRENT;
 633         mb->partitioning = VP8_SPLITMVMODE_NONE;
 634         AV_ZERO32(&mb->bmv[0]);
 635     }
 636 }
 637
 638 #ifndef decode_block_coeffs_internal
 639 /**
 640  * @param c arithmetic bitstream reader context
 641  * @param block destination for block coefficients
 642  * @param probs probabilities to use when reading trees from the bitstream
 643  * @param i initial coeff index, 0 unless a separate DC block is coded
 644  * @param qmul array holding the dc/ac dequant factor at position 0/1
 645  * @return 0 if no coeffs were decoded
 646  *         otherwise, the index of the last coeff decoded plus one
 647  */
 648 static int decode_block_coeffs_internal(VP56RangeCoder *c, DCTELEM block[16],
 649                                         uint8_t probs[16][3][NUM_DCT_TOKENS-1],
 650                                         int i, uint8_t *token_prob, int16_t qmul[2])
 651 {
 652     goto skip_eob;
 653     do {
 654         int coeff;
 655         if (!vp56_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
 656             return i;
 657
 658 skip_eob:
 659         if (!vp56_rac_get_prob_branchy(c, token_prob[1])) { // DCT_0
 660             if (++i == 16)
 661                 return i; // invalid input; blocks should end with EOB
 662             token_prob = probs[i][0];
 663             goto skip_eob;
 664         }
 665
 666         if (!vp56_rac_get_prob_branchy(c, token_prob[2])) { // DCT_1
 667             coeff = 1;
 668             token_prob = probs[i+1][1];
 669         } else {
 670             if (!vp56_rac_get_prob_branchy(c, token_prob[3])) { // DCT 2,3,4
 671                 coeff = vp56_rac_get_prob_branchy(c, token_prob[4]);
 672                 if (coeff)
 673                     coeff += vp56_rac_get_prob(c, token_prob[5]);
 674                 coeff += 2;
 675             } else {
 676                 // DCT_CAT*
 677                 if (!vp56_rac_get_prob_branchy(c, token_prob[6])) {
 678                     if (!vp56_rac_get_prob_branchy(c, token_prob[7])) { // DCT_CAT1
 679                         coeff  = 5 + vp56_rac_get_prob(c, vp8_dct_cat1_prob[0]);
 680                     } else {                                    // DCT_CAT2
 681                         coeff  = 7;
 682                         coeff += vp56_rac_get_prob(c, vp8_dct_cat2_prob[0]) << 1;
 683                         coeff += vp56_rac_get_prob(c, vp8_dct_cat2_prob[1]);
 684                     }
 685                 } else {    // DCT_CAT3 and up
 686                     int a = vp56_rac_get_prob(c, token_prob[8]);
 687                     int b = vp56_rac_get_prob(c, token_prob[9+a]);
 688                     int cat = (a<<1) + b;
 689                     coeff  = 3 + (8<<cat);
 690                     coeff += vp8_rac_get_coeff(c, ff_vp8_dct_cat_prob[cat]);
 691                 }
 692             }
 693             token_prob = probs[i+1][2];
 694         }
 695         block[zigzag_scan[i]] = (vp8_rac_get(c) ? -coeff : coeff) * qmul[!!i];
 696     } while (++i < 16);
 697
 698     return i;
 699 }
 700 #endif
 701
 702 /**
 703  * @param c arithmetic bitstream reader context
 704  * @param block destination for block coefficients
 705  * @param probs probabilities to use when reading trees from the bitstream
 706  * @param i initial coeff index, 0 unless a separate DC block is coded
 707  * @param zero_nhood the initial prediction context for number of surrounding
 708  *                   all-zero blocks (only left/top, so 0-2)
 709  * @param qmul array holding the dc/ac dequant factor at position 0/1
 710  * @return 0 if no coeffs were decoded
 711  *         otherwise, the index of the last coeff decoded plus one
 712  */
 713 static av_always_inline
 714 int decode_block_coeffs(VP56RangeCoder *c, DCTELEM block[16],
 715                         uint8_t probs[16][3][NUM_DCT_TOKENS-1],
 716                         int i, int zero_nhood, int16_t qmul[2])
 717 {
 718     uint8_t *token_prob = probs[i][zero_nhood];
 719     if (!vp56_rac_get_prob_branchy(c, token_prob[0]))   // DCT_EOB
 720         return 0;
 721     return decode_block_coeffs_internal(c, block, probs, i, token_prob, qmul);
 722 }
 723
 724 static av_always_inline
 725 void decode_mb_coeffs(VP8Context *s, VP56RangeCoder *c, VP8Macroblock *mb,
 726                       uint8_t t_nnz[9], uint8_t l_nnz[9])
 727 {
 728     int i, x, y, luma_start = 0, luma_ctx = 3;
 729     int nnz_pred, nnz, nnz_total = 0;
 730     int segment = s->segment;
 731     int block_dc = 0;
 732
 733     if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
 734         nnz_pred = t_nnz[8] + l_nnz[8];
 735
 736         // decode DC values and do hadamard
 737         nnz = decode_block_coeffs(c, s->block_dc, s->prob->token[1], 0, nnz_pred,
 738                                   s->qmat[segment].luma_dc_qmul);
 739         l_nnz[8] = t_nnz[8] = !!nnz;
 740         if (nnz) {
 741             nnz_total += nnz;
 742             block_dc = 1;
 743             if (nnz == 1)
 744                 s->vp8dsp.vp8_luma_dc_wht_dc(s->block, s->block_dc);
 745             else
 746                 s->vp8dsp.vp8_luma_dc_wht(s->block, s->block_dc);
 747         }
 748         luma_start = 1;
 749         luma_ctx = 0;
 750     }
 751
 752     // luma blocks
 753     for (y = 0; y < 4; y++)
 754         for (x = 0; x < 4; x++) {
 755             nnz_pred = l_nnz[y] + t_nnz[x];
 756             nnz = decode_block_coeffs(c, s->block[y][x], s->prob->token[luma_ctx], luma_start,
 757                                       nnz_pred, s->qmat[segment].luma_qmul);
 758             // nnz+block_dc may be one more than the actual last index, but we don't care
 759             s->non_zero_count_cache[y][x] = nnz + block_dc;
 760             t_nnz[x] = l_nnz[y] = !!nnz;
 761             nnz_total += nnz;
 762         }
 763
 764     // chroma blocks
 765     // TODO: what to do about dimensions? 2nd dim for luma is x,
 766     // but for chroma it's (y<<1)|x
 767     for (i = 4; i < 6; i++)
 768         for (y = 0; y < 2; y++)
 769             for (x = 0; x < 2; x++) {
 770                 nnz_pred = l_nnz[i+2*y] + t_nnz[i+2*x];
 771                 nnz = decode_block_coeffs(c, s->block[i][(y<<1)+x], s->prob->token[2], 0,
 772                                           nnz_pred, s->qmat[segment].chroma_qmul);
 773                 s->non_zero_count_cache[i][(y<<1)+x] = nnz;
 774                 t_nnz[i+2*x] = l_nnz[i+2*y] = !!nnz;
 775                 nnz_total += nnz;
 776             }
 777
 778     // if there were no coded coeffs despite the macroblock not being marked skip,
 779     // we MUST not do the inner loop filter and should not do IDCT
 780     // Since skip isn't used for bitstream prediction, just manually set it.
 781     if (!nnz_total)
 782         mb->skip = 1;
 783 }
 784
 785 static av_always_inline
 786 void backup_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr,
 787                       int linesize, int uvlinesize, int simple)
 788 {
 789     AV_COPY128(top_border, src_y + 15*linesize);
 790     if (!simple) {
 791         AV_COPY64(top_border+16, src_cb + 7*uvlinesize);
 792         AV_COPY64(top_border+24, src_cr + 7*uvlinesize);
 793     }
 794 }
 795
 796 static av_always_inline
 797 void xchg_mb_border(uint8_t *top_border, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr,
 798                     int linesize, int uvlinesize, int mb_x, int mb_y, int mb_width,
 799                     int simple, int xchg)
 800 {
 801     uint8_t *top_border_m1 = top_border-32;     // for TL prediction
 802     src_y  -=   linesize;
 803     src_cb -= uvlinesize;
 804     src_cr -= uvlinesize;
 805
 806 #define XCHG(a,b,xchg) do {                     \
 807         if (xchg) AV_SWAP64(b,a);               \
 808         else      AV_COPY64(b,a);               \
 809     } while (0)
 810
 811     XCHG(top_border_m1+8, src_y-8, xchg);
 812     XCHG(top_border,      src_y,   xchg);
 813     XCHG(top_border+8,    src_y+8, 1);
 814     if (mb_x < mb_width-1)
 815         XCHG(top_border+32, src_y+16, 1);
 816
 817     // only copy chroma for normal loop filter
 818     // or to initialize the top row to 127
 819     if (!simple || !mb_y) {
 820         XCHG(top_border_m1+16, src_cb-8, xchg);
 821         XCHG(top_border_m1+24, src_cr-8, xchg);
 822         XCHG(top_border+16,    src_cb, 1);
 823         XCHG(top_border+24,    src_cr, 1);
 824     }
 825 }
 826
 827 static av_always_inline
 828 int check_dc_pred8x8_mode(int mode, int mb_x, int mb_y)
 829 {
 830     if (!mb_x) {
 831         return mb_y ? TOP_DC_PRED8x8 : DC_128_PRED8x8;
 832     } else {
 833         return mb_y ? mode : LEFT_DC_PRED8x8;
 834     }
 835 }
 836
 837 static av_always_inline
 838 int check_tm_pred8x8_mode(int mode, int mb_x, int mb_y)
 839 {
 840     if (!mb_x) {
 841         return mb_y ? VERT_PRED8x8 : DC_129_PRED8x8;
 842     } else {
 843         return mb_y ? mode : HOR_PRED8x8;
 844     }
 845 }
 846
 847 static av_always_inline
 848 int check_intra_pred8x8_mode(int mode, int mb_x, int mb_y)
 849 {
 850     if (mode == DC_PRED8x8) {
 851         return check_dc_pred8x8_mode(mode, mb_x, mb_y);
 852     } else {
 853         return mode;
 854     }
 855 }
 856
 857 static av_always_inline
 858 int check_intra_pred8x8_mode_emuedge(int mode, int mb_x, int mb_y)
 859 {
 860     switch (mode) {
 861     case DC_PRED8x8:
 862         return check_dc_pred8x8_mode(mode, mb_x, mb_y);
 863     case VERT_PRED8x8:
 864         return !mb_y ? DC_127_PRED8x8 : mode;
 865     case HOR_PRED8x8:
 866         return !mb_x ? DC_129_PRED8x8 : mode;
 867     case PLANE_PRED8x8 /*TM*/:
 868         return check_tm_pred8x8_mode(mode, mb_x, mb_y);
 869     }
 870     return mode;
 871 }
 872
 873 static av_always_inline
 874 int check_tm_pred4x4_mode(int mode, int mb_x, int mb_y)
 875 {
 876     if (!mb_x) {
 877         return mb_y ? VERT_VP8_PRED : DC_129_PRED;
 878     } else {
 879         return mb_y ? mode : HOR_VP8_PRED;
 880     }
 881 }
 882
 883 static av_always_inline
 884 int check_intra_pred4x4_mode_emuedge(int mode, int mb_x, int mb_y, int *copy_buf)
 885 {
 886     switch (mode) {
 887     case VERT_PRED:
 888         if (!mb_x && mb_y) {
 889             *copy_buf = 1;
 890             return mode;
 891         }
 892         /* fall-through */
 893     case DIAG_DOWN_LEFT_PRED:
 894     case VERT_LEFT_PRED:
 895         return !mb_y ? DC_127_PRED : mode;
 896     case HOR_PRED:
 897         if (!mb_y) {
 898             *copy_buf = 1;
 899             return mode;
 900         }
 901         /* fall-through */
 902     case HOR_UP_PRED:
 903         return !mb_x ? DC_129_PRED : mode;
 904     case TM_VP8_PRED:
 905         return check_tm_pred4x4_mode(mode, mb_x, mb_y);
 906     case DC_PRED: // 4x4 DC doesn't use the same "H.264-style" exceptions as 16x16/8x8 DC
 907     case DIAG_DOWN_RIGHT_PRED:
 908     case VERT_RIGHT_PRED:
 909     case HOR_DOWN_PRED:
 910         if (!mb_y || !mb_x)
 911             *copy_buf = 1;
 912         return mode;
 913     }
 914     return mode;
 915 }
 916
 917 static av_always_inline
 918 void intra_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
 919                    int mb_x, int mb_y)
 920 {
 921     AVCodecContext *avctx = s->avctx;
 922     int x, y, mode, nnz, tr;
 923
 924     // for the first row, we need to run xchg_mb_border to init the top edge to 127
 925     // otherwise, skip it if we aren't going to deblock
 926     if (!(avctx->flags & CODEC_FLAG_EMU_EDGE && !mb_y) && (s->deblock_filter || !mb_y))
 927         xchg_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2],
 928                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
 929                        s->filter.simple, 1);
 930
 931     if (mb->mode < MODE_I4x4) {
 932         if (avctx->flags & CODEC_FLAG_EMU_EDGE) { // tested
 933             mode = check_intra_pred8x8_mode_emuedge(mb->mode, mb_x, mb_y);
 934         } else {
 935             mode = check_intra_pred8x8_mode(mb->mode, mb_x, mb_y);
 936         }
 937         s->hpc.pred16x16[mode](dst[0], s->linesize);
 938     } else {
 939         uint8_t *ptr = dst[0];
 940         uint8_t *intra4x4 = s->intra4x4_pred_mode_mb;
 941         uint8_t tr_top[4] = { 127, 127, 127, 127 };
 942
 943         // all blocks on the right edge of the macroblock use bottom edge
 944         // the top macroblock for their topright edge
 945         uint8_t *tr_right = ptr - s->linesize + 16;
 946
 947         // if we're on the right edge of the frame, said edge is extended
 948         // from the top macroblock
 949         if (!(!mb_y && avctx->flags & CODEC_FLAG_EMU_EDGE) &&
 950             mb_x == s->mb_width-1) {
 951             tr = tr_right[-1]*0x01010101;
 952             tr_right = (uint8_t *)&tr;
 953         }
 954
 955         if (mb->skip)
 956             AV_ZERO128(s->non_zero_count_cache);
 957
 958         for (y = 0; y < 4; y++) {
 959             uint8_t *topright = ptr + 4 - s->linesize;
 960             for (x = 0; x < 4; x++) {
 961                 int copy = 0, linesize = s->linesize;
 962                 uint8_t *dst = ptr+4*x;
 963                 DECLARE_ALIGNED(4, uint8_t, copy_dst)[5*8];
 964
 965                 if ((y == 0 || x == 3) && mb_y == 0 && avctx->flags & CODEC_FLAG_EMU_EDGE) {
 966                     topright = tr_top;
 967                 } else if (x == 3)
 968                     topright = tr_right;
 969
 970                 if (avctx->flags & CODEC_FLAG_EMU_EDGE) { // mb_x+x or mb_y+y is a hack but works
 971                     mode = check_intra_pred4x4_mode_emuedge(intra4x4[x], mb_x + x, mb_y + y, &copy);
 972                     if (copy) {
 973                         dst = copy_dst + 12;
 974                         linesize = 8;
 975                         if (!(mb_y + y)) {
 976                             copy_dst[3] = 127U;
 977                             AV_WN32A(copy_dst+4, 127U * 0x01010101U);
 978                         } else {
 979                             AV_COPY32(copy_dst+4, ptr+4*x-s->linesize);
 980                             if (!(mb_x + x)) {
 981                                 copy_dst[3] = 129U;
 982                             } else {
 983                                 copy_dst[3] = ptr[4*x-s->linesize-1];
 984                             }
 985                         }
 986                         if (!(mb_x + x)) {
 987                             copy_dst[11] =
 988                             copy_dst[19] =
 989                             copy_dst[27] =
 990                             copy_dst[35] = 129U;
 991                         } else {
 992                             copy_dst[11] = ptr[4*x              -1];
 993                             copy_dst[19] = ptr[4*x+s->linesize  -1];
 994                             copy_dst[27] = ptr[4*x+s->linesize*2-1];
 995                             copy_dst[35] = ptr[4*x+s->linesize*3-1];
 996                         }
 997                     }
 998                 } else {
 999                     mode = intra4x4[x];
1000                 }
1001                 s->hpc.pred4x4[mode](dst, topright, linesize);
1002                 if (copy) {
1003                     AV_COPY32(ptr+4*x              , copy_dst+12);
1004                     AV_COPY32(ptr+4*x+s->linesize  , copy_dst+20);
1005                     AV_COPY32(ptr+4*x+s->linesize*2, copy_dst+28);
1006                     AV_COPY32(ptr+4*x+s->linesize*3, copy_dst+36);
1007                 }
1008
1009                 nnz = s->non_zero_count_cache[y][x];
1010                 if (nnz) {
1011                     if (nnz == 1)
1012                         s->vp8dsp.vp8_idct_dc_add(ptr+4*x, s->block[y][x], s->linesize);
1013                     else
1014                         s->vp8dsp.vp8_idct_add(ptr+4*x, s->block[y][x], s->linesize);
1015                 }
1016                 topright += 4;
1017             }
1018
1019             ptr   += 4*s->linesize;
1020             intra4x4 += 4;
1021         }
1022     }
1023
1024     if (avctx->flags & CODEC_FLAG_EMU_EDGE) {
1025         mode = check_intra_pred8x8_mode_emuedge(s->chroma_pred_mode, mb_x, mb_y);
1026     } else {
1027         mode = check_intra_pred8x8_mode(s->chroma_pred_mode, mb_x, mb_y);
1028     }
1029     s->hpc.pred8x8[mode](dst[1], s->uvlinesize);
1030     s->hpc.pred8x8[mode](dst[2], s->uvlinesize);
1031
1032     if (!(avctx->flags & CODEC_FLAG_EMU_EDGE && !mb_y) && (s->deblock_filter || !mb_y))
1033         xchg_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2],
1034                        s->linesize, s->uvlinesize, mb_x, mb_y, s->mb_width,
1035                        s->filter.simple, 0);
1036 }
1037
/* Per-fraction lookup used by the MC functions; the second index is the
 * sub-pixel fraction (0..7). */
static const uint8_t subpel_idx[3][8] = {
    /* [0]: nr. of left extra pixels, also function pointer index */
    { 0, 1, 2, 1, 2, 1, 2, 1 },
    /* [1]: nr. of extra pixels required */
    { 0, 3, 5, 3, 5, 3, 5, 3 },
    /* [2]: nr. of right extra pixels */
    { 0, 2, 3, 2, 3, 2, 3, 2 },
};
1044
1045 /**
1046  * luma MC function
1047  *
1048  * @param s VP8 decoding context
1049  * @param dst target buffer for block data at block position
1050  * @param src reference picture buffer at origin (0, 0)
1051  * @param mv motion vector (relative to block position) to get pixel data from
1052  * @param x_off horizontal position of block from origin (0, 0)
1053  * @param y_off vertical position of block from origin (0, 0)
1054  * @param block_w width of block (16, 8 or 4)
1055  * @param block_h height of block (always same as block_w)
1056  * @param width width of src/dst plane data
1057  * @param height height of src/dst plane data
1058  * @param linesize size of a single line of plane data, including padding
1059  * @param mc_func motion compensation function pointers (bilinear or sixtap MC)
1060  */
1061 static av_always_inline
1062 void vp8_mc_luma(VP8Context *s, uint8_t *dst, AVFrame *ref, const VP56mv *mv,
1063                  int x_off, int y_off, int block_w, int block_h,
1064                  int width, int height, int linesize,
1065                  vp8_mc_func mc_func[3][3])
1066 {
1067     uint8_t *src = ref->data[0];
1068
1069     if (AV_RN32A(mv)) {
1070
1071         int mx = (mv->x << 1)&7, mx_idx = subpel_idx[0][mx];
1072         int my = (mv->y << 1)&7, my_idx = subpel_idx[0][my];
1073
1074         x_off += mv->x >> 2;
1075         y_off += mv->y >> 2;
1076
1077         // edge emulation
1078         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 4, 0);
1079         src += y_off * linesize + x_off;
1080         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1081             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1082             s->dsp.emulated_edge_mc(s->edge_emu_buffer, src - my_idx * linesize - mx_idx, linesize,
1083                                     block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1084                                     x_off - mx_idx, y_off - my_idx, width, height);
1085             src = s->edge_emu_buffer + mx_idx + linesize * my_idx;
1086         }
1087         mc_func[my_idx][mx_idx](dst, linesize, src, linesize, block_h, mx, my);
1088     } else {
1089         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 4, 0);
1090         mc_func[0][0](dst, linesize, src + y_off * linesize + x_off, linesize, block_h, 0, 0);
1091     }
1092 }
1093
1094 /**
1095  * chroma MC function
1096  *
1097  * @param s VP8 decoding context
1098  * @param dst1 target buffer for block data at block position (U plane)
1099  * @param dst2 target buffer for block data at block position (V plane)
1100  * @param ref reference picture buffer at origin (0, 0)
1101  * @param mv motion vector (relative to block position) to get pixel data from
1102  * @param x_off horizontal position of block from origin (0, 0)
1103  * @param y_off vertical position of block from origin (0, 0)
1104  * @param block_w width of block (16, 8 or 4)
1105  * @param block_h height of block (always same as block_w)
1106  * @param width width of src/dst plane data
1107  * @param height height of src/dst plane data
1108  * @param linesize size of a single line of plane data, including padding
1109  * @param mc_func motion compensation function pointers (bilinear or sixtap MC)
1110  */
1111 static av_always_inline
1112 void vp8_mc_chroma(VP8Context *s, uint8_t *dst1, uint8_t *dst2, AVFrame *ref,
1113                    const VP56mv *mv, int x_off, int y_off,
1114                    int block_w, int block_h, int width, int height, int linesize,
1115                    vp8_mc_func mc_func[3][3])
1116 {
1117     uint8_t *src1 = ref->data[1], *src2 = ref->data[2];
1118
1119     if (AV_RN32A(mv)) {
1120         int mx = mv->x&7, mx_idx = subpel_idx[0][mx];
1121         int my = mv->y&7, my_idx = subpel_idx[0][my];
1122
1123         x_off += mv->x >> 3;
1124         y_off += mv->y >> 3;
1125
1126         // edge emulation
1127         src1 += y_off * linesize + x_off;
1128         src2 += y_off * linesize + x_off;
1129         ff_thread_await_progress(ref, (3 + y_off + block_h + subpel_idx[2][my]) >> 3, 0);
1130         if (x_off < mx_idx || x_off >= width  - block_w - subpel_idx[2][mx] ||
1131             y_off < my_idx || y_off >= height - block_h - subpel_idx[2][my]) {
1132             s->dsp.emulated_edge_mc(s->edge_emu_buffer, src1 - my_idx * linesize - mx_idx, linesize,
1133                                     block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1134                                     x_off - mx_idx, y_off - my_idx, width, height);
1135             src1 = s->edge_emu_buffer + mx_idx + linesize * my_idx;
1136             mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
1137
1138             s->dsp.emulated_edge_mc(s->edge_emu_buffer, src2 - my_idx * linesize - mx_idx, linesize,
1139                                     block_w + subpel_idx[1][mx], block_h + subpel_idx[1][my],
1140                                     x_off - mx_idx, y_off - my_idx, width, height);
1141             src2 = s->edge_emu_buffer + mx_idx + linesize * my_idx;
1142             mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
1143         } else {
1144             mc_func[my_idx][mx_idx](dst1, linesize, src1, linesize, block_h, mx, my);
1145             mc_func[my_idx][mx_idx](dst2, linesize, src2, linesize, block_h, mx, my);
1146         }
1147     } else {
1148         ff_thread_await_progress(ref, (3 + y_off + block_h) >> 3, 0);
1149         mc_func[0][0](dst1, linesize, src1 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1150         mc_func[0][0](dst2, linesize, src2 + y_off * linesize + x_off, linesize, block_h, 0, 0);
1151     }
1152 }
1153
1154 static av_always_inline
1155 void vp8_mc_part(VP8Context *s, uint8_t *dst[3],
1156                  AVFrame *ref_frame, int x_off, int y_off,
1157                  int bx_off, int by_off,
1158                  int block_w, int block_h,
1159                  int width, int height, VP56mv *mv)
1160 {
1161     VP56mv uvmv = *mv;
1162
1163     /* Y */
1164     vp8_mc_luma(s, dst[0] + by_off * s->linesize + bx_off,
1165                 ref_frame, mv, x_off + bx_off, y_off + by_off,
1166                 block_w, block_h, width, height, s->linesize,
1167                 s->put_pixels_tab[block_w == 8]);
1168
1169     /* U/V */
1170     if (s->profile == 3) {
1171         uvmv.x &= ~7;
1172         uvmv.y &= ~7;
1173     }
1174     x_off   >>= 1; y_off   >>= 1;
1175     bx_off  >>= 1; by_off  >>= 1;
1176     width   >>= 1; height  >>= 1;
1177     block_w >>= 1; block_h >>= 1;
1178     vp8_mc_chroma(s, dst[1] + by_off * s->uvlinesize + bx_off,
1179                   dst[2] + by_off * s->uvlinesize + bx_off, ref_frame,
1180                   &uvmv, x_off + bx_off, y_off + by_off,
1181                   block_w, block_h, width, height, s->uvlinesize,
1182                   s->put_pixels_tab[1 + (block_w == 4)]);
1183 }
1184
1185 /* Fetch pixels for estimated mv 4 macroblocks ahead.
1186  * Optimized for 64-byte cache lines.  Inspired by ffh264 prefetch_motion. */
1187 static av_always_inline void prefetch_motion(VP8Context *s, VP8Macroblock *mb, int mb_x, int mb_y, int mb_xy, int ref)
1188 {
1189     /* Don't prefetch refs that haven't been used very often this frame. */
1190     if (s->ref_count[ref-1] > (mb_xy >> 5)) {
1191         int x_off = mb_x << 4, y_off = mb_y << 4;
1192         int mx = (mb->mv.x>>2) + x_off + 8;
1193         int my = (mb->mv.y>>2) + y_off;
1194         uint8_t **src= s->framep[ref]->data;
1195         int off= mx + (my + (mb_x&3)*4)*s->linesize + 64;
1196         /* For threading, a ff_thread_await_progress here might be useful, but
1197          * it actually slows down the decoder. Since a bad prefetch doesn't
1198          * generate bad decoder output, we don't run it here. */
1199         s->dsp.prefetch(src[0]+off, s->linesize, 4);
1200         off= (mx>>1) + ((my>>1) + (mb_x&7))*s->uvlinesize + 64;
1201         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
1202     }
1203 }
1204
1205 /**
1206  * Apply motion vectors to prediction buffer, chapter 18.
1207  */
1208 static av_always_inline
1209 void inter_predict(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb,
1210                    int mb_x, int mb_y)
1211 {
1212     int x_off = mb_x << 4, y_off = mb_y << 4;
1213     int width = 16*s->mb_width, height = 16*s->mb_height;
1214     AVFrame *ref = s->framep[mb->ref_frame];
1215     VP56mv *bmv = mb->bmv;
1216
1217     switch (mb->partitioning) {
1218     case VP8_SPLITMVMODE_NONE:
1219         vp8_mc_part(s, dst, ref, x_off, y_off,
1220                     0, 0, 16, 16, width, height, &mb->mv);
1221         break;
1222     case VP8_SPLITMVMODE_4x4: {
1223         int x, y;
1224         VP56mv uvmv;
1225
1226         /* Y */
1227         for (y = 0; y < 4; y++) {
1228             for (x = 0; x < 4; x++) {
1229                 vp8_mc_luma(s, dst[0] + 4*y*s->linesize + x*4,
1230                             ref, &bmv[4*y + x],
1231                             4*x + x_off, 4*y + y_off, 4, 4,
1232                             width, height, s->linesize,
1233                             s->put_pixels_tab[2]);
1234             }
1235         }
1236
1237         /* U/V */
1238         x_off >>= 1; y_off >>= 1; width >>= 1; height >>= 1;
1239         for (y = 0; y < 2; y++) {
1240             for (x = 0; x < 2; x++) {
1241                 uvmv.x = mb->bmv[ 2*y    * 4 + 2*x  ].x +
1242                          mb->bmv[ 2*y    * 4 + 2*x+1].x +
1243                          mb->bmv[(2*y+1) * 4 + 2*x  ].x +
1244                          mb->bmv[(2*y+1) * 4 + 2*x+1].x;
1245                 uvmv.y = mb->bmv[ 2*y    * 4 + 2*x  ].y +
1246                          mb->bmv[ 2*y    * 4 + 2*x+1].y +
1247                          mb->bmv[(2*y+1) * 4 + 2*x  ].y +
1248                          mb->bmv[(2*y+1) * 4 + 2*x+1].y;
1249                 uvmv.x = (uvmv.x + 2 + (uvmv.x >> (INT_BIT-1))) >> 2;
1250                 uvmv.y = (uvmv.y + 2 + (uvmv.y >> (INT_BIT-1))) >> 2;
1251                 if (s->profile == 3) {
1252                     uvmv.x &= ~7;
1253                     uvmv.y &= ~7;
1254                 }
1255                 vp8_mc_chroma(s, dst[1] + 4*y*s->uvlinesize + x*4,
1256                               dst[2] + 4*y*s->uvlinesize + x*4, ref, &uvmv,
1257                               4*x + x_off, 4*y + y_off, 4, 4,
1258                               width, height, s->uvlinesize,
1259                               s->put_pixels_tab[2]);
1260             }
1261         }
1262         break;
1263     }
1264     case VP8_SPLITMVMODE_16x8:
1265         vp8_mc_part(s, dst, ref, x_off, y_off,
1266                     0, 0, 16, 8, width, height, &bmv[0]);
1267         vp8_mc_part(s, dst, ref, x_off, y_off,
1268                     0, 8, 16, 8, width, height, &bmv[1]);
1269         break;
1270     case VP8_SPLITMVMODE_8x16:
1271         vp8_mc_part(s, dst, ref, x_off, y_off,
1272                     0, 0, 8, 16, width, height, &bmv[0]);
1273         vp8_mc_part(s, dst, ref, x_off, y_off,
1274                     8, 0, 8, 16, width, height, &bmv[1]);
1275         break;
1276     case VP8_SPLITMVMODE_8x8:
1277         vp8_mc_part(s, dst, ref, x_off, y_off,
1278                     0, 0, 8, 8, width, height, &bmv[0]);
1279         vp8_mc_part(s, dst, ref, x_off, y_off,
1280                     8, 0, 8, 8, width, height, &bmv[1]);
1281         vp8_mc_part(s, dst, ref, x_off, y_off,
1282                     0, 8, 8, 8, width, height, &bmv[2]);
1283         vp8_mc_part(s, dst, ref, x_off, y_off,
1284                     8, 8, 8, 8, width, height, &bmv[3]);
1285         break;
1286     }
1287 }
1288
1289 static av_always_inline void idct_mb(VP8Context *s, uint8_t *dst[3], VP8Macroblock *mb)
1290 {
1291     int x, y, ch;
1292
1293     if (mb->mode != MODE_I4x4) {
1294         uint8_t *y_dst = dst[0];
1295         for (y = 0; y < 4; y++) {
1296             uint32_t nnz4 = AV_RL32(s->non_zero_count_cache[y]);
1297             if (nnz4) {
1298                 if (nnz4&~0x01010101) {
1299                     for (x = 0; x < 4; x++) {
1300                         if ((uint8_t)nnz4 == 1)
1301                             s->vp8dsp.vp8_idct_dc_add(y_dst+4*x, s->block[y][x], s->linesize);
1302                         else if((uint8_t)nnz4 > 1)
1303                             s->vp8dsp.vp8_idct_add(y_dst+4*x, s->block[y][x], s->linesize);
1304                         nnz4 >>= 8;
1305                         if (!nnz4)
1306                             break;
1307                     }
1308                 } else {
1309                     s->vp8dsp.vp8_idct_dc_add4y(y_dst, s->block[y], s->linesize);
1310                 }
1311             }
1312             y_dst += 4*s->linesize;
1313         }
1314     }
1315
1316     for (ch = 0; ch < 2; ch++) {
1317         uint32_t nnz4 = AV_RL32(s->non_zero_count_cache[4+ch]);
1318         if (nnz4) {
1319             uint8_t *ch_dst = dst[1+ch];
1320             if (nnz4&~0x01010101) {
1321                 for (y = 0; y < 2; y++) {
1322                     for (x = 0; x < 2; x++) {
1323                         if ((uint8_t)nnz4 == 1)
1324                             s->vp8dsp.vp8_idct_dc_add(ch_dst+4*x, s->block[4+ch][(y<<1)+x], s->uvlinesize);
1325                         else if((uint8_t)nnz4 > 1)
1326                             s->vp8dsp.vp8_idct_add(ch_dst+4*x, s->block[4+ch][(y<<1)+x], s->uvlinesize);
1327                         nnz4 >>= 8;
1328                         if (!nnz4)
1329                             goto chroma_idct_end;
1330                     }
1331                     ch_dst += 4*s->uvlinesize;
1332                 }
1333             } else {
1334                 s->vp8dsp.vp8_idct_dc_add4uv(ch_dst, s->block[4+ch], s->uvlinesize);
1335             }
1336         }
1337 chroma_idct_end: ;
1338     }
1339 }
1340
1341 static av_always_inline void filter_level_for_mb(VP8Context *s, VP8Macroblock *mb, VP8FilterStrength *f )
1342 {
1343     int interior_limit, filter_level;
1344
1345     if (s->segmentation.enabled) {
1346         filter_level = s->segmentation.filter_level[s->segment];
1347         if (!s->segmentation.absolute_vals)
1348             filter_level += s->filter.level;
1349     } else
1350         filter_level = s->filter.level;
1351
1352     if (s->lf_delta.enabled) {
1353         filter_level += s->lf_delta.ref[mb->ref_frame];
1354         filter_level += s->lf_delta.mode[mb->mode];
1355     }
1356
1357     filter_level = av_clip_uintp2(filter_level, 6);
1358
1359     interior_limit = filter_level;
1360     if (s->filter.sharpness) {
1361         interior_limit >>= (s->filter.sharpness + 3) >> 2;
1362         interior_limit = FFMIN(interior_limit, 9 - s->filter.sharpness);
1363     }
1364     interior_limit = FFMAX(interior_limit, 1);
1365
1366     f->filter_level = filter_level;
1367     f->inner_limit = interior_limit;
1368     f->inner_filter = !mb->skip || mb->mode == MODE_I4x4 || mb->mode == VP8_MVMODE_SPLIT;
1369 }
1370
1371 static av_always_inline void filter_mb(VP8Context *s, uint8_t *dst[3], VP8FilterStrength *f, int mb_x, int mb_y)
1372 {
1373     int mbedge_lim, bedge_lim, hev_thresh;
1374     int filter_level = f->filter_level;
1375     int inner_limit = f->inner_limit;
1376     int inner_filter = f->inner_filter;
1377     int linesize = s->linesize;
1378     int uvlinesize = s->uvlinesize;
1379     static const uint8_t hev_thresh_lut[2][64] = {
1380         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
1381           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1382           3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
1383           3, 3, 3, 3 },
1384         { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
1385           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1386           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
1387           2, 2, 2, 2 }
1388     };
1389
1390     if (!filter_level)
1391         return;
1392
1393      bedge_lim = 2*filter_level + inner_limit;
1394     mbedge_lim = bedge_lim + 4;
1395
1396     hev_thresh = hev_thresh_lut[s->keyframe][filter_level];
1397
1398     if (mb_x) {
1399         s->vp8dsp.vp8_h_loop_filter16y(dst[0],     linesize,
1400                                        mbedge_lim, inner_limit, hev_thresh);
1401         s->vp8dsp.vp8_h_loop_filter8uv(dst[1],     dst[2],      uvlinesize,
1402                                        mbedge_lim, inner_limit, hev_thresh);
1403     }
1404
1405     if (inner_filter) {
1406         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 4, linesize, bedge_lim,
1407                                              inner_limit, hev_thresh);
1408         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+ 8, linesize, bedge_lim,
1409                                              inner_limit, hev_thresh);
1410         s->vp8dsp.vp8_h_loop_filter16y_inner(dst[0]+12, linesize, bedge_lim,
1411                                              inner_limit, hev_thresh);
1412         s->vp8dsp.vp8_h_loop_filter8uv_inner(dst[1] + 4, dst[2] + 4,
1413                                              uvlinesize,  bedge_lim,
1414                                              inner_limit, hev_thresh);
1415     }
1416
1417     if (mb_y) {
1418         s->vp8dsp.vp8_v_loop_filter16y(dst[0],     linesize,
1419                                        mbedge_lim, inner_limit, hev_thresh);
1420         s->vp8dsp.vp8_v_loop_filter8uv(dst[1],     dst[2],      uvlinesize,
1421                                        mbedge_lim, inner_limit, hev_thresh);
1422     }
1423
1424     if (inner_filter) {
1425         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 4*linesize,
1426                                              linesize,    bedge_lim,
1427                                              inner_limit, hev_thresh);
1428         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+ 8*linesize,
1429                                              linesize,    bedge_lim,
1430                                              inner_limit, hev_thresh);
1431         s->vp8dsp.vp8_v_loop_filter16y_inner(dst[0]+12*linesize,
1432                                              linesize,    bedge_lim,
1433                                              inner_limit, hev_thresh);
1434         s->vp8dsp.vp8_v_loop_filter8uv_inner(dst[1] + 4 * uvlinesize,
1435                                              dst[2] + 4 * uvlinesize,
1436                                              uvlinesize,  bedge_lim,
1437                                              inner_limit, hev_thresh);
1438     }
1439 }
1440
1441 static av_always_inline void filter_mb_simple(VP8Context *s, uint8_t *dst, VP8FilterStrength *f, int mb_x, int mb_y)
1442 {
1443     int mbedge_lim, bedge_lim;
1444     int filter_level = f->filter_level;
1445     int inner_limit = f->inner_limit;
1446     int inner_filter = f->inner_filter;
1447     int linesize = s->linesize;
1448
1449     if (!filter_level)
1450         return;
1451
1452      bedge_lim = 2*filter_level + inner_limit;
1453     mbedge_lim = bedge_lim + 4;
1454
1455     if (mb_x)
1456         s->vp8dsp.vp8_h_loop_filter_simple(dst, linesize, mbedge_lim);
1457     if (inner_filter) {
1458         s->vp8dsp.vp8_h_loop_filter_simple(dst+ 4, linesize, bedge_lim);
1459         s->vp8dsp.vp8_h_loop_filter_simple(dst+ 8, linesize, bedge_lim);
1460         s->vp8dsp.vp8_h_loop_filter_simple(dst+12, linesize, bedge_lim);
1461     }
1462
1463     if (mb_y)
1464         s->vp8dsp.vp8_v_loop_filter_simple(dst, linesize, mbedge_lim);
1465     if (inner_filter) {
1466         s->vp8dsp.vp8_v_loop_filter_simple(dst+ 4*linesize, linesize, bedge_lim);
1467         s->vp8dsp.vp8_v_loop_filter_simple(dst+ 8*linesize, linesize, bedge_lim);
1468         s->vp8dsp.vp8_v_loop_filter_simple(dst+12*linesize, linesize, bedge_lim);
1469     }
1470 }
1471
1472 static void filter_mb_row(VP8Context *s, AVFrame *curframe, int mb_y)
1473 {
1474     VP8FilterStrength *f = s->filter_strength;
1475     uint8_t *dst[3] = {
1476         curframe->data[0] + 16*mb_y*s->linesize,
1477         curframe->data[1] +  8*mb_y*s->uvlinesize,
1478         curframe->data[2] +  8*mb_y*s->uvlinesize
1479     };
1480     int mb_x;
1481
1482     for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
1483         backup_mb_border(s->top_border[mb_x+1], dst[0], dst[1], dst[2], s->linesize, s->uvlinesize, 0);
1484         filter_mb(s, dst, f++, mb_x, mb_y);
1485         dst[0] += 16;
1486         dst[1] += 8;
1487         dst[2] += 8;
1488     }
1489 }
1490
1491 static void filter_mb_row_simple(VP8Context *s, AVFrame *curframe, int mb_y)
1492 {
1493     VP8FilterStrength *f = s->filter_strength;
1494     uint8_t *dst = curframe->data[0] + 16*mb_y*s->linesize;
1495     int mb_x;
1496
1497     for (mb_x = 0; mb_x < s->mb_width; mb_x++) {
1498         backup_mb_border(s->top_border[mb_x+1], dst, NULL, NULL, s->linesize, 0, 1);
1499         filter_mb_simple(s, dst, f++, mb_x, mb_y);
1500         dst += 16;
1501     }
1502 }
1503
1504 static int vp8_decode_frame(AVCodecContext *avctx, void *data, int *data_size,
1505                             AVPacket *avpkt)
1506 {
1507     VP8Context *s = avctx->priv_data;
1508     int ret, mb_x, mb_y, i, y, referenced;
1509     enum AVDiscard skip_thresh;
1510     AVFrame *av_uninit(curframe), *prev_frame = s->framep[VP56_FRAME_CURRENT];
1511
1512     if ((ret = decode_frame_header(s, avpkt->data, avpkt->size)) < 0)
1513         return ret;
1514
1515     referenced = s->update_last || s->update_golden == VP56_FRAME_CURRENT
1516                                 || s->update_altref == VP56_FRAME_CURRENT;
1517
1518     skip_thresh = !referenced ? AVDISCARD_NONREF :
1519                     !s->keyframe ? AVDISCARD_NONKEY : AVDISCARD_ALL;
1520
1521     if (avctx->skip_frame >= skip_thresh) {
1522         s->invisible = 1;
1523         goto skip_decode;
1524     }
1525     s->deblock_filter = s->filter.level && avctx->skip_loop_filter < skip_thresh;
1526
1527     // release no longer referenced frames
1528     for (i = 0; i < 5; i++)
1529         if (s->frames[i].data[0] &&
1530             &s->frames[i] != prev_frame &&
1531             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
1532             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN] &&
1533             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2])
1534             ff_thread_release_buffer(avctx, &s->frames[i]);
1535
1536     // find a free buffer
1537     for (i = 0; i < 5; i++)
1538         if (&s->frames[i] != prev_frame &&
1539             &s->frames[i] != s->framep[VP56_FRAME_PREVIOUS] &&
1540             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN] &&
1541             &s->frames[i] != s->framep[VP56_FRAME_GOLDEN2]) {
1542             curframe = s->framep[VP56_FRAME_CURRENT] = &s->frames[i];
1543             break;
1544         }
1545     if (i == 5) {
1546         av_log(avctx, AV_LOG_FATAL, "Ran out of free frames!\n");
1547         abort();
1548     }
1549     if (curframe->data[0])
1550         ff_thread_release_buffer(avctx, curframe);
1551
1552     curframe->key_frame = s->keyframe;
1553     curframe->pict_type = s->keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
1554     curframe->reference = referenced ? 3 : 0;
1555     curframe->ref_index[0] = s->segmentation_map;
1556     if ((ret = ff_thread_get_buffer(avctx, curframe))) {
1557         av_log(avctx, AV_LOG_ERROR, "get_buffer() failed!\n");
1558         return ret;
1559     }
1560
1561     // check if golden and altref are swapped
1562     if (s->update_altref != VP56_FRAME_NONE) {
1563         s->next_framep[VP56_FRAME_GOLDEN2]  = s->framep[s->update_altref];
1564     } else {
1565         s->next_framep[VP56_FRAME_GOLDEN2]  = s->framep[VP56_FRAME_GOLDEN2];
1566     }
1567     if (s->update_golden != VP56_FRAME_NONE) {
1568         s->next_framep[VP56_FRAME_GOLDEN]   = s->framep[s->update_golden];
1569     } else {
1570         s->next_framep[VP56_FRAME_GOLDEN]   = s->framep[VP56_FRAME_GOLDEN];
1571     }
1572     if (s->update_last) {
1573         s->next_framep[VP56_FRAME_PREVIOUS] = curframe;
1574     } else {
1575         s->next_framep[VP56_FRAME_PREVIOUS] = s->framep[VP56_FRAME_PREVIOUS];
1576     }
1577     s->next_framep[VP56_FRAME_CURRENT]      = curframe;
1578
1579     ff_thread_finish_setup(avctx);
1580
1581     // Given that arithmetic probabilities are updated every frame, it's quite likely
1582     // that the values we have on a random interframe are complete junk if we didn't
1583     // start decode on a keyframe. So just don't display anything rather than junk.
1584     if (!s->keyframe && (!s->framep[VP56_FRAME_PREVIOUS] ||
1585                          !s->framep[VP56_FRAME_GOLDEN] ||
1586                          !s->framep[VP56_FRAME_GOLDEN2])) {
1587         av_log(avctx, AV_LOG_WARNING, "Discarding interframe without a prior keyframe!\n");
1588         return AVERROR_INVALIDDATA;
1589     }
1590
1591     s->linesize   = curframe->linesize[0];
1592     s->uvlinesize = curframe->linesize[1];
1593
1594     if (!s->edge_emu_buffer)
1595         s->edge_emu_buffer = av_malloc(21*s->linesize);
1596
1597     memset(s->top_nnz, 0, s->mb_width*sizeof(*s->top_nnz));
1598
1599     /* Zero macroblock structures for top/top-left prediction from outside the frame. */
1600     memset(s->macroblocks + s->mb_height*2 - 1, 0, (s->mb_width+1)*sizeof(*s->macroblocks));
1601
1602     // top edge of 127 for intra prediction
1603     if (!(avctx->flags & CODEC_FLAG_EMU_EDGE)) {
1604         s->top_border[0][15] = s->top_border[0][23] = 127;
1605         memset(s->top_border[1]-1, 127, s->mb_width*sizeof(*s->top_border)+1);
1606     }
1607     memset(s->ref_count, 0, sizeof(s->ref_count));
1608     if (s->keyframe)
1609         memset(s->intra4x4_pred_mode_top, DC_PRED, s->mb_width*4);
1610
1611 #define MARGIN (16 << 2)
1612     s->mv_min.y = -MARGIN;
1613     s->mv_max.y = ((s->mb_height - 1) << 6) + MARGIN;
1614
1615     for (mb_y = 0; mb_y < s->mb_height; mb_y++) {
1616         VP56RangeCoder *c = &s->coeff_partition[mb_y & (s->num_coeff_partitions-1)];
1617         VP8Macroblock *mb = s->macroblocks + (s->mb_height - mb_y - 1)*2;
1618         int mb_xy = mb_y*s->mb_width;
1619         uint8_t *dst[3] = {
1620             curframe->data[0] + 16*mb_y*s->linesize,
1621             curframe->data[1] +  8*mb_y*s->uvlinesize,
1622             curframe->data[2] +  8*mb_y*s->uvlinesize
1623         };
1624
1625         memset(mb - 1, 0, sizeof(*mb));   // zero left macroblock
1626         memset(s->left_nnz, 0, sizeof(s->left_nnz));
1627         AV_WN32A(s->intra4x4_pred_mode_left, DC_PRED*0x01010101);
1628
1629         // left edge of 129 for intra prediction
1630         if (!(avctx->flags & CODEC_FLAG_EMU_EDGE)) {
1631             for (i = 0; i < 3; i++)
1632                 for (y = 0; y < 16>>!!i; y++)
1633                     dst[i][y*curframe->linesize[i]-1] = 129;
1634             if (mb_y == 1) // top left edge is also 129
1635                 s->top_border[0][15] = s->top_border[0][23] = s->top_border[0][31] = 129;
1636         }
1637
1638         s->mv_min.x = -MARGIN;
1639         s->mv_max.x = ((s->mb_width  - 1) << 6) + MARGIN;
1640         if (prev_frame && s->segmentation.enabled && !s->segmentation.update_map)
1641             ff_thread_await_progress(prev_frame, mb_y, 0);
1642
1643         for (mb_x = 0; mb_x < s->mb_width; mb_x++, mb_xy++, mb++) {
1644             /* Prefetch the current frame, 4 MBs ahead */
1645             s->dsp.prefetch(dst[0] + (mb_x&3)*4*s->linesize + 64, s->linesize, 4);
1646             s->dsp.prefetch(dst[1] + (mb_x&7)*s->uvlinesize + 64, dst[2] - dst[1], 2);
1647
1648             decode_mb_mode(s, mb, mb_x, mb_y, s->segmentation_map + mb_xy,
1649                            prev_frame ? prev_frame->ref_index[0] + mb_xy : NULL);
1650
1651             prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_PREVIOUS);
1652
1653             if (!mb->skip)
1654                 decode_mb_coeffs(s, c, mb, s->top_nnz[mb_x], s->left_nnz);
1655
1656             if (mb->mode <= MODE_I4x4)
1657                 intra_predict(s, dst, mb, mb_x, mb_y);
1658             else
1659                 inter_predict(s, dst, mb, mb_x, mb_y);
1660
1661             prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN);
1662
1663             if (!mb->skip) {
1664                 idct_mb(s, dst, mb);
1665             } else {
1666                 AV_ZERO64(s->left_nnz);
1667                 AV_WN64(s->top_nnz[mb_x], 0);   // array of 9, so unaligned
1668
1669                 // Reset DC block predictors if they would exist if the mb had coefficients
1670                 if (mb->mode != MODE_I4x4 && mb->mode != VP8_MVMODE_SPLIT) {
1671                     s->left_nnz[8]      = 0;
1672                     s->top_nnz[mb_x][8] = 0;
1673                 }
1674             }
1675
1676             if (s->deblock_filter)
1677                 filter_level_for_mb(s, mb, &s->filter_strength[mb_x]);
1678
1679             prefetch_motion(s, mb, mb_x, mb_y, mb_xy, VP56_FRAME_GOLDEN2);
1680
1681             dst[0] += 16;
1682             dst[1] += 8;
1683             dst[2] += 8;
1684             s->mv_min.x -= 64;
1685             s->mv_max.x -= 64;
1686         }
1687         if (s->deblock_filter) {
1688             if (s->filter.simple)
1689                 filter_mb_row_simple(s, curframe, mb_y);
1690             else
1691                 filter_mb_row(s, curframe, mb_y);
1692         }
1693         s->mv_min.y -= 64;
1694         s->mv_max.y -= 64;
1695
1696         ff_thread_report_progress(curframe, mb_y, 0);
1697     }
1698
1699     ff_thread_report_progress(curframe, INT_MAX, 0);
1700 skip_decode:
1701     // if future frames don't use the updated probabilities,
1702     // reset them to the values we saved
1703     if (!s->update_probabilities)
1704         s->prob[0] = s->prob[1];
1705
1706     memcpy(&s->framep[0], &s->next_framep[0], sizeof(s->framep[0]) * 4);
1707
1708     if (!s->invisible) {
1709         *(AVFrame*)data = *curframe;
1710         *data_size = sizeof(AVFrame);
1711     }
1712
1713     return avpkt->size;
1714 }
1715
1716 static av_cold int vp8_decode_init(AVCodecContext *avctx)
1717 {
1718     VP8Context *s = avctx->priv_data;
1719
1720     s->avctx = avctx;
1721     avctx->pix_fmt = PIX_FMT_YUV420P;
1722
1723     dsputil_init(&s->dsp, avctx);
1724     ff_h264_pred_init(&s->hpc, CODEC_ID_VP8, 8);
1725     ff_vp8dsp_init(&s->vp8dsp);
1726
1727     return 0;
1728 }
1729
1730 static av_cold int vp8_decode_free(AVCodecContext *avctx)
1731 {
1732     vp8_decode_flush(avctx);
1733     return 0;
1734 }
1735
1736 static av_cold int vp8_decode_init_thread_copy(AVCodecContext *avctx)
1737 {
1738     VP8Context *s = avctx->priv_data;
1739
1740     s->avctx = avctx;
1741
1742     return 0;
1743 }
1744
1745 #define REBASE(pic) \
1746     pic ? pic - &s_src->frames[0] + &s->frames[0] : NULL
1747
1748 static int vp8_decode_update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
1749 {
1750     VP8Context *s = dst->priv_data, *s_src = src->priv_data;
1751
1752     s->prob[0] = s_src->prob[!s_src->update_probabilities];
1753     s->segmentation = s_src->segmentation;
1754     s->lf_delta = s_src->lf_delta;
1755     memcpy(s->sign_bias, s_src->sign_bias, sizeof(s->sign_bias));
1756
1757     memcpy(&s->frames, &s_src->frames, sizeof(s->frames));
1758     s->framep[0] = REBASE(s_src->next_framep[0]);
1759     s->framep[1] = REBASE(s_src->next_framep[1]);
1760     s->framep[2] = REBASE(s_src->next_framep[2]);
1761     s->framep[3] = REBASE(s_src->next_framep[3]);
1762
1763     return 0;
1764 }
1765
1766 AVCodec ff_vp8_decoder = {
1767     "vp8",
1768     AVMEDIA_TYPE_VIDEO,
1769     CODEC_ID_VP8,
1770     sizeof(VP8Context),
1771     vp8_decode_init,
1772     NULL,
1773     vp8_decode_free,
1774     vp8_decode_frame,
1775     CODEC_CAP_DR1 | CODEC_CAP_FRAME_THREADS,
1776     .flush = vp8_decode_flush,
1777     .long_name = NULL_IF_CONFIG_SMALL("On2 VP8"),
1778     .init_thread_copy      = ONLY_IF_THREADS_ENABLED(vp8_decode_init_thread_copy),
1779     .update_thread_context = ONLY_IF_THREADS_ENABLED(vp8_decode_update_thread_context),
1780 };