1 /********************************************************************
3 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
4 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
5 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
6 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
8 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
9 * by the Xiph.Org Foundation http://www.xiph.org/ *
11 ********************************************************************
13 function: mode selection code
16 ********************************************************************/
/*Forward typedefs for the encoder-internal analysis structures defined
   later in this file.*/
24 typedef struct oc_fr_state oc_fr_state;
25 typedef struct oc_qii_state oc_qii_state;
26 typedef struct oc_enc_pipeline_state oc_enc_pipeline_state;
27 typedef struct oc_rd_metric oc_rd_metric;
28 typedef struct oc_mode_choice oc_mode_choice;
32 /*There are 8 possible schemes used to encode macro block modes.
33 Schemes 0-6 use a maximally-skewed Huffman code to code each of the modes.
34 The same set of Huffman codes is used for each of these 7 schemes, but the
35 mode assigned to each codeword varies.
36 Scheme 0 writes a custom mapping from codeword to MB mode to the bitstream,
37 while schemes 1-6 have a fixed mapping.
38 Scheme 7 just encodes each mode directly in 3 bits.*/
40 /*The mode orderings for the various mode coding schemes.
41 Scheme 0 uses a custom alphabet, which is not stored in this table.
42 This is the inverse of the equivalent table OC_MODE_ALPHABETS in the
44 static const unsigned char OC_MODE_RANKS[7][OC_NMODES]={
45 /*Last MV dominates.*/
/*NOTE(review): the remaining rows of this table are elided in this excerpt;
   each row presumably maps an OC_MODE_* value to its codeword rank under one
   of schemes 1-7 -- confirm against the full source.*/
66 /*Initialize the mode scheme chooser.
67 This need only be called once per encoder.*/
68 void oc_mode_scheme_chooser_init(oc_mode_scheme_chooser *_chooser){
/*Scheme 0's ranks live in the chooser itself (its alphabet is custom and
   mutable); schemes 1-7 point at the static OC_MODE_RANKS rows.*/
70 _chooser->mode_ranks[0]=_chooser->scheme0_ranks;
71 for(si=1;si<8;si++)_chooser->mode_ranks[si]=OC_MODE_RANKS[si-1];
74 /*Reset the mode scheme chooser.
75 This needs to be called once for each frame, including the first.*/
76 static void oc_mode_scheme_chooser_reset(oc_mode_scheme_chooser *_chooser){
78 memset(_chooser->mode_counts,0,OC_NMODES*sizeof(*_chooser->mode_counts));
79 /*Scheme 0 starts with 24 bits to store the mode list in.*/
80 _chooser->scheme_bits[0]=24;
81 memset(_chooser->scheme_bits+1,0,7*sizeof(*_chooser->scheme_bits));
83 /*Scheme 7 should always start first, and scheme 0 should always start
/*Initial ordering is 7,6,...,0: descending scheme index, so the fixed-cost
   3-bit scheme is tried first and scheme 0 (with its 24-bit header) last.*/
85 _chooser->scheme_list[si]=7-si;
/*Scheme 0's custom alphabet starts out as the identity mapping.*/
86 _chooser->scheme0_list[si]=_chooser->scheme0_ranks[si]=si;
91 /*This is the real purpose of this data structure: not actually selecting a
92 mode scheme, but estimating the cost of coding a given mode given all the
93 modes selected so far.
94 This is done via opportunity cost: the cost is defined as the number of bits
95 required to encode all the modes selected so far including the current one
96 using the best possible scheme, minus the number of bits required to encode
97 all the modes selected so far not including the current one using the best
99 The computational expense of doing this probably makes it overkill.
100 Just be happy we take a greedy approach instead of trying to solve the
101 global mode-selection problem (which is NP-hard).
102 _mb_mode: The mode to determine the cost of.
103 Return: The number of bits required to code this mode.*/
104 static int oc_mode_scheme_chooser_cost(oc_mode_scheme_chooser *_chooser,
/*scheme_list[] is kept sorted by total bits, so [0] is the current best
   scheme and [1] the runner-up.*/
112 scheme0=_chooser->scheme_list[0];
113 scheme1=_chooser->scheme_list[1];
114 best_bits=_chooser->scheme_bits[scheme0];
/*scheme0+1>>3 is 1 only for scheme 7 (the direct 3-bit table); schemes 0-6
   index row 0, the shared Huffman code lengths.*/
115 mode_bits=OC_MODE_BITS[scheme0+1>>3][_chooser->mode_ranks[scheme0][_mb_mode]];
116 /*Typical case: If the difference between the best scheme and the next best
117 is greater than 6 bits, then adding just one mode cannot change which
119 if(_chooser->scheme_bits[scheme1]-best_bits>6)return mode_bits;
120 /*Otherwise, check to see if adding this mode selects a different scheme as
123 best_bits+=mode_bits;
125 /*For any scheme except 0, we can just use the bit cost of the mode's rank
128 scheme_bits=_chooser->scheme_bits[scheme1]+
129 OC_MODE_BITS[scheme1+1>>3][_chooser->mode_ranks[scheme1][_mb_mode]];
133 /*For scheme 0, incrementing the mode count could potentially change the
135 Find the index where the mode would be moved to in the optimal list,
136 and use its bit cost instead of the one for the mode's current
137 position in the list.*/
138 /*We don't recompute scheme bits; this is computing opportunity cost, not
/*Walk toward rank 0 while the (incremented) count would tie or exceed the
   count of the mode currently ranked ahead; loop body is empty on purpose.*/
140 for(ri=_chooser->scheme0_ranks[_mb_mode];ri>0&&
141 _chooser->mode_counts[_mb_mode]>=
142 _chooser->mode_counts[_chooser->scheme0_list[ri-1]];ri--);
143 scheme_bits=_chooser->scheme_bits[0]+OC_MODE_BITS[0][ri];
145 if(scheme_bits<best_bits)best_bits=scheme_bits;
147 scheme1=_chooser->scheme_list[si];
/*Keep scanning runner-up schemes while they are within 6 bits of the best
   (and thus could still win after adding one mode).*/
149 while(_chooser->scheme_bits[scheme1]-_chooser->scheme_bits[scheme0]<=6);
150 return best_bits-_chooser->scheme_bits[scheme0];
153 /*Incrementally update the mode counts and per-scheme bit counts and re-order
154 the scheme lists once a mode has been selected.
155 _mb_mode: The mode that was chosen.*/
156 static void oc_mode_scheme_chooser_update(oc_mode_scheme_chooser *_chooser,
160 _chooser->mode_counts[_mb_mode]++;
161 /*Re-order the scheme0 mode list if necessary.*/
/*Insertion-sort step: bubble _mb_mode toward rank 0 past any mode whose
   count it now exceeds, keeping scheme0_list sorted by descending count.*/
162 for(ri=_chooser->scheme0_ranks[_mb_mode];ri>0;ri--){
164 pmode=_chooser->scheme0_list[ri-1];
165 if(_chooser->mode_counts[pmode]>=_chooser->mode_counts[_mb_mode])break;
166 /*Reorder the mode ranking.*/
167 _chooser->scheme0_ranks[pmode]++;
168 _chooser->scheme0_list[ri]=pmode;
170 _chooser->scheme0_ranks[_mb_mode]=ri;
171 _chooser->scheme0_list[ri]=_mb_mode;
172 /*Now add the bit cost for the mode to each scheme.*/
/*As in the cost function, si+1>>3 selects the 3-bit direct table only for
   scheme 7.*/
174 _chooser->scheme_bits[si]+=
175 OC_MODE_BITS[si+1>>3][_chooser->mode_ranks[si][_mb_mode]];
177 /*Finally, re-order the list of schemes.*/
/*Another insertion sort, this time keeping scheme_list sorted by ascending
   total bit count.*/
183 scheme0=_chooser->scheme_list[si];
184 bits0=_chooser->scheme_bits[scheme0];
187 scheme1=_chooser->scheme_list[sj-1];
188 if(bits0>=_chooser->scheme_bits[scheme1])break;
189 _chooser->scheme_list[sj]=scheme1;
192 _chooser->scheme_list[sj]=scheme0;
198 /*The number of bits required to encode a super block run.
199 _run_count: The desired run count; must be positive and less than 4130.*/
200 static int oc_sb_run_bits(int _run_count){
/*Linear scan for the code bucket containing _run_count; the table is small,
   so this beats a binary search in practice.  Empty loop body is intended.*/
202 for(i=0;_run_count>=OC_SB_RUN_VAL_MIN[i+1];i++);
203 return OC_SB_RUN_CODE_NBITS[i];
206 /*The number of bits required to encode a block run.
207 _run_count: The desired run count; must be positive and less than 30.*/
208 static int oc_block_run_bits(int _run_count){
/*Direct table lookup; the table is indexed from 0, hence the -1.*/
209 return OC_BLOCK_RUN_CODE_NBITS[_run_count-1];
214 /*State to track coded block flags and their bit cost.*/
/*Run lengths of the current sb_partial/sb_full/b_coded runs; widths bound
   the maximum representable run.*/
217 unsigned sb_partial_count:16;
218 unsigned sb_full_count:16;
219 unsigned b_coded_count_prev:8;
220 unsigned b_coded_count:8;
/*Current flag values; 2-bit signed so -1 can mean "no run started yet"
   (see oc_fr_state_init, which sets b_coded_prev=-1).*/
222 signed int sb_partial:2;
223 signed int sb_full:2;
224 signed int b_coded_prev:2;
225 signed int b_coded:2;
/*Reset a flag-run tracker to the start-of-frame state.*/
230 static void oc_fr_state_init(oc_fr_state *_fr){
232 _fr->sb_partial_count=0;
233 _fr->sb_full_count=0;
234 _fr->b_coded_count_prev=0;
235 _fr->b_coded_count=0;
/*-1: no block run has been started yet, so the first block always starts a
   new run.*/
239 _fr->b_coded_prev=-1;
/*Account for one super block's partial/full flags, extending or starting
   the corresponding flag runs and updating the estimated bit cost.
  NOTE(review): several lines (declarations, run-restart branches) are
   elided in this excerpt.*/
244 static void oc_fr_state_advance_sb(oc_fr_state *_fr,
245 int _sb_partial,int _sb_full){
247 int sb_partial_count;
250 /*Extend the sb_partial run, or start a new one.*/
251 sb_partial_count=_fr->sb_partial;
252 if(_fr->sb_partial==_sb_partial){
/*4129 is the maximum super block run length; at the cap a new run must be
   started (see oc_sb_run_bits's documented bound of "less than 4130").*/
253 if(sb_partial_count>=4129){
/*Remove the old run's cost before re-adding it at the new length below.*/
257 else bits-=oc_sb_run_bits(sb_partial_count);
259 else sb_partial_count=0;
261 bits+=oc_sb_run_bits(sb_partial_count);
263 /*Extend the sb_full run, or start a new one.*/
264 sb_full_count=_fr->sb_full_count;
265 if(_fr->sb_full==_sb_full){
266 if(sb_full_count>=4129){
270 else bits-=oc_sb_run_bits(sb_full_count);
272 else sb_full_count=0;
274 bits+=oc_sb_run_bits(sb_full_count);
275 _fr->sb_full=_sb_full;
276 _fr->sb_full_count=sb_full_count;
279 _fr->sb_partial=_sb_partial;
280 _fr->sb_partial_count=sb_partial_count;
283 /*Flush any outstanding block flags for a SB (e.g., one with fewer than 16
285 static void oc_fr_state_flush_sb(oc_fr_state *_fr){
292 b_count=_fr->b_count;
295 b_coded=_fr->b_coded;
296 b_coded_count=_fr->b_coded_count;
297 if(b_coded_count>=b_count){
298 /*This SB was fully coded/uncoded; roll back the partial block flags.*/
299 bits-=oc_block_run_bits(b_coded_count);
/*If the run extended past this SB's blocks, re-add the cost of the portion
   belonging to earlier super blocks.*/
300 if(b_coded_count>b_count)bits+=oc_block_run_bits(b_coded_count-b_count);
/*Restore the run state from before this SB's blocks were added.*/
303 b_coded=_fr->b_coded_prev;
304 b_coded_count=_fr->b_coded_count_prev;
307 /*It was partially coded.*/
309 /*sb_full is unused.*/
312 _fr->b_coded_count=b_coded_count;
313 _fr->b_coded_count_prev=b_coded_count;
315 _fr->b_coded=b_coded;
316 _fr->b_coded_prev=b_coded;
/*Fold this SB's partial/full flags into the super block flag runs.*/
317 oc_fr_state_advance_sb(_fr,sb_partial,sb_full);
/*Account for one block's coded flag, updating block runs and, when a full
   super block of 16 blocks completes, the super block flag runs as well.
  NOTE(review): several lines are elided in this excerpt.*/
321 static void oc_fr_state_advance_block(oc_fr_state *_fr,int _b_coded){
328 /*Extend the b_coded run, or start a new one.*/
329 b_coded_count=_fr->b_coded_count;
330 if(_fr->b_coded==_b_coded)bits-=oc_block_run_bits(b_coded_count);
331 else b_coded_count=0;
333 b_count=_fr->b_count+1;
335 /*We finished a superblock.*/
336 if(b_coded_count>=16){
337 /*It was fully coded/uncoded; roll back the partial block flags.*/
/*Any run length beyond this SB's 16 blocks belongs to earlier SBs; re-add
   its cost.*/
338 if(b_coded_count>16)bits+=oc_block_run_bits(b_coded_count-16);
/*Restore the pre-SB run state.*/
341 _b_coded=_fr->b_coded_prev;
342 b_coded_count=_fr->b_coded_count_prev;
345 bits+=oc_block_run_bits(b_coded_count);
346 /*It was partially coded.*/
348 /*sb_full is unused.*/
351 _fr->b_coded_count=b_coded_count;
352 _fr->b_coded_count_prev=b_coded_count;
354 _fr->b_coded=_b_coded;
355 _fr->b_coded_prev=_b_coded;
356 oc_fr_state_advance_sb(_fr,sb_partial,sb_full);
359 bits+=oc_block_run_bits(b_coded_count);
361 _fr->b_coded_count=b_coded_count;
362 _fr->b_count=b_count;
363 _fr->b_coded=_b_coded;
/*Convenience wrapper: advance the flag tracker with an uncoded block.*/
367 static void oc_fr_skip_block(oc_fr_state *_fr){
368 oc_fr_state_advance_block(_fr,0);
/*Convenience wrapper: advance the flag tracker with a coded block.*/
371 static void oc_fr_code_block(oc_fr_state *_fr){
372 oc_fr_state_advance_block(_fr,1);
/*Estimate the marginal flag-bit cost of coding one block instead of
   skipping it, by simulating both choices on a scratch copy of the state.*/
375 static int oc_fr_cost1(const oc_fr_state *_fr){
379 oc_fr_skip_block(&tmp);
382 oc_fr_code_block(&tmp);
383 return (int)(tmp.bits-bits);
/*Estimate the flag-bit savings of skipping 4 blocks (a luma MB) relative to
   the already-advanced state _post, by replaying 4 skips from _pre.*/
386 static int oc_fr_cost4(const oc_fr_state *_pre,const oc_fr_state *_post){
389 oc_fr_skip_block(&tmp);
390 oc_fr_skip_block(&tmp);
391 oc_fr_skip_block(&tmp);
392 oc_fr_skip_block(&tmp);
393 return (int)(_post->bits-tmp.bits);
/*Run lengths for the qi-index flag runs; presumably qi01 distinguishes
   qi index 0 from 1+ and qi12 distinguishes 1 from 2 -- TODO confirm
   against oc_qii_state_advance in the full source.*/
400 unsigned qi01_count:14;
402 unsigned qi12_count:14;
/*Reset a qi-index flag-run tracker to the start-of-frame state.*/
408 static void oc_qii_state_init(oc_qii_state *_qs){
/*Advance the qi-index flag-run state _qs by one fragment using qi index
   _qii, writing the result (and updated bit estimate) to _qd.
  _qd may alias _qs (callers pass the same pointer for both).
  NOTE(review): the run-restart branches and _qii tests are elided in this
   excerpt.*/
417 static void oc_qii_state_advance(oc_qii_state *_qd,
418 const oc_qii_state *_qs,int _qii){
426 qi01_count=_qs->qi01_count;
/*4129 is the maximum super block run length (same cap as in
   oc_fr_state_advance_sb).*/
428 if(qi01_count>=4129){
432 else bits-=oc_sb_run_bits(qi01_count);
436 bits+=oc_sb_run_bits(qi01_count);
437 qi12_count=_qs->qi12_count;
441 if(qi12_count>=4129){
445 else bits-=oc_sb_run_bits(qi12_count);
449 bits+=oc_sb_run_bits(qi12_count);
454 _qd->qi01_count=qi01_count;
456 _qd->qi12_count=qi12_count;
461 /*Temporary encoder state for the analysis pipeline.*/
462 struct oc_enc_pipeline_state{
463 int bounding_values[256];
466 /*Condensed dequantization tables.*/
467 const ogg_uint16_t *dequant[3][3][2];
468 /*Condensed quantization tables.*/
469 const oc_iquant *enquant[3][3][2];
470 /*Skip SSD storage for the current MCU in each plane.*/
471 unsigned *skip_ssd[3];
472 /*Coded/uncoded fragment lists for each plane for the current MCU.*/
473 ptrdiff_t *coded_fragis[3];
474 ptrdiff_t *uncoded_fragis[3];
475 ptrdiff_t ncoded_fragis[3];
476 ptrdiff_t nuncoded_fragis[3];
477 /*The starting fragment for the current MCU in each plane.*/
478 ptrdiff_t froffset[3];
479 /*The starting row for the current MCU in each plane.*/
481 /*The ending row for the current MCU in each plane.*/
483 /*The starting superblock for the current MCU in each plane.*/
485 /*The ending superblock for the current MCU in each plane.*/
487 /*The number of tokens for zzi=1 for each color plane.*/
489 /*The outstanding eob_run count for zzi=1 for each color plane.*/
491 /*Whether or not the loop filter is enabled.*/
/*Initialize the analysis pipeline state for one frame: flag trackers,
   skip-SSD pointers, per-plane fragment lists, condensed quantizer tables,
   tokenization counters, and the loop filter bounding values.
  NOTE(review): several declarations are elided in this excerpt.*/
496 static void oc_enc_pipeline_init(oc_enc_ctx *_enc,oc_enc_pipeline_state *_pipe){
497 ptrdiff_t *coded_fragis;
499 ptrdiff_t mcu_nfrags;
505 /*Initialize the per-plane coded block flag trackers.
506 These are used for bit-estimation purposes only; the real flag bits span
507 all three planes, so we can't compute them in parallel.*/
508 for(pli=0;pli<3;pli++)oc_fr_state_init(_pipe->fr+pli);
509 for(pli=0;pli<3;pli++)oc_qii_state_init(_pipe->qs+pli);
510 /*Set up the per-plane skip SSD storage pointers.*/
511 mcu_nvsbs=_enc->mcu_nvsbs;
/*16 fragments per luma super block.*/
512 mcu_nfrags=mcu_nvsbs*_enc->state.fplanes[0].nhsbs*16;
/*hdec/vdec: 1 if chroma is decimated horizontally/vertically for this
   pixel format.*/
513 hdec=!(_enc->state.info.pixel_fmt&1);
514 vdec=!(_enc->state.info.pixel_fmt&2);
515 _pipe->skip_ssd[0]=_enc->mcu_skip_ssd;
516 _pipe->skip_ssd[1]=_pipe->skip_ssd[0]+mcu_nfrags;
/*Note: >> binds looser than +, so this shifts by hdec+vdec as intended.*/
517 _pipe->skip_ssd[2]=_pipe->skip_ssd[1]+(mcu_nfrags>>hdec+vdec);
518 /*Set up per-plane pointers to the coded and uncoded fragments lists.
519 Unlike the decoder, each planes' coded and uncoded fragment list is kept
520 separate during the analysis stage; we only make the coded list for all
521 three planes contiguous right before the final packet is output
522 (destroying the uncoded lists, which are no longer needed).*/
523 coded_fragis=_enc->state.coded_fragis;
524 for(pli=0;pli<3;pli++){
/*Coded fragments grow forward from the start of each plane's region;
   uncoded fragments grow backward from the end (see the
   *(uncoded_fragis-++nuncoded_fragis) stores below).*/
525 _pipe->coded_fragis[pli]=coded_fragis;
526 coded_fragis+=_enc->state.fplanes[pli].nfrags;
527 _pipe->uncoded_fragis[pli]=coded_fragis;
529 memset(_pipe->ncoded_fragis,0,sizeof(_pipe->ncoded_fragis));
530 memset(_pipe->nuncoded_fragis,0,sizeof(_pipe->nuncoded_fragis));
531 /*Set up condensed quantizer tables.*/
532 for(pli=0;pli<3;pli++){
533 for(qii=0;qii<_enc->state.nqis;qii++){
535 qi=_enc->state.qis[qii];
536 for(qti=0;qti<2;qti++){
537 _pipe->dequant[pli][qii][qti]=_enc->state.dequant_tables[qi][pli][qti];
538 _pipe->enquant[pli][qii][qti]=_enc->enquant_tables[qi][pli][qti];
542 /*Initialize the tokenization state.*/
543 for(pli=0;pli<3;pli++){
544 _pipe->ndct_tokens1[pli]=0;
545 _pipe->eob_run1[pli]=0;
547 /*Initialize the bounding value array for the loop filter.*/
548 _pipe->loop_filter=!oc_state_loop_filter_init(&_enc->state,
549 _pipe->bounding_values);
552 /*Sets the current MCU stripe to super block row _sby.
553 Return: A non-zero value if this was the last MCU.*/
554 static int oc_enc_pipeline_set_stripe(oc_enc_ctx *_enc,
555 oc_enc_pipeline_state *_pipe,int _sby){
556 const oc_fragment_plane *fplane;
562 mcu_nvsbs=_enc->mcu_nvsbs;
563 sby_end=_enc->state.fplanes[0].nvsbs;
564 notdone=_sby+mcu_nvsbs<sby_end;
565 if(notdone)sby_end=_sby+mcu_nvsbs;
567 for(pli=0;pli<3;pli++){
568 fplane=_enc->state.fplanes+pli;
569 _pipe->sbi0[pli]=fplane->sboffset+(_sby>>vdec)*fplane->nhsbs;
/*Note: << binds looser than -, so this shifts by (2-vdec): 4 fragment rows
   per SB row in luma, 4>>vdec in decimated chroma.*/
570 _pipe->fragy0[pli]=_sby<<2-vdec;
571 _pipe->froffset[pli]=fplane->froffset
572 +_pipe->fragy0[pli]*(ptrdiff_t)fplane->nhfrags;
574 _pipe->sbi_end[pli]=fplane->sboffset+(sby_end>>vdec)*fplane->nhsbs;
575 _pipe->fragy_end[pli]=sby_end<<2-vdec;
/*Last stripe: clamp the end bounds to the whole plane.*/
578 _pipe->sbi_end[pli]=fplane->sboffset+fplane->nsbs;
579 _pipe->fragy_end[pli]=fplane->nvfrags;
/*Chroma planes are vertically decimated when bit 1 of pixel_fmt is clear.*/
581 vdec=!(_enc->state.info.pixel_fmt&2);
/*Finish one plane of the current MCU: copy uncoded fragments from the
   previous frame, do DC prediction and DC tokenization, advance the coded
   fragment list, then run the loop filter and border fill (each delayed by
   _sdelay/_edelay fragment rows at the stripe edges).*/
586 static void oc_enc_pipeline_finish_mcu_plane(oc_enc_ctx *_enc,
587 oc_enc_pipeline_state *_pipe,int _pli,int _sdelay,int _edelay){
589 /*Copy over all the uncoded fragments from this plane and advance the uncoded
/*The uncoded list was filled backwards; step the pointer back to its first
   entry before handing it to the copy routine.*/
591 _pipe->uncoded_fragis[_pli]-=_pipe->nuncoded_fragis[_pli];
592 oc_state_frag_copy_list(&_enc->state,_pipe->uncoded_fragis[_pli],
593 _pipe->nuncoded_fragis[_pli],OC_FRAME_SELF,OC_FRAME_PREV,_pli);
594 _pipe->nuncoded_fragis[_pli]=0;
595 /*Perform DC prediction.*/
596 oc_enc_pred_dc_frag_rows(_enc,_pli,
597 _pipe->fragy0[_pli],_pipe->fragy_end[_pli]);
598 /*Finish DC tokenization.*/
599 oc_enc_tokenize_dc_frag_list(_enc,_pli,
600 _pipe->coded_fragis[_pli],_pipe->ncoded_fragis[_pli],
601 _pipe->ndct_tokens1[_pli],_pipe->eob_run1[_pli]);
/*Remember the zzi=1 token/EOB-run state so the next MCU can continue it.*/
602 _pipe->ndct_tokens1[_pli]=_enc->ndct_tokens[_pli][1];
603 _pipe->eob_run1[_pli]=_enc->eob_run[_pli][1];
604 /*And advance the coded fragment list.*/
605 _enc->state.ncoded_fragis[_pli]+=_pipe->ncoded_fragis[_pli];
606 _pipe->coded_fragis[_pli]+=_pipe->ncoded_fragis[_pli];
607 _pipe->ncoded_fragis[_pli]=0;
608 /*Apply the loop filter if necessary.*/
609 refi=_enc->state.ref_frame_idx[OC_FRAME_SELF];
610 if(_pipe->loop_filter){
611 oc_state_loop_filter_frag_rows(&_enc->state,_pipe->bounding_values,
612 refi,_pli,_pipe->fragy0[_pli]-_sdelay,_pipe->fragy_end[_pli]-_edelay);
/*No filtering: borders need no extra delay.*/
614 else _sdelay=_edelay=0;
615 /*To fill borders, we have an additional two pixel delay, since a fragment
616 in the next row could filter its top edge, using two pixels from a
617 fragment in this row.
618 But there's no reason to delay a full fragment between the two.*/
/*Note: << binds looser than -, so (fragy-_delay)<<3 converts fragment rows
   to pixel rows, then two more pixels of delay are subtracted.*/
619 oc_state_borders_fill_rows(&_enc->state,refi,_pli,
620 (_pipe->fragy0[_pli]-_sdelay<<3)-(_sdelay<<1),
621 (_pipe->fragy_end[_pli]-_edelay<<3)-(_edelay<<1));
626 /*Cost information about the coded blocks in a MB.*/
/*Motion-compensate, transform, quantize, tokenize, and reconstruct one
   fragment, then (for inter frames) decide whether coding it beats skipping
   it and roll back the tokens if not.
  _overhead_bits: Estimated flag-bit overhead of coding this block.
  _mo:            Accumulates SSD/bit metrics for the enclosing MB
                   (may be NULL for intra chroma; see callers).
  _stack:         Token checkpoint stack used for rollback.
  Return: presumably non-zero iff the block was coded (callers branch on it;
           the return statements themselves are elided in this excerpt).
  NOTE(review): many declarations and control-flow lines are elided here.*/
636 static int oc_enc_block_transform_quantize(oc_enc_ctx *_enc,
637 oc_enc_pipeline_state *_pipe,int _pli,ptrdiff_t _fragi,int _overhead_bits,
638 oc_rd_metric *_mo,oc_token_checkpoint **_stack){
639 OC_ALIGN16(ogg_int16_t dct[64]);
640 OC_ALIGN16(ogg_int16_t data[64]);
641 ogg_uint16_t dc_dequant;
642 const ogg_uint16_t *dequant;
643 const oc_iquant *enquant;
646 const unsigned char *src;
647 const unsigned char *ref;
651 unsigned uncoded_ssd;
654 oc_token_checkpoint *checkpoint;
670 frags=_enc->state.frags;
671 frag_offs=_enc->state.frag_buf_offs[_fragi];
672 ystride=_enc->state.ref_ystride[_pli];
673 src=_enc->state.ref_frame_data[OC_FRAME_IO]+frag_offs;
674 borderi=frags[_fragi].borderi;
675 qii=frags[_fragi].qii;
677 #if !defined(OC_COLLECT_METRICS)
678 if(_enc->sp_level>=OC_SP_LEVEL_EARLY_SKIP){
679 /*Enable early skip detection.*/
680 frags[_fragi].coded=0;
684 /*Try and code this block anyway.*/
686 frags[_fragi].qii=qii;
688 mb_mode=frags[_fragi].mb_mode;
/*Reference frame (prev/golden) is selected by the MB mode.*/
689 ref=_enc->state.ref_frame_data[
690 _enc->state.ref_frame_idx[OC_FRAME_FOR_MODE(mb_mode)]]+frag_offs;
691 dst=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_SELF]]
693 /*Motion compensation:*/
/*Intra: residual is the source minus the fixed 128 bias.*/
697 oc_enc_frag_sub_128(_enc,data,src,ystride);
699 case OC_MODE_GOLDEN_NOMV:
700 case OC_MODE_INTER_NOMV:{
703 oc_enc_frag_sub(_enc,data,src,ref,ystride);
706 const oc_mv *frag_mvs;
707 frag_mvs=(const oc_mv *)_enc->state.frag_mvs;
708 nmv_offs=oc_state_get_mv_offsets(&_enc->state,mv_offs,_pli,
709 frag_mvs[_fragi][0],frag_mvs[_fragi][1]);
/*Half-pel MV: average the two whole-pel predictors into dst first.*/
711 oc_enc_frag_copy2(_enc,dst,
712 ref+mv_offs[0],ref+mv_offs[1],ystride);
713 oc_enc_frag_sub(_enc,data,src,dst,ystride);
715 else oc_enc_frag_sub(_enc,data,src,ref+mv_offs[0],ystride);
718 #if defined(OC_COLLECT_METRICS)
/*Metrics build only: record the SATD of the residual for offline modeling.*/
722 case 0:satd=oc_enc_frag_intra_satd(_enc,src,ystride);break;
724 satd=oc_enc_frag_satd_thresh(_enc,src,ref+mv_offs[0],ystride,UINT_MAX);
727 satd=oc_enc_frag_satd_thresh(_enc,src,dst,ystride,UINT_MAX);
730 _enc->frag_satd[_fragi]=satd;
734 oc_enc_fdct8x8(_enc,dct,data);
735 /*Quantize the DC coefficient:*/
/*qti: 0 for intra, 1 for inter quantizer tables.*/
736 qti=mb_mode!=OC_MODE_INTRA;
737 enquant=_pipe->enquant[_pli][0][qti];
738 dc_dequant=_pipe->dequant[_pli][0][qti][0];
/*Multiply-and-shift reciprocal quantization (oc_iquant m/l pair).*/
743 val=((enquant[0].m*(ogg_int32_t)val>>16)+val>>enquant[0].l)-s;
744 dc=OC_CLAMPI(-580,val,580);
746 /*Quantize the AC coefficients:*/
747 dequant=_pipe->dequant[_pli][qii][qti];
748 enquant=_pipe->enquant[_pli][qii][qti];
749 for(zzi=1;zzi<64;zzi++){
750 v=dct[OC_FZIG_ZAG[zzi]];
756 /*The bias added here rounds ties away from zero, since token
757 optimization can only decrease the magnitude of the quantized
760 /*Note the arithmetic right shift is not guaranteed by ANSI C.
761 Hopefully no one still uses ones-complement architectures.*/
762 val=((enquant[zzi].m*(ogg_int32_t)val>>16)+val>>enquant[zzi].l)-s;
763 data[zzi]=OC_CLAMPI(-580,val,580);
770 ac_bits=oc_enc_tokenize_ac(_enc,_pli,_fragi,data,dequant,dct,nonzero+1,
773 TODO: nonzero may need to be adjusted after tokenization.*/
777 /*We round this dequant product (and not any of the others) because there's
779 p=(ogg_int16_t)(dc*(ogg_int32_t)dc_dequant+15>>5);
781 for(ci=0;ci<64;ci++)data[ci]=p;
784 data[0]=dc*dc_dequant;
785 oc_idct8x8(&_enc->state,data,nonzero+1);
787 if(!qti)oc_enc_frag_recon_intra(_enc,dst,ystride,data);
789 oc_enc_frag_recon_inter(_enc,dst,
/*nmv_offs==1: whole-pel MV, predict from ref; otherwise dst already holds
   the averaged half-pel predictor.*/
790 nmv_offs==1?ref+mv_offs[0]:dst,ystride,data);
792 frame_type=_enc->state.frame_type;
793 #if !defined(OC_COLLECT_METRICS)
794 if(frame_type!=OC_INTRA_FRAME)
797 /*In retrospect, should we have skipped this block?*/
798 oc_enc_frag_sub(_enc,data,src,dst,ystride);
799 coded_ssd=coded_dc=0;
801 for(pi=0;pi<64;pi++){
802 coded_ssd+=data[pi]*data[pi];
/*Edge fragment: only accumulate pixels inside the picture region.*/
808 mask=_enc->state.borders[borderi].mask;
809 for(pi=0;pi<64;pi++,mask>>=1)if(mask&1){
810 coded_ssd+=data[pi]*data[pi];
814 /*Scale to match DCT domain.*/
816 /*We actually only want the AC contribution to the SSD.*/
817 coded_ssd-=coded_dc*coded_dc>>2;
818 #if defined(OC_COLLECT_METRICS)
819 _enc->frag_ssd[_fragi]=coded_ssd;
821 if(frame_type!=OC_INTRA_FRAME){
823 uncoded_ssd=_pipe->skip_ssd[_pli][_fragi-_pipe->froffset[_pli]];
/*UINT_MAX marks "skip SSD not computed"; forces the block to be coded.*/
824 if(uncoded_ssd<UINT_MAX){
825 /*Although the fragment coding overhead determination is accurate, it is
826 greedy, using very coarse-grained local information.
827 Allowing it to mildly discourage coding turns out to be beneficial, but
828 it's not clear that allowing it to encourage coding through negative
829 coding overhead deltas is useful.
830 For that reason, we disallow negative coding_overheads.*/
831 if(_overhead_bits<0)_overhead_bits=0;
832 if(uncoded_ssd<=coded_ssd+(_overhead_bits+ac_bits)*_enc->lambda&&
833 /*Don't allow luma blocks to be skipped in 4MV mode when VP3
834 compatibility is enabled.*/
835 (!_enc->vp3_compatible||mb_mode!=OC_MODE_INTER_MV_FOUR||_pli)){
836 /*Hm, not worth it; roll back.*/
837 oc_enc_tokenlog_rollback(_enc,checkpoint,(*_stack)-checkpoint);
839 frags[_fragi].coded=0;
844 _mo->uncoded_ac_ssd+=uncoded_ssd;
845 _mo->coded_ac_ssd+=coded_ssd;
846 _mo->ac_bits+=ac_bits;
/*Commit this fragment's qi index to the flag-run state (aliasing in/out
   is intended).*/
848 oc_qii_state_advance(_pipe->qs+_pli,_pipe->qs+_pli,qii);
850 frags[_fragi].coded=1;
/*Code the 4 luma fragments of one macro block, then decide (for inter
   frames) whether the whole MB is worth coding once mode/MV overhead is
   included; if not, roll back every token and flag update.
  _mode_overhead: Estimated mode+MV bit cost for this MB.
  NOTE(review): several declarations and loop headers are elided in this
   excerpt.*/
854 static int oc_enc_mb_transform_quantize_luma(oc_enc_ctx *_enc,
855 oc_enc_pipeline_state *_pipe,unsigned _mbi,int _mode_overhead){
856 /*Worst case token stack usage for 4 fragments.*/
857 oc_token_checkpoint stack[64*4];
858 oc_token_checkpoint *stackptr;
859 const oc_sb_map *sb_maps;
860 signed char *mb_modes;
862 ptrdiff_t *coded_fragis;
863 ptrdiff_t ncoded_fragis;
864 ptrdiff_t *uncoded_fragis;
865 ptrdiff_t nuncoded_fragis;
867 oc_fr_state fr_checkpoint;
868 oc_qii_state qs_checkpoint;
/*Snapshot the luma flag/qii state so the whole MB can be rolled back.*/
873 *&fr_checkpoint=*(_pipe->fr+0);
874 *&qs_checkpoint=*(_pipe->qs+0);
875 sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
876 mb_modes=_enc->state.mb_modes;
877 frags=_enc->state.frags;
878 coded_fragis=_pipe->coded_fragis[0];
879 ncoded_fragis=_pipe->ncoded_fragis[0];
880 uncoded_fragis=_pipe->uncoded_fragis[0];
881 nuncoded_fragis=_pipe->nuncoded_fragis[0];
882 mb_mode=mb_modes[_mbi];
885 memset(&mo,0,sizeof(mo));
/*_mbi>>2 selects the super block, _mbi&3 the quadrant within it.*/
887 fragi=sb_maps[_mbi>>2][_mbi&3][bi];
888 frags[fragi].mb_mode=mb_mode;
889 if(oc_enc_block_transform_quantize(_enc,
890 _pipe,0,fragi,oc_fr_cost1(_pipe->fr+0),&mo,&stackptr)){
891 oc_fr_code_block(_pipe->fr+0);
892 coded_fragis[ncoded_fragis++]=fragi;
/*Uncoded list grows backwards from its base pointer.*/
896 *(uncoded_fragis-++nuncoded_fragis)=fragi;
897 oc_fr_skip_block(_pipe->fr+0);
900 if(_enc->state.frame_type!=OC_INTRA_FRAME){
901 if(ncoded>0&&!mo.dc_flag){
903 /*Some individual blocks were worth coding.
904 See if that's still true when accounting for mode and MV overhead.*/
905 cost=mo.coded_ac_ssd+_enc->lambda*(mo.ac_bits
906 +oc_fr_cost4(&fr_checkpoint,_pipe->fr+0)+_mode_overhead);
907 if(mo.uncoded_ac_ssd<=cost){
908 /*Taking macroblock overhead into account, it is not worth coding this
910 oc_enc_tokenlog_rollback(_enc,stack,stackptr-stack);
/*Restore the snapshotted flag/qii state and un-code every fragment.*/
911 *(_pipe->fr+0)=*&fr_checkpoint;
912 *(_pipe->qs+0)=*&qs_checkpoint;
914 fragi=sb_maps[_mbi>>2][_mbi&3][bi];
915 if(frags[fragi].coded){
916 *(uncoded_fragis-++nuncoded_fragis)=fragi;
917 frags[fragi].coded=0;
919 oc_fr_skip_block(_pipe->fr+0);
921 ncoded_fragis-=ncoded;
925 /*If no luma blocks coded, the mode is forced.*/
926 if(ncoded==0)mb_modes[_mbi]=OC_MODE_INTER_NOMV;
927 /*Assume that a 1MV with a single coded block is always cheaper than a 4MV
928 with a single coded block.
929 This may not be strictly true: a 4MV computes chroma MVs using (0,0) for
930 skipped blocks, while a 1MV does not.*/
931 else if(ncoded==1&&mb_mode==OC_MODE_INTER_MV_FOUR){
932 mb_modes[_mbi]=OC_MODE_INTER_MV;
935 _pipe->ncoded_fragis[0]=ncoded_fragis;
936 _pipe->nuncoded_fragis[0]=nuncoded_fragis;
/*Code the chroma fragments of super blocks [_sbi_start,_sbi_end) in plane
   _pli, updating the coded/uncoded fragment lists and the per-SB coded
   flags.
  NOTE(review): some declarations and an inner fragi validity check are
   elided in this excerpt.*/
940 static void oc_enc_sb_transform_quantize_chroma(oc_enc_ctx *_enc,
941 oc_enc_pipeline_state *_pipe,int _pli,int _sbi_start,int _sbi_end){
942 const oc_sb_map *sb_maps;
943 oc_sb_flags *sb_flags;
944 ptrdiff_t *coded_fragis;
945 ptrdiff_t ncoded_fragis;
946 ptrdiff_t *uncoded_fragis;
947 ptrdiff_t nuncoded_fragis;
949 sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
950 sb_flags=_enc->state.sb_flags;
951 coded_fragis=_pipe->coded_fragis[_pli];
952 ncoded_fragis=_pipe->ncoded_fragis[_pli];
953 uncoded_fragis=_pipe->uncoded_fragis[_pli];
954 nuncoded_fragis=_pipe->nuncoded_fragis[_pli];
955 for(sbi=_sbi_start;sbi<_sbi_end;sbi++){
956 /*Worst case token stack usage for 1 fragment.*/
957 oc_token_checkpoint stack[64];
961 memset(&mo,0,sizeof(mo));
962 for(quadi=0;quadi<4;quadi++)for(bi=0;bi<4;bi++){
964 fragi=sb_maps[sbi][quadi][bi];
966 oc_token_checkpoint *stackptr;
968 if(oc_enc_block_transform_quantize(_enc,
969 _pipe,_pli,fragi,oc_fr_cost1(_pipe->fr+_pli),&mo,&stackptr)){
970 coded_fragis[ncoded_fragis++]=fragi;
971 oc_fr_code_block(_pipe->fr+_pli);
/*Uncoded list grows backwards from its base pointer.*/
974 *(uncoded_fragis-++nuncoded_fragis)=fragi;
975 oc_fr_skip_block(_pipe->fr+_pli);
/*Flush partial runs so the SB's coded flags are final before reading them.*/
979 oc_fr_state_flush_sb(_pipe->fr+_pli);
980 sb_flags[sbi].coded_fully=_pipe->fr[_pli].sb_full;
981 sb_flags[sbi].coded_partially=_pipe->fr[_pli].sb_partial;
983 _pipe->ncoded_fragis[_pli]=ncoded_fragis;
984 _pipe->nuncoded_fragis[_pli]=nuncoded_fragis;
987 /*Mode decision is done by exhaustively examining all potential choices.
988 Obviously, doing the motion compensation, fDCT, tokenization, and then
989 counting the bits each token uses is computationally expensive.
990 Theora's EOB runs can also split the cost of these tokens across multiple
991 fragments, and naturally we don't know what the optimal choice of Huffman
992 codes will be until we know all the tokens we're going to encode in all the
994 So we use a simple approach to estimating the bit cost and distortion of each
995 mode based upon the SATD value of the residual before coding.
996 The mathematics behind the technique are outlined by Kim \cite{Kim03}, but
997 the process (modified somewhat from that of the paper) is very simple.
998 We build a non-linear regression of the mappings from
999 (pre-transform+quantization) SATD to (post-transform+quantization) bits and
1001 A separate set of mappings is kept for each quantization type and color
1003 The mappings are constructed by partitioning the SATD values into a small
1004 number of bins (currently 24) and using a linear regression in each bin
1005 (as opposed to the 0th-order regression used by Kim).
1006 The bit counts and SSD measurements are obtained by examining actual encoded
1007 frames, with appropriate lambda values and optimal Huffman codes selected.
1008 EOB bits are assigned to the fragment that started the EOB run (as opposed to
1009 dividing them among all the blocks in the run; though the latter approach
1010 seems more theoretically correct, Monty's testing showed a small improvement
1011 with the former, though that may have been merely statistical noise).
1014 author="Hyun Mun Kim",
1015 title="Adaptive Rate Control Using Nonlinear Regression",
1016 journal="IEEE Transactions on Circuits and Systems for Video Technology",
1024 /*Computes (_ssd+_lambda*_rate)/(1<<OC_BIT_SCALE) with rounding, avoiding
1025 overflow for large lambda values.*/
/*High and low OC_BIT_SCALE-bit halves are combined separately so the
   _rate*_lambda product never overflows before the final shift.
  Note this is an unparenthesized expression macro: use it only as a full
   expression (as all callers in this file do).*/
1026 #define OC_MODE_RD_COST(_ssd,_rate,_lambda) \
1027 ((_ssd)>>OC_BIT_SCALE)+((_rate)>>OC_BIT_SCALE)*(_lambda) \
1028 +(((_ssd)&(1<<OC_BIT_SCALE)-1)+((_rate)&(1<<OC_BIT_SCALE)-1)*(_lambda) \
1029 +((1<<OC_BIT_SCALE)>>1)>>OC_BIT_SCALE)
1031 /*Estimate the R-D cost of the DCT coefficients given the SATD of a block after
/*Piecewise-linear interpolation into the OC_MODE_RD regression table:
   returns the estimated rate and stores the estimated SSD via *_ssd.*/
1033 static unsigned oc_dct_cost2(unsigned *_ssd,
1034 int _qi,int _pli,int _qti,int _satd){
1042 /*SATD metrics for chroma planes vary much less than luma, so we scale them
1043 by 4 to distribute them into the mode decision bins more evenly.*/
/*Clamp to the second-to-last bin so bin+1 below stays in range.*/
1045 bin=OC_MINI(_satd>>OC_SAD_SHIFT,OC_SAD_BINS-2);
1046 dx=_satd-(bin<<OC_SAD_SHIFT);
1047 y0=OC_MODE_RD[_qi][_pli][_qti][bin].rate;
1048 z0=OC_MODE_RD[_qi][_pli][_qti][bin].rmse;
1049 dy=OC_MODE_RD[_qi][_pli][_qti][bin+1].rate-y0;
1050 dz=OC_MODE_RD[_qi][_pli][_qti][bin+1].rmse-z0;
/*Extrapolation below a bin can go negative; clamp rate and rmse at 0.*/
1051 rmse=OC_MAXI(z0+(dz*dx>>OC_SAD_SHIFT),0);
/*SSD=rmse^2, rescaled from RMSE fixed point to OC_BIT_SCALE fixed point.*/
1052 *_ssd=rmse*rmse>>2*OC_RMSE_SCALE-OC_BIT_SCALE;
1053 return OC_MAXI(y0+(dy*dx>>OC_SAD_SHIFT),0);
1056 /*Select luma block-level quantizers for a MB in an INTRA frame.*/
/*Dynamic program over the 4 luma fragments of the MB: for each fragment
   and each candidate qi index, keep the best (cost, ssd, rate, qii-state)
   chain ending in that index, then backtrack to assign frags[].qii.
  NOTE(review): several declarations and the backtracking loop header are
   elided in this excerpt.*/
1057 static unsigned oc_analyze_intra_mb_luma(oc_enc_ctx *_enc,
1058 const oc_qii_state *_qs,unsigned _mbi){
1059 const unsigned char *src;
1060 const ptrdiff_t *frag_buf_offs;
1061 const oc_sb_map *sb_maps;
1063 ptrdiff_t frag_offs;
1065 oc_qii_state qs[4][3];
1066 unsigned cost[4][3];
1068 unsigned rate[4][3];
1080 frag_buf_offs=_enc->state.frag_buf_offs;
1081 sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
1082 src=_enc->state.ref_frame_data[OC_FRAME_IO];
1083 ystride=_enc->state.ref_ystride[0];
1084 fragi=sb_maps[_mbi>>2][_mbi&3][0];
1085 frag_offs=frag_buf_offs[fragi];
1086 satd=oc_enc_frag_intra_satd(_enc,src+frag_offs,ystride);
1087 nqis=_enc->state.nqis;
1088 lambda=_enc->lambda;
/*Base case: fragment 0 of the MB, one entry per candidate qi index.*/
1089 for(qii=0;qii<nqis;qii++){
1090 oc_qii_state_advance(qs[0]+qii,_qs,qii);
/*Rate includes both the DCT cost estimate and the qi flag-run bits.*/
1091 rate[0][qii]=oc_dct_cost2(ssd[0]+qii,_enc->state.qis[qii],0,0,satd)
1092 +(qs[0][qii].bits-_qs->bits<<OC_BIT_SCALE);
1093 cost[0][qii]=OC_MODE_RD_COST(ssd[0][qii],rate[0][qii],lambda);
/*Inductive step: extend each chain by fragment bi.*/
1095 for(bi=1;bi<4;bi++){
1096 fragi=sb_maps[_mbi>>2][_mbi&3][bi];
1097 frag_offs=frag_buf_offs[fragi];
1098 satd=oc_enc_frag_intra_satd(_enc,src+frag_offs,ystride);
1099 for(qii=0;qii<nqis;qii++){
1105 oc_qii_state_advance(qt+0,qs[bi-1]+0,qii);
1106 cur_rate=oc_dct_cost2(&cur_ssd,_enc->state.qis[qii],0,0,satd);
1107 best_ssd=ssd[bi-1][0]+cur_ssd;
1108 best_rate=rate[bi-1][0]+cur_rate
1109 +(qt[0].bits-qs[bi-1][0].bits<<OC_BIT_SCALE);
1110 best_cost=OC_MODE_RD_COST(best_ssd,best_rate,lambda);
/*Try chaining from every previous qi index, not just index 0.*/
1112 for(qij=1;qij<nqis;qij++){
1114 unsigned chain_rate;
1115 unsigned chain_cost;
1116 oc_qii_state_advance(qt+qij,qs[bi-1]+qij,qii);
1117 chain_ssd=ssd[bi-1][qij]+cur_ssd;
1118 chain_rate=rate[bi-1][qij]+cur_rate
1119 +(qt[qij].bits-qs[bi-1][qij].bits<<OC_BIT_SCALE);
1120 chain_cost=OC_MODE_RD_COST(chain_ssd,chain_rate,lambda);
1121 if(chain_cost<best_cost){
1122 best_cost=chain_cost;
1124 best_rate=chain_rate;
1128 *(qs[bi]+qii)=*(qt+best_qij);
1129 cost[bi][qii]=best_cost;
1130 ssd[bi][qii]=best_ssd;
1131 rate[bi][qii]=best_rate;
/*Remember the predecessor for backtracking.*/
1132 prev[bi-1][qii]=best_qij;
/*Pick the cheapest chain ending at the last fragment...*/
1136 best_cost=cost[3][0];
1137 for(qii=1;qii<nqis;qii++){
1138 if(cost[3][qii]<best_cost){
1139 best_cost=cost[3][qii];
/*...then backtrack, assigning each fragment its chosen qi index.*/
1143 frags=_enc->state.frags;
1145 fragi=sb_maps[_mbi>>2][_mbi&3][bi];
1146 frags[fragi].qii=best_qii;
1148 best_qii=prev[bi][best_qii];
1153 /*Select a block-level quantizer for a single chroma block in an INTRA frame.*/
/*Evaluate every candidate qi index with the R-D cost model (DCT cost plus
   qi flag-run bits) and commit the cheapest one to frags[_fragi].qii.*/
1154 static unsigned oc_analyze_intra_chroma_block(oc_enc_ctx *_enc,
1155 const oc_qii_state *_qs,int _pli,ptrdiff_t _fragi){
1156 const unsigned char *src;
1158 ptrdiff_t frag_offs;
1168 src=_enc->state.ref_frame_data[OC_FRAME_IO];
1169 ystride=_enc->state.ref_ystride[_pli];
1170 frag_offs=_enc->state.frag_buf_offs[_fragi];
1171 satd=oc_enc_frag_intra_satd(_enc,src+frag_offs,ystride);
1172 nqis=_enc->state.nqis;
1173 lambda=_enc->lambda;
1175 for(qii=0;qii<nqis;qii++){
1178 oc_qii_state_advance(qt+qii,_qs,qii);
1179 cur_rate=oc_dct_cost2(&cur_ssd,_enc->state.qis[qii],_pli,0,satd)
1180 +(qt[qii].bits-_qs->bits<<OC_BIT_SCALE);
1181 cost[qii]=OC_MODE_RD_COST(cur_ssd,cur_rate,lambda);
/*Pick the minimum-cost qi index.*/
1184 for(qii=1;qii<nqis;qii++){
1185 if(cost[qii]<best_cost){
1186 best_cost=cost[qii];
1190 frags=_enc->state.frags;
1191 frags[_fragi].qii=best_qii;
/*Analyze, transform, quantize, and tokenize every chroma fragment of plane
   _pli in superblocks [_sbi_start,_sbi_end) of an INTRA frame, appending the
   coded fragment indices to the pipeline's per-plane list.*/
1195 static void oc_enc_sb_transform_quantize_intra_chroma(oc_enc_ctx *_enc,
1196 oc_enc_pipeline_state *_pipe,int _pli,int _sbi_start,int _sbi_end){
1197 const oc_sb_map *sb_maps;
1198 oc_sb_flags *sb_flags;
1199 ptrdiff_t *coded_fragis;
1200 ptrdiff_t ncoded_fragis;
1202 sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
1203 sb_flags=_enc->state.sb_flags;
1204 coded_fragis=_pipe->coded_fragis[_pli];
1205 ncoded_fragis=_pipe->ncoded_fragis[_pli];
1206 for(sbi=_sbi_start;sbi<_sbi_end;sbi++){
1207 /*Worst case token stack usage for 1 fragment.*/
1208 oc_token_checkpoint stack[64];
/*Walk the 16 fragment slots of this superblock in Hilbert (quad/block)
     order.*/
1211 for(quadi=0;quadi<4;quadi++)for(bi=0;bi<4;bi++){
1213 fragi=sb_maps[sbi][quadi][bi];
1215 oc_token_checkpoint *stackptr;
/*Pick the block-level qii first, then code the block.*/
1216 oc_analyze_intra_chroma_block(_enc,_pipe->qs+_pli,_pli,fragi);
1218 oc_enc_block_transform_quantize(_enc,
1219 _pipe,_pli,fragi,0,NULL,&stackptr);
1220 coded_fragis[ncoded_fragis++]=fragi;
/*Write back the updated per-plane coded-fragment count.*/
1224 _pipe->ncoded_fragis[_pli]=ncoded_fragis;
1227 /*Analysis stage for an INTRA frame.*/
1228 void oc_enc_analyze_intra(oc_enc_ctx *_enc,int _recode){
1229 oc_enc_pipeline_state pipe;
1230 const unsigned char *map_idxs;
1232 oc_sb_flags *sb_flags;
1233 signed char *mb_modes;
1234 const oc_mb_map *mb_maps;
1235 oc_mb_enc_info *embs;
1237 unsigned stripe_sby;
1243 _enc->state.frame_type=OC_INTRA_FRAME;
1244 oc_enc_tokenize_start(_enc);
1245 oc_enc_pipeline_init(_enc,&pipe);
1246 /*Choose MVs and MB modes and quantize and code luma.
1247 Must be done in Hilbert order.*/
1248 map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
1249 nmap_idxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
1250 _enc->state.ncoded_fragis[0]=0;
1251 _enc->state.ncoded_fragis[1]=0;
1252 _enc->state.ncoded_fragis[2]=0;
1253 sb_flags=_enc->state.sb_flags;
1254 mb_modes=_enc->state.mb_modes;
1255 mb_maps=(const oc_mb_map *)_enc->state.mb_maps;
1257 frags=_enc->state.frags;
1260 mcu_nvsbs=_enc->mcu_nvsbs;
/*Process the frame one MCU stripe (mcu_nvsbs rows of superblocks) at a
     time.*/
1261 for(stripe_sby=0;notdone;stripe_sby+=mcu_nvsbs){
1264 notdone=oc_enc_pipeline_set_stripe(_enc,&pipe,stripe_sby);
1265 sbi_end=pipe.sbi_end[0];
1266 for(sbi=pipe.sbi0[0];sbi<sbi_end;sbi++){
1268 /*Mode addressing is through Y plane, always 4 MB per SB.*/
1269 for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){
1276 /*Motion estimation:
1277 We always do a basic 1MV search for all macroblocks, coded or not,
1279 if(!_recode&&_enc->state.curframe_num>0)oc_mcenc_search(_enc,mbi);
/*Every MB in an INTRA frame is coded in INTRA mode.*/
1280 oc_analyze_intra_mb_luma(_enc,pipe.qs+0,mbi);
1281 mb_modes[mbi]=OC_MODE_INTRA;
1282 oc_enc_mb_transform_quantize_luma(_enc,&pipe,mbi,0);
1283 /*Propagate final MB mode and MVs to the chroma blocks.*/
1284 for(mapii=4;mapii<nmap_idxs;mapii++){
1285 mapi=map_idxs[mapii];
1288 fragi=mb_maps[mbi][pli][bi];
1289 frags[fragi].mb_mode=OC_MODE_INTRA;
1293 oc_enc_pipeline_finish_mcu_plane(_enc,&pipe,0,notstart,notdone);
1294 /*Code chroma planes.*/
1295 for(pli=1;pli<3;pli++){
1296 oc_enc_sb_transform_quantize_intra_chroma(_enc,&pipe,
1297 pli,pipe.sbi0[pli],pipe.sbi_end[pli]);
1298 oc_enc_pipeline_finish_mcu_plane(_enc,&pipe,pli,notstart,notdone);
1302 /*Finish filling in the reference frame borders.*/
1303 refi=_enc->state.ref_frame_idx[OC_FRAME_SELF];
1304 for(pli=0;pli<3;pli++)oc_state_borders_fill_caps(&_enc->state,refi,pli);
/*In an INTRA frame every fragment is coded.*/
1305 _enc->state.ntotal_coded_fragis=_enc->state.nfrags;
1310 /*Cost information about a MB mode.*/
1311 struct oc_mode_choice{
/*Selected block-level quantizer index for each fragment of the macro
     block (up to 12 entries; the exact count depends on the pixel format).*/
1316 unsigned char qii[12];
/*Fold the accumulated SSD, rate, and mode-signaling overhead of a mode
   choice into a single scalar R-D cost using the given lambda.*/
1321 static void oc_mode_set_cost(oc_mode_choice *_modec,int _lambda){
1322 _modec->cost=OC_MODE_RD_COST(_modec->ssd,
1323 _modec->rate+_modec->overhead,_lambda);
1326 /*A set of skip SSD's to use to disable early skipping.*/
/*UINT_MAX is treated as "skipping not allowed" by the mode analysis
   functions, so this table forces every block to be coded.*/
1327 static const unsigned OC_NOSKIP[12]={
1328 UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,
1329 UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX,
1330 UINT_MAX,UINT_MAX,UINT_MAX,UINT_MAX
1333 /*The estimated number of bits used by a coded chroma block to specify the AC
1335 TODO: Currently this is just 0.5*log2(3) (estimating about 50% compression);
1336 measurements suggest this is in the right ballpark, but it varies somewhat
/*0xCAE00D1D is log2(3) in Q31; the expression rounds it to OC_BIT_SCALE
   fractional bits and then halves it.*/
1338 #define OC_CHROMA_QII_RATE ((0xCAE00D1DU>>31-OC_BIT_SCALE)+1>>1)
/*Choose per-block quantizers and skip decisions for the four luma blocks of
   a macro block, accumulating SSD, rate, and flag overhead into _modec.
   _qti selects the INTRA (0) or INTER (1) cost model.*/
1340 static void oc_analyze_mb_mode_luma(oc_enc_ctx *_enc,
1341 oc_mode_choice *_modec,const oc_fr_state *_fr,const oc_qii_state *_qs,
1342 const unsigned _frag_satd[12],const unsigned _skip_ssd[12],int _qti){
1363 lambda=_enc->lambda;
1364 nqis=_enc->state.nqis;
1365 /*We could do a trellis optimization here, but we don't make final skip
1366 decisions until after transform+quantization, so the result wouldn't be
1368 Instead we just use a greedy approach; for most SATD values, the
1369 differences between the qiis are large enough to drown out the cost to
1370 code the flags, anyway.*/
1373 ssd=rate=overhead=nskipped=0;
1374 for(bi=0;bi<4;bi++){
1378 satd=_frag_satd[bi];
/*Candidate 0: code the block with the base quantizer (qii 0).*/
1380 oc_fr_code_block(ft+0);
1381 oc_qii_state_advance(qt+0,&qs,0);
1382 best_overhead=(ft[0].bits-fr.bits<<OC_BIT_SCALE);
1383 best_rate=oc_dct_cost2(&best_ssd,_enc->state.qis[0],0,_qti,satd)
1384 +(qt[0].bits-qs.bits<<OC_BIT_SCALE);
1385 best_cost=OC_MODE_RD_COST(ssd+best_ssd,rate+best_rate+best_overhead,lambda);
/*Try the remaining quantizers.*/
1388 for(qii=1;qii<nqis;qii++){
1389 oc_qii_state_advance(qt+qii,&qs,qii);
1390 cur_rate=oc_dct_cost2(&cur_ssd,_enc->state.qis[qii],0,_qti,satd)
1391 +(qt[qii].bits-qs.bits<<OC_BIT_SCALE);
1392 cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate+cur_rate+best_overhead,lambda);
1393 if(cur_cost<best_cost){
/*Consider skipping the block entirely, unless skipping is disabled
     (_skip_ssd==UINT_MAX) or three blocks were already skipped (a MB with all
     four luma blocks skipped could not signal its mode).*/
1400 if(_skip_ssd[bi]<UINT_MAX&&nskipped<3){
1402 oc_fr_skip_block(ft+1);
1403 cur_overhead=ft[1].bits-fr.bits<<OC_BIT_SCALE;
1404 cur_ssd=_skip_ssd[bi]<<OC_BIT_SCALE;
1405 cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate+cur_overhead,lambda);
1406 if(cur_cost<=best_cost){
1409 best_overhead=cur_overhead;
/*Commit the winner: advance the coded-flag state, and the qii state only
     if the block was actually coded (best_fri==0).*/
1416 overhead+=best_overhead;
1417 *&fr=*(ft+best_fri);
1418 if(best_fri==0)*&qs=*(qt+best_qii);
1420 _modec->qii[bi]=best_qii;
1424 _modec->overhead=OC_MAXI(overhead,0);
/*Choose per-block quantizers and skip decisions for the chroma blocks of a
   macro block, continuing the SSD/rate accumulation started by
   oc_analyze_mb_mode_luma.*/
1427 static void oc_analyze_mb_mode_chroma(oc_enc_ctx *_enc,
1428 oc_mode_choice *_modec,const oc_fr_state *_fr,const oc_qii_state *_qs,
1429 const unsigned _frag_satd[12],const unsigned _skip_ssd[12],int _qti){
1445 lambda=_enc->lambda;
1446 nqis=_enc->state.nqis;
1449 /*Because (except in 4:4:4 mode) we aren't considering chroma blocks in coded
1450 order, we assume a constant overhead for coded block and qii flags.*/
/*nblocks becomes 4 + the number of chroma blocks in ONE chroma plane.*/
1451 nblocks=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
1452 nblocks=(nblocks-4>>1)+4;
1454 for(pli=1;pli<3;pli++){
1455 for(;bi<nblocks;bi++){
1457 satd=_frag_satd[bi];
1458 best_rate=oc_dct_cost2(&best_ssd,_enc->state.qis[0],pli,_qti,satd)
1459 +OC_CHROMA_QII_RATE;
1460 best_cost=OC_MODE_RD_COST(ssd+best_ssd,rate+best_rate,lambda);
1462 for(qii=1;qii<nqis;qii++){
/*NOTE(review): this call passes plane 0 to oc_dct_cost2 while the
         qii==0 estimate above passes pli; that looks inconsistent — confirm
         against upstream whether `pli` was intended here.*/
1463 cur_rate=oc_dct_cost2(&cur_ssd,_enc->state.qis[qii],0,_qti,satd)
1464 +OC_CHROMA_QII_RATE;
1465 cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate+cur_rate,lambda);
1466 if(cur_cost<best_cost){
/*Consider skipping the block (no per-MB skip limit for chroma).*/
1473 if(_skip_ssd[bi]<UINT_MAX){
1474 cur_ssd=_skip_ssd[bi]<<OC_BIT_SCALE;
1475 cur_cost=OC_MODE_RD_COST(ssd+cur_ssd,rate,lambda);
1476 if(cur_cost<=best_cost){
1484 _modec->qii[bi]=best_qii;
/*Restore nblocks to the full two-plane chroma block count.*/
1486 nblocks=(nblocks-4<<1)+4;
/*Estimate, for each fragment of a macro block, the SSD that would result
   from skipping it (i.e. reusing the previous frame's pixels), storing the
   results both in _ssd[] and in the pipeline's skip_ssd arrays.
   A fragment whose DC offset alone exceeds two quantizer steps is forced to
   be coded by saturating its entry to UINT_MAX.*/
1492 static void oc_skip_cost(oc_enc_ctx *_enc,oc_enc_pipeline_state *_pipe,
1493 unsigned _mbi,unsigned _ssd[12]){
1494 OC_ALIGN16(ogg_int16_t buffer[64]);
1495 const unsigned char *src;
1496 const unsigned char *ref;
1498 const oc_fragment *frags;
1499 const ptrdiff_t *frag_buf_offs;
1500 const ptrdiff_t *sb_map;
1501 const oc_mb_map_plane *mb_map;
1502 const unsigned char *map_idxs;
1505 unsigned uncoded_ssd;
1507 unsigned dc_dequant;
1514 ptrdiff_t frag_offs;
1517 src=_enc->state.ref_frame_data[OC_FRAME_IO];
1518 ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_PREV]];
1519 ystride=_enc->state.ref_ystride[0];
1520 frags=_enc->state.frags;
1521 frag_buf_offs=_enc->state.frag_buf_offs;
1522 sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
/*DC dequant value of the base quantizer for the luma plane.*/
1523 dc_dequant=_enc->state.dequant_tables[_enc->state.qis[0]][0][1][0];
/*Luma: the four Y blocks of the MB.*/
1524 for(bi=0;bi<4;bi++){
1526 frag_offs=frag_buf_offs[fragi];
1527 oc_enc_frag_sub(_enc,buffer,src+frag_offs,ref+frag_offs,ystride);
1528 borderi=frags[fragi].borderi;
1529 uncoded_ssd=uncoded_dc=0;
/*Interior fragment: sum over all 64 pixels.*/
1531 for(pi=0;pi<64;pi++){
1532 uncoded_ssd+=buffer[pi]*buffer[pi];
1533 uncoded_dc+=buffer[pi];
/*Border fragment: only pixels inside the picture region count.*/
1538 mask=_enc->state.borders[borderi].mask;
1539 for(pi=0;pi<64;pi++,mask>>=1)if(mask&1){
1540 uncoded_ssd+=buffer[pi]*buffer[pi];
1541 uncoded_dc+=buffer[pi];
1544 /*Scale to match DCT domain.*/
1546 /*We actually only want the AC contribution to the SSD.*/
1547 uncoded_ssd-=uncoded_dc*uncoded_dc>>2;
1548 /*DC is a special case; if there's more than a full-quantizer improvement
1549 in the effective DC component, always force-code the block.*/
1550 dc_flag=abs(uncoded_dc)>dc_dequant<<1;
1551 uncoded_ssd|=-dc_flag;
1552 _pipe->skip_ssd[0][fragi-_pipe->froffset[0]]=_ssd[bi]=uncoded_ssd;
/*Chroma: iterate the Cb then Cr blocks of one chroma plane's worth of
     map indices.*/
1554 mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
1555 map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
1556 map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
1557 map_nidxs=(map_nidxs-4>>1)+4;
1559 for(pli=1;pli<3;pli++){
1560 ystride=_enc->state.ref_ystride[pli];
1561 dc_dequant=_enc->state.dequant_tables[_enc->state.qis[0]][pli][1][0];
1562 for(;mapii<map_nidxs;mapii++){
1563 mapi=map_idxs[mapii];
1565 fragi=mb_map[pli][bi];
1566 frag_offs=frag_buf_offs[fragi];
1567 oc_enc_frag_sub(_enc,buffer,src+frag_offs,ref+frag_offs,ystride);
1568 borderi=frags[fragi].borderi;
1569 uncoded_ssd=uncoded_dc=0;
1571 for(pi=0;pi<64;pi++){
1572 uncoded_ssd+=buffer[pi]*buffer[pi];
1573 uncoded_dc+=buffer[pi];
1577 mask=_enc->state.borders[borderi].mask;
1578 for(pi=0;pi<64;pi++,mask>>=1)if(mask&1){
1579 uncoded_ssd+=buffer[pi]*buffer[pi];
1580 uncoded_dc+=buffer[pi];
1583 /*Scale to match DCT domain.*/
1585 /*We actually only want the AC contribution to the SSD.*/
1586 uncoded_ssd-=uncoded_dc*uncoded_dc>>2;
1587 /*DC is a special case; if there's more than a full-quantizer improvement
1588 in the effective DC component, always force-code the block.*/
1589 dc_flag=abs(uncoded_dc)>dc_dequant<<1;
1590 uncoded_ssd|=-dc_flag;
1591 _pipe->skip_ssd[pli][fragi-_pipe->froffset[pli]]=_ssd[mapii]=uncoded_ssd;
/*Restore the full two-plane map index count for the next plane/caller.*/
1593 map_nidxs=(map_nidxs-4<<1)+4;
/*Compute the intra SATD of every fragment of a macro block against the
   input frame, filling _frag_satd[] in map-index order (luma 0..3, then
   chroma from index 4).*/
1597 static void oc_mb_intra_satd(oc_enc_ctx *_enc,unsigned _mbi,
1598 unsigned _frag_satd[12]){
1599 const unsigned char *src;
1600 const ptrdiff_t *frag_buf_offs;
1601 const ptrdiff_t *sb_map;
1602 const oc_mb_map_plane *mb_map;
1603 const unsigned char *map_idxs;
1611 ptrdiff_t frag_offs;
1612 frag_buf_offs=_enc->state.frag_buf_offs;
1613 sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
1614 src=_enc->state.ref_frame_data[OC_FRAME_IO];
1615 ystride=_enc->state.ref_ystride[0];
/*Luma blocks.*/
1616 for(bi=0;bi<4;bi++){
1618 frag_offs=frag_buf_offs[fragi];
1619 _frag_satd[bi]=oc_enc_frag_intra_satd(_enc,src+frag_offs,ystride);
/*Chroma blocks, addressed via the MB map.*/
1621 mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
1622 map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
1623 map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
1624 /*Note: This assumes ref_ystride[1]==ref_ystride[2].*/
1625 ystride=_enc->state.ref_ystride[1];
1626 for(mapii=4;mapii<map_nidxs;mapii++){
1627 mapi=map_idxs[mapii];
1630 fragi=mb_map[pli][bi];
1631 frag_offs=frag_buf_offs[fragi];
1632 _frag_satd[mapii]=oc_enc_frag_intra_satd(_enc,src+frag_offs,ystride);
/*Estimate the R-D cost of coding a macro block in INTRA mode, given its
   precomputed per-fragment intra SATDs, and store the result in _modec.*/
1636 static void oc_cost_intra(oc_enc_ctx *_enc,oc_mode_choice *_modec,
1637 unsigned _mbi,const oc_fr_state *_fr,const oc_qii_state *_qs,
1638 const unsigned _frag_satd[12],const unsigned _skip_ssd[12]){
1639 oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,_frag_satd,_skip_ssd,0);
1640 oc_analyze_mb_mode_chroma(_enc,_modec,_fr,_qs,_frag_satd,_skip_ssd,0);
/*Add the estimated cost of signaling the INTRA mode itself.*/
1642 oc_mode_scheme_chooser_cost(&_enc->chooser,OC_MODE_INTRA)<<OC_BIT_SCALE;
1643 oc_mode_set_cost(_modec,_enc->lambda);
/*Estimate the R-D cost of coding a macro block with a single whole-MB
   motion vector _mv in mode _mb_mode (reference frame chosen by the mode),
   storing the result in _modec.*/
1646 static void oc_cost_inter(oc_enc_ctx *_enc,oc_mode_choice *_modec,
1647 unsigned _mbi,int _mb_mode,const signed char *_mv,
1648 const oc_fr_state *_fr,const oc_qii_state *_qs,const unsigned _skip_ssd[12]){
1649 unsigned frag_satd[12];
1650 const unsigned char *src;
1651 const unsigned char *ref;
1653 const ptrdiff_t *frag_buf_offs;
1654 const ptrdiff_t *sb_map;
1655 const oc_mb_map_plane *mb_map;
1656 const unsigned char *map_idxs;
1666 ptrdiff_t frag_offs;
1667 src=_enc->state.ref_frame_data[OC_FRAME_IO];
1668 ref=_enc->state.ref_frame_data[
1669 _enc->state.ref_frame_idx[OC_FRAME_FOR_MODE(_mb_mode)]];
1670 ystride=_enc->state.ref_ystride[0];
1671 frag_buf_offs=_enc->state.frag_buf_offs;
1672 sb_map=_enc->state.sb_maps[_mbi>>2][_mbi&3];
1675 _modec->rate=_modec->ssd=0;
/*Half-pel MVs need two reference offsets (bilinear blend); whole-pel need
     one.*/
1676 if(oc_state_get_mv_offsets(&_enc->state,mv_offs,0,dx,dy)>1){
1677 for(bi=0;bi<4;bi++){
1679 frag_offs=frag_buf_offs[fragi];
1680 frag_satd[bi]=oc_enc_frag_satd2_thresh(_enc,src+frag_offs,
1681 ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride,UINT_MAX);
1685 for(bi=0;bi<4;bi++){
1687 frag_offs=frag_buf_offs[fragi];
1688 frag_satd[bi]=oc_enc_frag_satd_thresh(_enc,src+frag_offs,
1689 ref+frag_offs+mv_offs[0],ystride,UINT_MAX);
/*Chroma blocks, with the MV re-derived for the chroma sampling grid.*/
1692 mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
1693 map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
1694 map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
1695 /*Note: This assumes ref_ystride[1]==ref_ystride[2].*/
1696 ystride=_enc->state.ref_ystride[1];
1697 if(oc_state_get_mv_offsets(&_enc->state,mv_offs,1,dx,dy)>1){
1698 for(mapii=4;mapii<map_nidxs;mapii++){
1699 mapi=map_idxs[mapii];
1702 fragi=mb_map[pli][bi];
1703 frag_offs=frag_buf_offs[fragi];
1704 frag_satd[mapii]=oc_enc_frag_satd2_thresh(_enc,src+frag_offs,
1705 ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride,UINT_MAX);
1709 for(mapii=4;mapii<map_nidxs;mapii++){
1710 mapi=map_idxs[mapii];
1713 fragi=mb_map[pli][bi];
1714 frag_offs=frag_buf_offs[fragi];
1715 frag_satd[mapii]=oc_enc_frag_satd_thresh(_enc,src+frag_offs,
1716 ref+frag_offs+mv_offs[0],ystride,UINT_MAX);
/*Turn the SATDs into quantizer/skip choices and rate+SSD totals, then add
     the mode-signaling cost.*/
1719 oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,frag_satd,_skip_ssd,1);
1720 oc_analyze_mb_mode_chroma(_enc,_modec,_fr,_qs,frag_satd,_skip_ssd,1);
1722 oc_mode_scheme_chooser_cost(&_enc->chooser,_mb_mode)<<OC_BIT_SCALE;
1723 oc_mode_set_cost(_modec,_enc->lambda);
/*Estimate the R-D cost of an inter mode that uses the implicit (0,0) motion
   vector (INTER_NOMV or GOLDEN_NOMV).*/
1726 static void oc_cost_inter_nomv(oc_enc_ctx *_enc,oc_mode_choice *_modec,
1727 unsigned _mbi,int _mb_mode,const oc_fr_state *_fr,const oc_qii_state *_qs,
1728 const unsigned _skip_ssd[12]){
/*Static storage is zero-initialized, so this is the (0,0) vector.*/
1729 static const oc_mv OC_MV_ZERO;
1730 oc_cost_inter(_enc,_modec,_mbi,_mb_mode,OC_MV_ZERO,_fr,_qs,_skip_ssd);
/*Estimate the R-D cost of an inter mode with one explicitly-coded MV,
   adding the marginal cost of coding that MV under the cheaper of the two
   MV coding schemes.*/
1733 static int oc_cost_inter1mv(oc_enc_ctx *_enc,oc_mode_choice *_modec,
1734 unsigned _mbi,int _mb_mode,const signed char *_mv,
1735 const oc_fr_state *_fr,const oc_qii_state *_qs,const unsigned _skip_ssd[12]){
1737 oc_cost_inter(_enc,_modec,_mbi,_mb_mode,_mv,_fr,_qs,_skip_ssd);
/*Bits to code this MV with scheme 0 (VLC); scheme 1 always uses 12 bits.*/
1738 bits0=OC_MV_BITS[0][_mv[0]+31]+OC_MV_BITS[0][_mv[1]+31];
1739 _modec->overhead+=OC_MINI(_enc->mv_bits[0]+bits0,_enc->mv_bits[1]+12)
1740 -OC_MINI(_enc->mv_bits[0],_enc->mv_bits[1])<<OC_BIT_SCALE;
1741 oc_mode_set_cost(_modec,_enc->lambda);
1745 /*A mapping from oc_mb_map (raster) ordering to oc_sb_map (Hilbert) ordering.
   Indexed by [_mbi&3][raster block index].*/
1746 static const unsigned char OC_MB_PHASE[4][4]={
1747 {0,1,3,2},{0,3,1,2},{0,3,1,2},{2,3,1,0}
/*Estimate the R-D cost of coding a macro block with four independent
   per-block MVs (OC_MODE_INTER_MV_FOUR) against the previous frame.
   As a side effect, stores the candidate block MVs into the encoder's
   frag_mvs array (callers overwrite them if 4MV is not chosen).*/
1750 static void oc_cost_inter4mv(oc_enc_ctx *_enc,oc_mode_choice *_modec,
1751 unsigned _mbi,oc_mv _mv[4],const oc_fr_state *_fr,const oc_qii_state *_qs,
1752 const unsigned _skip_ssd[12]){
1753 unsigned frag_satd[12];
1756 const unsigned char *src;
1757 const unsigned char *ref;
1759 const ptrdiff_t *frag_buf_offs;
1761 const oc_mb_map_plane *mb_map;
1762 const unsigned char *map_idxs;
1773 ptrdiff_t frag_offs;
1777 src=_enc->state.ref_frame_data[OC_FRAME_IO];
1778 ref=_enc->state.ref_frame_data[_enc->state.ref_frame_idx[OC_FRAME_PREV]];
1779 ystride=_enc->state.ref_ystride[0];
1780 frag_buf_offs=_enc->state.frag_buf_offs;
1781 frag_mvs=_enc->state.frag_mvs;
1782 mb_map=(const oc_mb_map_plane *)_enc->state.mb_maps[_mbi];
1783 _modec->rate=_modec->ssd=0;
1784 for(bi=0;bi<4;bi++){
1785 fragi=mb_map[0][bi];
1788 /*Save the block MVs as the current ones while we're here; we'll replace
1789 them if we don't ultimately choose 4MV mode.*/
1790 frag_mvs[fragi][0]=(signed char)dx;
1791 frag_mvs[fragi][1]=(signed char)dy;
1792 frag_offs=frag_buf_offs[fragi];
1793 if(oc_state_get_mv_offsets(&_enc->state,mv_offs,0,dx,dy)>1){
1794 satd=oc_enc_frag_satd2_thresh(_enc,src+frag_offs,
1795 ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride,UINT_MAX);
1798 satd=oc_enc_frag_satd_thresh(_enc,src+frag_offs,
1799 ref+frag_offs+mv_offs[0],ystride,UINT_MAX);
/*Store the SATD in Hilbert (coded) order via the phase table.*/
1801 frag_satd[OC_MB_PHASE[_mbi&3][bi]]=satd;
/*VP3-compatible streams cannot skip blocks in a 4MV macro block.*/
1803 oc_analyze_mb_mode_luma(_enc,_modec,_fr,_qs,frag_satd,
1804 _enc->vp3_compatible?OC_NOSKIP:_skip_ssd,1);
1805 /*Figure out which blocks are being skipped and give them (0,0) MVs.*/
1808 nqis=_enc->state.nqis;
1809 for(bi=0;bi<4;bi++){
1810 if(_modec->qii[OC_MB_PHASE[_mbi&3][bi]]>=nqis){
1811 memset(lbmvs+bi,0,sizeof(*lbmvs));
1814 memcpy(lbmvs+bi,_mv+bi,sizeof(*lbmvs));
1815 bits0+=OC_MV_BITS[0][_mv[bi][0]+31]+OC_MV_BITS[0][_mv[bi][1]+31];
/*Derive the chroma MVs from the (possibly zeroed) luma block MVs.*/
1819 (*OC_SET_CHROMA_MVS_TABLE[_enc->state.info.pixel_fmt])(cbmvs,
1820 (const oc_mv *)lbmvs);
1821 map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
1822 map_nidxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
1823 /*Note: This assumes ref_ystride[1]==ref_ystride[2].*/
1824 ystride=_enc->state.ref_ystride[1];
1825 for(mapii=4;mapii<map_nidxs;mapii++){
1826 mapi=map_idxs[mapii];
1829 fragi=mb_map[pli][bi];
1832 frag_offs=frag_buf_offs[fragi];
1833 /*TODO: We could save half these calls by re-using the results for the Cb
1834 and Cr planes; is it worth it?*/
1835 if(oc_state_get_mv_offsets(&_enc->state,mv_offs,pli,dx,dy)>1){
1836 satd=oc_enc_frag_satd2_thresh(_enc,src+frag_offs,
1837 ref+frag_offs+mv_offs[0],ref+frag_offs+mv_offs[1],ystride,UINT_MAX);
1840 satd=oc_enc_frag_satd_thresh(_enc,src+frag_offs,
1841 ref+frag_offs+mv_offs[0],ystride,UINT_MAX);
1843 frag_satd[mapii]=satd;
1845 oc_analyze_mb_mode_chroma(_enc,_modec,_fr,_qs,frag_satd,_skip_ssd,1);
/*Mode-signaling cost plus the marginal cost of coding the block MVs.*/
1847 oc_mode_scheme_chooser_cost(&_enc->chooser,OC_MODE_INTER_MV_FOUR)
1848 +OC_MINI(_enc->mv_bits[0]+bits0,_enc->mv_bits[1]+bits1)
1849 -OC_MINI(_enc->mv_bits[0],_enc->mv_bits[1])<<OC_BIT_SCALE;
1850 oc_mode_set_cost(_modec,_enc->lambda);
/*Analysis stage for an INTER frame: chooses a coding mode and MVs for every
   macro block, codes the frame, and tracks running inter vs. intra bit
   estimates.
   Returns nonzero if _allow_keyframe is set and the estimates indicate the
   frame should be re-coded as a key frame instead.*/
1853 int oc_enc_analyze_inter(oc_enc_ctx *_enc,int _allow_keyframe,int _recode){
1854 oc_set_chroma_mvs_func set_chroma_mvs;
1855 oc_enc_pipeline_state pipe;
1856 oc_qii_state intra_luma_qs;
1859 ogg_int64_t interbits;
1860 ogg_int64_t intrabits;
1861 const unsigned char *map_idxs;
1863 unsigned *coded_mbis;
1864 unsigned *uncoded_mbis;
1866 size_t nuncoded_mbis;
1867 oc_sb_flags *sb_flags;
1868 signed char *mb_modes;
1869 const oc_sb_map *sb_maps;
1870 const oc_mb_map *mb_maps;
1871 oc_mb_enc_info *embs;
1875 unsigned stripe_sby;
1884 set_chroma_mvs=OC_SET_CHROMA_MVS_TABLE[_enc->state.info.pixel_fmt];
1885 _enc->state.frame_type=OC_INTER_FRAME;
1886 oc_mode_scheme_chooser_reset(&_enc->chooser);
1887 oc_enc_tokenize_start(_enc);
1888 oc_enc_pipeline_init(_enc,&pipe);
1889 if(_allow_keyframe)oc_qii_state_init(&intra_luma_qs);
1890 _enc->mv_bits[0]=_enc->mv_bits[1]=0;
1891 interbits=intrabits=0;
1892 last_mv[0]=last_mv[1]=prior_mv[0]=prior_mv[1]=0;
1893 /*Choose MVs and MB modes and quantize and code luma.
1894 Must be done in Hilbert order.*/
1895 map_idxs=OC_MB_MAP_IDXS[_enc->state.info.pixel_fmt];
1896 nmap_idxs=OC_MB_MAP_NIDXS[_enc->state.info.pixel_fmt];
1897 qi=_enc->state.qis[0];
/*Uncoded MBs are collected at the far end of the coded MB list, growing
     backwards.*/
1898 coded_mbis=_enc->coded_mbis;
1899 uncoded_mbis=coded_mbis+_enc->state.nmbs;
1902 _enc->state.ncoded_fragis[0]=0;
1903 _enc->state.ncoded_fragis[1]=0;
1904 _enc->state.ncoded_fragis[2]=0;
1905 sb_flags=_enc->state.sb_flags;
1906 mb_modes=_enc->state.mb_modes;
1907 sb_maps=(const oc_sb_map *)_enc->state.sb_maps;
1908 mb_maps=(const oc_mb_map *)_enc->state.mb_maps;
1910 frags=_enc->state.frags;
1911 frag_mvs=_enc->state.frag_mvs;
1912 vdec=!(_enc->state.info.pixel_fmt&2);
1915 mcu_nvsbs=_enc->mcu_nvsbs;
/*Process the frame one MCU stripe of superblocks at a time.*/
1916 for(stripe_sby=0;notdone;stripe_sby+=mcu_nvsbs){
1917 notdone=oc_enc_pipeline_set_stripe(_enc,&pipe,stripe_sby);
1918 sbi_end=pipe.sbi_end[0];
1919 for(sbi=pipe.sbi0[0];sbi<sbi_end;sbi++){
1921 /*Mode addressing is through Y plane, always 4 MB per SB.*/
1922 for(quadi=0;quadi<4;quadi++)if(sb_flags[sbi].quad_valid&1<<quadi){
1923 oc_mode_choice modes[8];
1924 unsigned skip_ssd[12];
1925 unsigned intra_satd[12];
1938 /*Motion estimation:
1939 We always do a basic 1MV search for all macroblocks, coded or not,
1941 if(!_recode&&_enc->sp_level<OC_SP_LEVEL_NOMC)oc_mcenc_search(_enc,mbi);
1943 /*Find the block choice with the lowest estimated coding cost.
1944 If a Cb or Cr block is coded but no Y' block from a macro block then
1945 the mode MUST be OC_MODE_INTER_NOMV.
1946 This is the default state to which the mode data structure is
1947 initialised in encoder and decoder at the start of each frame.*/
1948 /*Block coding cost is estimated from correlated SATD metrics.*/
1949 /*At this point, all blocks that are in frame are still marked coded.*/
1951 memcpy(embs[mbi].unref_mv,
1952 embs[mbi].analysis_mv[0],sizeof(embs[mbi].unref_mv));
1953 embs[mbi].refined=0;
1955 oc_mb_intra_satd(_enc,mbi,intra_satd);
1956 /*Estimate the cost of coding this MB in a keyframe.*/
1957 if(_allow_keyframe){
1958 oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi,
1959 pipe.fr+0,&intra_luma_qs,intra_satd,OC_NOSKIP);
1960 intrabits+=modes[OC_MODE_INTRA].rate;
1961 for(bi=0;bi<4;bi++){
1962 oc_qii_state_advance(&intra_luma_qs,&intra_luma_qs,
1963 modes[OC_MODE_INTRA].qii[bi]);
1966 /*Estimate the cost in a delta frame for various modes.*/
1967 oc_skip_cost(_enc,&pipe,mbi,skip_ssd);
1968 oc_cost_inter_nomv(_enc,modes+OC_MODE_INTER_NOMV,mbi,
1969 OC_MODE_INTER_NOMV,pipe.fr+0,pipe.qs+0,skip_ssd);
/*With motion compensation enabled, cost every candidate mode.*/
1970 if(_enc->sp_level<OC_SP_LEVEL_NOMC){
1971 oc_cost_intra(_enc,modes+OC_MODE_INTRA,mbi,
1972 pipe.fr+0,pipe.qs+0,intra_satd,skip_ssd);
1973 mb_mv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_INTER_MV,mbi,
1974 OC_MODE_INTER_MV,embs[mbi].unref_mv[OC_FRAME_PREV],
1975 pipe.fr+0,pipe.qs+0,skip_ssd);
1976 oc_cost_inter(_enc,modes+OC_MODE_INTER_MV_LAST,mbi,
1977 OC_MODE_INTER_MV_LAST,last_mv,pipe.fr+0,pipe.qs+0,skip_ssd);
1978 oc_cost_inter(_enc,modes+OC_MODE_INTER_MV_LAST2,mbi,
1979 OC_MODE_INTER_MV_LAST2,prior_mv,pipe.fr+0,pipe.qs+0,skip_ssd);
1980 oc_cost_inter4mv(_enc,modes+OC_MODE_INTER_MV_FOUR,mbi,
1981 embs[mbi].block_mv,pipe.fr+0,pipe.qs+0,skip_ssd);
1982 oc_cost_inter_nomv(_enc,modes+OC_MODE_GOLDEN_NOMV,mbi,
1983 OC_MODE_GOLDEN_NOMV,pipe.fr+0,pipe.qs+0,skip_ssd);
1984 mb_gmv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_GOLDEN_MV,mbi,
1985 OC_MODE_GOLDEN_MV,embs[mbi].unref_mv[OC_FRAME_GOLD],
1986 pipe.fr+0,pipe.qs+0,skip_ssd);
1987 /*The explicit MV modes (2,6,7) have not yet gone through halfpel
1989 We choose the explicit MV mode that's already furthest ahead on
1990 R-D cost and refine only that one.
1991 We have to be careful to remember which ones we've refined so that
1992 we don't refine it again if we re-encode this frame.*/
1993 inter_mv_pref=_enc->lambda*3;
1994 if(modes[OC_MODE_INTER_MV_FOUR].cost<modes[OC_MODE_INTER_MV].cost&&
1995 modes[OC_MODE_INTER_MV_FOUR].cost<modes[OC_MODE_GOLDEN_MV].cost){
1996 if(!(embs[mbi].refined&0x80)){
1997 oc_mcenc_refine4mv(_enc,mbi);
1998 embs[mbi].refined|=0x80;
2000 oc_cost_inter4mv(_enc,modes+OC_MODE_INTER_MV_FOUR,mbi,
2001 embs[mbi].ref_mv,pipe.fr+0,pipe.qs+0,skip_ssd);
2003 else if(modes[OC_MODE_GOLDEN_MV].cost+inter_mv_pref<
2004 modes[OC_MODE_INTER_MV].cost){
2005 if(!(embs[mbi].refined&0x40)){
2006 oc_mcenc_refine1mv(_enc,mbi,OC_FRAME_GOLD);
2007 embs[mbi].refined|=0x40;
2009 mb_gmv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_GOLDEN_MV,mbi,
2010 OC_MODE_GOLDEN_MV,embs[mbi].analysis_mv[0][OC_FRAME_GOLD],
2011 pipe.fr+0,pipe.qs+0,skip_ssd);
2013 if(!(embs[mbi].refined&0x04)){
2014 oc_mcenc_refine1mv(_enc,mbi,OC_FRAME_PREV);
2015 embs[mbi].refined|=0x04;
2017 mb_mv_bits_0=oc_cost_inter1mv(_enc,modes+OC_MODE_INTER_MV,mbi,
2018 OC_MODE_INTER_MV,embs[mbi].analysis_mv[0][OC_FRAME_PREV],
2019 pipe.fr+0,pipe.qs+0,skip_ssd);
2020 /*Finally, pick the mode with the cheapest estimated R-D cost.*/
2021 mb_mode=OC_MODE_INTER_NOMV;
2022 if(modes[OC_MODE_INTRA].cost<modes[OC_MODE_INTER_NOMV].cost){
2023 mb_mode=OC_MODE_INTRA;
2025 if(modes[OC_MODE_INTER_MV_LAST].cost<modes[mb_mode].cost){
2026 mb_mode=OC_MODE_INTER_MV_LAST;
2028 if(modes[OC_MODE_INTER_MV_LAST2].cost<modes[mb_mode].cost){
2029 mb_mode=OC_MODE_INTER_MV_LAST2;
2031 if(modes[OC_MODE_GOLDEN_NOMV].cost<modes[mb_mode].cost){
2032 mb_mode=OC_MODE_GOLDEN_NOMV;
2034 if(modes[OC_MODE_GOLDEN_MV].cost<modes[mb_mode].cost){
2035 mb_mode=OC_MODE_GOLDEN_MV;
2037 if(modes[OC_MODE_INTER_MV_FOUR].cost<modes[mb_mode].cost){
2038 mb_mode=OC_MODE_INTER_MV_FOUR;
2040 /*We prefer OC_MODE_INTER_MV, but not over LAST and LAST2.*/
2041 if(mb_mode==OC_MODE_INTER_MV_LAST||mb_mode==OC_MODE_INTER_MV_LAST2){
2044 if(modes[OC_MODE_INTER_MV].cost<modes[mb_mode].cost+inter_mv_pref){
2045 mb_mode=OC_MODE_INTER_MV;
/*Motion compensation disabled: only NOMV modes and INTRA compete.*/
2049 oc_cost_inter_nomv(_enc,modes+OC_MODE_GOLDEN_NOMV,mbi,
2050 OC_MODE_GOLDEN_NOMV,pipe.fr+0,pipe.qs+0,skip_ssd);
2051 mb_mode=OC_MODE_INTER_NOMV;
2052 if(modes[OC_MODE_INTRA].cost<modes[OC_MODE_INTER_NOMV].cost){
2053 mb_mode=OC_MODE_INTRA;
2055 if(modes[OC_MODE_GOLDEN_NOMV].cost<modes[mb_mode].cost){
2056 mb_mode=OC_MODE_GOLDEN_NOMV;
2058 mb_mv_bits_0=mb_gmv_bits_0=0;
2060 mb_modes[mbi]=mb_mode;
2061 /*Propagate the MVs to the luma blocks.*/
2062 if(mb_mode!=OC_MODE_INTER_MV_FOUR){
2064 case OC_MODE_INTER_MV:{
2065 dx=embs[mbi].analysis_mv[0][OC_FRAME_PREV][0];
2066 dy=embs[mbi].analysis_mv[0][OC_FRAME_PREV][1];
2068 case OC_MODE_INTER_MV_LAST:{
2072 case OC_MODE_INTER_MV_LAST2:{
2076 case OC_MODE_GOLDEN_MV:{
2077 dx=embs[mbi].analysis_mv[0][OC_FRAME_GOLD][0];
2078 dy=embs[mbi].analysis_mv[0][OC_FRAME_GOLD][1];
2081 for(bi=0;bi<4;bi++){
2082 fragi=mb_maps[mbi][0][bi];
2083 frag_mvs[fragi][0]=(signed char)dx;
2084 frag_mvs[fragi][1]=(signed char)dy;
/*Install the chosen per-block quantizers before coding luma.*/
2087 for(bi=0;bi<4;bi++){
2088 fragi=sb_maps[mbi>>2][mbi&3][bi];
2089 frags[fragi].qii=modes[mb_mode].qii[bi];
/*Code the luma blocks; a positive return means at least one block of
         this MB was actually coded.*/
2091 if(oc_enc_mb_transform_quantize_luma(_enc,&pipe,mbi,
2092 modes[mb_mode].overhead>>OC_BIT_SCALE)>0){
2094 orig_mb_mode=mb_mode;
2095 mb_mode=mb_modes[mbi];
/*Update the LAST/PRIOR MV history and the MV bit counters according
         to the final mode.*/
2097 case OC_MODE_INTER_MV:{
2098 memcpy(prior_mv,last_mv,sizeof(prior_mv));
2099 /*If we're backing out from 4MV, find the MV we're actually
2101 if(orig_mb_mode==OC_MODE_INTER_MV_FOUR){
2103 fragi=mb_maps[mbi][0][bi];
2104 if(frags[fragi].coded){
2105 memcpy(last_mv,frag_mvs[fragi],sizeof(last_mv));
2106 dx=frag_mvs[fragi][0];
2107 dy=frag_mvs[fragi][1];
2111 mb_mv_bits_0=OC_MV_BITS[0][dx+31]+OC_MV_BITS[0][dy+31];
2113 /*Otherwise we used the original analysis MV.*/
2116 embs[mbi].analysis_mv[0][OC_FRAME_PREV],sizeof(last_mv));
2118 _enc->mv_bits[0]+=mb_mv_bits_0;
2119 _enc->mv_bits[1]+=12;
2121 case OC_MODE_INTER_MV_LAST2:{
/*LAST2 swaps the roles of last_mv and prior_mv.*/
2123 memcpy(tmp_mv,prior_mv,sizeof(tmp_mv));
2124 memcpy(prior_mv,last_mv,sizeof(prior_mv));
2125 memcpy(last_mv,tmp_mv,sizeof(last_mv));
2127 case OC_MODE_GOLDEN_MV:{
2128 _enc->mv_bits[0]+=mb_gmv_bits_0;
2129 _enc->mv_bits[1]+=12;
2131 case OC_MODE_INTER_MV_FOUR:{
2134 memcpy(prior_mv,last_mv,sizeof(prior_mv));
2135 for(bi=0;bi<4;bi++){
2136 fragi=mb_maps[mbi][0][bi];
2137 if(frags[fragi].coded){
2138 memcpy(last_mv,frag_mvs[fragi],sizeof(last_mv));
2139 memcpy(lbmvs[bi],frag_mvs[fragi],sizeof(lbmvs[bi]));
2140 _enc->mv_bits[0]+=OC_MV_BITS[0][frag_mvs[fragi][0]+31]
2141 +OC_MV_BITS[0][frag_mvs[fragi][1]+31];
2142 _enc->mv_bits[1]+=12;
2144 /*Replace the block MVs for not-coded blocks with (0,0).*/
2145 else memset(lbmvs[bi],0,sizeof(lbmvs[bi]));
/*4MV propagates its MVs to chroma immediately, since each chroma
             block's MV is derived from the individual luma block MVs.*/
2147 (*set_chroma_mvs)(cbmvs,(const oc_mv *)lbmvs);
2148 for(mapii=4;mapii<nmap_idxs;mapii++){
2149 mapi=map_idxs[mapii];
2152 fragi=mb_maps[mbi][pli][bi];
2153 frags[fragi].mb_mode=mb_mode;
2154 frags[fragi].qii=modes[OC_MODE_INTER_MV_FOUR].qii[mapii];
2155 memcpy(frag_mvs[fragi],cbmvs[bi],sizeof(frag_mvs[fragi]));
2159 coded_mbis[ncoded_mbis++]=mbi;
2160 oc_mode_scheme_chooser_update(&_enc->chooser,mb_mode);
2161 interbits+=modes[mb_mode].rate+modes[mb_mode].overhead;
/*Nothing was coded: record the MB as uncoded and fall back to
         INTER_NOMV for the propagated state.*/
2164 *(uncoded_mbis-++nuncoded_mbis)=mbi;
2165 mb_mode=OC_MODE_INTER_NOMV;
2168 /*Propagate final MB mode and MVs to the chroma blocks.
2169 This has already been done for 4MV mode, since it requires individual
2170 block motion vectors.*/
2171 if(mb_mode!=OC_MODE_INTER_MV_FOUR){
2172 for(mapii=4;mapii<nmap_idxs;mapii++){
2173 mapi=map_idxs[mapii];
2176 fragi=mb_maps[mbi][pli][bi];
2177 frags[fragi].mb_mode=mb_mode;
2178 /*If we switched from 4MV mode to INTER_MV mode, then the qii
2179 values won't have been chosen with the right MV, but it's
2180 probaby not worth re-estimating them.*/
2181 frags[fragi].qii=modes[mb_mode].qii[mapii];
2182 frag_mvs[fragi][0]=(signed char)dx;
2183 frag_mvs[fragi][1]=(signed char)dy;
2187 oc_fr_state_flush_sb(pipe.fr+0);
2188 sb_flags[sbi].coded_fully=pipe.fr[0].sb_full;
2189 sb_flags[sbi].coded_partially=pipe.fr[0].sb_partial;
2191 oc_enc_pipeline_finish_mcu_plane(_enc,&pipe,0,notstart,notdone);
2192 /*Code chroma planes.*/
2193 for(pli=1;pli<3;pli++){
2194 oc_enc_sb_transform_quantize_chroma(_enc,&pipe,
2195 pli,pipe.sbi0[pli],pipe.sbi_end[pli]);
2196 oc_enc_pipeline_finish_mcu_plane(_enc,&pipe,pli,notstart,notdone);
2200 /*Finish filling in the reference frame borders.*/
2201 refi=_enc->state.ref_frame_idx[OC_FRAME_SELF];
2202 for(pli=0;pli<3;pli++)oc_state_borders_fill_caps(&_enc->state,refi,pli);
2203 /*Finish adding flagging overhead costs to inter bit counts to determine if
2204 we should have coded a key frame instead.*/
2205 if(_allow_keyframe){
2206 if(interbits>intrabits)return 1;
2207 /*Technically the chroma plane counts are over-estimations, because they
2208 don't account for continuing runs from the luma planes, but the
2209 inaccuracy is small.*/
2210 for(pli=0;pli<3;pli++)interbits+=pipe.fr[pli].bits<<OC_BIT_SCALE;
2211 interbits+=OC_MINI(_enc->mv_bits[0],_enc->mv_bits[1])<<OC_BIT_SCALE;
2213 _enc->chooser.scheme_bits[_enc->chooser.scheme_list[0]]<<OC_BIT_SCALE;
2214 if(interbits>intrabits)return 1;
2216 _enc->ncoded_mbis=ncoded_mbis;
2217 /*Compact the coded fragment list.*/
/*Move each chroma plane's coded fragment list down so all three planes
     form one contiguous list.*/
2219 ptrdiff_t ncoded_fragis;
2220 ncoded_fragis=_enc->state.ncoded_fragis[0];
2221 for(pli=1;pli<3;pli++){
2222 memmove(_enc->state.coded_fragis+ncoded_fragis,
2223 _enc->state.coded_fragis+_enc->state.fplanes[pli].froffset,
2224 _enc->state.ncoded_fragis[pli]*sizeof(*_enc->state.coded_fragis));
2225 ncoded_fragis+=_enc->state.ncoded_fragis[pli];
2227 _enc->state.ntotal_coded_fragis=ncoded_fragis;
2232 #if defined(OC_COLLECT_METRICS)
2236 /*TODO: It may be helpful (for block-level quantizers especially) to separate
2237 out the contributions from AC and DC into separate tables.*/
/*Minimum accumulated fragment weight a SAD bin needs before its statistics
   are trusted when fitting the R-D model (see the bin-merging loops in
   oc_enc_mode_metrics_update below).*/
2239 # define OC_ZWEIGHT (0.25)
/*Add one weighted (SATD, rate, RMSE) observation to a metrics accumulator,
   maintaining weighted means and second (co)moments incrementally
   (Welford/West-style update, numerically stable for streaming data).*/
2241 static void oc_mode_metrics_add(oc_mode_metrics *_metrics,
2242 double _w,int _satd,int _rate,double _rmse){
2244 /*Accumulate statistics without the scaling; this lets us change the scale
2245 factor yet still use old data.*/
2246 rate=ldexp(_rate,-OC_BIT_SCALE);
2247 if(_metrics->fragw>0){
/*Deviations of the new sample from the current weighted means.*/
2252 dsatd=_satd-_metrics->satd/_metrics->fragw;
2253 drate=rate-_metrics->rate/_metrics->fragw;
2254 drmse=_rmse-_metrics->rmse/_metrics->fragw;
/*Effective weight for the cross-moment update.*/
2255 w=_metrics->fragw*_w/(_metrics->fragw+_w);
2256 _metrics->satd2+=dsatd*dsatd*w;
2257 _metrics->satdrate+=dsatd*drate*w;
2258 _metrics->rate2+=drate*drate*w;
2259 _metrics->satdrmse+=dsatd*drmse*w;
2260 _metrics->rmse2+=drmse*drmse*w;
/*First-moment (weighted sum) updates.*/
2262 _metrics->fragw+=_w;
2263 _metrics->satd+=_satd*_w;
2264 _metrics->rate+=rate*_w;
2265 _metrics->rmse+=_rmse*_w;
/*Merge _n metrics accumulators into _dst, combining weighted sums and
   second (co)moments with the standard parallel-merge correction terms.*/
2268 static void oc_mode_metrics_merge(oc_mode_metrics *_dst,
2269 const oc_mode_metrics *_src,int _n){
2271 /*Find a non-empty set of metrics.*/
2272 for(i=0;i<_n&&_src[i].fragw<=0;i++);
/*All sets empty: result is the zero accumulator.*/
2274 memset(_dst,0,sizeof(*_dst));
/*Seed the destination with the first non-empty set.*/
2277 memcpy(_dst,_src+i,sizeof(*_dst));
2278 /*And iterate over the remaining non-empty sets of metrics.*/
2279 for(i++;i<_n;i++)if(_src[i].fragw>0){
/*Difference of the two sets' weighted means.*/
2288 dsatd=_src[i].satd/wb-_dst->satd/wa;
2289 drate=_src[i].rate/wb-_dst->rate/wa;
2290 drmse=_src[i].rmse/wb-_dst->rmse/wa;
2292 _dst->fragw+=_src[i].fragw;
2293 _dst->satd+=_src[i].satd;
2294 _dst->rate+=_src[i].rate;
2295 _dst->rmse+=_src[i].rmse;
/*Second moments add directly plus a mean-shift correction term.*/
2296 _dst->satd2+=_src[i].satd2+dsatd*dsatd*w;
2297 _dst->satdrate+=_src[i].satdrate+dsatd*drate*w;
2298 _dst->rate2+=_src[i].rate2+drate*drate*w;
2299 _dst->satdrmse+=_src[i].satdrmse+dsatd*drmse*w;
2300 _dst->rmse2+=_src[i].rmse2+drmse*drmse*w;
2304 /*Compile collected SATD/rate/RMSE metrics into a form that's immediately
2305 useful for mode decision.*/
2306 static void oc_enc_mode_metrics_update(oc_enc_ctx *_enc,int _qi){
2309 oc_restore_fpu(&_enc->state);
2310 /*Convert raw collected data into cleaned up sample points.*/
2311 for(pli=0;pli<3;pli++){
2312 for(qti=0;qti<2;qti++){
2319 for(bin=0;bin<OC_SAD_BINS;bin++){
2320 oc_mode_metrics metrics;
2321 OC_MODE_RD[_qi][pli][qti][bin].rate=0;
2322 OC_MODE_RD[_qi][pli][qti][bin].rmse=0;
2323 /*Find some points on either side of the current bin.*/
2324 while((bin1<bin+1||fragw<OC_ZWEIGHT)&&bin1<OC_SAD_BINS-1){
2325 fragw+=OC_MODE_METRICS[_qi][pli][qti][bin1++].fragw;
2327 while(bin0+1<bin&&bin0+1<bin1&&
2328 fragw-OC_MODE_METRICS[_qi][pli][qti][bin0].fragw>=OC_ZWEIGHT){
2329 fragw-=OC_MODE_METRICS[_qi][pli][qti][bin0++].fragw;
2331 /*Merge statistics and fit lines.*/
2332 oc_mode_metrics_merge(&metrics,
2333 OC_MODE_METRICS[_qi][pli][qti]+bin0,bin1-bin0);
2334 if(metrics.fragw>0&&metrics.satd2>0){
2342 msatd=metrics.satd/metrics.fragw;
2343 mrate=metrics.rate/metrics.fragw;
2344 mrmse=metrics.rmse/metrics.fragw;
2345 /*Compute the points on these lines corresponding to the actual bin
2347 b=metrics.satdrate/metrics.satd2;
2349 rate=ldexp(a+b*(bin<<OC_SAD_SHIFT),OC_BIT_SCALE);
2350 OC_MODE_RD[_qi][pli][qti][bin].rate=
2351 (ogg_int16_t)OC_CLAMPI(-32768,(int)(rate+0.5),32767);
2352 b=metrics.satdrmse/metrics.satd2;
2354 rmse=ldexp(a+b*(bin<<OC_SAD_SHIFT),OC_RMSE_SCALE);
2355 OC_MODE_RD[_qi][pli][qti][bin].rmse=
2356 (ogg_int16_t)OC_CLAMPI(-32768,(int)(rmse+0.5),32767);
2365 /*The following token skipping code used to also be used in the decoder (and
2366 even at one point other places in the encoder).
2367 However, it was obsoleted by other optimizations, and is now only used here.
2368 It has been moved here to avoid generating the code when it's not needed.*/
2370 /*Determines the number of blocks or coefficients to be skipped for a given
2372 _token: The token value to skip.
2373 _extra_bits: The extra bits attached to this token.
2374 Return: A positive value indicates that number of coefficients are to be
2375 skipped in the current block.
2376 Otherwise, the negative of the return value indicates that number of
2377 blocks are to be ended.*/
2378 typedef ptrdiff_t (*oc_token_skip_func)(int _token,int _extra_bits);
/*Handles the simple end of block tokens.
  The per-token base run length comes from a packed nibble table (yielding
   1, 2, 3, 4, 8, or 16 after the +1); any extra bits extend the run.*/
static ptrdiff_t oc_token_skip_eob(int _token,int _extra_bits){
  int nblocks;
  nblocks=OC_UNIBBLE_TABLE32(0,1,2,3,7,15,0,0,_token)+1;
  /*Negative return value: that many whole blocks are ended.*/
  return -_extra_bits-nblocks;
}
/*The last EOB token has a special case, where an EOB run of size zero ends all
   the remaining blocks in the frame.*/
static ptrdiff_t oc_token_skip_eob6(int _token,int _extra_bits){
  /*Note: We want to return -PTRDIFF_MAX, but that requires C99, which is not
     yet available everywhere; this expression computes the same value.*/
  return _extra_bits?-_extra_bits:-(ptrdiff_t)(~(size_t)0>>1);
}
/*Handles the pure zero run tokens.
  The extra bits store the run length minus one, so one is added back to get
   the number of coefficients skipped.*/
static ptrdiff_t oc_token_skip_zrl(int _token,int _extra_bits){
  return (ptrdiff_t)_extra_bits+1;
}
/*Handles a normal coefficient value token.
  Exactly one coefficient is coded, so exactly one is skipped.*/
static ptrdiff_t oc_token_skip_val(void){
  return 1;
}
2406 /*Handles a category 1A zero run/coefficient value combo token.*/
2407 static ptrdiff_t oc_token_skip_run_cat1a(int _token){
2408 return _token-OC_DCT_RUN_CAT1A+2;
2411 /*Handles category 1b, 1c, 2a, and 2b zero run/coefficient value combo tokens.*/
2412 static ptrdiff_t oc_token_skip_run(int _token,int _extra_bits){
2416 run_cati=_token-OC_DCT_RUN_CAT1B;
2417 ncoeffs_mask=OC_BYTE_TABLE32(3,7,0,1,run_cati);
2418 ncoeffs_adjust=OC_BYTE_TABLE32(7,11,2,3,run_cati);
2419 return (_extra_bits&ncoeffs_mask)+ncoeffs_adjust;
2422 /*A jump table for computing the number of coefficients or blocks to skip for
2423 a given token value.
2424 This reduces all the conditional branches, etc., needed to parse these token
2425 values down to one indirect jump.*/
2426 static const oc_token_skip_func OC_TOKEN_SKIP_TABLE[TH_NDCT_TOKENS]={
2436 (oc_token_skip_func)oc_token_skip_val,
2437 (oc_token_skip_func)oc_token_skip_val,
2438 (oc_token_skip_func)oc_token_skip_val,
2439 (oc_token_skip_func)oc_token_skip_val,
2440 (oc_token_skip_func)oc_token_skip_val,
2441 (oc_token_skip_func)oc_token_skip_val,
2442 (oc_token_skip_func)oc_token_skip_val,
2443 (oc_token_skip_func)oc_token_skip_val,
2444 (oc_token_skip_func)oc_token_skip_val,
2445 (oc_token_skip_func)oc_token_skip_val,
2446 (oc_token_skip_func)oc_token_skip_val,
2447 (oc_token_skip_func)oc_token_skip_val,
2448 (oc_token_skip_func)oc_token_skip_val,
2449 (oc_token_skip_func)oc_token_skip_val,
2450 (oc_token_skip_func)oc_token_skip_run_cat1a,
2451 (oc_token_skip_func)oc_token_skip_run_cat1a,
2452 (oc_token_skip_func)oc_token_skip_run_cat1a,
2453 (oc_token_skip_func)oc_token_skip_run_cat1a,
2454 (oc_token_skip_func)oc_token_skip_run_cat1a,
2461 /*Determines the number of blocks or coefficients to be skipped for a given
2463 _token: The token value to skip.
2464 _extra_bits: The extra bits attached to this token.
2465 Return: A positive value indicates that number of coefficients are to be
2466 skipped in the current block.
2467 Otherwise, the negative of the return value indicates that number of
2468 blocks are to be ended.
2469 0 will never be returned, so that at least one coefficient in one
2470 block will always be decoded for every token.*/
2471 static ptrdiff_t oc_dct_token_skip(int _token,int _extra_bits){
2472 return (*OC_TOKEN_SKIP_TABLE[_token])(_token,_extra_bits);
2477 void oc_enc_mode_metrics_collect(oc_enc_ctx *_enc){
2478 static const unsigned char OC_ZZI_HUFF_OFFSET[64]={
2479 0,16,16,16,16,16,32,32,
2480 32,32,32,32,32,32,32,48,
2481 48,48,48,48,48,48,48,48,
2482 48,48,48,48,64,64,64,64,
2483 64,64,64,64,64,64,64,64,
2484 64,64,64,64,64,64,64,64,
2485 64,64,64,64,64,64,64,64
2487 const oc_fragment *frags;
2488 const unsigned *frag_satd;
2489 const unsigned *frag_ssd;
2490 const ptrdiff_t *coded_fragis;
2491 ptrdiff_t ncoded_fragis;
2501 oc_restore_fpu(&_enc->state);
2502 /*Load any existing mode metrics if we haven't already.*/
2503 if(!oc_has_mode_metrics){
2505 memset(OC_MODE_METRICS,0,sizeof(OC_MODE_METRICS));
2506 fmetrics=fopen("modedec.stats","rb");
2508 fread(OC_MODE_METRICS,sizeof(OC_MODE_METRICS),1,fmetrics);
2511 for(qi=0;qi<64;qi++)oc_enc_mode_metrics_update(_enc,qi);
2512 oc_has_mode_metrics=1;
2514 qti=_enc->state.frame_type;
2515 frags=_enc->state.frags;
2516 frag_satd=_enc->frag_satd;
2517 frag_ssd=_enc->frag_ssd;
2518 coded_fragis=_enc->state.coded_fragis;
2519 ncoded_fragis=fragii=0;
2520 /*Weight the fragments by the inverse frame size; this prevents HD content
2521 from dominating the statistics.*/
2522 fragw=1.0/_enc->state.nfrags;
2523 for(pli=0;pli<3;pli++){
2527 /*Set up token indices and eob run counts.
2528 We don't bother trying to figure out the real cost of the runs that span
2529 coefficients; instead we use the costs that were available when R-D
2530 token optimization was done.*/
2531 for(zzi=0;zzi<64;zzi++){
2532 ti[zzi]=_enc->dct_token_offs[pli][zzi];
2534 token=_enc->dct_tokens[pli][zzi][0];
2535 eb=_enc->extra_bits[pli][zzi][0];
2536 eob_token[zzi]=token;
2537 eob_run[zzi]=-oc_dct_token_skip(token,eb);
2540 eob_token[zzi]=OC_NDCT_EOB_TOKEN_MAX;
2544 /*Scan the list of coded fragments for this plane.*/
2545 ncoded_fragis+=_enc->state.ncoded_fragis[pli];
2546 for(;fragii<ncoded_fragis;fragii++){
2548 ogg_uint32_t frag_bits;
2554 fragi=coded_fragis[fragii];
2558 /*We've reached the end of the block.*/
2562 huffi=_enc->huff_idxs[qti][zzi>0][pli+1>>1]
2563 +OC_ZZI_HUFF_OFFSET[zzi];
2564 if(eob_token[zzi]<OC_NDCT_EOB_TOKEN_MAX){
2565 /*This token caused an EOB run to be flushed.
2566 Therefore it gets the bits associated with it.*/
2567 frag_bits+=_enc->huff_codes[huffi][eob_token[zzi]].nbits
2568 +OC_DCT_TOKEN_EXTRA_BITS[eob_token[zzi]];
2569 eob_token[zzi]=OC_NDCT_EOB_TOKEN_MAX;
2571 token=_enc->dct_tokens[pli][zzi][ti[zzi]];
2572 eb=_enc->extra_bits[pli][zzi][ti[zzi]];
2574 skip=oc_dct_token_skip(token,eb);
2576 eob_token[zzi]=token;
2580 /*A regular DCT value token; accumulate the bits for it.*/
2581 frag_bits+=_enc->huff_codes[huffi][token].nbits
2582 +OC_DCT_TOKEN_EXTRA_BITS[token];
2586 mb_mode=frags[fragi].mb_mode;
2587 qi=_enc->state.qis[frags[fragi].qii];
2588 satd=frag_satd[fragi]<<(pli+1&2);
2589 bin=OC_MINI(satd>>OC_SAD_SHIFT,OC_SAD_BINS-1);
2590 oc_mode_metrics_add(OC_MODE_METRICS[qi][pli][mb_mode!=OC_MODE_INTRA]+bin,
2591 fragw,satd,frag_bits<<OC_BIT_SCALE,sqrt(frag_ssd[fragi]));
2594 /*Update global SATD/rate/RMSE estimation matrix.*/
2595 for(qii=0;qii<_enc->state.nqis;qii++){
2596 oc_enc_mode_metrics_update(_enc,_enc->state.qis[qii]);
2600 void oc_enc_mode_metrics_dump(oc_enc_ctx *_enc){
2603 /*Generate sample points for complete list of QI values.*/
2604 for(qi=0;qi<64;qi++)oc_enc_mode_metrics_update(_enc,qi);
2605 fmetrics=fopen("modedec.stats","wb");
2607 fwrite(OC_MODE_METRICS,sizeof(OC_MODE_METRICS),1,fmetrics);
2611 "/*File generated by libtheora with OC_COLLECT_METRICS"
2612 " defined at compile time.*/\n"
2613 "#if !defined(_modedec_H)\n"
2614 "# define _modedec_H (1)\n"
2618 "# if defined(OC_COLLECT_METRICS)\n"
2619 "typedef struct oc_mode_metrics oc_mode_metrics;\n"
2621 "typedef struct oc_mode_rd oc_mode_rd;\n"
2625 "/*The number of extra bits of precision at which to store rate"
2627 "# define OC_BIT_SCALE (%i)\n"
2628 "/*The number of extra bits of precision at which to store RMSE metrics.\n"
2629 " This must be at least half OC_BIT_SCALE (rounded up).*/\n"
2630 "# define OC_RMSE_SCALE (%i)\n"
2631 "/*The number of bins to partition statistics into.*/\n"
2632 "# define OC_SAD_BINS (%i)\n"
2633 "/*The number of bits of precision to drop"
2634 " from SAD scores to assign them to a\n"
2636 "# define OC_SAD_SHIFT (%i)\n"
2640 "# if defined(OC_COLLECT_METRICS)\n"
2641 "struct oc_mode_metrics{\n"
2647 " double satdrate;\n"
2649 " double satdrmse;\n"
2654 "int oc_has_mode_metrics;\n"
2655 "oc_mode_metrics OC_MODE_METRICS[64][3][2][OC_SAD_BINS];\n"
2660 "struct oc_mode_rd{\n"
2661 " ogg_int16_t rate;\n"
2662 " ogg_int16_t rmse;\n"
2666 "# if !defined(OC_COLLECT_METRICS)\n"
2669 "oc_mode_rd OC_MODE_RD[64][3][2][OC_SAD_BINS]={\n",
2670 OC_BIT_SCALE,OC_RMSE_SCALE,OC_SAD_BINS,OC_SAD_SHIFT);
2671 for(qi=0;qi<64;qi++){
2673 fprintf(stdout," {\n");
2674 for(pli=0;pli<3;pli++){
2676 fprintf(stdout," {\n");
2677 for(qti=0;qti<2;qti++){
2679 static const char *pl_names[3]={"Y'","Cb","Cr"};
2680 static const char *qti_names[2]={"INTRA","INTER"};
2681 fprintf(stdout," /*%s qi=%i %s*/\n",
2682 pl_names[pli],qi,qti_names[qti]);
2683 fprintf(stdout," {\n");
2684 fprintf(stdout," ");
2685 for(bin=0;bin<OC_SAD_BINS;bin++){
2686 if(bin&&!(bin&0x3))fprintf(stdout,"\n ");
2687 fprintf(stdout,"{%5i,%5i}",
2688 OC_MODE_RD[qi][pli][qti][bin].rate,
2689 OC_MODE_RD[qi][pli][qti][bin].rmse);
2690 if(bin+1<OC_SAD_BINS)fprintf(stdout,",");
2692 fprintf(stdout,"\n }");
2693 if(qti<1)fprintf(stdout,",");
2694 fprintf(stdout,"\n");
2696 fprintf(stdout," }");
2697 if(pli<2)fprintf(stdout,",");
2698 fprintf(stdout,"\n");
2700 fprintf(stdout," }");
2701 if(qi<63)fprintf(stdout,",");
2702 fprintf(stdout,"\n");