lib/idct.c

   1 /********************************************************************
   2  *                                                                  *
   3  * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
   4  * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
   5  * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
   6  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
   7  *                                                                  *
   8  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
   9  * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
  10  *                                                                  *
  11  ********************************************************************
  12
  13   function:
  14     last mod: $Id: idct.c 16503 2009-08-22 18:14:02Z giles $
  15
  16  ********************************************************************/
  17
  18 #include <string.h>
  19 #include "internal.h"
  20 #include "dct.h"
  21
  22 /*Performs an inverse 8 point Type-II DCT transform.
  23   The output is scaled by a factor of 2 relative to the orthonormal version of
  24    the transform.
  25   _y: The buffer to store the result in.
  26       Data will be placed in every 8th entry (e.g., in a column of an 8x8
  27        block).
  28   _x: The input coefficients.
  29       The first 8 entries are used (e.g., from a row of an 8x8 block).*/
  30 static void idct8(ogg_int16_t *_y,const ogg_int16_t _x[8]){
  31   ogg_int32_t t[8];
  32   ogg_int32_t r;
  33   /*Stage 1:*/
  34   /*0-1 butterfly.*/
  35   t[0]=OC_C4S4*(ogg_int16_t)(_x[0]+_x[4])>>16;
  36   t[1]=OC_C4S4*(ogg_int16_t)(_x[0]-_x[4])>>16;
  37   /*2-3 rotation by 6pi/16.*/
  38   t[2]=(OC_C6S2*_x[2]>>16)-(OC_C2S6*_x[6]>>16);
  39   t[3]=(OC_C2S6*_x[2]>>16)+(OC_C6S2*_x[6]>>16);
  40   /*4-7 rotation by 7pi/16.*/
  41   t[4]=(OC_C7S1*_x[1]>>16)-(OC_C1S7*_x[7]>>16);
  42   /*5-6 rotation by 3pi/16.*/
  43   t[5]=(OC_C3S5*_x[5]>>16)-(OC_C5S3*_x[3]>>16);
  44   t[6]=(OC_C5S3*_x[5]>>16)+(OC_C3S5*_x[3]>>16);
  45   t[7]=(OC_C1S7*_x[1]>>16)+(OC_C7S1*_x[7]>>16);
  46   /*Stage 2:*/
  47   /*4-5 butterfly.*/
  48   r=t[4]+t[5];
  49   t[5]=OC_C4S4*(ogg_int16_t)(t[4]-t[5])>>16;
  50   t[4]=r;
  51   /*7-6 butterfly.*/
  52   r=t[7]+t[6];
  53   t[6]=OC_C4S4*(ogg_int16_t)(t[7]-t[6])>>16;
  54   t[7]=r;
  55   /*Stage 3:*/
  56   /*0-3 butterfly.*/
  57   r=t[0]+t[3];
  58   t[3]=t[0]-t[3];
  59   t[0]=r;
  60   /*1-2 butterfly.*/
  61   r=t[1]+t[2];
  62   t[2]=t[1]-t[2];
  63   t[1]=r;
  64   /*6-5 butterfly.*/
  65   r=t[6]+t[5];
  66   t[5]=t[6]-t[5];
  67   t[6]=r;
  68   /*Stage 4:*/
  69   /*0-7 butterfly.*/
  70   _y[0<<3]=(ogg_int16_t)(t[0]+t[7]);
  71   /*1-6 butterfly.*/
  72   _y[1<<3]=(ogg_int16_t)(t[1]+t[6]);
  73   /*2-5 butterfly.*/
  74   _y[2<<3]=(ogg_int16_t)(t[2]+t[5]);
  75   /*3-4 butterfly.*/
  76   _y[3<<3]=(ogg_int16_t)(t[3]+t[4]);
  77   _y[4<<3]=(ogg_int16_t)(t[3]-t[4]);
  78   _y[5<<3]=(ogg_int16_t)(t[2]-t[5]);
  79   _y[6<<3]=(ogg_int16_t)(t[1]-t[6]);
  80   _y[7<<3]=(ogg_int16_t)(t[0]-t[7]);
  81 }
  82
  83 /*Performs an inverse 8 point Type-II DCT transform.
  84   The output is scaled by a factor of 2 relative to the orthonormal version of
  85    the transform.
  86   _y: The buffer to store the result in.
  87       Data will be placed in every 8th entry (e.g., in a column of an 8x8
  88        block).
  89   _x: The input coefficients.
  90       Only the first 4 entries are used.
  91       The other 4 are assumed to be 0.*/
  92 static void idct8_4(ogg_int16_t *_y,const ogg_int16_t _x[8]){
  93   ogg_int32_t t[8];
  94   ogg_int32_t r;
  95   /*Stage 1:*/
  96   t[0]=OC_C4S4*_x[0]>>16;
  97   t[2]=OC_C6S2*_x[2]>>16;
  98   t[3]=OC_C2S6*_x[2]>>16;
  99   t[4]=OC_C7S1*_x[1]>>16;
 100   t[5]=-(OC_C5S3*_x[3]>>16);
 101   t[6]=OC_C3S5*_x[3]>>16;
 102   t[7]=OC_C1S7*_x[1]>>16;
 103   /*Stage 2:*/
 104   r=t[4]+t[5];
 105   t[5]=OC_C4S4*(ogg_int16_t)(t[4]-t[5])>>16;
 106   t[4]=r;
 107   r=t[7]+t[6];
 108   t[6]=OC_C4S4*(ogg_int16_t)(t[7]-t[6])>>16;
 109   t[7]=r;
 110   /*Stage 3:*/
 111   t[1]=t[0]+t[2];
 112   t[2]=t[0]-t[2];
 113   r=t[0]+t[3];
 114   t[3]=t[0]-t[3];
 115   t[0]=r;
 116   r=t[6]+t[5];
 117   t[5]=t[6]-t[5];
 118   t[6]=r;
 119   /*Stage 4:*/
 120   _y[0<<3]=(ogg_int16_t)(t[0]+t[7]);
 121   _y[1<<3]=(ogg_int16_t)(t[1]+t[6]);
 122   _y[2<<3]=(ogg_int16_t)(t[2]+t[5]);
 123   _y[3<<3]=(ogg_int16_t)(t[3]+t[4]);
 124   _y[4<<3]=(ogg_int16_t)(t[3]-t[4]);
 125   _y[5<<3]=(ogg_int16_t)(t[2]-t[5]);
 126   _y[6<<3]=(ogg_int16_t)(t[1]-t[6]);
 127   _y[7<<3]=(ogg_int16_t)(t[0]-t[7]);
 128 }
 129
 130 /*Performs an inverse 8 point Type-II DCT transform.
 131   The output is scaled by a factor of 2 relative to the orthonormal version of
 132    the transform.
 133   _y: The buffer to store the result in.
 134       Data will be placed in every 8th entry (e.g., in a column of an 8x8
 135        block).
 136   _x: The input coefficients.
 137       Only the first 3 entries are used.
 138       The other 5 are assumed to be 0.*/
 139 static void idct8_3(ogg_int16_t *_y,const ogg_int16_t _x[8]){
 140   ogg_int32_t t[8];
 141   ogg_int32_t r;
 142   /*Stage 1:*/
 143   t[0]=OC_C4S4*_x[0]>>16;
 144   t[2]=OC_C6S2*_x[2]>>16;
 145   t[3]=OC_C2S6*_x[2]>>16;
 146   t[4]=OC_C7S1*_x[1]>>16;
 147   t[7]=OC_C1S7*_x[1]>>16;
 148   /*Stage 2:*/
 149   t[5]=OC_C4S4*t[4]>>16;
 150   t[6]=OC_C4S4*t[7]>>16;
 151   /*Stage 3:*/
 152   t[1]=t[0]+t[2];
 153   t[2]=t[0]-t[2];
 154   r=t[0]+t[3];
 155   t[3]=t[0]-t[3];
 156   t[0]=r;
 157   r=t[6]+t[5];
 158   t[5]=t[6]-t[5];
 159   t[6]=r;
 160   /*Stage 4:*/
 161   _y[0<<3]=(ogg_int16_t)(t[0]+t[7]);
 162   _y[1<<3]=(ogg_int16_t)(t[1]+t[6]);
 163   _y[2<<3]=(ogg_int16_t)(t[2]+t[5]);
 164   _y[3<<3]=(ogg_int16_t)(t[3]+t[4]);
 165   _y[4<<3]=(ogg_int16_t)(t[3]-t[4]);
 166   _y[5<<3]=(ogg_int16_t)(t[2]-t[5]);
 167   _y[6<<3]=(ogg_int16_t)(t[1]-t[6]);
 168   _y[7<<3]=(ogg_int16_t)(t[0]-t[7]);
 169 }
 170
 171 /*Performs an inverse 8 point Type-II DCT transform.
 172   The output is scaled by a factor of 2 relative to the orthonormal version of
 173    the transform.
 174   _y: The buffer to store the result in.
 175       Data will be placed in every 8th entry (e.g., in a column of an 8x8
 176        block).
 177   _x: The input coefficients.
 178       Only the first 2 entries are used.
 179       The other 6 are assumed to be 0.*/
 180 static void idct8_2(ogg_int16_t *_y,const ogg_int16_t _x[8]){
 181   ogg_int32_t t[8];
 182   ogg_int32_t r;
 183   /*Stage 1:*/
 184   t[0]=OC_C4S4*_x[0]>>16;
 185   t[4]=OC_C7S1*_x[1]>>16;
 186   t[7]=OC_C1S7*_x[1]>>16;
 187   /*Stage 2:*/
 188   t[5]=OC_C4S4*t[4]>>16;
 189   t[6]=OC_C4S4*t[7]>>16;
 190   /*Stage 3:*/
 191   r=t[6]+t[5];
 192   t[5]=t[6]-t[5];
 193   t[6]=r;
 194   /*Stage 4:*/
 195   _y[0<<3]=(ogg_int16_t)(t[0]+t[7]);
 196   _y[1<<3]=(ogg_int16_t)(t[0]+t[6]);
 197   _y[2<<3]=(ogg_int16_t)(t[0]+t[5]);
 198   _y[3<<3]=(ogg_int16_t)(t[0]+t[4]);
 199   _y[4<<3]=(ogg_int16_t)(t[0]-t[4]);
 200   _y[5<<3]=(ogg_int16_t)(t[0]-t[5]);
 201   _y[6<<3]=(ogg_int16_t)(t[0]-t[6]);
 202   _y[7<<3]=(ogg_int16_t)(t[0]-t[7]);
 203 }
 204
 205 /*Performs an inverse 8 point Type-II DCT transform.
 206   The output is scaled by a factor of 2 relative to the orthonormal version of
 207    the transform.
 208   _y: The buffer to store the result in.
 209       Data will be placed in every 8th entry (e.g., in a column of an 8x8
 210        block).
 211   _x: The input coefficients.
 212       Only the first entry is used.
 213       The other 7 are assumed to be 0.*/
 214 static void idct8_1(ogg_int16_t *_y,const ogg_int16_t _x[1]){
 215   _y[0<<3]=_y[1<<3]=_y[2<<3]=_y[3<<3]=
 216    _y[4<<3]=_y[5<<3]=_y[6<<3]=_y[7<<3]=(ogg_int16_t)(OC_C4S4*_x[0]>>16);
 217 }
 218
 219 /*Performs an inverse 8x8 Type-II DCT transform.
 220   The input is assumed to be scaled by a factor of 4 relative to orthonormal
 221    version of the transform.
 222   All coefficients but the first 3 in zig-zag scan order are assumed to be 0:
 223    x  x  0  0  0  0  0  0
 224    x  0  0  0  0  0  0  0
 225    0  0  0  0  0  0  0  0
 226    0  0  0  0  0  0  0  0
 227    0  0  0  0  0  0  0  0
 228    0  0  0  0  0  0  0  0
 229    0  0  0  0  0  0  0  0
 230    0  0  0  0  0  0  0  0
 231   _y: The buffer to store the result in.
 232       This may be the same as _x.
 233   _x: The input coefficients.*/
 234 static void oc_idct8x8_3(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
 235   const ogg_int16_t *in;
 236   ogg_int16_t       *end;
 237   ogg_int16_t       *out;
 238   ogg_int16_t        w[64];
 239   /*Transform rows of x into columns of w.*/
 240   idct8_2(w,_x);
 241   idct8_1(w+1,_x+8);
 242   /*Transform rows of w into columns of y.*/
 243   for(in=w,out=_y,end=out+8;out<end;in+=8,out++)idct8_2(out,in);
 244   /*Adjust for the scale factor.*/
 245   for(out=_y,end=out+64;out<end;out++)*out=(ogg_int16_t)(*out+8>>4);
 246 }
 247
 248 /*Performs an inverse 8x8 Type-II DCT transform.
 249   The input is assumed to be scaled by a factor of 4 relative to orthonormal
 250    version of the transform.
 251   All coefficients but the first 10 in zig-zag scan order are assumed to be 0:
 252    x  x  x  x  0  0  0  0
 253    x  x  x  0  0  0  0  0
 254    x  x  0  0  0  0  0  0
 255    x  0  0  0  0  0  0  0
 256    0  0  0  0  0  0  0  0
 257    0  0  0  0  0  0  0  0
 258    0  0  0  0  0  0  0  0
 259    0  0  0  0  0  0  0  0
 260   _y: The buffer to store the result in.
 261       This may be the same as _x.
 262   _x: The input coefficients.*/
 263 static void oc_idct8x8_10(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
 264   const ogg_int16_t *in;
 265   ogg_int16_t       *end;
 266   ogg_int16_t       *out;
 267   ogg_int16_t        w[64];
 268   /*Transform rows of x into columns of w.*/
 269   idct8_4(w,_x);
 270   idct8_3(w+1,_x+8);
 271   idct8_2(w+2,_x+16);
 272   idct8_1(w+3,_x+24);
 273   /*Transform rows of w into columns of y.*/
 274   for(in=w,out=_y,end=out+8;out<end;in+=8,out++)idct8_4(out,in);
 275   /*Adjust for the scale factor.*/
 276   for(out=_y,end=out+64;out<end;out++)*out=(ogg_int16_t)(*out+8>>4);
 277 }
 278
 279 /*Performs an inverse 8x8 Type-II DCT transform.
 280   The input is assumed to be scaled by a factor of 4 relative to orthonormal
 281    version of the transform.
 282   _y: The buffer to store the result in.
 283       This may be the same as _x.
 284   _x: The input coefficients.*/
 285 static void oc_idct8x8_slow(ogg_int16_t _y[64],const ogg_int16_t _x[64]){
 286   const ogg_int16_t *in;
 287   ogg_int16_t       *end;
 288   ogg_int16_t       *out;
 289   ogg_int16_t        w[64];
 290   /*Transform rows of x into columns of w.*/
 291   for(in=_x,out=w,end=out+8;out<end;in+=8,out++)idct8(out,in);
 292   /*Transform rows of w into columns of y.*/
 293   for(in=w,out=_y,end=out+8;out<end;in+=8,out++)idct8(out,in);
 294   /*Adjust for the scale factor.*/
 295   for(out=_y,end=out+64;out<end;out++)*out=(ogg_int16_t)(*out+8>>4);
 296 }
 297
 298 void oc_idct8x8(const oc_theora_state *_state,ogg_int16_t _y[64],
 299  int _last_zzi){
 300   (*_state->opt_vtable.idct8x8)(_y,_last_zzi);
 301 }
 302
 303 /*Performs an inverse 8x8 Type-II DCT transform.
 304   The input is assumed to be scaled by a factor of 4 relative to orthonormal
 305    version of the transform.*/
 306 void oc_idct8x8_c(ogg_int16_t _y[64],int _last_zzi){
 307   /*_last_zzi is subtly different from an actual count of the number of
 308      coefficients we decoded for this block.
 309     It contains the value of zzi BEFORE the final token in the block was
 310      decoded.
 311     In most cases this is an EOB token (the continuation of an EOB run from a
 312      previous block counts), and so this is the same as the coefficient count.
 313     However, in the case that the last token was NOT an EOB token, but filled
 314      the block up with exactly 64 coefficients, _last_zzi will be less than 64.
 315     Provided the last token was not a pure zero run, the minimum value it can
 316      be is 46, and so that doesn't affect any of the cases in this routine.
 317     However, if the last token WAS a pure zero run of length 63, then _last_zzi
 318      will be 1 while the number of coefficients decoded is 64.
 319     Thus, we will trigger the following special case, where the real
 320      coefficient count would not.
 321     Note also that a zero run of length 64 will give _last_zzi a value of 0,
 322      but we still process the DC coefficient, which might have a non-zero value
 323      due to DC prediction.
 324     Although convoluted, this is arguably the correct behavior: it allows us to
 325      use a smaller transform when the block ends with a long zero run instead
 326      of a normal EOB token.
 327     It could be smarter... multiple separate zero runs at the end of a block
 328      will fool it, but an encoder that generates these really deserves what it
 329      gets.
 330     Needless to say we inherited this approach from VP3.*/
 331   /*Then perform the iDCT.*/
 332   if(_last_zzi<3)oc_idct8x8_3(_y,_y);
 333   else if(_last_zzi<10)oc_idct8x8_10(_y,_y);
 334   else oc_idct8x8_slow(_y,_y);
 335 }