lib/mathops.c

   1 #include "mathops.h"
   2 #include <limits.h>
   3
   4 /*The fastest fallback strategy for platforms with fast multiplication appears
   5    to be based on de Bruijn sequences~\cite{LP98}.
   6   Tests confirmed this to be true even on an ARM11, where it is actually faster
   7    than using the native clz instruction.
   8   Define OC_ILOG_NODEBRUIJN to use a simpler fallback on platforms where
   9    multiplication or table lookups are too expensive.
  10
  11   @UNPUBLISHED{LP98,
  12     author="Charles E. Leiserson and Harald Prokop",
  13     title="Using de {Bruijn} Sequences to Index a 1 in a Computer Word",
  14     month=Jun,
  15     year=1998,
  16     note="\url{http://supertech.csail.mit.edu/papers/debruijn.pdf}"
  17   }*/
  18 #if !defined(OC_ILOG_NODEBRUIJN)&& \
  19  !defined(OC_CLZ32)||!defined(OC_CLZ64)&&LONG_MAX<9223372036854775807LL
  20 static const unsigned char OC_DEBRUIJN_IDX32[32]={
  21    0, 1,28, 2,29,14,24, 3,30,22,20,15,25,17, 4, 8,
  22   31,27,13,23,21,19,16, 7,26,12,18, 6,11, 5,10, 9
  23 };
  24 #endif
  25
  26 int oc_ilog32(ogg_uint32_t _v){
  27 #if defined(OC_CLZ32)
  28   return (OC_CLZ32_OFFS-OC_CLZ32(_v))&-!!_v;
  29 #else
  30 /*On a Pentium M, this branchless version tested as the fastest version without
  31    multiplications on 1,000,000,000 random 32-bit integers, edging out a
  32    similar version with branches, and a 256-entry LUT version.*/
  33 # if defined(OC_ILOG_NODEBRUIJN)
  34   int ret;
  35   int m;
  36   ret=_v>0;
  37   m=(_v>0xFFFFU)<<4;
  38   _v>>=m;
  39   ret|=m;
  40   m=(_v>0xFFU)<<3;
  41   _v>>=m;
  42   ret|=m;
  43   m=(_v>0xFU)<<2;
  44   _v>>=m;
  45   ret|=m;
  46   m=(_v>3)<<1;
  47   _v>>=m;
  48   ret|=m;
  49   ret+=_v>1;
  50   return ret;
  51 /*This de Bruijn sequence version is faster if you have a fast multiplier.*/
  52 # else
  53   int ret;
  54   ret=_v>0;
  55   _v|=_v>>1;
  56   _v|=_v>>2;
  57   _v|=_v>>4;
  58   _v|=_v>>8;
  59   _v|=_v>>16;
  60   _v=(_v>>1)+1;
  61   ret+=OC_DEBRUIJN_IDX32[_v*0x77CB531U>>27&0x1F];
  62   return ret;
  63 # endif
  64 #endif
  65 }
  66
  67 int oc_ilog64(ogg_int64_t _v){
  68 #if defined(OC_CLZ64)
  69   return (OC_CLZ64_OFFS-OC_CLZ64(_v))&-!!_v;
  70 #else
  71 # if defined(OC_ILOG_NODEBRUIJN)
  72   ogg_uint32_t v;
  73   int          ret;
  74   int          m;
  75   ret=_v>0;
  76   m=(_v>0xFFFFFFFFU)<<5;
  77   v=(ogg_uint32_t)(_v>>m);
  78   ret|=m;
  79   m=(v>0xFFFFU)<<4;
  80   v>>=m;
  81   ret|=m;
  82   m=(v>0xFFU)<<3;
  83   v>>=m;
  84   ret|=m;
  85   m=(v>0xFU)<<2;
  86   v>>=m;
  87   ret|=m;
  88   m=(v>3)<<1;
  89   v>>=m;
  90   ret|=m;
  91   ret+=v>1;
  92   return ret;
  93 # else
  94 /*If we don't have a 64-bit word, split it into two 32-bit halves.*/
  95 #  if LONG_MAX<9223372036854775807LL
  96   ogg_uint32_t v;
  97   int          ret;
  98   int          m;
  99   ret=_v>0;
 100   m=(_v>0xFFFFFFFFU)<<5;
 101   v=(ogg_uint32_t)(_v>>m);
 102   ret|=m;
 103   v|=v>>1;
 104   v|=v>>2;
 105   v|=v>>4;
 106   v|=v>>8;
 107   v|=v>>16;
 108   v=(v>>1)+1;
 109   ret+=OC_DEBRUIJN_IDX32[v*0x77CB531U>>27&0x1F];
 110   return ret;
 111 /*Otherwise do it in one 64-bit operation.*/
 112 #  else
 113   static const unsigned char OC_DEBRUIJN_IDX64[64]={
 114      0, 1, 2, 7, 3,13, 8,19, 4,25,14,28, 9,34,20,40,
 115      5,17,26,38,15,46,29,48,10,31,35,54,21,50,41,57,
 116     63, 6,12,18,24,27,33,39,16,37,45,47,30,53,49,56,
 117     62,11,23,32,36,44,52,55,61,22,43,51,60,42,59,58
 118   };
 119   int ret;
 120   ret=_v>0;
 121   _v|=_v>>1;
 122   _v|=_v>>2;
 123   _v|=_v>>4;
 124   _v|=_v>>8;
 125   _v|=_v>>16;
 126   _v|=_v>>32;
 127   _v=(_v>>1)+1;
 128   ret+=OC_DEBRUIJN_IDX64[_v*0x218A392CD3D5DBF>>58&0x3F];
 129   return ret;
 130 #  endif
 131 # endif
 132 #endif
 133 }
 134
 135 /*round(2**(62+i)*atanh(2**(-(i+1)))/log(2))*/
 136 static const ogg_int64_t OC_ATANH_LOG2[32]={
 137   0x32B803473F7AD0F4LL,0x2F2A71BD4E25E916LL,0x2E68B244BB93BA06LL,
 138   0x2E39FB9198CE62E4LL,0x2E2E683F68565C8FLL,0x2E2B850BE2077FC1LL,
 139   0x2E2ACC58FE7B78DBLL,0x2E2A9E2DE52FD5F2LL,0x2E2A92A338D53EECLL,
 140   0x2E2A8FC08F5E19B6LL,0x2E2A8F07E51A485ELL,0x2E2A8ED9BA8AF388LL,
 141   0x2E2A8ECE2FE7384ALL,0x2E2A8ECB4D3E4B1ALL,0x2E2A8ECA94940FE8LL,
 142   0x2E2A8ECA6669811DLL,0x2E2A8ECA5ADEDD6ALL,0x2E2A8ECA57FC347ELL,
 143   0x2E2A8ECA57438A43LL,0x2E2A8ECA57155FB4LL,0x2E2A8ECA5709D510LL,
 144   0x2E2A8ECA5706F267LL,0x2E2A8ECA570639BDLL,0x2E2A8ECA57060B92LL,
 145   0x2E2A8ECA57060008LL,0x2E2A8ECA5705FD25LL,0x2E2A8ECA5705FC6CLL,
 146   0x2E2A8ECA5705FC3ELL,0x2E2A8ECA5705FC33LL,0x2E2A8ECA5705FC30LL,
 147   0x2E2A8ECA5705FC2FLL,0x2E2A8ECA5705FC2FLL
 148 };
 149
 150 /*Computes the binary exponential of _z, a log base 2 in Q57 format.*/
 151 ogg_int64_t oc_bexp64(ogg_int64_t _z){
 152   ogg_int64_t w;
 153   ogg_int64_t z;
 154   int         ipart;
 155   ipart=(int)(_z>>57);
 156   if(ipart<0)return 0;
 157   if(ipart>=63)return 0x7FFFFFFFFFFFFFFFLL;
 158   z=_z-OC_Q57(ipart);
 159   if(z){
 160     ogg_int64_t mask;
 161     long        wlo;
 162     int         i;
 163     /*C doesn't give us 64x64->128 muls, so we use CORDIC.
 164       This is not particularly fast, but it's not being used in time-critical
 165        code; it is very accurate.*/
 166     /*z is the fractional part of the log in Q62 format.
 167       We need 1 bit of headroom since the magnitude can get larger than 1
 168        during the iteration, and a sign bit.*/
 169     z<<=5;
 170     /*w is the exponential in Q61 format (since it also needs headroom and can
 171        get as large as 2.0); we could get another bit if we dropped the sign,
 172        but we'll recover that bit later anyway.
 173       Ideally this should start out as
 174         \lim_{n->\infty} 2^{61}/\product_{i=1}^n \sqrt{1-2^{-2i}}
 175        but in order to guarantee convergence we have to repeat iterations 4,
 176         13 (=3*4+1), and 40 (=3*13+1, etc.), so it winds up somewhat larger.*/
 177     w=0x26A3D0E401DD846DLL;
 178     for(i=0;;i++){
 179       mask=-(z<0);
 180       w+=(w>>i+1)+mask^mask;
 181       z-=OC_ATANH_LOG2[i]+mask^mask;
 182       /*Repeat iteration 4.*/
 183       if(i>=3)break;
 184       z<<=1;
 185     }
 186     for(;;i++){
 187       mask=-(z<0);
 188       w+=(w>>i+1)+mask^mask;
 189       z-=OC_ATANH_LOG2[i]+mask^mask;
 190       /*Repeat iteration 13.*/
 191       if(i>=12)break;
 192       z<<=1;
 193     }
 194     for(;i<32;i++){
 195       mask=-(z<0);
 196       w+=(w>>i+1)+mask^mask;
 197       z=z-(OC_ATANH_LOG2[i]+mask^mask)<<1;
 198     }
 199     wlo=0;
 200     /*Skip the remaining iterations unless we really require that much
 201        precision.
 202       We could have bailed out earlier for smaller iparts, but that would
 203        require initializing w from a table, as the limit doesn't converge to
 204        61-bit precision until n=30.*/
 205     if(ipart>30){
 206       /*For these iterations, we just update the low bits, as the high bits
 207          can't possibly be affected.
 208         OC_ATANH_LOG2 has also converged (it actually did so one iteration
 209          earlier, but that's no reason for an extra special case).*/
 210       for(;;i++){
 211         mask=-(z<0);
 212         wlo+=(w>>i)+mask^mask;
 213         z-=OC_ATANH_LOG2[31]+mask^mask;
 214         /*Repeat iteration 40.*/
 215         if(i>=39)break;
 216         z<<=1;
 217       }
 218       for(;i<61;i++){
 219         mask=-(z<0);
 220         wlo+=(w>>i)+mask^mask;
 221         z=z-(OC_ATANH_LOG2[31]+mask^mask)<<1;
 222       }
 223     }
 224     w=(w<<1)+wlo;
 225   }
 226   else w=(ogg_int64_t)1<<62;
 227   if(ipart<62)w=(w>>61-ipart)+1>>1;
 228   return w;
 229 }
 230
 231 /*Computes the binary logarithm of _w, returned in Q57 format.*/
 232 ogg_int64_t oc_blog64(ogg_int64_t _w){
 233   ogg_int64_t z;
 234   int         ipart;
 235   if(_w<=0)return -1;
 236   ipart=OC_ILOGNZ_64(_w)-1;
 237   if(ipart>61)_w>>=ipart-61;
 238   else _w<<=61-ipart;
 239   z=0;
 240   if(_w&_w-1){
 241     ogg_int64_t x;
 242     ogg_int64_t y;
 243     ogg_int64_t u;
 244     ogg_int64_t mask;
 245     int         i;
 246     /*C doesn't give us 64x64->128 muls, so we use CORDIC.
 247       This is not particularly fast, but it's not being used in time-critical
 248        code; it is very accurate.*/
 249     /*z is the fractional part of the log in Q61 format.*/
 250     /*x and y are the cosh() and sinh(), respectively, in Q61 format.
 251       We are computing z=2*atanh(y/x)=2*atanh((_w-1)/(_w+1)).*/
 252     x=_w+((ogg_int64_t)1<<61);
 253     y=_w-((ogg_int64_t)1<<61);
 254     for(i=0;i<4;i++){
 255       mask=-(y<0);
 256       z+=(OC_ATANH_LOG2[i]>>i)+mask^mask;
 257       u=x>>i+1;
 258       x-=(y>>i+1)+mask^mask;
 259       y-=u+mask^mask;
 260     }
 261     /*Repeat iteration 4.*/
 262     for(i--;i<13;i++){
 263       mask=-(y<0);
 264       z+=(OC_ATANH_LOG2[i]>>i)+mask^mask;
 265       u=x>>i+1;
 266       x-=(y>>i+1)+mask^mask;
 267       y-=u+mask^mask;
 268     }
 269     /*Repeat iteration 13.*/
 270     for(i--;i<32;i++){
 271       mask=-(y<0);
 272       z+=(OC_ATANH_LOG2[i]>>i)+mask^mask;
 273       u=x>>i+1;
 274       x-=(y>>i+1)+mask^mask;
 275       y-=u+mask^mask;
 276     }
 277     /*OC_ATANH_LOG2 has converged.*/
 278     for(;i<40;i++){
 279       mask=-(y<0);
 280       z+=(OC_ATANH_LOG2[31]>>i)+mask^mask;
 281       u=x>>i+1;
 282       x-=(y>>i+1)+mask^mask;
 283       y-=u+mask^mask;
 284     }
 285     /*Repeat iteration 40.*/
 286     for(i--;i<62;i++){
 287       mask=-(y<0);
 288       z+=(OC_ATANH_LOG2[31]>>i)+mask^mask;
 289       u=x>>i+1;
 290       x-=(y>>i+1)+mask^mask;
 291       y-=u+mask^mask;
 292     }
 293     z=z+8>>4;
 294   }
 295   return OC_Q57(ipart)+z;
 296 }