2 Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
5 Redistribution and use in source and binary forms,
6 with or without modification, are permitted provided that the
7 following conditions are met:
8 * Redistributions of source code must retain the above copyright
9 notice, this list of conditions and the following disclaimer.
10 * Redistributions in binary form must reproduce the above copyright
11 notice, this list of conditions and the following disclaimer in the
12 documentation and/or other materials provided with the distribution.
13 * Neither the name of the Sony Computer Entertainment Inc nor the names
14 of its contributors may be used to endorse or promote products derived
15 from this software without specific prior written permission.
17 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 POSSIBILITY OF SUCH DAMAGE.
30 #ifndef ___SIMD_MATH_FMODD2_H___
31 #define ___SIMD_MATH_FMODD2_H___
34 #include <spu_intrinsics.h>
36 #include <simdmath/_vec_utils.h>
39 * a vector is returned that contains the remainder of xi/yi,
40 * for corresponding elements of vector double x and vector double y,
42 * if yi is 0, the result is 0
43 * otherwise, the function determines the unique signed integer value i
44 * such that the returned element is xi - i * yi with the same sign as xi and
45 * magnitude less than |yi|
48 static inline vector double
49 _fmodd2(vector double x, vector double y)
// Computes fmod(x, y) independently for each of the two double lanes using
// pure integer arithmetic: unpack sign/exponent/mantissa, normalize, run a
// shift-and-subtract (restoring long division) loop, then repack the result.
//
// NOTE(review): the embedded original line numbers skip values, so this
// listing is missing some lines -- at least the function's opening brace,
// the declarations of x_hi, y_hi, z, shift0 and shift1, and what appear to
// be the preprocessor conditionals that selected between the duplicated
// statement pairs flagged below.  Comments describe only visible statements.
//
// Shuffle patterns (SPU shufb: a selector byte of 192/0xC0 emits 0xFF):
//   swap_words : swaps the two 32-bit words within each 64-bit lane
//   propagate  : produces {low word, 0xFFFFFFFF} per lane -- routes the
//                borrow generated by the low-word subtract into the
//                high-word subtract (see spu_genb/spu_subx uses below)
//   splat_hi   : replicates each lane's high word into both word slots
//   merge      : picks the low (second) doubleword of each source operand
52 vec_uchar16 swap_words = (vec_uchar16){4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
53 vec_uchar16 propagate = (vec_uchar16){4,5,6,7, 192,192,192,192, 12,13,14,15, 192,192,192,192};
54 vec_uchar16 splat_hi = (vec_uchar16){0,1,2,3,0,1,2,3, 8,9,10,11, 8,9,10,11};
55 vec_uchar16 merge = (vec_uchar16){8,9,10,11,12,13,14,15, 24,25,26,27,28,29,30,31};
56 vec_int4 n, shift, power;
59 vec_uint4 abs_x, abs_y;
60 vec_uint4 exp_x, exp_y;
61 vec_uint4 zero_x, zero_y;
62 vec_uint4 mant_x, mant_x0, mant_x1, mant_y ;
63 vec_uint4 norm, denorm, norm0, norm1, denorm0, denorm1;
64 vec_uint4 result, result0, resultx, cnt, sign, borrow, mask;
65 vec_uint4 x_7ff, x_inf, x_nan, y_7ff, y_inf, y_nan, is_normal;
66 vec_uint4 x_is_norm, y_is_norm, frac_x, frac_y, cnt_x, cnt_y, mant_x_norm, mant_y_norm;
67 vec_uint4 mant_x_denorm0, mant_x_denorm1, mant_x_denorm;
68 vec_uint4 mant_y_denorm0, mant_y_denorm1, mant_y_denorm;
// Bit-pattern constants for the IEEE-754 double fields, splatted to both lanes.
69 vec_uint4 lsb = (vec_uint4)(spu_splats(0x0000000000000001ULL));
70 vec_uint4 sign_mask = (vec_uint4)(spu_splats(0x8000000000000000ULL));
71 vec_uint4 implied_1 = (vec_uint4)(spu_splats(0x0010000000000000ULL));
72 vec_uint4 mant_mask = (vec_uint4)(spu_splats(0x000FFFFFFFFFFFFFULL));
// The result carries the sign of x (C99 fmod semantics).
74 sign = spu_and((vec_uint4)x, sign_mask);
// Work on magnitudes: all classification and the division are sign-free.
76 abs_x = spu_andc((vec_uint4)x, sign_mask);
77 abs_y = spu_andc((vec_uint4)y, sign_mask);
// Splat each lane's high word so word-wide compares see the sign-cleared
// exponent (and top mantissa bits) in every word slot.
79 x_hi = spu_shuffle(abs_x, abs_x, splat_hi);
80 y_hi = spu_shuffle(abs_y, abs_y, splat_hi);
// Biased 11-bit exponent fields (high word shifted right by 20).
82 exp_x = spu_rlmask(x_hi, -20);
83 exp_y = spu_rlmask(y_hi, -20);
// |y| > |x|  =>  fmod(x, y) == x; flag lanes that must return x unchanged.
86 resultx = __vec_gt64(abs_y, abs_x);
// Exponent == 0x7ff (high word above the largest finite pattern 0x7fefffff)
// means Inf or NaN; split the two cases for the epilogue fixups.
89 x_7ff = spu_cmpgt(x_hi, spu_splats((unsigned int)0x7fefffff));
90 x_inf = __vec_eq64_half(abs_x, ((vec_uint4){0x7ff00000,0x0,0x7ff00000,0x0}));
91 x_nan = spu_andc(x_7ff, x_inf);
93 y_7ff = spu_cmpgt(y_hi, spu_splats((unsigned int)0x7fefffff));
94 y_inf = __vec_eq64_half(abs_y, ((vec_uint4){0x7ff00000,0x0,0x7ff00000,0x0}));
95 y_nan = spu_andc(y_7ff, y_inf);
// +/-0 detection on the magnitudes.
98 zero_x = __vec_eq64_half(abs_x, spu_splats((unsigned int)0x0));
99 zero_y = __vec_eq64_half(abs_y, spu_splats((unsigned int)0x0));
102 /* Determine ilogb of abs_x and abs_y and
103 * extract the mantissas (mant_x, mant_y)
107 // 0 don't care (because (x=0, y!=0)match x<y, (x!=0 && y=0)match y=0, (x==0 && y==0) resultx)
// Nonzero exponent field (high word > 0x000fffff) => normal number.
// (Inf/NaN also satisfy this test; they are patched in the epilogue.)
109 x_is_norm = spu_cmpgt(x_hi, spu_splats((unsigned int)0x000fffff));
110 y_is_norm = spu_cmpgt(y_hi, spu_splats((unsigned int)0x000fffff));
// 52-bit fraction fields.
112 frac_x = spu_and((vec_uint4)x, mant_mask);
113 frac_y = spu_and((vec_uint4)y, mant_mask);
115 //cntlz(use when denorm)
// 64-bit leading-zero count per lane: cntlz per word, add the low word's
// count when the high word is entirely zero (cntlz == 32), splat across the
// lane, then subtract 11 (the bits above the implied-1 position) to get the
// left-shift amount that normalizes a denormal fraction.
116 cnt_x = spu_cntlz(frac_x);
117 cnt_x = spu_add(cnt_x, spu_and(spu_rlqwbyte(cnt_x, 4), spu_cmpeq(cnt_x, 32)));
118 cnt_x = spu_add(spu_shuffle(cnt_x, cnt_x, splat_hi), -11);
120 cnt_y = spu_cntlz(frac_y);
121 cnt_y = spu_add(cnt_y, spu_and(spu_rlqwbyte(cnt_y, 4), spu_cmpeq(cnt_y, 32)));
122 cnt_y = spu_add(spu_shuffle(cnt_y, cnt_y, splat_hi), -11);
// Normal-number mantissas: fraction with the implied leading 1 restored.
// NOTE(review): the second pair of assignments overwrites the first pair
// (dropping the zero_x/zero_y masking); these look like the two branches of
// a #if/#else that is missing from this listing -- confirm against the
// original file before editing.
125 mant_x_norm = spu_andc(spu_sel(implied_1, abs_x, mant_mask), zero_x);
126 mant_y_norm = spu_andc(spu_sel(implied_1, abs_y, mant_mask), zero_y);
129 mant_x_norm = spu_or(implied_1, frac_x);
130 mant_y_norm = spu_or(implied_1, frac_y);
// Denormal mantissas: isolate each lane's 64-bit fraction into a full
// quadword (lane 0 via an 8-byte right shift, lane 1 via a mask), left-shift
// by the per-lane normalization count, then merge the lanes back together.
// (shift0/shift1 are scalar ints; their declaration is not visible here.)
133 shift0 = spu_extract(cnt_x, 0);
134 shift1 = spu_extract(cnt_x, 2);
135 mant_x_denorm0 = spu_rlmaskqwbyte((vec_uint4)frac_x, -8);
136 mant_x_denorm1 = spu_and((vec_uint4)frac_x, ((vec_uint4){0x0,0x0,-1,-1}));
137 mant_x_denorm0 = spu_slqwbytebc(spu_slqw(mant_x_denorm0, shift0), shift0);
138 mant_x_denorm1 = spu_slqwbytebc(spu_slqw(mant_x_denorm1, shift1), shift1);
139 mant_x_denorm = spu_shuffle(mant_x_denorm0, mant_x_denorm1, merge);
141 // vec_int4 shift_y = (vec_int4)spu_sub(cnt_y, spu_splats((unsigned int)11));
142 shift0 = spu_extract(cnt_y, 0);
143 shift1 = spu_extract(cnt_y, 2);
144 mant_y_denorm0 = spu_rlmaskqwbyte((vec_uint4)frac_y, -8);
145 mant_y_denorm1 = spu_and((vec_uint4)frac_y, ((vec_uint4){0x0,0x0,-1,-1}));
147 mant_y_denorm0 = spu_slqwbytebc(spu_slqw(mant_y_denorm0, shift0), shift0);
148 mant_y_denorm1 = spu_slqwbytebc(spu_slqw(mant_y_denorm1, shift1), shift1);
149 mant_y_denorm = spu_shuffle(mant_y_denorm0, mant_y_denorm1, merge);
151 // mant_x, mant_y( norm | denorm )
152 mant_x = spu_sel(mant_x_denorm, mant_x_norm, x_is_norm);
153 mant_y = spu_sel(mant_y_denorm, mant_y_norm, y_is_norm);
// Effective (biased) exponents: the raw exponent field for normals, or
// 1 - cnt for denormals (denormal exponent is 1, reduced by the shift
// that was applied to normalize the mantissa).
157 vec_int4 power_x_norm = (vec_int4)exp_x;
158 vec_int4 power_x_denorm = spu_sub(spu_splats((int)1), (vec_int4)cnt_x);
159 vec_int4 power_x = spu_sel(power_x_denorm, power_x_norm, x_is_norm);
161 vec_int4 power_y_norm = (vec_int4)exp_y;
162 vec_int4 power_y_denorm = spu_sub(spu_splats((int)1), (vec_int4)cnt_y);
163 vec_int4 power_y = spu_sel(power_y_denorm, power_y_norm, y_is_norm);
166 /* Compute fixed point fmod of mant_x and mant_y. Set the flag,
167 * result0, to all ones if we detect that the final result is
// result0 starts as "either input is zero": those lanes are forced to a
// (signed) zero in the epilogue.
170 result0 = spu_or(zero_x, zero_y);
172 // n = spu_sub((vec_int4)logb_x, (vec_int4)logb_y); //zhao--
// n = exponent difference = number of long-division steps still required.
173 n = spu_sub(power_x, power_y);
174 mask = spu_cmpgt(n, 0);
// Restoring long division: iterate while any lane still has n > 0.  Each
// pass computes z = mant_x - mant_y as a 64-bit subtract: spu_genb produces
// per-word borrows, and the propagate shuffle feeds the low word's borrow
// into the high-word position for spu_subx.
176 while (spu_extract(spu_gather(mask), 0)) {
177 borrow = spu_genb(mant_x, mant_y);
178 borrow = spu_shuffle(borrow, borrow, propagate);
179 z = spu_subx(mant_x, mant_y, borrow);
// An exact-zero difference while still iterating => y divides x exactly.
181 result0 = spu_or(spu_and(spu_cmpeq(spu_or(z, spu_shuffle(z, z, swap_words)), 0), mask), result0);
// If z >= 0 (replicated high word non-negative) take 2*z with the low bit
// cleared, else keep 2*mant_x; lanes whose n is exhausted keep mant_x as-is.
// NOTE(review): the spu_sel's final select-mask argument and the decrement
// of n sit on original lines 185-187, which are not visible in this listing.
183 mant_x = spu_sel(mant_x,
184 spu_sel(spu_slqw(mant_x, 1), spu_andc(spu_slqw(z, 1), lsb), spu_cmpgt((vec_int4)spu_shuffle(z, z, splat_hi), -1)),
188 mask = spu_cmpgt(n, 0);
// One final conditional subtract once the exponents are aligned (n == 0).
192 borrow = spu_genb(mant_x, mant_y);
193 borrow = spu_shuffle(borrow, borrow, propagate);
194 z = spu_subx(mant_x, mant_y, borrow);
195 mant_x = spu_sel(mant_x, z, spu_cmpgt((vec_int4)spu_shuffle(z, z, splat_hi), -1));
// Flag lanes whose remainder came out exactly zero.
197 result0 = spu_or(spu_cmpeq(spu_or(mant_x, spu_shuffle(mant_x, mant_x, swap_words)), 0), result0);
201 /* Convert the result back to floating point and restore
202 * the sign. If we flagged the result to be zero (result0),
203 * zero it. If we flagged the result to equal its input x,
204 * (resultx) then return x.
206 * Double precision generates a denorm for an output.
209 // normal = spu_cmpgt((vec_int4)exp_y, 0);//zhao--
// Renormalize the remainder: same three-step 64-bit cntlz idiom as above.
211 cnt = spu_cntlz(mant_x);
212 cnt = spu_add(cnt, spu_and(spu_rlqwbyte(cnt, 4), spu_cmpeq(cnt, 32)));
213 cnt = spu_add(spu_shuffle(cnt, cnt, splat_hi), -11);
// Split the remainder into one quadword per lane for the scalar shifts.
215 mant_x0 = spu_rlmaskqwbyte(mant_x, -8);
216 mant_x1 = spu_and(mant_x,((vec_uint4){0x0,0x0,-1,-1}));
// Result exponent; power <= 0 means the result must be denormal.
218 power =spu_sub(power_y, (vec_int4)cnt);
220 is_normal = spu_cmpgt(power, 0);
// Normal path: shift the remainder left so its leading 1 reaches the
// implied-1 position.
// NOTE(review): the second norm0/norm1 pair overwrites the first (which also
// cleared the implied-1 bit) -- again apparently two halves of a missing
// #if/#else; confirm against the original file.
226 shift0 = spu_extract(cnt, 0);
227 shift1 = spu_extract(cnt, 2);
229 norm0 = spu_slqwbytebc(spu_slqw(spu_andc(mant_x0, implied_1), shift0), shift0);
230 norm1 = spu_slqwbytebc(spu_slqw(spu_andc(mant_x1, implied_1), shift1), shift1);
232 norm0 = spu_slqwbytebc(spu_slqw(mant_x0, shift0), shift0);
233 norm1 = spu_slqwbytebc(spu_slqw(mant_x1, shift1), shift1);
235 norm = spu_shuffle(norm0, norm1, merge);
// Denormal path.
// NOTE(review): two alternative denorm0/denorm1 computations appear here as
// well; the later pair (right-shifting the normalized value by 1 - power)
// overwrites the earlier left-shift-by-(exp_y - 1) pair.
240 shift = spu_add((vec_int4)exp_y, -1);
241 shift0 = spu_extract(shift, 0);
242 shift1 = spu_extract(shift, 2);
243 denorm0 = spu_slqwbytebc(spu_slqw(mant_x0, shift0), shift0);
244 denorm1 = spu_slqwbytebc(spu_slqw(mant_x1, shift1), shift1);
246 shift = spu_add(power, -1);
247 shift0 = spu_extract(shift, 0);
248 shift1 = spu_extract(shift, 2);
250 // printf("result denorm: shift0=%d, shift1=%d\n",shift0, shift1);
252 denorm0 = spu_rlmaskqwbytebc(spu_rlmaskqw(norm0, shift0), 7+shift0);
253 denorm1 = spu_rlmaskqwbytebc(spu_rlmaskqw(norm1, shift1), 7+shift1);
255 denorm = spu_shuffle(denorm0, denorm1, merge);
// Choose the mantissa, rebuild the exponent field (biased exponent rotated
// into bits 30:20 of the high word; zeroed for denormals), then splice the
// sign + mantissa bits into it (mask 0x800FFFFF selects those bit positions
// from the second operand).
258 mant_x = spu_sel(denorm, norm, is_normal);
260 exp_y = (vec_uint4)power;
261 exp_y = spu_and(spu_rl(exp_y, 20), is_normal);
263 result = spu_sel(exp_y, spu_or(sign, mant_x),((vec_uint4){0x800FFFFF, -1, 0x800FFFFF, -1}));
// Epilogue fixups; later selects take precedence over earlier ones:
//   result0    -> keep only each word's top bit (a signed zero, since the
//                 mantissa in those lanes is already zero),
//   resultx    -> |y| > |x|: return x unchanged,
//   y infinite -> return x,
//   x infinite -> return default NaN (0x7ff8000000000000),
//   y == 0     -> return +0 (per the header comment; note this deviates
//                 from C99 fmod, which returns NaN for y == 0),
//   x or y NaN -> propagate that NaN operand.
266 result = spu_sel(spu_andc(result, spu_rlmask(result0, -1)),
267 (vec_uint4)x, resultx);
269 result = spu_sel(result, (vec_uint4)x, y_inf);
271 result = spu_sel(result, ((vec_uint4){0x7ff80000, 0x0, 0x7ff80000, 0x0}), x_inf);
273 result = spu_andc(result, zero_y);
275 //x=NaN or y=NaN => 0
// NOTE(review): the comment above says "=> 0", but the two selects below
// propagate the NaN operand itself, not zero.
276 result = spu_sel(result, (vec_uint4)x, x_nan);
277 result = spu_sel(result, (vec_uint4)y, y_nan);
279 return ((vec_double2)result);