2 Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
5 Redistribution and use in source and binary forms,
6 with or without modification, are permitted provided that the
7 following conditions are met:
8 * Redistributions of source code must retain the above copyright
9 notice, this list of conditions and the following disclaimer.
10 * Redistributions in binary form must reproduce the above copyright
11 notice, this list of conditions and the following disclaimer in the
12 documentation and/or other materials provided with the distribution.
13 * Neither the name of the Sony Computer Entertainment Inc nor the names
14 of its contributors may be used to endorse or promote products derived
15 from this software without specific prior written permission.
17 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
18 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
21 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
22 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
23 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
24 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
25 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
26 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
27 POSSIBILITY OF SUCH DAMAGE.
30 #ifndef ___SIMD_MATH_FMODD2_H___
31 #define ___SIMD_MATH_FMODD2_H___
34 #include <spu_intrinsics.h>
36 #include <simdmath/_vec_utils.h>
39 * a vector is returned that contains the remainder of xi/yi,
40 * for corresponding elements of vector double x and vector double y,
42 * if yi is 0, the result is 0
43 * otherwise, the function determines the unique signed integer value i
44 * such that the returned element is xi - i * yi with the same sign as xi and
45 * magnitude less than |yi|
48 static inline vector double
49 _fmodd2(vector double x, vector double y)
// Computes fmod(x, y) independently for each of the two double lanes using
// pure integer arithmetic: unpack sign/exponent/mantissa, normalize, run a
// shift-and-subtract (restoring long division) loop, then repack the result.
//
// NOTE(review): the embedded original line numbers skip values, so this
// listing is missing some lines -- at least the function's opening brace,
// the declarations of x_hi, y_hi, z, shift0 and shift1, and what appear to
// be the preprocessor conditionals that selected between the duplicated
// statement pairs flagged below.  Comments describe only visible statements.
//
// Shuffle patterns (SPU shufb: a selector byte of 192/0xC0 emits 0xFF):
//   swap_words : swaps the two 32-bit words within each 64-bit lane
//   propagate  : produces {low word, 0xFFFFFFFF} per lane -- routes the
//                borrow generated by the low-word subtract into the
//                high-word subtract (see spu_genb/spu_subx uses below)
//   splat_hi   : replicates each lane's high word into both word slots
//   merge      : picks the low (second) doubleword of each source operand
52 vec_uchar16 swap_words = (vec_uchar16){4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11};
53 vec_uchar16 propagate = (vec_uchar16){4,5,6,7, 192,192,192,192, 12,13,14,15, 192,192,192,192};
54 vec_uchar16 splat_hi = (vec_uchar16){0,1,2,3,0,1,2,3, 8,9,10,11, 8,9,10,11};
55 vec_uchar16 merge = (vec_uchar16){8,9,10,11,12,13,14,15, 24,25,26,27,28,29,30,31};
56 vec_int4 n, shift, power;
59 vec_uint4 abs_x, abs_y;
60 vec_uint4 exp_x, exp_y;
61 vec_uint4 zero_x, zero_y;
62 vec_uint4 mant_x, mant_x0, mant_x1, mant_y ;
63 vec_uint4 norm, denorm, norm0, norm1, denorm0, denorm1;
64 vec_uint4 result, result0, resultx, cnt, sign, borrow, mask;
65 vec_uint4 x_7ff, x_inf, x_nan, y_7ff, y_inf, y_nan, is_normal;
66 vec_uint4 x_is_norm, y_is_norm, frac_x, frac_y, cnt_x, cnt_y, mant_x_norm, mant_y_norm;
67 vec_uint4 mant_x_denorm0, mant_x_denorm1, mant_x_denorm;
68 vec_uint4 mant_y_denorm0, mant_y_denorm1, mant_y_denorm;
// Bit-pattern constants for the IEEE-754 double fields, splatted to both lanes.
69 vec_uint4 lsb = (vec_uint4)(spu_splats(0x0000000000000001ULL));
70 vec_uint4 sign_mask = (vec_uint4)(spu_splats(0x8000000000000000ULL));
71 vec_uint4 implied_1 = (vec_uint4)(spu_splats(0x0010000000000000ULL));
72 vec_uint4 mant_mask = (vec_uint4)(spu_splats(0x000FFFFFFFFFFFFFULL));
// The result carries the sign of x (C99 fmod semantics).
74 sign = spu_and((vec_uint4)x, sign_mask);
// Work on magnitudes: all classification and the division are sign-free.
76 abs_x = spu_andc((vec_uint4)x, sign_mask);
77 abs_y = spu_andc((vec_uint4)y, sign_mask);
// Splat each lane's high word so word-wide compares see the sign-cleared
// exponent (and top mantissa bits) in every word slot.
79 x_hi = spu_shuffle(abs_x, abs_x, splat_hi);
80 y_hi = spu_shuffle(abs_y, abs_y, splat_hi);
// Biased 11-bit exponent fields (high word shifted right by 20).
82 exp_x = spu_rlmask(x_hi, -20);
83 exp_y = spu_rlmask(y_hi, -20);
// |y| > |x|  =>  fmod(x, y) == x; flag lanes that must return x unchanged.
86 resultx = __vec_gt64(abs_y, abs_x);
// Exponent == 0x7ff (high word above the largest finite pattern 0x7fefffff)
// means Inf or NaN; split the two cases for the epilogue fixups.
89 x_7ff = spu_cmpgt(x_hi, spu_splats((unsigned int)0x7fefffff));
90 x_inf = __vec_eq64_half(abs_x, ((vec_uint4){0x7ff00000,0x0,0x7ff00000,0x0}));
91 x_nan = spu_andc(x_7ff, x_inf);
93 y_7ff = spu_cmpgt(y_hi, spu_splats((unsigned int)0x7fefffff));
94 y_inf = __vec_eq64_half(abs_y, ((vec_uint4){0x7ff00000,0x0,0x7ff00000,0x0}));
95 y_nan = spu_andc(y_7ff, y_inf);
// +/-0 detection on the magnitudes.
98 zero_x = __vec_eq64_half(abs_x, spu_splats((unsigned int)0x0));
99 zero_y = __vec_eq64_half(abs_y, spu_splats((unsigned int)0x0));
102 /* Determine ilogb of abs_x and abs_y and
103 * extract the mantissas (mant_x, mant_y)
107 // 0 don't care (because (x=0, y!=0)match x<y, (x!=0 && y=0)match y=0, (x==0 && y==0) resultx)
// Nonzero exponent field (high word > 0x000fffff) => normal number.
// (Inf/NaN also satisfy this test; they are patched in the epilogue.)
109 x_is_norm = spu_cmpgt(x_hi, spu_splats((unsigned int)0x000fffff));
110 y_is_norm = spu_cmpgt(y_hi, spu_splats((unsigned int)0x000fffff));
// 52-bit fraction fields.
112 frac_x = spu_and((vec_uint4)x, mant_mask);
113 frac_y = spu_and((vec_uint4)y, mant_mask);
115 //cntlz(use when denorm)
// 64-bit leading-zero count per lane: cntlz per word, add the low word's
// count when the high word is entirely zero (cntlz == 32), splat across the
// lane, then subtract 11 (the bits above the implied-1 position) to get the
// left-shift amount that normalizes a denormal fraction.
116 cnt_x = spu_cntlz(frac_x);
117 cnt_x = spu_add(cnt_x, spu_and(spu_rlqwbyte(cnt_x, 4), spu_cmpeq(cnt_x, 32)));
118 cnt_x = spu_add(spu_shuffle(cnt_x, cnt_x, splat_hi), -11);
120 cnt_y = spu_cntlz(frac_y);
121 cnt_y = spu_add(cnt_y, spu_and(spu_rlqwbyte(cnt_y, 4), spu_cmpeq(cnt_y, 32)));
122 cnt_y = spu_add(spu_shuffle(cnt_y, cnt_y, splat_hi), -11);
// Normal-number mantissas: fraction with the implied leading 1 restored.
// NOTE(review): the second pair of assignments overwrites the first pair
// (dropping the zero_x/zero_y masking); these look like the two branches of
// a #if/#else that is missing from this listing -- confirm against the
// original file before editing.
125 mant_x_norm = spu_andc(spu_sel(implied_1, abs_x, mant_mask), zero_x);
126 mant_y_norm = spu_andc(spu_sel(implied_1, abs_y, mant_mask), zero_y);
129 mant_x_norm = spu_or(implied_1, frac_x);
130 mant_y_norm = spu_or(implied_1, frac_y);
// Denormal mantissas: isolate each lane's 64-bit fraction into a full
// quadword (lane 0 via an 8-byte right shift, lane 1 via a mask), left-shift
// by the per-lane normalization count, then merge the lanes back together.
// (shift0/shift1 are scalar ints; their declaration is not visible here.)
133 shift0 = spu_extract(cnt_x, 0);
134 shift1 = spu_extract(cnt_x, 2);
135 mant_x_denorm0 = spu_rlmaskqwbyte((vec_uint4)frac_x, -8);
136 mant_x_denorm1 = spu_and((vec_uint4)frac_x, ((vec_uint4){0x0,0x0,-1,-1}));
137 mant_x_denorm0 = spu_slqwbytebc(spu_slqw(mant_x_denorm0, shift0), shift0);
138 mant_x_denorm1 = spu_slqwbytebc(spu_slqw(mant_x_denorm1, shift1), shift1);
139 mant_x_denorm = spu_shuffle(mant_x_denorm0, mant_x_denorm1, merge);
141 // vec_int4 shift_y = (vec_int4)spu_sub(cnt_y, spu_splats((unsigned int)11));
142 shift0 = spu_extract(cnt_y, 0);
143 shift1 = spu_extract(cnt_y, 2);
144 mant_y_denorm0 = spu_rlmaskqwbyte((vec_uint4)frac_y, -8);
145 mant_y_denorm1 = spu_and((vec_uint4)frac_y, ((vec_uint4){0x0,0x0,-1,-1}));
147 mant_y_denorm0 = spu_slqwbytebc(spu_slqw(mant_y_denorm0, shift0), shift0);
148 mant_y_denorm1 = spu_slqwbytebc(spu_slqw(mant_y_denorm1, shift1), shift1);
149 mant_y_denorm = spu_shuffle(mant_y_denorm0, mant_y_denorm1, merge);
151 // mant_x, mant_y( norm | denorm )
152 mant_x = spu_sel(mant_x_denorm, mant_x_norm, x_is_norm);
153 mant_y = spu_sel(mant_y_denorm, mant_y_norm, y_is_norm);
// Effective (biased) exponents: the raw exponent field for normals, or
// 1 - cnt for denormals (denormal exponent is 1, reduced by the shift
// that was applied to normalize the mantissa).
157 vec_int4 power_x_norm = (vec_int4)exp_x;
158 vec_int4 power_x_denorm = spu_sub(spu_splats((int)1), (vec_int4)cnt_x);
159 vec_int4 power_x = spu_sel(power_x_denorm, power_x_norm, x_is_norm);
161 vec_int4 power_y_norm = (vec_int4)exp_y;
162 vec_int4 power_y_denorm = spu_sub(spu_splats((int)1), (vec_int4)cnt_y);
163 vec_int4 power_y = spu_sel(power_y_denorm, power_y_norm, y_is_norm);
166 /* Compute fixed point fmod of mant_x and mant_y. Set the flag,
167 * result0, to all ones if we detect that the final result is
// result0 starts as "either input is zero": those lanes are forced to a
// (signed) zero in the epilogue.
170 result0 = spu_or(zero_x, zero_y);
172 // n = spu_sub((vec_int4)logb_x, (vec_int4)logb_y); //zhao--
// n = exponent difference = number of long-division steps still required.
173 n = spu_sub(power_x, power_y);
174 mask = spu_cmpgt(n, 0);
// Restoring long division: iterate while any lane still has n > 0.  Each
// pass computes z = mant_x - mant_y as a 64-bit subtract: spu_genb produces
// per-word borrows, and the propagate shuffle feeds the low word's borrow
// into the high-word position for spu_subx.
176 while (spu_extract(spu_gather(mask), 0)) {
177 borrow = spu_genb(mant_x, mant_y);
178 borrow = spu_shuffle(borrow, borrow, propagate);
179 z = spu_subx(mant_x, mant_y, borrow);
// An exact-zero difference while still iterating => y divides x exactly.
181 result0 = spu_or(spu_and(spu_cmpeq(spu_or(z, spu_shuffle(z, z, swap_words)), 0), mask), result0);
// If z >= 0 (replicated high word non-negative) take 2*z with the low bit
// cleared, else keep 2*mant_x; lanes whose n is exhausted keep mant_x as-is.
// NOTE(review): the spu_sel's final select-mask argument and the decrement
// of n sit on original lines 185-187, which are not visible in this listing.
183 mant_x = spu_sel(mant_x,
184 spu_sel(spu_slqw(mant_x, 1), spu_andc(spu_slqw(z, 1), lsb), spu_cmpgt((vec_int4)spu_shuffle(z, z, splat_hi), -1)),
188 mask = spu_cmpgt(n, 0);
// One final conditional subtract once the exponents are aligned (n == 0).
192 borrow = spu_genb(mant_x, mant_y);
193 borrow = spu_shuffle(borrow, borrow, propagate);
194 z = spu_subx(mant_x, mant_y, borrow);
195 mant_x = spu_sel(mant_x, z, spu_cmpgt((vec_int4)spu_shuffle(z, z, splat_hi), -1));
// Flag lanes whose remainder came out exactly zero.
197 result0 = spu_or(spu_cmpeq(spu_or(mant_x, spu_shuffle(mant_x, mant_x, swap_words)), 0), result0);
201 /* Convert the result back to floating point and restore
202 * the sign. If we flagged the result to be zero (result0),
203 * zero it. If we flagged the result to equal its input x,
204 * (resultx) then return x.
206 * Double precision generates a denorm for an output.
209 // normal = spu_cmpgt((vec_int4)exp_y, 0);//zhao--
// Renormalize the remainder: same three-step 64-bit cntlz idiom as above.
211 cnt = spu_cntlz(mant_x);
212 cnt = spu_add(cnt, spu_and(spu_rlqwbyte(cnt, 4), spu_cmpeq(cnt, 32)));
213 cnt = spu_add(spu_shuffle(cnt, cnt, splat_hi), -11);
// Split the remainder into one quadword per lane for the scalar shifts.
215 mant_x0 = spu_rlmaskqwbyte(mant_x, -8);
216 mant_x1 = spu_and(mant_x,((vec_uint4){0x0,0x0,-1,-1}));
// Result exponent; power <= 0 means the result must be denormal.
218 power =spu_sub(power_y, (vec_int4)cnt);
220 is_normal = spu_cmpgt(power, 0);
// Normal path: shift the remainder left so its leading 1 reaches the
// implied-1 position.
// NOTE(review): the second norm0/norm1 pair overwrites the first (which also
// cleared the implied-1 bit) -- again apparently two halves of a missing
// #if/#else; confirm against the original file.
226 shift0 = spu_extract(cnt, 0);
227 shift1 = spu_extract(cnt, 2);
229 norm0 = spu_slqwbytebc(spu_slqw(spu_andc(mant_x0, implied_1), shift0), shift0);
230 norm1 = spu_slqwbytebc(spu_slqw(spu_andc(mant_x1, implied_1), shift1), shift1);
232 norm0 = spu_slqwbytebc(spu_slqw(mant_x0, shift0), shift0);
233 norm1 = spu_slqwbytebc(spu_slqw(mant_x1, shift1), shift1);
235 norm = spu_shuffle(norm0, norm1, merge);
// Denormal path.
// NOTE(review): two alternative denorm0/denorm1 computations appear here as
// well; the later pair (right-shifting the normalized value by 1 - power)
// overwrites the earlier left-shift-by-(exp_y - 1) pair.
240 shift = spu_add((vec_int4)exp_y, -1);
241 shift0 = spu_extract(shift, 0);
242 shift1 = spu_extract(shift, 2);
243 denorm0 = spu_slqwbytebc(spu_slqw(mant_x0, shift0), shift0);
244 denorm1 = spu_slqwbytebc(spu_slqw(mant_x1, shift1), shift1);
246 shift = spu_add(power, -1);
247 shift0 = spu_extract(shift, 0);
248 shift1 = spu_extract(shift, 2);
250 // printf("result denorm: shift0=%d, shift1=%d\n",shift0, shift1);
252 denorm0 = spu_rlmaskqwbytebc(spu_rlmaskqw(norm0, shift0), 7+shift0);
253 denorm1 = spu_rlmaskqwbytebc(spu_rlmaskqw(norm1, shift1), 7+shift1);
255 denorm = spu_shuffle(denorm0, denorm1, merge);
// Choose the mantissa, rebuild the exponent field (biased exponent rotated
// into bits 30:20 of the high word; zeroed for denormals), then splice the
// sign + mantissa bits into it (mask 0x800FFFFF selects those bit positions
// from the second operand).
258 mant_x = spu_sel(denorm, norm, is_normal);
260 exp_y = (vec_uint4)power;
261 exp_y = spu_and(spu_rl(exp_y, 20), is_normal);
263 result = spu_sel(exp_y, spu_or(sign, mant_x),((vec_uint4){0x800FFFFF, -1, 0x800FFFFF, -1}));
// Epilogue fixups; later selects take precedence over earlier ones:
//   result0    -> keep only each word's top bit (a signed zero, since the
//                 mantissa in those lanes is already zero),
//   resultx    -> |y| > |x|: return x unchanged,
//   y infinite -> return x,
//   x infinite -> return default NaN (0x7ff8000000000000),
//   y == 0     -> return +0 (per the header comment; note this deviates
//                 from C99 fmod, which returns NaN for y == 0),
//   x or y NaN -> propagate that NaN operand.
266 result = spu_sel(spu_andc(result, spu_rlmask(result0, -1)),
267 (vec_uint4)x, resultx);
269 result = spu_sel(result, (vec_uint4)x, y_inf);
271 result = spu_sel(result, ((vec_uint4){0x7ff80000, 0x0, 0x7ff80000, 0x0}), x_inf);
273 result = spu_andc(result, zero_y);
275 //x=NaN or y=NaN => 0
// NOTE(review): the comment above says "=> 0", but the two selects below
// propagate the NaN operand itself, not zero.
276 result = spu_sel(result, (vec_uint4)x, x_nan);
277 result = spu_sel(result, (vec_uint4)y, y_nan);
279 return ((vec_double2)result);