1 /* ldexpd2 - Multiply Double by 2 Raised to its Power
2 For large elements of ex (overflow), returns signed DBL_MAX.
3 For small elements of ex (underflow), returns 0.
4 Copyright (C) 2006, 2007 Sony Computer Entertainment Inc.
7 Redistribution and use in source and binary forms,
8 with or without modification, are permitted provided that the
9 following conditions are met:
10 * Redistributions of source code must retain the above copyright
11 notice, this list of conditions and the following disclaimer.
12 * Redistributions in binary form must reproduce the above copyright
13 notice, this list of conditions and the following disclaimer in the
14 documentation and/or other materials provided with the distribution.
15 * Neither the name of the Sony Computer Entertainment Inc nor the names
16 of its contributors may be used to endorse or promote products derived
17 from this software without specific prior written permission.
19 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
23 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25 SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26 INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27 CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28 ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29 POSSIBILITY OF SUCH DAMAGE.
32 #ifndef ___SIMD_MATH_LDEXPD2_H___
33 #define ___SIMD_MATH_LDEXPD2_H___
36 #include <spu_intrinsics.h>
38 static inline vector double
39 _ldexpd2(vector double x, vector signed long long ex)
42 vec_int4 min = spu_splats(-2099);
43 // vec_int4 min = spu_splats(-2044);
44 vec_int4 max = spu_splats( 2098);
45 // vec_int4 max = spu_splats( 2046);
46 vec_uint4 cmp_min, cmp_max;
47 vec_uint4 shift = ((vec_uint4){20, 32, 20, 32});
53 // check input data range
54 vec_int4 exp0 = spu_shuffle( (vec_int4)ex, (vec_int4)ex, ((vec_uchar16){4,5,6,7, 4,5,6,7, 12,13,14,15, 12,13,14,15}));
55 vec_int4 dmy = spu_shuffle( (vec_int4)spu_splats(0x10000), (vec_int4)ex, ((vec_uchar16){16,1,2,3, 16,1,2,3, 24,1,2,3,24,1,2,3}));
56 // (-)0xFFFFFFFF80000000 or (+)0x000000007FFFFFFF
57 vec_int4 msk_range = ((vec_int4){0,0x80000000, 0,0x80000000});
58 vec_int4 inrange = spu_addx( (vec_int4)ex, msk_range, spu_rlqwbyte(spu_genc((vec_int4)ex, msk_range), 4));
59 inrange = (vec_int4)spu_cmpeq( inrange, 0 );
60 inrange = spu_shuffle(inrange,inrange,((vec_uchar16){0,1,2,3,0,1,2,3,8,9,10,11,8,9,10,11}));
62 // select dummy over ranged data or input data
63 vec_int4 exp = spu_sel( dmy, exp0, (vec_uint4)inrange);
65 /* Clamp the specified exponent to the range -2044 to 2046.
67 cmp_min = spu_cmpgt(exp, min);
68 cmp_max = spu_cmpgt(exp, max);
69 exp = spu_sel(min, exp, cmp_min);
70 exp = spu_sel(exp, max, cmp_max);
72 /* Generate the factors f1 = 2^e1 and f2 = 2^e2
74 e1 = spu_rlmaska(exp, -1);
75 e2 = spu_sub(exp, e1);
77 f1 = (vec_double2)spu_sl(spu_add(e1, 1023), shift);
79 vec_double2 otmp = spu_mul(x, f1);
80 vec_uint4 fpscr1 = spu_mffpscr();
82 f2 = (vec_double2)spu_sl(spu_add(e2, 1023), shift);
84 out = spu_mul(otmp, f2);
85 vec_uint4 fpscr2 = spu_mffpscr();
87 /* Compute the product x * 2^e1 * 2^e2
89 // out = spu_mul(spu_mul(x, f1), f2);
91 // check floating point register DENORM bit
92 vec_uint4 fpscr0, fpscr;
93 fpscr0 = spu_or(fpscr1, fpscr2);
94 fpscr = spu_shuffle(fpscr0, fpscr0, ((vec_uchar16){0x80,0x80,0x80,0x80,0x80,0x80,10,0x80,0x80,0x80,6,0x80,0x80,0x80,0x80,0x80}));
95 fpscr = spu_or(fpscr0, fpscr);
96 if ( __builtin_expect(spu_extract(fpscr, 1) == 0, 1) ) return out;
99 //////////////////////
100 // Denormalized calc//
101 //////////////////////
103 vec_uchar16 splat_msb = { 0,0,0,0,0,0,0,0, 8,8,8,8,8,8,8,8};
104 vec_uint4 signmask = ((vec_uint4){0x80000000,0,0x80000000,0});
105 vec_int4 zeros = spu_splats(0);
106 vec_uchar16 msk_64_eq = ((vec_uchar16){4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11});
108 //check input was zero
109 vec_uint4 x_body = spu_and( (vec_uint4)x, ((vec_uint4){0x7FFFFFFF,-1,0x7FFFFFFF,-1}));
110 vec_uint4 x_zero = spu_cmpeq( x_body, (vec_uint4)zeros );
111 x_zero = spu_and( x_zero, spu_shuffle(x_zero,x_zero,msk_64_eq));
113 // check Denormalized input
114 vec_int4 cnt_zero = (vec_int4)spu_cntlz(x_body);
115 vec_uint4 is_den = (vec_uint4)spu_cmpgt(cnt_zero, 11); // Denormalized data 000XXXXX XXXXXXXX
116 is_den = spu_shuffle( is_den, is_den, splat_msb);
117 is_den = spu_sel(is_den, (vec_uint4)zeros, x_zero); // exclude zero from denormalized
119 // count 0bits for 64bit
120 vec_uint4 cnt_ex = (vec_uint4)spu_cmpgt(cnt_zero, 31); // Denormalized data 00000000 XXXXXXXX
121 vec_int4 cnt_z = spu_shuffle( cnt_zero, cnt_zero, ((vec_uchar16){4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11}));
122 cnt_zero = spu_add(cnt_zero, spu_sel(zeros, cnt_z, cnt_ex));
123 cnt_zero = spu_shuffle(cnt_zero, cnt_zero, ((vec_uchar16){0,1,2,3,0,1,2,3,8,9,10,11,8,9,10,11}));
125 // extract each 64bit data
126 x_body = spu_and( (vec_uint4)x, ((vec_uint4){0x000FFFFF,-1,0x000FFFFF,-1}));
127 vec_uint4 mant0 = spu_shuffle(x_body, x_body, ((vec_uchar16){0,1, 2, 3, 4, 5, 6, 7,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}));
128 vec_uint4 mant1 = spu_shuffle(x_body, x_body, ((vec_uchar16){8,9,10,11,12,13,14,15,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}));
129 vec_uint4 sign = (vec_uint4)spu_rlmaska((vec_int4)exp_in, -31);
130 sign = spu_shuffle(sign, sign, splat_msb);
132 // set max shift count
133 vec_int4 sht = spu_add( cnt_zero, ((vec_int4){-11,-64,-11,-64}));
135 // denorm & exp+ shift left
136 vec_uint4 cmp = spu_cmpgt( sht, exp_in);
137 vec_int4 sht_l = spu_sel(sht, exp_in, cmp);
138 int shtl0 = spu_extract(sht_l, 0);
139 int shtl1 = spu_extract(sht_l, 2);
140 vec_uint4 mant0l = spu_slqwbytebc( spu_slqw(mant0, shtl0), shtl0 );
141 vec_uint4 mant1l = spu_slqwbytebc( spu_slqw(mant1, shtl1), shtl1 );
142 vec_int4 expp = spu_shuffle(spu_sub(exp_in, sht_l), zeros, ((vec_uchar16){0,1,2,3,0,1,2,3,8,9,10,11,8,9,10,11}));
144 exp0 = spu_sel( expp, exp_in, sign ); // select plus or minus caluc
145 vec_uint4 mantl = spu_shuffle( mant0l, mant1l, ((vec_uchar16){0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23}));
146 vec_uint4 mant = spu_sel( mantl, (vec_uint4)x, sign);
147 exp = spu_sel( exp_in, exp0, is_den ); // select denormalized
148 x = (vec_double2)spu_sel( (vec_uint4)x, mant, is_den);
151 //////////////////////////////////////////////////////////////////////////
153 vec_int4 expmask = ((vec_int4){0x7FF00000, 0, 0x7FF00000, 0});
154 e1 = spu_and((vec_int4)x, expmask);
155 e2 = spu_rlmask(e1,-20);
157 vec_uchar16 maxmask = (vec_uchar16)spu_cmpgt(exp, 2046);
158 vec_uchar16 minmask = (vec_uchar16)spu_cmpgt(spu_splats(-2044), exp);
159 minmask = spu_or (minmask, (vec_uchar16)x_zero);
161 vec_int4 esum = spu_add(e2, exp);
163 maxmask = spu_or (maxmask, (vec_uchar16)spu_cmpgt(esum, 2046));
164 maxmask = spu_shuffle(maxmask, maxmask, splat_msb);
165 // maxmask = spu_and(maxmask, ((vec_uchar16)spu_splats((long long)0x7FFFFFFFFFFFFFFFLL)));
166 minmask = spu_or (minmask, (vec_uchar16)spu_cmpgt(zeros, esum));
167 minmask = spu_shuffle(minmask, minmask, splat_msb);
170 vec_uint4 mxmask = spu_and(spu_cmpgt(e2, 0), ((vec_uint4){0x00100000,0,0x00100000,0})); // not denorm
171 vec_int4 esum2 = spu_sub(esum, (vec_int4)spu_rlmask(mxmask, -20)); // reverse to norm
172 vec_uint4 mrange = spu_and(spu_cmpgt(zeros, esum2), spu_cmpgt(esum2, -55)); // denorm range
173 mrange = spu_shuffle(mrange, mrange, splat_msb);
175 vec_int4 sht_r = spu_sel(spu_splats(-54), esum2, spu_cmpgt(esum2, spu_splats(-54)) );
176 vec_int4 sht_rh = spu_add( sht_r, ((vec_int4){7,7,7,7}));
178 x_body = spu_or( x_body, mxmask );
179 mant0 = spu_shuffle(x_body, x_body, ((vec_uchar16){0,1, 2, 3, 4, 5, 6, 7,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}));
180 mant1 = spu_shuffle(x_body, x_body, ((vec_uchar16){8,9,10,11,12,13,14,15,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}));
181 vec_uint4 mant0r = spu_rlmaskqwbytebc( spu_rlmaskqw(mant0, spu_extract(sht_r, 0)), spu_extract(sht_rh,0) );
182 vec_uint4 mant1r = spu_rlmaskqwbytebc( spu_rlmaskqw(mant1, spu_extract(sht_r, 2)), spu_extract(sht_rh,2) );
185 // check current round mode
186 fpscr = spu_shuffle(fpscr2, fpscr2, ((vec_uchar16){0x80,0x80,0x80,0x80,0,1,2,3,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}));
187 fpscr0 = spu_and(fpscr, ((vec_uint4){0,0xc00,0,0}));
188 fpscr1 = spu_and(fpscr, ((vec_uint4){0,0x300,0,0}));
190 // prepare round data
191 vec_uint4 rnd0 = spu_slqwbytebc( spu_slqw( mant0r, 31), 31);
192 vec_uint4 rnd1 = spu_slqwbytebc( spu_slqw( mant1r, 31), 31);
193 vec_uint4 rnd0w = (vec_uint4)spu_cntb( (vec_uchar16)rnd0 );
194 vec_uint4 rnd1w = (vec_uint4)spu_cntb( (vec_uchar16)rnd1 );
195 rnd0w = spu_or( spu_slqwbyte(rnd0w,4), spu_slqwbyte(rnd0w,8));
196 rnd1w = spu_or( spu_slqwbyte(rnd1w,4), spu_slqwbyte(rnd1w,8));
197 rnd0 = spu_or( rnd0, rnd0w);
198 rnd1 = spu_or( rnd1, rnd1w);
202 vec_uint4 hit0 = spu_cmpeq(rnd0, ((vec_uint4){0,0xc0000000,0,0})); //odd + round out
203 vec_uint4 hit1 = spu_cmpeq(rnd1, ((vec_uint4){0,0xc0000000,0,0})); //odd + round out
204 vec_uint4 add0 = spu_sel((vec_uint4)zeros, ((vec_uint4){0,1,0,0}), hit0);
205 vec_uint4 add1 = spu_sel((vec_uint4)zeros, ((vec_uint4){0,1,0,0}), hit1);
206 // check greater than half
207 rnd0 = spu_and( rnd0, ((vec_uint4){0,0x7FFFFFFF,0,0}));
208 rnd1 = spu_and( rnd1, ((vec_uint4){0,0x7FFFFFFF,0,0}));
209 hit0 = spu_cmpgt(rnd0, ((vec_uint4){0,0x40000000,0,0}));
210 hit1 = spu_cmpgt(rnd1, ((vec_uint4){0,0x40000000,0,0}));
211 add0 = spu_sel(add0, ((vec_uint4){0,1,0,0}), hit0);
212 add1 = spu_sel(add1, ((vec_uint4){0,1,0,0}), hit1);
214 add0 = spu_sel((vec_uint4)zeros, add0, spu_cmpeq(fpscr0, (vec_uint4)zeros));
215 add1 = spu_sel((vec_uint4)zeros, add1, spu_cmpeq(fpscr1, (vec_uint4)zeros));
217 // toward zero do nothing
219 sign = spu_rlmaska((vec_uint4)in, -31);
220 vec_uint4 sign0 = spu_shuffle(sign, sign, ((vec_uchar16){0x80,0x80,0x80,0x80,0,0,0,0,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}));
221 vec_uint4 sign1 = spu_shuffle(sign, sign, ((vec_uchar16){0x80,0x80,0x80,0x80,8,8,8,8,0x80,0x80,0x80,0x80,0x80,0x80,0x80,0x80}));
222 vec_uint4 hit0w = spu_cmpgt(rnd0, ((vec_uint4){0,0,0,0}));
223 vec_uint4 hit1w = spu_cmpgt(rnd1, ((vec_uint4){0,0,0,0}));
225 hit0 = spu_andc(hit0w, sign0);
226 hit1 = spu_andc(hit1w, sign1);
227 hit0 = spu_and(hit0, spu_cmpeq(fpscr0, ((vec_uint4){0,0x800,0,0})));
228 hit1 = spu_and(hit1, spu_cmpeq(fpscr1, ((vec_uint4){0,0x200,0,0})));
230 add0 = spu_sel(add0, ((vec_uint4){0,1,0,0}), hit0);
231 add1 = spu_sel(add1, ((vec_uint4){0,1,0,0}), hit1);
234 hit0 = spu_and(hit0w, sign0);
235 hit1 = spu_and(hit1w, sign1);
236 hit0 = spu_and(hit0, spu_cmpeq(fpscr0, ((vec_uint4){0,0xc00,0,0})));
237 hit1 = spu_and(hit1, spu_cmpeq(fpscr1, ((vec_uint4){0,0x300,0,0})));
239 add0 = spu_sel(add0, ((vec_uint4){0,1,0,0}), hit0);
240 add1 = spu_sel(add1, ((vec_uint4){0,1,0,0}), hit1);
243 mant0r = spu_addx(mant0r, add0, spu_rlqwbyte(spu_genc(mant0r, add0), 4));
244 mant1r = spu_addx(mant1r, add1, spu_rlqwbyte(spu_genc(mant1r, add1), 4));
246 #endif // LDEXPD2_ROUND
248 vec_uint4 mantr = spu_shuffle( mant0r, mant1r, ((vec_uchar16){0,1,2,3,4,5,6,7,16,17,18,19,20,21,22,23}));
250 // select right answer
251 x = spu_sel(x, (vec_double2)spu_sl(esum,20), (vec_ullong2)expmask);
252 x = spu_sel(x, (vec_double2)zeros, (vec_ullong2)minmask);
253 x = spu_sel(x, (vec_double2)spu_splats((long long)0x7FEFFFFFFFFFFFFFLL), (vec_ullong2)maxmask);
255 out = (vec_double2)spu_sel((vec_uint4)x , mantr, mrange);
257 // check Infinity,NaN
258 vec_uint4 is_inf = spu_cmpeq(e1, expmask);
259 is_inf = spu_and( is_inf, spu_shuffle(is_inf,is_inf,msk_64_eq));
260 out = (vec_double2)spu_sel((vec_uint4)out , (vec_uint4)in, is_inf);
262 out = spu_sel(out, in, (vec_ullong2)signmask);