Update.
[platform/upstream/glibc.git] / soft-fp / op-2.h
1 /* Software floating-point emulation.
2    Basic two-word fraction declaration and manipulation.
3    Copyright (C) 1997,1998,1999 Free Software Foundation, Inc.
4    This file is part of the GNU C Library.
5    Contributed by Richard Henderson (rth@cygnus.com),
6                   Jakub Jelinek (jj@ultra.linux.cz),
7                   David S. Miller (davem@redhat.com) and
8                   Peter Maydell (pmaydell@chiark.greenend.org.uk).
9
10    The GNU C Library is free software; you can redistribute it and/or
11    modify it under the terms of the GNU Library General Public License as
12    published by the Free Software Foundation; either version 2 of the
13    License, or (at your option) any later version.
14
15    The GNU C Library is distributed in the hope that it will be useful,
16    but WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18    Library General Public License for more details.
19
20    You should have received a copy of the GNU Library General Public
21    License along with the GNU C Library; see the file COPYING.LIB.  If
22    not, write to the Free Software Foundation, Inc.,
23    59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.  */
24
25 #define _FP_FRAC_DECL_2(X)      _FP_W_TYPE X##_f0, X##_f1
26 #define _FP_FRAC_COPY_2(D,S)    (D##_f0 = S##_f0, D##_f1 = S##_f1)
27 #define _FP_FRAC_SET_2(X,I)     __FP_FRAC_SET_2(X, I)
28 #define _FP_FRAC_HIGH_2(X)      (X##_f1)
29 #define _FP_FRAC_LOW_2(X)       (X##_f0)
30 #define _FP_FRAC_WORD_2(X,w)    (X##_f##w)
31
32 #define _FP_FRAC_SLL_2(X,N)                                             \
33   do {                                                                  \
34     if ((N) < _FP_W_TYPE_SIZE)                                          \
35       {                                                                 \
36         if (__builtin_constant_p(N) && (N) == 1)                        \
37           {                                                             \
38             X##_f1 = X##_f1 + X##_f1 + (((_FP_WS_TYPE)(X##_f0)) < 0);   \
39             X##_f0 += X##_f0;                                           \
40           }                                                             \
41         else                                                            \
42           {                                                             \
43             X##_f1 = X##_f1 << (N) | X##_f0 >> (_FP_W_TYPE_SIZE - (N)); \
44             X##_f0 <<= (N);                                             \
45           }                                                             \
46       }                                                                 \
47     else                                                                \
48       {                                                                 \
49         X##_f1 = X##_f0 << ((N) - _FP_W_TYPE_SIZE);                     \
50         X##_f0 = 0;                                                     \
51       }                                                                 \
52   } while (0)
53
54 #define _FP_FRAC_SRL_2(X,N)                                             \
55   do {                                                                  \
56     if ((N) < _FP_W_TYPE_SIZE)                                          \
57       {                                                                 \
58         X##_f0 = X##_f0 >> (N) | X##_f1 << (_FP_W_TYPE_SIZE - (N));     \
59         X##_f1 >>= (N);                                                 \
60       }                                                                 \
61     else                                                                \
62       {                                                                 \
63         X##_f0 = X##_f1 >> ((N) - _FP_W_TYPE_SIZE);                     \
64         X##_f1 = 0;                                                     \
65       }                                                                 \
66   } while (0)
67
68 /* Right shift with sticky-lsb.  */
69 #define _FP_FRAC_SRS_2(X,N,sz)                                          \
70   do {                                                                  \
71     if ((N) < _FP_W_TYPE_SIZE)                                          \
72       {                                                                 \
73         X##_f0 = (X##_f1 << (_FP_W_TYPE_SIZE - (N)) | X##_f0 >> (N) |   \
74                   (__builtin_constant_p(N) && (N) == 1                  \
75                    ? X##_f0 & 1                                         \
76                    : (X##_f0 << (_FP_W_TYPE_SIZE - (N))) != 0));        \
77         X##_f1 >>= (N);                                                 \
78       }                                                                 \
79     else                                                                \
80       {                                                                 \
81         X##_f0 = (X##_f1 >> ((N) - _FP_W_TYPE_SIZE) |                   \
82                   (((X##_f1 << (sz - (N))) | X##_f0) != 0));            \
83         X##_f1 = 0;                                                     \
84       }                                                                 \
85   } while (0)
86
87 #define _FP_FRAC_ADDI_2(X,I)    \
88   __FP_FRAC_ADDI_2(X##_f1, X##_f0, I)
89
90 #define _FP_FRAC_ADD_2(R,X,Y)   \
91   __FP_FRAC_ADD_2(R##_f1, R##_f0, X##_f1, X##_f0, Y##_f1, Y##_f0)
92
93 #define _FP_FRAC_SUB_2(R,X,Y)   \
94   __FP_FRAC_SUB_2(R##_f1, R##_f0, X##_f1, X##_f0, Y##_f1, Y##_f0)
95
96 #define _FP_FRAC_DEC_2(X,Y)     \
97   __FP_FRAC_DEC_2(X##_f1, X##_f0, Y##_f1, Y##_f0)
98
99 #define _FP_FRAC_CLZ_2(R,X)     \
100   do {                          \
101     if (X##_f1)                 \
102       __FP_CLZ(R,X##_f1);       \
103     else                        \
104     {                           \
105       __FP_CLZ(R,X##_f0);       \
106       R += _FP_W_TYPE_SIZE;     \
107     }                           \
108   } while(0)
109
110 /* Predicates */
111 #define _FP_FRAC_NEGP_2(X)      ((_FP_WS_TYPE)X##_f1 < 0)
112 #define _FP_FRAC_ZEROP_2(X)     ((X##_f1 | X##_f0) == 0)
113 #define _FP_FRAC_OVERP_2(fs,X)  (_FP_FRAC_HIGH_##fs(X) & _FP_OVERFLOW_##fs)
114 #define _FP_FRAC_EQ_2(X, Y)     (X##_f1 == Y##_f1 && X##_f0 == Y##_f0)
115 #define _FP_FRAC_GT_2(X, Y)     \
116   (X##_f1 > Y##_f1 || X##_f1 == Y##_f1 && X##_f0 > Y##_f0)
117 #define _FP_FRAC_GE_2(X, Y)     \
118   (X##_f1 > Y##_f1 || X##_f1 == Y##_f1 && X##_f0 >= Y##_f0)
119
120 #define _FP_ZEROFRAC_2          0, 0
121 #define _FP_MINFRAC_2           0, 1
122 #define _FP_MAXFRAC_2           (~(_FP_WS_TYPE)0), (~(_FP_WS_TYPE)0)
123
124 /*
125  * Internals 
126  */
127
128 #define __FP_FRAC_SET_2(X,I1,I0)        (X##_f0 = I0, X##_f1 = I1)
129
130 #define __FP_CLZ_2(R, xh, xl)   \
131   do {                          \
132     if (xh)                     \
133       __FP_CLZ(R,xh);           \
134     else                        \
135     {                           \
136       __FP_CLZ(R,xl);           \
137       R += _FP_W_TYPE_SIZE;     \
138     }                           \
139   } while(0)
140
141 #if 0
142
143 #ifndef __FP_FRAC_ADDI_2
144 #define __FP_FRAC_ADDI_2(xh, xl, i)     \
145   (xh += ((xl += i) < i))
146 #endif
147 #ifndef __FP_FRAC_ADD_2
148 #define __FP_FRAC_ADD_2(rh, rl, xh, xl, yh, yl) \
149   (rh = xh + yh + ((rl = xl + yl) < xl))
150 #endif
151 #ifndef __FP_FRAC_SUB_2
152 #define __FP_FRAC_SUB_2(rh, rl, xh, xl, yh, yl) \
153   (rh = xh - yh - ((rl = xl - yl) > xl))
154 #endif
155 #ifndef __FP_FRAC_DEC_2
156 #define __FP_FRAC_DEC_2(xh, xl, yh, yl) \
157   do {                                  \
158     UWtype _t = xl;                     \
159     xh -= yh + ((xl -= yl) > _t);       \
160   } while (0)
161 #endif
162
163 #else
164
165 #undef __FP_FRAC_ADDI_2
166 #define __FP_FRAC_ADDI_2(xh, xl, i)     add_ssaaaa(xh, xl, xh, xl, 0, i)
167 #undef __FP_FRAC_ADD_2
168 #define __FP_FRAC_ADD_2                 add_ssaaaa
169 #undef __FP_FRAC_SUB_2
170 #define __FP_FRAC_SUB_2                 sub_ddmmss
171 #undef __FP_FRAC_DEC_2
172 #define __FP_FRAC_DEC_2(xh, xl, yh, yl) sub_ddmmss(xh, xl, xh, xl, yh, yl)
173
174 #endif
175
176 /*
177  * Unpack the raw bits of a native fp value.  Do not classify or
178  * normalize the data.
179  */
180
181 #define _FP_UNPACK_RAW_2(fs, X, val)                    \
182   do {                                                  \
183     union _FP_UNION_##fs _flo; _flo.flt = (val);        \
184                                                         \
185     X##_f0 = _flo.bits.frac0;                           \
186     X##_f1 = _flo.bits.frac1;                           \
187     X##_e  = _flo.bits.exp;                             \
188     X##_s  = _flo.bits.sign;                            \
189   } while (0)
190
191 #define _FP_UNPACK_RAW_2_P(fs, X, val)                  \
192   do {                                                  \
193     union _FP_UNION_##fs *_flo =                        \
194       (union _FP_UNION_##fs *)(val);                    \
195                                                         \
196     X##_f0 = _flo->bits.frac0;                          \
197     X##_f1 = _flo->bits.frac1;                          \
198     X##_e  = _flo->bits.exp;                            \
199     X##_s  = _flo->bits.sign;                           \
200   } while (0)
201
202
203 /*
204  * Repack the raw bits of a native fp value.
205  */
206
207 #define _FP_PACK_RAW_2(fs, val, X)                      \
208   do {                                                  \
209     union _FP_UNION_##fs _flo;                          \
210                                                         \
211     _flo.bits.frac0 = X##_f0;                           \
212     _flo.bits.frac1 = X##_f1;                           \
213     _flo.bits.exp   = X##_e;                            \
214     _flo.bits.sign  = X##_s;                            \
215                                                         \
216     (val) = _flo.flt;                                   \
217   } while (0)
218
219 #define _FP_PACK_RAW_2_P(fs, val, X)                    \
220   do {                                                  \
221     union _FP_UNION_##fs *_flo =                        \
222       (union _FP_UNION_##fs *)(val);                    \
223                                                         \
224     _flo->bits.frac0 = X##_f0;                          \
225     _flo->bits.frac1 = X##_f1;                          \
226     _flo->bits.exp   = X##_e;                           \
227     _flo->bits.sign  = X##_s;                           \
228   } while (0)
229
230
231 /*
232  * Multiplication algorithms:
233  */
234
235 /* Given a 1W * 1W => 2W primitive, do the extended multiplication.  */
236
237 #define _FP_MUL_MEAT_2_wide(wfracbits, R, X, Y, doit)                   \
238   do {                                                                  \
239     _FP_FRAC_DECL_4(_z); _FP_FRAC_DECL_2(_b); _FP_FRAC_DECL_2(_c);      \
240                                                                         \
241     doit(_FP_FRAC_WORD_4(_z,1), _FP_FRAC_WORD_4(_z,0), X##_f0, Y##_f0); \
242     doit(_b_f1, _b_f0, X##_f0, Y##_f1);                                 \
243     doit(_c_f1, _c_f0, X##_f1, Y##_f0);                                 \
244     doit(_FP_FRAC_WORD_4(_z,3), _FP_FRAC_WORD_4(_z,2), X##_f1, Y##_f1); \
245                                                                         \
246     __FP_FRAC_ADD_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),        \
247                     _FP_FRAC_WORD_4(_z,1), 0, _b_f1, _b_f0,             \
248                     _FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),        \
249                     _FP_FRAC_WORD_4(_z,1));                             \
250     __FP_FRAC_ADD_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),        \
251                     _FP_FRAC_WORD_4(_z,1), 0, _c_f1, _c_f0,             \
252                     _FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),        \
253                     _FP_FRAC_WORD_4(_z,1));                             \
254                                                                         \
255     /* Normalize since we know where the msb of the multiplicands       \
256        were (bit B), we know that the msb of the of the product is      \
257        at either 2B or 2B-1.  */                                        \
258     _FP_FRAC_SRS_4(_z, wfracbits-1, 2*wfracbits);                       \
259     R##_f0 = _FP_FRAC_WORD_4(_z,0);                                     \
260     R##_f1 = _FP_FRAC_WORD_4(_z,1);                                     \
261   } while (0)
262
263 /* Given a 1W * 1W => 2W primitive, do the extended multiplication.
264    Do only 3 multiplications instead of four. This one is for machines
265    where multiplication is much more expensive than subtraction.  */
266
267 #define _FP_MUL_MEAT_2_wide_3mul(wfracbits, R, X, Y, doit)              \
268   do {                                                                  \
269     _FP_FRAC_DECL_4(_z); _FP_FRAC_DECL_2(_b); _FP_FRAC_DECL_2(_c);      \
270     _FP_W_TYPE _d;                                                      \
271     int _c1, _c2;                                                       \
272                                                                         \
273     _b_f0 = X##_f0 + X##_f1;                                            \
274     _c1 = _b_f0 < X##_f0;                                               \
275     _b_f1 = Y##_f0 + Y##_f1;                                            \
276     _c2 = _b_f1 < Y##_f0;                                               \
277     doit(_d, _FP_FRAC_WORD_4(_z,0), X##_f0, Y##_f0);                    \
278     doit(_FP_FRAC_WORD_4(_z,2), _FP_FRAC_WORD_4(_z,1), _b_f0, _b_f1);   \
279     doit(_c_f1, _c_f0, X##_f1, Y##_f1);                                 \
280                                                                         \
281     _b_f0 &= -_c2;                                                      \
282     _b_f1 &= -_c1;                                                      \
283     __FP_FRAC_ADD_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),        \
284                     _FP_FRAC_WORD_4(_z,1), (_c1 & _c2), 0, _d,          \
285                     0, _FP_FRAC_WORD_4(_z,2), _FP_FRAC_WORD_4(_z,1));   \
286     __FP_FRAC_ADDI_2(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),       \
287                      _b_f0);                                            \
288     __FP_FRAC_ADDI_2(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),       \
289                      _b_f1);                                            \
290     __FP_FRAC_DEC_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),        \
291                     _FP_FRAC_WORD_4(_z,1),                              \
292                     0, _d, _FP_FRAC_WORD_4(_z,0));                      \
293     __FP_FRAC_DEC_3(_FP_FRAC_WORD_4(_z,3),_FP_FRAC_WORD_4(_z,2),        \
294                     _FP_FRAC_WORD_4(_z,1), 0, _c_f1, _c_f0);            \
295     __FP_FRAC_ADD_2(_FP_FRAC_WORD_4(_z,3), _FP_FRAC_WORD_4(_z,2),       \
296                     _c_f1, _c_f0,                                       \
297                     _FP_FRAC_WORD_4(_z,3), _FP_FRAC_WORD_4(_z,2));      \
298                                                                         \
299     /* Normalize since we know where the msb of the multiplicands       \
300        were (bit B), we know that the msb of the of the product is      \
301        at either 2B or 2B-1.  */                                        \
302     _FP_FRAC_SRS_4(_z, wfracbits-1, 2*wfracbits);                       \
303     R##_f0 = _FP_FRAC_WORD_4(_z,0);                                     \
304     R##_f1 = _FP_FRAC_WORD_4(_z,1);                                     \
305   } while (0)
306
307 #define _FP_MUL_MEAT_2_gmp(wfracbits, R, X, Y)                          \
308   do {                                                                  \
309     _FP_FRAC_DECL_4(_z);                                                \
310     _FP_W_TYPE _x[2], _y[2];                                            \
311     _x[0] = X##_f0; _x[1] = X##_f1;                                     \
312     _y[0] = Y##_f0; _y[1] = Y##_f1;                                     \
313                                                                         \
314     mpn_mul_n(_z_f, _x, _y, 2);                                         \
315                                                                         \
316     /* Normalize since we know where the msb of the multiplicands       \
317        were (bit B), we know that the msb of the of the product is      \
318        at either 2B or 2B-1.  */                                        \
319     _FP_FRAC_SRS_4(_z, wfracbits-1, 2*wfracbits);                       \
320     R##_f0 = _z_f[0];                                                   \
321     R##_f1 = _z_f[1];                                                   \
322   } while (0)
323
324 /* Do at most 120x120=240 bits multiplication using double floating
325    point multiplication.  This is useful if floating point
326    multiplication has much bigger throughput than integer multiply.
327    It is supposed to work for _FP_W_TYPE_SIZE 64 and wfracbits
328    between 106 and 120 only.  
329    Caller guarantees that X and Y has (1LLL << (wfracbits - 1)) set.
330    SETFETZ is a macro which will disable all FPU exceptions and set rounding
331    towards zero,  RESETFE should optionally reset it back.  */
332
333 #define _FP_MUL_MEAT_2_120_240_double(wfracbits, R, X, Y, setfetz, resetfe)     \
334   do {                                                                          \
335     static const double _const[] = {                                            \
336       /* 2^-24 */ 5.9604644775390625e-08,                                       \
337       /* 2^-48 */ 3.5527136788005009e-15,                                       \
338       /* 2^-72 */ 2.1175823681357508e-22,                                       \
339       /* 2^-96 */ 1.2621774483536189e-29,                                       \
340       /* 2^28 */ 2.68435456e+08,                                                \
341       /* 2^4 */ 1.600000e+01,                                                   \
342       /* 2^-20 */ 9.5367431640625e-07,                                          \
343       /* 2^-44 */ 5.6843418860808015e-14,                                       \
344       /* 2^-68 */ 3.3881317890172014e-21,                                       \
345       /* 2^-92 */ 2.0194839173657902e-28,                                       \
346       /* 2^-116 */ 1.2037062152420224e-35};                                     \
347     double _a240, _b240, _c240, _d240, _e240, _f240,                            \
348            _g240, _h240, _i240, _j240, _k240;                                   \
349     union { double d; UDItype i; } _l240, _m240, _n240, _o240,                  \
350                                    _p240, _q240, _r240, _s240;                  \
351     UDItype _t240, _u240, _v240, _w240, _x240, _y240 = 0;                       \
352                                                                                 \
353     if (wfracbits < 106 || wfracbits > 120)                                     \
354       abort();                                                                  \
355                                                                                 \
356     setfetz;                                                                    \
357                                                                                 \
358     _e240 = (double)(long)(X##_f0 & 0xffffff);                                  \
359     _j240 = (double)(long)(Y##_f0 & 0xffffff);                                  \
360     _d240 = (double)(long)((X##_f0 >> 24) & 0xffffff);                          \
361     _i240 = (double)(long)((Y##_f0 >> 24) & 0xffffff);                          \
362     _c240 = (double)(long)(((X##_f1 << 16) & 0xffffff) | (X##_f0 >> 48));       \
363     _h240 = (double)(long)(((Y##_f1 << 16) & 0xffffff) | (Y##_f0 >> 48));       \
364     _b240 = (double)(long)((X##_f1 >> 8) & 0xffffff);                           \
365     _g240 = (double)(long)((Y##_f1 >> 8) & 0xffffff);                           \
366     _a240 = (double)(long)(X##_f1 >> 32);                                       \
367     _f240 = (double)(long)(Y##_f1 >> 32);                                       \
368     _e240 *= _const[3];                                                         \
369     _j240 *= _const[3];                                                         \
370     _d240 *= _const[2];                                                         \
371     _i240 *= _const[2];                                                         \
372     _c240 *= _const[1];                                                         \
373     _h240 *= _const[1];                                                         \
374     _b240 *= _const[0];                                                         \
375     _g240 *= _const[0];                                                         \
376     _s240.d =                                                         _e240*_j240;\
377     _r240.d =                                           _d240*_j240 + _e240*_i240;\
378     _q240.d =                             _c240*_j240 + _d240*_i240 + _e240*_h240;\
379     _p240.d =               _b240*_j240 + _c240*_i240 + _d240*_h240 + _e240*_g240;\
380     _o240.d = _a240*_j240 + _b240*_i240 + _c240*_h240 + _d240*_g240 + _e240*_f240;\
381     _n240.d = _a240*_i240 + _b240*_h240 + _c240*_g240 + _d240*_f240;            \
382     _m240.d = _a240*_h240 + _b240*_g240 + _c240*_f240;                          \
383     _l240.d = _a240*_g240 + _b240*_f240;                                        \
384     _k240 =   _a240*_f240;                                                      \
385     _r240.d += _s240.d;                                                         \
386     _q240.d += _r240.d;                                                         \
387     _p240.d += _q240.d;                                                         \
388     _o240.d += _p240.d;                                                         \
389     _n240.d += _o240.d;                                                         \
390     _m240.d += _n240.d;                                                         \
391     _l240.d += _m240.d;                                                         \
392     _k240 += _l240.d;                                                           \
393     _s240.d -= ((_const[10]+_s240.d)-_const[10]);                               \
394     _r240.d -= ((_const[9]+_r240.d)-_const[9]);                                 \
395     _q240.d -= ((_const[8]+_q240.d)-_const[8]);                                 \
396     _p240.d -= ((_const[7]+_p240.d)-_const[7]);                                 \
397     _o240.d += _const[7];                                                       \
398     _n240.d += _const[6];                                                       \
399     _m240.d += _const[5];                                                       \
400     _l240.d += _const[4];                                                       \
401     if (_s240.d != 0.0) _y240 = 1;                                              \
402     if (_r240.d != 0.0) _y240 = 1;                                              \
403     if (_q240.d != 0.0) _y240 = 1;                                              \
404     if (_p240.d != 0.0) _y240 = 1;                                              \
405     _t240 = (DItype)_k240;                                                      \
406     _u240 = _l240.i;                                                            \
407     _v240 = _m240.i;                                                            \
408     _w240 = _n240.i;                                                            \
409     _x240 = _o240.i;                                                            \
410     R##_f1 = (_t240 << (128 - (wfracbits - 1)))                                 \
411              | ((_u240 & 0xffffff) >> ((wfracbits - 1) - 104));                 \
412     R##_f0 = ((_u240 & 0xffffff) << (168 - (wfracbits - 1)))                    \
413              | ((_v240 & 0xffffff) << (144 - (wfracbits - 1)))                  \
414              | ((_w240 & 0xffffff) << (120 - (wfracbits - 1)))                  \
415              | ((_x240 & 0xffffff) >> ((wfracbits - 1) - 96))                   \
416              | _y240;                                                           \
417     resetfe;                                                                    \
418   } while (0)
419
420 /*
421  * Division algorithms:
422  */
423
424 #define _FP_DIV_MEAT_2_udiv(fs, R, X, Y)                                \
425   do {                                                                  \
426     _FP_W_TYPE _n_f2, _n_f1, _n_f0, _r_f1, _r_f0, _m_f1, _m_f0;         \
427     if (_FP_FRAC_GT_2(X, Y))                                            \
428       {                                                                 \
429         _n_f2 = X##_f1 >> 1;                                            \
430         _n_f1 = X##_f1 << (_FP_W_TYPE_SIZE - 1) | X##_f0 >> 1;          \
431         _n_f0 = X##_f0 << (_FP_W_TYPE_SIZE - 1);                        \
432       }                                                                 \
433     else                                                                \
434       {                                                                 \
435         R##_e--;                                                        \
436         _n_f2 = X##_f1;                                                 \
437         _n_f1 = X##_f0;                                                 \
438         _n_f0 = 0;                                                      \
439       }                                                                 \
440                                                                         \
441     /* Normalize, i.e. make the most significant bit of the             \
442        denominator set. */                                              \
443     _FP_FRAC_SLL_2(Y, _FP_WFRACXBITS_##fs);                             \
444                                                                         \
445     udiv_qrnnd(R##_f1, _r_f1, _n_f2, _n_f1, Y##_f1);                    \
446     umul_ppmm(_m_f1, _m_f0, R##_f1, Y##_f0);                            \
447     _r_f0 = _n_f0;                                                      \
448     if (_FP_FRAC_GT_2(_m, _r))                                          \
449       {                                                                 \
450         R##_f1--;                                                       \
451         _FP_FRAC_ADD_2(_r, Y, _r);                                      \
452         if (_FP_FRAC_GE_2(_r, Y) && _FP_FRAC_GT_2(_m, _r))              \
453           {                                                             \
454             R##_f1--;                                                   \
455             _FP_FRAC_ADD_2(_r, Y, _r);                                  \
456           }                                                             \
457       }                                                                 \
458     _FP_FRAC_DEC_2(_r, _m);                                             \
459                                                                         \
460     if (_r_f1 == Y##_f1)                                                \
461       {                                                                 \
462         /* This is a special case, not an optimization                  \
463            (_r/Y##_f1 would not fit into UWtype).                       \
464            As _r is guaranteed to be < Y,  R##_f0 can be either         \
465            (UWtype)-1 or (UWtype)-2.  But as we know what kind          \
466            of bits it is (sticky, guard, round),  we don't care.        \
467            We also don't care what the reminder is,  because the        \
468            guard bit will be set anyway.  -jj */                        \
469         R##_f0 = -1;                                                    \
470       }                                                                 \
471     else                                                                \
472       {                                                                 \
473         udiv_qrnnd(R##_f0, _r_f1, _r_f1, _r_f0, Y##_f1);                \
474         umul_ppmm(_m_f1, _m_f0, R##_f0, Y##_f0);                        \
475         _r_f0 = 0;                                                      \
476         if (_FP_FRAC_GT_2(_m, _r))                                      \
477           {                                                             \
478             R##_f0--;                                                   \
479             _FP_FRAC_ADD_2(_r, Y, _r);                                  \
480             if (_FP_FRAC_GE_2(_r, Y) && _FP_FRAC_GT_2(_m, _r))          \
481               {                                                         \
482                 R##_f0--;                                               \
483                 _FP_FRAC_ADD_2(_r, Y, _r);                              \
484               }                                                         \
485           }                                                             \
486         if (!_FP_FRAC_EQ_2(_r, _m))                                     \
487           R##_f0 |= _FP_WORK_STICKY;                                    \
488       }                                                                 \
489   } while (0)
490
491
492 #define _FP_DIV_MEAT_2_gmp(fs, R, X, Y)                                 \
493   do {                                                                  \
494     _FP_W_TYPE _x[4], _y[2], _z[4];                                     \
495     _y[0] = Y##_f0; _y[1] = Y##_f1;                                     \
496     _x[0] = _x[3] = 0;                                                  \
497     if (_FP_FRAC_GT_2(X, Y))                                            \
498       {                                                                 \
499         R##_e++;                                                        \
500         _x[1] = (X##_f0 << (_FP_WFRACBITS_##fs-1 - _FP_W_TYPE_SIZE) |   \
501                  X##_f1 >> (_FP_W_TYPE_SIZE -                           \
502                             (_FP_WFRACBITS_##fs-1 - _FP_W_TYPE_SIZE))); \
503         _x[2] = X##_f1 << (_FP_WFRACBITS_##fs-1 - _FP_W_TYPE_SIZE);     \
504       }                                                                 \
505     else                                                                \
506       {                                                                 \
507         _x[1] = (X##_f0 << (_FP_WFRACBITS_##fs - _FP_W_TYPE_SIZE) |     \
508                  X##_f1 >> (_FP_W_TYPE_SIZE -                           \
509                             (_FP_WFRACBITS_##fs - _FP_W_TYPE_SIZE)));   \
510         _x[2] = X##_f1 << (_FP_WFRACBITS_##fs - _FP_W_TYPE_SIZE);       \
511       }                                                                 \
512                                                                         \
513     (void) mpn_divrem (_z, 0, _x, 4, _y, 2);                            \
514     R##_f1 = _z[1];                                                     \
515     R##_f0 = _z[0] | ((_x[0] | _x[1]) != 0);                            \
516   } while (0)
517
518
519 /*
520  * Square root algorithms:
521  * We have just one right now, maybe Newton approximation
522  * should be added for those machines where division is fast.
523  */
524  
525 #define _FP_SQRT_MEAT_2(R, S, T, X, q)                  \
526   do {                                                  \
527     while (q)                                           \
528       {                                                 \
529         T##_f1 = S##_f1 + q;                            \
530         if (T##_f1 <= X##_f1)                           \
531           {                                             \
532             S##_f1 = T##_f1 + q;                        \
533             X##_f1 -= T##_f1;                           \
534             R##_f1 += q;                                \
535           }                                             \
536         _FP_FRAC_SLL_2(X, 1);                           \
537         q >>= 1;                                        \
538       }                                                 \
539     q = (_FP_W_TYPE)1 << (_FP_W_TYPE_SIZE - 1);         \
540     while (q != _FP_WORK_ROUND)                         \
541       {                                                 \
542         T##_f0 = S##_f0 + q;                            \
543         T##_f1 = S##_f1;                                \
544         if (T##_f1 < X##_f1 ||                          \
545             (T##_f1 == X##_f1 && T##_f0 <= X##_f0))     \
546           {                                             \
547             S##_f0 = T##_f0 + q;                        \
548             S##_f1 += (T##_f0 > S##_f0);                \
549             _FP_FRAC_DEC_2(X, T);                       \
550             R##_f0 += q;                                \
551           }                                             \
552         _FP_FRAC_SLL_2(X, 1);                           \
553         q >>= 1;                                        \
554       }                                                 \
555     if (X##_f0 | X##_f1)                                \
556       {                                                 \
557         if (S##_f1 < X##_f1 ||                          \
558             (S##_f1 == X##_f1 && S##_f0 < X##_f0))      \
559           R##_f0 |= _FP_WORK_ROUND;                     \
560         R##_f0 |= _FP_WORK_STICKY;                      \
561       }                                                 \
562   } while (0)
563
564
565 /*
566  * Assembly/disassembly for converting to/from integral types.  
567  * No shifting or overflow handled here.
568  */
569
570 #define _FP_FRAC_ASSEMBLE_2(r, X, rsize)        \
571   do {                                          \
572     if (rsize <= _FP_W_TYPE_SIZE)               \
573       r = X##_f0;                               \
574     else                                        \
575       {                                         \
576         r = X##_f1;                             \
577         r <<= _FP_W_TYPE_SIZE;                  \
578         r += X##_f0;                            \
579       }                                         \
580   } while (0)
581
582 #define _FP_FRAC_DISASSEMBLE_2(X, r, rsize)                             \
583   do {                                                                  \
584     X##_f0 = r;                                                         \
585     X##_f1 = (rsize <= _FP_W_TYPE_SIZE ? 0 : r >> _FP_W_TYPE_SIZE);     \
586   } while (0)
587
588 /*
589  * Convert FP values between word sizes
590  */
591
592 #define _FP_FRAC_CONV_1_2(dfs, sfs, D, S)                               \
593   do {                                                                  \
594     if (S##_c != FP_CLS_NAN)                                            \
595       _FP_FRAC_SRS_2(S, (_FP_WFRACBITS_##sfs - _FP_WFRACBITS_##dfs),    \
596                      _FP_WFRACBITS_##sfs);                              \
597     else                                                                \
598       _FP_FRAC_SRL_2(S, (_FP_WFRACBITS_##sfs - _FP_WFRACBITS_##dfs));   \
599     D##_f = S##_f0;                                                     \
600   } while (0)
601
602 #define _FP_FRAC_CONV_2_1(dfs, sfs, D, S)                               \
603   do {                                                                  \
604     D##_f0 = S##_f;                                                     \
605     D##_f1 = 0;                                                         \
606     _FP_FRAC_SLL_2(D, (_FP_WFRACBITS_##dfs - _FP_WFRACBITS_##sfs));     \
607   } while (0)
608