Conditionalise ARM asm on !__thumb__.
1 /* longlong.h -- definitions for mixed size 32/64 bit arithmetic.
2
3 Copyright 1991-1994, 1996, 1997, 1999-2005, 2007-2009, 2011-2013 Free Software
4 Foundation, Inc.
5
6 This file is part of the GNU MP Library.
7
8 The GNU MP Library is free software; you can redistribute it and/or modify
9 it under the terms of either:
10
11   * the GNU Lesser General Public License as published by the Free
12     Software Foundation; either version 3 of the License, or (at your
13     option) any later version.
14
15 or
16
17   * the GNU General Public License as published by the Free Software
18     Foundation; either version 2 of the License, or (at your option) any
19     later version.
20
21 or both in parallel, as here.
22
23 The GNU MP Library is distributed in the hope that it will be useful, but
24 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
26 for more details.
27
28 You should have received copies of the GNU General Public License and the
29 GNU Lesser General Public License along with the GNU MP Library.  If not,
30 see https://www.gnu.org/licenses/.  */
31
32 /* You have to define the following before including this file:
33
34    UWtype -- An unsigned type, default type for operations (typically a "word")
35    UHWtype -- An unsigned type, at least half the size of UWtype
36    UDWtype -- An unsigned type, at least twice as large as UWtype
37    W_TYPE_SIZE -- size in bits of UWtype
38
39    SItype, USItype -- Signed and unsigned 32 bit types
40    DItype, UDItype -- Signed and unsigned 64 bit types
41
42    On a 32 bit machine UWtype should typically be USItype;
43    on a 64 bit machine, UWtype should typically be UDItype.
44
45    Optionally, define:
46
47    LONGLONG_STANDALONE -- Avoid code that needs machine-dependent support files
48    NO_ASM -- Disable inline asm
49
50
51    CAUTION!  Using this version of longlong.h outside of GMP is not safe.  You
52    need to include gmp.h and gmp-impl.h, or certain things might not work as
53    expected.
54 */
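/* A minimal sketch, not part of this file's interface: one plausible set of
   definitions for a 64-bit limb build, assuming a compiler that provides
   __int128.  In GMP itself these come from gmp.h and gmp-impl.h, which should
   be included instead of writing anything like this by hand.  */
#if 0
typedef int                 SItype;
typedef unsigned int        USItype;
typedef long                DItype;
typedef unsigned long       UDItype;
typedef unsigned int        UHWtype;    /* at least half the size of UWtype */
typedef unsigned long       UWtype;     /* one full word (a limb) */
typedef unsigned __int128   UDWtype;    /* at least twice the size of UWtype */
#define W_TYPE_SIZE 64
#endif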
55
56 #define __BITS4 (W_TYPE_SIZE / 4)
57 #define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2))
58 #define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1))
59 #define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2))
60
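/* Rough illustration only, not part of this file's interface: a portable
   double-word multiply built from the half-word helpers above.  The generic
   umul_ppmm fallback further down in the full file works along these lines.  */
#if 0
#define example_umul_ppmm(w1, w0, u, v)                                 \
  do {                                                                  \
    UWtype __x0, __x1, __x2, __x3;                                      \
    UHWtype __ul, __vl, __uh, __vh;                                     \
    __ul = __ll_lowpart (u);                                            \
    __uh = __ll_highpart (u);                                           \
    __vl = __ll_lowpart (v);                                            \
    __vh = __ll_highpart (v);                                           \
    __x0 = (UWtype) __ul * __vl;                                        \
    __x1 = (UWtype) __ul * __vh;                                        \
    __x2 = (UWtype) __uh * __vl;                                        \
    __x3 = (UWtype) __uh * __vh;                                        \
    __x1 += __ll_highpart (__x0);  /* this cannot carry */              \
    __x1 += __x2;                  /* but this can */                   \
    if (__x1 < __x2)               /* did it? */                        \
      __x3 += __ll_B;              /* yes, add it in the proper place */\
    (w1) = __x3 + __ll_highpart (__x1);                                 \
    (w0) = (__ll_lowpart (__x1) << W_TYPE_SIZE/2) + __ll_lowpart (__x0);\
  } while (0)
#endif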
61 /* This is used to make sure that no undesirable sharing takes place between
62    different libraries that use this file.  */
63 #ifndef __MPN
64 #define __MPN(x) __##x
65 #endif
66
67 /* Define auxiliary asm macros.
68
69    1) umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two
70    UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two-word
71    UWtype product in HIGH_PROD and LOW_PROD.
72
73    2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a
74    UDWtype product.  This is just a variant of umul_ppmm.
75
76    3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
77    denominator) divides a UDWtype, composed by the UWtype integers
78    HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient
79    in QUOTIENT and the remainder in REMAINDER.  HIGH_NUMERATOR must be less
80    than DENOMINATOR for correct operation.  If, in addition, the most
81    significant bit of DENOMINATOR must be 1, then the pre-processor symbol
82    UDIV_NEEDS_NORMALIZATION is defined to 1.
83
84    4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
85    denominator).  Like udiv_qrnnd but the numbers are signed.  The quotient
86    is rounded towards 0.
87
88    5) count_leading_zeros(count, x) counts the number of zero-bits from the
89    msb to the first non-zero bit in the UWtype X.  This is the number of
90    steps X needs to be shifted left to set the msb.  Undefined for X == 0,
91    unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value.
92
93    6) count_trailing_zeros(count, x) like count_leading_zeros, but counts
94    from the least significant end.
95
96    7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,
97    high_addend_2, low_addend_2) adds two two-word UWtype integers, composed by
98    HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and LOW_ADDEND_2
99    respectively.  The result is placed in HIGH_SUM and LOW_SUM.  Overflow
100    (i.e. carry out) is not stored anywhere, and is lost.
101
102    8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,
103    high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers,
104    composed by HIGH_MINUEND and LOW_MINUEND, and HIGH_SUBTRAHEND and
105    LOW_SUBTRAHEND respectively.  The result is placed in HIGH_DIFFERENCE
106    and LOW_DIFFERENCE.  Overflow (i.e. carry out) is not stored anywhere,
107    and is lost.
108
109    If any of these macros are left undefined for a particular CPU,
110    C macros are used.
111
112
113    Notes:
114
115    For add_ssaaaa the two high and two low addends can both commute, but
116    unfortunately gcc only supports one "%" commutative in each asm block.
117    This has always been so but is only documented in recent versions
118    (eg. pre-release 3.3).  Having two or more "%"s can cause an internal
119    compiler error in certain rare circumstances.
120
121    Apparently it was only the last "%" that was ever actually respected, so
122    the code has been updated to leave just that.  Clearly there's a free
123    choice whether high or low should get it, if there's a reason to favour
124    one over the other.  Also obviously when the constraints on the two
125    operands are identical there's no benefit to the reloader in any "%" at
126    all.
127
128    */
129
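/* A usage sketch, not part of this file: how the macros described above are
   typically combined for a two-word by one-word division.  The function name
   and surrounding code are purely illustrative; the normalization step is
   needed only when UDIV_NEEDS_NORMALIZATION is defined to 1.  */
#if 0
static UWtype
example_divrem_2by1 (UWtype *rem, UWtype nh, UWtype nl, UWtype d)
{
  UWtype q, r;
  int shift;

  /* d must be nonzero and nh < d.  Shift d so its msb is set, and shift the
     numerator by the same amount; since nh < d, no numerator bits are lost. */
  count_leading_zeros (shift, d);
  if (shift != 0)
    {
      d = d << shift;
      nh = (nh << shift) | (nl >> (W_TYPE_SIZE - shift));
      nl = nl << shift;
    }

  udiv_qrnnd (q, r, nh, nl, d);
  *rem = r >> shift;            /* undo the normalization on the remainder */
  return q;
}
#endif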
130 /* The CPUs come in alphabetical order below.
131
132    Please add support for more CPUs here, or improve the current support
133    for the CPUs below!  */
134
135
136 /* count_leading_zeros_gcc_clz is count_leading_zeros implemented with gcc
137    3.4 __builtin_clzl or __builtin_clzll, according to our limb size.
138    Similarly count_trailing_zeros_gcc_ctz using __builtin_ctzl or
139    __builtin_ctzll.
140
141    These builtins are only used where we have checked what code comes out;
142    on some chips they're merely libgcc calls, and in that case we instead
143    want an inline (either asm or generic C).
144
145    These builtins are better than an asm block of the same insn, since an
146    asm block doesn't give gcc any information about scheduling or resource
147    usage.  We keep an asm block for use on prior versions of gcc though.
148
149    For reference, __builtin_ffs existed in gcc prior to __builtin_clz, but
150    it's not used (for count_trailing_zeros) because it generally gives extra
151    code to ensure the result is 0 when the input is 0, which we don't need
152    or want.  */
153
154 #ifdef _LONG_LONG_LIMB
155 #define count_leading_zeros_gcc_clz(count,x)    \
156   do {                                          \
157     ASSERT ((x) != 0);                          \
158     (count) = __builtin_clzll (x);              \
159   } while (0)
160 #else
161 #define count_leading_zeros_gcc_clz(count,x)    \
162   do {                                          \
163     ASSERT ((x) != 0);                          \
164     (count) = __builtin_clzl (x);               \
165   } while (0)
166 #endif
167
168 #ifdef _LONG_LONG_LIMB
169 #define count_trailing_zeros_gcc_ctz(count,x)   \
170   do {                                          \
171     ASSERT ((x) != 0);                          \
172     (count) = __builtin_ctzll (x);              \
173   } while (0)
174 #else
175 #define count_trailing_zeros_gcc_ctz(count,x)   \
176   do {                                          \
177     ASSERT ((x) != 0);                          \
178     (count) = __builtin_ctzl (x);               \
179   } while (0)
180 #endif
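/* Illustration only: a configuration that has verified these builtins expand
   to good inline code typically just forwards the public macros to the
   wrappers above, as the x86 sections further down do.  */
#if 0
#define count_leading_zeros(count,x)   count_leading_zeros_gcc_clz (count, x)
#define count_trailing_zeros(count,x)  count_trailing_zeros_gcc_ctz (count, x)
#endif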
181
182
183 /* FIXME: The macros using external routines like __MPN(count_leading_zeros)
184    don't need to be under !NO_ASM */
185 #if ! defined (NO_ASM)
186
187 #if defined (__alpha) && W_TYPE_SIZE == 64
188 /* Most alpha-based machines, except Cray systems. */
189 #if defined (__GNUC__)
190 #if __GMP_GNUC_PREREQ (3,3)
191 #define umul_ppmm(ph, pl, m0, m1) \
192   do {                                                                  \
193     UDItype __m0 = (m0), __m1 = (m1);                                   \
194     (ph) = __builtin_alpha_umulh (__m0, __m1);                          \
195     (pl) = __m0 * __m1;                                                 \
196   } while (0)
197 #else
198 #define umul_ppmm(ph, pl, m0, m1) \
199   do {                                                                  \
200     UDItype __m0 = (m0), __m1 = (m1);                                   \
201     __asm__ ("umulh %r1,%2,%0"                                          \
202              : "=r" (ph)                                                \
203              : "%rJ" (m0), "rI" (m1));                                  \
204     (pl) = __m0 * __m1;                                                 \
205   } while (0)
206 #endif
207 #define UMUL_TIME 18
208 #else /* ! __GNUC__ */
209 #include <machine/builtins.h>
210 #define umul_ppmm(ph, pl, m0, m1) \
211   do {                                                                  \
212     UDItype __m0 = (m0), __m1 = (m1);                                   \
213     (ph) = __UMULH (__m0, __m1);                                        \
214     (pl) = __m0 * __m1;                                                 \
215   } while (0)
216 #endif
217 #ifndef LONGLONG_STANDALONE
218 #define udiv_qrnnd(q, r, n1, n0, d) \
219   do { UWtype __di;                                                     \
220     __di = __MPN(invert_limb) (d);                                      \
221     udiv_qrnnd_preinv (q, r, n1, n0, d, __di);                          \
222   } while (0)
223 #define UDIV_PREINV_ALWAYS  1
224 #define UDIV_NEEDS_NORMALIZATION 1
225 #define UDIV_TIME 220
226 #endif /* LONGLONG_STANDALONE */
227
228 /* clz_tab is required in all configurations, since mpn/alpha/cntlz.asm
229    always goes into libgmp.so, even when not actually used.  */
230 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
231
232 #if defined (__GNUC__) && HAVE_HOST_CPU_alpha_CIX
233 #define count_leading_zeros(COUNT,X) \
234   __asm__("ctlz %1,%0" : "=r"(COUNT) : "r"(X))
235 #define count_trailing_zeros(COUNT,X) \
236   __asm__("cttz %1,%0" : "=r"(COUNT) : "r"(X))
237 #endif /* clz/ctz using cix */
238
239 #if ! defined (count_leading_zeros)                             \
240   && defined (__GNUC__) && ! defined (LONGLONG_STANDALONE)
241 /* ALPHA_CMPBGE_0 gives "cmpbge $31,src,dst", ie. test src bytes == 0.
242    "$31" is written explicitly in the asm, since an "r" constraint won't
243    select reg 31.  There seems no need to worry about "r31" syntax for cray,
244    since gcc itself (pre-release 3.4) emits just $31 in various places.  */
245 #define ALPHA_CMPBGE_0(dst, src)                                        \
246   do { asm ("cmpbge $31, %1, %0" : "=r" (dst) : "r" (src)); } while (0)
247 /* Zero bytes are turned into bits with cmpbge, a __clz_tab lookup counts
248    them, locating the highest non-zero byte.  A second __clz_tab lookup
249    counts the leading zero bits in that byte, giving the result.  */
250 #define count_leading_zeros(count, x)                                   \
251   do {                                                                  \
252     UWtype  __clz__b, __clz__c, __clz__x = (x);                         \
253     ALPHA_CMPBGE_0 (__clz__b,  __clz__x);           /* zero bytes */    \
254     __clz__b = __clz_tab [(__clz__b >> 1) ^ 0x7F];  /* 8 to 1 byte */   \
255     __clz__b = __clz__b * 8 - 7;                    /* 57 to 1 shift */ \
256     __clz__x >>= __clz__b;                                              \
257     __clz__c = __clz_tab [__clz__x];                /* 8 to 1 bit */    \
258     __clz__b = 65 - __clz__b;                                           \
259     (count) = __clz__b - __clz__c;                                      \
260   } while (0)
261 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
262 #endif /* clz using cmpbge */
263
264 #if ! defined (count_leading_zeros) && ! defined (LONGLONG_STANDALONE)
265 #if HAVE_ATTRIBUTE_CONST
266 long __MPN(count_leading_zeros) (UDItype) __attribute__ ((const));
267 #else
268 long __MPN(count_leading_zeros) (UDItype);
269 #endif
270 #define count_leading_zeros(count, x) \
271   ((count) = __MPN(count_leading_zeros) (x))
272 #endif /* clz using mpn */
273 #endif /* __alpha */
274
275 #if defined (__AVR) && W_TYPE_SIZE == 8
276 #define umul_ppmm(ph, pl, m0, m1) \
277   do {                                                                  \
278     unsigned short __p = (unsigned short) (m0) * (m1);                  \
279     (ph) = __p >> 8;                                                    \
280     (pl) = __p;                                                         \
281   } while (0)
282 #endif /* AVR */
283
284 #if defined (_CRAY) && W_TYPE_SIZE == 64
285 #include <intrinsics.h>
286 #define UDIV_PREINV_ALWAYS  1
287 #define UDIV_NEEDS_NORMALIZATION 1
288 #define UDIV_TIME 220
289 long __MPN(count_leading_zeros) (UDItype);
290 #define count_leading_zeros(count, x) \
291   ((count) = _leadz ((UWtype) (x)))
292 #if defined (_CRAYIEEE)         /* I.e., Cray T90/ieee, T3D, and T3E */
293 #define umul_ppmm(ph, pl, m0, m1) \
294   do {                                                                  \
295     UDItype __m0 = (m0), __m1 = (m1);                                   \
296     (ph) = _int_mult_upper (__m0, __m1);                                \
297     (pl) = __m0 * __m1;                                                 \
298   } while (0)
299 #ifndef LONGLONG_STANDALONE
300 #define udiv_qrnnd(q, r, n1, n0, d) \
301   do { UWtype __di;                                                     \
302     __di = __MPN(invert_limb) (d);                                      \
303     udiv_qrnnd_preinv (q, r, n1, n0, d, __di);                          \
304   } while (0)
305 #endif /* LONGLONG_STANDALONE */
306 #endif /* _CRAYIEEE */
307 #endif /* _CRAY */
308
309 #if defined (__ia64) && W_TYPE_SIZE == 64
310 /* This form encourages gcc (pre-release 3.4 at least) to emit predicated
311    "sub r=r,r" and "sub r=r,r,1", giving a 2 cycle latency.  The generic
312    code using "al<bl" arithmetically comes out making an actual 0 or 1 in a
313    register, which takes an extra cycle.  */
314 #define sub_ddmmss(sh, sl, ah, al, bh, bl)      \
315   do {                                          \
316     UWtype __x;                                 \
317     __x = (al) - (bl);                          \
318     if ((al) < (bl))                            \
319       (sh) = (ah) - (bh) - 1;                   \
320     else                                        \
321       (sh) = (ah) - (bh);                       \
322     (sl) = __x;                                 \
323   } while (0)
324 #if defined (__GNUC__) && ! defined (__INTEL_COMPILER)
325 /* Do both product parts in assembly, since that gives better code with
326    all gcc versions.  Some callers will just use the upper part, and in
327    that situation we waste an instruction, but not any cycles.  */
328 #define umul_ppmm(ph, pl, m0, m1) \
329     __asm__ ("xma.hu %0 = %2, %3, f0\n\txma.l %1 = %2, %3, f0"          \
330              : "=&f" (ph), "=f" (pl)                                    \
331              : "f" (m0), "f" (m1))
332 #define UMUL_TIME 14
333 #define count_leading_zeros(count, x) \
334   do {                                                                  \
335     UWtype _x = (x), _y, _a, _c;                                        \
336     __asm__ ("mux1 %0 = %1, @rev" : "=r" (_y) : "r" (_x));              \
337     __asm__ ("czx1.l %0 = %1" : "=r" (_a) : "r" (-_y | _y));            \
338     _c = (_a - 1) << 3;                                                 \
339     _x >>= _c;                                                          \
340     if (_x >= 1 << 4)                                                   \
341       _x >>= 4, _c += 4;                                                \
342     if (_x >= 1 << 2)                                                   \
343       _x >>= 2, _c += 2;                                                \
344     _c += _x >> 1;                                                      \
345     (count) =  W_TYPE_SIZE - 1 - _c;                                    \
346   } while (0)
347 /* similar to what gcc does for __builtin_ffs, but 0 based rather than 1
348    based, and we don't need a special case for x==0 here */
349 #define count_trailing_zeros(count, x)                                  \
350   do {                                                                  \
351     UWtype __ctz_x = (x);                                               \
352     __asm__ ("popcnt %0 = %1"                                           \
353              : "=r" (count)                                             \
354              : "r" ((__ctz_x-1) & ~__ctz_x));                           \
355   } while (0)
356 #endif
357 #if defined (__INTEL_COMPILER)
358 #include <ia64intrin.h>
359 #define umul_ppmm(ph, pl, m0, m1)                                       \
360   do {                                                                  \
361     UWtype _m0 = (m0), _m1 = (m1);                                      \
362     ph = _m64_xmahu (_m0, _m1, 0);                                      \
363     pl = _m0 * _m1;                                                     \
364   } while (0)
365 #endif
366 #ifndef LONGLONG_STANDALONE
367 #define udiv_qrnnd(q, r, n1, n0, d) \
368   do { UWtype __di;                                                     \
369     __di = __MPN(invert_limb) (d);                                      \
370     udiv_qrnnd_preinv (q, r, n1, n0, d, __di);                          \
371   } while (0)
372 #define UDIV_PREINV_ALWAYS  1
373 #define UDIV_NEEDS_NORMALIZATION 1
374 #endif
375 #define UDIV_TIME 220
376 #endif
377
378
379 #if defined (__GNUC__)
380
381 /* We sometimes need to clobber "cc" with gcc2, but that would not be
382    understood by gcc1.  Use cpp to avoid major code duplication.  */
383 #if __GNUC__ < 2
384 #define __CLOBBER_CC
385 #define __AND_CLOBBER_CC
386 #else /* __GNUC__ >= 2 */
387 #define __CLOBBER_CC : "cc"
388 #define __AND_CLOBBER_CC , "cc"
389 #endif /* __GNUC__ < 2 */
390
391 #if (defined (__a29k__) || defined (_AM29K)) && W_TYPE_SIZE == 32
392 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
393   __asm__ ("add %1,%4,%5\n\taddc %0,%2,%3"                              \
394            : "=r" (sh), "=&r" (sl)                                      \
395            : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl))
396 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
397   __asm__ ("sub %1,%4,%5\n\tsubc %0,%2,%3"                              \
398            : "=r" (sh), "=&r" (sl)                                      \
399            : "r" (ah), "rI" (bh), "r" (al), "rI" (bl))
400 #define umul_ppmm(xh, xl, m0, m1) \
401   do {                                                                  \
402     USItype __m0 = (m0), __m1 = (m1);                                   \
403     __asm__ ("multiplu %0,%1,%2"                                        \
404              : "=r" (xl)                                                \
405              : "r" (__m0), "r" (__m1));                                 \
406     __asm__ ("multmu %0,%1,%2"                                          \
407              : "=r" (xh)                                                \
408              : "r" (__m0), "r" (__m1));                                 \
409   } while (0)
410 #define udiv_qrnnd(q, r, n1, n0, d) \
411   __asm__ ("dividu %0,%3,%4"                                            \
412            : "=r" (q), "=q" (r)                                         \
413            : "1" (n1), "r" (n0), "r" (d))
414 #define count_leading_zeros(count, x) \
415     __asm__ ("clz %0,%1"                                                \
416              : "=r" (count)                                             \
417              : "r" (x))
418 #define COUNT_LEADING_ZEROS_0 32
419 #endif /* __a29k__ */
420
421 #if defined (__arc__)
422 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
423   __asm__ ("add.f\t%1, %4, %5\n\tadc\t%0, %2, %3"                       \
424            : "=r" (sh),                                                 \
425              "=&r" (sl)                                                 \
426            : "r"  ((USItype) (ah)),                                     \
427              "rIJ" ((USItype) (bh)),                                    \
428              "%r" ((USItype) (al)),                                     \
429              "rIJ" ((USItype) (bl)))
430 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
431   __asm__ ("sub.f\t%1, %4, %5\n\tsbc\t%0, %2, %3"                       \
432            : "=r" (sh),                                                 \
433              "=&r" (sl)                                                 \
434            : "r" ((USItype) (ah)),                                      \
435              "rIJ" ((USItype) (bh)),                                    \
436              "r" ((USItype) (al)),                                      \
437              "rIJ" ((USItype) (bl)))
438 #endif
439
440 #if defined (__arm__) && !defined (__thumb__) && W_TYPE_SIZE == 32
441 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
442   __asm__ ("adds\t%1, %4, %5\n\tadc\t%0, %2, %3"                        \
443            : "=r" (sh), "=&r" (sl)                                      \
444            : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC)
445 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
446   do {                                                                  \
447     if (__builtin_constant_p (al))                                      \
448       {                                                                 \
449         if (__builtin_constant_p (ah))                                  \
450           __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2"                \
451                    : "=r" (sh), "=&r" (sl)                              \
452                    : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
453         else                                                            \
454           __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3"                \
455                    : "=r" (sh), "=&r" (sl)                              \
456                    : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
457       }                                                                 \
458     else if (__builtin_constant_p (ah))                                 \
459       {                                                                 \
460         if (__builtin_constant_p (bl))                                  \
461           __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2"                \
462                    : "=r" (sh), "=&r" (sl)                              \
463                    : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
464         else                                                            \
465           __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2"                \
466                    : "=r" (sh), "=&r" (sl)                              \
467                    : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
468       }                                                                 \
469     else if (__builtin_constant_p (bl))                                 \
470       {                                                                 \
471         if (__builtin_constant_p (bh))                                  \
472           __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"                \
473                    : "=r" (sh), "=&r" (sl)                              \
474                    : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
475         else                                                            \
476           __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2"                \
477                    : "=r" (sh), "=&r" (sl)                              \
478                    : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
479       }                                                                 \
480     else /* only bh might be a constant */                              \
481       __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3"                    \
482                : "=r" (sh), "=&r" (sl)                                  \
483                : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC);\
484     } while (0)
485 #if 1 || defined (__arm_m__)    /* `M' series has widening multiply support */
486 #define umul_ppmm(xh, xl, a, b) \
487   __asm__ ("umull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
488 #define UMUL_TIME 5
489 #define smul_ppmm(xh, xl, a, b) \
490   __asm__ ("smull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
491 #ifndef LONGLONG_STANDALONE
492 #define udiv_qrnnd(q, r, n1, n0, d) \
493   do { UWtype __di;                                                     \
494     __di = __MPN(invert_limb) (d);                                      \
495     udiv_qrnnd_preinv (q, r, n1, n0, d, __di);                          \
496   } while (0)
497 #define UDIV_PREINV_ALWAYS  1
498 #define UDIV_NEEDS_NORMALIZATION 1
499 #define UDIV_TIME 70
500 #endif /* LONGLONG_STANDALONE */
501 #else
502 #define umul_ppmm(xh, xl, a, b) \
503   __asm__ ("%@ Inlined umul_ppmm\n"                                     \
504 "       mov     %|r0, %2, lsr #16\n"                                    \
505 "       mov     %|r2, %3, lsr #16\n"                                    \
506 "       bic     %|r1, %2, %|r0, lsl #16\n"                              \
507 "       bic     %|r2, %3, %|r2, lsl #16\n"                              \
508 "       mul     %1, %|r1, %|r2\n"                                       \
509 "       mul     %|r2, %|r0, %|r2\n"                                     \
510 "       mul     %|r1, %0, %|r1\n"                                       \
511 "       mul     %0, %|r0, %0\n"                                         \
512 "       adds    %|r1, %|r2, %|r1\n"                                     \
513 "       addcs   %0, %0, #65536\n"                                       \
514 "       adds    %1, %1, %|r1, lsl #16\n"                                \
515 "       adc     %0, %0, %|r1, lsr #16"                                  \
516            : "=&r" (xh), "=r" (xl)                                      \
517            : "r" (a), "r" (b)                                           \
518            : "r0", "r1", "r2")
519 #define UMUL_TIME 20
520 #ifndef LONGLONG_STANDALONE
521 #define udiv_qrnnd(q, r, n1, n0, d) \
522   do { UWtype __r;                                                      \
523     (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d));                    \
524     (r) = __r;                                                          \
525   } while (0)
526 extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype);
527 #define UDIV_TIME 200
528 #endif /* LONGLONG_STANDALONE */
529 #endif
530 /* This is a bizarre test, but GCC doesn't define any useful common symbol. */
531 #if defined (__ARM_ARCH_5__)  || defined (__ARM_ARCH_5T__) || \
532     defined (__ARM_ARCH_5E__) || defined (__ARM_ARCH_5TE__)|| \
533     defined (__ARM_ARCH_6__)  || defined (__ARM_ARCH_6J__) || \
534     defined (__ARM_ARCH_6K__) || defined (__ARM_ARCH_6Z__) || \
535     defined (__ARM_ARCH_6ZK__)|| defined (__ARM_ARCH_6T2__)|| \
536     defined (__ARM_ARCH_6M__) || defined (__ARM_ARCH_7__)  || \
537     defined (__ARM_ARCH_7A__) || defined (__ARM_ARCH_7R__) || \
538     defined (__ARM_ARCH_7M__) || defined (__ARM_ARCH_7EM__)
539 #define count_leading_zeros(count, x) \
540   __asm__ ("clz\t%0, %1" : "=r" (count) : "r" (x))
541 #define COUNT_LEADING_ZEROS_0 32
542 #endif
543 #endif /* __arm__ */
544
545 #if defined (__aarch64__) && W_TYPE_SIZE == 64
546 /* FIXME: Extend the immediate range for the low word by using both
547    ADDS and SUBS, since they set carry in the same way.  */
548 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
549   __asm__ ("adds\t%1, %x4, %5\n\tadc\t%0, %x2, %x3"                     \
550            : "=r" (sh), "=&r" (sl)                                      \
551            : "rZ" (ah), "rZ" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC)
552 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
553   __asm__ ("subs\t%1, %x4, %5\n\tsbc\t%0, %x2, %x3"                     \
554            : "=r,r" (sh), "=&r,&r" (sl)                                 \
555            : "rZ,rZ" (ah), "rZ,rZ" (bh), "r,Z" (al), "rI,r" (bl) __CLOBBER_CC)
556 #define umul_ppmm(ph, pl, m0, m1) \
557   do {                                                                  \
558     UDItype __m0 = (m0), __m1 = (m1);                                   \
559     __asm__ ("umulh\t%0, %1, %2" : "=r" (ph) : "r" (m0), "r" (m1));     \
560     (pl) = __m0 * __m1;                                                 \
561   } while (0)
562 #define count_leading_zeros(count, x) \
563   __asm__ ("clz\t%0, %1" : "=r" (count) : "r" (x))
564 #define count_trailing_zeros(count, x) \
565   __asm__ ("rbit\t%0, %1\n\tclz\t%0, %0" : "=r" (count) : "r" (x))
566 #define COUNT_LEADING_ZEROS_0 64
567 #endif /* __aarch64__ */
568
569 #if defined (__clipper__) && W_TYPE_SIZE == 32
570 #define umul_ppmm(w1, w0, u, v) \
571   ({union {UDItype __ll;                                                \
572            struct {USItype __l, __h;} __i;                              \
573           } __x;                                                        \
574   __asm__ ("mulwux %2,%0"                                               \
575            : "=r" (__x.__ll)                                            \
576            : "%0" ((USItype)(u)), "r" ((USItype)(v)));                  \
577   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
578 #define smul_ppmm(w1, w0, u, v) \
579   ({union {DItype __ll;                                                 \
580            struct {SItype __l, __h;} __i;                               \
581           } __x;                                                        \
582   __asm__ ("mulwx %2,%0"                                                \
583            : "=r" (__x.__ll)                                            \
584            : "%0" ((SItype)(u)), "r" ((SItype)(v)));                    \
585   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
586 #define __umulsidi3(u, v) \
587   ({UDItype __w;                                                        \
588     __asm__ ("mulwux %2,%0"                                             \
589              : "=r" (__w) : "%0" ((USItype)(u)), "r" ((USItype)(v)));   \
590     __w; })
591 #endif /* __clipper__ */
592
593 /* Fujitsu vector computers.  */
594 #if defined (__uxp__) && W_TYPE_SIZE == 32
595 #define umul_ppmm(ph, pl, u, v) \
596   do {                                                                  \
597     union {UDItype __ll;                                                \
598            struct {USItype __h, __l;} __i;                              \
599           } __x;                                                        \
600     __asm__ ("mult.lu %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v));\
601     (ph) = __x.__i.__h;                                                 \
602     (pl) = __x.__i.__l;                                                 \
603   } while (0)
604 #define smul_ppmm(ph, pl, u, v) \
605   do {                                                                  \
606     union {UDItype __ll;                                                \
607            struct {USItype __h, __l;} __i;                              \
608           } __x;                                                        \
609     __asm__ ("mult.l %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v)); \
610     (ph) = __x.__i.__h;                                                 \
611     (pl) = __x.__i.__l;                                                 \
612   } while (0)
613 #endif
614
615 #if defined (__gmicro__) && W_TYPE_SIZE == 32
616 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
617   __asm__ ("add.w %5,%1\n\taddx %3,%0"                                  \
618            : "=g" (sh), "=&g" (sl)                                      \
619            : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),                 \
620              "%1" ((USItype)(al)), "g" ((USItype)(bl)))
621 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
622   __asm__ ("sub.w %5,%1\n\tsubx %3,%0"                                  \
623            : "=g" (sh), "=&g" (sl)                                      \
624            : "0" ((USItype)(ah)), "g" ((USItype)(bh)),                  \
625              "1" ((USItype)(al)), "g" ((USItype)(bl)))
626 #define umul_ppmm(ph, pl, m0, m1) \
627   __asm__ ("mulx %3,%0,%1"                                              \
628            : "=g" (ph), "=r" (pl)                                       \
629            : "%0" ((USItype)(m0)), "g" ((USItype)(m1)))
630 #define udiv_qrnnd(q, r, nh, nl, d) \
631   __asm__ ("divx %4,%0,%1"                                              \
632            : "=g" (q), "=r" (r)                                         \
633            : "1" ((USItype)(nh)), "0" ((USItype)(nl)), "g" ((USItype)(d)))
634 #define count_leading_zeros(count, x) \
635   __asm__ ("bsch/1 %1,%0"                                               \
636            : "=g" (count) : "g" ((USItype)(x)), "0" ((USItype)0))
637 #endif
638
639 #if defined (__hppa) && W_TYPE_SIZE == 32
640 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
641   __asm__ ("add%I5 %5,%r4,%1\n\taddc %r2,%r3,%0"                        \
642            : "=r" (sh), "=&r" (sl)                                      \
643            : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
644 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
645   __asm__ ("sub%I4 %4,%r5,%1\n\tsubb %r2,%r3,%0"                        \
646            : "=r" (sh), "=&r" (sl)                                      \
647            : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
648 #if defined (_PA_RISC1_1)
649 #define umul_ppmm(wh, wl, u, v) \
650   do {                                                                  \
651     union {UDItype __ll;                                                \
652            struct {USItype __h, __l;} __i;                              \
653           } __x;                                                        \
654     __asm__ ("xmpyu %1,%2,%0" : "=*f" (__x.__ll) : "*f" (u), "*f" (v)); \
655     (wh) = __x.__i.__h;                                                 \
656     (wl) = __x.__i.__l;                                                 \
657   } while (0)
658 #define UMUL_TIME 8
659 #define UDIV_TIME 60
660 #else
661 #define UMUL_TIME 40
662 #define UDIV_TIME 80
663 #endif
664 #define count_leading_zeros(count, x) \
665   do {                                                                  \
666     USItype __tmp;                                                      \
667     __asm__ (                                                           \
668        "ldi             1,%0\n"                                         \
669 "       extru,=         %1,15,16,%%r0   ; Bits 31..16 zero?\n"          \
670 "       extru,tr        %1,15,16,%1     ; No.  Shift down, skip add.\n" \
671 "       ldo             16(%0),%0       ; Yes.  Perform add.\n"         \
672 "       extru,=         %1,23,8,%%r0    ; Bits 15..8 zero?\n"           \
673 "       extru,tr        %1,23,8,%1      ; No.  Shift down, skip add.\n" \
674 "       ldo             8(%0),%0        ; Yes.  Perform add.\n"         \
675 "       extru,=         %1,27,4,%%r0    ; Bits 7..4 zero?\n"            \
676 "       extru,tr        %1,27,4,%1      ; No.  Shift down, skip add.\n" \
677 "       ldo             4(%0),%0        ; Yes.  Perform add.\n"         \
678 "       extru,=         %1,29,2,%%r0    ; Bits 3..2 zero?\n"            \
679 "       extru,tr        %1,29,2,%1      ; No.  Shift down, skip add.\n" \
680 "       ldo             2(%0),%0        ; Yes.  Perform add.\n"         \
681 "       extru           %1,30,1,%1      ; Extract bit 1.\n"             \
682 "       sub             %0,%1,%0        ; Subtract it.\n"               \
683         : "=r" (count), "=r" (__tmp) : "1" (x));                        \
684   } while (0)
685 #endif /* hppa */
686
687 /* These macros are for ABI=2.0w.  In ABI=2.0n they can't be used, since GCC
688    (3.2) puts longlong into two adjacent 32-bit registers.  Presumably this
689    is just a case of no direct support for 2.0n but treating it like 1.0. */
690 #if defined (__hppa) && W_TYPE_SIZE == 64 && ! defined (_LONG_LONG_LIMB)
691 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
692   __asm__ ("add%I5 %5,%r4,%1\n\tadd,dc %r2,%r3,%0"                      \
693            : "=r" (sh), "=&r" (sl)                                      \
694            : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
695 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
696   __asm__ ("sub%I4 %4,%r5,%1\n\tsub,db %r2,%r3,%0"                      \
697            : "=r" (sh), "=&r" (sl)                                      \
698            : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
699 #endif /* hppa */
700
701 #if (defined (__i370__) || defined (__s390__) || defined (__mvs__)) && W_TYPE_SIZE == 32
702 #if defined (__zarch__) || defined (HAVE_HOST_CPU_s390_zarch)
703 #define add_ssaaaa(sh, sl, ah, al, bh, bl)                              \
704   do {                                                                  \
705 /*  if (__builtin_constant_p (bl))                                      \
706       __asm__ ("alfi\t%1,%o5\n\talcr\t%0,%3"                            \
707                : "=r" (sh), "=&r" (sl)                                  \
708                : "0"  (ah), "r" (bh), "%1" (al), "n" (bl) __CLOBBER_CC);\
709     else                                                                \
710 */    __asm__ ("alr\t%1,%5\n\talcr\t%0,%3"                              \
711                : "=r" (sh), "=&r" (sl)                                  \
712                : "0"  (ah), "r" (bh), "%1" (al), "r" (bl)__CLOBBER_CC); \
713   } while (0)
714 #define sub_ddmmss(sh, sl, ah, al, bh, bl)                              \
715   do {                                                                  \
716 /*  if (__builtin_constant_p (bl))                                      \
717       __asm__ ("slfi\t%1,%o5\n\tslbr\t%0,%3"                            \
718                : "=r" (sh), "=&r" (sl)                                  \
719                : "0" (ah), "r" (bh), "1" (al), "n" (bl) __CLOBBER_CC);  \
720     else                                                                \
721 */    __asm__ ("slr\t%1,%5\n\tslbr\t%0,%3"                              \
722                : "=r" (sh), "=&r" (sl)                                  \
723                : "0" (ah), "r" (bh), "1" (al), "r" (bl) __CLOBBER_CC);  \
724   } while (0)
725 #if __GMP_GNUC_PREREQ (4,5)
726 #define umul_ppmm(xh, xl, m0, m1)                                       \
727   do {                                                                  \
728     union {UDItype __ll;                                                \
729            struct {USItype __h, __l;} __i;                              \
730           } __x;                                                        \
731     __x.__ll = (UDItype) (m0) * (UDItype) (m1);                         \
732     (xh) = __x.__i.__h; (xl) = __x.__i.__l;                             \
733   } while (0)
734 #else
735 #if 0
736 /* FIXME: this fails if gcc knows about the 64-bit registers.  Use only
737    with a new enough processor pretending we have 32-bit registers.  */
738 #define umul_ppmm(xh, xl, m0, m1)                                       \
739   do {                                                                  \
740     union {UDItype __ll;                                                \
741            struct {USItype __h, __l;} __i;                              \
742           } __x;                                                        \
743     __asm__ ("mlr\t%0,%2"                                               \
744              : "=r" (__x.__ll)                                          \
745              : "%0" (m0), "r" (m1));                                    \
746     (xh) = __x.__i.__h; (xl) = __x.__i.__l;                             \
747   } while (0)
748 #else
749 #define umul_ppmm(xh, xl, m0, m1)                                       \
750   do {                                                                  \
751   /* When we have 64-bit regs and gcc is aware of that, we cannot simply use
752      DImode for the product, since that would be allocated to a single 64-bit
753      register, whereas mlr uses the low 32-bits of an even-odd register pair.
754   */                                                                    \
755     register USItype __r0 __asm__ ("0");                                \
756     register USItype __r1 __asm__ ("1") = (m0);                         \
757     __asm__ ("mlr\t%0,%3"                                               \
758              : "=r" (__r0), "=r" (__r1)                                 \
759              : "r" (__r1), "r" (m1));                                   \
760     (xh) = __r0; (xl) = __r1;                                           \
761   } while (0)
762 #endif /* if 0 */
763 #endif
764 #if 0
765 /* FIXME: this fails if gcc knows about the 64-bit registers.  Use only
766    with a new enough processor pretending we have 32-bit registers.  */
767 #define udiv_qrnnd(q, r, n1, n0, d)                                     \
768   do {                                                                  \
769     union {UDItype __ll;                                                \
770            struct {USItype __h, __l;} __i;                              \
771           } __x;                                                        \
772     __x.__i.__h = n1; __x.__i.__l = n0;                                 \
773     __asm__ ("dlr\t%0,%2"                                               \
774              : "=r" (__x.__ll)                                          \
775              : "0" (__x.__ll), "r" (d));                                \
776     (q) = __x.__i.__l; (r) = __x.__i.__h;                               \
777   } while (0)
778 #else
779 #define udiv_qrnnd(q, r, n1, n0, d)                                     \
780   do {                                                                  \
781     register USItype __r0 __asm__ ("0") = (n1);                         \
782     register USItype __r1 __asm__ ("1") = (n0);                         \
783     __asm__ ("dlr\t%0,%4"                                               \
784              : "=r" (__r0), "=r" (__r1)                                 \
785              : "r" (__r0), "r" (__r1), "r" (d));                        \
786     (q) = __r1; (r) = __r0;                                             \
787   } while (0)
788 #endif /* if 0 */
789 #else /* if __zarch__ */
790 /* FIXME: this fails if gcc knows about the 64-bit registers.  */
791 #define smul_ppmm(xh, xl, m0, m1)                                       \
792   do {                                                                  \
793     union {DItype __ll;                                                 \
794            struct {USItype __h, __l;} __i;                              \
795           } __x;                                                        \
796     __asm__ ("mr\t%0,%2"                                                \
797              : "=r" (__x.__ll)                                          \
798              : "%0" (m0), "r" (m1));                                    \
799     (xh) = __x.__i.__h; (xl) = __x.__i.__l;                             \
800   } while (0)
801 /* FIXME: this fails if gcc knows about the 64-bit registers.  */
802 #define sdiv_qrnnd(q, r, n1, n0, d)                                     \
803   do {                                                                  \
804     union {DItype __ll;                                                 \
805            struct {USItype __h, __l;} __i;                              \
806           } __x;                                                        \
807     __x.__i.__h = n1; __x.__i.__l = n0;                                 \
808     __asm__ ("dr\t%0,%2"                                                \
809              : "=r" (__x.__ll)                                          \
810              : "0" (__x.__ll), "r" (d));                                \
811     (q) = __x.__i.__l; (r) = __x.__i.__h;                               \
812   } while (0)
813 #endif /* if __zarch__ */
814 #endif
815
816 #if defined (__s390x__) && W_TYPE_SIZE == 64
817 /* We need to cast operands with register constraints, otherwise their types
818    will be assumed to be SImode by gcc.  For these machines, such operations
819    will insert a value into the low 32 bits, and leave the high 32 bits with
820    garbage.  */
821 #define add_ssaaaa(sh, sl, ah, al, bh, bl)                              \
822   do {                                                                  \
823     __asm__ ("algr\t%1,%5\n\talcgr\t%0,%3"                              \
824                : "=r" (sh), "=&r" (sl)                                  \
825                : "0"  ((UDItype)(ah)), "r" ((UDItype)(bh)),             \
826                  "%1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC); \
827   } while (0)
828 #define sub_ddmmss(sh, sl, ah, al, bh, bl)                              \
829   do {                                                                  \
830     __asm__ ("slgr\t%1,%5\n\tslbgr\t%0,%3"                              \
831              : "=r" (sh), "=&r" (sl)                                    \
832              : "0" ((UDItype)(ah)), "r" ((UDItype)(bh)),                \
833                "1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC);  \
834   } while (0)
835 #define umul_ppmm(xh, xl, m0, m1)                                       \
836   do {                                                                  \
837     union {unsigned int __attribute__ ((mode(TI))) __ll;                \
838            struct {UDItype __h, __l;} __i;                              \
839           } __x;                                                        \
840     __asm__ ("mlgr\t%0,%2"                                              \
841              : "=r" (__x.__ll)                                          \
842              : "%0" ((UDItype)(m0)), "r" ((UDItype)(m1)));              \
843     (xh) = __x.__i.__h; (xl) = __x.__i.__l;                             \
844   } while (0)
845 #define udiv_qrnnd(q, r, n1, n0, d)                                     \
846   do {                                                                  \
847     union {unsigned int __attribute__ ((mode(TI))) __ll;                \
848            struct {UDItype __h, __l;} __i;                              \
849           } __x;                                                        \
850     __x.__i.__h = n1; __x.__i.__l = n0;                                 \
851     __asm__ ("dlgr\t%0,%2"                                              \
852              : "=r" (__x.__ll)                                          \
853              : "0" (__x.__ll), "r" ((UDItype)(d)));                     \
854     (q) = __x.__i.__l; (r) = __x.__i.__h;                               \
855   } while (0)
856 #if 0 /* FIXME: Enable for z10 (?) */
857 #define count_leading_zeros(cnt, x)                                     \
858   do {                                                                  \
859     union {unsigned int __attribute__ ((mode(TI))) __ll;                \
860            struct {UDItype __h, __l;} __i;                              \
861           } __clr_cnt;                                                  \
862     __asm__ ("flogr\t%0,%1"                                             \
863              : "=r" (__clr_cnt.__ll)                                    \
864              : "r" (x) __CLOBBER_CC);                                   \
865     (cnt) = __clr_cnt.__i.__h;                                          \
866   } while (0)
867 #endif
868 #endif
869
870 #if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32
871 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
872   __asm__ ("addl %5,%k1\n\tadcl %3,%k0"                                 \
873            : "=r" (sh), "=&r" (sl)                                      \
874            : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),                 \
875              "%1" ((USItype)(al)), "g" ((USItype)(bl)))
876 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
877   __asm__ ("subl %5,%k1\n\tsbbl %3,%k0"                                 \
878            : "=r" (sh), "=&r" (sl)                                      \
879            : "0" ((USItype)(ah)), "g" ((USItype)(bh)),                  \
880              "1" ((USItype)(al)), "g" ((USItype)(bl)))
881 #define umul_ppmm(w1, w0, u, v) \
882   __asm__ ("mull %3"                                                    \
883            : "=a" (w0), "=d" (w1)                                       \
884            : "%0" ((USItype)(u)), "rm" ((USItype)(v)))
885 #define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
886   __asm__ ("divl %4"                 /* stringification in K&R C */     \
887            : "=a" (q), "=d" (r)                                         \
888            : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "rm" ((USItype)(dx)))
889
890 #if HAVE_HOST_CPU_i586 || HAVE_HOST_CPU_pentium || HAVE_HOST_CPU_pentiummmx
891 /* Pentium bsrl takes between 10 and 72 cycles depending on where the most
892    significant 1 bit is, hence the use of the following alternatives.  bsfl
893    is slow too, between 18 and 42 depending on where the least significant 1
894    bit is, so let the generic count_trailing_zeros below make use of the
895    count_leading_zeros here too.  */
896
897 #if HAVE_HOST_CPU_pentiummmx && ! defined (LONGLONG_STANDALONE)
898 /* The following should be a fixed 14 or 15 cycles, but possibly plus an L1
899    cache miss reading from __clz_tab.  For P55 it's favoured over the float
900    below so as to avoid mixing MMX and x87, since the penalty for switching
901    between the two is about 100 cycles.
902
903    The asm block sets __shift to -3 if the high 24 bits are clear, -2 for
904    16, -1 for 8, or 0 otherwise.  This could be written equivalently as
905    follows, but as of gcc 2.95.2 it results in conditional jumps.
906
907        __shift = -(__n < 0x1000000);
908        __shift -= (__n < 0x10000);
909        __shift -= (__n < 0x100);
910
911    The middle two sbbl and cmpl's pair, and with luck something gcc
912    generates might pair with the first cmpl and the last sbbl.  The "32+1"
913    constant could be folded into __clz_tab[], but it doesn't seem worth
914    making a different table just for that.  */
915
916 #define count_leading_zeros(c,n)                                        \
917   do {                                                                  \
918     USItype  __n = (n);                                                 \
919     USItype  __shift;                                                   \
920     __asm__ ("cmpl  $0x1000000, %1\n"                                   \
921              "sbbl  %0, %0\n"                                           \
922              "cmpl  $0x10000, %1\n"                                     \
923              "sbbl  $0, %0\n"                                           \
924              "cmpl  $0x100, %1\n"                                       \
925              "sbbl  $0, %0\n"                                           \
926              : "=&r" (__shift) : "r"  (__n));                           \
927     __shift = __shift*8 + 24 + 1;                                       \
928     (c) = 32 + 1 - __shift - __clz_tab[__n >> __shift];                 \
929   } while (0)
930 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
931 #define COUNT_LEADING_ZEROS_0   31   /* n==0 indistinguishable from n==1 */
932
933 #else /* ! pentiummmx || LONGLONG_STANDALONE */
934 /* The following should be a fixed 14 cycles or so.  Some scheduling
935    opportunities should be available between the float load/store too.  This
936    sort of code is used in gcc 3 for __builtin_ffs (with "n&-n") and is
937    apparently suggested by the Intel optimizing manual (don't know exactly
938    where).  gcc 2.95 or up will be best for this, so the "double" is
939    correctly aligned on the stack.  Converting n to a double puts its biased
       exponent 0x3FF+p, where p is the position of the most significant 1 bit,
       into bits 20..30 of the high word, so 0x3FF + 31 minus that field gives
       the leading zero count.  */
940 #define count_leading_zeros(c,n)                                        \
941   do {                                                                  \
942     union {                                                             \
943       double    d;                                                      \
944       unsigned  a[2];                                                   \
945     } __u;                                                              \
946     ASSERT ((n) != 0);                                                  \
947     __u.d = (UWtype) (n);                                               \
948     (c) = 0x3FF + 31 - (__u.a[1] >> 20);                                \
949   } while (0)
950 #define COUNT_LEADING_ZEROS_0   (0x3FF + 31)
951 #endif /* pentiummx */
952
953 #else /* ! pentium */
954
955 #if __GMP_GNUC_PREREQ (3,4)  /* using bsrl */
956 #define count_leading_zeros(count,x)  count_leading_zeros_gcc_clz(count,x)
957 #endif /* gcc clz */
958
959 /* On P6, gcc prior to 3.0 generates a partial register stall for
960    __cbtmp^31, due to using "xorb $31" instead of "xorl $31", the former
961    being 1 code byte smaller.  "31-__cbtmp" is a workaround, probably at the
962    cost of one extra instruction.  Do this for "i386" too, since that means
963    generic x86.  */
964 #if ! defined (count_leading_zeros) && __GNUC__ < 3                     \
965   && (HAVE_HOST_CPU_i386                                                \
966       || HAVE_HOST_CPU_i686                                             \
967       || HAVE_HOST_CPU_pentiumpro                                       \
968       || HAVE_HOST_CPU_pentium2                                         \
969       || HAVE_HOST_CPU_pentium3)
970 #define count_leading_zeros(count, x)                                   \
971   do {                                                                  \
972     USItype __cbtmp;                                                    \
973     ASSERT ((x) != 0);                                                  \
974     __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x)));      \
975     (count) = 31 - __cbtmp;                                             \
976   } while (0)
977 #endif /* gcc<3 asm bsrl */
978
979 #ifndef count_leading_zeros
980 #define count_leading_zeros(count, x)                                   \
981   do {                                                                  \
982     USItype __cbtmp;                                                    \
983     ASSERT ((x) != 0);                                                  \
984     __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x)));      \
985     (count) = __cbtmp ^ 31;                                             \
986   } while (0)
987 #endif /* asm bsrl */
988
989 #if __GMP_GNUC_PREREQ (3,4)  /* using bsfl */
990 #define count_trailing_zeros(count,x)  count_trailing_zeros_gcc_ctz(count,x)
991 #endif /* gcc ctz */
992
993 #ifndef count_trailing_zeros
994 #define count_trailing_zeros(count, x)                                  \
995   do {                                                                  \
996     ASSERT ((x) != 0);                                                  \
997     __asm__ ("bsfl %1,%k0" : "=r" (count) : "rm" ((USItype)(x)));       \
998   } while (0)
999 #endif /* asm bsfl */
1000
1001 #endif /* ! pentium */
1002
1003 #ifndef UMUL_TIME
1004 #define UMUL_TIME 10
1005 #endif
1006 #ifndef UDIV_TIME
1007 #define UDIV_TIME 40
1008 #endif
1009 #endif /* 80x86 */
1010
1011 #if defined (__amd64__) && W_TYPE_SIZE == 64
1012 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1013   __asm__ ("addq %5,%q1\n\tadcq %3,%q0"                                 \
1014            : "=r" (sh), "=&r" (sl)                                      \
1015            : "0"  ((UDItype)(ah)), "rme" ((UDItype)(bh)),               \
1016              "%1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
1017 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1018   __asm__ ("subq %5,%q1\n\tsbbq %3,%q0"                                 \
1019            : "=r" (sh), "=&r" (sl)                                      \
1020            : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)),                \
1021              "1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
1022 #define umul_ppmm(w1, w0, u, v) \
1023   __asm__ ("mulq %3"                                                    \
1024            : "=a" (w0), "=d" (w1)                                       \
1025            : "%0" ((UDItype)(u)), "rm" ((UDItype)(v)))
1026 #define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
1027   __asm__ ("divq %4"                 /* stringification in K&R C */     \
1028            : "=a" (q), "=d" (r)                                         \
1029            : "0" ((UDItype)(n0)), "1" ((UDItype)(n1)), "rm" ((UDItype)(dx)))
1030 /* bsrq destination must be a 64-bit register, hence UDItype for __cbtmp. */
1031 #define count_leading_zeros(count, x)                                   \
1032   do {                                                                  \
1033     UDItype __cbtmp;                                                    \
1034     ASSERT ((x) != 0);                                                  \
1035     __asm__ ("bsrq %1,%0" : "=r" (__cbtmp) : "rm" ((UDItype)(x)));      \
1036     (count) = __cbtmp ^ 63;                                             \
1037   } while (0)
1038 /* bsfq destination must be a 64-bit register, "%q0" forces this in case
1039    count is only an int. */
1040 #define count_trailing_zeros(count, x)                                  \
1041   do {                                                                  \
1042     ASSERT ((x) != 0);                                                  \
1043     __asm__ ("bsfq %1,%q0" : "=r" (count) : "rm" ((UDItype)(x)));       \
1044   } while (0)
1045 #endif /* __amd64__ */
1046
1047 #if defined (__i860__) && W_TYPE_SIZE == 32
1048 #define rshift_rhlc(r,h,l,c) \
1049   __asm__ ("shr %3,r0,r0\;shrd %1,%2,%0"                                \
1050            : "=r" (r) : "r" (h), "r" (l), "rn" (c))
1051 #endif /* i860 */
1052
1053 #if defined (__i960__) && W_TYPE_SIZE == 32
1054 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1055   __asm__ ("cmpo 1,0\;addc %5,%4,%1\;addc %3,%2,%0"                     \
1056            : "=r" (sh), "=&r" (sl)                                      \
1057            : "dI" (ah), "dI" (bh), "%dI" (al), "dI" (bl))
1058 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1059   __asm__ ("cmpo 0,0\;subc %5,%4,%1\;subc %3,%2,%0"                     \
1060            : "=r" (sh), "=&r" (sl)                                      \
1061            : "dI" (ah), "dI" (bh), "dI" (al), "dI" (bl))
1062 #define umul_ppmm(w1, w0, u, v) \
1063   ({union {UDItype __ll;                                                \
1064            struct {USItype __l, __h;} __i;                              \
1065           } __x;                                                        \
1066   __asm__ ("emul %2,%1,%0"                                              \
1067            : "=d" (__x.__ll) : "%dI" (u), "dI" (v));                    \
1068   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1069 #define __umulsidi3(u, v) \
1070   ({UDItype __w;                                                        \
1071     __asm__ ("emul %2,%1,%0" : "=d" (__w) : "%dI" (u), "dI" (v));       \
1072     __w; })
1073 #define udiv_qrnnd(q, r, nh, nl, d) \
1074   do {                                                                  \
1075     union {UDItype __ll;                                                \
1076            struct {USItype __l, __h;} __i;                              \
1077           } __nn, __rq;                                                 \
1078     __nn.__i.__h = (nh); __nn.__i.__l = (nl);                           \
1079     __asm__ ("ediv %2,%1,%0"                                            \
1080            : "=d" (__rq.__ll) : "dI" (__nn.__ll), "dI" (d));            \
1081     (r) = __rq.__i.__l; (q) = __rq.__i.__h;                             \
1082   } while (0)
1083 #define count_leading_zeros(count, x) \
1084   do {                                                                  \
1085     USItype __cbtmp;                                                    \
1086     __asm__ ("scanbit %1,%0" : "=r" (__cbtmp) : "r" (x));               \
1087     (count) = __cbtmp ^ 31;                                             \
1088   } while (0)
1089 #define COUNT_LEADING_ZEROS_0 (-32) /* sic */
1090 #if defined (__i960mx)          /* what is the proper symbol to test??? */
1091 #define rshift_rhlc(r,h,l,c) \
1092   do {                                                                  \
1093     union {UDItype __ll;                                                \
1094            struct {USItype __l, __h;} __i;                              \
1095           } __nn;                                                       \
1096     __nn.__i.__h = (h); __nn.__i.__l = (l);                             \
1097     __asm__ ("shre %2,%1,%0" : "=d" (r) : "dI" (__nn.__ll), "dI" (c));  \
1098   } while (0)
1099 #endif /* i960mx */
1100 #endif /* i960 */
1101
1102 #if (defined (__mc68000__) || defined (__mc68020__) || defined(mc68020) \
1103      || defined (__m68k__) || defined (__mc5200__) || defined (__mc5206e__) \
1104      || defined (__mc5307__)) && W_TYPE_SIZE == 32
1105 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1106   __asm__ ("add%.l %5,%1\n\taddx%.l %3,%0"                              \
1107            : "=d" (sh), "=&d" (sl)                                      \
1108            : "0"  ((USItype)(ah)), "d" ((USItype)(bh)),                 \
1109              "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1110 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1111   __asm__ ("sub%.l %5,%1\n\tsubx%.l %3,%0"                              \
1112            : "=d" (sh), "=&d" (sl)                                      \
1113            : "0" ((USItype)(ah)), "d" ((USItype)(bh)),                  \
1114              "1" ((USItype)(al)), "g" ((USItype)(bl)))
1115 /* The '020, '030, '040 and CPU32 have 32x32->64 and 64/32->32q-32r.  */
1116 #if defined (__mc68020__) || defined(mc68020) \
1117      || defined (__mc68030__) || defined (mc68030) \
1118      || defined (__mc68040__) || defined (mc68040) \
1119      || defined (__mcpu32__) || defined (mcpu32) \
1120      || defined (__NeXT__)
1121 #define umul_ppmm(w1, w0, u, v) \
1122   __asm__ ("mulu%.l %3,%1:%0"                                           \
1123            : "=d" (w0), "=d" (w1)                                       \
1124            : "%0" ((USItype)(u)), "dmi" ((USItype)(v)))
1125 #define UMUL_TIME 45
1126 #define udiv_qrnnd(q, r, n1, n0, d) \
1127   __asm__ ("divu%.l %4,%1:%0"                                           \
1128            : "=d" (q), "=d" (r)                                         \
1129            : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
1130 #define UDIV_TIME 90
1131 #define sdiv_qrnnd(q, r, n1, n0, d) \
1132   __asm__ ("divs%.l %4,%1:%0"                                           \
1133            : "=d" (q), "=d" (r)                                         \
1134            : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
1135 #else /* for other 68k family members use 16x16->32 multiplication */
1136 #define umul_ppmm(xh, xl, a, b) \
1137   do { USItype __umul_tmp1, __umul_tmp2;                                \
1138         __asm__ ("| Inlined umul_ppmm\n"                                \
1139 "       move%.l %5,%3\n"                                                \
1140 "       move%.l %2,%0\n"                                                \
1141 "       move%.w %3,%1\n"                                                \
1142 "       swap    %3\n"                                                   \
1143 "       swap    %0\n"                                                   \
1144 "       mulu%.w %2,%1\n"                                                \
1145 "       mulu%.w %3,%0\n"                                                \
1146 "       mulu%.w %2,%3\n"                                                \
1147 "       swap    %2\n"                                                   \
1148 "       mulu%.w %5,%2\n"                                                \
1149 "       add%.l  %3,%2\n"                                                \
1150 "       jcc     1f\n"                                                   \
1151 "       add%.l  %#0x10000,%0\n"                                         \
1152 "1:     move%.l %2,%3\n"                                                \
1153 "       clr%.w  %2\n"                                                   \
1154 "       swap    %2\n"                                                   \
1155 "       swap    %3\n"                                                   \
1156 "       clr%.w  %3\n"                                                   \
1157 "       add%.l  %3,%1\n"                                                \
1158 "       addx%.l %2,%0\n"                                                \
1159 "       | End inlined umul_ppmm"                                        \
1160               : "=&d" (xh), "=&d" (xl),                                 \
1161                 "=d" (__umul_tmp1), "=&d" (__umul_tmp2)                 \
1162               : "%2" ((USItype)(a)), "d" ((USItype)(b)));               \
1163   } while (0)
1164 #define UMUL_TIME 100
1165 #define UDIV_TIME 400
1166 #endif /* not mc68020 */
1167 /* The '020, '030, '040 and '060 have bitfield insns.
1168    GCC 3.4 defines __mc68020__ when in CPU32 mode, check for __mcpu32__ to
1169    exclude bfffo on that chip (bitfield insns not available).  */
1170 #if (defined (__mc68020__) || defined (mc68020)    \
1171      || defined (__mc68030__) || defined (mc68030) \
1172      || defined (__mc68040__) || defined (mc68040) \
1173      || defined (__mc68060__) || defined (mc68060) \
1174      || defined (__NeXT__))                        \
1175   && ! defined (__mcpu32__)
1176 #define count_leading_zeros(count, x) \
1177   __asm__ ("bfffo %1{%b2:%b2},%0"                                       \
1178            : "=d" (count)                                               \
1179            : "od" ((USItype) (x)), "n" (0))
1180 #define COUNT_LEADING_ZEROS_0 32
1181 #endif
1182 #endif /* mc68000 */
1183
1184 #if defined (__m88000__) && W_TYPE_SIZE == 32
1185 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1186   __asm__ ("addu.co %1,%r4,%r5\n\taddu.ci %0,%r2,%r3"                   \
1187            : "=r" (sh), "=&r" (sl)                                      \
1188            : "rJ" (ah), "rJ" (bh), "%rJ" (al), "rJ" (bl))
1189 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1190   __asm__ ("subu.co %1,%r4,%r5\n\tsubu.ci %0,%r2,%r3"                   \
1191            : "=r" (sh), "=&r" (sl)                                      \
1192            : "rJ" (ah), "rJ" (bh), "rJ" (al), "rJ" (bl))
1193 #define count_leading_zeros(count, x) \
1194   do {                                                                  \
1195     USItype __cbtmp;                                                    \
1196     __asm__ ("ff1 %0,%1" : "=r" (__cbtmp) : "r" (x));                   \
1197     (count) = __cbtmp ^ 31;                                             \
1198   } while (0)
1199 #define COUNT_LEADING_ZEROS_0 63 /* sic */
1200 #if defined (__m88110__)
1201 #define umul_ppmm(wh, wl, u, v) \
1202   do {                                                                  \
1203     union {UDItype __ll;                                                \
1204            struct {USItype __h, __l;} __i;                              \
1205           } __x;                                                        \
1206     __asm__ ("mulu.d %0,%1,%2" : "=r" (__x.__ll) : "r" (u), "r" (v));   \
1207     (wh) = __x.__i.__h;                                                 \
1208     (wl) = __x.__i.__l;                                                 \
1209   } while (0)
1210 #define udiv_qrnnd(q, r, n1, n0, d) \
1211   ({union {UDItype __ll;                                                \
1212            struct {USItype __h, __l;} __i;                              \
1213           } __x, __q;                                                   \
1214   __x.__i.__h = (n1); __x.__i.__l = (n0);                               \
1215   __asm__ ("divu.d %0,%1,%2"                                            \
1216            : "=r" (__q.__ll) : "r" (__x.__ll), "r" (d));                \
1217   (r) = (n0) - __q.__i.__l * (d); (q) = __q.__i.__l; })
1218 #define UMUL_TIME 5
1219 #define UDIV_TIME 25
1220 #else
1221 #define UMUL_TIME 17
1222 #define UDIV_TIME 150
1223 #endif /* __m88110__ */
1224 #endif /* __m88000__ */
1225
1226 #if defined (__mips) && W_TYPE_SIZE == 32
1227 #if __GMP_GNUC_PREREQ (4,4)
1228 #define umul_ppmm(w1, w0, u, v) \
1229   do {                                                                  \
1230     UDItype __ll = (UDItype)(u) * (v);                                  \
1231     w1 = __ll >> 32;                                                    \
1232     w0 = __ll;                                                          \
1233   } while (0)
1234 #endif
1235 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7)
1236 #define umul_ppmm(w1, w0, u, v) \
1237   __asm__ ("multu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
1238 #endif
1239 #if !defined (umul_ppmm)
1240 #define umul_ppmm(w1, w0, u, v) \
1241   __asm__ ("multu %2,%3\n\tmflo %0\n\tmfhi %1"                          \
1242            : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
1243 #endif
1244 #define UMUL_TIME 10
1245 #define UDIV_TIME 100
1246 #endif /* __mips */
1247
1248 #if (defined (__mips) && __mips >= 3) && W_TYPE_SIZE == 64
1249 #if __GMP_GNUC_PREREQ (4,4)
1250 #define umul_ppmm(w1, w0, u, v) \
1251   do {                                                                  \
1252     typedef unsigned int __ll_UTItype __attribute__((mode(TI)));        \
1253     __ll_UTItype __ll = (__ll_UTItype)(u) * (v);                        \
1254     w1 = __ll >> 64;                                                    \
1255     w0 = __ll;                                                          \
1256   } while (0)
1257 #endif
1258 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7)
1259 #define umul_ppmm(w1, w0, u, v) \
1260   __asm__ ("dmultu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
1261 #endif
1262 #if !defined (umul_ppmm)
1263 #define umul_ppmm(w1, w0, u, v) \
1264   __asm__ ("dmultu %2,%3\n\tmflo %0\n\tmfhi %1"                         \
1265            : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
1266 #endif
1267 #define UMUL_TIME 20
1268 #define UDIV_TIME 140
1269 #endif /* __mips >= 3 */
1270
1271 #if defined (__mmix__) && W_TYPE_SIZE == 64
1272 #define umul_ppmm(w1, w0, u, v) \
1273   __asm__ ("MULU %0,%2,%3" : "=r" (w0), "=z" (w1) : "r" (u), "r" (v))
1274 #endif
1275
1276 #if defined (__ns32000__) && W_TYPE_SIZE == 32
1277 #define umul_ppmm(w1, w0, u, v) \
1278   ({union {UDItype __ll;                                                \
1279            struct {USItype __l, __h;} __i;                              \
1280           } __x;                                                        \
1281   __asm__ ("meid %2,%0"                                                 \
1282            : "=g" (__x.__ll)                                            \
1283            : "%0" ((USItype)(u)), "g" ((USItype)(v)));                  \
1284   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1285 #define __umulsidi3(u, v) \
1286   ({UDItype __w;                                                        \
1287     __asm__ ("meid %2,%0"                                               \
1288              : "=g" (__w)                                               \
1289              : "%0" ((USItype)(u)), "g" ((USItype)(v)));                \
1290     __w; })
1291 #define udiv_qrnnd(q, r, n1, n0, d) \
1292   ({union {UDItype __ll;                                                \
1293            struct {USItype __l, __h;} __i;                              \
1294           } __x;                                                        \
1295   __x.__i.__h = (n1); __x.__i.__l = (n0);                               \
1296   __asm__ ("deid %2,%0"                                                 \
1297            : "=g" (__x.__ll)                                            \
1298            : "0" (__x.__ll), "g" ((USItype)(d)));                       \
1299   (r) = __x.__i.__l; (q) = __x.__i.__h; })
1300 #define count_trailing_zeros(count,x) \
1301   do {                                                                  \
1302     __asm__ ("ffsd      %2,%0"                                          \
1303              : "=r" (count)                                             \
1304              : "0" ((USItype) 0), "r" ((USItype) (x)));                 \
1305   } while (0)
1306 #endif /* __ns32000__ */
1307
1308 /* In the past we had a block of various #defines tested
1309        _ARCH_PPC    - AIX
1310        _ARCH_PWR    - AIX
1311        __powerpc__  - gcc
1312        __POWERPC__  - BEOS
1313        __ppc__      - Darwin
1314        PPC          - old gcc, GNU/Linux, SysV
1315    The plain PPC test was not good for vxWorks, since PPC is defined on all
1316    CPUs there (eg. m68k too), as a constant against which CPU_FAMILY is
1317    expected to be compared.
1318
1319    At any rate, this was pretty unattractive and a bit fragile.  The use of
1320    HAVE_HOST_CPU_FAMILY is designed to cut through it all and be sure of
1321    getting the desired effect.
1322
1323    ENHANCE-ME: We should test _IBMR2 here when we add assembly support for
1324    the system vendor compilers.  (Is that vendor compilers with inline asm,
1325    or what?)  */
1326
1327 #if (HAVE_HOST_CPU_FAMILY_power || HAVE_HOST_CPU_FAMILY_powerpc)        \
1328   && W_TYPE_SIZE == 32
1329 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1330   do {                                                                  \
1331     if (__builtin_constant_p (bh) && (bh) == 0)                         \
1332       __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2"                        \
1333              : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl)); \
1334     else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)         \
1335       __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2"                        \
1336              : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl)); \
1337     else                                                                \
1338       __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3"                      \
1339              : "=r" (sh), "=&r" (sl)                                    \
1340              : "r" (ah), "r" (bh), "%r" (al), "rI" (bl));               \
1341   } while (0)
1342 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1343   do {                                                                  \
1344     if (__builtin_constant_p (ah) && (ah) == 0)                         \
1345       __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2"                      \
1346                : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
1347     else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0)         \
1348       __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2"                      \
1349                : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
1350     else if (__builtin_constant_p (bh) && (bh) == 0)                    \
1351       __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2"                       \
1352                : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
1353     else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0)         \
1354       __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2"                       \
1355                : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
1356     else                                                                \
1357       __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2"                    \
1358                : "=r" (sh), "=&r" (sl)                                  \
1359                : "r" (ah), "r" (bh), "rI" (al), "r" (bl));              \
1360   } while (0)
1361 #define count_leading_zeros(count, x) \
1362   __asm__ ("cntlzw %0,%1" : "=r" (count) : "r" (x))
1363 #define COUNT_LEADING_ZEROS_0 32
1364 #if HAVE_HOST_CPU_FAMILY_powerpc
1365 #if __GMP_GNUC_PREREQ (4,4)
1366 #define umul_ppmm(w1, w0, u, v) \
1367   do {                                                                  \
1368     UDItype __ll = (UDItype)(u) * (v);                                  \
1369     w1 = __ll >> 32;                                                    \
1370     w0 = __ll;                                                          \
1371   } while (0)
1372 #endif
1373 #if !defined (umul_ppmm)
1374 #define umul_ppmm(ph, pl, m0, m1) \
1375   do {                                                                  \
1376     USItype __m0 = (m0), __m1 = (m1);                                   \
1377     __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1));      \
1378     (pl) = __m0 * __m1;                                                 \
1379   } while (0)
1380 #endif
1381 #define UMUL_TIME 15
1382 #define smul_ppmm(ph, pl, m0, m1) \
1383   do {                                                                  \
1384     SItype __m0 = (m0), __m1 = (m1);                                    \
1385     __asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1));       \
1386     (pl) = __m0 * __m1;                                                 \
1387   } while (0)
1388 #define SMUL_TIME 14
1389 #define UDIV_TIME 120
1390 #else
1391 #define UMUL_TIME 8
1392 #define smul_ppmm(xh, xl, m0, m1) \
1393   __asm__ ("mul %0,%2,%3" : "=r" (xh), "=q" (xl) : "r" (m0), "r" (m1))
1394 #define SMUL_TIME 4
1395 #define sdiv_qrnnd(q, r, nh, nl, d) \
1396   __asm__ ("div %0,%2,%4" : "=r" (q), "=q" (r) : "r" (nh), "1" (nl), "r" (d))
1397 #define UDIV_TIME 100
1398 #endif
1399 #endif /* 32-bit POWER architecture variants.  */
1400
1401 /* We should test _IBMR2 here when we add assembly support for the system
1402    vendor compilers.  */
1403 #if HAVE_HOST_CPU_FAMILY_powerpc && W_TYPE_SIZE == 64
1404 #if !defined (_LONG_LONG_LIMB)
1405 /* _LONG_LONG_LIMB is ABI=mode32 where adde operates on 32-bit values.  So
1406    use adde etc only when not _LONG_LONG_LIMB.  */
1407 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1408   do {                                                                  \
1409     if (__builtin_constant_p (bh) && (bh) == 0)                         \
1410       __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2"                        \
1411              : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl)); \
1412     else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)         \
1413       __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2"                        \
1414              : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl)); \
1415     else                                                                \
1416       __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3"                      \
1417              : "=r" (sh), "=&r" (sl)                                    \
1418              : "r" (ah), "r" (bh), "%r" (al), "rI" (bl));               \
1419   } while (0)
1420 /* We use "*rI" for the constant operand here, since with just "I", gcc barfs.
1421    This might seem strange, but gcc folds away the dead code late.  */
1422 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1423   do {                                                                  \
1424     if (__builtin_constant_p (bl) && bl > -0x8000 && bl <= 0x8000) {    \
1425         if (__builtin_constant_p (ah) && (ah) == 0)                     \
1426           __asm__ ("addic %1,%3,%4\n\tsubfze %0,%2"                     \
1427                    : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "*rI" (-bl)); \
1428         else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0)     \
1429           __asm__ ("addic %1,%3,%4\n\tsubfme %0,%2"                     \
1430                    : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "*rI" (-bl)); \
1431         else if (__builtin_constant_p (bh) && (bh) == 0)                \
1432           __asm__ ("addic %1,%3,%4\n\taddme %0,%2"                      \
1433                    : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "*rI" (-bl)); \
1434         else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)     \
1435           __asm__ ("addic %1,%3,%4\n\taddze %0,%2"                      \
1436                    : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "*rI" (-bl)); \
1437         else                                                            \
1438           __asm__ ("addic %1,%4,%5\n\tsubfe %0,%3,%2"                   \
1439                    : "=r" (sh), "=&r" (sl)                              \
1440                    : "r" (ah), "r" (bh), "rI" (al), "*rI" (-bl));       \
1441     } else {                                                            \
1442         if (__builtin_constant_p (ah) && (ah) == 0)                     \
1443           __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2"                  \
1444                    : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl)); \
1445         else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0)     \
1446           __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2"                  \
1447                    : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl)); \
1448         else if (__builtin_constant_p (bh) && (bh) == 0)                \
1449           __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2"                   \
1450                    : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl)); \
1451         else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0)     \
1452           __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2"                   \
1453                    : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl)); \
1454         else                                                            \
1455           __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2"                \
1456                    : "=r" (sh), "=&r" (sl)                              \
1457                    : "r" (ah), "r" (bh), "rI" (al), "r" (bl));          \
1458     }                                                                   \
1459   } while (0)
1460 #endif /* ! _LONG_LONG_LIMB */
1461 #define count_leading_zeros(count, x) \
1462   __asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x))
1463 #define COUNT_LEADING_ZEROS_0 64
1464 #if 0 && __GMP_GNUC_PREREQ (4,4) /* Disable, this results in libcalls! */
1465 #define umul_ppmm(w1, w0, u, v) \
1466   do {                                                                  \
1467     typedef unsigned int __ll_UTItype __attribute__((mode(TI)));        \
1468     __ll_UTItype __ll = (__ll_UTItype)(u) * (v);                        \
1469     w1 = __ll >> 64;                                                    \
1470     w0 = __ll;                                                          \
1471   } while (0)
1472 #endif
1473 #if !defined (umul_ppmm)
1474 #define umul_ppmm(ph, pl, m0, m1) \
1475   do {                                                                  \
1476     UDItype __m0 = (m0), __m1 = (m1);                                   \
1477     __asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1));      \
1478     (pl) = __m0 * __m1;                                                 \
1479   } while (0)
1480 #endif
1481 #define UMUL_TIME 15
1482 #define smul_ppmm(ph, pl, m0, m1) \
1483   do {                                                                  \
1484     DItype __m0 = (m0), __m1 = (m1);                                    \
1485     __asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (m0), "r" (m1));       \
1486     (pl) = __m0 * __m1;                                                 \
1487   } while (0)
1488 #define SMUL_TIME 14  /* ??? */
1489 #define UDIV_TIME 120 /* ??? */
1490 #endif /* 64-bit PowerPC.  */
1491
1492 #if defined (__pyr__) && W_TYPE_SIZE == 32
1493 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1494   __asm__ ("addw %5,%1\n\taddwc %3,%0"                                  \
1495            : "=r" (sh), "=&r" (sl)                                      \
1496            : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),                 \
1497              "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1498 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1499   __asm__ ("subw %5,%1\n\tsubwb %3,%0"                                  \
1500            : "=r" (sh), "=&r" (sl)                                      \
1501            : "0" ((USItype)(ah)), "g" ((USItype)(bh)),                  \
1502              "1" ((USItype)(al)), "g" ((USItype)(bl)))
1503 /* This insn works on Pyramids with AP, XP, or MI CPUs, but not with SP.  */
1504 #define umul_ppmm(w1, w0, u, v) \
1505   ({union {UDItype __ll;                                                \
1506            struct {USItype __h, __l;} __i;                              \
1507           } __x;                                                        \
1508   __asm__ ("movw %1,%R0\n\tuemul %2,%0"                                 \
1509            : "=&r" (__x.__ll)                                           \
1510            : "g" ((USItype) (u)), "g" ((USItype)(v)));                  \
1511   (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1512 #endif /* __pyr__ */
1513
1514 #if defined (__ibm032__) /* RT/ROMP */  && W_TYPE_SIZE == 32
1515 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1516   __asm__ ("a %1,%5\n\tae %0,%3"                                        \
1517            : "=r" (sh), "=&r" (sl)                                      \
1518            : "0"  ((USItype)(ah)), "r" ((USItype)(bh)),                 \
1519              "%1" ((USItype)(al)), "r" ((USItype)(bl)))
1520 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1521   __asm__ ("s %1,%5\n\tse %0,%3"                                        \
1522            : "=r" (sh), "=&r" (sl)                                      \
1523            : "0" ((USItype)(ah)), "r" ((USItype)(bh)),                  \
1524              "1" ((USItype)(al)), "r" ((USItype)(bl)))
1525 #define smul_ppmm(ph, pl, m0, m1) \
1526   __asm__ (                                                             \
1527        "s       r2,r2\n"                                                \
1528 "       mts r10,%2\n"                                                   \
1529 "       m       r2,%3\n"                                                \
1530 "       m       r2,%3\n"                                                \
1531 "       m       r2,%3\n"                                                \
1532 "       m       r2,%3\n"                                                \
1533 "       m       r2,%3\n"                                                \
1534 "       m       r2,%3\n"                                                \
1535 "       m       r2,%3\n"                                                \
1536 "       m       r2,%3\n"                                                \
1537 "       m       r2,%3\n"                                                \
1538 "       m       r2,%3\n"                                                \
1539 "       m       r2,%3\n"                                                \
1540 "       m       r2,%3\n"                                                \
1541 "       m       r2,%3\n"                                                \
1542 "       m       r2,%3\n"                                                \
1543 "       m       r2,%3\n"                                                \
1544 "       m       r2,%3\n"                                                \
1545 "       cas     %0,r2,r0\n"                                             \
1546 "       mfs     r10,%1"                                                 \
1547            : "=r" (ph), "=r" (pl)                                       \
1548            : "%r" ((USItype)(m0)), "r" ((USItype)(m1))                  \
1549            : "r2")
1550 #define UMUL_TIME 20
1551 #define UDIV_TIME 200
1552 #define count_leading_zeros(count, x) \
1553   do {                                                                  \
1554     if ((x) >= 0x10000)                                                 \
1555       __asm__ ("clz     %0,%1"                                          \
1556                : "=r" (count) : "r" ((USItype)(x) >> 16));              \
1557     else                                                                \
1558       {                                                                 \
1559         __asm__ ("clz   %0,%1"                                          \
1560                  : "=r" (count) : "r" ((USItype)(x)));                  \
1561         (count) += 16;                                                  \
1562       }                                                                 \
1563   } while (0)
1564 #endif /* RT/ROMP */
1565
1566 #if (defined (__SH2__) || defined (__SH3__) || defined (__SH4__)) && W_TYPE_SIZE == 32
1567 #define umul_ppmm(w1, w0, u, v) \
1568   __asm__ ("dmulu.l %2,%3\n\tsts macl,%1\n\tsts mach,%0"                \
1569            : "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "macl", "mach")
1570 #define UMUL_TIME 5
1571 #endif
1572
1573 #if defined (__sparc__) && W_TYPE_SIZE == 32
1574 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1575   __asm__ ("addcc %r4,%5,%1\n\taddx %r2,%3,%0"                          \
1576            : "=r" (sh), "=&r" (sl)                                      \
1577            : "rJ" (ah), "rI" (bh),"%rJ" (al), "rI" (bl)                 \
1578            __CLOBBER_CC)
1579 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1580   __asm__ ("subcc %r4,%5,%1\n\tsubx %r2,%3,%0"                          \
1581            : "=r" (sh), "=&r" (sl)                                      \
1582            : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl) \
1583            __CLOBBER_CC)
1584 /* FIXME: When gcc -mcpu=v9 is used on solaris, gcc/config/sol2-sld-64.h
1585    doesn't define anything to indicate that to us, it only sets __sparcv8. */
1586 #if defined (__sparc_v9__) || defined (__sparcv9)
1587 /* Perhaps we should use floating-point operations here?  */
1588 #if 0
1589 /* Triggers a bug making mpz/tests/t-gcd.c fail.
1590    Perhaps we simply need to explicitly zero-extend the inputs?  */
1591 #define umul_ppmm(w1, w0, u, v) \
1592   __asm__ ("mulx %2,%3,%%g1; srl %%g1,0,%1; srlx %%g1,32,%0" :          \
1593            "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "g1")
1594 #else
1595 /* Use v8 umul until above bug is fixed.  */
1596 #define umul_ppmm(w1, w0, u, v) \
1597   __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1598 #endif
1599 /* Use a plain v8 divide for v9.  */
1600 #define udiv_qrnnd(q, r, n1, n0, d) \
1601   do {                                                                  \
1602     USItype __q;                                                        \
1603     __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0"                     \
1604              : "=r" (__q) : "r" (n1), "r" (n0), "r" (d));               \
1605     (r) = (n0) - __q * (d);                                             \
1606     (q) = __q;                                                          \
1607   } while (0)
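/* Note that with n1:n0 = __q * (d) + r and 0 <= r < (d), the remainder is
   congruent to n0 - __q * (d) modulo 2^32 and fits in one word, so the
   truncated 32-bit product above recovers it exactly.  */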
1608 #else
1609 #if defined (__sparc_v8__)   /* gcc normal */                           \
1610   || defined (__sparcv8)     /* gcc solaris */                          \
1611   || HAVE_HOST_CPU_supersparc
1612 /* Don't match immediate range because: 1) it is not often useful,
1613    2) the 'I' flag thinks of the range as a 13 bit signed interval,
1614    while we want to match a 13 bit interval, sign extended to 32 bits,
1615    but INTERPRETED AS UNSIGNED.  */
1616 #define umul_ppmm(w1, w0, u, v) \
1617   __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1618 #define UMUL_TIME 5
1619
1620 #if HAVE_HOST_CPU_supersparc
1621 #define UDIV_TIME 60            /* SuperSPARC timing */
1622 #else
1623 /* Don't use this on SuperSPARC because its udiv only handles 53 bit
1624    dividends and will trap to the kernel for the rest. */
1625 #define udiv_qrnnd(q, r, n1, n0, d) \
1626   do {                                                                  \
1627     USItype __q;                                                        \
1628     __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0"                     \
1629              : "=r" (__q) : "r" (n1), "r" (n0), "r" (d));               \
1630     (r) = (n0) - __q * (d);                                             \
1631     (q) = __q;                                                          \
1632   } while (0)
1633 #define UDIV_TIME 25
1634 #endif /* HAVE_HOST_CPU_supersparc */
1635
1636 #else /* ! __sparc_v8__ */
1637 #if defined (__sparclite__)
1638 /* This has hardware multiply but not divide.  It also has two additional
1639    instructions scan (ffs from high bit) and divscc.  */
1640 #define umul_ppmm(w1, w0, u, v) \
1641   __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1642 #define UMUL_TIME 5
1643 #define udiv_qrnnd(q, r, n1, n0, d) \
1644   __asm__ ("! Inlined udiv_qrnnd\n"                                     \
1645 "       wr      %%g0,%2,%%y     ! Not a delayed write for sparclite\n"  \
1646 "       tst     %%g0\n"                                                 \
1647 "       divscc  %3,%4,%%g1\n"                                           \
1648 "       divscc  %%g1,%4,%%g1\n"                                         \
1649 "       divscc  %%g1,%4,%%g1\n"                                         \
1650 "       divscc  %%g1,%4,%%g1\n"                                         \
1651 "       divscc  %%g1,%4,%%g1\n"                                         \
1652 "       divscc  %%g1,%4,%%g1\n"                                         \
1653 "       divscc  %%g1,%4,%%g1\n"                                         \
1654 "       divscc  %%g1,%4,%%g1\n"                                         \
1655 "       divscc  %%g1,%4,%%g1\n"                                         \
1656 "       divscc  %%g1,%4,%%g1\n"                                         \
1657 "       divscc  %%g1,%4,%%g1\n"                                         \
1658 "       divscc  %%g1,%4,%%g1\n"                                         \
1659 "       divscc  %%g1,%4,%%g1\n"                                         \
1660 "       divscc  %%g1,%4,%%g1\n"                                         \
1661 "       divscc  %%g1,%4,%%g1\n"                                         \
1662 "       divscc  %%g1,%4,%%g1\n"                                         \
1663 "       divscc  %%g1,%4,%%g1\n"                                         \
1664 "       divscc  %%g1,%4,%%g1\n"                                         \
1665 "       divscc  %%g1,%4,%%g1\n"                                         \
1666 "       divscc  %%g1,%4,%%g1\n"                                         \
1667 "       divscc  %%g1,%4,%%g1\n"                                         \
1668 "       divscc  %%g1,%4,%%g1\n"                                         \
1669 "       divscc  %%g1,%4,%%g1\n"                                         \
1670 "       divscc  %%g1,%4,%%g1\n"                                         \
1671 "       divscc  %%g1,%4,%%g1\n"                                         \
1672 "       divscc  %%g1,%4,%%g1\n"                                         \
1673 "       divscc  %%g1,%4,%%g1\n"                                         \
1674 "       divscc  %%g1,%4,%%g1\n"                                         \
1675 "       divscc  %%g1,%4,%%g1\n"                                         \
1676 "       divscc  %%g1,%4,%%g1\n"                                         \
1677 "       divscc  %%g1,%4,%%g1\n"                                         \
1678 "       divscc  %%g1,%4,%0\n"                                           \
1679 "       rd      %%y,%1\n"                                               \
1680 "       bl,a 1f\n"                                                      \
1681 "       add     %1,%4,%1\n"                                             \
1682 "1:     ! End of inline udiv_qrnnd"                                     \
1683            : "=r" (q), "=r" (r) : "r" (n1), "r" (n0), "rI" (d)          \
1684            : "%g1" __AND_CLOBBER_CC)
1685 #define UDIV_TIME 37
1686 #define count_leading_zeros(count, x) \
1687   __asm__ ("scan %1,1,%0" : "=r" (count) : "r" (x))
1688 /* Early sparclites return 63 for an argument of 0, but they warn that future
1689    implementations might change this.  Therefore, leave COUNT_LEADING_ZEROS_0
1690    undefined.  */
1691 #endif /* __sparclite__ */
1692 #endif /* __sparc_v8__ */
1693 #endif /* __sparc_v9__ */
1694 /* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd.  */
1695 #ifndef umul_ppmm
1696 #define umul_ppmm(w1, w0, u, v) \
1697   __asm__ ("! Inlined umul_ppmm\n"                                      \
1698 "       wr      %%g0,%2,%%y     ! SPARC has 0-3 delay insn after a wr\n" \
1699 "       sra     %3,31,%%g2      ! Don't move this insn\n"               \
1700 "       and     %2,%%g2,%%g2    ! Don't move this insn\n"               \
1701 "       andcc   %%g0,0,%%g1     ! Don't move this insn\n"               \
1702 "       mulscc  %%g1,%3,%%g1\n"                                         \
1703 "       mulscc  %%g1,%3,%%g1\n"                                         \
1704 "       mulscc  %%g1,%3,%%g1\n"                                         \
1705 "       mulscc  %%g1,%3,%%g1\n"                                         \
1706 "       mulscc  %%g1,%3,%%g1\n"                                         \
1707 "       mulscc  %%g1,%3,%%g1\n"                                         \
1708 "       mulscc  %%g1,%3,%%g1\n"                                         \
1709 "       mulscc  %%g1,%3,%%g1\n"                                         \
1710 "       mulscc  %%g1,%3,%%g1\n"                                         \
1711 "       mulscc  %%g1,%3,%%g1\n"                                         \
1712 "       mulscc  %%g1,%3,%%g1\n"                                         \
1713 "       mulscc  %%g1,%3,%%g1\n"                                         \
1714 "       mulscc  %%g1,%3,%%g1\n"                                         \
1715 "       mulscc  %%g1,%3,%%g1\n"                                         \
1716 "       mulscc  %%g1,%3,%%g1\n"                                         \
1717 "       mulscc  %%g1,%3,%%g1\n"                                         \
1718 "       mulscc  %%g1,%3,%%g1\n"                                         \
1719 "       mulscc  %%g1,%3,%%g1\n"                                         \
1720 "       mulscc  %%g1,%3,%%g1\n"                                         \
1721 "       mulscc  %%g1,%3,%%g1\n"                                         \
1722 "       mulscc  %%g1,%3,%%g1\n"                                         \
1723 "       mulscc  %%g1,%3,%%g1\n"                                         \
1724 "       mulscc  %%g1,%3,%%g1\n"                                         \
1725 "       mulscc  %%g1,%3,%%g1\n"                                         \
1726 "       mulscc  %%g1,%3,%%g1\n"                                         \
1727 "       mulscc  %%g1,%3,%%g1\n"                                         \
1728 "       mulscc  %%g1,%3,%%g1\n"                                         \
1729 "       mulscc  %%g1,%3,%%g1\n"                                         \
1730 "       mulscc  %%g1,%3,%%g1\n"                                         \
1731 "       mulscc  %%g1,%3,%%g1\n"                                         \
1732 "       mulscc  %%g1,%3,%%g1\n"                                         \
1733 "       mulscc  %%g1,%3,%%g1\n"                                         \
1734 "       mulscc  %%g1,0,%%g1\n"                                          \
1735 "       add     %%g1,%%g2,%0\n"                                         \
1736 "       rd      %%y,%1"                                                 \
1737            : "=r" (w1), "=r" (w0) : "%rI" (u), "r" (v)                  \
1738            : "%g1", "%g2" __AND_CLOBBER_CC)
1739 #define UMUL_TIME 39            /* 39 instructions */
1740 #endif
1741 #ifndef udiv_qrnnd
1742 #ifndef LONGLONG_STANDALONE
1743 #define udiv_qrnnd(q, r, n1, n0, d) \
1744   do { UWtype __r;                                                      \
1745     (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d));                    \
1746     (r) = __r;                                                          \
1747   } while (0)
1748 extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype);
1749 #ifndef UDIV_TIME
1750 #define UDIV_TIME 140
1751 #endif
1752 #endif /* LONGLONG_STANDALONE */
1753 #endif /* udiv_qrnnd */
1754 #endif /* __sparc__ */
1755
1756 #if defined (__sparc__) && W_TYPE_SIZE == 64
1757 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1758   __asm__ (                                                             \
1759        "addcc   %r4,%5,%1\n"                                            \
1760       " addccc  %r6,%7,%%g0\n"                                          \
1761       " addc    %r2,%3,%0"                                              \
1762           : "=r" (sh), "=&r" (sl)                                       \
1763           : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl),                \
1764             "%rJ" ((al) >> 32), "rI" ((bl) >> 32)                       \
1765            __CLOBBER_CC)
1766 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1767   __asm__ (                                                             \
1768        "subcc   %r4,%5,%1\n"                                            \
1769       " subccc  %r6,%7,%%g0\n"                                          \
1770       " subc    %r2,%3,%0"                                              \
1771           : "=r" (sh), "=&r" (sl)                                       \
1772           : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl),                 \
1773             "rJ" ((al) >> 32), "rI" ((bl) >> 32)                        \
1774            __CLOBBER_CC)
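/* In the two sequences above, addc and subc consume the 32-bit icc carry, so
   the middle addccc/subccc adds the high halves of the low words together
   with the carry out of bit 31; that leaves icc holding the carry (resp.
   borrow) out of bit 63 of the full 64-bit low-word operation for the final
   high-word instruction to pick up.  */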
1775 #if __VIS__ >= 0x300
1776 #undef add_ssaaaa
1777 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1778   __asm__ (                                                             \
1779        "addcc   %r4, %5, %1\n"                                          \
1780       " addxc   %r2, %r3, %0"                                           \
1781           : "=r" (sh), "=&r" (sl)                                       \
1782           : "rJ" (ah), "rJ" (bh), "%rJ" (al), "rI" (bl) __CLOBBER_CC)
1783 #define umul_ppmm(ph, pl, m0, m1) \
1784   do {                                                                  \
1785     UDItype __m0 = (m0), __m1 = (m1);                                   \
1786     (pl) = __m0 * __m1;                                                 \
1787     __asm__ ("umulxhi\t%2, %1, %0"                                      \
1788              : "=r" (ph)                                                \
1789              : "%r" (__m0), "r" (__m1));                                \
1790   } while (0)
1791 #define count_leading_zeros(count, x) \
1792   __asm__ ("lzd\t%1,%0" : "=r" (count) : "r" (x))
1793 #endif
1794 #endif
1795
1796 #if (defined (__vax) || defined (__vax__)) && W_TYPE_SIZE == 32
1797 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1798   __asm__ ("addl2 %5,%1\n\tadwc %3,%0"                                  \
1799            : "=g" (sh), "=&g" (sl)                                      \
1800            : "0"  ((USItype)(ah)), "g" ((USItype)(bh)),                 \
1801              "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1802 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1803   __asm__ ("subl2 %5,%1\n\tsbwc %3,%0"                                  \
1804            : "=g" (sh), "=&g" (sl)                                      \
1805            : "0" ((USItype)(ah)), "g" ((USItype)(bh)),                  \
1806              "1" ((USItype)(al)), "g" ((USItype)(bl)))
1807 #define smul_ppmm(xh, xl, m0, m1) \
1808   do {                                                                  \
1809     union {UDItype __ll;                                                \
1810            struct {USItype __l, __h;} __i;                              \
1811           } __x;                                                        \
1812     USItype __m0 = (m0), __m1 = (m1);                                   \
1813     __asm__ ("emul %1,%2,$0,%0"                                         \
1814              : "=g" (__x.__ll) : "g" (__m0), "g" (__m1));               \
1815     (xh) = __x.__i.__h; (xl) = __x.__i.__l;                             \
1816   } while (0)
1817 #define sdiv_qrnnd(q, r, n1, n0, d) \
1818   do {                                                                  \
1819     union {DItype __ll;                                                 \
1820            struct {SItype __l, __h;} __i;                               \
1821           } __x;                                                        \
1822     __x.__i.__h = n1; __x.__i.__l = n0;                                 \
1823     __asm__ ("ediv %3,%2,%0,%1"                                         \
1824              : "=g" (q), "=g" (r) : "g" (__x.__ll), "g" (d));           \
1825   } while (0)
1826 #if 0
1827 /* FIXME: This instruction appears to be unimplemented on some systems (vax
1828    8800 maybe). */
1829 #define count_trailing_zeros(count,x)                                   \
1830   do {                                                                  \
1831     __asm__ ("ffs 0, 31, %1, %0"                                        \
1832              : "=g" (count)                                             \
1833              : "g" ((USItype) (x)));                                    \
1834   } while (0)
1835 #endif
1836 #endif /* vax */
1837
1838 #if defined (__z8000__) && W_TYPE_SIZE == 16
1839 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1840   __asm__ ("add %H1,%H5\n\tadc  %H0,%H3"                                \
1841            : "=r" (sh), "=&r" (sl)                                      \
1842            : "0"  ((unsigned int)(ah)), "r" ((unsigned int)(bh)),       \
1843              "%1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
1844 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1845   __asm__ ("sub %H1,%H5\n\tsbc  %H0,%H3"                                \
1846            : "=r" (sh), "=&r" (sl)                                      \
1847            : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)),        \
1848              "1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
1849 #define umul_ppmm(xh, xl, m0, m1) \
1850   do {                                                                  \
1851     union {long int __ll;                                               \
1852            struct {unsigned int __h, __l;} __i;                         \
1853           } __x;                                                        \
1854     unsigned int __m0 = (m0), __m1 = (m1);                              \
1855     __asm__ ("mult      %S0,%H3"                                        \
1856              : "=r" (__x.__i.__h), "=r" (__x.__i.__l)                   \
1857              : "%1" (m0), "rQR" (m1));                                  \
1858     (xh) = __x.__i.__h; (xl) = __x.__i.__l;                             \
1859     (xh) += ((((signed int) __m0 >> 15) & __m1)                         \
1860              + (((signed int) __m1 >> 15) & __m0));                     \
1861   } while (0)
1862 #endif /* __z8000__ */
1863
1864 #endif /* __GNUC__ */
1865
1866 #endif /* NO_ASM */
1867
1868
1869 /* FIXME: "sidi" here is highly doubtful, should sometimes be "diti".  */
1870 #if !defined (umul_ppmm) && defined (__umulsidi3)
1871 #define umul_ppmm(ph, pl, m0, m1) \
1872   do {                                                                  \
1873     UDWtype __ll = __umulsidi3 (m0, m1);                                \
1874     (ph) = (UWtype) (__ll >> W_TYPE_SIZE);                              \
1875     (pl) = (UWtype) __ll;                                               \
1876   } while (0)
1877 #endif
1878
1879 #if !defined (__umulsidi3)
1880 #define __umulsidi3(u, v) \
1881   ({UWtype __hi, __lo;                                                  \
1882     umul_ppmm (__hi, __lo, u, v);                                       \
1883     ((UDWtype) __hi << W_TYPE_SIZE) | __lo; })
1884 #endif
1885
1886
1887 /* Use mpn_umul_ppmm or mpn_udiv_qrnnd functions, if they exist.  The "_r"
1888    forms have "reversed" arguments, meaning the pointer is last, which
1889    sometimes allows better parameter passing, in particular on 64-bit
1890    hppa. */
1891
1892 #define mpn_umul_ppmm  __MPN(umul_ppmm)
1893 extern UWtype mpn_umul_ppmm (UWtype *, UWtype, UWtype);
1894
1895 #if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm  \
1896   && ! defined (LONGLONG_STANDALONE)
1897 #define umul_ppmm(wh, wl, u, v)                                         \
1898   do {                                                                  \
1899     UWtype __umul_ppmm__p0;                                             \
1900     (wh) = mpn_umul_ppmm (&__umul_ppmm__p0, (UWtype) (u), (UWtype) (v));\
1901     (wl) = __umul_ppmm__p0;                                             \
1902   } while (0)
1903 #endif
1904
1905 #define mpn_umul_ppmm_r  __MPN(umul_ppmm_r)
1906 extern UWtype mpn_umul_ppmm_r (UWtype, UWtype, UWtype *);
1907
1908 #if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm_r        \
1909   && ! defined (LONGLONG_STANDALONE)
1910 #define umul_ppmm(wh, wl, u, v)                                         \
1911   do {                                                                  \
1912     UWtype __umul_p0;                                                   \
1913     (wh) = mpn_umul_ppmm_r ((UWtype) (u), (UWtype) (v), &__umul_p0);    \
1914     (wl) = __umul_p0;                                                   \
1915   } while (0)
1916 #endif
1917
1918 #define mpn_udiv_qrnnd  __MPN(udiv_qrnnd)
1919 extern UWtype mpn_udiv_qrnnd (UWtype *, UWtype, UWtype, UWtype);
1920
1921 #if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd        \
1922   && ! defined (LONGLONG_STANDALONE)
1923 #define udiv_qrnnd(q, r, n1, n0, d)                                     \
1924   do {                                                                  \
1925     UWtype __udiv_qrnnd_r;                                              \
1926     (q) = mpn_udiv_qrnnd (&__udiv_qrnnd_r,                              \
1927                           (UWtype) (n1), (UWtype) (n0), (UWtype) d);    \
1928     (r) = __udiv_qrnnd_r;                                               \
1929   } while (0)
1930 #endif
1931
1932 #define mpn_udiv_qrnnd_r  __MPN(udiv_qrnnd_r)
1933 extern UWtype mpn_udiv_qrnnd_r (UWtype, UWtype, UWtype, UWtype *);
1934
1935 #if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd_r      \
1936   && ! defined (LONGLONG_STANDALONE)
1937 #define udiv_qrnnd(q, r, n1, n0, d)                                     \
1938   do {                                                                  \
1939     UWtype __udiv_qrnnd_r;                                              \
1940     (q) = mpn_udiv_qrnnd_r ((UWtype) (n1), (UWtype) (n0), (UWtype) d,   \
1941                             &__udiv_qrnnd_r);                           \
1942     (r) = __udiv_qrnnd_r;                                               \
1943   } while (0)
1944 #endif
1945
1946
1947 /* If this machine has no inline assembler, use C macros.  */
1948
1949 #if !defined (add_ssaaaa)
1950 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1951   do {                                                                  \
1952     UWtype __x;                                                         \
1953     __x = (al) + (bl);                                                  \
1954     (sh) = (ah) + (bh) + (__x < (al));                                  \
1955     (sl) = __x;                                                         \
1956   } while (0)
1957 #endif
1958
1959 #if !defined (sub_ddmmss)
1960 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1961   do {                                                                  \
1962     UWtype __x;                                                         \
1963     __x = (al) - (bl);                                                  \
1964     (sh) = (ah) - (bh) - ((al) < (bl));                                 \
1965     (sl) = __x;                                                         \
1966   } while (0)
1967 #endif
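
/* Illustrative sketch (not part of longlong.h): the carry and borrow
   detection used by the C add_ssaaaa and sub_ddmmss above, written out for
   32-bit words (an assumption of this sketch) and cross-checked against
   native 64-bit arithmetic.  */
#include <assert.h>
#include <stdint.h>

static void
demo_two_word_add_sub (void)
{
  uint32_t ah = 1, al = 0xffffffffu, bh = 2, bl = 1;
  uint32_t sh, sl, dh, dl;

  /* add_ssaaaa: add the low words first, then carry into the high words.  */
  sl = al + bl;
  sh = ah + bh + (sl < al);     /* sl < al exactly when the low add wrapped */

  /* sub_ddmmss: the borrow compares the original low words.  */
  dl = al - bl;
  dh = ah - bh - (al < bl);

  assert ((((uint64_t) sh << 32) | sl)
          == ((((uint64_t) ah << 32) | al) + (((uint64_t) bh << 32) | bl)));
  assert ((((uint64_t) dh << 32) | dl)
          == ((((uint64_t) ah << 32) | al) - (((uint64_t) bh << 32) | bl)));
}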
1968
1969 /* If we lack umul_ppmm but have smul_ppmm, define umul_ppmm in terms of
1970    smul_ppmm.  */
1971 #if !defined (umul_ppmm) && defined (smul_ppmm)
1972 #define umul_ppmm(w1, w0, u, v)                                         \
1973   do {                                                                  \
1974     UWtype __w1;                                                        \
1975     UWtype __xm0 = (u), __xm1 = (v);                                    \
1976     smul_ppmm (__w1, w0, __xm0, __xm1);                                 \
1977     (w1) = __w1 + (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1)               \
1978                 + (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0);              \
1979   } while (0)
1980 #endif
1981
1982 /* If we still don't have umul_ppmm, define it using plain C.
1983
1984    For reference, when this code is used for squaring (i.e. u and v identical
1985    expressions), gcc recognises __x1 and __x2 are the same and generates 3
1986    multiplies, not 4.  The subsequent additions could be optimized a bit,
1987    but the only place GMP currently uses such a square is mpn_sqr_basecase,
1988    and chips obliged to use this generic C umul will have plenty of worse
1989    performance problems than a couple of extra instructions on the diagonal
1990    of sqr_basecase.  */
1991
1992 #if !defined (umul_ppmm)
1993 #define umul_ppmm(w1, w0, u, v)                                         \
1994   do {                                                                  \
1995     UWtype __x0, __x1, __x2, __x3;                                      \
1996     UHWtype __ul, __vl, __uh, __vh;                                     \
1997     UWtype __u = (u), __v = (v);                                        \
1998                                                                         \
1999     __ul = __ll_lowpart (__u);                                          \
2000     __uh = __ll_highpart (__u);                                         \
2001     __vl = __ll_lowpart (__v);                                          \
2002     __vh = __ll_highpart (__v);                                         \
2003                                                                         \
2004     __x0 = (UWtype) __ul * __vl;                                        \
2005     __x1 = (UWtype) __ul * __vh;                                        \
2006     __x2 = (UWtype) __uh * __vl;                                        \
2007     __x3 = (UWtype) __uh * __vh;                                        \
2008                                                                         \
2009     __x1 += __ll_highpart (__x0);/* this can't give carry */            \
2010     __x1 += __x2;               /* but this indeed can */               \
2011     if (__x1 < __x2)            /* did we get a carry? */              \
2012       __x3 += __ll_B;           /* yes, add it in the proper pos. */    \
2013                                                                         \
2014     (w1) = __x3 + __ll_highpart (__x1);                                 \
2015     (w0) = (__x1 << W_TYPE_SIZE/2) + __ll_lowpart (__x0);               \
2016   } while (0)
2017 #endif
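
/* Illustrative sketch (not part of longlong.h): the half-word schoolbook
   scheme used by the generic umul_ppmm above, written out for 32-bit words
   and verified against a native 64-bit product.  The demo_* name is
   hypothetical; only the structure mirrors the macro.  */
#include <assert.h>
#include <stdint.h>

static void
demo_umul_ppmm_generic (uint32_t *w1, uint32_t *w0, uint32_t u, uint32_t v)
{
  uint32_t ul = u & 0xffff, uh = u >> 16;
  uint32_t vl = v & 0xffff, vh = v >> 16;

  uint32_t x0 = ul * vl;        /* low  * low  */
  uint32_t x1 = ul * vh;        /* low  * high */
  uint32_t x2 = uh * vl;        /* high * low  */
  uint32_t x3 = uh * vh;        /* high * high */

  x1 += x0 >> 16;               /* cannot overflow */
  x1 += x2;                     /* but this can */
  if (x1 < x2)                  /* carry out of the middle sum */
    x3 += (uint32_t) 1 << 16;   /* worth 2^48, i.e. 2^16 in the high word */

  *w1 = x3 + (x1 >> 16);
  *w0 = (x1 << 16) + (x0 & 0xffff);

  assert ((((uint64_t) *w1 << 32) | *w0) == (uint64_t) u * v);
}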
2018
2019 /* If we don't have smul_ppmm, define it using umul_ppmm (which surely will
2020    exist in one form or another).  */
2021 #if !defined (smul_ppmm)
2022 #define smul_ppmm(w1, w0, u, v)                                         \
2023   do {                                                                  \
2024     UWtype __w1;                                                        \
2025     UWtype __xm0 = (u), __xm1 = (v);                                    \
2026     umul_ppmm (__w1, w0, __xm0, __xm1);                                 \
2027     (w1) = __w1 - (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1)               \
2028                 - (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0);              \
2029   } while (0)
2030 #endif
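
/* Illustrative sketch (not part of longlong.h): the sign correction shared
   by the two conversions above.  Modulo 2^W_TYPE_SIZE, the high words of
   the unsigned and signed products differ by v when u is negative and by u
   when v is negative; umul-from-smul adds those terms, smul-from-umul
   subtracts them.  32-bit words are assumed; demo_* is a hypothetical name.  */
#include <assert.h>
#include <stdint.h>

static void
demo_signed_unsigned_high (uint32_t u, uint32_t v)
{
  uint32_t uhi = (uint32_t) (((uint64_t) u * v) >> 32);
  uint32_t shi = (uint32_t)
    ((uint64_t) ((int64_t) (int32_t) u * (int32_t) v) >> 32);

  /* Same mask trick as the macros: -(x >> 31) is all ones when the sign
     bit of x is set, zero otherwise.  */
  uint32_t adj = (-(u >> 31) & v) + (-(v >> 31) & u);

  assert (uhi == (uint32_t) (shi + adj));   /* umul_ppmm from smul_ppmm */
  assert (shi == (uint32_t) (uhi - adj));   /* smul_ppmm from umul_ppmm */
}

/* e.g. demo_signed_unsigned_high (0xffffffffu, 2) exercises a negative u.  */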
2031
2032 /* Define this unconditionally, so it can be used for debugging.  */
2033 #define __udiv_qrnnd_c(q, r, n1, n0, d) \
2034   do {                                                                  \
2035     UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m;                     \
2036                                                                         \
2037     ASSERT ((d) != 0);                                                  \
2038     ASSERT ((n1) < (d));                                                \
2039                                                                         \
2040     __d1 = __ll_highpart (d);                                           \
2041     __d0 = __ll_lowpart (d);                                            \
2042                                                                         \
2043     __q1 = (n1) / __d1;                                                 \
2044     __r1 = (n1) - __q1 * __d1;                                          \
2045     __m = __q1 * __d0;                                                  \
2046     __r1 = __r1 * __ll_B | __ll_highpart (n0);                          \
2047     if (__r1 < __m)                                                     \
2048       {                                                                 \
2049         __q1--, __r1 += (d);                                            \
2050         if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */\
2051           if (__r1 < __m)                                               \
2052             __q1--, __r1 += (d);                                        \
2053       }                                                                 \
2054     __r1 -= __m;                                                        \
2055                                                                         \
2056     __q0 = __r1 / __d1;                                                 \
2057     __r0 = __r1 - __q0 * __d1;                                          \
2058     __m = __q0 * __d0;                                                  \
2059     __r0 = __r0 * __ll_B | __ll_lowpart (n0);                           \
2060     if (__r0 < __m)                                                     \
2061       {                                                                 \
2062         __q0--, __r0 += (d);                                            \
2063         if (__r0 >= (d))                                                \
2064           if (__r0 < __m)                                               \
2065             __q0--, __r0 += (d);                                        \
2066       }                                                                 \
2067     __r0 -= __m;                                                        \
2068                                                                         \
2069     (q) = __q1 * __ll_B | __q0;                                         \
2070     (r) = __r0;                                                         \
2071   } while (0)
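
/* Illustrative sketch (not part of longlong.h): the two half-word division
   steps performed by __udiv_qrnnd_c above, written out for 32-bit words and
   cross-checked against a native 64-bit division.  As documented at the top
   of this file, the divisor must be normalized (top bit set) and n1 < d.
   The demo_* name is hypothetical.  */
#include <assert.h>
#include <stdint.h>

static void
demo_udiv_qrnnd_c (uint32_t *q, uint32_t *r,
                   uint32_t n1, uint32_t n0, uint32_t d)
{
  uint32_t d1 = d >> 16, d0 = d & 0xffff;
  uint32_t q1, q0, r1, r0, m;

  assert (d >> 31);             /* normalized divisor */
  assert (n1 < d);              /* quotient fits in one word */

  /* First quotient half-word, from the top three half-words of n1:n0.  */
  q1 = n1 / d1;
  r1 = n1 - q1 * d1;
  m = q1 * d0;
  r1 = (r1 << 16) | (n0 >> 16);
  if (r1 < m)
    {
      q1--, r1 += d;
      if (r1 >= d && r1 < m)    /* at most two corrections are needed */
        q1--, r1 += d;
    }
  r1 -= m;

  /* Second quotient half-word, bringing in the low half of n0.  */
  q0 = r1 / d1;
  r0 = r1 - q0 * d1;
  m = q0 * d0;
  r0 = (r0 << 16) | (n0 & 0xffff);
  if (r0 < m)
    {
      q0--, r0 += d;
      if (r0 >= d && r0 < m)
        q0--, r0 += d;
    }
  r0 -= m;

  *q = (q1 << 16) | q0;
  *r = r0;

  assert ((((uint64_t) n1 << 32) | n0) == (uint64_t) d * *q + *r);
  assert (*r < d);
}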
2072
2073 /* If the processor has no udiv_qrnnd but sdiv_qrnnd, go through
2074    __udiv_w_sdiv (defined in libgcc or elsewhere).  */
2075 #if !defined (udiv_qrnnd) && defined (sdiv_qrnnd)
2076 #define udiv_qrnnd(q, r, nh, nl, d) \
2077   do {                                                                  \
2078     UWtype __r;                                                         \
2079     (q) = __MPN(udiv_w_sdiv) (&__r, nh, nl, d);                         \
2080     (r) = __r;                                                          \
2081   } while (0)
2082 __GMP_DECLSPEC UWtype __MPN(udiv_w_sdiv) (UWtype *, UWtype, UWtype, UWtype);
2083 #endif
2084
2085 /* If udiv_qrnnd was not defined for this processor, use __udiv_qrnnd_c.  */
2086 #if !defined (udiv_qrnnd)
2087 #define UDIV_NEEDS_NORMALIZATION 1
2088 #define udiv_qrnnd __udiv_qrnnd_c
2089 #endif
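
/* Illustrative sketch (not part of longlong.h): what UDIV_NEEDS_NORMALIZATION
   asks of the caller.  The divisor and the two-word numerator are shifted
   left until the divisor's top bit is set, and the remainder is shifted back
   afterwards; the quotient is unaffected.  32-bit words are assumed, a
   native 64-by-32 division stands in for udiv_qrnnd, and demo_* is a
   hypothetical name.  */
#include <assert.h>
#include <stdint.h>

static void
demo_normalized_division (uint32_t *q, uint32_t *r,
                          uint32_t n1, uint32_t n0, uint32_t d)
{
  int norm = 0;

  assert (d != 0 && n1 < d);

  while ((d >> 31) == 0)        /* count_leading_zeros would give this shift */
    {
      d <<= 1;
      n1 = (n1 << 1) | (n0 >> 31);
      n0 <<= 1;
      norm++;
    }

  {                             /* stand-in for udiv_qrnnd on the shifted operands */
    uint64_t n = ((uint64_t) n1 << 32) | n0;
    *q = (uint32_t) (n / d);
    *r = (uint32_t) (n % d);
  }

  *r >>= norm;                  /* undo the normalization on the remainder */
}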
2090
2091 #if !defined (count_leading_zeros)
2092 #define count_leading_zeros(count, x) \
2093   do {                                                                  \
2094     UWtype __xr = (x);                                                  \
2095     UWtype __a;                                                         \
2096                                                                         \
2097     if (W_TYPE_SIZE == 32)                                              \
2098       {                                                                 \
2099         __a = __xr < ((UWtype) 1 << 2*__BITS4)                          \
2100           ? (__xr < ((UWtype) 1 << __BITS4) ? 1 : __BITS4 + 1)          \
2101           : (__xr < ((UWtype) 1 << 3*__BITS4) ? 2*__BITS4 + 1           \
2102           : 3*__BITS4 + 1);                                             \
2103       }                                                                 \
2104     else                                                                \
2105       {                                                                 \
2106         for (__a = W_TYPE_SIZE - 8; __a > 0; __a -= 8)                  \
2107           if (((__xr >> __a) & 0xff) != 0)                              \
2108             break;                                                      \
2109         ++__a;                                                          \
2110       }                                                                 \
2111                                                                         \
2112     (count) = W_TYPE_SIZE + 1 - __a - __clz_tab[__xr >> __a];           \
2113   } while (0)
2114 /* This version gives a well-defined value for zero. */
2115 #define COUNT_LEADING_ZEROS_0 (W_TYPE_SIZE - 1)
2116 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
2117 #define COUNT_LEADING_ZEROS_SLOW
2118 #endif
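
/* Illustrative sketch (not part of longlong.h): how __a and __clz_tab
   cooperate in the generic count_leading_zeros above, for 32-bit words.
   The table built here follows the layout the formula assumes (entry i is
   one more than the bit length of i); the real 129-entry __clz_tab lives
   elsewhere in GMP.  demo_* is a hypothetical name.  */
#include <assert.h>
#include <stdint.h>

static void
demo_count_leading_zeros (void)
{
  unsigned char clz_tab[129];
  unsigned i, b, a, count;
  uint32_t x, t;

  for (i = 0; i <= 128; i++)
    {
      for (b = 0, t = i; t != 0; t >>= 1)
        b++;                    /* b = bit length of i */
      clz_tab[i] = (unsigned char) (b + 1);
    }

  for (i = 0; i < 32; i++)
    {
      x = (uint32_t) 1 << i;
      a = x < ((uint32_t) 1 << 16)      /* 1, 9, 17 or 25, as in the macro */
          ? (x < ((uint32_t) 1 << 8) ? 1 : 9)
          : (x < ((uint32_t) 1 << 24) ? 17 : 25);
      count = 32 + 1 - a - clz_tab[x >> a];
      assert (count == 31 - i);         /* the leading-zero count of 2^i */
    }
}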
2119
2120 /* clz_tab needed by mpn/x86/pentium/mod_1.asm in a fat binary */
2121 #if HAVE_HOST_CPU_FAMILY_x86 && WANT_FAT_BINARY
2122 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
2123 #endif
2124
2125 #ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
2126 extern const unsigned char __GMP_DECLSPEC __clz_tab[129];
2127 #endif
2128
2129 #if !defined (count_trailing_zeros)
2130 #if !defined (COUNT_LEADING_ZEROS_SLOW)
2131 /* Define count_trailing_zeros using an asm count_leading_zeros.  */
2132 #define count_trailing_zeros(count, x)                                  \
2133   do {                                                                  \
2134     UWtype __ctz_x = (x);                                               \
2135     UWtype __ctz_c;                                                     \
2136     ASSERT (__ctz_x != 0);                                              \
2137     count_leading_zeros (__ctz_c, __ctz_x & -__ctz_x);                  \
2138     (count) = W_TYPE_SIZE - 1 - __ctz_c;                                \
2139   } while (0)
2140 #else
2141 /* Define count_trailing_zeros in plain C, assuming small counts are common.
2142    We use clz_tab without ado, since the C count_leading_zeros above will have
2143    pulled it in.  */
2144 #define count_trailing_zeros(count, x)                                  \
2145   do {                                                                  \
2146     UWtype __ctz_x = (x);                                               \
2147     int __ctz_c;                                                        \
2148                                                                         \
2149     if (LIKELY ((__ctz_x & 0xff) != 0))                                 \
2150       (count) = __clz_tab[__ctz_x & -__ctz_x] - 2;                      \
2151     else                                                                \
2152       {                                                                 \
2153         for (__ctz_c = 8 - 2; __ctz_c < W_TYPE_SIZE - 2; __ctz_c += 8)  \
2154           {                                                             \
2155             __ctz_x >>= 8;                                              \
2156             if (LIKELY ((__ctz_x & 0xff) != 0))                         \
2157               break;                                                    \
2158           }                                                             \
2159                                                                         \
2160         (count) = __ctz_c + __clz_tab[__ctz_x & -__ctz_x];              \
2161       }                                                                 \
2162   } while (0)
2163 #endif
2164 #endif
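
/* Illustrative sketch (not part of longlong.h): the identity used by the
   first count_trailing_zeros definition above.  Isolating the lowest set
   bit with x & -x turns trailing zeros into leading zeros.  GCC builtins
   stand in for the asm count_leading_zeros here, and a 32-bit int is
   assumed; demo_* is a hypothetical name.  */
#include <assert.h>
#include <stdint.h>

static void
demo_count_trailing_zeros (uint32_t x)
{
  unsigned clz, ctz;

  assert (x != 0);              /* zero is undefined here, as above */

  clz = (unsigned) __builtin_clz (x & -x);   /* exactly one bit left set */
  ctz = 32 - 1 - clz;

  assert (ctz == (unsigned) __builtin_ctz (x));
}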
2165
2166 #ifndef UDIV_NEEDS_NORMALIZATION
2167 #define UDIV_NEEDS_NORMALIZATION 0
2168 #endif
2169
2170 /* Whether udiv_qrnnd is actually implemented with udiv_qrnnd_preinv, in
2171    which case the latter should always be used.  */
2172 #ifndef UDIV_PREINV_ALWAYS
2173 #define UDIV_PREINV_ALWAYS 0
2174 #endif
2175
2176 /* Give defaults for UMUL_TIME and UDIV_TIME.  */
2177 #ifndef UMUL_TIME
2178 #define UMUL_TIME 1
2179 #endif
2180
2181 #ifndef UDIV_TIME
2182 #define UDIV_TIME UMUL_TIME
2183 #endif