1 /* longlong.h -- definitions for mixed size 32/64 bit arithmetic.
3 Copyright 1991-1994, 1996, 1997, 1999-2005, 2007-2009, 2011-2013 Free Software
6 This file is part of the GNU MP Library.
8 The GNU MP Library is free software; you can redistribute it and/or modify
9 it under the terms of either:
11 * the GNU Lesser General Public License as published by the Free
12 Software Foundation; either version 3 of the License, or (at your
13 option) any later version.
17 * the GNU General Public License as published by the Free Software
18 Foundation; either version 2 of the License, or (at your option) any
21 or both in parallel, as here.
23 The GNU MP Library is distributed in the hope that it will be useful, but
24 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
25 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
28 You should have received copies of the GNU General Public License and the
29 GNU Lesser General Public License along with the GNU MP Library. If not,
30 see https://www.gnu.org/licenses/. */
32 /* You have to define the following before including this file:
34 UWtype -- An unsigned type, default type for operations (typically a "word")
35 UHWtype -- An unsigned type, at least half the size of UWtype
UDWtype -- An unsigned type, at least twice as large as UWtype
37 W_TYPE_SIZE -- size in bits of UWtype
39 SItype, USItype -- Signed and unsigned 32 bit types
40 DItype, UDItype -- Signed and unsigned 64 bit types
42 On a 32 bit machine UWtype should typically be USItype;
43 on a 64 bit machine, UWtype should typically be UDItype.
47 LONGLONG_STANDALONE -- Avoid code that needs machine-dependent support files
48 NO_ASM -- Disable inline asm
CAUTION! Using this version of longlong.h outside of GMP is not safe. You
need to include gmp.h and gmp-impl.h, or certain things might not work as
expected. */
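/* For illustration only: a minimal, hypothetical set of definitions that a
   standalone user might supply on a typical 64-bit host before including
   this file.  Inside GMP, gmp.h and gmp-impl.h provide the real ones.

     typedef unsigned int  USItype __attribute__ ((mode (SI)));
     typedef int           SItype  __attribute__ ((mode (SI)));
     typedef unsigned int  UDItype __attribute__ ((mode (DI)));
     typedef int           DItype  __attribute__ ((mode (DI)));
     typedef UDItype       UWtype;
     typedef USItype       UHWtype;
     typedef unsigned __int128 UDWtype;   (requires a 128-bit type)
     #define W_TYPE_SIZE 64
     #define LONGLONG_STANDALONE
     #include "longlong.h"
*/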
56 #define __BITS4 (W_TYPE_SIZE / 4)
57 #define __ll_B ((UWtype) 1 << (W_TYPE_SIZE / 2))
58 #define __ll_lowpart(t) ((UWtype) (t) & (__ll_B - 1))
59 #define __ll_highpart(t) ((UWtype) (t) >> (W_TYPE_SIZE / 2))
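/* Example (illustrative): with W_TYPE_SIZE == 32, __ll_B is 0x10000, so for
   t == 0x12345678 we get __ll_highpart (t) == 0x1234 and
   __ll_lowpart (t) == 0x5678.  A double-word product can then be built out
   of four such half-word multiplies.  */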
/* This is used to make sure that no undesirable sharing takes place between
   different libraries that use this file. */
64 #define __MPN(x) __##x
67 /* Define auxiliary asm macros.
69 1) umul_ppmm(high_prod, low_prod, multiplier, multiplicand) multiplies two
70 UWtype integers MULTIPLIER and MULTIPLICAND, and generates a two UWtype
71 word product in HIGH_PROD and LOW_PROD.
73 2) __umulsidi3(a,b) multiplies two UWtype integers A and B, and returns a
74 UDWtype product. This is just a variant of umul_ppmm.
76 3) udiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
77 denominator) divides a UDWtype, composed by the UWtype integers
78 HIGH_NUMERATOR and LOW_NUMERATOR, by DENOMINATOR and places the quotient
79 in QUOTIENT and the remainder in REMAINDER. HIGH_NUMERATOR must be less
than DENOMINATOR for correct operation. If, in addition, the operation
requires the most significant bit of DENOMINATOR to be 1, the pre-processor
symbol UDIV_NEEDS_NORMALIZATION is defined to 1.
84 4) sdiv_qrnnd(quotient, remainder, high_numerator, low_numerator,
denominator). Like udiv_qrnnd but the numbers are signed. The quotient
is rounded towards 0.
88 5) count_leading_zeros(count, x) counts the number of zero-bits from the
89 msb to the first non-zero bit in the UWtype X. This is the number of
90 steps X needs to be shifted left to set the msb. Undefined for X == 0,
91 unless the symbol COUNT_LEADING_ZEROS_0 is defined to some value.
93 6) count_trailing_zeros(count, x) like count_leading_zeros, but counts
94 from the least significant end.
96 7) add_ssaaaa(high_sum, low_sum, high_addend_1, low_addend_1,
97 high_addend_2, low_addend_2) adds two UWtype integers, composed by
98 HIGH_ADDEND_1 and LOW_ADDEND_1, and HIGH_ADDEND_2 and LOW_ADDEND_2
99 respectively. The result is placed in HIGH_SUM and LOW_SUM. Overflow
100 (i.e. carry out) is not stored anywhere, and is lost.
102 8) sub_ddmmss(high_difference, low_difference, high_minuend, low_minuend,
103 high_subtrahend, low_subtrahend) subtracts two two-word UWtype integers,
composed by HIGH_MINUEND and LOW_MINUEND, and HIGH_SUBTRAHEND and
LOW_SUBTRAHEND respectively. The result is placed in HIGH_DIFFERENCE
and LOW_DIFFERENCE. Overflow (i.e. carry out) is not stored anywhere,
and is lost.
If any of these macros are left undefined for a particular CPU,
C macros are used; a sketch of the generic umul_ppmm is given below.
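For illustration, a sketch (not the exact code) of how the generic C
fallback for umul_ppmm works in terms of the half-word helpers above:

  #define umul_ppmm_sketch(w1, w0, u, v)
    do {
      UWtype __ul = __ll_lowpart (u), __uh = __ll_highpart (u);
      UWtype __vl = __ll_lowpart (v), __vh = __ll_highpart (v);
      UWtype __x0 = __ul * __vl, __x1 = __ul * __vh;
      UWtype __x2 = __uh * __vl, __x3 = __uh * __vh;
      __x1 += __ll_highpart (__x0);   -- never carries
      __x1 += __x2;                   -- may carry
      if (__x1 < __x2)
        __x3 += __ll_B;
      (w1) = __x3 + __ll_highpart (__x1);
      (w0) = (__x1 << W_TYPE_SIZE/2) + __ll_lowpart (__x0);
    } while (0)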
115 For add_ssaaaa the two high and two low addends can both commute, but
116 unfortunately gcc only supports one "%" commutative in each asm block.
117 This has always been so but is only documented in recent versions
118 (eg. pre-release 3.3). Having two or more "%"s can cause an internal
119 compiler error in certain rare circumstances.
121 Apparently it was only the last "%" that was ever actually respected, so
122 the code has been updated to leave just that. Clearly there's a free
123 choice whether high or low should get it, if there's a reason to favour
124 one over the other. Also obviously when the constraints on the two
operands are identical there's no benefit to the reloader in any "%" at
all. */
130 /* The CPUs come in alphabetical order below.
132 Please add support for more CPUs here, or improve the current support
133 for the CPUs below! */
136 /* count_leading_zeros_gcc_clz is count_leading_zeros implemented with gcc
137 3.4 __builtin_clzl or __builtin_clzll, according to our limb size.
Similarly count_trailing_zeros_gcc_ctz using __builtin_ctzl or
__builtin_ctzll.
These builtins are only used where we've checked what code comes out; on
some chips they're merely libgcc calls, in which case we instead want an
inline (either asm or generic C).
145 These builtins are better than an asm block of the same insn, since an
146 asm block doesn't give gcc any information about scheduling or resource
147 usage. We keep an asm block for use on prior versions of gcc though.
149 For reference, __builtin_ffs existed in gcc prior to __builtin_clz, but
150 it's not used (for count_leading_zeros) because it generally gives extra
code to ensure the result is 0 when the input is 0, which we don't need
or want. */
154 #ifdef _LONG_LONG_LIMB
#define count_leading_zeros_gcc_clz(count,x) \
  do { ASSERT ((x) != 0); (count) = __builtin_clzll (x); } while (0)
#else
#define count_leading_zeros_gcc_clz(count,x) \
  do { ASSERT ((x) != 0); (count) = __builtin_clzl (x); } while (0)
#endif

#ifdef _LONG_LONG_LIMB
#define count_trailing_zeros_gcc_ctz(count,x) \
  do { ASSERT ((x) != 0); (count) = __builtin_ctzll (x); } while (0)
#else
#define count_trailing_zeros_gcc_ctz(count,x) \
  do { ASSERT ((x) != 0); (count) = __builtin_ctzl (x); } while (0)
#endif
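/* Example use (illustrative): assuming gcc >= 3.4 and a nonzero argument,

     int c;
     count_leading_zeros_gcc_clz (c, (UWtype) 1);

   leaves c == W_TYPE_SIZE - 1, since only the least significant bit is set. */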
183 /* FIXME: The macros using external routines like __MPN(count_leading_zeros)
184 don't need to be under !NO_ASM */
185 #if ! defined (NO_ASM)
187 #if defined (__alpha) && W_TYPE_SIZE == 64
188 /* Most alpha-based machines, except Cray systems. */
189 #if defined (__GNUC__)
190 #if __GMP_GNUC_PREREQ (3,3)
#define umul_ppmm(ph, pl, m0, m1) \
  do { \
    UDItype __m0 = (m0), __m1 = (m1); \
    (ph) = __builtin_alpha_umulh (__m0, __m1); \
    (pl) = __m0 * __m1; \
  } while (0)
#else
#define umul_ppmm(ph, pl, m0, m1) \
  do { \
    UDItype __m0 = (m0), __m1 = (m1); \
    __asm__ ("umulh %r1,%2,%0" \
	     : "=r" (ph) \
	     : "%rJ" (m0), "rI" (m1)); \
    (pl) = __m0 * __m1; \
  } while (0)
#endif
#else /* ! __GNUC__ */
209 #include <machine/builtins.h>
#define umul_ppmm(ph, pl, m0, m1) \
  do { \
    UDItype __m0 = (m0), __m1 = (m1); \
    (ph) = __UMULH (__m0, __m1); \
    (pl) = __m0 * __m1; \
  } while (0)
#endif /* __GNUC__ */
217 #ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __di; \
    __di = __MPN(invert_limb) (d); \
    ASSERT (__di); \
    udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
  } while (0)
223 #define UDIV_PREINV_ALWAYS 1
224 #define UDIV_NEEDS_NORMALIZATION 1
225 #define UDIV_TIME 220
226 #endif /* LONGLONG_STANDALONE */
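/* Sketch of the pre-inversion idea used above (see gmp-impl.h for the real
   definitions): with B = 2^W_TYPE_SIZE and a normalized divisor d,
   __MPN(invert_limb) computes __di = floor ((B*B - 1) / d) - B once, after
   which each udiv_qrnnd_preinv costs roughly one umul_ppmm plus a few
   adjustment steps instead of a hardware divide.  */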
228 /* clz_tab is required in all configurations, since mpn/alpha/cntlz.asm
229 always goes into libgmp.so, even when not actually used. */
230 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
232 #if defined (__GNUC__) && HAVE_HOST_CPU_alpha_CIX
233 #define count_leading_zeros(COUNT,X) \
234 __asm__("ctlz %1,%0" : "=r"(COUNT) : "r"(X))
235 #define count_trailing_zeros(COUNT,X) \
236 __asm__("cttz %1,%0" : "=r"(COUNT) : "r"(X))
237 #endif /* clz/ctz using cix */
239 #if ! defined (count_leading_zeros) \
240 && defined (__GNUC__) && ! defined (LONGLONG_STANDALONE)
241 /* ALPHA_CMPBGE_0 gives "cmpbge $31,src,dst", ie. test src bytes == 0.
242 "$31" is written explicitly in the asm, since an "r" constraint won't
243 select reg 31. There seems no need to worry about "r31" syntax for cray,
244 since gcc itself (pre-release 3.4) emits just $31 in various places. */
245 #define ALPHA_CMPBGE_0(dst, src) \
246 do { asm ("cmpbge $31, %1, %0" : "=r" (dst) : "r" (src)); } while (0)
247 /* Zero bytes are turned into bits with cmpbge, a __clz_tab lookup counts
248 them, locating the highest non-zero byte. A second __clz_tab lookup
249 counts the leading zero bits in that byte, giving the result. */
#define count_leading_zeros(count, x) \
  do { \
    UWtype __clz__b, __clz__c, __clz__x = (x); \
    ALPHA_CMPBGE_0 (__clz__b, __clz__x); /* zero bytes */ \
    __clz__b = __clz_tab [(__clz__b >> 1) ^ 0x7F]; /* 8 to 1 byte */ \
    __clz__b = __clz__b * 8 - 7; /* 57 to 1 shift */ \
    __clz__x >>= __clz__b; \
    __clz__c = __clz_tab [__clz__x]; /* 8 to 1 bit */ \
    __clz__b = 65 - __clz__b; \
    (count) = __clz__b - __clz__c; \
  } while (0)
261 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
262 #endif /* clz using cmpbge */
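/* Illustrative C equivalent of the cmpbge scheme above (a sketch using a
   hypothetical byte loop; the asm does the eight byte tests in one insn):

     int b = 56;
     while (b > 0 && (x >> b) == 0)    -- find the highest non-zero byte
       b -= 8;
     count = 64 - b - bits_in_byte (x >> b);

   where bits_in_byte gives the bit length of a value in 0..255, the same
   quantity the second __clz_tab lookup produces.  */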
264 #if ! defined (count_leading_zeros) && ! defined (LONGLONG_STANDALONE)
265 #if HAVE_ATTRIBUTE_CONST
long __MPN(count_leading_zeros) (UDItype) __attribute__ ((const));
#else
long __MPN(count_leading_zeros) (UDItype);
#endif
270 #define count_leading_zeros(count, x) \
271 ((count) = __MPN(count_leading_zeros) (x))
#endif /* clz using mpn */
#endif /* __alpha */
275 #if defined (__AVR) && W_TYPE_SIZE == 8
#define umul_ppmm(ph, pl, m0, m1) \
  do { \
    unsigned short __p = (unsigned short) (m0) * (m1); \
    (ph) = __p >> 8; \
    (pl) = __p; \
  } while (0)
#endif /* AVR */
284 #if defined (_CRAY) && W_TYPE_SIZE == 64
285 #include <intrinsics.h>
286 #define UDIV_PREINV_ALWAYS 1
287 #define UDIV_NEEDS_NORMALIZATION 1
288 #define UDIV_TIME 220
289 long __MPN(count_leading_zeros) (UDItype);
290 #define count_leading_zeros(count, x) \
291 ((count) = _leadz ((UWtype) (x)))
292 #if defined (_CRAYIEEE) /* I.e., Cray T90/ieee, T3D, and T3E */
#define umul_ppmm(ph, pl, m0, m1) \
  do { \
    UDItype __m0 = (m0), __m1 = (m1); \
    (ph) = _int_mult_upper (__m0, __m1); \
    (pl) = __m0 * __m1; \
  } while (0)
299 #ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __di; \
    __di = __MPN(invert_limb) (d); \
    ASSERT (__di); \
    udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
  } while (0)
305 #endif /* LONGLONG_STANDALONE */
#endif /* _CRAYIEEE */
#endif /* _CRAY */
309 #if defined (__ia64) && W_TYPE_SIZE == 64
310 /* This form encourages gcc (pre-release 3.4 at least) to emit predicated
311 "sub r=r,r" and "sub r=r,r,1", giving a 2 cycle latency. The generic
312 code using "al<bl" arithmetically comes out making an actual 0 or 1 in a
313 register, which takes an extra cycle. */
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do { \
    UWtype __x; \
    __x = (al) - (bl); \
    if ((al) < (bl)) \
      (sh) = (ah) - (bh) - 1; \
    else \
      (sh) = (ah) - (bh); \
    (sl) = __x; \
  } while (0)
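/* For comparison, the generic form this beats computes the borrow
   arithmetically (a sketch):

     (sl) = (al) - (bl);
     (sh) = (ah) - (bh) - ((al) < (bl));

   which materializes the 0-or-1 comparison result in a register and costs
   an extra cycle, as noted above.  */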
324 #if defined (__GNUC__) && ! defined (__INTEL_COMPILER)
325 /* Do both product parts in assembly, since that gives better code with
326 all gcc versions. Some callers will just use the upper part, and in
327 that situation we waste an instruction, but not any cycles. */
328 #define umul_ppmm(ph, pl, m0, m1) \
329 __asm__ ("xma.hu %0 = %2, %3, f0\n\txma.l %1 = %2, %3, f0" \
330 : "=&f" (ph), "=f" (pl) \
331 : "f" (m0), "f" (m1))
#define count_leading_zeros(count, x) \
  do { \
    UWtype _x = (x), _y, _a, _c; \
    __asm__ ("mux1 %0 = %1, @rev" : "=r" (_y) : "r" (_x)); \
    __asm__ ("czx1.l %0 = %1" : "=r" (_a) : "r" (-_y | _y)); \
    _c = (_a - 1) << 3; \
    _x >>= _c; \
    if (_x >= 1 << 4) \
      _x >>= 4, _c += 4; \
    if (_x >= 1 << 2) \
      _x >>= 2, _c += 2; \
    _c += _x >> 1; \
    (count) = W_TYPE_SIZE - 1 - _c; \
  } while (0)
347 /* similar to what gcc does for __builtin_ffs, but 0 based rather than 1
348 based, and we don't need a special case for x==0 here */
#define count_trailing_zeros(count, x) \
  do { \
    UWtype __ctz_x = (x); \
    __asm__ ("popcnt %0 = %1" \
	     : "=r" (count) \
	     : "r" ((__ctz_x-1) & ~__ctz_x)); \
  } while (0)
#endif /* __GNUC__ */
357 #if defined (__INTEL_COMPILER)
358 #include <ia64intrin.h>
#define umul_ppmm(ph, pl, m0, m1) \
  do { \
    UWtype _m0 = (m0), _m1 = (m1); \
    ph = _m64_xmahu (_m0, _m1, 0); \
    pl = _m0 * _m1; \
  } while (0)
#endif /* __INTEL_COMPILER */
366 #ifndef LONGLONG_STANDALONE
367 #define udiv_qrnnd(q, r, n1, n0, d) \
369 __di = __MPN(invert_limb) (d); \
370 udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
372 #define UDIV_PREINV_ALWAYS 1
373 #define UDIV_NEEDS_NORMALIZATION 1
#endif /* LONGLONG_STANDALONE */
#define UDIV_TIME 220
#endif /* __ia64 */
379 #if defined (__GNUC__)
381 /* We sometimes need to clobber "cc" with gcc2, but that would not be
382 understood by gcc1. Use cpp to avoid major code duplication. */
#if __GNUC__ < 2
#define __CLOBBER_CC
#define __AND_CLOBBER_CC
386 #else /* __GNUC__ >= 2 */
387 #define __CLOBBER_CC : "cc"
388 #define __AND_CLOBBER_CC , "cc"
389 #endif /* __GNUC__ < 2 */
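/* Example (illustrative): a condition-code clobber is appended portably as

     __asm__ ("adds ...\n\tadc ..." : "=r" (sh), "=&r" (sl)
	      : "r" (ah), "r" (bh), "r" (al), "r" (bl) __CLOBBER_CC);

   which expands to nothing under gcc 1 and to `: "cc"' under gcc >= 2.  */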
391 #if (defined (__a29k__) || defined (_AM29K)) && W_TYPE_SIZE == 32
392 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
393 __asm__ ("add %1,%4,%5\n\taddc %0,%2,%3" \
394 : "=r" (sh), "=&r" (sl) \
395 : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl))
396 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
397 __asm__ ("sub %1,%4,%5\n\tsubc %0,%2,%3" \
398 : "=r" (sh), "=&r" (sl) \
399 : "r" (ah), "rI" (bh), "r" (al), "rI" (bl))
#define umul_ppmm(xh, xl, m0, m1) \
  do { \
    USItype __m0 = (m0), __m1 = (m1); \
    __asm__ ("multiplu %0,%1,%2" \
	     : "=r" (xl) \
	     : "r" (__m0), "r" (__m1)); \
    __asm__ ("multmu %0,%1,%2" \
	     : "=r" (xh) \
	     : "r" (__m0), "r" (__m1)); \
  } while (0)
410 #define udiv_qrnnd(q, r, n1, n0, d) \
411 __asm__ ("dividu %0,%3,%4" \
412 : "=r" (q), "=q" (r) \
413 : "1" (n1), "r" (n0), "r" (d))
#define count_leading_zeros(count, x) \
  __asm__ ("clz %0,%1" \
	   : "=r" (count) : "r" (x))
418 #define COUNT_LEADING_ZEROS_0 32
419 #endif /* __a29k__ */
421 #if defined (__arc__)
422 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
423 __asm__ ("add.f\t%1, %4, %5\n\tadc\t%0, %2, %3" \
426 : "r" ((USItype) (ah)), \
427 "rIJ" ((USItype) (bh)), \
428 "%r" ((USItype) (al)), \
429 "rIJ" ((USItype) (bl)))
430 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
431 __asm__ ("sub.f\t%1, %4, %5\n\tsbc\t%0, %2, %3" \
434 : "r" ((USItype) (ah)), \
435 "rIJ" ((USItype) (bh)), \
436 "r" ((USItype) (al)), \
437 "rIJ" ((USItype) (bl)))
440 #if defined (__arm__) && !defined (__thumb__) && W_TYPE_SIZE == 32
441 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
442 __asm__ ("adds\t%1, %4, %5\n\tadc\t%0, %2, %3" \
443 : "=r" (sh), "=&r" (sl) \
444 : "r" (ah), "rI" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC)
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do { \
    if (__builtin_constant_p (al)) \
      { \
	if (__builtin_constant_p (ah)) \
	  __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2" \
		   : "=r" (sh), "=&r" (sl) \
		   : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
	else \
	  __asm__ ("rsbs\t%1, %5, %4\n\tsbc\t%0, %2, %3" \
		   : "=r" (sh), "=&r" (sl) \
		   : "r" (ah), "rI" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
      } \
    else if (__builtin_constant_p (ah)) \
      { \
	if (__builtin_constant_p (bl)) \
	  __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2" \
		   : "=r" (sh), "=&r" (sl) \
		   : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
	else \
	  __asm__ ("rsbs\t%1, %5, %4\n\trsc\t%0, %3, %2" \
		   : "=r" (sh), "=&r" (sl) \
		   : "rI" (ah), "r" (bh), "rI" (al), "r" (bl) __CLOBBER_CC); \
      } \
    else if (__builtin_constant_p (bl)) \
      { \
	if (__builtin_constant_p (bh)) \
	  __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \
		   : "=r" (sh), "=&r" (sl) \
		   : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
	else \
	  __asm__ ("subs\t%1, %4, %5\n\trsc\t%0, %3, %2" \
		   : "=r" (sh), "=&r" (sl) \
		   : "rI" (ah), "r" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
      } \
    else /* only bh might be a constant */ \
      __asm__ ("subs\t%1, %4, %5\n\tsbc\t%0, %2, %3" \
	       : "=r" (sh), "=&r" (sl) \
	       : "r" (ah), "rI" (bh), "r" (al), "rI" (bl) __CLOBBER_CC); \
  } while (0)
485 #if 1 || defined (__arm_m__) /* `M' series has widening multiply support */
486 #define umul_ppmm(xh, xl, a, b) \
487 __asm__ ("umull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
489 #define smul_ppmm(xh, xl, a, b) \
490 __asm__ ("smull %0,%1,%2,%3" : "=&r" (xl), "=&r" (xh) : "r" (a), "r" (b))
491 #ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __di; \
    __di = __MPN(invert_limb) (d); \
    ASSERT (__di); \
    udiv_qrnnd_preinv (q, r, n1, n0, d, __di); \
  } while (0)
497 #define UDIV_PREINV_ALWAYS 1
498 #define UDIV_NEEDS_NORMALIZATION 1
500 #endif /* LONGLONG_STANDALONE */
#else /* no umull */
#define umul_ppmm(xh, xl, a, b) \
503 __asm__ ("%@ Inlined umul_ppmm\n" \
504 " mov %|r0, %2, lsr #16\n" \
505 " mov %|r2, %3, lsr #16\n" \
506 " bic %|r1, %2, %|r0, lsl #16\n" \
507 " bic %|r2, %3, %|r2, lsl #16\n" \
508 " mul %1, %|r1, %|r2\n" \
509 " mul %|r2, %|r0, %|r2\n" \
510 " mul %|r1, %0, %|r1\n" \
511 " mul %0, %|r0, %0\n" \
512 " adds %|r1, %|r2, %|r1\n" \
513 " addcs %0, %0, #65536\n" \
514 " adds %1, %1, %|r1, lsl #16\n" \
515 " adc %0, %0, %|r1, lsr #16" \
516 : "=&r" (xh), "=r" (xl) \
520 #ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { UWtype __r; \
    (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d)); \
    (r) = __r; \
  } while (0)
extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype);
527 #define UDIV_TIME 200
#endif /* LONGLONG_STANDALONE */
#endif /* umull available or not */
530 /* This is a bizarre test, but GCC doesn't define any useful common symbol. */
531 #if defined (__ARM_ARCH_5__) || defined (__ARM_ARCH_5T__) || \
532 defined (__ARM_ARCH_5E__) || defined (__ARM_ARCH_5TE__)|| \
533 defined (__ARM_ARCH_6__) || defined (__ARM_ARCH_6J__) || \
534 defined (__ARM_ARCH_6K__) || defined (__ARM_ARCH_6Z__) || \
535 defined (__ARM_ARCH_6ZK__)|| defined (__ARM_ARCH_6T2__)|| \
536 defined (__ARM_ARCH_6M__) || defined (__ARM_ARCH_7__) || \
537 defined (__ARM_ARCH_7A__) || defined (__ARM_ARCH_7R__) || \
538 defined (__ARM_ARCH_7M__) || defined (__ARM_ARCH_7EM__)
539 #define count_leading_zeros(count, x) \
540 __asm__ ("clz\t%0, %1" : "=r" (count) : "r" (x))
#define COUNT_LEADING_ZEROS_0 32
#endif /* ARMv5 or later */
#endif /* __arm__ */
545 #if defined (__aarch64__) && W_TYPE_SIZE == 64
546 /* FIXME: Extend the immediate range for the low word by using both
547 ADDS and SUBS, since they set carry in the same way. */
548 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
549 __asm__ ("adds\t%1, %x4, %5\n\tadc\t%0, %x2, %x3" \
550 : "=r" (sh), "=&r" (sl) \
551 : "rZ" (ah), "rZ" (bh), "%r" (al), "rI" (bl) __CLOBBER_CC)
552 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
553 __asm__ ("subs\t%1, %x4, %5\n\tsbc\t%0, %x2, %x3" \
554 : "=r,r" (sh), "=&r,&r" (sl) \
555 : "rZ,rZ" (ah), "rZ,rZ" (bh), "r,Z" (al), "rI,r" (bl) __CLOBBER_CC)
#define umul_ppmm(ph, pl, m0, m1) \
  do { \
    UDItype __m0 = (m0), __m1 = (m1); \
    __asm__ ("umulh\t%0, %1, %2" : "=r" (ph) : "r" (__m0), "r" (__m1)); \
    (pl) = __m0 * __m1; \
  } while (0)
562 #define count_leading_zeros(count, x) \
563 __asm__ ("clz\t%0, %1" : "=r" (count) : "r" (x))
564 #define count_trailing_zeros(count, x) \
565 __asm__ ("rbit\t%0, %1\n\tclz\t%0, %0" : "=r" (count) : "r" (x))
566 #define COUNT_LEADING_ZEROS_0 64
567 #endif /* __aarch64__ */
569 #if defined (__clipper__) && W_TYPE_SIZE == 32
570 #define umul_ppmm(w1, w0, u, v) \
  ({union {UDItype __ll; \
	   struct {USItype __l, __h;} __i; \
	  } __x; \
  __asm__ ("mulwux %2,%0" \
	   : "=r" (__x.__ll) \
	   : "%0" ((USItype)(u)), "r" ((USItype)(v))); \
  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
578 #define smul_ppmm(w1, w0, u, v) \
  ({union {DItype __ll; \
	   struct {SItype __l, __h;} __i; \
	  } __x; \
  __asm__ ("mulwx %2,%0" \
	   : "=r" (__x.__ll) \
	   : "%0" ((SItype)(u)), "r" ((SItype)(v))); \
  (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
#define __umulsidi3(u, v) \
  ({UDItype __w; \
    __asm__ ("mulwux %2,%0" \
	     : "=r" (__w) : "%0" ((USItype)(u)), "r" ((USItype)(v))); \
    __w; })
591 #endif /* __clipper__ */
593 /* Fujitsu vector computers. */
594 #if defined (__uxp__) && W_TYPE_SIZE == 32
#define umul_ppmm(ph, pl, u, v) \
  do { \
    union {UDItype __ll; \
	   struct {USItype __h, __l;} __i; \
	  } __x; \
    __asm__ ("mult.lu %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v));\
    (ph) = __x.__i.__h; \
    (pl) = __x.__i.__l; \
  } while (0)
#define smul_ppmm(ph, pl, u, v) \
  do { \
    union {UDItype __ll; \
	   struct {USItype __h, __l;} __i; \
	  } __x; \
    __asm__ ("mult.l %1,%2,%0" : "=r" (__x.__ll) : "%r" (u), "rK" (v)); \
    (ph) = __x.__i.__h; \
    (pl) = __x.__i.__l; \
  } while (0)
#endif /* __uxp__ */
615 #if defined (__gmicro__) && W_TYPE_SIZE == 32
616 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
617 __asm__ ("add.w %5,%1\n\taddx %3,%0" \
618 : "=g" (sh), "=&g" (sl) \
619 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
620 "%1" ((USItype)(al)), "g" ((USItype)(bl)))
621 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
622 __asm__ ("sub.w %5,%1\n\tsubx %3,%0" \
623 : "=g" (sh), "=&g" (sl) \
624 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
625 "1" ((USItype)(al)), "g" ((USItype)(bl)))
626 #define umul_ppmm(ph, pl, m0, m1) \
627 __asm__ ("mulx %3,%0,%1" \
628 : "=g" (ph), "=r" (pl) \
629 : "%0" ((USItype)(m0)), "g" ((USItype)(m1)))
630 #define udiv_qrnnd(q, r, nh, nl, d) \
631 __asm__ ("divx %4,%0,%1" \
632 : "=g" (q), "=r" (r) \
633 : "1" ((USItype)(nh)), "0" ((USItype)(nl)), "g" ((USItype)(d)))
634 #define count_leading_zeros(count, x) \
635 __asm__ ("bsch/1 %1,%0" \
636 : "=g" (count) : "g" ((USItype)(x)), "0" ((USItype)0))
639 #if defined (__hppa) && W_TYPE_SIZE == 32
640 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
641 __asm__ ("add%I5 %5,%r4,%1\n\taddc %r2,%r3,%0" \
642 : "=r" (sh), "=&r" (sl) \
643 : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
644 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
645 __asm__ ("sub%I4 %4,%r5,%1\n\tsubb %r2,%r3,%0" \
646 : "=r" (sh), "=&r" (sl) \
647 : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
648 #if defined (_PA_RISC1_1)
#define umul_ppmm(wh, wl, u, v) \
  do { \
    union {UDItype __ll; \
	   struct {USItype __h, __l;} __i; \
	  } __x; \
    __asm__ ("xmpyu %1,%2,%0" : "=*f" (__x.__ll) : "*f" (u), "*f" (v)); \
    (wh) = __x.__i.__h; \
    (wl) = __x.__i.__l; \
  } while (0)
#endif /* _PA_RISC1_1 */
#define count_leading_zeros(count, x) \
  do { \
    USItype __tmp; \
    __asm__ ( \
       " ldi 16,%0\n" \
       " extru,= %1,15,16,%%r0 ; Bits 31..16 zero?\n" \
670 " extru,tr %1,15,16,%1 ; No. Shift down, skip add.\n" \
671 " ldo 16(%0),%0 ; Yes. Perform add.\n" \
672 " extru,= %1,23,8,%%r0 ; Bits 15..8 zero?\n" \
673 " extru,tr %1,23,8,%1 ; No. Shift down, skip add.\n" \
674 " ldo 8(%0),%0 ; Yes. Perform add.\n" \
675 " extru,= %1,27,4,%%r0 ; Bits 7..4 zero?\n" \
676 " extru,tr %1,27,4,%1 ; No. Shift down, skip add.\n" \
677 " ldo 4(%0),%0 ; Yes. Perform add.\n" \
678 " extru,= %1,29,2,%%r0 ; Bits 3..2 zero?\n" \
679 " extru,tr %1,29,2,%1 ; No. Shift down, skip add.\n" \
680 " ldo 2(%0),%0 ; Yes. Perform add.\n" \
681 " extru %1,30,1,%1 ; Extract bit 1.\n" \
682 " sub %0,%1,%0 ; Subtract it.\n" \
683 : "=r" (count), "=r" (__tmp) : "1" (x)); \
687 /* These macros are for ABI=2.0w. In ABI=2.0n they can't be used, since GCC
688 (3.2) puts longlong into two adjacent 32-bit registers. Presumably this
689 is just a case of no direct support for 2.0n but treating it like 1.0. */
690 #if defined (__hppa) && W_TYPE_SIZE == 64 && ! defined (_LONG_LONG_LIMB)
691 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
692 __asm__ ("add%I5 %5,%r4,%1\n\tadd,dc %r2,%r3,%0" \
693 : "=r" (sh), "=&r" (sl) \
694 : "rM" (ah), "rM" (bh), "%rM" (al), "rI" (bl))
695 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
696 __asm__ ("sub%I4 %4,%r5,%1\n\tsub,db %r2,%r3,%0" \
697 : "=r" (sh), "=&r" (sl) \
698 : "rM" (ah), "rM" (bh), "rI" (al), "rM" (bl))
701 #if (defined (__i370__) || defined (__s390__) || defined (__mvs__)) && W_TYPE_SIZE == 32
702 #if defined (__zarch__) || defined (HAVE_HOST_CPU_s390_zarch)
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do { \
/*  if (__builtin_constant_p (bl)) \
      __asm__ ("alfi\t%1,%o5\n\talcr\t%0,%3" \
	       : "=r" (sh), "=&r" (sl) \
	       : "0" (ah), "r" (bh), "%1" (al), "n" (bl) __CLOBBER_CC);\
    else \
*/  __asm__ ("alr\t%1,%5\n\talcr\t%0,%3" \
	     : "=r" (sh), "=&r" (sl) \
	     : "0" (ah), "r" (bh), "%1" (al), "r" (bl) __CLOBBER_CC); \
  } while (0)
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do { \
/*  if (__builtin_constant_p (bl)) \
      __asm__ ("slfi\t%1,%o5\n\tslbr\t%0,%3" \
	       : "=r" (sh), "=&r" (sl) \
	       : "0" (ah), "r" (bh), "1" (al), "n" (bl) __CLOBBER_CC); \
    else \
*/  __asm__ ("slr\t%1,%5\n\tslbr\t%0,%3" \
	     : "=r" (sh), "=&r" (sl) \
	     : "0" (ah), "r" (bh), "1" (al), "r" (bl) __CLOBBER_CC); \
  } while (0)
725 #if __GMP_GNUC_PREREQ (4,5)
726 #define umul_ppmm(xh, xl, m0, m1) \
728 union {UDItype __ll; \
729 struct {USItype __h, __l;} __i; \
731 __x.__ll = (UDItype) (m0) * (UDItype) (m1); \
732 (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
#if 0
/* FIXME: this fails if gcc knows about the 64-bit registers. Use only
   with a new enough processor pretending we have 32-bit registers. */
#define umul_ppmm(xh, xl, m0, m1) \
  do { \
    union {UDItype __ll; \
	   struct {USItype __h, __l;} __i; \
	  } __x; \
    __asm__ ("mlr\t%0,%2" \
	     : "=r" (__x.__ll) \
	     : "%0" (m0), "r" (m1)); \
    (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
  } while (0)
#else
#define umul_ppmm(xh, xl, m0, m1) \
  do { \
    /* When we have 64-bit regs and gcc is aware of that, we cannot simply \
       use DImode for the product, since that would be allocated to a \
       single 64-bit register, whereas mlr uses the low 32-bits of an \
       even-odd register pair.  */ \
    register USItype __r0 __asm__ ("0"); \
    register USItype __r1 __asm__ ("1") = (m0); \
    __asm__ ("mlr\t%0,%3" \
	     : "=r" (__r0), "=r" (__r1) \
	     : "r" (__r1), "r" (m1)); \
    (xh) = __r0; (xl) = __r1; \
  } while (0)
#endif
#endif
#if 0
/* FIXME: this fails if gcc knows about the 64-bit registers. Use only
   with a new enough processor pretending we have 32-bit registers. */
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { \
    union {UDItype __ll; \
	   struct {USItype __h, __l;} __i; \
	  } __x; \
    __x.__i.__h = n1; __x.__i.__l = n0; \
    __asm__ ("dlr\t%0,%2" \
	     : "=r" (__x.__ll) \
	     : "0" (__x.__ll), "r" (d)); \
    (q) = __x.__i.__l; (r) = __x.__i.__h; \
  } while (0)
#else
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { \
    register USItype __r0 __asm__ ("0") = (n1); \
    register USItype __r1 __asm__ ("1") = (n0); \
    __asm__ ("dlr\t%0,%4" \
	     : "=r" (__r0), "=r" (__r1) \
	     : "r" (__r0), "r" (__r1), "r" (d)); \
    (q) = __r1; (r) = __r0; \
  } while (0)
#endif
789 #else /* if __zarch__ */
790 /* FIXME: this fails if gcc knows about the 64-bit registers. */
#define smul_ppmm(xh, xl, m0, m1) \
  do { \
    union {DItype __ll; \
	   struct {USItype __h, __l;} __i; \
	  } __x; \
    __asm__ ("mr\t%0,%2" \
	     : "=r" (__x.__ll) \
	     : "%0" (m0), "r" (m1)); \
    (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
  } while (0)
801 /* FIXME: this fails if gcc knows about the 64-bit registers. */
#define sdiv_qrnnd(q, r, n1, n0, d) \
  do { \
    union {DItype __ll; \
	   struct {USItype __h, __l;} __i; \
	  } __x; \
    __x.__i.__h = n1; __x.__i.__l = n0; \
    __asm__ ("dr\t%0,%2" \
	     : "=r" (__x.__ll) \
	     : "0" (__x.__ll), "r" (d)); \
    (q) = __x.__i.__l; (r) = __x.__i.__h; \
  } while (0)
#endif /* if __zarch__ */
#endif /* __i370__ || __s390__ || __mvs__ */
816 #if defined (__s390x__) && W_TYPE_SIZE == 64
/* We need to cast operands with register constraints, otherwise their types
   will be assumed to be SImode by gcc. For these machines, such operations
   will insert a value into the low 32 bits, and leave the high 32 bits with
   garbage. */
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do { \
    __asm__ ("algr\t%1,%5\n\talcgr\t%0,%3" \
	     : "=r" (sh), "=&r" (sl) \
	     : "0" ((UDItype)(ah)), "r" ((UDItype)(bh)), \
	       "%1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC); \
  } while (0)
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do { \
    __asm__ ("slgr\t%1,%5\n\tslbgr\t%0,%3" \
	     : "=r" (sh), "=&r" (sl) \
	     : "0" ((UDItype)(ah)), "r" ((UDItype)(bh)), \
	       "1" ((UDItype)(al)), "r" ((UDItype)(bl)) __CLOBBER_CC); \
  } while (0)
#define umul_ppmm(xh, xl, m0, m1) \
  do { \
    union {unsigned int __attribute__ ((mode(TI))) __ll; \
	   struct {UDItype __h, __l;} __i; \
	  } __x; \
    __asm__ ("mlgr\t%0,%2" \
	     : "=r" (__x.__ll) \
	     : "%0" ((UDItype)(m0)), "r" ((UDItype)(m1))); \
    (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
  } while (0)
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { \
    union {unsigned int __attribute__ ((mode(TI))) __ll; \
	   struct {UDItype __h, __l;} __i; \
	  } __x; \
    __x.__i.__h = n1; __x.__i.__l = n0; \
    __asm__ ("dlgr\t%0,%2" \
	     : "=r" (__x.__ll) \
	     : "0" (__x.__ll), "r" ((UDItype)(d))); \
    (q) = __x.__i.__l; (r) = __x.__i.__h; \
  } while (0)
856 #if 0 /* FIXME: Enable for z10 (?) */
#define count_leading_zeros(cnt, x) \
  do { \
    union {unsigned int __attribute__ ((mode(TI))) __ll; \
	   struct {UDItype __h, __l;} __i; \
	  } __clr_cnt; \
    __asm__ ("flogr\t%0,%1" \
	     : "=r" (__clr_cnt.__ll) \
	     : "r" (x) __CLOBBER_CC); \
    (cnt) = __clr_cnt.__i.__h; \
  } while (0)
#endif
#endif /* __s390x__ */
870 #if (defined (__i386__) || defined (__i486__)) && W_TYPE_SIZE == 32
871 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
872 __asm__ ("addl %5,%k1\n\tadcl %3,%k0" \
873 : "=r" (sh), "=&r" (sl) \
874 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
875 "%1" ((USItype)(al)), "g" ((USItype)(bl)))
876 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
877 __asm__ ("subl %5,%k1\n\tsbbl %3,%k0" \
878 : "=r" (sh), "=&r" (sl) \
879 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
880 "1" ((USItype)(al)), "g" ((USItype)(bl)))
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("mull %3" \
	   : "=a" (w0), "=d" (w1) \
	   : "%0" ((USItype)(u)), "rm" ((USItype)(v)))
885 #define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
886 __asm__ ("divl %4" /* stringification in K&R C */ \
887 : "=a" (q), "=d" (r) \
888 : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "rm" ((USItype)(dx)))
890 #if HAVE_HOST_CPU_i586 || HAVE_HOST_CPU_pentium || HAVE_HOST_CPU_pentiummmx
891 /* Pentium bsrl takes between 10 and 72 cycles depending where the most
892 significant 1 bit is, hence the use of the following alternatives. bsfl
893 is slow too, between 18 and 42 depending where the least significant 1
894 bit is, so let the generic count_trailing_zeros below make use of the
895 count_leading_zeros here too. */
897 #if HAVE_HOST_CPU_pentiummmx && ! defined (LONGLONG_STANDALONE)
898 /* The following should be a fixed 14 or 15 cycles, but possibly plus an L1
899 cache miss reading from __clz_tab. For P55 it's favoured over the float
900 below so as to avoid mixing MMX and x87, since the penalty for switching
901 between the two is about 100 cycles.
903 The asm block sets __shift to -3 if the high 24 bits are clear, -2 for
904 16, -1 for 8, or 0 otherwise. This could be written equivalently as
905 follows, but as of gcc 2.95.2 it results in conditional jumps.
907 __shift = -(__n < 0x1000000);
908 __shift -= (__n < 0x10000);
909 __shift -= (__n < 0x100);
911 The middle two sbbl and cmpl's pair, and with luck something gcc
912 generates might pair with the first cmpl and the last sbbl. The "32+1"
913 constant could be folded into __clz_tab[], but it doesn't seem worth
914 making a different table just for that. */
#define count_leading_zeros(c,n) \
  do { \
    USItype __n = (n); \
    USItype __shift; \
    __asm__ ("cmpl $0x1000000, %1\n" \
	     "sbbl %0, %0\n" \
	     "cmpl $0x10000, %1\n" \
	     "sbbl $0, %0\n" \
	     "cmpl $0x100, %1\n" \
	     "sbbl $0, %0\n" \
	     : "=&r" (__shift) : "r" (__n)); \
    __shift = __shift*8 + 24 + 1; \
    (c) = 32 + 1 - __shift - __clz_tab[__n >> __shift]; \
  } while (0)
930 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
931 #define COUNT_LEADING_ZEROS_0 31 /* n==0 indistinguishable from n==1 */
933 #else /* ! pentiummmx || LONGLONG_STANDALONE */
934 /* The following should be a fixed 14 cycles or so. Some scheduling
935 opportunities should be available between the float load/store too. This
936 sort of code is used in gcc 3 for __builtin_ffs (with "n&-n") and is
937 apparently suggested by the Intel optimizing manual (don't know exactly
938 where). gcc 2.95 or up will be best for this, so the "double" is
939 correctly aligned on the stack. */
#define count_leading_zeros(c,n) \
  do { \
    union {double __d; unsigned __a[2];} __u; \
    __u.__d = (UWtype) (n); \
    (c) = 0x3FF + 31 - (__u.__a[1] >> 20); \
  } while (0)
950 #define COUNT_LEADING_ZEROS_0 (0x3FF + 31)
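/* Worked example (illustrative): n == 1 converts to a double with biased
   exponent 0x3FF, giving c == 0x3FF + 31 - 0x3FF == 31; n == 0x80000000
   converts with biased exponent 0x3FF + 31, giving c == 0.  */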
951 #endif /* pentiummx */
953 #else /* ! pentium */
955 #if __GMP_GNUC_PREREQ (3,4) /* using bsrl */
#define count_leading_zeros(count,x) count_leading_zeros_gcc_clz(count,x)
#endif
959 /* On P6, gcc prior to 3.0 generates a partial register stall for
960 __cbtmp^31, due to using "xorb $31" instead of "xorl $31", the former
961 being 1 code byte smaller. "31-__cbtmp" is a workaround, probably at the
962 cost of one extra instruction. Do this for "i386" too, since that means
964 #if ! defined (count_leading_zeros) && __GNUC__ < 3 \
965 && (HAVE_HOST_CPU_i386 \
966 || HAVE_HOST_CPU_i686 \
967 || HAVE_HOST_CPU_pentiumpro \
968 || HAVE_HOST_CPU_pentium2 \
969 || HAVE_HOST_CPU_pentium3)
#define count_leading_zeros(count, x) \
  do { \
    USItype __cbtmp; \
    ASSERT ((x) != 0); \
    __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x))); \
    (count) = 31 - __cbtmp; \
  } while (0)
977 #endif /* gcc<3 asm bsrl */
979 #ifndef count_leading_zeros
#define count_leading_zeros(count, x) \
  do { \
    USItype __cbtmp; \
    ASSERT ((x) != 0); \
    __asm__ ("bsrl %1,%0" : "=r" (__cbtmp) : "rm" ((USItype)(x))); \
    (count) = __cbtmp ^ 31; \
  } while (0)
987 #endif /* asm bsrl */
989 #if __GMP_GNUC_PREREQ (3,4) /* using bsfl */
#define count_trailing_zeros(count,x) count_trailing_zeros_gcc_ctz(count,x)
#endif
993 #ifndef count_trailing_zeros
#define count_trailing_zeros(count, x) \
  do { \
    ASSERT ((x) != 0); \
    __asm__ ("bsfl %1,%k0" : "=r" (count) : "rm" ((USItype)(x))); \
  } while (0)
999 #endif /* asm bsfl */
1001 #endif /* ! pentium */
#ifndef UMUL_TIME
#define UMUL_TIME 10
#endif
#ifndef UDIV_TIME
#define UDIV_TIME 40
#endif
#endif /* 80x86 */
1011 #if defined (__amd64__) && W_TYPE_SIZE == 64
1012 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1013 __asm__ ("addq %5,%q1\n\tadcq %3,%q0" \
1014 : "=r" (sh), "=&r" (sl) \
1015 : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)), \
1016 "%1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
1017 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1018 __asm__ ("subq %5,%q1\n\tsbbq %3,%q0" \
1019 : "=r" (sh), "=&r" (sl) \
1020 : "0" ((UDItype)(ah)), "rme" ((UDItype)(bh)), \
1021 "1" ((UDItype)(al)), "rme" ((UDItype)(bl)))
1022 #define umul_ppmm(w1, w0, u, v) \
1023 __asm__ ("mulq %3" \
1024 : "=a" (w0), "=d" (w1) \
1025 : "%0" ((UDItype)(u)), "rm" ((UDItype)(v)))
1026 #define udiv_qrnnd(q, r, n1, n0, dx) /* d renamed to dx avoiding "=d" */\
1027 __asm__ ("divq %4" /* stringification in K&R C */ \
1028 : "=a" (q), "=d" (r) \
1029 : "0" ((UDItype)(n0)), "1" ((UDItype)(n1)), "rm" ((UDItype)(dx)))
1030 /* bsrq destination must be a 64-bit register, hence UDItype for __cbtmp. */
#define count_leading_zeros(count, x) \
  do { \
    UDItype __cbtmp; \
    ASSERT ((x) != 0); \
    __asm__ ("bsrq %1,%0" : "=r" (__cbtmp) : "rm" ((UDItype)(x))); \
    (count) = __cbtmp ^ 63; \
  } while (0)
1038 /* bsfq destination must be a 64-bit register, "%q0" forces this in case
1039 count is only an int. */
#define count_trailing_zeros(count, x) \
  do { \
    ASSERT ((x) != 0); \
    __asm__ ("bsfq %1,%q0" : "=r" (count) : "rm" ((UDItype)(x))); \
  } while (0)
1045 #endif /* __amd64__ */
1047 #if defined (__i860__) && W_TYPE_SIZE == 32
1048 #define rshift_rhlc(r,h,l,c) \
1049 __asm__ ("shr %3,r0,r0\;shrd %1,%2,%0" \
1050 "=r" (r) : "r" (h), "r" (l), "rn" (c))
1053 #if defined (__i960__) && W_TYPE_SIZE == 32
1054 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1055 __asm__ ("cmpo 1,0\;addc %5,%4,%1\;addc %3,%2,%0" \
1056 : "=r" (sh), "=&r" (sl) \
1057 : "dI" (ah), "dI" (bh), "%dI" (al), "dI" (bl))
1058 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1059 __asm__ ("cmpo 0,0\;subc %5,%4,%1\;subc %3,%2,%0" \
1060 : "=r" (sh), "=&r" (sl) \
1061 : "dI" (ah), "dI" (bh), "dI" (al), "dI" (bl))
1062 #define umul_ppmm(w1, w0, u, v) \
1063 ({union {UDItype __ll; \
	   struct {USItype __l, __h;} __i; \
	  } __x; \
   __asm__ ("emul %2,%1,%0" \
1067 : "=d" (__x.__ll) : "%dI" (u), "dI" (v)); \
1068 (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
#define __umulsidi3(u, v) \
  ({UDItype __w; \
    __asm__ ("emul %2,%1,%0" : "=d" (__w) : "%dI" (u), "dI" (v)); \
    __w; })
#define udiv_qrnnd(q, r, nh, nl, d) \
  do { \
    union {UDItype __ll; \
	   struct {USItype __l, __h;} __i; \
	  } __nn, __rq; \
    __nn.__i.__h = (nh); __nn.__i.__l = (nl); \
    __asm__ ("ediv %d,%n,%0" \
	     : "=d" (__rq.__ll) : "dI" (__nn.__ll), "dI" (d)); \
    (r) = __rq.__i.__l; (q) = __rq.__i.__h; \
  } while (0)
#define count_leading_zeros(count, x) \
  do { \
    USItype __cbtmp; \
    __asm__ ("scanbit %1,%0" : "=r" (__cbtmp) : "r" (x)); \
    (count) = __cbtmp ^ 31; \
  } while (0)
1089 #define COUNT_LEADING_ZEROS_0 (-32) /* sic */
1090 #if defined (__i960mx) /* what is the proper symbol to test??? */
#define rshift_rhlc(r,h,l,c) \
  do { \
    union {UDItype __ll; \
	   struct {USItype __l, __h;} __i; \
	  } __nn; \
    __nn.__i.__h = (h); __nn.__i.__l = (l); \
    __asm__ ("shre %2,%1,%0" : "=d" (r) : "dI" (__nn.__ll), "dI" (c)); \
  } while (0)
#endif /* __i960mx */
#endif /* __i960__ */
1102 #if (defined (__mc68000__) || defined (__mc68020__) || defined(mc68020) \
1103 || defined (__m68k__) || defined (__mc5200__) || defined (__mc5206e__) \
1104 || defined (__mc5307__)) && W_TYPE_SIZE == 32
1105 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1106 __asm__ ("add%.l %5,%1\n\taddx%.l %3,%0" \
1107 : "=d" (sh), "=&d" (sl) \
1108 : "0" ((USItype)(ah)), "d" ((USItype)(bh)), \
1109 "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1110 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1111 __asm__ ("sub%.l %5,%1\n\tsubx%.l %3,%0" \
1112 : "=d" (sh), "=&d" (sl) \
1113 : "0" ((USItype)(ah)), "d" ((USItype)(bh)), \
1114 "1" ((USItype)(al)), "g" ((USItype)(bl)))
1115 /* The '020, '030, '040 and CPU32 have 32x32->64 and 64/32->32q-32r. */
1116 #if defined (__mc68020__) || defined(mc68020) \
1117 || defined (__mc68030__) || defined (mc68030) \
1118 || defined (__mc68040__) || defined (mc68040) \
1119 || defined (__mcpu32__) || defined (mcpu32) \
1120 || defined (__NeXT__)
1121 #define umul_ppmm(w1, w0, u, v) \
1122 __asm__ ("mulu%.l %3,%1:%0" \
1123 : "=d" (w0), "=d" (w1) \
1124 : "%0" ((USItype)(u)), "dmi" ((USItype)(v)))
1125 #define UMUL_TIME 45
1126 #define udiv_qrnnd(q, r, n1, n0, d) \
1127 __asm__ ("divu%.l %4,%1:%0" \
1128 : "=d" (q), "=d" (r) \
1129 : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
1130 #define UDIV_TIME 90
1131 #define sdiv_qrnnd(q, r, n1, n0, d) \
1132 __asm__ ("divs%.l %4,%1:%0" \
1133 : "=d" (q), "=d" (r) \
1134 : "0" ((USItype)(n0)), "1" ((USItype)(n1)), "dmi" ((USItype)(d)))
1135 #else /* for other 68k family members use 16x16->32 multiplication */
1136 #define umul_ppmm(xh, xl, a, b) \
1137 do { USItype __umul_tmp1, __umul_tmp2; \
1138 __asm__ ("| Inlined umul_ppmm\n" \
1139 " move%.l %5,%3\n" \
1140 " move%.l %2,%0\n" \
1141 " move%.w %3,%1\n" \
1144 " mulu%.w %2,%1\n" \
1145 " mulu%.w %3,%0\n" \
1146 " mulu%.w %2,%3\n" \
1148 " mulu%.w %5,%2\n" \
1151 " add%.l %#0x10000,%0\n" \
1152 "1: move%.l %2,%3\n" \
1158 " addx%.l %2,%0\n" \
1159 " | End inlined umul_ppmm" \
1160 : "=&d" (xh), "=&d" (xl), \
1161 "=d" (__umul_tmp1), "=&d" (__umul_tmp2) \
1162 : "%2" ((USItype)(a)), "d" ((USItype)(b))); \
1164 #define UMUL_TIME 100
1165 #define UDIV_TIME 400
1166 #endif /* not mc68020 */
1167 /* The '020, '030, '040 and '060 have bitfield insns.
1168 GCC 3.4 defines __mc68020__ when in CPU32 mode, check for __mcpu32__ to
1169 exclude bfffo on that chip (bitfield insns not available). */
1170 #if (defined (__mc68020__) || defined (mc68020) \
1171 || defined (__mc68030__) || defined (mc68030) \
1172 || defined (__mc68040__) || defined (mc68040) \
1173 || defined (__mc68060__) || defined (mc68060) \
1174 || defined (__NeXT__)) \
1175 && ! defined (__mcpu32__)
#define count_leading_zeros(count, x) \
  __asm__ ("bfffo %1{%b2:%b2},%0" \
	   : "=d" (count) \
	   : "od" ((USItype) (x)), "n" (0))
#define COUNT_LEADING_ZEROS_0 32
#endif /* bitfield insns */
1182 #endif /* mc68000 */
1184 #if defined (__m88000__) && W_TYPE_SIZE == 32
1185 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1186 __asm__ ("addu.co %1,%r4,%r5\n\taddu.ci %0,%r2,%r3" \
1187 : "=r" (sh), "=&r" (sl) \
1188 : "rJ" (ah), "rJ" (bh), "%rJ" (al), "rJ" (bl))
1189 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1190 __asm__ ("subu.co %1,%r4,%r5\n\tsubu.ci %0,%r2,%r3" \
1191 : "=r" (sh), "=&r" (sl) \
1192 : "rJ" (ah), "rJ" (bh), "rJ" (al), "rJ" (bl))
#define count_leading_zeros(count, x) \
  do { \
    USItype __cbtmp; \
    __asm__ ("ff1 %0,%1" : "=r" (__cbtmp) : "r" (x)); \
    (count) = __cbtmp ^ 31; \
  } while (0)
1199 #define COUNT_LEADING_ZEROS_0 63 /* sic */
1200 #if defined (__m88110__)
#define umul_ppmm(wh, wl, u, v) \
  do { \
    union {UDItype __ll; \
	   struct {USItype __h, __l;} __i; \
	  } __x; \
    __asm__ ("mulu.d %0,%1,%2" : "=r" (__x.__ll) : "r" (u), "r" (v)); \
    (wh) = __x.__i.__h; \
    (wl) = __x.__i.__l; \
  } while (0)
1210 #define udiv_qrnnd(q, r, n1, n0, d) \
  ({union {UDItype __ll; \
	   struct {USItype __h, __l;} __i; \
	  } __x, __q; \
  __x.__i.__h = (n1); __x.__i.__l = (n0); \
  __asm__ ("divu.d %0,%1,%2" \
	   : "=r" (__q.__ll) : "r" (__x.__ll), "r" (d)); \
  (r) = (n0) - __q.__i.__l * (d); (q) = __q.__i.__l; })
#define UDIV_TIME 25
#else
#define UMUL_TIME 17
1222 #define UDIV_TIME 150
1223 #endif /* __m88110__ */
1224 #endif /* __m88000__ */
1226 #if defined (__mips) && W_TYPE_SIZE == 32
1227 #if __GMP_GNUC_PREREQ (4,4)
#define umul_ppmm(w1, w0, u, v) \
  do { \
    UDItype __ll = (UDItype)(u) * (v); \
    w1 = __ll >> 32; \
    w0 = __ll; \
  } while (0)
#endif
1235 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7)
1236 #define umul_ppmm(w1, w0, u, v) \
1237 __asm__ ("multu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
#endif
#if !defined (umul_ppmm)
1240 #define umul_ppmm(w1, w0, u, v) \
1241 __asm__ ("multu %2,%3\n\tmflo %0\n\tmfhi %1" \
1242 : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
#endif
#define UMUL_TIME 10
#define UDIV_TIME 100
#endif /* __mips */
1248 #if (defined (__mips) && __mips >= 3) && W_TYPE_SIZE == 64
1249 #if __GMP_GNUC_PREREQ (4,4)
#define umul_ppmm(w1, w0, u, v) \
  do { \
    typedef unsigned int __ll_UTItype __attribute__((mode(TI))); \
    __ll_UTItype __ll = (__ll_UTItype)(u) * (v); \
    w1 = __ll >> 64; \
    w0 = __ll; \
  } while (0)
#endif
1258 #if !defined (umul_ppmm) && __GMP_GNUC_PREREQ (2,7)
1259 #define umul_ppmm(w1, w0, u, v) \
1260 __asm__ ("dmultu %2,%3" : "=l" (w0), "=h" (w1) : "d" (u), "d" (v))
#endif
#if !defined (umul_ppmm)
1263 #define umul_ppmm(w1, w0, u, v) \
1264 __asm__ ("dmultu %2,%3\n\tmflo %0\n\tmfhi %1" \
1265 : "=d" (w0), "=d" (w1) : "d" (u), "d" (v))
#endif
#define UMUL_TIME 20
#define UDIV_TIME 140
#endif /* __mips >= 3 */
1271 #if defined (__mmix__) && W_TYPE_SIZE == 64
1272 #define umul_ppmm(w1, w0, u, v) \
1273 __asm__ ("MULU %0,%2,%3" : "=r" (w0), "=z" (w1) : "r" (u), "r" (v))
1276 #if defined (__ns32000__) && W_TYPE_SIZE == 32
1277 #define umul_ppmm(w1, w0, u, v) \
1278 ({union {UDItype __ll; \
	   struct {USItype __l, __h;} __i; \
	  } __x; \
   __asm__ ("meid %2,%0" \
	    : "=g" (__x.__ll) \
	    : "%0" ((USItype)(u)), "g" ((USItype)(v))); \
1284 (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
#define __umulsidi3(u, v) \
  ({UDItype __w; \
    __asm__ ("meid %2,%0" \
	     : "=g" (__w) \
	     : "%0" ((USItype)(u)), "g" ((USItype)(v))); \
    __w; })
1291 #define udiv_qrnnd(q, r, n1, n0, d) \
1292 ({union {UDItype __ll; \
	   struct {USItype __l, __h;} __i; \
	  } __x; \
  __x.__i.__h = (n1); __x.__i.__l = (n0); \
  __asm__ ("deid %2,%0" \
	   : "=g" (__x.__ll) \
	   : "0" (__x.__ll), "g" ((USItype)(d))); \
1299 (r) = __x.__i.__l; (q) = __x.__i.__h; })
#define count_trailing_zeros(count,x) \
  do { \
    __asm__ ("ffsd %2,%0" \
	     : "=r" (count) \
	     : "0" ((USItype) 0), "r" ((USItype) (x))); \
  } while (0)
1306 #endif /* __ns32000__ */
1308 /* In the past we had a block of various #defines tested
1314 PPC - old gcc, GNU/Linux, SysV
1315 The plain PPC test was not good for vxWorks, since PPC is defined on all
1316 CPUs there (eg. m68k too), as a constant one is expected to compare
1319 At any rate, this was pretty unattractive and a bit fragile. The use of
1320 HAVE_HOST_CPU_FAMILY is designed to cut through it all and be sure of
1321 getting the desired effect.
1323 ENHANCE-ME: We should test _IBMR2 here when we add assembly support for
the system vendor compilers. (Is that vendor compilers with inline asm,
or what?) */
1327 #if (HAVE_HOST_CPU_FAMILY_power || HAVE_HOST_CPU_FAMILY_powerpc) \
1328 && W_TYPE_SIZE == 32
1329 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1331 if (__builtin_constant_p (bh) && (bh) == 0) \
1332 __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2" \
1333 : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl)); \
1334 else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0) \
1335 __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2" \
1336 : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl)); \
1338 __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3" \
1339 : "=r" (sh), "=&r" (sl) \
1340 : "r" (ah), "r" (bh), "%r" (al), "rI" (bl)); \
1342 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1344 if (__builtin_constant_p (ah) && (ah) == 0) \
1345 __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2" \
1346 : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
1347 else if (__builtin_constant_p (ah) && (ah) == ~(USItype) 0) \
1348 __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2" \
1349 : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl));\
1350 else if (__builtin_constant_p (bh) && (bh) == 0) \
1351 __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2" \
1352 : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
1353 else if (__builtin_constant_p (bh) && (bh) == ~(USItype) 0) \
1354 __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2" \
1355 : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl));\
1357 __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2" \
1358 : "=r" (sh), "=&r" (sl) \
1359 : "r" (ah), "r" (bh), "rI" (al), "r" (bl)); \
1361 #define count_leading_zeros(count, x) \
1362 __asm__ ("cntlzw %0,%1" : "=r" (count) : "r" (x))
1363 #define COUNT_LEADING_ZEROS_0 32
1364 #if HAVE_HOST_CPU_FAMILY_powerpc
1365 #if __GMP_GNUC_PREREQ (4,4)
#define umul_ppmm(w1, w0, u, v) \
  do { \
    UDItype __ll = (UDItype)(u) * (v); \
    w1 = __ll >> 32; \
    w0 = __ll; \
  } while (0)
#endif
1373 #if !defined (umul_ppmm)
#define umul_ppmm(ph, pl, m0, m1) \
  do { \
    USItype __m0 = (m0), __m1 = (m1); \
    __asm__ ("mulhwu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1)); \
    (pl) = __m0 * __m1; \
  } while (0)
#endif
1381 #define UMUL_TIME 15
#define smul_ppmm(ph, pl, m0, m1) \
  do { \
    SItype __m0 = (m0), __m1 = (m1); \
    __asm__ ("mulhw %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1)); \
    (pl) = __m0 * __m1; \
  } while (0)
1388 #define SMUL_TIME 14
1389 #define UDIV_TIME 120
#else /* POWER (not powerpc) */
#define smul_ppmm(xh, xl, m0, m1) \
1393 __asm__ ("mul %0,%2,%3" : "=r" (xh), "=q" (xl) : "r" (m0), "r" (m1))
1395 #define sdiv_qrnnd(q, r, nh, nl, d) \
1396 __asm__ ("div %0,%2,%4" : "=r" (q), "=q" (r) : "r" (nh), "1" (nl), "r" (d))
#define UDIV_TIME 100
#endif
#endif /* 32-bit POWER architecture variants. */
1401 /* We should test _IBMR2 here when we add assembly support for the system
1402 vendor compilers. */
1403 #if HAVE_HOST_CPU_FAMILY_powerpc && W_TYPE_SIZE == 64
1404 #if !defined (_LONG_LONG_LIMB)
1405 /* _LONG_LONG_LIMB is ABI=mode32 where adde operates on 32-bit values. So
1406 use adde etc only when not _LONG_LONG_LIMB. */
1407 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1409 if (__builtin_constant_p (bh) && (bh) == 0) \
1410 __asm__ ("add%I4c %1,%3,%4\n\taddze %0,%2" \
1411 : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl)); \
1412 else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \
1413 __asm__ ("add%I4c %1,%3,%4\n\taddme %0,%2" \
1414 : "=r" (sh), "=&r" (sl) : "r" (ah), "%r" (al), "rI" (bl)); \
1416 __asm__ ("add%I5c %1,%4,%5\n\tadde %0,%2,%3" \
1417 : "=r" (sh), "=&r" (sl) \
1418 : "r" (ah), "r" (bh), "%r" (al), "rI" (bl)); \
1420 /* We use "*rI" for the constant operand here, since with just "I", gcc barfs.
1421 This might seem strange, but gcc folds away the dead code late. */
1422 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1424 if (__builtin_constant_p (bl) && bl > -0x8000 && bl <= 0x8000) { \
1425 if (__builtin_constant_p (ah) && (ah) == 0) \
1426 __asm__ ("addic %1,%3,%4\n\tsubfze %0,%2" \
1427 : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "*rI" (-bl)); \
1428 else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0) \
1429 __asm__ ("addic %1,%3,%4\n\tsubfme %0,%2" \
1430 : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "*rI" (-bl)); \
1431 else if (__builtin_constant_p (bh) && (bh) == 0) \
1432 __asm__ ("addic %1,%3,%4\n\taddme %0,%2" \
1433 : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "*rI" (-bl)); \
1434 else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \
1435 __asm__ ("addic %1,%3,%4\n\taddze %0,%2" \
1436 : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "*rI" (-bl)); \
1438 __asm__ ("addic %1,%4,%5\n\tsubfe %0,%3,%2" \
1439 : "=r" (sh), "=&r" (sl) \
1440 : "r" (ah), "r" (bh), "rI" (al), "*rI" (-bl)); \
1442 if (__builtin_constant_p (ah) && (ah) == 0) \
1443 __asm__ ("subf%I3c %1,%4,%3\n\tsubfze %0,%2" \
1444 : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl)); \
1445 else if (__builtin_constant_p (ah) && (ah) == ~(UDItype) 0) \
1446 __asm__ ("subf%I3c %1,%4,%3\n\tsubfme %0,%2" \
1447 : "=r" (sh), "=&r" (sl) : "r" (bh), "rI" (al), "r" (bl)); \
1448 else if (__builtin_constant_p (bh) && (bh) == 0) \
1449 __asm__ ("subf%I3c %1,%4,%3\n\taddme %0,%2" \
1450 : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl)); \
1451 else if (__builtin_constant_p (bh) && (bh) == ~(UDItype) 0) \
1452 __asm__ ("subf%I3c %1,%4,%3\n\taddze %0,%2" \
1453 : "=r" (sh), "=&r" (sl) : "r" (ah), "rI" (al), "r" (bl)); \
1455 __asm__ ("subf%I4c %1,%5,%4\n\tsubfe %0,%3,%2" \
1456 : "=r" (sh), "=&r" (sl) \
1457 : "r" (ah), "r" (bh), "rI" (al), "r" (bl)); \
1460 #endif /* ! _LONG_LONG_LIMB */
1461 #define count_leading_zeros(count, x) \
1462 __asm__ ("cntlzd %0,%1" : "=r" (count) : "r" (x))
1463 #define COUNT_LEADING_ZEROS_0 64
1464 #if 0 && __GMP_GNUC_PREREQ (4,4) /* Disable, this results in libcalls! */
#define umul_ppmm(w1, w0, u, v) \
  do { \
    typedef unsigned int __ll_UTItype __attribute__((mode(TI))); \
    __ll_UTItype __ll = (__ll_UTItype)(u) * (v); \
    w1 = __ll >> 64; \
    w0 = __ll; \
  } while (0)
#endif
1473 #if !defined (umul_ppmm)
#define umul_ppmm(ph, pl, m0, m1) \
  do { \
    UDItype __m0 = (m0), __m1 = (m1); \
    __asm__ ("mulhdu %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1)); \
    (pl) = __m0 * __m1; \
  } while (0)
#endif
1481 #define UMUL_TIME 15
#define smul_ppmm(ph, pl, m0, m1) \
  do { \
    DItype __m0 = (m0), __m1 = (m1); \
    __asm__ ("mulhd %0,%1,%2" : "=r" (ph) : "%r" (__m0), "r" (__m1)); \
    (pl) = __m0 * __m1; \
  } while (0)
1488 #define SMUL_TIME 14 /* ??? */
1489 #define UDIV_TIME 120 /* ??? */
1490 #endif /* 64-bit PowerPC. */
1492 #if defined (__pyr__) && W_TYPE_SIZE == 32
1493 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1494 __asm__ ("addw %5,%1\n\taddwc %3,%0" \
1495 : "=r" (sh), "=&r" (sl) \
1496 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
1497 "%1" ((USItype)(al)), "g" ((USItype)(bl)))
1498 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1499 __asm__ ("subw %5,%1\n\tsubwb %3,%0" \
1500 : "=r" (sh), "=&r" (sl) \
1501 : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
1502 "1" ((USItype)(al)), "g" ((USItype)(bl)))
1503 /* This insn works on Pyramids with AP, XP, or MI CPUs, but not with SP. */
1504 #define umul_ppmm(w1, w0, u, v) \
1505 ({union {UDItype __ll; \
	   struct {USItype __h, __l;} __i; \
	  } __x; \
   __asm__ ("movw %1,%R0\n\tuemul %2,%0" \
1509 : "=&r" (__x.__ll) \
1510 : "g" ((USItype) (u)), "g" ((USItype)(v))); \
1511 (w1) = __x.__i.__h; (w0) = __x.__i.__l;})
1512 #endif /* __pyr__ */
1514 #if defined (__ibm032__) /* RT/ROMP */ && W_TYPE_SIZE == 32
1515 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1516 __asm__ ("a %1,%5\n\tae %0,%3" \
1517 : "=r" (sh), "=&r" (sl) \
1518 : "0" ((USItype)(ah)), "r" ((USItype)(bh)), \
1519 "%1" ((USItype)(al)), "r" ((USItype)(bl)))
1520 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1521 __asm__ ("s %1,%5\n\tse %0,%3" \
1522 : "=r" (sh), "=&r" (sl) \
1523 : "0" ((USItype)(ah)), "r" ((USItype)(bh)), \
1524 "1" ((USItype)(al)), "r" ((USItype)(bl)))
1525 #define smul_ppmm(ph, pl, m0, m1) \
1547 : "=r" (ph), "=r" (pl) \
1548 : "%r" ((USItype)(m0)), "r" ((USItype)(m1)) \
1550 #define UMUL_TIME 20
1551 #define UDIV_TIME 200
#define count_leading_zeros(count, x) \
  do { \
    if ((x) >= 0x10000) \
      __asm__ ("clz %0,%1" \
	       : "=r" (count) : "r" ((USItype)(x) >> 16)); \
    else \
      { \
	__asm__ ("clz %0,%1" \
		 : "=r" (count) : "r" ((USItype)(x))); \
	(count) += 16; \
      } \
  } while (0)
1564 #endif /* RT/ROMP */
1566 #if (defined (__SH2__) || defined (__SH3__) || defined (__SH4__)) && W_TYPE_SIZE == 32
1567 #define umul_ppmm(w1, w0, u, v) \
1568 __asm__ ("dmulu.l %2,%3\n\tsts macl,%1\n\tsts mach,%0" \
1569 : "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "macl", "mach")
1573 #if defined (__sparc__) && W_TYPE_SIZE == 32
1574 #define add_ssaaaa(sh, sl, ah, al, bh, bl) \
1575 __asm__ ("addcc %r4,%5,%1\n\taddx %r2,%3,%0" \
1576 : "=r" (sh), "=&r" (sl) \
1577 : "rJ" (ah), "rI" (bh),"%rJ" (al), "rI" (bl) \
1579 #define sub_ddmmss(sh, sl, ah, al, bh, bl) \
1580 __asm__ ("subcc %r4,%5,%1\n\tsubx %r2,%3,%0" \
1581 : "=r" (sh), "=&r" (sl) \
1582 : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl) \
1584 /* FIXME: When gcc -mcpu=v9 is used on solaris, gcc/config/sol2-sld-64.h
1585 doesn't define anything to indicate that to us, it only sets __sparcv8. */
1586 #if defined (__sparc_v9__) || defined (__sparcv9)
1587 /* Perhaps we should use floating-point operations here? */
#if 0
/* Triggers a bug making mpz/tests/t-gcd.c fail.
   Perhaps we simply need to explicitly zero-extend the inputs? */
1591 #define umul_ppmm(w1, w0, u, v) \
1592 __asm__ ("mulx %2,%3,%%g1; srl %%g1,0,%1; srlx %%g1,32,%0" : \
1593 "=r" (w1), "=r" (w0) : "r" (u), "r" (v) : "g1")
#else
/* Use v8 umul until above bug is fixed. */
1596 #define umul_ppmm(w1, w0, u, v) \
1597 __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
#endif
/* Use a plain v8 divide for v9. */
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { \
    USItype __q; \
    __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0" \
	     : "=r" (__q) : "r" (n1), "r" (n0), "r" (d)); \
    (r) = (n0) - __q * (d); \
    (q) = __q; \
  } while (0)
#else /* ! __sparc_v9__ */
1609 #if defined (__sparc_v8__) /* gcc normal */ \
1610 || defined (__sparcv8) /* gcc solaris */ \
1611 || HAVE_HOST_CPU_supersparc
1612 /* Don't match immediate range because, 1) it is not often useful,
1613 2) the 'I' flag thinks of the range as a 13 bit signed interval,
1614 while we want to match a 13 bit interval, sign extended to 32 bits,
1615 but INTERPRETED AS UNSIGNED. */
1616 #define umul_ppmm(w1, w0, u, v) \
1617 __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1620 #if HAVE_HOST_CPU_supersparc
1621 #define UDIV_TIME 60 /* SuperSPARC timing */
#else
/* Don't use this on SuperSPARC because its udiv only handles 53 bit
   dividends and will trap to the kernel for the rest. */
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { \
    USItype __q; \
    __asm__ ("mov %1,%%y;nop;nop;nop;udiv %2,%3,%0" \
	     : "=r" (__q) : "r" (n1), "r" (n0), "r" (d)); \
    (r) = (n0) - __q * (d); \
    (q) = __q; \
  } while (0)
1633 #define UDIV_TIME 25
1634 #endif /* HAVE_HOST_CPU_supersparc */
1636 #else /* ! __sparc_v8__ */
1637 #if defined (__sparclite__)
1638 /* This has hardware multiply but not divide. It also has two additional
1639 instructions scan (ffs from high bit) and divscc. */
1640 #define umul_ppmm(w1, w0, u, v) \
1641 __asm__ ("umul %2,%3,%1;rd %%y,%0" : "=r" (w1), "=r" (w0) : "r" (u), "r" (v))
1643 #define udiv_qrnnd(q, r, n1, n0, d) \
1644 __asm__ ("! Inlined udiv_qrnnd\n" \
1645 " wr %%g0,%2,%%y ! Not a delayed write for sparclite\n" \
1647 " divscc %3,%4,%%g1\n" \
1648 " divscc %%g1,%4,%%g1\n" \
1649 " divscc %%g1,%4,%%g1\n" \
1650 " divscc %%g1,%4,%%g1\n" \
1651 " divscc %%g1,%4,%%g1\n" \
1652 " divscc %%g1,%4,%%g1\n" \
1653 " divscc %%g1,%4,%%g1\n" \
1654 " divscc %%g1,%4,%%g1\n" \
1655 " divscc %%g1,%4,%%g1\n" \
1656 " divscc %%g1,%4,%%g1\n" \
1657 " divscc %%g1,%4,%%g1\n" \
1658 " divscc %%g1,%4,%%g1\n" \
1659 " divscc %%g1,%4,%%g1\n" \
1660 " divscc %%g1,%4,%%g1\n" \
1661 " divscc %%g1,%4,%%g1\n" \
1662 " divscc %%g1,%4,%%g1\n" \
1663 " divscc %%g1,%4,%%g1\n" \
1664 " divscc %%g1,%4,%%g1\n" \
1665 " divscc %%g1,%4,%%g1\n" \
1666 " divscc %%g1,%4,%%g1\n" \
1667 " divscc %%g1,%4,%%g1\n" \
1668 " divscc %%g1,%4,%%g1\n" \
1669 " divscc %%g1,%4,%%g1\n" \
1670 " divscc %%g1,%4,%%g1\n" \
1671 " divscc %%g1,%4,%%g1\n" \
1672 " divscc %%g1,%4,%%g1\n" \
1673 " divscc %%g1,%4,%%g1\n" \
1674 " divscc %%g1,%4,%%g1\n" \
1675 " divscc %%g1,%4,%%g1\n" \
1676 " divscc %%g1,%4,%%g1\n" \
1677 " divscc %%g1,%4,%%g1\n" \
1678 " divscc %%g1,%4,%0\n" \
1682 "1: ! End of inline udiv_qrnnd" \
1683 : "=r" (q), "=r" (r) : "r" (n1), "r" (n0), "rI" (d) \
1684 : "%g1" __AND_CLOBBER_CC)
1685 #define UDIV_TIME 37
1686 #define count_leading_zeros(count, x) \
1687 __asm__ ("scan %1,1,%0" : "=r" (count) : "r" (x))
1688 /* Early sparclites return 63 for an argument of 0, but they warn that future
implementations might change this. Therefore, leave COUNT_LEADING_ZEROS_0
undefined. */
1691 #endif /* __sparclite__ */
1692 #endif /* __sparc_v8__ */
1693 #endif /* __sparc_v9__ */
1694 /* Default to sparc v7 versions of umul_ppmm and udiv_qrnnd. */
#ifndef umul_ppmm
#define umul_ppmm(w1, w0, u, v) \
  __asm__ ("! Inlined umul_ppmm\n" \
	   " wr %%g0,%2,%%y ! SPARC has 0-3 delay insn after a wr\n" \
	   " sra %3,31,%%g2 ! Don't move this insn\n" \
	   " and %2,%%g2,%%g2 ! Don't move this insn\n" \
	   " andcc %%g0,0,%%g1 ! Don't move this insn\n" \
	   " mulscc %%g1,%3,%%g1\n" \
	   " mulscc %%g1,%3,%%g1\n" \
	   " mulscc %%g1,%3,%%g1\n" \
	   " mulscc %%g1,%3,%%g1\n" \
	   " mulscc %%g1,%3,%%g1\n" \
	   " mulscc %%g1,%3,%%g1\n" \
	   " mulscc %%g1,%3,%%g1\n" \
	   " mulscc %%g1,%3,%%g1\n" \
	   " mulscc %%g1,%3,%%g1\n" \
	   " mulscc %%g1,%3,%%g1\n" \
	   " mulscc %%g1,%3,%%g1\n" \
	   " mulscc %%g1,%3,%%g1\n" \
	   " mulscc %%g1,%3,%%g1\n" \
	   " mulscc %%g1,%3,%%g1\n" \
	   " mulscc %%g1,%3,%%g1\n" \
	   " mulscc %%g1,%3,%%g1\n" \
	   " mulscc %%g1,%3,%%g1\n" \
	   " mulscc %%g1,%3,%%g1\n" \
	   " mulscc %%g1,%3,%%g1\n" \
	   " mulscc %%g1,%3,%%g1\n" \
	   " mulscc %%g1,%3,%%g1\n" \
	   " mulscc %%g1,%3,%%g1\n" \
	   " mulscc %%g1,%3,%%g1\n" \
	   " mulscc %%g1,%3,%%g1\n" \
	   " mulscc %%g1,%3,%%g1\n" \
	   " mulscc %%g1,%3,%%g1\n" \
	   " mulscc %%g1,%3,%%g1\n" \
	   " mulscc %%g1,%3,%%g1\n" \
	   " mulscc %%g1,%3,%%g1\n" \
	   " mulscc %%g1,%3,%%g1\n" \
	   " mulscc %%g1,%3,%%g1\n" \
	   " mulscc %%g1,%3,%%g1\n" \
	   " mulscc %%g1,0,%%g1\n" \
	   " add %%g1,%%g2,%0\n" \
	   " rd %%y,%1" \
	   : "=r" (w1), "=r" (w0) : "%rI" (u), "r" (v) \
	   : "%g1", "%g2" __AND_CLOBBER_CC)
#define UMUL_TIME 39		/* 39 instructions */
#endif
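/* Note (added illustration): the mulscc chain computes the product with %3
   (v) read as a signed multiplier.  The "sra/and" preamble builds
   g2 = (v < 0 ? u : 0), and adding g2 into the high word converts the
   signed result to the unsigned product, since
   u * v_unsigned = u * v_signed + 2^32 * u * [v < 0].  */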
#ifndef udiv_qrnnd
#ifndef LONGLONG_STANDALONE
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { \
    UWtype __r; \
    (q) = __MPN(udiv_qrnnd) (&__r, (n1), (n0), (d)); \
    (r) = __r; \
  } while (0)
extern UWtype __MPN(udiv_qrnnd) (UWtype *, UWtype, UWtype, UWtype);
#ifndef UDIV_TIME
#define UDIV_TIME 140
#endif
#endif /* LONGLONG_STANDALONE */
#endif /* udiv_qrnnd */
#endif /* __sparc__ */
#if defined (__sparc__) && W_TYPE_SIZE == 64
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ( \
	   "addcc %r4,%5,%1\n" \
	   " addccc %r6,%7,%%g0\n" \
	   " addc %r2,%3,%0" \
	   : "=r" (sh), "=&r" (sl) \
	   : "rJ" (ah), "rI" (bh), "%rJ" (al), "rI" (bl), \
	     "%rJ" ((al) >> 32), "rI" ((bl) >> 32) \
	     __CLOBBER_CC)
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ( \
	   "subcc %r4,%5,%1\n" \
	   " subccc %r6,%7,%%g0\n" \
	   " subc %r2,%3,%0" \
	   : "=r" (sh), "=&r" (sl) \
	   : "rJ" (ah), "rI" (bh), "rJ" (al), "rI" (bl), \
	     "rJ" ((al) >> 32), "rI" ((bl) >> 32) \
	     __CLOBBER_CC)
#if __VIS__ >= 0x300
#undef add_ssaaaa
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ( \
	   "addcc %r4, %5, %1\n" \
	   " addxc %r2, %r3, %0" \
	   : "=r" (sh), "=&r" (sl) \
	   : "rJ" (ah), "rJ" (bh), "%rJ" (al), "rI" (bl) __CLOBBER_CC)
#define umul_ppmm(ph, pl, m0, m1) \
  do { \
    UDItype __m0 = (m0), __m1 = (m1); \
    (pl) = __m0 * __m1; \
    __asm__ ("umulxhi\t%2, %1, %0" \
	     : "=r" (ph) \
	     : "%r" (__m0), "r" (__m1)); \
  } while (0)
#define count_leading_zeros(count, x) \
  __asm__ ("lzd\t%1,%0" : "=r" (count) : "r" (x))
#endif /* __VIS__ >= 0x300 */
#endif /* __sparc__ && W_TYPE_SIZE == 64 */
#if (defined (__vax) || defined (__vax__)) && W_TYPE_SIZE == 32
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("addl2 %5,%1\n\tadwc %3,%0" \
	   : "=g" (sh), "=&g" (sl) \
	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
	     "%1" ((USItype)(al)), "g" ((USItype)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("subl2 %5,%1\n\tsbwc %3,%0" \
	   : "=g" (sh), "=&g" (sl) \
	   : "0" ((USItype)(ah)), "g" ((USItype)(bh)), \
	     "1" ((USItype)(al)), "g" ((USItype)(bl)))
#define smul_ppmm(xh, xl, m0, m1) \
  do { \
    union {UDItype __ll; \
	   struct {USItype __l, __h;} __i; \
	  } __x; \
    USItype __m0 = (m0), __m1 = (m1); \
    __asm__ ("emul %1,%2,$0,%0" \
	     : "=g" (__x.__ll) : "g" (__m0), "g" (__m1)); \
    (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
  } while (0)
#define sdiv_qrnnd(q, r, n1, n0, d) \
  do { \
    union {DItype __ll; \
	   struct {SItype __l, __h;} __i; \
	  } __x; \
    __x.__i.__h = n1; __x.__i.__l = n0; \
    __asm__ ("ediv %3,%2,%0,%1" \
	     : "=g" (q), "=g" (r) : "g" (__x.__ll), "g" (d)); \
  } while (0)

/* FIXME: This instruction appears to be unimplemented on some systems (vax
   8800 maybe).  */
#define count_trailing_zeros(count,x) \
  do { \
    __asm__ ("ffs 0, 31, %1, %0" \
	     : "=g" (count) \
	     : "g" ((USItype) (x))); \
  } while (0)
#endif /* vax */
#if defined (__z8000__) && W_TYPE_SIZE == 16
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  __asm__ ("add %H1,%H5\n\tadc %H0,%H3" \
	   : "=r" (sh), "=&r" (sl) \
	   : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)), \
	     "%1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  __asm__ ("sub %H1,%H5\n\tsbc %H0,%H3" \
	   : "=r" (sh), "=&r" (sl) \
	   : "0" ((unsigned int)(ah)), "r" ((unsigned int)(bh)), \
	     "1" ((unsigned int)(al)), "rQR" ((unsigned int)(bl)))
#define umul_ppmm(xh, xl, m0, m1) \
  do { \
    union {long int __ll; \
	   struct {unsigned int __h, __l;} __i; \
	  } __x; \
    unsigned int __m0 = (m0), __m1 = (m1); \
    __asm__ ("mult %S0,%H3" \
	     : "=r" (__x.__i.__h), "=r" (__x.__i.__l) \
	     : "%1" (m0), "rQR" (m1)); \
    (xh) = __x.__i.__h; (xl) = __x.__i.__l; \
    (xh) += ((((signed int) __m0 >> 15) & __m1) \
	     + (((signed int) __m1 >> 15) & __m0)); \
  } while (0)
#endif /* __z8000__ */

#endif /* __GNUC__ */

#endif /* NO_ASM */
/* FIXME: "sidi" here is highly doubtful, should sometimes be "diti".  */
#if !defined (umul_ppmm) && defined (__umulsidi3)
#define umul_ppmm(ph, pl, m0, m1) \
  do { \
    UDWtype __ll = __umulsidi3 (m0, m1); \
    ph = (UWtype) (__ll >> W_TYPE_SIZE); \
    pl = (UWtype) __ll; \
  } while (0)
#endif

#if !defined (__umulsidi3)
#define __umulsidi3(u, v) \
  ({UWtype __hi, __lo; \
    umul_ppmm (__hi, __lo, u, v); \
    ((UDWtype) __hi << W_TYPE_SIZE) | __lo; })
#endif
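/* Illustration (not part of the original header): with W_TYPE_SIZE == 32
   and UDWtype a 64-bit type, either form behaves as a plain widening
   multiply.  A minimal usage sketch:

     UWtype hi, lo;
     umul_ppmm (hi, lo, (UWtype) 0xffffffff, (UWtype) 0xffffffff);
     // hi == 0xfffffffe, lo == 0x00000001, since (2^32-1)^2 = 2^64 - 2^33 + 1
     UDWtype p = __umulsidi3 ((UWtype) 0xffffffff, (UWtype) 0xffffffff);
     // p == 0xfffffffe00000001
*/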
/* Use mpn_umul_ppmm or mpn_udiv_qrnnd functions, if they exist.  The "_r"
   forms have "reversed" arguments, meaning the pointer is last, which
   sometimes allows better parameter passing, in particular on 64-bit
   hppa.  */

#define mpn_umul_ppmm  __MPN(umul_ppmm)
extern UWtype mpn_umul_ppmm (UWtype *, UWtype, UWtype);
#if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm \
  && ! defined (LONGLONG_STANDALONE)
#define umul_ppmm(wh, wl, u, v) \
  do { \
    UWtype __umul_ppmm__p0; \
    (wh) = mpn_umul_ppmm (&__umul_ppmm__p0, (UWtype) (u), (UWtype) (v)); \
    (wl) = __umul_ppmm__p0; \
  } while (0)
#endif
#define mpn_umul_ppmm_r  __MPN(umul_ppmm_r)
extern UWtype mpn_umul_ppmm_r (UWtype, UWtype, UWtype *);

#if ! defined (umul_ppmm) && HAVE_NATIVE_mpn_umul_ppmm_r \
  && ! defined (LONGLONG_STANDALONE)
#define umul_ppmm(wh, wl, u, v) \
  do { \
    UWtype __umul_p0; \
    (wh) = mpn_umul_ppmm_r ((UWtype) (u), (UWtype) (v), &__umul_p0); \
    (wl) = __umul_p0; \
  } while (0)
#endif
#define mpn_udiv_qrnnd  __MPN(udiv_qrnnd)
extern UWtype mpn_udiv_qrnnd (UWtype *, UWtype, UWtype, UWtype);

#if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd \
  && ! defined (LONGLONG_STANDALONE)
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { \
    UWtype __udiv_qrnnd_r; \
    (q) = mpn_udiv_qrnnd (&__udiv_qrnnd_r, \
			  (UWtype) (n1), (UWtype) (n0), (UWtype) (d)); \
    (r) = __udiv_qrnnd_r; \
  } while (0)
#endif
#define mpn_udiv_qrnnd_r  __MPN(udiv_qrnnd_r)
extern UWtype mpn_udiv_qrnnd_r (UWtype, UWtype, UWtype, UWtype *);

#if ! defined (udiv_qrnnd) && HAVE_NATIVE_mpn_udiv_qrnnd_r \
  && ! defined (LONGLONG_STANDALONE)
#define udiv_qrnnd(q, r, n1, n0, d) \
  do { \
    UWtype __udiv_qrnnd_r; \
    (q) = mpn_udiv_qrnnd_r ((UWtype) (n1), (UWtype) (n0), (UWtype) (d), \
			    &__udiv_qrnnd_r); \
    (r) = __udiv_qrnnd_r; \
  } while (0)
#endif
/* If this machine has no inline assembler, use C macros.  */

#if !defined (add_ssaaaa)
#define add_ssaaaa(sh, sl, ah, al, bh, bl) \
  do { \
    UWtype __x; \
    __x = (al) + (bl); \
    (sh) = (ah) + (bh) + (__x < (al)); \
    (sl) = __x; \
  } while (0)
#endif
#if !defined (sub_ddmmss)
#define sub_ddmmss(sh, sl, ah, al, bh, bl) \
  do { \
    UWtype __x; \
    __x = (al) - (bl); \
    (sh) = (ah) - (bh) - ((al) < (bl)); \
    (sl) = __x; \
  } while (0)
#endif
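/* Illustration (not part of the original header): carry and borrow
   propagate through the comparisons above.  With W_TYPE_SIZE == 32:

     UWtype sh, sl;
     add_ssaaaa (sh, sl, (UWtype) 0, (UWtype) 0xffffffff, (UWtype) 0, (UWtype) 1);
     // low word wraps to sl == 0, and (__x < al) supplies the carry: sh == 1
     sub_ddmmss (sh, sl, (UWtype) 1, (UWtype) 0, (UWtype) 0, (UWtype) 1);
     // borrow from the high word: sh == 0, sl == 0xffffffff
*/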
/* If we lack umul_ppmm but have smul_ppmm, define umul_ppmm in terms of
   smul_ppmm.  */
#if !defined (umul_ppmm) && defined (smul_ppmm)
#define umul_ppmm(w1, w0, u, v) \
  do { \
    UWtype __w1; \
    UWtype __xm0 = (u), __xm1 = (v); \
    smul_ppmm (__w1, w0, __xm0, __xm1); \
    (w1) = __w1 + (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1) \
	 + (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0); \
  } while (0)
#endif
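/* Why the correction above works (added sketch): if s, t are the signed
   readings of u, v then u = s + 2^W*[s<0] and v = t + 2^W*[t<0], where
   W = W_TYPE_SIZE, so

     u*v = s*t + 2^W*(v*[s<0] + u*[t<0])   (mod 2^(2*W))

   and -(x >> (W-1)) is an all-ones mask exactly when the sign bit of x is
   set, so the two masked terms add v and/or u into the high word.  The
   smul_ppmm fallback further down applies the same identity in reverse.  */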
/* If we still don't have umul_ppmm, define it using plain C.

   For reference, when this code is used for squaring (ie. u and v identical
   expressions), gcc recognises __x1 and __x2 are the same and generates 3
   multiplies, not 4.  The subsequent additions could be optimized a bit,
   but the only place GMP currently uses such a square is mpn_sqr_basecase,
   and chips obliged to use this generic C umul will have plenty of worse
   performance problems than a couple of extra instructions on the diagonal
   of sqr_basecase.  */
#if !defined (umul_ppmm)
#define umul_ppmm(w1, w0, u, v) \
  do { \
    UWtype __x0, __x1, __x2, __x3; \
    UHWtype __ul, __vl, __uh, __vh; \
    UWtype __u = (u), __v = (v); \
\
    __ul = __ll_lowpart (__u); \
    __uh = __ll_highpart (__u); \
    __vl = __ll_lowpart (__v); \
    __vh = __ll_highpart (__v); \
\
    __x0 = (UWtype) __ul * __vl; \
    __x1 = (UWtype) __ul * __vh; \
    __x2 = (UWtype) __uh * __vl; \
    __x3 = (UWtype) __uh * __vh; \
\
    __x1 += __ll_highpart (__x0);	/* this can't give carry */ \
    __x1 += __x2;			/* but this indeed can */ \
    if (__x1 < __x2)			/* did we get it? */ \
      __x3 += __ll_B;			/* yes, add it in the proper pos. */ \
\
    (w1) = __x3 + __ll_highpart (__x1); \
    (w0) = (__x1 << W_TYPE_SIZE/2) + __ll_lowpart (__x0); \
  } while (0)
#endif
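/* Illustration (not part of the original header): for W_TYPE_SIZE == 32 the
   macro splits u and v into 16-bit halves and combines four partial
   products, u*v = 2^32*uh*vh + 2^16*(uh*vl + ul*vh) + ul*vl.  E.g.

     UWtype w1, w0;
     umul_ppmm (w1, w0, (UWtype) 0x00010002, (UWtype) 0x00030004);
     // __x0 = 2*4 = 8, __x1 = 2*3 = 6, __x2 = 1*4 = 4, __x3 = 1*3 = 3
     // w1 == 0x00000003, w0 == 0x000a0008 (0x10002 * 0x30004 == 0x3000a0008)
*/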
/* If we don't have smul_ppmm, define it using umul_ppmm (which surely will
   exist in one form or another).  */
#if !defined (smul_ppmm)
#define smul_ppmm(w1, w0, u, v) \
  do { \
    UWtype __w1; \
    UWtype __xm0 = (u), __xm1 = (v); \
    umul_ppmm (__w1, w0, __xm0, __xm1); \
    (w1) = __w1 - (-(__xm0 >> (W_TYPE_SIZE - 1)) & __xm1) \
	 - (-(__xm1 >> (W_TYPE_SIZE - 1)) & __xm0); \
  } while (0)
#endif
/* Define this unconditionally, so it can be used for debugging.  */
#define __udiv_qrnnd_c(q, r, n1, n0, d) \
  do { \
    UWtype __d1, __d0, __q1, __q0, __r1, __r0, __m; \
\
    ASSERT ((d) != 0); \
    ASSERT ((n1) < (d)); \
\
    __d1 = __ll_highpart (d); \
    __d0 = __ll_lowpart (d); \
\
    __q1 = (n1) / __d1; \
    __r1 = (n1) - __q1 * __d1; \
    __m = __q1 * __d0; \
    __r1 = __r1 * __ll_B | __ll_highpart (n0); \
    if (__r1 < __m) \
      { \
	__q1--, __r1 += (d); \
	if (__r1 >= (d)) /* i.e. we didn't get carry when adding to __r1 */ \
	  if (__r1 < __m) \
	    __q1--, __r1 += (d); \
      } \
    __r1 -= __m; \
\
    __q0 = __r1 / __d1; \
    __r0 = __r1 - __q0 * __d1; \
    __m = __q0 * __d0; \
    __r0 = __r0 * __ll_B | __ll_lowpart (n0); \
    if (__r0 < __m) \
      { \
	__q0--, __r0 += (d); \
	if (__r0 >= (d)) \
	  if (__r0 < __m) \
	    __q0--, __r0 += (d); \
      } \
    __r0 -= __m; \
\
    (q) = __q1 * __ll_B | __q0; \
    (r) = __r0; \
  } while (0)
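/* Illustration (not part of the original header): __udiv_qrnnd_c is
   schoolbook division with half-word digits, and requires n1 < d; when it
   serves as udiv_qrnnd, UDIV_NEEDS_NORMALIZATION is set, i.e. callers must
   also pass a d with its most significant bit set.  For W_TYPE_SIZE == 32:

     UWtype q, r;
     __udiv_qrnnd_c (q, r, (UWtype) 0x40000000, (UWtype) 5, (UWtype) 0x80000000);
     // (0x40000000 * 2^32 + 5) / 0x80000000: q == 0x80000000, r == 5
*/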
/* If the processor has no udiv_qrnnd but sdiv_qrnnd, go through
   __udiv_w_sdiv (defined in libgcc or elsewhere).  */
#if !defined (udiv_qrnnd) && defined (sdiv_qrnnd)
#define udiv_qrnnd(q, r, nh, nl, d) \
  do { \
    UWtype __r; \
    (q) = __MPN(udiv_w_sdiv) (&__r, nh, nl, d); \
    (r) = __r; \
  } while (0)
__GMP_DECLSPEC UWtype __MPN(udiv_w_sdiv) (UWtype *, UWtype, UWtype, UWtype);
#endif
/* If udiv_qrnnd was not defined for this processor, use __udiv_qrnnd_c.  */
#if !defined (udiv_qrnnd)
#define UDIV_NEEDS_NORMALIZATION 1
#define udiv_qrnnd __udiv_qrnnd_c
#endif
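/* A minimal caller-side sketch (not part of the original header) of coping
   with UDIV_NEEDS_NORMALIZATION: left-shift d until its msb is set, shift
   the two-word numerator by the same amount, divide, then shift the
   remainder back.  Assumes UWtype variables n1, n0, d with n1 < d:

     UWtype q, r, c;
     count_leading_zeros (c, d);
     if (c != 0)
       {
	 d <<= c;
	 n1 = (n1 << c) | (n0 >> (W_TYPE_SIZE - c));
	 n0 <<= c;
       }
     udiv_qrnnd (q, r, n1, n0, d);
     r >>= c;

   Scaling both operands by 2^c leaves the quotient unchanged; only the
   remainder needs the final shift down.  */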
#if !defined (count_leading_zeros)
#define count_leading_zeros(count, x) \
  do { \
    UWtype __xr = (x); \
    UWtype __a; \
\
    if (W_TYPE_SIZE == 32) \
      { \
	__a = __xr < ((UWtype) 1 << 2*__BITS4) \
	  ? (__xr < ((UWtype) 1 << __BITS4) ? 1 : __BITS4 + 1) \
	  : (__xr < ((UWtype) 1 << 3*__BITS4) ? 2*__BITS4 + 1 \
	  : 3*__BITS4 + 1); \
      } \
    else \
      { \
	for (__a = W_TYPE_SIZE - 8; __a > 0; __a -= 8) \
	  if (((__xr >> __a) & 0xff) != 0) \
	    break; \
	++__a; \
      } \
\
    (count) = W_TYPE_SIZE + 1 - __a - __clz_tab[__xr >> __a]; \
  } while (0)
/* This version gives a well-defined value for zero.  */
#define COUNT_LEADING_ZEROS_0 (W_TYPE_SIZE - 1)
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#define COUNT_LEADING_ZEROS_SLOW
#endif
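/* Added note: with GMP's table, __clz_tab[v] is floor(log2(v)) + 2 for
   1 <= v <= 128 (and 1 for v == 0), so once __a locates the top nonzero
   chunk, the last line recovers the exact count.  E.g. W_TYPE_SIZE == 32,
   x = 0x00030000 (msb at bit 17): __a = 2*__BITS4 + 1 = 17,
   __xr >> 17 == 1, __clz_tab[1] == 2, count = 32 + 1 - 17 - 2 == 14.  */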
/* clz_tab needed by mpn/x86/pentium/mod_1.asm in a fat binary */
#if HAVE_HOST_CPU_FAMILY_x86 && WANT_FAT_BINARY
#define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
#endif

#ifdef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
extern const unsigned char __GMP_DECLSPEC __clz_tab[129];
#endif
#if !defined (count_trailing_zeros)
#if !defined (COUNT_LEADING_ZEROS_SLOW)
/* Define count_trailing_zeros using an asm count_leading_zeros.  */
#define count_trailing_zeros(count, x) \
  do { \
    UWtype __ctz_x = (x); \
    UWtype __ctz_c; \
    ASSERT (__ctz_x != 0); \
    count_leading_zeros (__ctz_c, __ctz_x & -__ctz_x); \
    (count) = W_TYPE_SIZE - 1 - __ctz_c; \
  } while (0)
#else
/* Define count_trailing_zeros in plain C, assuming small counts are common.
   We use clz_tab without ado, since the C count_leading_zeros above will have
   pulled it in.  */
#define count_trailing_zeros(count, x) \
  do { \
    UWtype __ctz_x = (x); \
    int __ctz_c; \
\
    if (LIKELY ((__ctz_x & 0xff) != 0)) \
      (count) = __clz_tab[__ctz_x & -__ctz_x] - 2; \
    else \
      { \
	for (__ctz_c = 8 - 2; __ctz_c < W_TYPE_SIZE - 2; __ctz_c += 8) \
	  { \
	    __ctz_x >>= 8; \
	    if (LIKELY ((__ctz_x & 0xff) != 0)) \
	      break; \
	  } \
\
	(count) = __ctz_c + __clz_tab[__ctz_x & -__ctz_x]; \
      } \
  } while (0)
#endif
#endif
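/* Illustration (not part of the original header): x & -x isolates the
   lowest set bit of x as a power of two, so trailing zeros reduce to a
   leading-zeros count (first variant) or a table lookup (second variant,
   where __clz_tab[1 << p] - 2 == p).  E.g. x = 40 (binary 101000):
   x & -x == 8, and with W_TYPE_SIZE == 32 count_leading_zeros gives 28,
   so count = 32 - 1 - 28 == 3; likewise __clz_tab[8] - 2 == 3.  */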
#ifndef UDIV_NEEDS_NORMALIZATION
#define UDIV_NEEDS_NORMALIZATION 0
#endif

/* Whether udiv_qrnnd is actually implemented with udiv_qrnnd_preinv, in
   which case the latter should always be used.  */
#ifndef UDIV_PREINV_ALWAYS
#define UDIV_PREINV_ALWAYS 0
#endif

/* Give defaults for UMUL_TIME and UDIV_TIME.  */
#ifndef UMUL_TIME
#define UMUL_TIME 1
#endif

#ifndef UDIV_TIME
#define UDIV_TIME UMUL_TIME
#endif