Upload Tizen:Base source
[external/gmp.git] / tune / speed.h
1 /* Header for speed and threshold things.
2
3 Copyright 1999, 2000, 2001, 2002, 2003, 2005, 2006, 2008, 2009, 2010 Free
4 Software Foundation, Inc.
5
6 This file is part of the GNU MP Library.
7
8 The GNU MP Library is free software; you can redistribute it and/or modify
9 it under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or (at your
11 option) any later version.
12
13 The GNU MP Library is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
16 License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
20
21 #ifndef __SPEED_H__
22 #define __SPEED_H__
23
24
25 /* Pad {ptr,oldsize} with zero limbs (at the most significant end) to make it
26    newsize limbs long.  Needs newsize >= oldsize; both are expanded repeatedly. */
27 #define MPN_ZERO_EXTEND(ptr, oldsize, newsize)          \
28   do {                                                  \
29     ASSERT ((newsize) >= (oldsize));                    \
30     MPN_ZERO ((ptr)+(oldsize), (newsize)-(oldsize));    \
31   } while (0)
32
33 /* A mask of the least significant n bits.  Note 1<<32 doesn't give zero on
34    x86 family CPUs, hence the separate case for GMP_LIMB_BITS.  Uses n twice. */
35 #define MP_LIMB_T_LOWBITMASK(n) \
36   ((n) == GMP_LIMB_BITS ? MP_LIMB_T_MAX : ((mp_limb_t) 1 << (n)) - 1)
37
38
39 /* align must be a power of 2 here, usually CACHE_LINE_SIZE is a good choice */
40 /* align-1 extra bytes are allocated, presumably as slack for align_pointer */
41 #define TMP_ALLOC_ALIGNED(bytes, align) \
42   align_pointer (TMP_ALLOC ((bytes) + (align)-1), (align))
43 #define TMP_ALLOC_LIMBS_ALIGNED(limbs, align)   \
44   ((mp_ptr) TMP_ALLOC_ALIGNED ((limbs)*sizeof(mp_limb_t), align))
45
46 /* CACHE_LINE_SIZE is our default alignment for speed operands, and the
47    limit on what s->align_xp etc can then request for off-alignment.  Maybe
48    this should be an option of some sort, but in any case here are some line
49    sizes,
50
51        bytes
52          32   pentium
53          64   athlon
54          64   itanium-2 L1
55         128   itanium-2 L2
56 */
57 #define CACHE_LINE_SIZE   64 /* bytes */
58 /* Limbs per cache line, less 1: masks a limb count to within one line.  */
59 #define SPEED_TMP_ALLOC_ADJUST_MASK  (CACHE_LINE_SIZE/BYTES_PER_MP_LIMB - 1)
60
61 /* Set ptr to a TMP_ALLOC block of the given limbs, with the given limb
62    alignment.  */
63 #define SPEED_TMP_ALLOC_LIMBS(ptr, limbs, align)                        \
64   do {                                                                  \
65     mp_ptr     __ptr;                                                   \
66     mp_size_t  __ptr_align, __ptr_add;                                  \
67                                                                         \
68     ASSERT ((CACHE_LINE_SIZE % BYTES_PER_MP_LIMB) == 0);                \
69     __ptr = TMP_ALLOC_LIMBS ((limbs) + SPEED_TMP_ALLOC_ADJUST_MASK);    \
70     __ptr_align = (__ptr - (mp_ptr) NULL);                              \
71     __ptr_add = ((align) - __ptr_align) & SPEED_TMP_ALLOC_ADJUST_MASK;  \
72     (ptr) = __ptr + __ptr_add;                                          \
73   } while (0)
74
75
76 /* This is the size for s->xp_block and s->yp_block, used in certain
77    routines that want to run across many different data values and use
78    s->size for a different purpose, eg. SPEED_ROUTINE_MPN_GCD_1.
79
80    512 means 2kbytes of data for each of xp_block and yp_block, making 4k
81    total, which should fit easily in any L1 data cache. */
82 /* The byte figures above are for 4-byte limbs; 8-byte limbs double them.  */
83 #define SPEED_BLOCK_SIZE   512 /* limbs */
84
85 /* Timing globals and functions; their definitions are not in this header.  */
86 extern double  speed_unittime;
87 extern double  speed_cycletime;
88 extern int     speed_precision;
89 extern char    speed_time_string[];
90 void speed_time_init __GMP_PROTO ((void));
91 void speed_cycletime_fail __GMP_PROTO ((const char *str));
92 void speed_cycletime_init __GMP_PROTO ((void));
93 void speed_cycletime_need_cycles __GMP_PROTO ((void));
94 void speed_cycletime_need_seconds __GMP_PROTO ((void));
95 void speed_starttime __GMP_PROTO ((void));
96 double speed_endtime __GMP_PROTO ((void));
97
98 /* Parameters passed to each measuring routine and to speed_measure.  */
99 struct speed_params {
100   unsigned   reps;      /* how many times to run the routine */
101   mp_ptr     xp;        /* first argument */
102   mp_ptr     yp;        /* second argument */
103   mp_size_t  size;      /* size of both arguments, in limbs */
104   mp_limb_t  r;         /* user supplied parameter */
105   mp_size_t  align_xp;  /* alignment of xp */
106   mp_size_t  align_yp;  /* alignment of yp */
107   mp_size_t  align_wp;  /* intended alignment of wp, in limbs */
108   mp_size_t  align_wp2; /* intended alignment of wp2 */
109   mp_ptr     xp_block;  /* first special SPEED_BLOCK_SIZE block */
110   mp_ptr     yp_block;  /* second special SPEED_BLOCK_SIZE block */
111
112   double     time_divisor; /* optionally set by the speed routine */
113
114   /* used by the cache priming things */
115   int        cache;
116   unsigned   src_num, dst_num;  /* presumably the entries in use below */
117   struct {
118     mp_ptr    ptr;
119     mp_size_t size;
120   } src[3], dst[3];
121 };
122
123 typedef double (*speed_function_t) __GMP_PROTO ((struct speed_params *s));
124 /* speed_function_t is the form of every speed_* routine declared below.  */
125 double speed_measure __GMP_PROTO ((speed_function_t fun, struct speed_params *s));
126
127 /* Prototypes for speed measuring routines, each of speed_function_t form */
128
129 double speed_back_to_back __GMP_PROTO ((struct speed_params *s));
130 double speed_count_leading_zeros __GMP_PROTO ((struct speed_params *s));
131 double speed_count_trailing_zeros __GMP_PROTO ((struct speed_params *s));
132 double speed_find_a __GMP_PROTO ((struct speed_params *s));
133 double speed_gmp_allocate_free __GMP_PROTO ((struct speed_params *s));
134 double speed_gmp_allocate_reallocate_free __GMP_PROTO ((struct speed_params *s));
135 double speed_invert_limb __GMP_PROTO ((struct speed_params *s));
136 double speed_malloc_free __GMP_PROTO ((struct speed_params *s));
137 double speed_malloc_realloc_free __GMP_PROTO ((struct speed_params *s));
138 double speed_memcpy __GMP_PROTO ((struct speed_params *s));
139 double speed_binvert_limb __GMP_PROTO ((struct speed_params *s));
140 double speed_binvert_limb_mul1 __GMP_PROTO ((struct speed_params *s));
141 double speed_binvert_limb_loop __GMP_PROTO ((struct speed_params *s));
142 double speed_binvert_limb_cond __GMP_PROTO ((struct speed_params *s));
143 double speed_binvert_limb_arith __GMP_PROTO ((struct speed_params *s));
144
145 double speed_mpf_init_clear __GMP_PROTO ((struct speed_params *s));
146 /* mpn level measuring routines, running down to speed_MPN_ZERO */
147 double speed_mpn_add_n __GMP_PROTO ((struct speed_params *s));
148 double speed_mpn_addlsh1_n __GMP_PROTO ((struct speed_params *s));
149 double speed_mpn_addlsh2_n __GMP_PROTO ((struct speed_params *s));
150 double speed_mpn_add_n_sub_n __GMP_PROTO ((struct speed_params *s));
151 double speed_mpn_and_n __GMP_PROTO ((struct speed_params *s));
152 double speed_mpn_andn_n __GMP_PROTO ((struct speed_params *s));
153 double speed_mpn_addmul_1 __GMP_PROTO ((struct speed_params *s));
154 double speed_mpn_addmul_2 __GMP_PROTO ((struct speed_params *s));
155 double speed_mpn_addmul_3 __GMP_PROTO ((struct speed_params *s));
156 double speed_mpn_addmul_4 __GMP_PROTO ((struct speed_params *s));
157 double speed_mpn_addmul_5 __GMP_PROTO ((struct speed_params *s));
158 double speed_mpn_addmul_6 __GMP_PROTO ((struct speed_params *s));
159 double speed_mpn_addmul_7 __GMP_PROTO ((struct speed_params *s));
160 double speed_mpn_addmul_8 __GMP_PROTO ((struct speed_params *s));
161 double speed_mpn_com __GMP_PROTO ((struct speed_params *s));
162 double speed_mpn_copyd __GMP_PROTO ((struct speed_params *s));
163 double speed_mpn_copyi __GMP_PROTO ((struct speed_params *s));
164 double speed_MPN_COPY __GMP_PROTO ((struct speed_params *s));
165 double speed_MPN_COPY_DECR __GMP_PROTO ((struct speed_params *s));
166 double speed_MPN_COPY_INCR __GMP_PROTO ((struct speed_params *s));
167 double speed_mpn_divexact_1 __GMP_PROTO ((struct speed_params *s));
168 double speed_mpn_divexact_by3 __GMP_PROTO ((struct speed_params *s));
169 double speed_mpn_bdiv_q_1 __GMP_PROTO ((struct speed_params *s));
170 double speed_mpn_pi1_bdiv_q_1 __GMP_PROTO ((struct speed_params *s));
171 double speed_mpn_bdiv_dbm1c __GMP_PROTO ((struct speed_params *s));
172 double speed_mpn_divrem_1 __GMP_PROTO ((struct speed_params *s));
173 double speed_mpn_divrem_1f __GMP_PROTO ((struct speed_params *s));
174 double speed_mpn_divrem_1c __GMP_PROTO ((struct speed_params *s));
175 double speed_mpn_divrem_1cf __GMP_PROTO ((struct speed_params *s));
176 double speed_mpn_divrem_1_div __GMP_PROTO ((struct speed_params *s));
177 double speed_mpn_divrem_1f_div __GMP_PROTO ((struct speed_params *s));
178 double speed_mpn_divrem_1_inv __GMP_PROTO ((struct speed_params *s));
179 double speed_mpn_divrem_1f_inv __GMP_PROTO ((struct speed_params *s));
180 double speed_mpn_divrem_2 __GMP_PROTO ((struct speed_params *s));
181 double speed_mpn_divrem_2_div __GMP_PROTO ((struct speed_params *s));
182 double speed_mpn_divrem_2_inv __GMP_PROTO ((struct speed_params *s));
183 double speed_mpn_fib2_ui __GMP_PROTO ((struct speed_params *s));
184 double speed_mpn_matrix22_mul __GMP_PROTO ((struct speed_params *s));
185 double speed_mpn_hgcd __GMP_PROTO ((struct speed_params *s));
186 double speed_mpn_hgcd_lehmer __GMP_PROTO ((struct speed_params *s));
187 double speed_mpn_gcd __GMP_PROTO ((struct speed_params *s));
188 double speed_mpn_gcd_1 __GMP_PROTO ((struct speed_params *s));
189 double speed_mpn_gcd_1N __GMP_PROTO ((struct speed_params *s));
190 double speed_mpn_gcdext __GMP_PROTO ((struct speed_params *s));
191 double speed_mpn_gcdext_double __GMP_PROTO ((struct speed_params *s));
192 double speed_mpn_gcdext_one_double __GMP_PROTO ((struct speed_params *s));
193 double speed_mpn_gcdext_one_single __GMP_PROTO ((struct speed_params *s));
194 double speed_mpn_gcdext_single __GMP_PROTO ((struct speed_params *s));
195 double speed_mpn_get_str __GMP_PROTO ((struct speed_params *s));
196 double speed_mpn_hamdist __GMP_PROTO ((struct speed_params *s));
197 double speed_mpn_ior_n __GMP_PROTO ((struct speed_params *s));
198 double speed_mpn_iorn_n __GMP_PROTO ((struct speed_params *s));
199 double speed_mpn_jacobi_base __GMP_PROTO ((struct speed_params *s));
200 double speed_mpn_jacobi_base_1 __GMP_PROTO ((struct speed_params *s));
201 double speed_mpn_jacobi_base_2 __GMP_PROTO ((struct speed_params *s));
202 double speed_mpn_jacobi_base_3 __GMP_PROTO ((struct speed_params *s));
203 double speed_mpn_lshift __GMP_PROTO ((struct speed_params *s));
204 double speed_mpn_lshiftc __GMP_PROTO ((struct speed_params *s));
205 double speed_mpn_mod_1 __GMP_PROTO ((struct speed_params *s));
206 double speed_mpn_mod_1c __GMP_PROTO ((struct speed_params *s));
207 double speed_mpn_mod_1_div __GMP_PROTO ((struct speed_params *s));
208 double speed_mpn_mod_1_inv __GMP_PROTO ((struct speed_params *s));
209 double speed_mpn_mod_1_1 __GMP_PROTO ((struct speed_params *s));
210 double speed_mpn_mod_1_2 __GMP_PROTO ((struct speed_params *s));
211 double speed_mpn_mod_1_3 __GMP_PROTO ((struct speed_params *s));
212 double speed_mpn_mod_1_4 __GMP_PROTO ((struct speed_params *s));
213 double speed_mpn_mod_34lsub1 __GMP_PROTO ((struct speed_params *s));
214 double speed_mpn_modexact_1_odd __GMP_PROTO ((struct speed_params *s));
215 double speed_mpn_modexact_1c_odd __GMP_PROTO ((struct speed_params *s));
216 double speed_mpn_mul_1 __GMP_PROTO ((struct speed_params *s));
217 double speed_mpn_mul_1_inplace __GMP_PROTO ((struct speed_params *s));
218 double speed_mpn_mul_2 __GMP_PROTO ((struct speed_params *s));
219 double speed_mpn_mul_3 __GMP_PROTO ((struct speed_params *s));
220 double speed_mpn_mul_4 __GMP_PROTO ((struct speed_params *s));
221 double speed_mpn_mul __GMP_PROTO ((struct speed_params *s));
222 double speed_mpn_mul_basecase __GMP_PROTO ((struct speed_params *s));
223 double speed_mpn_mul_fft __GMP_PROTO ((struct speed_params *s));
224 double speed_mpn_mul_fft_sqr __GMP_PROTO ((struct speed_params *s));
225 double speed_mpn_fft_mul __GMP_PROTO ((struct speed_params *s));
226 double speed_mpn_fft_sqr __GMP_PROTO ((struct speed_params *s));
227 #if WANT_OLD_FFT_FULL
228 double speed_mpn_mul_fft_full __GMP_PROTO ((struct speed_params *s));
229 double speed_mpn_mul_fft_full_sqr __GMP_PROTO ((struct speed_params *s));
230 #endif /* WANT_OLD_FFT_FULL */
231 double speed_mpn_nussbaumer_mul __GMP_PROTO ((struct speed_params *s));
232 double speed_mpn_nussbaumer_mul_sqr __GMP_PROTO ((struct speed_params *s));
233 double speed_mpn_mul_n __GMP_PROTO ((struct speed_params *s));
234 double speed_mpn_mul_n_sqr __GMP_PROTO ((struct speed_params *s));
235 double speed_mpn_mullo_n __GMP_PROTO ((struct speed_params *s));
236 double speed_mpn_mullo_basecase __GMP_PROTO ((struct speed_params *s));
237 double speed_mpn_nand_n __GMP_PROTO ((struct speed_params *s));
238 double speed_mpn_nior_n __GMP_PROTO ((struct speed_params *s));
239 double speed_mpn_popcount __GMP_PROTO ((struct speed_params *s));
240 double speed_mpn_preinv_divrem_1 __GMP_PROTO ((struct speed_params *s));
241 double speed_mpn_preinv_divrem_1f __GMP_PROTO ((struct speed_params *s));
242 double speed_mpn_preinv_mod_1 __GMP_PROTO ((struct speed_params *s));
243 double speed_mpn_sbpi1_div_qr __GMP_PROTO ((struct speed_params *s));
244 double speed_mpn_dcpi1_div_qr __GMP_PROTO ((struct speed_params *s));
245 double speed_mpn_sbpi1_divappr_q __GMP_PROTO ((struct speed_params *s));
246 double speed_mpn_dcpi1_divappr_q __GMP_PROTO ((struct speed_params *s));
247 double speed_mpn_mu_div_qr __GMP_PROTO ((struct speed_params *s));
248 double speed_mpn_mu_divappr_q __GMP_PROTO ((struct speed_params *s));
249 double speed_mpn_mupi_div_qr __GMP_PROTO ((struct speed_params *s));
250 double speed_mpn_mu_div_q __GMP_PROTO ((struct speed_params *s));
251 double speed_mpn_sbpi1_bdiv_qr __GMP_PROTO ((struct speed_params *s));
252 double speed_mpn_dcpi1_bdiv_qr __GMP_PROTO ((struct speed_params *s));
253 double speed_mpn_sbpi1_bdiv_q __GMP_PROTO ((struct speed_params *s));
254 double speed_mpn_dcpi1_bdiv_q __GMP_PROTO ((struct speed_params *s));
255 double speed_mpn_mu_bdiv_q __GMP_PROTO ((struct speed_params *s));
256 double speed_mpn_mu_bdiv_qr __GMP_PROTO ((struct speed_params *s));
257 double speed_mpn_invert __GMP_PROTO ((struct speed_params *s));
258 double speed_mpn_invertappr __GMP_PROTO ((struct speed_params *s));
259 double speed_mpn_ni_invertappr __GMP_PROTO ((struct speed_params *s));
260 double speed_mpn_binvert __GMP_PROTO ((struct speed_params *s));
261 double speed_mpn_redc_1 __GMP_PROTO ((struct speed_params *s));
262 double speed_mpn_redc_2 __GMP_PROTO ((struct speed_params *s));
263 double speed_mpn_redc_n __GMP_PROTO ((struct speed_params *s));
264 double speed_mpn_rsblsh1_n __GMP_PROTO ((struct speed_params *s));
265 double speed_mpn_rsblsh2_n __GMP_PROTO ((struct speed_params *s));
266 double speed_mpn_rsh1add_n __GMP_PROTO ((struct speed_params *s));
267 double speed_mpn_rsh1sub_n __GMP_PROTO ((struct speed_params *s));
268 double speed_mpn_rshift __GMP_PROTO ((struct speed_params *s));
269 double speed_mpn_sb_divrem_m3 __GMP_PROTO ((struct speed_params *s));
270 double speed_mpn_sb_divrem_m3_div __GMP_PROTO ((struct speed_params *s));
271 double speed_mpn_sb_divrem_m3_inv __GMP_PROTO ((struct speed_params *s));
272 double speed_mpn_set_str __GMP_PROTO ((struct speed_params *s));
273 double speed_mpn_bc_set_str __GMP_PROTO ((struct speed_params *s));
274 double speed_mpn_dc_set_str __GMP_PROTO ((struct speed_params *s));
275 double speed_mpn_set_str_pre __GMP_PROTO ((struct speed_params *s));
276 double speed_mpn_sqr_basecase __GMP_PROTO ((struct speed_params *s));
277 double speed_mpn_sqr_diagonal __GMP_PROTO ((struct speed_params *s));
278 double speed_mpn_sqr __GMP_PROTO ((struct speed_params *s));
279 double speed_mpn_sqrtrem __GMP_PROTO ((struct speed_params *s));
280 double speed_mpn_rootrem __GMP_PROTO ((struct speed_params *s));
281 double speed_mpn_sub_n __GMP_PROTO ((struct speed_params *s));
282 double speed_mpn_sublsh1_n __GMP_PROTO ((struct speed_params *s));
283 double speed_mpn_sublsh2_n __GMP_PROTO ((struct speed_params *s));
284 double speed_mpn_submul_1 __GMP_PROTO ((struct speed_params *s));
285 double speed_mpn_toom2_sqr __GMP_PROTO ((struct speed_params *s));
286 double speed_mpn_toom3_sqr __GMP_PROTO ((struct speed_params *s));
287 double speed_mpn_toom4_sqr __GMP_PROTO ((struct speed_params *s));
288 double speed_mpn_toom6_sqr __GMP_PROTO ((struct speed_params *s));
289 double speed_mpn_toom8_sqr __GMP_PROTO ((struct speed_params *s));
290 double speed_mpn_toom22_mul __GMP_PROTO ((struct speed_params *s));
291 double speed_mpn_toom33_mul __GMP_PROTO ((struct speed_params *s));
292 double speed_mpn_toom44_mul __GMP_PROTO ((struct speed_params *s));
293 double speed_mpn_toom6h_mul __GMP_PROTO ((struct speed_params *s));
294 double speed_mpn_toom8h_mul __GMP_PROTO ((struct speed_params *s));
295 double speed_mpn_toom32_mul __GMP_PROTO ((struct speed_params *s));
296 double speed_mpn_toom42_mul __GMP_PROTO ((struct speed_params *s));
297 double speed_mpn_toom43_mul __GMP_PROTO ((struct speed_params *s));
298 double speed_mpn_toom63_mul __GMP_PROTO ((struct speed_params *s));
299 double speed_mpn_toom32_for_toom43_mul __GMP_PROTO ((struct speed_params *s));
300 double speed_mpn_toom43_for_toom32_mul __GMP_PROTO ((struct speed_params *s));
301 double speed_mpn_toom32_for_toom53_mul __GMP_PROTO ((struct speed_params *s));
302 double speed_mpn_toom53_for_toom32_mul __GMP_PROTO ((struct speed_params *s));
303 double speed_mpn_toom42_for_toom53_mul __GMP_PROTO ((struct speed_params *s));
304 double speed_mpn_toom53_for_toom42_mul __GMP_PROTO ((struct speed_params *s));
305 double speed_mpn_mulmod_bnm1 __GMP_PROTO ((struct speed_params *s));
306 double speed_mpn_bc_mulmod_bnm1 __GMP_PROTO ((struct speed_params *s));
307 double speed_mpn_mulmod_bnm1_rounded __GMP_PROTO ((struct speed_params *s));
308 double speed_mpn_sqrmod_bnm1 __GMP_PROTO ((struct speed_params *s));
309 double speed_mpn_udiv_qrnnd __GMP_PROTO ((struct speed_params *s));
310 double speed_mpn_udiv_qrnnd_r __GMP_PROTO ((struct speed_params *s));
311 double speed_mpn_umul_ppmm __GMP_PROTO ((struct speed_params *s));
312 double speed_mpn_umul_ppmm_r __GMP_PROTO ((struct speed_params *s));
313 double speed_mpn_xnor_n __GMP_PROTO ((struct speed_params *s));
314 double speed_mpn_xor_n __GMP_PROTO ((struct speed_params *s));
315 double speed_MPN_ZERO __GMP_PROTO ((struct speed_params *s));
316 /* mpq, mpz and miscellaneous measuring routines */
317 double speed_mpq_init_clear __GMP_PROTO ((struct speed_params *s));
318
319 double speed_mpz_add __GMP_PROTO ((struct speed_params *s));
320 double speed_mpz_bin_uiui __GMP_PROTO ((struct speed_params *s));
321 double speed_mpz_fac_ui __GMP_PROTO ((struct speed_params *s));
322 double speed_mpz_fib_ui __GMP_PROTO ((struct speed_params *s));
323 double speed_mpz_fib2_ui __GMP_PROTO ((struct speed_params *s));
324 double speed_mpz_init_clear __GMP_PROTO ((struct speed_params *s));
325 double speed_mpz_init_realloc_clear __GMP_PROTO ((struct speed_params *s));
326 double speed_mpz_jacobi __GMP_PROTO ((struct speed_params *s));
327 double speed_mpz_lucnum_ui __GMP_PROTO ((struct speed_params *s));
328 double speed_mpz_lucnum2_ui __GMP_PROTO ((struct speed_params *s));
329 double speed_mpz_mod __GMP_PROTO ((struct speed_params *s));
330 double speed_mpz_powm __GMP_PROTO ((struct speed_params *s));
331 double speed_mpz_powm_mod __GMP_PROTO ((struct speed_params *s));
332 double speed_mpz_powm_redc __GMP_PROTO ((struct speed_params *s));
333 double speed_mpz_powm_ui __GMP_PROTO ((struct speed_params *s));
334 double speed_mpz_urandomb __GMP_PROTO ((struct speed_params *s));
335
336 double speed_gmp_randseed __GMP_PROTO ((struct speed_params *s));
337 double speed_gmp_randseed_ui __GMP_PROTO ((struct speed_params *s));
338
339 double speed_noop __GMP_PROTO ((struct speed_params *s));
340 double speed_noop_wxs __GMP_PROTO ((struct speed_params *s));
341 double speed_noop_wxys __GMP_PROTO ((struct speed_params *s));
342
343 double speed_operator_div __GMP_PROTO ((struct speed_params *s));
344 double speed_operator_mod __GMP_PROTO ((struct speed_params *s));
345
346 double speed_udiv_qrnnd __GMP_PROTO ((struct speed_params *s));
347 double speed_udiv_qrnnd_preinv1 __GMP_PROTO ((struct speed_params *s));
348 double speed_udiv_qrnnd_preinv2 __GMP_PROTO ((struct speed_params *s));
349 double speed_udiv_qrnnd_c __GMP_PROTO ((struct speed_params *s));
350 double speed_umul_ppmm __GMP_PROTO ((struct speed_params *s));
351
352 /* Prototypes for other routines */
353 /* speed_cyclecounter is a macro instead under gcc on i386, see below.  */
354 /* low 32-bits in p[0], high 32-bits in p[1] */
355 void speed_cyclecounter __GMP_PROTO ((unsigned p[2]));
356 /* takes the same unsigned p[2] argument as speed_cyclecounter */
357 void mftb_function __GMP_PROTO ((unsigned p[2]));
358
359 /* In i386 gcc -fPIC, ebx is a fixed register and can't be declared a dummy
360    output or a clobber for the cpuid, hence an explicit save and restore.  A
361    clobber as such doesn't provoke an error unfortunately (gcc 3.0), so use
362    the dummy output style in non-PIC, so there's an error if somehow -fPIC
363    is used without a -DPIC to tell us about it.  */
364 #if defined(__GNUC__) && ! defined (NO_ASM)     \
365   && (defined (__i386__) || defined (__i486__))
366 #ifdef PIC  /* ebx copied to a temporary before cpuid, restored after it */
367 #define speed_cyclecounter(p)                                           \
368   do {                                                                  \
369     int  __speed_cyclecounter__save_ebx;                                \
370     int  __speed_cyclecounter__dummy;                                   \
371     __asm__ __volatile__ ("movl %%ebx, %1\n"                            \
372                           "cpuid\n"                                     \
373                           "movl %1, %%ebx\n"                            \
374                           "rdtsc"                                       \
375                           : "=a"   ((p)[0]),                            \
376                             "=&rm" (__speed_cyclecounter__save_ebx),    \
377                             "=c"   (__speed_cyclecounter__dummy),       \
378                             "=d"   ((p)[1]));                           \
379   } while (0)
380 #else  /* not PIC: ebx and ecx are just dummy outputs */
381 #define speed_cyclecounter(p)                                           \
382   do {                                                                  \
383     int  __speed_cyclecounter__dummy1;                                  \
384     int  __speed_cyclecounter__dummy2;                                  \
385     __asm__ __volatile__ ("cpuid\n"                                     \
386                           "rdtsc"                                       \
387                           : "=a" ((p)[0]),                              \
388                             "=b" (__speed_cyclecounter__dummy1),        \
389                             "=c" (__speed_cyclecounter__dummy2),        \
390                             "=d" ((p)[1]));                             \
391   } while (0)
392 #endif /* PIC */
393 #endif /* gcc with asm allowed, on i386 or i486 */
394
395 double speed_cyclecounter_diff __GMP_PROTO ((const unsigned [2], const unsigned [2]));
396 int gettimeofday_microseconds_p __GMP_PROTO ((void));
397 int getrusage_microseconds_p __GMP_PROTO ((void));
398 int cycles_works_p __GMP_PROTO ((void));
399 long clk_tck __GMP_PROTO ((void));
400 double freq_measure __GMP_PROTO ((const char *, double (*)(void)));
401
402 int double_cmp_ptr __GMP_PROTO ((const double *, const double *));
403 void pentium_wbinvd __GMP_PROTO ((void));
404 typedef int (*qsort_function_t) __GMP_PROTO ((const void *, const void *));
405
406 void noop __GMP_PROTO ((void));
407 void noop_1 __GMP_PROTO ((mp_limb_t));
408 void noop_wxs __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t));
409 void noop_wxys __GMP_PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t));
410 void mpn_cache_fill __GMP_PROTO ((mp_srcptr, mp_size_t));
411 void mpn_cache_fill_dummy __GMP_PROTO ((mp_limb_t));
412 void speed_cache_fill __GMP_PROTO ((struct speed_params *));
413 void speed_operand_src __GMP_PROTO ((struct speed_params *, mp_ptr, mp_size_t));
414 void speed_operand_dst __GMP_PROTO ((struct speed_params *, mp_ptr, mp_size_t));
415
416 extern int  speed_option_addrs;
417 extern int  speed_option_verbose;
418 void speed_option_set __GMP_PROTO((const char *));
419 /* Routines below mostly have a speed_* counterpart above, eg. mpn_mod_1_div */
420 mp_limb_t mpn_divrem_1_div __GMP_PROTO ((mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t));
421 mp_limb_t mpn_divrem_1_inv __GMP_PROTO ((mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t));
422 mp_limb_t mpn_divrem_2_div __GMP_PROTO ((mp_ptr, mp_size_t, mp_ptr, mp_size_t, mp_srcptr));
423 mp_limb_t mpn_divrem_2_inv __GMP_PROTO ((mp_ptr, mp_size_t, mp_ptr, mp_size_t, mp_srcptr));
424
425 int mpn_jacobi_base_1 __GMP_PROTO ((mp_limb_t, mp_limb_t, int));
426 int mpn_jacobi_base_2 __GMP_PROTO ((mp_limb_t, mp_limb_t, int));
427 int mpn_jacobi_base_3 __GMP_PROTO ((mp_limb_t, mp_limb_t, int));
428
429 mp_limb_t mpn_mod_1_div __GMP_PROTO ((mp_srcptr, mp_size_t, mp_limb_t));
430 mp_limb_t mpn_mod_1_inv __GMP_PROTO ((mp_srcptr, mp_size_t, mp_limb_t));
431
432 mp_size_t mpn_gcd_binary
433   __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t, mp_ptr, mp_size_t));
434 mp_size_t mpn_gcd_accel
435   __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t, mp_ptr, mp_size_t));
436 mp_size_t mpn_gcdext_one_double
437   __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t));
438 mp_size_t mpn_gcdext_one_single
439   __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t));
440 mp_size_t mpn_gcdext_single
441   __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t));
442 mp_size_t mpn_gcdext_double
443   __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t));
444
445 mp_limb_t mpn_sb_divrem_mn_div __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t));
446 mp_limb_t mpn_sb_divrem_mn_inv __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t));
447
448 mp_size_t mpn_set_str_basecase __GMP_PROTO ((mp_ptr, const unsigned char *, size_t, int));
449 void mpn_pre_set_str __GMP_PROTO ((mp_ptr, unsigned char *, size_t, powers_t *, mp_ptr));
450
451 void mpz_powm_mod __GMP_PROTO ((mpz_ptr, mpz_srcptr, mpz_srcptr, mpz_srcptr));
452 void mpz_powm_redc __GMP_PROTO ((mpz_ptr, mpz_srcptr, mpz_srcptr, mpz_srcptr));
453
454 int speed_routine_count_zeros_setup
455   __GMP_PROTO ((struct speed_params *, mp_ptr, int, int));
456
457
458 /* "get" is called repeatedly until it ticks over, just in case on a fast
459    processor it takes less than a microsecond, though this is probably
460    unlikely if it's a system call.
461
462    speed_cyclecounter is called on the same side of the "get" for the start
463    and end measurements.  It doesn't matter how long it takes from the "get"
464    sample to the cycles sample, since that period will cancel out in the
465    difference calculation (assuming it's the same each time).
466
467    Letting the test run for more than a process time slice is probably only
468    going to reduce accuracy, especially for getrusage when the cycle counter
469    is real time, or for gettimeofday if the cycle counter is in fact process
470    time.  Use CLK_TCK/2 as a reasonable stop.
471
472    It'd be desirable to be quite accurate here.  The default speed_precision
473    for a cycle counter is 10000 cycles, so to mix that with getrusage or
474    gettimeofday the frequency should be at least that accurate.  But running
475    measurements for 10000 microseconds (or more) is too long.  Be satisfied
476    with just a half clock tick (5000 microseconds usually).  */
477 /* get(t) samples into a "type" t, read by sec/usec; getc(c) fills c[2].  */
478 #define FREQ_MEASURE_ONE(name, type, get, getc, sec, usec)              \
479   do {                                                                  \
480     type      st1, st, et1, et;                                         \
481     unsigned  sc[2], ec[2];                                             \
482     long      dt, half_tick;                                            \
483     double    dc, cyc;                                                  \
484                                                                         \
485     half_tick = (1000000L / clk_tck()) / 2;  /* in microseconds */      \
486                                                                         \
487     get (st1);                                                          \
488     do {                                                                \
489       get (st);                                                         \
490     } while (usec(st) == usec(st1) && sec(st) == sec(st1));             \
491                                                                         \
492     getc (sc);                                                          \
493                                                                         \
494     for (;;)                                                            \
495       {                                                                 \
496         get (et1);                                                      \
497         do {                                                            \
498           get (et);                                                     \
499         } while (usec(et) == usec(et1) && sec(et) == sec(et1));         \
500                                                                         \
501         getc (ec);                                                      \
502                                                                         \
503         dc = speed_cyclecounter_diff (ec, sc);                          \
504                                                                         \
505         /* allow secs to cancel before multiplying */                   \
506         dt = sec(et) - sec(st);                                         \
507         dt = dt * 1000000L + (usec(et) - usec(st));                     \
508                                                                         \
509         if (dt >= half_tick)                                            \
510           break;                                                        \
511       }                                                                 \
512                                                                         \
513     cyc = dt * 1e-6 / dc;  /* seconds per cycle */                      \
514                                                                         \
515     if (speed_option_verbose >= 2)                                      \
516       printf ("freq_measure_%s_one() dc=%.6g dt=%ld cyc=%.6g\n",        \
517               name, dc, dt, cyc);                                       \
518                                                                         \
519     return dt * 1e-6 / dc;  /* returns from the enclosing function */   \
520                                                                         \
521   } while (0)
522
523
524
525
526 /* The measuring routines use these big macros to save duplication for
527    similar forms.  They also get used for some automatically generated
528    measuring of new implementations of functions.
529
530    Having something like SPEED_ROUTINE_BINARY_N as a subroutine accepting a
531    function pointer is considered undesirable since it's not the way a
532    normal application will be calling, and some processors might do
533    different things with an indirect call, like not branch predicting, or
534    doing a full pipe flush.  At least some of the "functions" measured are
535    actually macros too.
536
537    The net effect is to bloat the object code, possibly in a big way, but
538    only what's being measured is being run, so that doesn't matter.
539
540    The loop forms don't try to cope with __GMP_ATTRIBUTE_PURE or
541    ATTRIBUTE_CONST on the called functions.  Adding a cast to a non-pure
542    function pointer doesn't work in gcc 3.2.  Using an actual non-pure
543    function pointer variable works, but stands a real risk of a
544    non-optimizing compiler generating unnecessary overheads in the call.
545    Currently the best idea is not to use those attributes for a timing
546    program build.  __GMP_NO_ATTRIBUTE_CONST_PURE will tell gmp.h and
547    gmp-impl.h to omit them from routines there.  */
548
/* Leave the enclosing measuring function with -1.0 when COND is false.
   The expansion is a bare `if' statement that already ends in `;', so an
   invocation written with its own trailing semicolon leaves an extra empty
   statement behind; for that reason it is not safe to use as the unbraced
   body of an if/else.  */
549 #define SPEED_RESTRICT_COND(cond)   if (!(cond)) return -1.0;
550
551 /* For mpn_copy or similar.

   Expands to a complete function body (it contains `return' statements)
   and expects a pointer `s' in scope providing size, xp, align_wp and
   reps.  FUNCTION is invoked as function (wp, s->xp, s->size), s->reps
   times, between speed_starttime() and speed_endtime().  The value from
   speed_endtime() is returned, or -1.0 when s->size is negative.  */
552 #define SPEED_ROUTINE_MPN_COPY(function)                                \
553   {                                                                     \
554     mp_ptr    wp;                                                       \
555     unsigned  i;                                                        \
556     double    t;                                                        \
557     TMP_DECL;                                                           \
558                                                                         \
559     SPEED_RESTRICT_COND (s->size >= 0);                                 \
560                                                                         \
561     TMP_MARK;                                                           \
562     SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);                   \
563     /* register source and destination before speed_cache_fill */      \
564     speed_operand_src (s, s->xp, s->size);                              \
565     speed_operand_dst (s, wp, s->size);                                 \
566     speed_cache_fill (s);                                               \
567     /* timed loop; i is tested after each call, so s->reps >= 1 */      \
568     speed_starttime ();                                                 \
569     i = s->reps;                                                        \
570     do                                                                  \
571       function (wp, s->xp, s->size);                                    \
572     while (--i != 0);                                                   \
573     t = speed_endtime ();                                               \
574                                                                         \
575     TMP_FREE;                                                           \
576     return t;                                                           \
577   }
578
/* Same shape as SPEED_ROUTINE_MPN_COPY, but FUNCTION takes a fourth
   argument, which is always passed as 0 (presumably a carry-in, going by
   the COPYC name -- confirm against the routines measured).

   Expands to a complete function body and expects `s' in scope.  Returns
   the speed_endtime() value, or -1.0 when s->size is negative.  */
579 #define SPEED_ROUTINE_MPN_COPYC(function)                               \
580   {                                                                     \
581     mp_ptr    wp;                                                       \
582     unsigned  i;                                                        \
583     double    t;                                                        \
584     TMP_DECL;                                                           \
585                                                                         \
586     SPEED_RESTRICT_COND (s->size >= 0);                                 \
587                                                                         \
588     TMP_MARK;                                                           \
589     SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);                   \
590                                                                         \
591     speed_operand_src (s, s->xp, s->size);                              \
592     speed_operand_dst (s, wp, s->size);                                 \
593     speed_cache_fill (s);                                               \
594     /* timed loop; i is tested after each call, so s->reps >= 1 */      \
595     speed_starttime ();                                                 \
596     i = s->reps;                                                        \
597     do                                                                  \
598       function (wp, s->xp, s->size, 0);                                 \
599     while (--i != 0);                                                   \
600     t = speed_endtime ();                                               \
601                                                                         \
602     TMP_FREE;                                                           \
603     return t;                                                           \
604   }
605
606 /* s->size is still in limbs, and it's limbs which are copied, but
607    "function" takes a size in bytes not limbs.

   The byte count handed to FUNCTION is s->size * BYTES_PER_MP_LIMB; the
   destination wp is still allocated as s->size limbs.  Expands to a
   complete function body and expects `s' in scope.  Returns the
   speed_endtime() value, or -1.0 when s->size is negative.  */
608 #define SPEED_ROUTINE_MPN_COPY_BYTES(function)                          \
609   {                                                                     \
610     mp_ptr    wp;                                                       \
611     unsigned  i;                                                        \
612     double    t;                                                        \
613     TMP_DECL;                                                           \
614                                                                         \
615     SPEED_RESTRICT_COND (s->size >= 0);                                 \
616                                                                         \
617     TMP_MARK;                                                           \
618     SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);                   \
619                                                                         \
620     speed_operand_src (s, s->xp, s->size);                              \
621     speed_operand_dst (s, wp, s->size);                                 \
622     speed_cache_fill (s);                                               \
623     /* timed loop; i is tested after each call, so s->reps >= 1 */      \
624     speed_starttime ();                                                 \
625     i = s->reps;                                                        \
626     do                                                                  \
627       function (wp, s->xp, s->size * BYTES_PER_MP_LIMB);                \
628     while (--i != 0);                                                   \
629     t = speed_endtime ();                                               \
630                                                                         \
631     TMP_FREE;                                                           \
632     return t;                                                           \
633   }
634
635
636 /* For mpn_add_n, mpn_sub_n, or similar.

   CALL is an expression evaluated s->reps times inside the timed loop; it
   may refer to the locals wp (destination), xp and yp (sources), and to
   `s'.  s->r selects the operand overlap:

       0   xp = s->xp, yp = s->yp   (no overlap)
       1   xp = wp
       2   yp = wp
       3   xp = wp and yp = wp
       4   yp = xp
       any other value makes the routine return -1.0

   Whenever a source aliases wp, wp is first filled from s->xp.  Expands
   to a complete function body; returns the speed_endtime() value, or -1.0
   when s->size < 1 or s->r is not one of the values above.  */
637 #define SPEED_ROUTINE_MPN_BINARY_N_CALL(call)                           \
638   {                                                                     \
639     mp_ptr     wp;                                                      \
640     mp_ptr     xp, yp;                                                  \
641     unsigned   i;                                                       \
642     double     t;                                                       \
643     TMP_DECL;                                                           \
644                                                                         \
645     SPEED_RESTRICT_COND (s->size >= 1);                                 \
646                                                                         \
647     TMP_MARK;                                                           \
648     SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);                   \
649                                                                         \
650     xp = s->xp;                                                         \
651     yp = s->yp;                                                         \
652     /* s->r picks which sources alias the destination (or each other) */\
653     if (s->r == 0)      ;                                               \
654     else if (s->r == 1) { xp = wp;          }                           \
655     else if (s->r == 2) {          yp = wp; }                           \
656     else if (s->r == 3) { xp = wp; yp = wp; }                           \
657     else if (s->r == 4) {     yp = xp;      }                           \
658     else                {                                               \
659       TMP_FREE;                                                         \
660       return -1.0;                                                      \
661     }                                                                   \
662                                                                         \
663     /* initialize wp if operand overlap */                              \
664     if (xp == wp || yp == wp)                                           \
665       MPN_COPY (wp, s->xp, s->size);                                    \
666                                                                         \
667     speed_operand_src (s, xp, s->size);                                 \
668     speed_operand_src (s, yp, s->size);                                 \
669     speed_operand_dst (s, wp, s->size);                                 \
670     speed_cache_fill (s);                                               \
671     /* timed loop; i is tested after each call, so s->reps >= 1 */      \
672     speed_starttime ();                                                 \
673     i = s->reps;                                                        \
674     do                                                                  \
675       call;                                                             \
676     while (--i != 0);                                                   \
677     t = speed_endtime ();                                               \
678                                                                         \
679     TMP_FREE;                                                           \
680     return t;                                                           \
681   }
682
683 /* For routines with two destinations, ap and sp (judging by the ADDSUB
   name, presumably a sum and a difference computed in one call -- confirm
   against the routines measured).

   CALL is an expression evaluated s->reps times inside the timed loop; it
   may refer to the locals ap, sp, xp, yp and to `s'.  The bits of s->r
   select operand overlap:

       bit 0 (1)   xp = ap
       bit 1 (2)   yp = ap
       bit 2 (4)   xp = sp
       bit 3 (8)   yp = sp

   The bits are applied in that order, so a later bit overrides an earlier
   one for the same source.  Having bits 0 and 1 both set, or bits 2 and 3
   both set, is rejected with -1.0.  A destination that is aliased by a
   source is first filled from s->xp.  Expands to a complete function
   body; returns the speed_endtime() value, or -1.0 when s->size < 1 or
   s->r is rejected.  */
684 #define SPEED_ROUTINE_MPN_ADDSUB_N_CALL(call)                           \
685   {                                                                     \
686     mp_ptr     ap, sp;                                                  \
687     mp_ptr     xp, yp;                                                  \
688     unsigned   i;                                                       \
689     double     t;                                                       \
690     TMP_DECL;                                                           \
691                                                                         \
692     SPEED_RESTRICT_COND (s->size >= 1);                                 \
693                                                                         \
694     TMP_MARK;                                                           \
695     SPEED_TMP_ALLOC_LIMBS (ap, s->size, s->align_wp);                   \
696     SPEED_TMP_ALLOC_LIMBS (sp, s->size, s->align_wp);                   \
697                                                                         \
698     xp = s->xp;                                                         \
699     yp = s->yp;                                                         \
700     /* apply the overlap bits first, then reject the invalid pairs */   \
701     if ((s->r & 1) != 0) { xp = ap; }                                   \
702     if ((s->r & 2) != 0) { yp = ap; }                                   \
703     if ((s->r & 4) != 0) { xp = sp; }                                   \
704     if ((s->r & 8) != 0) { yp = sp; }                                   \
705     if ((s->r & 3) == 3  ||  (s->r & 12) == 12)                         \
706       {                                                                 \
707         TMP_FREE;                                                       \
708         return -1.0;                                                    \
709       }                                                                 \
710                                                                         \
711     /* initialize ap if operand overlap */                              \
712     if (xp == ap || yp == ap)                                           \
713       MPN_COPY (ap, s->xp, s->size);                                    \
714     /* initialize sp if operand overlap */                              \
715     if (xp == sp || yp == sp)                                           \
716       MPN_COPY (sp, s->xp, s->size);                                    \
717                                                                         \
718     speed_operand_src (s, xp, s->size);                                 \
719     speed_operand_src (s, yp, s->size);                                 \
720     speed_operand_dst (s, ap, s->size);                                 \
721     speed_operand_dst (s, sp, s->size);                                 \
722     speed_cache_fill (s);                                               \
723     /* timed loop; i is tested after each call, so s->reps >= 1 */      \
724     speed_starttime ();                                                 \
725     i = s->reps;                                                        \
726     do                                                                  \
727       call;                                                             \
728     while (--i != 0);                                                   \
729     t = speed_endtime ();                                               \
730                                                                         \
731     TMP_FREE;                                                           \
732     return t;                                                           \
733   }
734
/* SPEED_ROUTINE_MPN_BINARY_N_CALL with FUNCTION called through
   (*function) as (wp, xp, yp, s->size).  */
735 #define SPEED_ROUTINE_MPN_BINARY_N(function)                            \
736    SPEED_ROUTINE_MPN_BINARY_N_CALL ((*function) (wp, xp, yp, s->size))
737
/* As above, for routines taking one more trailing argument, which is
   always passed as 0 (presumably a carry-in -- confirm).  */
738 #define SPEED_ROUTINE_MPN_BINARY_NC(function)                           \
739    SPEED_ROUTINE_MPN_BINARY_N_CALL ((*function) (wp, xp, yp, s->size, 0))
740
741
742 /* For mpn_lshift, mpn_rshift, mpn_mul_1, with r, or similar.

   CALL is an expression evaluated s->reps times inside the timed loop; it
   may refer to the local wp (an s->size limb destination) and to `s'.
   s->xp is registered as the source operand whether or not CALL reads it.
   Expands to a complete function body; returns the speed_endtime() value,
   or -1.0 when s->size < 1.  */
743 #define SPEED_ROUTINE_MPN_UNARY_1_CALL(call)                            \
744   {                                                                     \
745     mp_ptr    wp;                                                       \
746     unsigned  i;                                                        \
747     double    t;                                                        \
748     TMP_DECL;                                                           \
749                                                                         \
750     SPEED_RESTRICT_COND (s->size >= 1);                                 \
751                                                                         \
752     TMP_MARK;                                                           \
753     SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);                   \
754     /* wp is not initialized here; it is only registered as a dst */    \
755     speed_operand_src (s, s->xp, s->size);                              \
756     speed_operand_dst (s, wp, s->size);                                 \
757     speed_cache_fill (s);                                               \
758     /* timed loop; i is tested after each call, so s->reps >= 1 */      \
759     speed_starttime ();                                                 \
760     i = s->reps;                                                        \
761     do                                                                  \
762       call;                                                             \
763     while (--i != 0);                                                   \
764     t = speed_endtime ();                                               \
765                                                                         \
766     TMP_FREE;                                                           \
767     return t;                                                           \
768   }
769
/* SPEED_ROUTINE_MPN_UNARY_1_CALL with FUNCTION called through (*function)
   as (wp, s->xp, s->size, s->r).  */
770 #define SPEED_ROUTINE_MPN_UNARY_1(function)                             \
771   SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))
772
/* As above, for routines taking one more trailing argument, which is
   always passed as 0 (presumably a carry-in -- confirm).  */
773 #define SPEED_ROUTINE_MPN_UNARY_1C(function)                            \
774   SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r, 0))
775
/* In-place form of SPEED_ROUTINE_MPN_UNARY_1: FUNCTION is called through
   (*function) as (wp, wp, s->size, s->r), so wp is both the source and
   the destination.

   wp is seeded from s->xp before timing starts, so FUNCTION never reads
   uninitialized limbs.  Since the operation is in-place, every repetition
   after the first works on the output of the previous one.  s->xp is still
   registered as a source operand, exactly as SPEED_ROUTINE_MPN_UNARY_1_CALL
   does.

   Expands to a complete function body and expects `s' in scope.  Returns
   the speed_endtime() value, or -1.0 when s->size < 1.  */
#define SPEED_ROUTINE_MPN_UNARY_1_INPLACE(function)                     \
  {                                                                     \
    mp_ptr    wp;                                                       \
    unsigned  i;                                                        \
    double    t;                                                        \
    TMP_DECL;                                                           \
                                                                        \
    SPEED_RESTRICT_COND (s->size >= 1);                                 \
                                                                        \
    TMP_MARK;                                                           \
    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);                   \
                                                                        \
    /* give the in-place operand defined contents */                    \
    MPN_COPY (wp, s->xp, s->size);                                      \
                                                                        \
    speed_operand_src (s, s->xp, s->size);                              \
    speed_operand_dst (s, wp, s->size);                                 \
    speed_cache_fill (s);                                               \
                                                                        \
    /* timed loop; i is tested after each call, so s->reps >= 1 */      \
    speed_starttime ();                                                 \
    i = s->reps;                                                        \
    do                                                                  \
      (*function) (wp, wp, s->size, s->r);                              \
    while (--i != 0);                                                   \
    t = speed_endtime ();                                               \
                                                                        \
    TMP_FREE;                                                           \
    return t;                                                           \
  }
779
/* SPEED_ROUTINE_MPN_UNARY_1_CALL with FUNCTION called through (*function)
   as (wp, s->xp, s->size, s->r); s->r is the single-limb operand.  */
780 #define SPEED_ROUTINE_MPN_DIVEXACT_1(function)                          \
781   SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))
782
/* Same call shape as SPEED_ROUTINE_MPN_DIVEXACT_1.  */
783 #define SPEED_ROUTINE_MPN_BDIV_Q_1(function)                            \
784     SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))
785
/* Wraps SPEED_ROUTINE_MPN_UNARY_1_CALL with two extra locals that CALL may
   use, both computed once outside the timed loop:

       shift   set by count_trailing_zeros (shift, s->r)
       dinv    set by binvert_limb (dinv, s->r >> shift)

   Returns -1.0 when s->size <= 0 or s->r == 0; otherwise behaves as the
   nested SPEED_ROUTINE_MPN_UNARY_1_CALL, whose `return' statements leave
   the enclosing function.  */
786 #define SPEED_ROUTINE_MPN_PI1_BDIV_Q_1_CALL(call)                       \
787   {                                                                     \
788     unsigned   shift;                                                   \
789     mp_limb_t  dinv;                                                    \
790                                                                         \
791     SPEED_RESTRICT_COND (s->size > 0);                                  \
792     SPEED_RESTRICT_COND (s->r != 0);                                    \
793     /* s->r != 0 was checked above, before counting its zero bits */    \
794     count_trailing_zeros (shift, s->r);                                 \
795     binvert_limb (dinv, s->r >> shift);                                 \
796                                                                         \
797     SPEED_ROUTINE_MPN_UNARY_1_CALL (call);                              \
798   }
/* FUNCTION is called through (*function) as
   (wp, s->xp, s->size, s->r, dinv, shift).  */
799 #define SPEED_ROUTINE_MPN_PI1_BDIV_Q_1(function)                        \
800   SPEED_ROUTINE_MPN_PI1_BDIV_Q_1_CALL                                   \
801   ((*function) (wp, s->xp, s->size, s->r, dinv, shift))
802
/* Thin wrappers over SPEED_ROUTINE_MPN_UNARY_1_CALL; they differ only in
   the argument list given to (*function).

   BDIV_DBM1C: (wp, s->xp, s->size, s->r, 0).  */
803 #define SPEED_ROUTINE_MPN_BDIV_DBM1C(function)                          \
804   SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r, 0))
805
/* DIVREM_1: (wp, 0, s->xp, s->size, s->r) -- second argument 0, source of
   s->size limbs.  */
806 #define SPEED_ROUTINE_MPN_DIVREM_1(function)                            \
807   SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, 0, s->xp, s->size, s->r))
808
/* DIVREM_1C: as DIVREM_1 with a trailing 0 argument (presumably a
   carry-in -- confirm).  */
809 #define SPEED_ROUTINE_MPN_DIVREM_1C(function)                           \
810   SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, 0, s->xp, s->size, s->r, 0))
811
/* DIVREM_1F: (wp, s->size, s->xp, 0, s->r) -- s->size goes in the second
   argument and the source size is 0 (presumably s->size limbs of fraction
   part and no integer limbs -- confirm).  */
812 #define SPEED_ROUTINE_MPN_DIVREM_1F(function)                           \
813   SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->size, s->xp, 0, s->r))
814
/* DIVREM_1CF: as DIVREM_1F with a trailing 0 argument.  */
815 #define SPEED_ROUTINE_MPN_DIVREM_1CF(function)                          \
816   SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->size, s->xp, 0, s->r, 0))
817
818
/* Wraps SPEED_ROUTINE_MPN_UNARY_1_CALL with two extra locals that CALL may
   use, both computed once outside the timed loop:

       shift   set by count_leading_zeros (shift, s->r)
       dinv    set by invert_limb (dinv, s->r << shift)

   Returns -1.0 when s->size is negative or s->r == 0.  The nested
   SPEED_ROUTINE_MPN_UNARY_1_CALL additionally returns -1.0 when
   s->size < 1, so a size of 0 is rejected there.  Its `return' statements
   leave the enclosing function.  */
#define SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL(call)                    \
  {                                                                     \
    unsigned   shift;                                                   \
    mp_limb_t  dinv;                                                    \
                                                                        \
    SPEED_RESTRICT_COND (s->size >= 0);                                 \
    SPEED_RESTRICT_COND (s->r != 0);                                    \
                                                                        \
    /* s->r != 0 was checked above, before counting its zero bits */    \
    count_leading_zeros (shift, s->r);                                  \
    invert_limb (dinv, s->r << shift);                                  \
                                                                        \
    SPEED_ROUTINE_MPN_UNARY_1_CALL (call);                              \
  }
832
/* FUNCTION is called through (*function) as
   (wp, 0, s->xp, s->size, s->r, dinv, shift), with dinv and shift supplied
   by SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL.  */
833 #define SPEED_ROUTINE_MPN_PREINV_DIVREM_1(function)                     \
834   SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL                                \
835   ((*function) (wp, 0, s->xp, s->size, s->r, dinv, shift))
836
837 /* s->size limbs worth of fraction part: the second argument is s->size
   and the source size argument is 0.  */
838 #define SPEED_ROUTINE_MPN_PREINV_DIVREM_1F(function)                    \
839   SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL                                \
840   ((*function) (wp, s->size, s->xp, 0, s->r, dinv, shift))
841
842
843 /* s->r is duplicated to form the multiplier, defaulting to
844    MP_BASES_BIG_BASE_10.  Not sure if that's particularly useful, but at
845    least it provides some control.

   N is the number of limbs in the local multiplier array yp; every limb
   is set to s->r, or to MP_BASES_BIG_BASE_10 when s->r is 0.  FUNCTION is
   invoked as function (wp, s->xp, s->size, yp), with wp holding
   s->size + N-1 limbs.  N is used unparenthesized and as an array bound,
   so it should be a plain integer constant.

   Expands to a complete function body and expects `s' in scope.  Returns
   the speed_endtime() value, or -1.0 when s->size < N.  */
846 #define SPEED_ROUTINE_MPN_UNARY_N(function,N)                           \
847   {                                                                     \
848     mp_ptr     wp;                                                      \
849     mp_size_t  wn;                                                      \
850     unsigned   i;                                                       \
851     double     t;                                                       \
852     mp_limb_t  yp[N];                                                   \
853     TMP_DECL;                                                           \
854                                                                         \
855     SPEED_RESTRICT_COND (s->size >= N);                                 \
856                                                                         \
857     TMP_MARK;                                                           \
858     wn = s->size + N-1;                                                 \
859     SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp);                        \
860     for (i = 0; i < N; i++)                                             \
861       yp[i] = (s->r != 0 ? s->r : MP_BASES_BIG_BASE_10);                \
862                                                                         \
863     speed_operand_src (s, s->xp, s->size);                              \
864     speed_operand_src (s, yp, (mp_size_t) N);                           \
865     speed_operand_dst (s, wp, wn);                                      \
866     speed_cache_fill (s);                                               \
867     /* timed loop; i is tested after each call, so s->reps >= 1 */      \
868     speed_starttime ();                                                 \
869     i = s->reps;                                                        \
870     do                                                                  \
871       function (wp, s->xp, s->size, yp);                                \
872     while (--i != 0);                                                   \
873     t = speed_endtime ();                                               \
874                                                                         \
875     TMP_FREE;                                                           \
876     return t;                                                           \
877   }
878
/* Fixed-N instances of SPEED_ROUTINE_MPN_UNARY_N, for multipliers of 2 to
   8 limbs.  */
879 #define SPEED_ROUTINE_MPN_UNARY_2(function)                             \
880   SPEED_ROUTINE_MPN_UNARY_N (function, 2)
881 #define SPEED_ROUTINE_MPN_UNARY_3(function)                             \
882   SPEED_ROUTINE_MPN_UNARY_N (function, 3)
883 #define SPEED_ROUTINE_MPN_UNARY_4(function)                             \
884   SPEED_ROUTINE_MPN_UNARY_N (function, 4)
885 #define SPEED_ROUTINE_MPN_UNARY_5(function)                             \
886   SPEED_ROUTINE_MPN_UNARY_N (function, 5)
887 #define SPEED_ROUTINE_MPN_UNARY_6(function)                             \
888   SPEED_ROUTINE_MPN_UNARY_N (function, 6)
889 #define SPEED_ROUTINE_MPN_UNARY_7(function)                             \
890   SPEED_ROUTINE_MPN_UNARY_N (function, 7)
891 #define SPEED_ROUTINE_MPN_UNARY_8(function)                             \
892   SPEED_ROUTINE_MPN_UNARY_N (function, 8)
893
894
/* For mpn_mul, mpn_mul_basecase, xsize=r, ysize=s->size.

   The first operand has size1 limbs, where size1 is s->r, or s->size when
   s->r is 0; the second operand is s->yp with s->size limbs.  FUNCTION is
   invoked as function (wp, xp, size1, s->yp, s->size), with wp holding
   size1 + s->size limbs.

   xp is a separately allocated buffer because size1 may exceed s->size.
   It is filled from s->xp before timing starts, repeating the s->size
   source limbs as often as needed, so that FUNCTION never multiplies
   uninitialized limbs.

   Expands to a complete function body and expects `s' in scope.  Returns
   the speed_endtime() value, or -1.0 when s->size < 1 or
   size1 < s->size.  */
#define SPEED_ROUTINE_MPN_MUL(function)                                 \
  {                                                                     \
    mp_ptr    wp, xp;                                                   \
    mp_size_t size1, j;                                                 \
    unsigned  i;                                                        \
    double    t;                                                        \
    TMP_DECL;                                                           \
                                                                        \
    size1 = (s->r == 0 ? s->size : s->r);                               \
                                                                        \
    SPEED_RESTRICT_COND (s->size >= 1);                                 \
    SPEED_RESTRICT_COND (size1 >= s->size);                             \
                                                                        \
    TMP_MARK;                                                           \
    SPEED_TMP_ALLOC_LIMBS (wp, size1 + s->size, s->align_wp);           \
    SPEED_TMP_ALLOC_LIMBS (xp, size1, s->align_xp);                     \
                                                                        \
    /* s->xp supplies s->size limbs; replicate them to cover size1 */   \
    MPN_COPY (xp, s->xp, s->size);                                      \
    for (j = s->size; j < size1; j++)                                   \
      xp[j] = xp[j - s->size];                                          \
                                                                        \
    speed_operand_src (s, xp, size1);                                   \
    speed_operand_src (s, s->yp, s->size);                              \
    speed_operand_dst (s, wp, size1 + s->size);                         \
    speed_cache_fill (s);                                               \
                                                                        \
    /* timed loop; i is tested after each call, so s->reps >= 1 */      \
    speed_starttime ();                                                 \
    i = s->reps;                                                        \
    do                                                                  \
      function (wp, xp, size1, s->yp, s->size);                         \
    while (--i != 0);                                                   \
    t = speed_endtime ();                                               \
                                                                        \
    TMP_FREE;                                                           \
    return t;                                                           \
  }
928
929
/* For routines taking two s->size limb sources and writing a 2*s->size
   limb result.  CALL is an expression evaluated s->reps times inside the
   timed loop; it may refer to the local wp (the 2*s->size limb
   destination) and to `s'.  s->xp and s->yp are registered as the source
   operands.

   Expands to a complete function body.  Returns the speed_endtime() value,
   or -1.0 when s->size < 1.  */
930 #define SPEED_ROUTINE_MPN_MUL_N_CALL(call)                              \
931   {                                                                     \
932     mp_ptr    wp;                                                       \
933     unsigned  i;                                                        \
934     double    t;                                                        \
935     TMP_DECL;                                                           \
936                                                                         \
937     SPEED_RESTRICT_COND (s->size >= 1);                                 \
938                                                                         \
939     TMP_MARK;                                                           \
940     SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp);                 \
941                                                                         \
942     speed_operand_src (s, s->xp, s->size);                              \
943     speed_operand_src (s, s->yp, s->size);                              \
944     speed_operand_dst (s, wp, 2*s->size);                               \
945     speed_cache_fill (s);                                               \
946     /* timed loop; i is tested after each call, so s->reps >= 1 */      \
947     speed_starttime ();                                                 \
948     i = s->reps;                                                        \
949     do                                                                  \
950       call;                                                             \
951     while (--i != 0);                                                   \
952     t = speed_endtime ();                                               \
953                                                                         \
954     TMP_FREE;                                                           \
955     return t;                                                           \
956   }
957
/* SPEED_ROUTINE_MPN_MUL_N_CALL with FUNCTION invoked as
   function (wp, s->xp, s->yp, s->size).  The expansion is a braced block
   with no trailing semicolon, like the other wrapper macros here, so it
   can be used with or without a `;' after the invocation.  */
#define SPEED_ROUTINE_MPN_MUL_N(function)                               \
  SPEED_ROUTINE_MPN_MUL_N_CALL (function (wp, s->xp, s->yp, s->size))
960
/* For routines taking two s->size limb sources and writing an s->size
   limb result (going by the MULLO name, presumably the low half of the
   product -- confirm).  CALL is an expression evaluated s->reps times
   inside the timed loop; it may refer to the local wp (the s->size limb
   destination) and to `s'.

   Expands to a complete function body.  Returns the speed_endtime() value,
   or -1.0 when s->size < 1.  */
961 #define SPEED_ROUTINE_MPN_MULLO_N_CALL(call)                            \
962   {                                                                     \
963     mp_ptr    wp;                                                       \
964     unsigned  i;                                                        \
965     double    t;                                                        \
966     TMP_DECL;                                                           \
967                                                                         \
968     SPEED_RESTRICT_COND (s->size >= 1);                                 \
969                                                                         \
970     TMP_MARK;                                                           \
971     SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);                   \
972                                                                         \
973     speed_operand_src (s, s->xp, s->size);                              \
974     speed_operand_src (s, s->yp, s->size);                              \
975     speed_operand_dst (s, wp, s->size);                                 \
976     speed_cache_fill (s);                                               \
977     /* timed loop; i is tested after each call, so s->reps >= 1 */      \
978     speed_starttime ();                                                 \
979     i = s->reps;                                                        \
980     do                                                                  \
981       call;                                                             \
982     while (--i != 0);                                                   \
983     t = speed_endtime ();                                               \
984                                                                         \
985     TMP_FREE;                                                           \
986     return t;                                                           \
987   }
988
/* SPEED_ROUTINE_MPN_MULLO_N_CALL with FUNCTION invoked as
   function (wp, s->xp, s->yp, s->size).  The expansion is a braced block
   with no trailing semicolon, like the other wrapper macros here, so it
   can be used with or without a `;' after the invocation.  */
#define SPEED_ROUTINE_MPN_MULLO_N(function)                             \
  SPEED_ROUTINE_MPN_MULLO_N_CALL (function (wp, s->xp, s->yp, s->size))
991
992 /* For mpn_mullo_basecase or similar.  Both sources (s->xp and s->yp) and
   the destination wp are s->size limbs; s->r is not used.  FUNCTION is
   invoked as function (wp, s->xp, s->yp, s->size).

   Expands to a complete function body and expects `s' in scope.  Returns
   the speed_endtime() value, or -1.0 when s->size < 1.  */
993 #define SPEED_ROUTINE_MPN_MULLO_BASECASE(function)                      \
994   {                                                                     \
995     mp_ptr    wp;                                                       \
996     unsigned  i;                                                        \
997     double    t;                                                        \
998     TMP_DECL;                                                           \
999                                                                         \
1000     SPEED_RESTRICT_COND (s->size >= 1);                                 \
1001                                                                         \
1002     TMP_MARK;                                                           \
1003     SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);                   \
1004                                                                         \
1005     speed_operand_src (s, s->xp, s->size);                              \
1006     speed_operand_src (s, s->yp, s->size);                              \
1007     speed_operand_dst (s, wp, s->size);                                 \
1008     speed_cache_fill (s);                                               \
1009     /* timed loop; i is tested after each call, so s->reps >= 1 */      \
1010     speed_starttime ();                                                 \
1011     i = s->reps;                                                        \
1012     do                                                                  \
1013       function (wp, s->xp, s->yp, s->size);                             \
1014     while (--i != 0);                                                   \
1015     t = speed_endtime ();                                               \
1016                                                                         \
1017     TMP_FREE;                                                           \
1018     return t;                                                           \
1019   }
1020
/* Harness for timing CALL.  CALL is an expression that may use wp (a
   2*s->size limb destination) and tp (scratch of
   mpn_mulmod_bnm1_itch (s->size, s->size, s->size) limbs) together with
   s->xp and s->yp.  The local names wp, tp, itch, i and t are visible to
   CALL.  Expands to a braced function body returning the value of
   speed_endtime ().  */
#define SPEED_ROUTINE_MPN_MULMOD_BNM1_CALL(call)                        \
  {                                                                     \
    mp_ptr    wp;                                                       \
    mp_ptr    tp;                                                       \
    mp_size_t itch;                                                     \
    unsigned  i;                                                        \
    double    t;                                                        \
    TMP_DECL;                                                           \
                                                                        \
    SPEED_RESTRICT_COND (s->size >= 1);                                 \
                                                                        \
    /* scratch requirement for equal-sized operands and modulus */      \
    itch = mpn_mulmod_bnm1_itch (s->size, s->size, s->size);            \
                                                                        \
    TMP_MARK;                                                           \
    SPEED_TMP_ALLOC_LIMBS (wp, 2 * s->size, s->align_wp);               \
    SPEED_TMP_ALLOC_LIMBS (tp, itch, s->align_wp2);                     \
                                                                        \
    speed_operand_src (s, s->xp, s->size);                              \
    speed_operand_src (s, s->yp, s->size);                              \
    speed_operand_dst (s, wp, 2 * s->size);                             \
    speed_operand_dst (s, tp, itch);                                    \
    speed_cache_fill (s);                                               \
                                                                        \
    speed_starttime ();                                                 \
    for (i = s->reps; ; )                                               \
      {                                                                 \
        call;                                                           \
        if (--i == 0)                                                   \
          break;                                                        \
      }                                                                 \
    t = speed_endtime ();                                               \
                                                                        \
    TMP_FREE;                                                           \
    return t;                                                           \
  }
/* Time FUNCTION called as
   function (dst, rn, s->xp, s->size, s->yp, s->size, scratch), where
   rn = mpn_mulmod_bnm1_next_size (s->size), dst has rn limbs and scratch
   has mpn_mulmod_bnm1_itch (rn, rn, rn) limbs.  Expands to a braced
   function body returning the value of speed_endtime ().  */
#define SPEED_ROUTINE_MPN_MULMOD_BNM1_ROUNDED(function)                 \
  {                                                                     \
    mp_size_t rn, scratch_limbs;                                        \
    mp_ptr    dst, scratch;                                             \
    unsigned  remaining;                                                \
    double    elapsed;                                                  \
    TMP_DECL;                                                           \
                                                                        \
    SPEED_RESTRICT_COND (s->size >= 1);                                 \
                                                                        \
    rn = mpn_mulmod_bnm1_next_size (s->size);                           \
    scratch_limbs = mpn_mulmod_bnm1_itch (rn, rn, rn);                  \
                                                                        \
    TMP_MARK;                                                           \
    SPEED_TMP_ALLOC_LIMBS (dst, rn, s->align_wp);                       \
    SPEED_TMP_ALLOC_LIMBS (scratch, scratch_limbs, s->align_wp2);       \
                                                                        \
    speed_operand_src (s, s->xp, s->size);                              \
    speed_operand_src (s, s->yp, s->size);                              \
    speed_operand_dst (s, dst, rn);                                     \
    speed_operand_dst (s, scratch, scratch_limbs);                      \
    speed_cache_fill (s);                                               \
                                                                        \
    speed_starttime ();                                                 \
    remaining = s->reps;                                                \
    do {                                                                \
      function (dst, rn, s->xp, s->size, s->yp, s->size, scratch);      \
      remaining--;                                                      \
    } while (remaining != 0);                                           \
    elapsed = speed_endtime ();                                         \
                                                                        \
    TMP_FREE;                                                           \
    return elapsed;                                                     \
  }
1086
/* Harness for timing a multiplication CALL that needs scratch space.
   CALL is an expression that may use wp (a 2*s->size limb destination)
   and tspace (TSIZE limbs of scratch) together with s->xp and s->yp.
   MINSIZE is the smallest s->size admitted by SPEED_RESTRICT_COND.
   TSIZE and MINSIZE are parenthesized in the expansion so arbitrary
   expressions can be passed; note that TSIZE is evaluated twice.
   Expands to a braced function body returning the value of
   speed_endtime ().  */
#define SPEED_ROUTINE_MPN_MUL_N_TSPACE(call, tsize, minsize)            \
  {                                                                     \
    mp_ptr    wp, tspace;                                               \
    unsigned  i;                                                        \
    double    t;                                                        \
    TMP_DECL;                                                           \
                                                                        \
    SPEED_RESTRICT_COND (s->size >= (minsize));                         \
                                                                        \
    TMP_MARK;                                                           \
    SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp);                 \
    SPEED_TMP_ALLOC_LIMBS (tspace, (tsize), s->align_wp2);              \
                                                                        \
    speed_operand_src (s, s->xp, s->size);                              \
    speed_operand_src (s, s->yp, s->size);                              \
    speed_operand_dst (s, wp, 2*s->size);                               \
    speed_operand_dst (s, tspace, (tsize));                             \
    speed_cache_fill (s);                                               \
                                                                        \
    speed_starttime ();                                                 \
    i = s->reps;                                                        \
    do                                                                  \
      call;                                                             \
    while (--i != 0);                                                   \
    t = speed_endtime ();                                               \
                                                                        \
    TMP_FREE;                                                           \
    return t;                                                           \
  }
1116
/* FUNCTION with both operands s->size limbs; scratch is sized by
   mpn_toom22_mul_itch and s->size must be at least
   MPN_TOOM22_MUL_MINSIZE.  */
#define SPEED_ROUTINE_MPN_TOOM22_MUL_N(function)                        \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE (                                      \
    function (wp, s->xp, s->size, s->yp, s->size, tspace),              \
    mpn_toom22_mul_itch (s->size, s->size),                             \
    MPN_TOOM22_MUL_MINSIZE)
1122
/* FUNCTION with both operands s->size limbs; scratch is sized by
   mpn_toom33_mul_itch and s->size must be at least
   MPN_TOOM33_MUL_MINSIZE.  */
#define SPEED_ROUTINE_MPN_TOOM33_MUL_N(function)                        \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE (                                      \
    function (wp, s->xp, s->size, s->yp, s->size, tspace),              \
    mpn_toom33_mul_itch (s->size, s->size),                             \
    MPN_TOOM33_MUL_MINSIZE)
1128
/* FUNCTION with both operands s->size limbs; scratch is sized by
   mpn_toom44_mul_itch and s->size must be at least
   MPN_TOOM44_MUL_MINSIZE.  */
#define SPEED_ROUTINE_MPN_TOOM44_MUL_N(function)                        \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE (                                      \
    function (wp, s->xp, s->size, s->yp, s->size, tspace),              \
    mpn_toom44_mul_itch (s->size, s->size),                             \
    MPN_TOOM44_MUL_MINSIZE)
1134
/* FUNCTION with both operands s->size limbs; scratch is sized by
   mpn_toom6h_mul_itch and s->size must be at least
   MPN_TOOM6H_MUL_MINSIZE.  */
#define SPEED_ROUTINE_MPN_TOOM6H_MUL_N(function)                        \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE (                                      \
    function (wp, s->xp, s->size, s->yp, s->size, tspace),              \
    mpn_toom6h_mul_itch (s->size, s->size),                             \
    MPN_TOOM6H_MUL_MINSIZE)
1140
/* FUNCTION with both operands s->size limbs; scratch is sized by
   mpn_toom8h_mul_itch and s->size must be at least
   MPN_TOOM8H_MUL_MINSIZE.  */
#define SPEED_ROUTINE_MPN_TOOM8H_MUL_N(function)                        \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE (                                      \
    function (wp, s->xp, s->size, s->yp, s->size, tspace),              \
    mpn_toom8h_mul_itch (s->size, s->size),                             \
    MPN_TOOM8H_MUL_MINSIZE)
1146
/* FUNCTION with an s->size limb first operand and a 2*s->size/3 limb
   second operand; scratch is sized by mpn_toom32_mul_itch and s->size
   must be at least MPN_TOOM32_MUL_MINSIZE.  */
#define SPEED_ROUTINE_MPN_TOOM32_MUL(function)                          \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE (                                      \
    function (wp, s->xp, s->size, s->yp, (2 * s->size) / 3, tspace),    \
    mpn_toom32_mul_itch (s->size, (2 * s->size) / 3),                   \
    MPN_TOOM32_MUL_MINSIZE)
1152
/* FUNCTION with an s->size limb first operand and an s->size/2 limb
   second operand; scratch is sized by mpn_toom42_mul_itch and s->size
   must be at least MPN_TOOM42_MUL_MINSIZE.  */
#define SPEED_ROUTINE_MPN_TOOM42_MUL(function)                          \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE (                                      \
    function (wp, s->xp, s->size, s->yp, s->size / 2, tspace),          \
    mpn_toom42_mul_itch (s->size, s->size / 2),                         \
    MPN_TOOM42_MUL_MINSIZE)
1158
/* FUNCTION with an s->size limb first operand and an s->size*3/4 limb
   second operand; scratch is sized by mpn_toom43_mul_itch and s->size
   must be at least MPN_TOOM43_MUL_MINSIZE.  */
#define SPEED_ROUTINE_MPN_TOOM43_MUL(function)                          \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE (                                      \
    function (wp, s->xp, s->size, s->yp, (s->size * 3) / 4, tspace),    \
    mpn_toom43_mul_itch (s->size, (s->size * 3) / 4),                   \
    MPN_TOOM43_MUL_MINSIZE)
1164
/* FUNCTION with an s->size limb first operand and an s->size/2 limb
   second operand; scratch is sized by mpn_toom63_mul_itch and s->size
   must be at least MPN_TOOM63_MUL_MINSIZE.  */
#define SPEED_ROUTINE_MPN_TOOM63_MUL(function)                          \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE (                                      \
    function (wp, s->xp, s->size, s->yp, s->size / 2, tspace),          \
    mpn_toom63_mul_itch (s->size, s->size / 2),                         \
    MPN_TOOM63_MUL_MINSIZE)
1170
/* FUNCTION with an s->size limb first operand and a 17*s->size/24 limb
   second operand, using the toom32 scratch size and minimum size.  The
   same operand shape is used by SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM32_MUL.  */
#define SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM43_MUL(function)               \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE (                                      \
    function (wp, s->xp, s->size, s->yp, (17 * s->size) / 24, tspace),  \
    mpn_toom32_mul_itch (s->size, (17 * s->size) / 24),                 \
    MPN_TOOM32_MUL_MINSIZE)
/* FUNCTION with an s->size limb first operand and a 17*s->size/24 limb
   second operand, using the toom43 scratch size and minimum size.  The
   same operand shape is used by SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM43_MUL.  */
#define SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM32_MUL(function)               \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE (                                      \
    function (wp, s->xp, s->size, s->yp, (17 * s->size) / 24, tspace),  \
    mpn_toom43_mul_itch (s->size, (17 * s->size) / 24),                 \
    MPN_TOOM43_MUL_MINSIZE)
1181
/* FUNCTION with an s->size limb first operand and a 19*s->size/30 limb
   second operand, using the toom32 scratch size and minimum size.  The
   same operand shape is used by SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM32_MUL.  */
#define SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM53_MUL(function)               \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE (                                      \
    function (wp, s->xp, s->size, s->yp, (19 * s->size) / 30, tspace),  \
    mpn_toom32_mul_itch (s->size, (19 * s->size) / 30),                 \
    MPN_TOOM32_MUL_MINSIZE)
/* FUNCTION with an s->size limb first operand and a 19*s->size/30 limb
   second operand, using the toom53 scratch size and minimum size.  The
   same operand shape is used by SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM53_MUL.  */
#define SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM32_MUL(function)               \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE (                                      \
    function (wp, s->xp, s->size, s->yp, (19 * s->size) / 30, tspace),  \
    mpn_toom53_mul_itch (s->size, (19 * s->size) / 30),                 \
    MPN_TOOM53_MUL_MINSIZE)
1192
/* FUNCTION with an s->size limb first operand and an 11*s->size/20 limb
   second operand, using the toom42 scratch size and minimum size.  The
   same operand shape is used by SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM42_MUL.  */
#define SPEED_ROUTINE_MPN_TOOM42_FOR_TOOM53_MUL(function)               \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE (                                      \
    function (wp, s->xp, s->size, s->yp, (11 * s->size) / 20, tspace),  \
    mpn_toom42_mul_itch (s->size, (11 * s->size) / 20),                 \
    MPN_TOOM42_MUL_MINSIZE)
/* FUNCTION with an s->size limb first operand and an 11*s->size/20 limb
   second operand, using the toom53 scratch size and minimum size.  The
   same operand shape is used by SPEED_ROUTINE_MPN_TOOM42_FOR_TOOM53_MUL.  */
#define SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM42_MUL(function)               \
  SPEED_ROUTINE_MPN_MUL_N_TSPACE (                                      \
    function (wp, s->xp, s->size, s->yp, (11 * s->size) / 20, tspace),  \
    mpn_toom53_mul_itch (s->size, (11 * s->size) / 20),                 \
    MPN_TOOM53_MUL_MINSIZE)
1203
1204
1205
/* Harness for timing a squaring CALL.  CALL is an expression that may use
   wp (a 2*s->size limb destination) together with s->xp.  The local names
   wp, i and t are visible to CALL.  Expands to a braced function body
   returning the value of speed_endtime ().  */
#define SPEED_ROUTINE_MPN_SQR_CALL(call)                                \
  {                                                                     \
    double    t;                                                        \
    unsigned  i;                                                        \
    mp_ptr    wp;                                                       \
    TMP_DECL;                                                           \
                                                                        \
    SPEED_RESTRICT_COND (s->size >= 1);                                 \
                                                                        \
    TMP_MARK;                                                           \
    SPEED_TMP_ALLOC_LIMBS (wp, 2 * s->size, s->align_wp);               \
                                                                        \
    speed_operand_src (s, s->xp, s->size);                              \
    speed_operand_dst (s, wp, 2 * s->size);                             \
    speed_cache_fill (s);                                               \
                                                                        \
    speed_starttime ();                                                 \
    for (i = s->reps; ; )                                               \
      {                                                                 \
        call;                                                           \
        if (--i == 0)                                                   \
          break;                                                        \
      }                                                                 \
    t = speed_endtime ();                                               \
                                                                        \
    TMP_FREE;                                                           \
    return t;                                                           \
  }
1232
/* Time FUNCTION called as function (wp, s->xp, s->size) inside the
   SPEED_ROUTINE_MPN_SQR_CALL harness.  */
#define SPEED_ROUTINE_MPN_SQR(function)                                 \
  SPEED_ROUTINE_MPN_SQR_CALL (                                          \
    function (wp, s->xp, s->size))
1235
/* Same measurement as SPEED_ROUTINE_MPN_SQR, under a separate name.  */
#define SPEED_ROUTINE_MPN_SQR_DIAGONAL(function) \
  SPEED_ROUTINE_MPN_SQR (function)
1238
1239
/* Harness for timing a squaring CALL that needs scratch space.  CALL is an
   expression that may use wp (a 2*s->size limb destination) and tspace
   (TSIZE limbs of scratch) together with s->xp.  MINSIZE is the smallest
   s->size admitted by SPEED_RESTRICT_COND.  TSIZE and MINSIZE are
   parenthesized in the expansion so arbitrary expressions can be passed;
   note that TSIZE is evaluated twice.  Expands to a braced function body
   returning the value of speed_endtime ().  */
#define SPEED_ROUTINE_MPN_SQR_TSPACE(call, tsize, minsize)              \
  {                                                                     \
    mp_ptr    wp, tspace;                                               \
    unsigned  i;                                                        \
    double    t;                                                        \
    TMP_DECL;                                                           \
                                                                        \
    SPEED_RESTRICT_COND (s->size >= (minsize));                         \
                                                                        \
    TMP_MARK;                                                           \
    SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp);                 \
    SPEED_TMP_ALLOC_LIMBS (tspace, (tsize), s->align_wp2);              \
                                                                        \
    speed_operand_src (s, s->xp, s->size);                              \
    speed_operand_dst (s, wp, 2*s->size);                               \
    speed_operand_dst (s, tspace, (tsize));                             \
    speed_cache_fill (s);                                               \
                                                                        \
    speed_starttime ();                                                 \
    i = s->reps;                                                        \
    do                                                                  \
      call;                                                             \
    while (--i != 0);                                                   \
    t = speed_endtime ();                                               \
                                                                        \
    TMP_FREE;                                                           \
    return t;                                                           \
  }
1268
/* FUNCTION squaring s->size limbs; scratch is sized by mpn_toom2_sqr_itch
   and s->size must be at least MPN_TOOM2_SQR_MINSIZE.  */
#define SPEED_ROUTINE_MPN_TOOM2_SQR(function)                           \
  SPEED_ROUTINE_MPN_SQR_TSPACE (                                        \
    function (wp, s->xp, s->size, tspace),                              \
    mpn_toom2_sqr_itch (s->size),                                       \
    MPN_TOOM2_SQR_MINSIZE)
1273
/* FUNCTION squaring s->size limbs; scratch is sized by mpn_toom3_sqr_itch
   and s->size must be at least MPN_TOOM3_SQR_MINSIZE.  */
#define SPEED_ROUTINE_MPN_TOOM3_SQR(function)                           \
  SPEED_ROUTINE_MPN_SQR_TSPACE (                                        \
    function (wp, s->xp, s->size, tspace),                              \
    mpn_toom3_sqr_itch (s->size),                                       \
    MPN_TOOM3_SQR_MINSIZE)
1278
1279
/* FUNCTION squaring s->size limbs; scratch is sized by mpn_toom4_sqr_itch
   and s->size must be at least MPN_TOOM4_SQR_MINSIZE.  */
#define SPEED_ROUTINE_MPN_TOOM4_SQR(function)                           \
  SPEED_ROUTINE_MPN_SQR_TSPACE (                                        \
    function (wp, s->xp, s->size, tspace),                              \
    mpn_toom4_sqr_itch (s->size),                                       \
    MPN_TOOM4_SQR_MINSIZE)
1284
/* FUNCTION squaring s->size limbs; scratch is sized by mpn_toom6_sqr_itch
   and s->size must be at least MPN_TOOM6_SQR_MINSIZE.  */
#define SPEED_ROUTINE_MPN_TOOM6_SQR(function)                           \
  SPEED_ROUTINE_MPN_SQR_TSPACE (                                        \
    function (wp, s->xp, s->size, tspace),                              \
    mpn_toom6_sqr_itch (s->size),                                       \
    MPN_TOOM6_SQR_MINSIZE)
1289
/* FUNCTION squaring s->size limbs; scratch is sized by mpn_toom8_sqr_itch
   and s->size must be at least MPN_TOOM8_SQR_MINSIZE.  */
#define SPEED_ROUTINE_MPN_TOOM8_SQR(function)                           \
  SPEED_ROUTINE_MPN_SQR_TSPACE (                                        \
    function (wp, s->xp, s->size, tspace),                              \
    mpn_toom8_sqr_itch (s->size),                                       \
    MPN_TOOM8_SQR_MINSIZE)
1294
/* Harness for timing CALL with s->xp,s->size as the only registered
   operand.  No temporary memory is allocated.  The local name i is
   visible to CALL.  Expands to a braced function body returning the value
   of speed_endtime ().  */
#define SPEED_ROUTINE_MPN_MOD_CALL(call)                                \
  {                                                                     \
    unsigned   i;                                                       \
                                                                        \
    SPEED_RESTRICT_COND (s->size >= 0);                                 \
                                                                        \
    speed_operand_src (s, s->xp, s->size);                              \
    speed_cache_fill (s);                                               \
                                                                        \
    speed_starttime ();                                                 \
    for (i = s->reps; ; )                                               \
      {                                                                 \
        call;                                                           \
        if (--i == 0)                                                   \
          break;                                                        \
      }                                                                 \
                                                                        \
    return speed_endtime ();                                            \
  }
1312
/* Time (*function) (s->xp, s->size, s->r) in the MOD_CALL harness.  The
   call goes through (*function), so a function-like macro of the same
   name is not expanded.  */
#define SPEED_ROUTINE_MPN_MOD_1(function)                               \
  SPEED_ROUTINE_MPN_MOD_CALL (                                          \
    (*function) (s->xp, s->size, s->r))
1315
/* Time (*function) (s->xp, s->size, s->r, CNST_LIMB(0)) in the MOD_CALL
   harness, i.e. with a zero fourth argument.  */
#define SPEED_ROUTINE_MPN_MOD_1C(function)                              \
  SPEED_ROUTINE_MPN_MOD_CALL (                                          \
    (*function) (s->xp, s->size, s->r, CNST_LIMB(0)))
1318
/* Time function (s->xp, s->size, s->r) in the MOD_CALL harness.  The
   expansion carries no trailing semicolon of its own, matching
   SPEED_ROUTINE_MPN_MOD_1 and friends; the user of the macro may still
   write one after the invocation.  */
#define SPEED_ROUTINE_MPN_MODEXACT_1_ODD(function)                      \
  SPEED_ROUTINE_MPN_MOD_CALL (function (s->xp, s->size, s->r))
1321
/* Time function (s->xp, s->size, s->r, CNST_LIMB(0)) in the MOD_CALL
   harness, i.e. with a zero fourth argument.  The expansion carries no
   trailing semicolon of its own, matching SPEED_ROUTINE_MPN_MOD_1C; the
   user of the macro may still write one after the invocation.  */
#define SPEED_ROUTINE_MPN_MODEXACT_1C_ODD(function)                     \
  SPEED_ROUTINE_MPN_MOD_CALL (function (s->xp, s->size, s->r, CNST_LIMB(0)))
1324
/* Time (*function) (s->xp, s->size) in the MOD_CALL harness; s->r is not
   passed to the function.  */
#define SPEED_ROUTINE_MPN_MOD_34LSUB1(function)                         \
  SPEED_ROUTINE_MPN_MOD_CALL (                                          \
    (*function) (s->xp, s->size))
1327
/* Time (*function) (s->xp, s->size, s->r, rinv), where rinv is obtained
   from invert_limb on s->r once, outside the timed section.  Only run
   when s->r has its high bit (GMP_LIMB_HIGHBIT) set.  Expands to a braced
   function body returning the value of speed_endtime ().  */
#define SPEED_ROUTINE_MPN_PREINV_MOD_1(function)                        \
  {                                                                     \
    mp_limb_t  rinv;                                                    \
    unsigned   remaining;                                               \
                                                                        \
    SPEED_RESTRICT_COND (s->size >= 0);                                 \
    SPEED_RESTRICT_COND (s->r & GMP_LIMB_HIGHBIT);                      \
                                                                        \
    invert_limb (rinv, s->r);                                           \
    speed_operand_src (s, s->xp, s->size);                              \
    speed_cache_fill (s);                                               \
                                                                        \
    speed_starttime ();                                                 \
    remaining = s->reps;                                                \
    do {                                                                \
      (*function) (s->xp, s->size, s->r, rinv);                         \
      remaining--;                                                      \
    } while (remaining != 0);                                           \
                                                                        \
    return speed_endtime ();                                            \
  }
1348
/* Time PFUNC (pre, s->r) followed by FUNCTION (s->xp, s->size, s->r, pre),
   both inside the timed loop, so each repetition includes the
   precomputation.  pre is a 4 limb array, filled once by
   mpn_mod_1_1p_cps before timing starts.  Expands to a braced function
   body returning the value of speed_endtime ().  */
#define SPEED_ROUTINE_MPN_MOD_1_1(function,pfunc)                       \
  {                                                                     \
    mp_limb_t  pre[4];                                                  \
    unsigned   remaining;                                               \
                                                                        \
    SPEED_RESTRICT_COND (s->size >= 2);                                 \
                                                                        \
    mpn_mod_1_1p_cps (pre, s->r);                                       \
    speed_operand_src (s, s->xp, s->size);                              \
    speed_cache_fill (s);                                               \
                                                                        \
    speed_starttime ();                                                 \
    for (remaining = s->reps; ; )                                       \
      {                                                                 \
        pfunc (pre, s->r);                                              \
        function (s->xp, s->size, s->r, pre);                           \
        if (--remaining == 0)                                           \
          break;                                                        \
      }                                                                 \
                                                                        \
    return speed_endtime ();                                            \
  }
/* Time PFUNC (inv, s->r) followed by FUNCTION (s->xp, s->size, s->r, inv),
   both inside the timed loop, so each repetition includes the
   precomputation.  inv is an array of N+3 limbs.  Only run when
   s->r <= ~(mp_limb_t)0 / N.  N is parenthesized wherever it is used so
   that an expression may be passed for it.  Expands to a braced function
   body returning the value of speed_endtime ().  */
#define SPEED_ROUTINE_MPN_MOD_1_N(function,pfunc,N)                     \
  {                                                                     \
    unsigned   i;                                                       \
    mp_limb_t  inv[(N)+3];                                              \
                                                                        \
    SPEED_RESTRICT_COND (s->size >= 1);                                 \
    SPEED_RESTRICT_COND (s->r <= ~(mp_limb_t)0 / (N));                  \
                                                                        \
    speed_operand_src (s, s->xp, s->size);                              \
    speed_cache_fill (s);                                               \
                                                                        \
    speed_starttime ();                                                 \
    i = s->reps;                                                        \
    do {                                                                \
      pfunc (inv, s->r);                                                \
      function (s->xp, s->size, s->r, inv);                             \
    } while (--i != 0);                                                 \
                                                                        \
    return speed_endtime ();                                            \
  }
1389
1390
1391 /* A division of 2*s->size by s->size limbs */
1392
/* Harness for timing CALL as a division.  CALL is an expression that may
   use a (2*s->size limb dividend), d (s->size limb divisor), q (s->size+1
   limbs), r (s->size limbs) and dinv (set by invert_pi1 from the two high
   limbs of d).  The dividend is two copies of s->xp and the divisor a copy
   of s->yp, adjusted so that the high bit of d is set and the high limb
   of a is one less than the high limb of d.

   s->size must be at least 2: invert_pi1 is given d[s->size-2], which
   would be a read before the start of d when s->size is 1.  Expands to a
   braced function body returning the value of speed_endtime ().  */
#define SPEED_ROUTINE_MPN_DC_DIVREM_CALL(call)                          \
  {                                                                     \
    unsigned  i;                                                        \
    mp_ptr    a, d, q, r;                                               \
    double    t;                                                        \
    gmp_pi1_t dinv;                                                     \
    TMP_DECL;                                                           \
                                                                        \
    /* two divisor limbs are needed for invert_pi1 below */             \
    SPEED_RESTRICT_COND (s->size >= 2);                                 \
                                                                        \
    TMP_MARK;                                                           \
    SPEED_TMP_ALLOC_LIMBS (a, 2*s->size, s->align_xp);                  \
    SPEED_TMP_ALLOC_LIMBS (d, s->size,   s->align_yp);                  \
    SPEED_TMP_ALLOC_LIMBS (q, s->size+1, s->align_wp);                  \
    SPEED_TMP_ALLOC_LIMBS (r, s->size,   s->align_wp2);                 \
                                                                        \
    MPN_COPY (a, s->xp, s->size);                                       \
    MPN_COPY (a+s->size, s->xp, s->size);                               \
                                                                        \
    MPN_COPY (d, s->yp, s->size);                                       \
                                                                        \
    /* normalize the data */                                            \
    d[s->size-1] |= GMP_NUMB_HIGHBIT;                                   \
    a[2*s->size-1] = d[s->size-1] - 1;                                  \
                                                                        \
    invert_pi1 (dinv, d[s->size-1], d[s->size-2]);                      \
                                                                        \
    speed_operand_src (s, a, 2*s->size);                                \
    speed_operand_src (s, d, s->size);                                  \
    speed_operand_dst (s, q, s->size+1);                                \
    speed_operand_dst (s, r, s->size);                                  \
    speed_cache_fill (s);                                               \
                                                                        \
    speed_starttime ();                                                 \
    i = s->reps;                                                        \
    do                                                                  \
      call;                                                             \
    while (--i != 0);                                                   \
    t = speed_endtime ();                                               \
                                                                        \
    TMP_FREE;                                                           \
    return t;                                                           \
  }
1436
1437
1438 /* A remainder 2*s->size by s->size limbs */
1439
/* Time FUNCTION called as function (r, a, d) on mpz_t operands.  d is set
   from s->yp,s->size.  a is built from s->xp,s->size reduced mod d, shifted
   up by s->size limbs, with s->xp copied into its low limbs.  The three
   mpz_t variables are cleared after the timed section so repeated calls
   do not leak their storage.  Expands to a braced function body returning
   the value of speed_endtime ().  */
#define SPEED_ROUTINE_MPZ_MOD(function)                                 \
  {                                                                     \
    unsigned   i;                                                       \
    mpz_t      a, d, r;                                                 \
    double     t;                                                       \
                                                                        \
    SPEED_RESTRICT_COND (s->size >= 1);                                 \
                                                                        \
    mpz_init_set_n (d, s->yp, s->size);                                 \
                                                                        \
    /* high part less than d, low part a duplicate copied in */         \
    mpz_init_set_n (a, s->xp, s->size);                                 \
    mpz_mod (a, a, d);                                                  \
    mpz_mul_2exp (a, a, GMP_LIMB_BITS * s->size);                       \
    MPN_COPY (PTR(a), s->xp, s->size);                                  \
                                                                        \
    mpz_init (r);                                                       \
                                                                        \
    speed_operand_src (s, PTR(a), SIZ(a));                              \
    speed_operand_src (s, PTR(d), SIZ(d));                              \
    speed_cache_fill (s);                                               \
                                                                        \
    speed_starttime ();                                                 \
    i = s->reps;                                                        \
    do                                                                  \
      function (r, a, d);                                               \
    while (--i != 0);                                                   \
    t = speed_endtime ();                                               \
                                                                        \
    /* release the mpz_t variables initialized above */                 \
    mpz_clear (r);                                                      \
    mpz_clear (a);                                                      \
    mpz_clear (d);                                                      \
    return t;                                                           \
  }
1468
/* Expands to a brace block that times FUNCTION, called as
     function (qp, tp, size1, dp, s->size, INV)
   and returns the value of speed_endtime ().  It uses a variable "s" taken
   from the expansion context.

   The dividend has size1 limbs, where size1 is s->r, or 2*s->size when s->r
   is zero.  The divisor is s->size limbs copied from s->yp with
   GMP_NUMB_HIGHBIT set in its top limb, and the top dividend limb is set to
   one less than the top divisor limb.  s->size >= DMIN and
   size1 - s->size >= QMIN are checked with SPEED_RESTRICT_COND.

   A local gmp_pi1_t named "inv" is filled in by invert_pi1 from the two
   highest divisor limbs before timing starts.  INV is handed to FUNCTION
   unchanged; presumably callers pass a member of that local "inv" -- verify
   against the callers before renaming it.  Since dp[s->size-2] is read, DMIN
   is presumably expected to be at least 2 -- TODO confirm.

   The dividend is copied from ap to tp on every repetition, inside the timed
   region; presumably FUNCTION overwrites its dividend operand -- confirm.  */
1469 #define SPEED_ROUTINE_MPN_PI1_DIV(function, INV, DMIN, QMIN)            \
1470   {                                                                     \
1471     unsigned   i;                                                       \
1472     mp_ptr     dp, tp, ap, qp;                                          \
1473     gmp_pi1_t  inv;                                                     \
1474     double     t;                                                       \
1475     mp_size_t size1;                                                    \
1476     TMP_DECL;                                                           \
1477                                                                         \
1478     size1 = (s->r == 0 ? 2 * s->size : s->r);                           \
1479                                                                         \
1480     SPEED_RESTRICT_COND (s->size >= DMIN);                              \
1481     SPEED_RESTRICT_COND (size1 - s->size >= QMIN);                      \
1482                                                                         \
1483     TMP_MARK;                                                           \
1484     SPEED_TMP_ALLOC_LIMBS (ap, size1, s->align_xp);                     \
1485     SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);                   \
1486     SPEED_TMP_ALLOC_LIMBS (qp, size1 - s->size, s->align_wp);           \
1487     SPEED_TMP_ALLOC_LIMBS (tp, size1, s->align_wp2);                    \
1488                                                                         \
1489     /* middle dividend limbs stay unfilled when size1 > 2 * s->size */  \
1490     MPN_COPY (ap,         s->xp, s->size);                              \
1491     MPN_COPY (ap + size1 - s->size, s->xp, s->size);                    \
1492                                                                         \
1493     MPN_COPY (dp,         s->yp, s->size);                              \
1494                                                                         \
1495     /* normalize: divisor high bit set, dividend top limb just below */ \
1496     dp[s->size-1] |= GMP_NUMB_HIGHBIT;                                  \
1497     ap[size1 - 1] = dp[s->size - 1] - 1;                                \
1498                                                                         \
1499     invert_pi1 (inv, dp[s->size-1], dp[s->size-2]);                     \
1500                                                                         \
1501     speed_operand_src (s, ap, size1);                                   \
1502     speed_operand_dst (s, tp, size1);                                   \
1503     speed_operand_src (s, dp, s->size);                                 \
1504     speed_operand_dst (s, qp, size1 - s->size);                         \
1505     speed_cache_fill (s);                                               \
1506                                                                         \
1507     speed_starttime ();                                                 \
1508     i = s->reps;                                                        \
1509     do {                                                                \
1510       MPN_COPY (tp, ap, size1);                                         \
1511       function (qp, tp, size1, dp, s->size, INV);                       \
1512     } while (--i != 0);                                                 \
1513     t = speed_endtime ();                                               \
1514                                                                         \
1515     TMP_FREE;                                                           \
1516     return t;                                                           \
1517   }
/* Expands to a brace block that times FUNCTION, called as
     function (qp, tp, 2 * s->size, dp, s->size, scratch)
   and returns the value of speed_endtime ().  It uses a variable "s" taken
   from the expansion context, and requires s->size >= 2
   (SPEED_RESTRICT_COND).

   The dividend tp is 2*s->size limbs: two copies of s->xp, with the top limb
   then set to one less than the top divisor limb.  The divisor dp is s->size
   limbs copied from s->yp with GMP_NUMB_HIGHBIT set in its top limb.  The
   copy from s->yp must come before the normalization step, since nothing
   else in this block writes dp.

   scratch has itchfn (2 * s->size, s->size, 0) limbs.  */
#define SPEED_ROUTINE_MPN_MU_DIV_Q(function,itchfn)                     \
  {                                                                     \
    unsigned   i;                                                       \
    mp_ptr     dp, tp, qp, scratch;                                     \
    double     t;                                                       \
    mp_size_t itch;                                                     \
    TMP_DECL;                                                           \
                                                                        \
    SPEED_RESTRICT_COND (s->size >= 2);                                 \
                                                                        \
    itch = itchfn (2 * s->size, s->size, 0);                            \
    TMP_MARK;                                                           \
    SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);                   \
    SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp);                   \
    SPEED_TMP_ALLOC_LIMBS (tp, 2 * s->size, s->align_xp);               \
    SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2);                \
                                                                        \
    MPN_COPY (tp,         s->xp, s->size);                              \
    MPN_COPY (tp+s->size, s->xp, s->size);                              \
                                                                        \
    /* fill in the divisor before it is normalized and used */          \
    MPN_COPY (dp,         s->yp, s->size);                              \
                                                                        \
    /* normalize the data */                                            \
    dp[s->size-1] |= GMP_NUMB_HIGHBIT;                                  \
    tp[2*s->size-1] = dp[s->size-1] - 1;                                \
                                                                        \
    speed_operand_dst (s, qp, s->size);                                 \
    speed_operand_src (s, tp, 2 * s->size);                             \
    speed_operand_src (s, dp, s->size);                                 \
    speed_operand_dst (s, scratch, itch);                               \
    speed_cache_fill (s);                                               \
                                                                        \
    speed_starttime ();                                                 \
    i = s->reps;                                                        \
    do {                                                                \
      function (qp, tp, 2 * s->size, dp, s->size, scratch);             \
    } while (--i != 0);                                                 \
    t = speed_endtime ();                                               \
                                                                        \
    TMP_FREE;                                                           \
    return t;                                                           \
  }
/* Expands to a brace block that times FUNCTION, called as
     function (qp, rp, tp, size1, dp, s->size, scratch)
   and returns the value of speed_endtime ().  It uses a variable "s" taken
   from the expansion context.

   The dividend tp has size1 limbs, where size1 is s->r, or 2*s->size when
   s->r is zero.  s->size >= 2 and size1 >= s->size are checked with
   SPEED_RESTRICT_COND.  The divisor dp is s->size limbs copied from s->yp
   with GMP_NUMB_HIGHBIT set in its top limb, and the top dividend limb is
   set to one less than the top divisor limb.

   The quotient area qp has size1 - s->size limbs, the remainder area rp has
   s->size limbs, and scratch has itchfn (size1, s->size, 0) limbs.  rp is
   allocated with the same alignment setting (s->align_wp2) as scratch.  */
1558 #define SPEED_ROUTINE_MPN_MU_DIV_QR(function,itchfn)                    \
1559   {                                                                     \
1560     unsigned   i;                                                       \
1561     mp_ptr     dp, tp, qp, rp, scratch;                                 \
1562     double     t;                                                       \
1563     mp_size_t size1, itch;                                              \
1564     TMP_DECL;                                                           \
1565                                                                         \
1566     size1 = (s->r == 0 ? 2 * s->size : s->r);                           \
1567                                                                         \
1568     SPEED_RESTRICT_COND (s->size >= 2);                                 \
1569     SPEED_RESTRICT_COND (size1 >= s->size);                             \
1570                                                                         \
1571     itch = itchfn (size1, s->size, 0);                                  \
1572     TMP_MARK;                                                           \
1573     SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);                   \
1574     SPEED_TMP_ALLOC_LIMBS (qp, size1 - s->size, s->align_wp);           \
1575     SPEED_TMP_ALLOC_LIMBS (tp, size1, s->align_xp);                     \
1576     SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2);                \
1577     SPEED_TMP_ALLOC_LIMBS (rp, s->size, s->align_wp2); /* alignment? */ \
1578                                                                         \
1579     /* middle dividend limbs stay unfilled when size1 > 2 * s->size */  \
1580     MPN_COPY (tp,         s->xp, s->size);                              \
1581     MPN_COPY (tp + size1 - s->size, s->xp, s->size);                    \
1582                                                                         \
1583     MPN_COPY (dp,         s->yp, s->size);                              \
1584                                                                         \
1585     /* normalize: divisor high bit set, dividend top limb just below */ \
1586     dp[s->size-1] |= GMP_NUMB_HIGHBIT;                                  \
1587     tp[size1 - 1] = dp[s->size - 1] - 1;                                \
1588                                                                         \
1589     speed_operand_dst (s, qp, size1 - s->size);                         \
1590     speed_operand_dst (s, rp, s->size);                                 \
1591     speed_operand_src (s, tp, size1);                                   \
1592     speed_operand_src (s, dp, s->size);                                 \
1593     speed_operand_dst (s, scratch, itch);                               \
1594     speed_cache_fill (s);                                               \
1595                                                                         \
1596     speed_starttime ();                                                 \
1597     i = s->reps;                                                        \
1598     do {                                                                \
1599       function (qp, rp, tp, size1, dp, s->size, scratch);               \
1600     } while (--i != 0);                                                 \
1601     t = speed_endtime ();                                               \
1602                                                                         \
1603     TMP_FREE;                                                           \
1604     return t;                                                           \
1605   }
/* Expands to a brace block that times FUNCTION, called as
     function (qp, rp, tp, size1, dp, s->size, ip, s->size, scratch)
   and returns the value of speed_endtime ().  It uses a variable "s" taken
   from the expansion context.

   Operands are set up as follows: the dividend tp has size1 limbs (s->r, or
   2*s->size when s->r is zero), the divisor dp is s->size limbs copied from
   s->yp with GMP_NUMB_HIGHBIT set in its top limb, and the top dividend limb
   is one less than the top divisor limb.  s->size >= 2 and size1 >= s->size
   are checked with SPEED_RESTRICT_COND.

   ip is s->size limbs filled in by mpn_invert (ip, dp, s->size, NULL) before
   timing starts, so the inversion itself is not part of the measurement.
   rp and ip are allocated with the same alignment setting (s->align_wp2) as
   scratch, which has itchfn (size1, s->size, 0) limbs.  */
1606 #define SPEED_ROUTINE_MPN_MUPI_DIV_QR(function,itchfn)                  \
1607   {                                                                     \
1608     unsigned   i;                                                       \
1609     mp_ptr     dp, tp, qp, rp, ip, scratch;                             \
1610     double     t;                                                       \
1611     mp_size_t size1, itch;                                              \
1612     TMP_DECL;                                                           \
1613                                                                         \
1614     size1 = (s->r == 0 ? 2 * s->size : s->r);                           \
1615                                                                         \
1616     SPEED_RESTRICT_COND (s->size >= 2);                                 \
1617     SPEED_RESTRICT_COND (size1 >= s->size);                             \
1618                                                                         \
1619     itch = itchfn (size1, s->size, 0);                                  \
1620     TMP_MARK;                                                           \
1621     SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);                   \
1622     SPEED_TMP_ALLOC_LIMBS (qp, size1 - s->size, s->align_wp);           \
1623     SPEED_TMP_ALLOC_LIMBS (tp, size1, s->align_xp);                     \
1624     SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2);                \
1625     SPEED_TMP_ALLOC_LIMBS (rp, s->size, s->align_wp2); /* alignment? */ \
1626     SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_wp2); /* alignment? */ \
1627                                                                         \
1628     /* middle dividend limbs stay unfilled when size1 > 2 * s->size */  \
1629     MPN_COPY (tp,         s->xp, s->size);                              \
1630     MPN_COPY (tp + size1 - s->size, s->xp, s->size);                    \
1631                                                                         \
1632     MPN_COPY (dp,         s->yp, s->size);                              \
1633                                                                         \
1634     /* normalize: divisor high bit set, dividend top limb just below */ \
1635     dp[s->size-1] |= GMP_NUMB_HIGHBIT;                                  \
1636     tp[size1 - 1] = dp[s->size-1] - 1;                                  \
1637                                                                         \
1638     mpn_invert (ip, dp, s->size, NULL);                                 \
1639                                                                         \
1640     speed_operand_dst (s, qp, size1 - s->size);                         \
1641     speed_operand_dst (s, rp, s->size);                                 \
1642     speed_operand_src (s, tp, size1);                                   \
1643     speed_operand_src (s, dp, s->size);                                 \
1644     speed_operand_src (s, ip, s->size);                                 \
1645     speed_operand_dst (s, scratch, itch);                               \
1646     speed_cache_fill (s);                                               \
1647                                                                         \
1648     speed_starttime ();                                                 \
1649     i = s->reps;                                                        \
1650     do {                                                                \
1651       function (qp, rp, tp, size1, dp, s->size, ip, s->size, scratch);  \
1652     } while (--i != 0);                                                 \
1653     t = speed_endtime ();                                               \
1654                                                                         \
1655     TMP_FREE;                                                           \
1656     return t;                                                           \
1657   }
1658
/* Expands to a brace block that times FUNCTION, called as
     function (qp, tp, 2*s->size, dp, s->size, inv)
   and returns the value of speed_endtime ().  It uses a variable "s" taken
   from the expansion context, and requires s->size >= 1
   (SPEED_RESTRICT_COND).

   The dividend ap is 2*s->size limbs holding two copies of s->xp.  The
   divisor dp is s->size limbs copied from s->yp with its lowest bit set so
   that it is odd.  inv is the negation of the value binvert_limb produces
   for dp[0].

   The dividend is copied from ap to tp on every repetition, inside the timed
   region; presumably FUNCTION overwrites its dividend operand -- confirm.  */
1659 #define SPEED_ROUTINE_MPN_PI1_BDIV_QR(function)                         \
1660   {                                                                     \
1661     unsigned   i;                                                       \
1662     mp_ptr     dp, tp, ap, qp;                                          \
1663     mp_limb_t  inv;                                                     \
1664     double     t;                                                       \
1665     TMP_DECL;                                                           \
1666                                                                         \
1667     SPEED_RESTRICT_COND (s->size >= 1);                                 \
1668                                                                         \
1669     TMP_MARK;                                                           \
1670     SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size, s->align_xp);                 \
1671     SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);                   \
1672     SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp);                   \
1673     SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size, s->align_wp2);                \
1674                                                                         \
1675     MPN_COPY (ap,         s->xp, s->size);                              \
1676     MPN_COPY (ap+s->size, s->xp, s->size);                              \
1677                                                                         \
1678     /* divisor must be odd, so force its lowest bit on */               \
1679     MPN_COPY (dp, s->yp, s->size);                                      \
1680     dp[0] |= 1;                                                         \
1681     binvert_limb (inv, dp[0]);                                          \
1682     inv = -inv;                                                         \
1683                                                                         \
1684     speed_operand_src (s, ap, 2*s->size);                               \
1685     speed_operand_dst (s, tp, 2*s->size);                               \
1686     speed_operand_src (s, dp, s->size);                                 \
1687     speed_operand_dst (s, qp, s->size);                                 \
1688     speed_cache_fill (s);                                               \
1689                                                                         \
1690     speed_starttime ();                                                 \
1691     i = s->reps;                                                        \
1692     do {                                                                \
1693       MPN_COPY (tp, ap, 2*s->size);                                     \
1694       function (qp, tp, 2*s->size, dp, s->size, inv);                   \
1695     } while (--i != 0);                                                 \
1696     t = speed_endtime ();                                               \
1697                                                                         \
1698     TMP_FREE;                                                           \
1699     return t;                                                           \
1700   }
/* Expands to a brace block that times FUNCTION, called as
     function (qp, tp, s->size, dp, s->size, inv)
   and returns the value of speed_endtime ().  It uses a variable "s" taken
   from the expansion context, and requires s->size >= 1
   (SPEED_RESTRICT_COND).

   Dividend and divisor are both s->size limbs.  The divisor dp is copied
   from s->yp with its lowest bit set so that it is odd, and inv is the
   negation of the value binvert_limb produces for dp[0].

   The dividend is copied from s->xp to tp on every repetition, inside the
   timed region; presumably FUNCTION overwrites its dividend operand --
   confirm.  */
1701 #define SPEED_ROUTINE_MPN_PI1_BDIV_Q(function)                          \
1702   {                                                                     \
1703     unsigned   i;                                                       \
1704     mp_ptr     dp, tp, qp;                                              \
1705     mp_limb_t  inv;                                                     \
1706     double     t;                                                       \
1707     TMP_DECL;                                                           \
1708                                                                         \
1709     SPEED_RESTRICT_COND (s->size >= 1);                                 \
1710                                                                         \
1711     TMP_MARK;                                                           \
1712     SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);                   \
1713     SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp);                   \
1714     SPEED_TMP_ALLOC_LIMBS (tp, s->size, s->align_wp2);                  \
1715                                                                         \
1716     /* divisor must be odd, so force its lowest bit on */               \
1717     MPN_COPY (dp, s->yp, s->size);                                      \
1718     dp[0] |= 1;                                                         \
1719     binvert_limb (inv, dp[0]);                                          \
1720     inv = -inv;                                                         \
1721                                                                         \
1722     speed_operand_src (s, s->xp, s->size);                              \
1723     speed_operand_dst (s, tp, s->size);                                 \
1724     speed_operand_src (s, dp, s->size);                                 \
1725     speed_operand_dst (s, qp, s->size);                                 \
1726     speed_cache_fill (s);                                               \
1727                                                                         \
1728     speed_starttime ();                                                 \
1729     i = s->reps;                                                        \
1730     do {                                                                \
1731       MPN_COPY (tp, s->xp, s->size);                                    \
1732       function (qp, tp, s->size, dp, s->size, inv);                     \
1733     } while (--i != 0);                                                 \
1734     t = speed_endtime ();                                               \
1735                                                                         \
1736     TMP_FREE;                                                           \
1737     return t;                                                           \
1738   }
/* Expands to a brace block that times FUNCTION, called as
     function (qp, s->xp, s->size, dp, s->size, scratch)
   and returns the value of speed_endtime ().  It uses a variable "s" taken
   from the expansion context, and requires s->size >= 2
   (SPEED_RESTRICT_COND).

   The dividend is s->xp used in place, without a private copy.  The divisor
   dp is s->size limbs copied from s->yp with its lowest bit set so that it
   is odd.  scratch has itchfn (s->size, s->size) limbs.  */
1739 #define SPEED_ROUTINE_MPN_MU_BDIV_Q(function,itchfn)                    \
1740   {                                                                     \
1741     unsigned   i;                                                       \
1742     mp_ptr     dp, qp, scratch;                                         \
1743     double     t;                                                       \
1744     mp_size_t itch;                                                     \
1745     TMP_DECL;                                                           \
1746                                                                         \
1747     SPEED_RESTRICT_COND (s->size >= 2);                                 \
1748                                                                         \
1749     itch = itchfn (s->size, s->size);                                   \
1750     TMP_MARK;                                                           \
1751     SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);                   \
1752     SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp);                   \
1753     SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2);                \
1754                                                                         \
1755     /* divisor must be odd, so force its lowest bit on */               \
1756     MPN_COPY (dp, s->yp, s->size);                                      \
1757     dp[0] |= 1;                                                         \
1758                                                                         \
1759     speed_operand_dst (s, qp, s->size);                                 \
1760     speed_operand_src (s, s->xp, s->size);                              \
1761     speed_operand_src (s, dp, s->size);                                 \
1762     speed_operand_dst (s, scratch, itch);                               \
1763     speed_cache_fill (s);                                               \
1764                                                                         \
1765     speed_starttime ();                                                 \
1766     i = s->reps;                                                        \
1767     do {                                                                \
1768       function (qp, s->xp, s->size, dp, s->size, scratch);              \
1769     } while (--i != 0);                                                 \
1770     t = speed_endtime ();                                               \
1771                                                                         \
1772     TMP_FREE;                                                           \
1773     return t;                                                           \
1774   }
/* Expands to a brace block that times FUNCTION, called as
     function (qp, rp, tp, 2 * s->size, dp, s->size, scratch)
   and returns the value of speed_endtime ().  It uses a variable "s" taken
   from the expansion context, and requires s->size >= 2
   (SPEED_RESTRICT_COND).

   The dividend tp is 2*s->size limbs holding two copies of s->xp.  The
   divisor dp is s->size limbs copied from s->yp with its lowest bit set so
   that it is odd.  qp and rp are s->size limbs each; rp is allocated with
   the same alignment setting (s->align_wp2) as scratch, which has
   itchfn (2 * s->size, s->size) limbs.  */
1775 #define SPEED_ROUTINE_MPN_MU_BDIV_QR(function,itchfn)                   \
1776   {                                                                     \
1777     unsigned   i;                                                       \
1778     mp_ptr     dp, tp, qp, rp, scratch;                                 \
1779     double     t;                                                       \
1780     mp_size_t itch;                                                     \
1781     TMP_DECL;                                                           \
1782                                                                         \
1783     SPEED_RESTRICT_COND (s->size >= 2);                                 \
1784                                                                         \
1785     itch = itchfn (2 * s->size, s->size);                               \
1786     TMP_MARK;                                                           \
1787     SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp);                   \
1788     SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp);                   \
1789     SPEED_TMP_ALLOC_LIMBS (tp, 2 * s->size, s->align_xp);               \
1790     SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2);                \
1791     SPEED_TMP_ALLOC_LIMBS (rp, s->size, s->align_wp2); /* alignment? */ \
1792                                                                         \
1793     MPN_COPY (tp,         s->xp, s->size);                              \
1794     MPN_COPY (tp+s->size, s->xp, s->size);                              \
1795                                                                         \
1796     /* divisor must be odd, so force its lowest bit on */               \
1797     MPN_COPY (dp, s->yp, s->size);                                      \
1798     dp[0] |= 1;                                                         \
1799                                                                         \
1800     speed_operand_dst (s, qp, s->size);                                 \
1801     speed_operand_dst (s, rp, s->size);                                 \
1802     speed_operand_src (s, tp, 2 * s->size);                             \
1803     speed_operand_src (s, dp, s->size);                                 \
1804     speed_operand_dst (s, scratch, itch);                               \
1805     speed_cache_fill (s);                                               \
1806                                                                         \
1807     speed_starttime ();                                                 \
1808     i = s->reps;                                                        \
1809     do {                                                                \
1810       function (qp, rp, tp, 2 * s->size, dp, s->size, scratch);         \
1811     } while (--i != 0);                                                 \
1812     t = speed_endtime ();                                               \
1813                                                                         \
1814     TMP_FREE;                                                           \
1815     return t;                                                           \
1816   }
1817
/* Expands to a brace block that times FUNCTION, called as
     function (ip, up, s->size, tp)
   and returns the value of speed_endtime ().  It uses a variable "s" taken
   from the expansion context, and requires s->size >= 1
   (SPEED_RESTRICT_COND).

   The input up is s->size limbs copied from s->xp with GMP_NUMB_HIGHBIT set
   in its top limb.  ip is the s->size limb result area.  tp is a scratch
   area of itchfn (s->size) limbs, of which only the first s->size limbs are
   registered with speed_operand_dst.  */
1818 #define SPEED_ROUTINE_MPN_INVERT(function,itchfn)                       \
1819   {                                                                     \
1820     long  i;                                                            \
1821     mp_ptr    up, tp, ip;                                               \
1822     double    t;                                                        \
1823     TMP_DECL;                                                           \
1824                                                                         \
1825     SPEED_RESTRICT_COND (s->size >= 1);                                 \
1826                                                                         \
1827     TMP_MARK;                                                           \
1828     SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp);                   \
1829     SPEED_TMP_ALLOC_LIMBS (up, s->size,   s->align_yp);                 \
1830     SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp);          \
1831                                                                         \
1832     MPN_COPY (up, s->xp, s->size);                                      \
1833                                                                         \
1834     /* normalize the data: set the high bit of the top limb */          \
1835     up[s->size-1] |= GMP_NUMB_HIGHBIT;                                  \
1836                                                                         \
1837     speed_operand_src (s, up, s->size);                                 \
1838     speed_operand_dst (s, tp, s->size);                                 \
1839     speed_operand_dst (s, ip, s->size);                                 \
1840     speed_cache_fill (s);                                               \
1841                                                                         \
1842     speed_starttime ();                                                 \
1843     i = s->reps;                                                        \
1844     do                                                                  \
1845       function (ip, up, s->size, tp);                                   \
1846     while (--i != 0);                                                   \
1847     t = speed_endtime ();                                               \
1848                                                                         \
1849     TMP_FREE;                                                           \
1850     return t;                                                           \
1851   }
1852
/* Expands to a brace block that times FUNCTION, called as
     function (ip, up, s->size, tp)
   and returns the value of speed_endtime ().  It uses a variable "s" taken
   from the expansion context, and requires s->size >= 1
   (SPEED_RESTRICT_COND).

   The operand setup is the same as in SPEED_ROUTINE_MPN_INVERT: up is
   s->size limbs copied from s->xp with GMP_NUMB_HIGHBIT set in its top limb,
   ip is the s->size limb result area, and tp is a scratch area of
   itchfn (s->size) limbs, of which only the first s->size limbs are
   registered with speed_operand_dst.  */
1853 #define SPEED_ROUTINE_MPN_INVERTAPPR(function,itchfn)                   \
1854   {                                                                     \
1855     long  i;                                                            \
1856     mp_ptr    up, tp, ip;                                               \
1857     double    t;                                                        \
1858     TMP_DECL;                                                           \
1859                                                                         \
1860     SPEED_RESTRICT_COND (s->size >= 1);                                 \
1861                                                                         \
1862     TMP_MARK;                                                           \
1863     SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp);                   \
1864     SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp);                   \
1865     SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp);          \
1866                                                                         \
1867     MPN_COPY (up, s->xp, s->size);                                      \
1868                                                                         \
1869     /* normalize the data: set the high bit of the top limb */          \
1870     up[s->size-1] |= GMP_NUMB_HIGHBIT;                                  \
1871                                                                         \
1872     speed_operand_src (s, up, s->size);                                 \
1873     speed_operand_dst (s, tp, s->size);                                 \
1874     speed_operand_dst (s, ip, s->size);                                 \
1875     speed_cache_fill (s);                                               \
1876                                                                         \
1877     speed_starttime ();                                                 \
1878     i = s->reps;                                                        \
1879     do                                                                  \
1880       function (ip, up, s->size, tp);                                   \
1881     while (--i != 0);                                                   \
1882     t = speed_endtime ();                                               \
1883                                                                         \
1884     TMP_FREE;                                                           \
1885     return t;                                                           \
1886   }
1887
/* Time FUNCTION (ip, up, size, tp), where up is a private copy of s->xp
   with GMP_NUMB_HIGHBIT set in its top limb and tp is scratch space of
   ITCHFN (size) limbs.  Guarded by SPEED_RESTRICT_COND (s->size >= 3).  */
#define SPEED_ROUTINE_MPN_NI_INVERTAPPR(function,itchfn)                \
  {                                                                     \
    mp_ptr  inv, opnd, scratch;                                         \
    long    left;                                                       \
    double  elapsed;                                                    \
    TMP_DECL;                                                           \
                                                                        \
    SPEED_RESTRICT_COND (s->size >= 3);                                 \
                                                                        \
    TMP_MARK;                                                           \
    SPEED_TMP_ALLOC_LIMBS (inv, s->size, s->align_xp);                  \
    SPEED_TMP_ALLOC_LIMBS (opnd, s->size, s->align_yp);                 \
    SPEED_TMP_ALLOC_LIMBS (scratch, itchfn (s->size), s->align_wp);     \
                                                                        \
    /* Work on a copy of s->xp with its top bit forced on.  */          \
    MPN_COPY (opnd, s->xp, s->size);                                    \
    opnd[s->size-1] |= GMP_NUMB_HIGHBIT;                                \
                                                                        \
    speed_operand_src (s, opnd, s->size);                               \
    speed_operand_dst (s, scratch, s->size);                            \
    speed_operand_dst (s, inv, s->size);                                \
    speed_cache_fill (s);                                               \
                                                                        \
    speed_starttime ();                                                 \
    left = s->reps;                                                     \
    do {                                                                \
      function (inv, opnd, s->size, scratch);                           \
    } while (--left != 0);                                              \
    elapsed = speed_endtime ();                                         \
                                                                        \
    TMP_FREE;                                                           \
    return elapsed;                                                     \
  }
1922
/* Time FUNCTION (ip, up, size, tp), where up is a private copy of s->xp
   with its least significant bit set and tp is scratch space of
   ITCHFN (size) limbs.  Guarded by SPEED_RESTRICT_COND (s->size >= 1).  */
#define SPEED_ROUTINE_MPN_BINVERT(function,itchfn)                      \
  {                                                                     \
    mp_ptr  inv, opnd, scratch;                                         \
    long    left;                                                       \
    double  elapsed;                                                    \
    TMP_DECL;                                                           \
                                                                        \
    SPEED_RESTRICT_COND (s->size >= 1);                                 \
                                                                        \
    TMP_MARK;                                                           \
    SPEED_TMP_ALLOC_LIMBS (inv, s->size, s->align_xp);                  \
    SPEED_TMP_ALLOC_LIMBS (opnd, s->size, s->align_yp);                 \
    SPEED_TMP_ALLOC_LIMBS (scratch, itchfn (s->size), s->align_wp);     \
                                                                        \
    /* Work on a copy of s->xp forced odd.  */                          \
    MPN_COPY (opnd, s->xp, s->size);                                    \
    opnd[0] |= 1;                                                       \
                                                                        \
    speed_operand_src (s, opnd, s->size);                               \
    speed_operand_dst (s, scratch, s->size);                            \
    speed_operand_dst (s, inv, s->size);                                \
    speed_cache_fill (s);                                               \
                                                                        \
    speed_starttime ();                                                 \
    left = s->reps;                                                     \
    do {                                                                \
      function (inv, opnd, s->size, scratch);                           \
    } while (--left != 0);                                              \
    elapsed = speed_endtime ();                                         \
                                                                        \
    TMP_FREE;                                                           \
    return elapsed;                                                     \
  }
1957
/* Time FUNCTION (cp, tp, mp, size, inv) for a single-limb inverse.  The
   modulus is s->yp forced odd, inv is the negated binvert_limb of its low
   limb, and the 2*size limb input is refreshed from a template before
   every call (that copy is inside the timed loop).  */
#define SPEED_ROUTINE_REDC_1(function)                                  \
  {                                                                     \
    mp_ptr     src, mod, res, work;                                     \
    mp_limb_t  minv;                                                    \
    unsigned   left;                                                    \
    double     elapsed;                                                 \
    TMP_DECL;                                                           \
                                                                        \
    SPEED_RESTRICT_COND (s->size >= 1);                                 \
                                                                        \
    TMP_MARK;                                                           \
    SPEED_TMP_ALLOC_LIMBS (src, 2*s->size+1, s->align_xp);              \
    SPEED_TMP_ALLOC_LIMBS (mod, s->size, s->align_yp);                  \
    SPEED_TMP_ALLOC_LIMBS (res, s->size, s->align_wp);                  \
    SPEED_TMP_ALLOC_LIMBS (work, 2*s->size+1, s->align_wp2);            \
                                                                        \
    /* src = {xp, xp}: the low and high halves are both s->xp */        \
    MPN_COPY (src, s->xp, s->size);                                     \
    MPN_COPY (src + s->size, s->xp, s->size);                           \
                                                                        \
    /* modulus must be odd */                                           \
    MPN_COPY (mod, s->yp, s->size);                                     \
    mod[0] |= 1;                                                        \
    binvert_limb (minv, mod[0]);                                        \
    minv = -minv;                                                       \
                                                                        \
    speed_operand_src (s, src, 2*s->size+1);                            \
    speed_operand_dst (s, work, 2*s->size+1);                           \
    speed_operand_src (s, mod, s->size);                                \
    speed_operand_dst (s, res, s->size);                                \
    speed_cache_fill (s);                                               \
                                                                        \
    speed_starttime ();                                                 \
    left = s->reps;                                                     \
    do {                                                                \
      MPN_COPY (work, src, 2*s->size);                                  \
      function (res, work, mod, s->size, minv);                         \
    } while (--left != 0);                                              \
    elapsed = speed_endtime ();                                         \
                                                                        \
    TMP_FREE;                                                           \
    return elapsed;                                                     \
  }
/* Time FUNCTION (cp, tp, mp, size, invp) for a two-limb inverse.  The
   modulus is s->yp forced odd; invp is the negated 2-limb mpn_binvert of
   its low limbs.  The 2*size limb input is refreshed from a template
   before every call (that copy is inside the timed loop).

   mp gets one limb more than s->size, zeroed, so that the 2-limb
   mpn_binvert reads defined data even when s->size is 1.  Only s->size
   limbs of mp are registered as an operand and passed to FUNCTION.  */
#define SPEED_ROUTINE_REDC_2(function)                                  \
  {                                                                     \
    unsigned   i;                                                       \
    mp_ptr     cp, mp, tp, ap;                                          \
    mp_limb_t  invp[2];                                                 \
    double     t;                                                       \
    TMP_DECL;                                                           \
                                                                        \
    SPEED_RESTRICT_COND (s->size >= 1);                                 \
                                                                        \
    TMP_MARK;                                                           \
    SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp);               \
    SPEED_TMP_ALLOC_LIMBS (mp, s->size+1,   s->align_yp);               \
    SPEED_TMP_ALLOC_LIMBS (cp, s->size,     s->align_wp);               \
    SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, s->align_wp2);              \
                                                                        \
    MPN_COPY (ap,         s->xp, s->size);                              \
    MPN_COPY (ap+s->size, s->xp, s->size);                              \
                                                                        \
    /* modulus must be odd */                                           \
    MPN_COPY (mp, s->yp, s->size);                                      \
    mp[0] |= 1;                                                         \
    /* the 2-limb mpn_binvert below must not read junk when size==1 */  \
    mp[s->size] = 0;                                                    \
    mpn_binvert (invp, mp, 2, tp);                                      \
    /* negate the 2-limb inverse: its low limb is odd, so non-zero, */  \
    /* and the high limb therefore only needs complementing */          \
    invp[0] = -invp[0]; invp[1] = ~invp[1];                             \
                                                                        \
    speed_operand_src (s, ap, 2*s->size+1);                             \
    speed_operand_dst (s, tp, 2*s->size+1);                             \
    speed_operand_src (s, mp, s->size);                                 \
    speed_operand_dst (s, cp, s->size);                                 \
    speed_cache_fill (s);                                               \
                                                                        \
    speed_starttime ();                                                 \
    i = s->reps;                                                        \
    do {                                                                \
      MPN_COPY (tp, ap, 2*s->size);                                     \
      function (cp, tp, mp, s->size, invp);                             \
    } while (--i != 0);                                                 \
    t = speed_endtime ();                                               \
                                                                        \
    TMP_FREE;                                                           \
    return t;                                                           \
  }
/* Time FUNCTION (cp, tp, mp, size, invp) for a full size-limb inverse
   obtained from mpn_binvert on the modulus (s->yp forced odd).  The
   2*size limb input is refreshed from a template before every call (that
   copy is inside the timed loop).  Guarded by s->size > 8.  */
#define SPEED_ROUTINE_REDC_N(function)                                  \
  {                                                                     \
    mp_ptr     src, mod, res, work, modinv;                             \
    unsigned   left;                                                    \
    double     elapsed;                                                 \
    TMP_DECL;                                                           \
                                                                        \
    SPEED_RESTRICT_COND (s->size > 8);                                  \
                                                                        \
    TMP_MARK;                                                           \
    SPEED_TMP_ALLOC_LIMBS (src, 2*s->size+1, s->align_xp);              \
    SPEED_TMP_ALLOC_LIMBS (mod, s->size, s->align_yp);                  \
    SPEED_TMP_ALLOC_LIMBS (res, s->size, s->align_wp);                  \
    SPEED_TMP_ALLOC_LIMBS (work, 2*s->size+1, s->align_wp2);            \
    SPEED_TMP_ALLOC_LIMBS (modinv, s->size, s->align_wp2); /* align? */ \
                                                                        \
    /* src = {xp, xp}: the low and high halves are both s->xp */        \
    MPN_COPY (src, s->xp, s->size);                                     \
    MPN_COPY (src + s->size, s->xp, s->size);                           \
                                                                        \
    /* modulus must be odd */                                           \
    MPN_COPY (mod, s->yp, s->size);                                     \
    mod[0] |= 1;                                                        \
    mpn_binvert (modinv, mod, s->size, work);                           \
                                                                        \
    speed_operand_src (s, src, 2*s->size+1);                            \
    speed_operand_dst (s, work, 2*s->size+1);                           \
    speed_operand_src (s, mod, s->size);                                \
    speed_operand_dst (s, res, s->size);                                \
    speed_cache_fill (s);                                               \
                                                                        \
    speed_starttime ();                                                 \
    left = s->reps;                                                     \
    do {                                                                \
      MPN_COPY (work, src, 2*s->size);                                  \
      function (res, work, mod, s->size, modinv);                       \
    } while (--left != 0);                                              \
    elapsed = speed_endtime ();                                         \
                                                                        \
    TMP_FREE;                                                           \
    return elapsed;                                                     \
  }
2083
2084
/* Time FUNCTION (s->xp, s->size), run directly on the caller's operand;
   the return value of FUNCTION is discarded.  */
#define SPEED_ROUTINE_MPN_POPCOUNT(function)                            \
  {                                                                     \
    unsigned  left;                                                     \
    double    elapsed;                                                  \
                                                                        \
    SPEED_RESTRICT_COND (s->size >= 1);                                 \
                                                                        \
    speed_operand_src (s, s->xp, s->size);                              \
    speed_cache_fill (s);                                               \
                                                                        \
    speed_starttime ();                                                 \
    left = s->reps;                                                     \
    do {                                                                \
      function (s->xp, s->size);                                        \
    } while (--left != 0);                                              \
    elapsed = speed_endtime ();                                         \
                                                                        \
    return elapsed;                                                     \
  }
2102
/* Time FUNCTION (s->xp, s->yp, s->size), run directly on the caller's two
   operands; the return value of FUNCTION is discarded.  */
#define SPEED_ROUTINE_MPN_HAMDIST(function)                             \
  {                                                                     \
    unsigned  left;                                                     \
    double    elapsed;                                                  \
                                                                        \
    SPEED_RESTRICT_COND (s->size >= 1);                                 \
                                                                        \
    speed_operand_src (s, s->xp, s->size);                              \
    speed_operand_src (s, s->yp, s->size);                              \
    speed_cache_fill (s);                                               \
                                                                        \
    speed_starttime ();                                                 \
    left = s->reps;                                                     \
    do {                                                                \
      function (s->xp, s->yp, s->size);                                 \
    } while (--left != 0);                                              \
    elapsed = speed_endtime ();                                         \
                                                                        \
    return elapsed;                                                     \
  }
2121
2122
/* Time FUNCTION (z, s->size): s->size is passed as the integer argument
   itself, and z is a scratch mpz_t receiving the result.  */
#define SPEED_ROUTINE_MPZ_UI(function)                                  \
  {                                                                     \
    mpz_t     val;                                                      \
    unsigned  left;                                                     \
    double    elapsed;                                                  \
                                                                        \
    SPEED_RESTRICT_COND (s->size >= 0);                                 \
                                                                        \
    mpz_init (val);                                                     \
                                                                        \
    speed_starttime ();                                                 \
    left = s->reps;                                                     \
    do {                                                                \
      function (val, s->size);                                          \
    } while (--left != 0);                                              \
    elapsed = speed_endtime ();                                         \
                                                                        \
    mpz_clear (val);                                                    \
    return elapsed;                                                     \
  }
2143
/* These three use the SPEED_ROUTINE_MPZ_UI measurement body unchanged.  */
#define SPEED_ROUTINE_MPZ_FAC_UI(function) \
  SPEED_ROUTINE_MPZ_UI (function)
#define SPEED_ROUTINE_MPZ_FIB_UI(function) \
  SPEED_ROUTINE_MPZ_UI (function)
#define SPEED_ROUTINE_MPZ_LUCNUM_UI(function) \
  SPEED_ROUTINE_MPZ_UI (function)
2147
2148
/* Time FUNCTION (z, z2, s->size): like SPEED_ROUTINE_MPZ_UI but for
   routines taking two mpz_t arguments ahead of the integer.  */
#define SPEED_ROUTINE_MPZ_2_UI(function)                                \
  {                                                                     \
    mpz_t     first, second;                                            \
    unsigned  left;                                                     \
    double    elapsed;                                                  \
                                                                        \
    SPEED_RESTRICT_COND (s->size >= 0);                                 \
                                                                        \
    mpz_init (first);                                                   \
    mpz_init (second);                                                  \
                                                                        \
    speed_starttime ();                                                 \
    left = s->reps;                                                     \
    do {                                                                \
      function (first, second, s->size);                                \
    } while (--left != 0);                                              \
    elapsed = speed_endtime ();                                         \
                                                                        \
    mpz_clear (first);                                                  \
    mpz_clear (second);                                                 \
    return elapsed;                                                     \
  }
2171
/* Both use the SPEED_ROUTINE_MPZ_2_UI measurement body unchanged.  */
#define SPEED_ROUTINE_MPZ_FIB2_UI(function) \
  SPEED_ROUTINE_MPZ_2_UI (function)
#define SPEED_ROUTINE_MPZ_LUCNUM2_UI(function) \
  SPEED_ROUTINE_MPZ_2_UI (function)
2174
2175
/* Time FUNCTION (fp, f1p, s->size), with s->size passed as the integer
   argument and both output areas sized by MPN_FIB2_SIZE (s->size).  */
#define SPEED_ROUTINE_MPN_FIB2_UI(function)                             \
  {                                                                     \
    mp_ptr     out0, out1;                                              \
    mp_size_t  limbs;                                                   \
    unsigned   left;                                                    \
    double     elapsed;                                                 \
    TMP_DECL;                                                           \
                                                                        \
    SPEED_RESTRICT_COND (s->size >= 0);                                 \
                                                                        \
    TMP_MARK;                                                           \
    limbs = MPN_FIB2_SIZE (s->size);                                    \
    SPEED_TMP_ALLOC_LIMBS (out0, limbs, s->align_xp);                   \
    SPEED_TMP_ALLOC_LIMBS (out1, limbs, s->align_yp);                   \
                                                                        \
    speed_starttime ();                                                 \
    left = s->reps;                                                     \
    do {                                                                \
      function (out0, out1, s->size);                                   \
    } while (--left != 0);                                              \
    elapsed = speed_endtime ();                                         \
                                                                        \
    TMP_FREE;                                                           \
    return elapsed;                                                     \
  }
2201
2202
2203
/* Time FUNCTION (r, b, e, m), i.e. b^e mod m.  b comes from s->xp and m
   from s->yp, both s->size limbs; m has bit 0 set so that redc can be
   used.  e is 6 limbs taken from s->xp_block, kept small so that the
   calculation doesn't take too long. */
#define SPEED_ROUTINE_MPZ_POWM(function)                                \
  {                                                                     \
    mpz_t     res, base, expo, mod;                                     \
    unsigned  left;                                                     \
    double    elapsed;                                                  \
                                                                        \
    SPEED_RESTRICT_COND (s->size >= 1);                                 \
                                                                        \
    mpz_init (res);                                                     \
    mpz_init_set_n (base, s->xp, s->size);                              \
    mpz_init_set_n (mod, s->yp, s->size);                               \
    mpz_setbit (mod, 0);  /* modulus forced odd */                      \
    mpz_init_set_n (expo, s->xp_block, 6);                              \
                                                                        \
    speed_starttime ();                                                 \
    left = s->reps;                                                     \
    do {                                                                \
      function (res, base, expo, mod);                                  \
    } while (--left != 0);                                              \
    elapsed = speed_endtime ();                                         \
                                                                        \
    mpz_clear (res);                                                    \
    mpz_clear (base);                                                   \
    mpz_clear (expo);                                                   \
    mpz_clear (mod);                                                    \
    return elapsed;                                                     \
  }
2234
/* Time FUNCTION (r, b, e, m), computing (m-2)^e mod m.  m is s->xp
   (s->size limbs) forced odd.  e is s->r when that is non-zero, otherwise
   ULONG_MAX/3, i.e. the alternating bit pattern 0x5555...5.  */
#define SPEED_ROUTINE_MPZ_POWM_UI(function)                             \
  {                                                                     \
    mpz_t     r, b, m;                                                  \
    unsigned  long  e;                                                  \
    unsigned  i;                                                        \
    double    t;                                                        \
                                                                        \
    SPEED_RESTRICT_COND (s->size >= 1);                                 \
                                                                        \
    mpz_init (r);                                                       \
                                                                        \
    /* Force m odd.  mpz_setbit keeps the size field right even if */   \
    /* mpz_set_n gave zero, which a raw PTR(m)[0] |= 1 would not.  */   \
    mpz_init (m);                                                       \
    mpz_set_n (m, s->xp, s->size);                                      \
    mpz_setbit (m, 0);                                                  \
                                                                        \
    e = (~ (unsigned long) 0) / 3;                                      \
    if (s->r != 0)                                                      \
      e = s->r;                                                         \
                                                                        \
    mpz_init_set (b, m);                                                \
    mpz_sub_ui (b, b, 2);                                               \
                                                                        \
    speed_starttime ();                                                 \
    i = s->reps;                                                        \
    do                                                                  \
      function (r, b, e, m);                                            \
    while (--i != 0);                                                   \
    t = speed_endtime ();                                               \
                                                                        \
    mpz_clear (r);                                                      \
    mpz_clear (b);                                                      \
    mpz_clear (m);                                                      \
    return t;                                                           \
  }
2271
2272
/* Time the statement CALL, which may refer to wp, wp2 (two s->size limb
   destinations), xp, yp (the sources) and s.  s->r selects how sources
   overlap the destinations:
       0   xp = s->xp, yp = s->yp   (no overlap)
       1   xp = wp
       2   yp = wp2
       3   xp = wp,  yp = wp2
       4   xp = wp2, yp = wp
   Any other s->r gives a return of -1.0.  A source that was redirected is
   loaded with the corresponding s->xp or s->yp data first.  */
#define SPEED_ROUTINE_MPN_ADDSUB_CALL(call)                             \
  {                                                                     \
    mp_ptr    wp, wp2, xp, yp;                                          \
    unsigned  i;                                                        \
    double    t;                                                        \
    TMP_DECL;                                                           \
                                                                        \
    SPEED_RESTRICT_COND (s->size >= 0);                                 \
                                                                        \
    TMP_MARK;                                                           \
    SPEED_TMP_ALLOC_LIMBS (wp,  s->size, s->align_wp);                  \
    SPEED_TMP_ALLOC_LIMBS (wp2, s->size, s->align_wp2);                 \
    xp = s->xp;                                                         \
    yp = s->yp;                                                         \
                                                                        \
    switch (s->r)                                                       \
      {                                                                 \
      case 0:  break;                                                   \
      case 1:  xp = wp;  break;                                         \
      case 2:  yp = wp2;  break;                                        \
      case 3:  xp = wp;  yp = wp2;  break;                              \
      case 4:  xp = wp2;  yp = wp;  break;                              \
      default:                                                          \
        TMP_FREE;                                                       \
        return -1.0;                                                    \
      }                                                                 \
                                                                        \
    /* fill whichever sources now live in a destination buffer */       \
    if (xp != s->xp)                                                    \
      MPN_COPY (xp, s->xp, s->size);                                    \
    if (yp != s->yp)                                                    \
      MPN_COPY (yp, s->yp, s->size);                                    \
                                                                        \
    speed_operand_src (s, xp, s->size);                                 \
    speed_operand_src (s, yp, s->size);                                 \
    speed_operand_dst (s, wp, s->size);                                 \
    speed_operand_dst (s, wp2, s->size);                                \
    speed_cache_fill (s);                                               \
                                                                        \
    speed_starttime ();                                                 \
    i = s->reps;                                                        \
    do {                                                                \
      call;                                                             \
    } while (--i != 0);                                                 \
    t = speed_endtime ();                                               \
                                                                        \
    TMP_FREE;                                                           \
    return t;                                                           \
  }
2316
/* Time FUNCTION (wp, wp2, xp, yp, s->size) through
   SPEED_ROUTINE_MPN_ADDSUB_CALL.  The definition deliberately carries no
   trailing semicolon; whoever expands the macro supplies one if wanted.  */
#define SPEED_ROUTINE_MPN_ADDSUB_N(function)                            \
  SPEED_ROUTINE_MPN_ADDSUB_CALL                                         \
    (function (wp, wp2, xp, yp, s->size))
2320
/* Time FUNCTION (wp, wp2, xp, yp, s->size, 0), the variant taking an extra
   final argument (always passed as 0 here), through
   SPEED_ROUTINE_MPN_ADDSUB_CALL.  The definition deliberately carries no
   trailing semicolon; whoever expands the macro supplies one if wanted.  */
#define SPEED_ROUTINE_MPN_ADDSUB_NC(function)                           \
  SPEED_ROUTINE_MPN_ADDSUB_CALL                                         \
    (function (wp, wp2, xp, yp, s->size, 0))
2324
2325
/* Doing an Nx1 gcd with the given r: time FUNCTION (xp, s->size, s->r).
   xp is a private copy of s->xp; its low limb is OR-ed with the result of
   refmpn_zero_p (presumably 1 only for an all-zero operand, which would
   make the copy non-zero -- confirm against refmpn.c).  s->r must be
   non-zero.  The copy, being what the timed loop actually reads, is the
   buffer registered as the source operand.  */
#define SPEED_ROUTINE_MPN_GCD_1N(function)                              \
  {                                                                     \
    mp_ptr    xp;                                                       \
    unsigned  i;                                                        \
    double    t;                                                        \
    TMP_DECL;                                                           \
                                                                        \
    SPEED_RESTRICT_COND (s->size >= 1);                                 \
    SPEED_RESTRICT_COND (s->r != 0);                                    \
                                                                        \
    TMP_MARK;                                                           \
    SPEED_TMP_ALLOC_LIMBS (xp, s->size, s->align_xp);                   \
    MPN_COPY (xp, s->xp, s->size);                                      \
    xp[0] |= refmpn_zero_p (xp, s->size);                               \
                                                                        \
    speed_operand_src (s, xp, s->size);                                 \
    speed_cache_fill (s);                                               \
                                                                        \
    speed_starttime ();                                                 \
    i = s->reps;                                                        \
    do                                                                  \
      function (xp, s->size, s->r);                                     \
    while (--i != 0);                                                   \
    t = speed_endtime ();                                               \
                                                                        \
    TMP_FREE;                                                           \
    return t;                                                           \
  }
2355
2356
2357 /* SPEED_BLOCK_SIZE many one-limb GCDs.  x is masked to s->size bits and y
2358    to s->r bits (s->size if s->r is 0); both sizes must fit in a limb.  */
2359 #define SPEED_ROUTINE_MPN_GCD_1_CALL(setup, call)                       \
2360   {                                                                     \
2361     unsigned  i, j;                                                     \
2362     mp_ptr    px, py;                                                   \
2363     mp_limb_t x_mask, y_mask;                                           \
2364     double    t;                                                        \
2365     TMP_DECL;                                                           \
2366     /* a bit count above the limb width would overshift the masks */    \
2367     SPEED_RESTRICT_COND (s->size >= 1);                                 \
2368     SPEED_RESTRICT_COND (s->size <= mp_bits_per_limb);                  \
2369     SPEED_RESTRICT_COND (s->r <= GMP_LIMB_BITS);                        \
2370     TMP_MARK;                                                           \
2371     SPEED_TMP_ALLOC_LIMBS (px, SPEED_BLOCK_SIZE, s->align_xp);          \
2372     SPEED_TMP_ALLOC_LIMBS (py, SPEED_BLOCK_SIZE, s->align_yp);          \
2373     MPN_COPY (px, s->xp_block, SPEED_BLOCK_SIZE);                       \
2374     MPN_COPY (py, s->yp_block, SPEED_BLOCK_SIZE);                       \
2375     /* a non-zero s->r gives the bit size of y instead of s->size */    \
2376     x_mask = MP_LIMB_T_LOWBITMASK (s->size);                            \
2377     y_mask = MP_LIMB_T_LOWBITMASK (s->r != 0 ? s->r : s->size);         \
2378     for (i = 0; i < SPEED_BLOCK_SIZE; i++)                              \
2379       {                                                                 \
2380         px[i] &= x_mask; px[i] += (px[i] == 0);                         \
2381         py[i] &= y_mask; py[i] += (py[i] == 0);                         \
2382         setup;                                                          \
2383       }                                                                 \
2384                                                                         \
2385     speed_operand_src (s, px, SPEED_BLOCK_SIZE);                        \
2386     speed_operand_src (s, py, SPEED_BLOCK_SIZE);                        \
2387     speed_cache_fill (s);                                               \
2388                                                                         \
2389     speed_starttime ();                                                 \
2390     i = s->reps;                                                        \
2391     do                                                                  \
2392       {                                                                 \
2393         j = SPEED_BLOCK_SIZE;                                           \
2394         do                                                              \
2395           {                                                             \
2396             call;                                                       \
2397           }                                                             \
2398         while (--j != 0);                                               \
2399       }                                                                 \
2400     while (--i != 0);                                                   \
2401     t = speed_endtime ();                                               \
2402                                                                         \
2403     TMP_FREE;                                                           \
2404                                                                         \
2405     s->time_divisor = SPEED_BLOCK_SIZE;                                 \
2406     return t;                                                           \
2407   }
2408 /* One-limb gcd: empty setup; the timed call is function (&px[j-1], 1, py[j-1]). */
2409 #define SPEED_ROUTINE_MPN_GCD_1(function)                               \
2410   SPEED_ROUTINE_MPN_GCD_1_CALL( , function (&px[j-1], 1, py[j-1]))
2411 /* Time function (px[j-1], py[j-1], 0) on limb pairs prepared by the setup below. */
2412 #define SPEED_ROUTINE_MPN_JACBASE(function)                             \
2413   SPEED_ROUTINE_MPN_GCD_1_CALL                                          \
2414     ({                                                                  \
2415        /* require x<y, y odd, y!=1; NOTE(review): x|=1 may give x==y */ \
2416        px[i] %= py[i];                                                  \
2417        px[i] |= 1;                                                      \
2418        py[i] |= 1;                                                      \
2419        if (py[i]==1) py[i]=3;                                           \
2420      },                                                                 \
2421      function (px[j-1], py[j-1], 0))
2422
2423
2424 /* Run some GCDs of s->size limbs each.  The number of different data values
2425    is decreased as s->size**2, since GCD is a quadratic algorithm.
2426    SPEED_ROUTINE_MPN_GCD runs more times than SPEED_ROUTINE_MPN_GCDEXT
2427    though, because the plain gcd is about twice as fast as gcdext.
2428    datafactor scales the number of operand pairs ("pieces") timed.  */
2429 #define SPEED_ROUTINE_MPN_GCD_CALL(datafactor, call)                    \
2430   {                                                                     \
2431     unsigned  i;                                                        \
2432     mp_size_t j, pieces, psize;                                         \
2433     mp_ptr    wp, wp2, xtmp, ytmp, px, py;                              \
2434     double    t;                                                        \
2435     TMP_DECL;                                                           \
2436                                                                         \
2437     SPEED_RESTRICT_COND (s->size >= 1);                                 \
2438                                                                         \
2439     TMP_MARK;                                                           \
2440     SPEED_TMP_ALLOC_LIMBS (xtmp, s->size+1, s->align_xp);               \
2441     SPEED_TMP_ALLOC_LIMBS (ytmp, s->size+1, s->align_yp);               \
2442     SPEED_TMP_ALLOC_LIMBS (wp,   s->size+1, s->align_wp);               \
2443     SPEED_TMP_ALLOC_LIMBS (wp2,  s->size+1, s->align_wp2);              \
2444     /* number of x,y pairs: shrinks as size**2, clamped to >= 1 */      \
2445     pieces = SPEED_BLOCK_SIZE * datafactor / s->size / s->size;         \
2446     pieces = MIN (pieces, SPEED_BLOCK_SIZE / s->size);                  \
2447     pieces = MAX (pieces, 1);                                           \
2448     /* a single pair comes from xp,yp; several come from the blocks */  \
2449     psize = pieces * s->size;                                           \
2450     px = TMP_ALLOC_LIMBS (psize);                                       \
2451     py = TMP_ALLOC_LIMBS (psize);                                       \
2452     MPN_COPY (px, pieces==1 ? s->xp : s->xp_block, psize);              \
2453     MPN_COPY (py, pieces==1 ? s->yp : s->yp_block, psize);              \
2454                                                                         \
2455     /* Requirements: x >= y, y must be odd, high limbs != 0.            \
2456        No need to ensure random numbers are really great.  */           \
2457     for (j = 0; j < pieces; j++)                                        \
2458       {                                                                 \
2459         mp_ptr  x = px + j * s->size;                                   \
2460         mp_ptr  y = py + j * s->size;                                   \
2461         if (x[s->size - 1] == 0) x[s->size - 1] = 1;                    \
2462         if (y[s->size - 1] == 0) y[s->size - 1] = 1;                    \
2463                                                                         \
2464         if (x[s->size - 1] < y[s->size - 1])                            \
2465           MP_LIMB_T_SWAP (x[s->size - 1], y[s->size - 1]);              \
2466         else if (x[s->size - 1] == y[s->size - 1])                      \
2467           {                                                             \
2468             x[s->size - 1] = 2;                                         \
2469             y[s->size - 1] = 1;                                         \
2470           }                                                             \
2471         y[0] |= 1;                                                      \
2472       }                                                                 \
2473                                                                         \
2474     speed_operand_src (s, px, psize);                                   \
2475     speed_operand_src (s, py, psize);                                   \
2476     speed_operand_dst (s, xtmp, s->size);                               \
2477     speed_operand_dst (s, ytmp, s->size);                               \
2478     speed_operand_dst (s, wp, s->size);                                 \
2479     speed_cache_fill (s);                                               \
2480     /* the copies into xtmp,ytmp are part of the timed region */        \
2481     speed_starttime ();                                                 \
2482     i = s->reps;                                                        \
2483     do                                                                  \
2484       {                                                                 \
2485         j = pieces;                                                     \
2486         do                                                              \
2487           {                                                             \
2488             MPN_COPY (xtmp, px+(j - 1)*s->size, s->size);               \
2489             MPN_COPY (ytmp, py+(j - 1)*s->size, s->size);               \
2490             call;                                                       \
2491           }                                                             \
2492         while (--j != 0);                                               \
2493       }                                                                 \
2494     while (--i != 0);                                                   \
2495     t = speed_endtime ();                                               \
2496                                                                         \
2497     TMP_FREE;                                                           \
2498     /* presumably the caller divides t by s->time_divisor */            \
2499     s->time_divisor = pieces;                                           \
2500     return t;                                                           \
2501   }
2502 /* Plain gcd, datafactor 8: times function (wp, xtmp, s->size, ytmp, s->size). */
2503 #define SPEED_ROUTINE_MPN_GCD(function) \
2504   SPEED_ROUTINE_MPN_GCD_CALL (8, function (wp, xtmp, s->size, ytmp, s->size))
2505 /* gcdext, datafactor 4: also passes wp2 and &wp2size (presumably cofactor and its size). */
2506 #define SPEED_ROUTINE_MPN_GCDEXT(function)                              \
2507   SPEED_ROUTINE_MPN_GCD_CALL                                            \
2508     (4, { mp_size_t  wp2size;                                           \
2509           function (wp, wp2, &wp2size, xtmp, s->size, ytmp, s->size); })
2510 /* Time gcdext on s->size limb operands, replacing only the three high limbs
2511    of x and y between calls.  Requires s->size >= 3.  */
2512 #define SPEED_ROUTINE_MPN_GCDEXT_ONE(function)                          \
2513   {                                                                     \
2514     unsigned  i;                                                        \
2515     mp_size_t j, pieces, psize, wp2size;                                \
2516     mp_ptr    wp, wp2, xtmp, ytmp, px, py;                              \
2517     double    t;                                                        \
2518     TMP_DECL;                                                           \
2519     /* xth,yth below address the top 3 limbs, so size must be >= 3 */   \
2520     SPEED_RESTRICT_COND (s->size >= 3);                                 \
2521                                                                         \
2522     TMP_MARK;                                                           \
2523                                                                         \
2524     SPEED_TMP_ALLOC_LIMBS (xtmp, s->size+1, s->align_xp);               \
2525     SPEED_TMP_ALLOC_LIMBS (ytmp, s->size+1, s->align_yp);               \
2526     MPN_COPY (xtmp, s->xp, s->size);                                    \
2527     MPN_COPY (ytmp, s->yp, s->size);                                    \
2528                                                                         \
2529     SPEED_TMP_ALLOC_LIMBS (wp,  s->size+1, s->align_wp);                \
2530     SPEED_TMP_ALLOC_LIMBS (wp2, s->size+1, s->align_wp2);               \
2531     /* each piece is a 3-limb chunk of the x and y blocks */            \
2532     pieces = SPEED_BLOCK_SIZE / 3;                                      \
2533     psize = 3 * pieces;                                                 \
2534     px = TMP_ALLOC_LIMBS (psize);                                       \
2535     py = TMP_ALLOC_LIMBS (psize);                                       \
2536     MPN_COPY (px, s->xp_block, psize);                                  \
2537     MPN_COPY (py, s->yp_block, psize);                                  \
2538                                                                         \
2539     /* x must have at least as many bits as y,                          \
2540        high limbs must be non-zero */                                   \
2541     for (j = 0; j < pieces; j++)                                        \
2542       {                                                                 \
2543         mp_ptr  x = px+3*j;                                             \
2544         mp_ptr  y = py+3*j;                                             \
2545         x[2] += (x[2] == 0);                                            \
2546         y[2] += (y[2] == 0);                                            \
2547         if (x[2] < y[2])                                                \
2548           MP_LIMB_T_SWAP (x[2], y[2]);                                  \
2549       }                                                                 \
2550                                                                         \
2551     speed_operand_src (s, px, psize);                                   \
2552     speed_operand_src (s, py, psize);                                   \
2553     speed_operand_dst (s, xtmp, s->size);                               \
2554     speed_operand_dst (s, ytmp, s->size);                               \
2555     speed_operand_dst (s, wp, s->size);                                 \
2556     speed_cache_fill (s);                                               \
2557                                                                         \
2558     speed_starttime ();                                                 \
2559     i = s->reps;                                                        \
2560     do                                                                  \
2561       {                                                                 \
2562         mp_ptr  x = px;                                                 \
2563         mp_ptr  y = py;                                                 \
2564         mp_ptr  xth = &xtmp[s->size-3];                                 \
2565         mp_ptr  yth = &ytmp[s->size-3];                                 \
2566         j = pieces;                                                     \
2567         do                                                              \
2568           {                                                             \
2569             xth[0] = x[0], xth[1] = x[1], xth[2] = x[2];                \
2570             yth[0] = y[0], yth[1] = y[1], yth[2] = y[2];                \
2571                                                                         \
2572             ytmp[0] |= 1; /* y must be odd, */                          \
2573                                                                         \
2574             function (wp, wp2, &wp2size, xtmp, s->size, ytmp, s->size); \
2575                                                                         \
2576             x += 3;                                                     \
2577             y += 3;                                                     \
2578           }                                                             \
2579         while (--j != 0);                                               \
2580       }                                                                 \
2581     while (--i != 0);                                                   \
2582     t = speed_endtime ();                                               \
2583                                                                         \
2584     TMP_FREE;                                                           \
2585                                                                         \
2586     s->time_divisor = pieces;                                           \
2587     return t;                                                           \
2588   }
2589 /* Time function (a, b) on hand-built mpz operands of s->size limbs, b odd; needs s->size >= 1. */
2590 #define SPEED_ROUTINE_MPZ_JACOBI(function)                              \
2591   {                                                                     \
2592     mpz_t     a, b;                                                     \
2593     unsigned  i;                                                        \
2594     mp_size_t j, pieces, psize;                                         \
2595     mp_ptr    px, py;                                                   \
2596     double    t;                                                        \
2597     TMP_DECL;                                                           \
2598     SPEED_RESTRICT_COND (s->size >= 1);                                 \
2599     TMP_MARK;                                                           \
2600     pieces = SPEED_BLOCK_SIZE / MAX (s->size, 1);                       \
2601     pieces = MAX (pieces, 1);                                           \
2602     s->time_divisor = pieces;                                           \
2603                                                                         \
2604     psize = pieces * s->size;                                           \
2605     px = TMP_ALLOC_LIMBS (psize);                                       \
2606     py = TMP_ALLOC_LIMBS (psize);                                       \
2607     MPN_COPY (px, pieces==1 ? s->xp : s->xp_block, psize);              \
2608     MPN_COPY (py, pieces==1 ? s->yp : s->yp_block, psize);              \
2609     /* s->size >= 1 guarantees the high limbs indexed below exist */    \
2610     for (j = 0; j < pieces; j++)                                        \
2611       {                                                                 \
2612         mp_ptr  x = px+j*s->size;                                       \
2613         mp_ptr  y = py+j*s->size;                                       \
2614                                                                         \
2615         /* y odd */                                                     \
2616         y[0] |= 1;                                                      \
2617                                                                         \
2618         /* high limbs non-zero */                                       \
2619         if (x[s->size-1] == 0) x[s->size-1] = 1;                        \
2620         if (y[s->size-1] == 0) y[s->size-1] = 1;                        \
2621       }                                                                 \
2622     /* a,b are set up by hand: only their SIZ and PTR are assigned */   \
2623     SIZ(a) = s->size;                                                   \
2624     SIZ(b) = s->size;                                                   \
2625                                                                         \
2626     speed_operand_src (s, px, psize);                                   \
2627     speed_operand_src (s, py, psize);                                   \
2628     speed_cache_fill (s);                                               \
2629                                                                         \
2630     speed_starttime ();                                                 \
2631     i = s->reps;                                                        \
2632     do                                                                  \
2633       {                                                                 \
2634         j = pieces;                                                     \
2635         do                                                              \
2636           {                                                             \
2637             PTR(a) = px+(j-1)*s->size;                                  \
2638             PTR(b) = py+(j-1)*s->size;                                  \
2639             function (a, b);                                            \
2640           }                                                             \
2641         while (--j != 0);                                               \
2642       }                                                                 \
2643     while (--i != 0);                                                   \
2644     t = speed_endtime ();                                               \
2645                                                                         \
2646     TMP_FREE;                                                           \
2647     return t;                                                           \
2648   }
2649 /* Time function (wp, 0, xp, s->size, yp); yp is a 2-limb divisor with its high bit set. */
2650 #define SPEED_ROUTINE_MPN_DIVREM_2(function)                            \
2651   {                                                                     \
2652     mp_ptr    wp, xp;                                                   \
2653     mp_limb_t yp[2];                                                    \
2654     unsigned  i;                                                        \
2655     double    t;                                                        \
2656     TMP_DECL;                                                           \
2657                                                                         \
2658     SPEED_RESTRICT_COND (s->size >= 2);                                 \
2659                                                                         \
2660     TMP_MARK;                                                           \
2661     SPEED_TMP_ALLOC_LIMBS (xp, s->size, s->align_xp);                   \
2662     SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp);                   \
2663                                                                         \
2664     /* source is destroyed */                                           \
2665     MPN_COPY (xp, s->xp, s->size);                                      \
2666                                                                         \
2667     /* divisor must be normalized */                                    \
2668     MPN_COPY (yp, s->yp_block, 2);                                      \
2669     yp[1] |= GMP_NUMB_HIGHBIT;                                          \
2670                                                                         \
2671     speed_operand_src (s, xp, s->size);                                 \
2672     speed_operand_src (s, yp, 2);                                       \
2673     speed_operand_dst (s, wp, s->size);                                 \
2674     speed_cache_fill (s);                                               \
2675     /* xp is copied once only; later reps run on what the call left */  \
2676     speed_starttime ();                                                 \
2677     i = s->reps;                                                        \
2678     do                                                                  \
2679       function (wp, 0, xp, s->size, yp);                                \
2680     while (--i != 0);                                                   \
2681     t = speed_endtime ();                                               \
2682                                                                         \
2683     TMP_FREE;                                                           \
2684     return t;                                                           \
2685   }
2686 /* Time function (n, n) SPEED_BLOCK_SIZE times per rep.  Before each call n
2687    is advanced by (xp[j] << 1), using the n left by the previous call.  */
2688 #define SPEED_ROUTINE_MODLIMB_INVERT(function)                          \
2689   {                                                                     \
2690     unsigned   i, j;                                                    \
2691     mp_ptr     xp;                                                      \
2692     mp_limb_t  n = 1;                                                   \
2693     double     t;                                                       \
2694     /* biased by -1 so xp[SPEED_BLOCK_SIZE..1] spans the block */       \
2695     xp = s->xp_block-1;                                                 \
2696                                                                         \
2697     speed_operand_src (s, s->xp_block, SPEED_BLOCK_SIZE);               \
2698     speed_cache_fill (s);                                               \
2699                                                                         \
2700     speed_starttime ();                                                 \
2701     i = s->reps;                                                        \
2702     do                                                                  \
2703       {                                                                 \
2704         j = SPEED_BLOCK_SIZE;                                           \
2705         do                                                              \
2706           {                                                             \
2707             /* randomized but successively dependent */                 \
2708             n += (xp[j] << 1);                                          \
2709                                                                         \
2710             function (n, n);                                            \
2711           }                                                             \
2712         while (--j != 0);                                               \
2713       }                                                                 \
2714     while (--i != 0);                                                   \
2715     t = speed_endtime ();                                               \
2716                                                                         \
2717     /* make sure the compiler won't optimize away n */                  \
2718     noop_1 (n);                                                         \
2719                                                                         \
2720     s->time_divisor = SPEED_BLOCK_SIZE;                                 \
2721     return t;                                                           \
2722   }
2723 /* Time function (wp, wp2, s->xp, s->size): the source s->xp is read in
2724    place; wp and wp2 are freshly allocated s->size limb destinations.  */
2725 #define SPEED_ROUTINE_MPN_SQRTREM(function)                             \
2726   {                                                                     \
2727     mp_ptr    wp, wp2;                                                  \
2728     unsigned  i;                                                        \
2729     double    t;                                                        \
2730     TMP_DECL;                                                           \
2731                                                                         \
2732     SPEED_RESTRICT_COND (s->size >= 1);                                 \
2733                                                                         \
2734     TMP_MARK;                                                           \
2735     SPEED_TMP_ALLOC_LIMBS (wp,  s->size, s->align_wp);                  \
2736     SPEED_TMP_ALLOC_LIMBS (wp2, s->size, s->align_wp2);                 \
2737                                                                         \
2738     speed_operand_src (s, s->xp, s->size);                              \
2739     speed_operand_dst (s, wp, s->size);                                 \
2740     speed_operand_dst (s, wp2, s->size);                                \
2741     speed_cache_fill (s);                                               \
2742                                                                         \
2743     speed_starttime ();                                                 \
2744     i = s->reps;                                                        \
2745     do                                                                  \
2746       function (wp, wp2, s->xp, s->size);                               \
2747     while (--i != 0);                                                   \
2748     t = speed_endtime ();                                               \
2749                                                                         \
2750     TMP_FREE;                                                           \
2751     return t;                                                           \
2752   }
2753 /* Time function (wp, wp2, s->xp, s->size, s->r); s->r is passed through unchecked. */
2754 #define SPEED_ROUTINE_MPN_ROOTREM(function)                             \
2755   {                                                                     \
2756     mp_ptr    wp, wp2;                                                  \
2757     unsigned  i;                                                        \
2758     double    t;                                                        \
2759     TMP_DECL;                                                           \
2760                                                                         \
2761     SPEED_RESTRICT_COND (s->size >= 1);                                 \
2762                                                                         \
2763     TMP_MARK;                                                           \
2764     SPEED_TMP_ALLOC_LIMBS (wp,  s->size, s->align_wp);                  \
2765     SPEED_TMP_ALLOC_LIMBS (wp2, s->size, s->align_wp2);                 \
2766                                                                         \
2767     speed_operand_src (s, s->xp, s->size);                              \
2768     speed_operand_dst (s, wp, s->size);                                 \
2769     speed_operand_dst (s, wp2, s->size);                                \
2770     speed_cache_fill (s);                                               \
2771                                                                         \
2772     speed_starttime ();                                                 \
2773     i = s->reps;                                                        \
2774     do                                                                  \
2775       function (wp, wp2, s->xp, s->size, s->r);                         \
2776     while (--i != 0);                                                   \
2777     t = speed_endtime ();                                               \
2778                                                                         \
2779     TMP_FREE;                                                           \
2780     return t;                                                           \
2781   }
2782
2783
/* Time "function (strp, base, scratch, s->size)", the argument layout of a
   get_str style conversion.

   s->size is the number of limbs in the input and must be at least 1.
   s->r gives the base, where 0 selects decimal; the base must lie in the
   range 2 to 256.  The output buffer is sized from MPN_SIZEINBASE applied
   to s->xp.  The input is copied afresh from s->xp into a scratch area of
   s->size+1 limbs before every call, so that copy is part of the measured
   time (presumably because the conversion may clobber its source --
   confirm against the routine under test).  */
#define SPEED_ROUTINE_MPN_GET_STR(function) \
  { \
    unsigned char *strp; \
    mp_size_t digits; \
    mp_ptr scratch; \
    int base; \
    unsigned reps_left; \
    double elapsed; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    if (s->r == 0) \
      base = 10; \
    else \
      base = s->r; \
    SPEED_RESTRICT_COND (base >= 2 && base <= 256); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (scratch, s->size + 1, s->align_xp); \
    \
    MPN_SIZEINBASE (digits, s->xp, s->size, base); \
    strp = TMP_ALLOC (digits); \
    \
    /* During development, overflow of strp can be guarded against with: \
         MPN_COPY (scratch, s->xp, s->size); \
         ASSERT_ALWAYS (mpn_get_str (strp, base, scratch, s->size) <= digits); \
    */ \
    \
    speed_operand_src (s, s->xp, s->size); \
    speed_operand_dst (s, scratch, s->size); \
    /* the byte buffer is declared in whole limbs */ \
    speed_operand_dst (s, (mp_ptr) strp, digits/BYTES_PER_MP_LIMB); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    reps_left = s->reps; \
    do \
      { \
        MPN_COPY (scratch, s->xp, s->size); \
        function (strp, base, scratch, s->size); \
      } \
    while (--reps_left != 0); \
    elapsed = speed_endtime (); \
    \
    TMP_FREE; \
    return elapsed; \
  }
2831
/* Time a set_str style conversion given as the statement "call".  The
   locals declared here are in scope for "call": wp is the wn limb result
   area, xp the digit bytes, and base the base in use.

   s->size is the number of digits in the input and must be at least 1.
   s->r gives the base, where 0 selects decimal; the base must lie in the
   range 2 to 256.  Digit i is s->xp[i] reduced modulo the base.  */
#define SPEED_ROUTINE_MPN_SET_STR_CALL(call) \
  { \
    unsigned char *xp; \
    mp_ptr wp; \
    mp_size_t wn; \
    unsigned i; \
    int base; \
    double t; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 1); \
    \
    if (s->r == 0) \
      base = 10; \
    else \
      base = s->r; \
    SPEED_RESTRICT_COND (base >= 2 && base <= 256); \
    \
    TMP_MARK; \
    \
    /* one byte per digit, each taken from a source limb */ \
    xp = TMP_ALLOC (s->size); \
    i = 0; \
    while (i < s->size) \
      { \
        xp[i] = s->xp[i] % base; \
        i++; \
      } \
    \
    /* result size in limbs, derived from the digit count */ \
    wn = (mp_size_t) (s->size / mp_bases[base].chars_per_bit_exactly); \
    wn = wn / GMP_LIMB_BITS + 2; \
    SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp); \
    \
    /* During development, wn can be checked as big enough with: \
         ASSERT_ALWAYS (mpn_set_str (wp, xp, s->size, base) <= wn); \
    */ \
    \
    /* the digit bytes are declared in whole limbs */ \
    speed_operand_src (s, (mp_ptr) xp, s->size/BYTES_PER_MP_LIMB); \
    speed_operand_dst (s, wp, wn); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      call; \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }
2878
2879
/* Time an accel gcd find_a() function, called as "function (pair)" where
   pair is an array of two limbs.

   SPEED_BLOCK_SIZE pairs are built from s->xp_block and s->yp_block, a
   set of values being used in case some run particularly fast or slow.
   In each pair the low limb is forced odd and the high limb non-zero.
   s->size is ignored since the amount of data tested is fixed, and
   s->time_divisor is set to SPEED_BLOCK_SIZE.  */
#define SPEED_ROUTINE_MPN_GCD_FINDA(function) \
  { \
    unsigned reps_left, k; \
    mp_limb_t pairs[SPEED_BLOCK_SIZE][2]; \
    double elapsed; \
    TMP_DECL; \
    \
    TMP_MARK; \
    \
    for (k = 0; k < SPEED_BLOCK_SIZE; k++) \
      { \
        /* low limb must be odd */ \
        pairs[k][0] = s->xp_block[k] | 1; \
        /* high limb must be non-zero, so a zero becomes 1 */ \
        pairs[k][1] = s->yp_block[k]; \
        if (pairs[k][1] == 0) \
          pairs[k][1] = 1; \
      } \
    \
    speed_operand_src (s, &pairs[0][0], 2*SPEED_BLOCK_SIZE); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    reps_left = s->reps; \
    do \
      { \
        /* walk the block from its last pair down to its first */ \
        k = SPEED_BLOCK_SIZE; \
        do \
          { \
            function (pairs[k-1]); \
          } \
        while (--k != 0); \
      } \
    while (--reps_left != 0); \
    elapsed = speed_endtime (); \
    \
    TMP_FREE; \
    \
    s->time_divisor = SPEED_BLOCK_SIZE; \
    return elapsed; \
  }
2922
2923
/* The count-zeros measuring routine is assembled from two halves with the
   operation under test placed between them:

       SPEED_ROUTINE_COUNT_ZEROS_A (leading, zero);
       call;
       SPEED_ROUTINE_COUNT_ZEROS_B ();

   "call" should do "count_foo_zeros(c,n)".
   Give leading=1 if foo is leading zeros, leading=0 for trailing.
   Give zero=1 if n=0 is allowed in the call, zero=0 if not.

   This first half opens the function body, allocates and sets up a block
   of SPEED_BLOCK_SIZE test limbs, starts the timer and opens both loops,
   leaving them open with the next input in n.  If
   speed_routine_count_zeros_setup fails, the temporary allocation is
   released and -1.0 is returned.  */
#define SPEED_ROUTINE_COUNT_ZEROS_A(leading, zero) \
  { \
    mp_ptr     xp; \
    int        i, c; \
    unsigned   j; \
    mp_limb_t  n; \
    double     t; \
    TMP_DECL; \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (xp, SPEED_BLOCK_SIZE, s->align_xp); \
    \
    if (! speed_routine_count_zeros_setup (s, xp, leading, zero)) \
      { \
        /* TMP_MARK has been done, so pair it with TMP_FREE on this exit */ \
        TMP_FREE; \
        return -1.0; \
      } \
    speed_operand_src (s, xp, SPEED_BLOCK_SIZE); \
    speed_cache_fill (s); \
    \
    c = 0; \
    speed_starttime (); \
    j = s->reps; \
    do { \
      for (i = 0; i < SPEED_BLOCK_SIZE; i++) \
        { \
          n = xp[i]; \
          /* mix in the previous count, presumably to chain the calls */ \
          n ^= c;
2953
/* Second half of the count-zeros measuring routine.  Closes the block loop
   and the s->reps loop left open by SPEED_ROUTINE_COUNT_ZEROS_A, stops the
   timer, and completes the function body by returning the elapsed time.  */
#define SPEED_ROUTINE_COUNT_ZEROS_B() \
        } \
    } while (--j != 0); \
    t = speed_endtime (); \
    \
    /* use c after the loops so that it doesn't go dead */ \
    noop_1 (c); \
    \
    /* each repetition ran over a block of SPEED_BLOCK_SIZE limbs */ \
    s->time_divisor = SPEED_BLOCK_SIZE; \
    \
    TMP_FREE; \
    return t; \
  }
2967
/* Complete count-zeros measuring routine: "op" is timed between the two
   halves SPEED_ROUTINE_COUNT_ZEROS_A and SPEED_ROUTINE_COUNT_ZEROS_B.
   "op" should do "count_foo_zeros(c,n)"; is_leading and allow_zero are
   handed to SPEED_ROUTINE_COUNT_ZEROS_A unchanged.  */
#define SPEED_ROUTINE_COUNT_ZEROS_C(op, is_leading, allow_zero) \
  do { \
    SPEED_ROUTINE_COUNT_ZEROS_A (is_leading, allow_zero); \
    op; \
    SPEED_ROUTINE_COUNT_ZEROS_B (); \
  } while (0)
2974
/* Leading-zeros form of SPEED_ROUTINE_COUNT_ZEROS_C: "op" is the statement
   to time, allow_zero is passed through as its "zero" argument.  */
#define SPEED_ROUTINE_COUNT_LEADING_ZEROS_C(op, allow_zero) \
  SPEED_ROUTINE_COUNT_ZEROS_C (op, 1, allow_zero)
/* Time "count_fn (c, n)" through SPEED_ROUTINE_COUNT_ZEROS_C with
   leading=1 and zero=0.  */
#define SPEED_ROUTINE_COUNT_LEADING_ZEROS(count_fn) \
  SPEED_ROUTINE_COUNT_ZEROS_C (count_fn (c, n), 1, 0)
2979
/* Trailing-zeros form of SPEED_ROUTINE_COUNT_ZEROS_C: "op" is the
   statement to time, allow_zero is passed through as its "zero"
   argument.  */
#define SPEED_ROUTINE_COUNT_TRAILING_ZEROS_C(op, allow_zero) \
  SPEED_ROUTINE_COUNT_ZEROS_C (op, 0, allow_zero)
/* Time "fun (c, n)" through SPEED_ROUTINE_COUNT_ZEROS_C with leading=0 and
   zero=0.  The parameter has to be called "fun" to match its use in the
   expansion; it is the name of the count trailing zeros routine or macro
   to measure.  */
#define SPEED_ROUTINE_COUNT_TRAILING_ZEROS(fun) \
  SPEED_ROUTINE_COUNT_ZEROS_C (fun (c, n), 0, 0)
2984
2985
/* Time the statement "call" over the SPEED_BLOCK_SIZE limbs of
   s->xp_block, s->reps times over.  The locals d and dinv are in scope
   for "call"; presumably it reads the divisor d and stores a result in
   dinv, as in "invert_limb (dinv, d)".

   Each d is a block limb xored with the current dinv, with
   GMP_LIMB_HIGHBIT then set.  s->time_divisor is set to SPEED_BLOCK_SIZE.
   No temporary memory is used.  */
#define SPEED_ROUTINE_INVERT_LIMB_CALL(call) \
  { \
    unsigned   i, j; \
    mp_limb_t  d, dinv=0; \
    /* point at the block itself, never before its first limb */ \
    mp_ptr     xp = s->xp_block; \
    double     t; \
    \
    s->time_divisor = SPEED_BLOCK_SIZE; \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      { \
        j = SPEED_BLOCK_SIZE; \
        do \
          { \
            /* j runs from SPEED_BLOCK_SIZE down to 1, hence j-1 */ \
            d = dinv ^ xp[j-1]; \
            d |= GMP_LIMB_HIGHBIT; \
            do { call; } while (0); \
          } \
        while (--j != 0); \
      } \
    while (--i != 0); \
    /* stop the clock before the noop_1 call, as in */ \
    /* SPEED_ROUTINE_COUNT_ZEROS_B */ \
    t = speed_endtime (); \
    \
    /* don't let the compiler optimize everything away */ \
    noop_1 (dinv); \
    \
    return t; \
  }
3014
3015
3016 #endif
3017
3018
/* Time s->reps consecutive calls of "function ()", which takes no
   arguments.  No operands are declared and speed_cache_fill is not
   called.  Returns the value of speed_endtime.  */
#define SPEED_ROUTINE_MPN_BACK_TO_BACK(function) \
  { \
    unsigned reps_left; \
    double elapsed; \
    \
    speed_starttime (); \
    reps_left = s->reps; \
    do \
      function (); \
    while (--reps_left != 0); \
    elapsed = speed_endtime (); \
    \
    return elapsed; \
  }
3029
3030
/* Time the statement "call", which operates on a single destination.  The
   local wp, an s->size limb area requested with alignment argument
   s->align_wp, is in scope for "call".  The routine is restricted to
   s->size >= 0.  "call" runs s->reps times between speed_starttime and
   speed_endtime, and the value of speed_endtime is returned.  */
#define SPEED_ROUTINE_MPN_ZERO_CALL(call) \
  { \
    mp_ptr wp; \
    unsigned i; \
    double t; \
    TMP_DECL; \
    \
    SPEED_RESTRICT_COND (s->size >= 0); \
    \
    TMP_MARK; \
    SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
    \
    /* wp is the only operand */ \
    speed_operand_dst (s, wp, s->size); \
    speed_cache_fill (s); \
    \
    speed_starttime (); \
    i = s->reps; \
    do \
      call; \
    while (--i != 0); \
    t = speed_endtime (); \
    \
    TMP_FREE; \
    return t; \
  }
3055
/* Time "zero_fn (wp, s->size)" by way of SPEED_ROUTINE_MPN_ZERO_CALL,
   which declares and allocates wp.  */
#define SPEED_ROUTINE_MPN_ZERO(zero_fn) \
  SPEED_ROUTINE_MPN_ZERO_CALL (zero_fn (wp, s->size))