1 /* Header for speed and threshold things.
3 Copyright 1999, 2000, 2001, 2002, 2003, 2005, 2006, 2008, 2009, 2010 Free
4 Software Foundation, Inc.
6 This file is part of the GNU MP Library.
8 The GNU MP Library is free software; you can redistribute it and/or modify
9 it under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or (at your
11 option) any later version.
13 The GNU MP Library is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16 License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
25 /* Pad ptr,oldsize with zero limbs (at the most significant end) to make it newsize limbs long. */
27 #define MPN_ZERO_EXTEND(ptr, oldsize, newsize) \
29 ASSERT ((newsize) >= (oldsize)); \
30 MPN_ZERO ((ptr)+(oldsize), (newsize)-(oldsize)); \
33 /* A mask of the least significant n bits. Note 1<<32 doesn't give zero on
34 x86 family CPUs, hence the separate case for GMP_LIMB_BITS. */
/* MP_LIMB_T_LOWBITMASK(n) yields an mp_limb_t with the low n bits set.
   n == GMP_LIMB_BITS is handled by a separate branch giving MP_LIMB_T_MAX,
   since a shift by the full limb width cannot be relied on to give zero.
   The argument n is expanded twice, so it should be free of side effects. */
35 #define MP_LIMB_T_LOWBITMASK(n) \
36 ((n) == GMP_LIMB_BITS ? MP_LIMB_T_MAX : ((mp_limb_t) 1 << (n)) - 1)
39 /* align must be a power of 2 here, usually CACHE_LINE_SIZE is a good choice */
/* Allocate bytes from TMP_ALLOC, over-allocating by align-1 bytes and
   passing the result through align_pointer with the requested alignment.
   NOTE(review): align_pointer is defined elsewhere; presumably it rounds the
   address up to a multiple of align, which is why the slack is requested. */
41 #define TMP_ALLOC_ALIGNED(bytes, align) \
42 align_pointer (TMP_ALLOC ((bytes) + (align)-1), (align))
/* As TMP_ALLOC_ALIGNED, but the size is given in limbs (scaled here by
   sizeof(mp_limb_t)) and the result is cast to mp_ptr.  align is handed on
   unchanged, so it is in whatever units TMP_ALLOC_ALIGNED expects. */
43 #define TMP_ALLOC_LIMBS_ALIGNED(limbs, align) \
44 ((mp_ptr) TMP_ALLOC_ALIGNED ((limbs)*sizeof(mp_limb_t), align))
46 /* CACHE_LINE_SIZE is our default alignment for speed operands, and the
47 limit on what s->align_xp etc can then request for off-alignment. Maybe
48 this should be an option of some sort; for now it is a fixed value. */
57 #define CACHE_LINE_SIZE 64 /* bytes */
/* Number of limbs in a cache line, minus one.  SPEED_TMP_ALLOC_LIMBS uses it
   both as the count of extra limbs to allocate and as the mask applied to
   the limb offset; as a mask it only works if the limbs-per-line count is a
   power of 2 -- TODO confirm that holds for every limb size in use. */
59 #define SPEED_TMP_ALLOC_ADJUST_MASK (CACHE_LINE_SIZE/BYTES_PER_MP_LIMB - 1)
61 /* Set ptr to a TMP_ALLOC block of the given limbs, with the given limb alignment. */
63 #define SPEED_TMP_ALLOC_LIMBS(ptr, limbs, align) \
66 mp_size_t __ptr_align, __ptr_add; \
68 ASSERT ((CACHE_LINE_SIZE % BYTES_PER_MP_LIMB) == 0); \
69 __ptr = TMP_ALLOC_LIMBS ((limbs) + SPEED_TMP_ALLOC_ADJUST_MASK); \
70 __ptr_align = (__ptr - (mp_ptr) NULL); \
71 __ptr_add = ((align) - __ptr_align) & SPEED_TMP_ALLOC_ADJUST_MASK; \
72 (ptr) = __ptr + __ptr_add; \
76 /* This is the size for s->xp_block and s->yp_block, used in certain
77 routines that want to run across many different data values and use
78 s->size for a different purpose, eg. SPEED_ROUTINE_MPN_GCD_1.
80 512 means 2kbytes of data for each of xp_block and yp_block, making 4k
81 total, which should fit easily in any L1 data cache. */
/* Counted in limbs, not bytes: a 2 kbyte block corresponds to 4-byte limbs;
   with 8-byte limbs each 512-limb block occupies 4 kbytes. */
83 #define SPEED_BLOCK_SIZE 512 /* limbs */
/* Timer state and entry points, all defined outside this header.
   speed_starttime and speed_endtime bracket the timed section in the
   SPEED_ROUTINE_* macros; the double returned by speed_endtime is what
   those macros store in t.
   NOTE(review): the units of speed_unittime and speed_cycletime are not
   visible here -- confirm against the timer implementation. */
86 extern double speed_unittime;
87 extern double speed_cycletime;
88 extern int speed_precision;
89 extern char speed_time_string[];
90 void speed_time_init __GMP_PROTO ((void));
91 void speed_cycletime_fail __GMP_PROTO ((const char *str));
92 void speed_cycletime_init __GMP_PROTO ((void));
93 void speed_cycletime_need_cycles __GMP_PROTO ((void));
94 void speed_cycletime_need_seconds __GMP_PROTO ((void));
95 void speed_starttime __GMP_PROTO ((void));
96 double speed_endtime __GMP_PROTO ((void));
100 unsigned reps; /* how many times to run the routine */
101 mp_ptr xp; /* first argument */
102 mp_ptr yp; /* second argument */
103 mp_size_t size; /* size of both arguments */
104 mp_limb_t r; /* user supplied parameter */
105 mp_size_t align_xp; /* alignment of xp */
106 mp_size_t align_yp; /* alignment of yp */
107 mp_size_t align_wp; /* intended alignment of wp */
108 mp_size_t align_wp2; /* intended alignment of wp2 */
109 mp_ptr xp_block; /* first special SPEED_BLOCK_SIZE block */
110 mp_ptr yp_block; /* second special SPEED_BLOCK_SIZE block */
112 double time_divisor; /* optionally set by the speed routine */
114 /* used by the cache priming things */
116 unsigned src_num, dst_num;
/* Signature shared by every measuring routine: it takes the parameters for
   one run and returns a double.  SPEED_RESTRICT_COND, used inside the
   SPEED_ROUTINE_* macros, returns -1.0 from the enclosing routine when a
   parameter check fails. */
123 typedef double (*speed_function_t) __GMP_PROTO ((struct speed_params *s));
/* Run the measuring routine fun with parameters s.
   NOTE(review): how the returned double relates to fun's own result is
   decided in the implementation, which is not visible here. */
125 double speed_measure __GMP_PROTO ((speed_function_t fun, struct speed_params *s));
127 /* Prototypes for speed measuring routines */
/* Each speed_* routine declared here has the speed_function_t signature
   (struct speed_params * in, double out), so it can be handed to
   speed_measure.  This first group covers limb-level and allocation
   operations rather than mpn/mpz functions. */
129 double speed_back_to_back __GMP_PROTO ((struct speed_params *s));
130 double speed_count_leading_zeros __GMP_PROTO ((struct speed_params *s));
131 double speed_count_trailing_zeros __GMP_PROTO ((struct speed_params *s));
132 double speed_find_a __GMP_PROTO ((struct speed_params *s));
133 double speed_gmp_allocate_free __GMP_PROTO ((struct speed_params *s));
134 double speed_gmp_allocate_reallocate_free __GMP_PROTO ((struct speed_params *s));
135 double speed_invert_limb __GMP_PROTO ((struct speed_params *s));
136 double speed_malloc_free __GMP_PROTO ((struct speed_params *s));
137 double speed_malloc_realloc_free __GMP_PROTO ((struct speed_params *s));
138 double speed_memcpy __GMP_PROTO ((struct speed_params *s));
139 double speed_binvert_limb __GMP_PROTO ((struct speed_params *s));
140 double speed_binvert_limb_mul1 __GMP_PROTO ((struct speed_params *s));
141 double speed_binvert_limb_loop __GMP_PROTO ((struct speed_params *s));
142 double speed_binvert_limb_cond __GMP_PROTO ((struct speed_params *s));
143 double speed_binvert_limb_arith __GMP_PROTO ((struct speed_params *s));
/* mpf measuring routine (same speed_function_t signature as the rest). */
145 double speed_mpf_init_clear __GMP_PROTO ((struct speed_params *s));
/* mpn measuring routines.  All take a struct speed_params * and return a
   double; the upper-case names (speed_MPN_COPY etc) follow the spelling of
   the operation being timed. */
147 double speed_mpn_add_n __GMP_PROTO ((struct speed_params *s));
148 double speed_mpn_addlsh1_n __GMP_PROTO ((struct speed_params *s));
149 double speed_mpn_addlsh2_n __GMP_PROTO ((struct speed_params *s));
150 double speed_mpn_add_n_sub_n __GMP_PROTO ((struct speed_params *s));
151 double speed_mpn_and_n __GMP_PROTO ((struct speed_params *s));
152 double speed_mpn_andn_n __GMP_PROTO ((struct speed_params *s));
153 double speed_mpn_addmul_1 __GMP_PROTO ((struct speed_params *s));
154 double speed_mpn_addmul_2 __GMP_PROTO ((struct speed_params *s));
155 double speed_mpn_addmul_3 __GMP_PROTO ((struct speed_params *s));
156 double speed_mpn_addmul_4 __GMP_PROTO ((struct speed_params *s));
157 double speed_mpn_addmul_5 __GMP_PROTO ((struct speed_params *s));
158 double speed_mpn_addmul_6 __GMP_PROTO ((struct speed_params *s));
159 double speed_mpn_addmul_7 __GMP_PROTO ((struct speed_params *s));
160 double speed_mpn_addmul_8 __GMP_PROTO ((struct speed_params *s));
161 double speed_mpn_com __GMP_PROTO ((struct speed_params *s));
162 double speed_mpn_copyd __GMP_PROTO ((struct speed_params *s));
163 double speed_mpn_copyi __GMP_PROTO ((struct speed_params *s));
164 double speed_MPN_COPY __GMP_PROTO ((struct speed_params *s));
165 double speed_MPN_COPY_DECR __GMP_PROTO ((struct speed_params *s));
166 double speed_MPN_COPY_INCR __GMP_PROTO ((struct speed_params *s));
167 double speed_mpn_divexact_1 __GMP_PROTO ((struct speed_params *s));
168 double speed_mpn_divexact_by3 __GMP_PROTO ((struct speed_params *s));
/* The next two omit the parameter name; the type is the same as elsewhere. */
169 double speed_mpn_bdiv_q_1 __GMP_PROTO ((struct speed_params *));
170 double speed_mpn_pi1_bdiv_q_1 __GMP_PROTO ((struct speed_params *));
171 double speed_mpn_bdiv_dbm1c __GMP_PROTO ((struct speed_params *s));
172 double speed_mpn_divrem_1 __GMP_PROTO ((struct speed_params *s));
173 double speed_mpn_divrem_1f __GMP_PROTO ((struct speed_params *s));
174 double speed_mpn_divrem_1c __GMP_PROTO ((struct speed_params *s));
175 double speed_mpn_divrem_1cf __GMP_PROTO ((struct speed_params *s));
176 double speed_mpn_divrem_1_div __GMP_PROTO ((struct speed_params *s));
177 double speed_mpn_divrem_1f_div __GMP_PROTO ((struct speed_params *s));
178 double speed_mpn_divrem_1_inv __GMP_PROTO ((struct speed_params *s));
179 double speed_mpn_divrem_1f_inv __GMP_PROTO ((struct speed_params *s));
180 double speed_mpn_divrem_2 __GMP_PROTO ((struct speed_params *s));
181 double speed_mpn_divrem_2_div __GMP_PROTO ((struct speed_params *s));
182 double speed_mpn_divrem_2_inv __GMP_PROTO ((struct speed_params *s));
183 double speed_mpn_fib2_ui __GMP_PROTO ((struct speed_params *s));
184 double speed_mpn_matrix22_mul __GMP_PROTO ((struct speed_params *s));
185 double speed_mpn_hgcd __GMP_PROTO ((struct speed_params *s));
186 double speed_mpn_hgcd_lehmer __GMP_PROTO ((struct speed_params *s));
187 double speed_mpn_gcd __GMP_PROTO ((struct speed_params *s));
188 double speed_mpn_gcd_1 __GMP_PROTO ((struct speed_params *s));
189 double speed_mpn_gcd_1N __GMP_PROTO ((struct speed_params *s));
190 double speed_mpn_gcdext __GMP_PROTO ((struct speed_params *s));
191 double speed_mpn_gcdext_double __GMP_PROTO ((struct speed_params *s));
192 double speed_mpn_gcdext_one_double __GMP_PROTO ((struct speed_params *s));
193 double speed_mpn_gcdext_one_single __GMP_PROTO ((struct speed_params *s));
194 double speed_mpn_gcdext_single __GMP_PROTO ((struct speed_params *s));
195 double speed_mpn_get_str __GMP_PROTO ((struct speed_params *s));
196 double speed_mpn_hamdist __GMP_PROTO ((struct speed_params *s));
197 double speed_mpn_ior_n __GMP_PROTO ((struct speed_params *s));
198 double speed_mpn_iorn_n __GMP_PROTO ((struct speed_params *s));
199 double speed_mpn_jacobi_base __GMP_PROTO ((struct speed_params *s));
200 double speed_mpn_jacobi_base_1 __GMP_PROTO ((struct speed_params *s));
201 double speed_mpn_jacobi_base_2 __GMP_PROTO ((struct speed_params *s));
202 double speed_mpn_jacobi_base_3 __GMP_PROTO ((struct speed_params *s));
203 double speed_mpn_lshift __GMP_PROTO ((struct speed_params *s));
204 double speed_mpn_lshiftc __GMP_PROTO ((struct speed_params *s));
205 double speed_mpn_mod_1 __GMP_PROTO ((struct speed_params *s));
206 double speed_mpn_mod_1c __GMP_PROTO ((struct speed_params *s));
207 double speed_mpn_mod_1_div __GMP_PROTO ((struct speed_params *s));
208 double speed_mpn_mod_1_inv __GMP_PROTO ((struct speed_params *s));
209 double speed_mpn_mod_1_1 __GMP_PROTO ((struct speed_params *s));
210 double speed_mpn_mod_1_2 __GMP_PROTO ((struct speed_params *s));
211 double speed_mpn_mod_1_3 __GMP_PROTO ((struct speed_params *s));
212 double speed_mpn_mod_1_4 __GMP_PROTO ((struct speed_params *s));
213 double speed_mpn_mod_34lsub1 __GMP_PROTO ((struct speed_params *s));
214 double speed_mpn_modexact_1_odd __GMP_PROTO ((struct speed_params *s));
215 double speed_mpn_modexact_1c_odd __GMP_PROTO ((struct speed_params *s));
216 double speed_mpn_mul_1 __GMP_PROTO ((struct speed_params *s));
217 double speed_mpn_mul_1_inplace __GMP_PROTO ((struct speed_params *s));
218 double speed_mpn_mul_2 __GMP_PROTO ((struct speed_params *s));
219 double speed_mpn_mul_3 __GMP_PROTO ((struct speed_params *s));
220 double speed_mpn_mul_4 __GMP_PROTO ((struct speed_params *s));
221 double speed_mpn_mul __GMP_PROTO ((struct speed_params *s));
222 double speed_mpn_mul_basecase __GMP_PROTO ((struct speed_params *s));
223 double speed_mpn_mul_fft __GMP_PROTO ((struct speed_params *s));
224 double speed_mpn_mul_fft_sqr __GMP_PROTO ((struct speed_params *s));
225 double speed_mpn_fft_mul __GMP_PROTO ((struct speed_params *s));
226 double speed_mpn_fft_sqr __GMP_PROTO ((struct speed_params *s));
/* Declared only when WANT_OLD_FFT_FULL is non-zero.
   NOTE(review): the directive closing this conditional is not visible in
   this excerpt -- confirm it is present in the real file. */
227 #if WANT_OLD_FFT_FULL
228 double speed_mpn_mul_fft_full __GMP_PROTO ((struct speed_params *s));
229 double speed_mpn_mul_fft_full_sqr __GMP_PROTO ((struct speed_params *s));
/* Further mpn measuring routines, all with the speed_function_t signature
   (struct speed_params * in, double out). */
231 double speed_mpn_nussbaumer_mul __GMP_PROTO ((struct speed_params *s));
232 double speed_mpn_nussbaumer_mul_sqr __GMP_PROTO ((struct speed_params *s));
233 double speed_mpn_mul_n __GMP_PROTO ((struct speed_params *s));
234 double speed_mpn_mul_n_sqr __GMP_PROTO ((struct speed_params *s));
235 double speed_mpn_mullo_n __GMP_PROTO ((struct speed_params *s));
236 double speed_mpn_mullo_basecase __GMP_PROTO ((struct speed_params *s));
237 double speed_mpn_nand_n __GMP_PROTO ((struct speed_params *s));
238 double speed_mpn_nior_n __GMP_PROTO ((struct speed_params *s));
239 double speed_mpn_popcount __GMP_PROTO ((struct speed_params *s));
240 double speed_mpn_preinv_divrem_1 __GMP_PROTO ((struct speed_params *s));
241 double speed_mpn_preinv_divrem_1f __GMP_PROTO ((struct speed_params *s));
242 double speed_mpn_preinv_mod_1 __GMP_PROTO ((struct speed_params *s));
243 double speed_mpn_sbpi1_div_qr __GMP_PROTO ((struct speed_params *s));
244 double speed_mpn_dcpi1_div_qr __GMP_PROTO ((struct speed_params *s));
245 double speed_mpn_sbpi1_divappr_q __GMP_PROTO ((struct speed_params *s));
246 double speed_mpn_dcpi1_divappr_q __GMP_PROTO ((struct speed_params *s));
247 double speed_mpn_mu_div_qr __GMP_PROTO ((struct speed_params *s));
248 double speed_mpn_mu_divappr_q __GMP_PROTO ((struct speed_params *s));
249 double speed_mpn_mupi_div_qr __GMP_PROTO ((struct speed_params *s));
250 double speed_mpn_mu_div_q __GMP_PROTO ((struct speed_params *s));
251 double speed_mpn_sbpi1_bdiv_qr __GMP_PROTO ((struct speed_params *s));
252 double speed_mpn_dcpi1_bdiv_qr __GMP_PROTO ((struct speed_params *s));
253 double speed_mpn_sbpi1_bdiv_q __GMP_PROTO ((struct speed_params *s));
254 double speed_mpn_dcpi1_bdiv_q __GMP_PROTO ((struct speed_params *s));
255 double speed_mpn_mu_bdiv_q __GMP_PROTO ((struct speed_params *s));
256 double speed_mpn_mu_bdiv_qr __GMP_PROTO ((struct speed_params *s));
257 double speed_mpn_invert __GMP_PROTO ((struct speed_params *s));
258 double speed_mpn_invertappr __GMP_PROTO ((struct speed_params *s));
259 double speed_mpn_ni_invertappr __GMP_PROTO ((struct speed_params *s));
260 double speed_mpn_binvert __GMP_PROTO ((struct speed_params *s));
261 double speed_mpn_redc_1 __GMP_PROTO ((struct speed_params *s));
262 double speed_mpn_redc_2 __GMP_PROTO ((struct speed_params *s));
263 double speed_mpn_redc_n __GMP_PROTO ((struct speed_params *s));
264 double speed_mpn_rsblsh1_n __GMP_PROTO ((struct speed_params *s));
265 double speed_mpn_rsblsh2_n __GMP_PROTO ((struct speed_params *s));
266 double speed_mpn_rsh1add_n __GMP_PROTO ((struct speed_params *s));
267 double speed_mpn_rsh1sub_n __GMP_PROTO ((struct speed_params *s));
268 double speed_mpn_rshift __GMP_PROTO ((struct speed_params *s));
269 double speed_mpn_sb_divrem_m3 __GMP_PROTO ((struct speed_params *s));
270 double speed_mpn_sb_divrem_m3_div __GMP_PROTO ((struct speed_params *s));
271 double speed_mpn_sb_divrem_m3_inv __GMP_PROTO ((struct speed_params *s));
272 double speed_mpn_set_str __GMP_PROTO ((struct speed_params *s));
273 double speed_mpn_bc_set_str __GMP_PROTO ((struct speed_params *s));
274 double speed_mpn_dc_set_str __GMP_PROTO ((struct speed_params *s));
275 double speed_mpn_set_str_pre __GMP_PROTO ((struct speed_params *s));
276 double speed_mpn_sqr_basecase __GMP_PROTO ((struct speed_params *s));
277 double speed_mpn_sqr_diagonal __GMP_PROTO ((struct speed_params *s));
278 double speed_mpn_sqr __GMP_PROTO ((struct speed_params *s));
279 double speed_mpn_sqrtrem __GMP_PROTO ((struct speed_params *s));
280 double speed_mpn_rootrem __GMP_PROTO ((struct speed_params *s));
281 double speed_mpn_sub_n __GMP_PROTO ((struct speed_params *s));
282 double speed_mpn_sublsh1_n __GMP_PROTO ((struct speed_params *s));
283 double speed_mpn_sublsh2_n __GMP_PROTO ((struct speed_params *s));
284 double speed_mpn_submul_1 __GMP_PROTO ((struct speed_params *s));
/* Toom squaring and multiplication routines, including the cross-over
   pairs named toomXY_for_toomZW. */
285 double speed_mpn_toom2_sqr __GMP_PROTO ((struct speed_params *s));
286 double speed_mpn_toom3_sqr __GMP_PROTO ((struct speed_params *s));
287 double speed_mpn_toom4_sqr __GMP_PROTO ((struct speed_params *s));
288 double speed_mpn_toom6_sqr __GMP_PROTO ((struct speed_params *s));
289 double speed_mpn_toom8_sqr __GMP_PROTO ((struct speed_params *s));
290 double speed_mpn_toom22_mul __GMP_PROTO ((struct speed_params *s));
291 double speed_mpn_toom33_mul __GMP_PROTO ((struct speed_params *s));
292 double speed_mpn_toom44_mul __GMP_PROTO ((struct speed_params *s));
293 double speed_mpn_toom6h_mul __GMP_PROTO ((struct speed_params *s));
294 double speed_mpn_toom8h_mul __GMP_PROTO ((struct speed_params *s));
295 double speed_mpn_toom32_mul __GMP_PROTO ((struct speed_params *s));
296 double speed_mpn_toom42_mul __GMP_PROTO ((struct speed_params *s));
297 double speed_mpn_toom43_mul __GMP_PROTO ((struct speed_params *s));
298 double speed_mpn_toom63_mul __GMP_PROTO ((struct speed_params *s));
299 double speed_mpn_toom32_for_toom43_mul __GMP_PROTO ((struct speed_params *s));
300 double speed_mpn_toom43_for_toom32_mul __GMP_PROTO ((struct speed_params *s));
301 double speed_mpn_toom32_for_toom53_mul __GMP_PROTO ((struct speed_params *s));
302 double speed_mpn_toom53_for_toom32_mul __GMP_PROTO ((struct speed_params *s));
303 double speed_mpn_toom42_for_toom53_mul __GMP_PROTO ((struct speed_params *s));
304 double speed_mpn_toom53_for_toom42_mul __GMP_PROTO ((struct speed_params *s));
305 double speed_mpn_mulmod_bnm1 __GMP_PROTO ((struct speed_params *s));
306 double speed_mpn_bc_mulmod_bnm1 __GMP_PROTO ((struct speed_params *s));
307 double speed_mpn_mulmod_bnm1_rounded __GMP_PROTO ((struct speed_params *s));
308 double speed_mpn_sqrmod_bnm1 __GMP_PROTO ((struct speed_params *s));
309 double speed_mpn_udiv_qrnnd __GMP_PROTO ((struct speed_params *s));
310 double speed_mpn_udiv_qrnnd_r __GMP_PROTO ((struct speed_params *s));
311 double speed_mpn_umul_ppmm __GMP_PROTO ((struct speed_params *s));
312 double speed_mpn_umul_ppmm_r __GMP_PROTO ((struct speed_params *s));
313 double speed_mpn_xnor_n __GMP_PROTO ((struct speed_params *s));
314 double speed_mpn_xor_n __GMP_PROTO ((struct speed_params *s));
315 double speed_MPN_ZERO __GMP_PROTO ((struct speed_params *s));
/* mpq measuring routine. */
317 double speed_mpq_init_clear __GMP_PROTO ((struct speed_params *s));
/* mpz measuring routines. */
319 double speed_mpz_add __GMP_PROTO ((struct speed_params *s));
320 double speed_mpz_bin_uiui __GMP_PROTO ((struct speed_params *s));
321 double speed_mpz_fac_ui __GMP_PROTO ((struct speed_params *s));
322 double speed_mpz_fib_ui __GMP_PROTO ((struct speed_params *s));
323 double speed_mpz_fib2_ui __GMP_PROTO ((struct speed_params *s));
324 double speed_mpz_init_clear __GMP_PROTO ((struct speed_params *s));
325 double speed_mpz_init_realloc_clear __GMP_PROTO ((struct speed_params *s));
326 double speed_mpz_jacobi __GMP_PROTO ((struct speed_params *s));
327 double speed_mpz_lucnum_ui __GMP_PROTO ((struct speed_params *s));
328 double speed_mpz_lucnum2_ui __GMP_PROTO ((struct speed_params *s));
329 double speed_mpz_mod __GMP_PROTO ((struct speed_params *s));
330 double speed_mpz_powm __GMP_PROTO ((struct speed_params *s));
331 double speed_mpz_powm_mod __GMP_PROTO ((struct speed_params *s));
332 double speed_mpz_powm_redc __GMP_PROTO ((struct speed_params *s));
333 double speed_mpz_powm_ui __GMP_PROTO ((struct speed_params *s));
334 double speed_mpz_urandomb __GMP_PROTO ((struct speed_params *s));
/* Random state seeding. */
336 double speed_gmp_randseed __GMP_PROTO ((struct speed_params *s));
337 double speed_gmp_randseed_ui __GMP_PROTO ((struct speed_params *s));
/* Routines named after the noop functions declared further down.
   NOTE(review): presumably these time an empty call as a baseline --
   confirm in the implementation. */
339 double speed_noop __GMP_PROTO ((struct speed_params *s));
340 double speed_noop_wxs __GMP_PROTO ((struct speed_params *s));
341 double speed_noop_wxys __GMP_PROTO ((struct speed_params *s));
/* C operator and limb-primitive measurements. */
343 double speed_operator_div __GMP_PROTO ((struct speed_params *s));
344 double speed_operator_mod __GMP_PROTO ((struct speed_params *s));
346 double speed_udiv_qrnnd __GMP_PROTO ((struct speed_params *s));
347 double speed_udiv_qrnnd_preinv1 __GMP_PROTO ((struct speed_params *s));
348 double speed_udiv_qrnnd_preinv2 __GMP_PROTO ((struct speed_params *s));
349 double speed_udiv_qrnnd_c __GMP_PROTO ((struct speed_params *s));
350 double speed_umul_ppmm __GMP_PROTO ((struct speed_params *s));
352 /* Prototypes for other routines */
354 /* low 32-bits in p[0], high 32-bits in p[1] */
/* Store the current cycle count into the two-element array p.  This is the
   function form; for GNU C on i386 (without NO_ASM) a macro of the same
   name is defined further down in this header. */
355 void speed_cyclecounter __GMP_PROTO ((unsigned p[2]));
/* Same signature as speed_cyclecounter.
   NOTE(review): presumably a reader based on the PowerPC mftb instruction
   -- confirm against its definition. */
357 void mftb_function __GMP_PROTO ((unsigned p[2]));
359 /* In i386 gcc -fPIC, ebx is a fixed register and can't be declared a dummy
360 output or a clobber for the cpuid, hence an explicit save and restore. A
361 clobber as such doesn't provoke an error unfortunately (gcc 3.0), so use
362 the dummy output style in non-PIC, so there's an error if somehow -fPIC
363 is used without a -DPIC to tell us about it. */
364 #if defined(__GNUC__) && ! defined (NO_ASM) \
365 && (defined (__i386__) || defined (__i486__))
367 #define speed_cyclecounter(p) \
369 int __speed_cyclecounter__save_ebx; \
370 int __speed_cyclecounter__dummy; \
371 __asm__ __volatile__ ("movl %%ebx, %1\n" \
376 "=&rm" (__speed_cyclecounter__save_ebx), \
377 "=c" (__speed_cyclecounter__dummy), \
381 #define speed_cyclecounter(p) \
383 int __speed_cyclecounter__dummy1; \
384 int __speed_cyclecounter__dummy2; \
385 __asm__ __volatile__ ("cpuid\n" \
388 "=b" (__speed_cyclecounter__dummy1), \
389 "=c" (__speed_cyclecounter__dummy2), \
/* Difference between two speed_cyclecounter samples, as a double.
   FREQ_MEASURE_ONE calls it with the end sample first and the start sample
   second. */
395 double speed_cyclecounter_diff __GMP_PROTO ((const unsigned [2], const unsigned [2]));
/* Predicates about the available timing methods.
   NOTE(review): presumably non-zero means "yes" -- confirm. */
396 int gettimeofday_microseconds_p __GMP_PROTO ((void));
397 int getrusage_microseconds_p __GMP_PROTO ((void));
398 int cycles_works_p __GMP_PROTO ((void));
/* FREQ_MEASURE_ONE computes half a clock tick in microseconds as
   (1000000L / clk_tck()) / 2, so this is the tick rate per second. */
399 long clk_tck __GMP_PROTO ((void));
400 double freq_measure __GMP_PROTO ((const char *, double (*)(void)));
402 int double_cmp_ptr __GMP_PROTO ((const double *, const double *));
403 void pentium_wbinvd __GMP_PROTO ((void));
/* Type of a comparison callback taking two const void pointers. */
404 typedef int (*qsort_function_t) __GMP_PROTO ((const void *, const void *));
/* Functions named noop with various argument lists.
   NOTE(review): presumably empty bodies used as call targets -- confirm. */
406 void noop __GMP_PROTO ((void));
407 void noop_1 __GMP_PROTO ((mp_limb_t));
408 void noop_wxs __GMP_PROTO ((mp_ptr, mp_srcptr, mp_size_t));
409 void noop_wxys __GMP_PROTO ((mp_ptr, mp_srcptr, mp_srcptr, mp_size_t));
/* Cache priming support.  The SPEED_ROUTINE_* macros register each source
   and destination operand with speed_operand_src / speed_operand_dst and
   then call speed_cache_fill before starting the timer. */
410 void mpn_cache_fill __GMP_PROTO ((mp_srcptr, mp_size_t));
411 void mpn_cache_fill_dummy __GMP_PROTO ((mp_limb_t));
412 void speed_cache_fill __GMP_PROTO ((struct speed_params *));
413 void speed_operand_src __GMP_PROTO ((struct speed_params *, mp_ptr, mp_size_t));
414 void speed_operand_dst __GMP_PROTO ((struct speed_params *, mp_ptr, mp_size_t));
/* Option settings.  FREQ_MEASURE_ONE prints a diagnostic line when
   speed_option_verbose is at least 2. */
416 extern int speed_option_addrs;
417 extern int speed_option_verbose;
418 void speed_option_set __GMP_PROTO((const char *));
/* Prototypes for functions implemented outside this header.  Several share
   a name stem with a speed_* routine declared earlier (for example
   mpn_divrem_1_div and speed_mpn_divrem_1_div).
   NOTE(review): presumably these are the alternative implementations those
   routines time -- confirm against the measuring code. */
420 mp_limb_t mpn_divrem_1_div __GMP_PROTO ((mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t));
421 mp_limb_t mpn_divrem_1_inv __GMP_PROTO ((mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t));
422 mp_limb_t mpn_divrem_2_div __GMP_PROTO ((mp_ptr, mp_size_t, mp_ptr, mp_size_t, mp_srcptr));
423 mp_limb_t mpn_divrem_2_inv __GMP_PROTO ((mp_ptr, mp_size_t, mp_ptr, mp_size_t, mp_srcptr));
425 int mpn_jacobi_base_1 __GMP_PROTO ((mp_limb_t, mp_limb_t, int));
426 int mpn_jacobi_base_2 __GMP_PROTO ((mp_limb_t, mp_limb_t, int));
427 int mpn_jacobi_base_3 __GMP_PROTO ((mp_limb_t, mp_limb_t, int));
429 mp_limb_t mpn_mod_1_div __GMP_PROTO ((mp_srcptr, mp_size_t, mp_limb_t));
430 mp_limb_t mpn_mod_1_inv __GMP_PROTO ((mp_srcptr, mp_size_t, mp_limb_t));
432 mp_size_t mpn_gcd_binary
433 __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t, mp_ptr, mp_size_t));
434 mp_size_t mpn_gcd_accel
435 __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t, mp_ptr, mp_size_t));
/* The four gcdext variants share one parameter list. */
436 mp_size_t mpn_gcdext_one_double
437 __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t));
438 mp_size_t mpn_gcdext_one_single
439 __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t));
440 mp_size_t mpn_gcdext_single
441 __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t));
442 mp_size_t mpn_gcdext_double
443 __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t *, mp_ptr, mp_size_t, mp_ptr, mp_size_t));
445 mp_limb_t mpn_sb_divrem_mn_div __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t));
446 mp_limb_t mpn_sb_divrem_mn_inv __GMP_PROTO ((mp_ptr, mp_ptr, mp_size_t, mp_srcptr, mp_size_t));
448 mp_size_t mpn_set_str_basecase __GMP_PROTO ((mp_ptr, const unsigned char *, size_t, int));
449 void mpn_pre_set_str __GMP_PROTO ((mp_ptr, unsigned char *, size_t, powers_t *, mp_ptr));
451 void mpz_powm_mod __GMP_PROTO ((mpz_ptr, mpz_srcptr, mpz_srcptr, mpz_srcptr));
452 void mpz_powm_redc __GMP_PROTO ((mpz_ptr, mpz_srcptr, mpz_srcptr, mpz_srcptr));
/* Setup helper for the count_leading_zeros / count_trailing_zeros
   measurements (going by its name; its contract is not visible here). */
454 int speed_routine_count_zeros_setup
455 __GMP_PROTO ((struct speed_params *, mp_ptr, int, int));
458 /* "get" is called repeatedly until it ticks over, just in case on a fast
459 processor it takes less than a microsecond, though this is probably
460 unlikely if it's a system call.
462 speed_cyclecounter is called on the same side of the "get" for the start
463 and end measurements. It doesn't matter how long it takes from the "get"
464 sample to the cycles sample, since that period will cancel out in the
465 difference calculation (assuming it's the same each time).
467 Letting the test run for more than a process time slice is probably only
468 going to reduce accuracy, especially for getrusage when the cycle counter
469 is real time, or for gettimeofday if the cycle counter is in fact process
470 time. Use CLK_TCK/2 as a reasonable stop.
472 It'd be desirable to be quite accurate here. The default speed_precision
473 for a cycle counter is 10000 cycles, so to mix that with getrusage or
474 gettimeofday the frequency should be at least that accurate. But running
475 measurements for 10000 microseconds (or more) is too long. Be satisfied
476 with just a half clock tick (5000 microseconds usually). */
478 #define FREQ_MEASURE_ONE(name, type, get, getc, sec, usec) \
480 type st1, st, et1, et; \
481 unsigned sc[2], ec[2]; \
482 long dt, half_tick; \
485 half_tick = (1000000L / clk_tck()) / 2; \
490 } while (usec(st) == usec(st1) && sec(st) == sec(st1)); \
499 } while (usec(et) == usec(et1) && sec(et) == sec(et1)); \
503 dc = speed_cyclecounter_diff (ec, sc); \
505 /* allow secs to cancel before multiplying */ \
506 dt = sec(et) - sec(st); \
507 dt = dt * 1000000L + (usec(et) - usec(st)); \
509 if (dt >= half_tick) \
513 cyc = dt * 1e-6 / dc; \
515 if (speed_option_verbose >= 2) \
516 printf ("freq_measure_%s_one() dc=%.6g dt=%ld cyc=%.6g\n", \
517 name, dc, dt, cyc); \
519 return dt * 1e-6 / dc; \
526 /* The measuring routines use these big macros to save duplication for
527 similar forms. They also get used for some automatically generated
528 measuring of new implementations of functions.
530 Having something like SPEED_ROUTINE_BINARY_N as a subroutine accepting a
531 function pointer is considered undesirable since it's not the way a
532 normal application will be calling, and some processors might do
533 different things with an indirect call, like not branch predicting, or
534 doing a full pipe flush. At least some of the "functions" measured are actually macros too.
537 The net effect is to bloat the object code, possibly in a big way, but
538 only what's being measured is being run, so that doesn't matter.
540 The loop forms don't try to cope with __GMP_ATTRIBUTE_PURE or
541 ATTRIBUTE_CONST on the called functions. Adding a cast to a non-pure
542 function pointer doesn't work in gcc 3.2. Using an actual non-pure
543 function pointer variable works, but stands a real risk of a
544 non-optimizing compiler generating unnecessary overheads in the call.
545 Currently the best idea is not to use those attributes for a timing
546 program build. __GMP_NO_ATTRIBUTE_CONST_PURE will tell gmp.h and
547 gmp-impl.h to omit them from routines there. */
/* Bail out of the enclosing measuring routine with -1.0 when cond is false.
   The expansion is a bare if statement that already ends in a semicolon, so
   a use written with its own trailing semicolon leaves an extra empty
   statement, and the macro must not be used as the unbraced body of an
   if/else. */
549 #define SPEED_RESTRICT_COND(cond) if (!(cond)) return -1.0;
551 /* For mpn_copy or similar. */
552 #define SPEED_ROUTINE_MPN_COPY(function) \
559 SPEED_RESTRICT_COND (s->size >= 0); \
562 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
564 speed_operand_src (s, s->xp, s->size); \
565 speed_operand_dst (s, wp, s->size); \
566 speed_cache_fill (s); \
568 speed_starttime (); \
571 function (wp, s->xp, s->size); \
573 t = speed_endtime (); \
579 #define SPEED_ROUTINE_MPN_COPYC(function) \
586 SPEED_RESTRICT_COND (s->size >= 0); \
589 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
591 speed_operand_src (s, s->xp, s->size); \
592 speed_operand_dst (s, wp, s->size); \
593 speed_cache_fill (s); \
595 speed_starttime (); \
598 function (wp, s->xp, s->size, 0); \
600 t = speed_endtime (); \
606 /* s->size is still in limbs, and it's limbs which are copied, but
607 "function" takes a size in bytes not limbs. */
608 #define SPEED_ROUTINE_MPN_COPY_BYTES(function) \
615 SPEED_RESTRICT_COND (s->size >= 0); \
618 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
620 speed_operand_src (s, s->xp, s->size); \
621 speed_operand_dst (s, wp, s->size); \
622 speed_cache_fill (s); \
624 speed_starttime (); \
627 function (wp, s->xp, s->size * BYTES_PER_MP_LIMB); \
629 t = speed_endtime (); \
636 /* For mpn_add_n, mpn_sub_n, or similar. */
637 #define SPEED_ROUTINE_MPN_BINARY_N_CALL(call) \
645 SPEED_RESTRICT_COND (s->size >= 1); \
648 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
654 else if (s->r == 1) { xp = wp; } \
655 else if (s->r == 2) { yp = wp; } \
656 else if (s->r == 3) { xp = wp; yp = wp; } \
657 else if (s->r == 4) { yp = xp; } \
663 /* initialize wp if operand overlap */ \
664 if (xp == wp || yp == wp) \
665 MPN_COPY (wp, s->xp, s->size); \
667 speed_operand_src (s, xp, s->size); \
668 speed_operand_src (s, yp, s->size); \
669 speed_operand_dst (s, wp, s->size); \
670 speed_cache_fill (s); \
672 speed_starttime (); \
677 t = speed_endtime (); \
683 /* For mpn_add_n_sub_n or similar, with two destinations (ap and sp). */
684 #define SPEED_ROUTINE_MPN_ADDSUB_N_CALL(call) \
692 SPEED_RESTRICT_COND (s->size >= 1); \
695 SPEED_TMP_ALLOC_LIMBS (ap, s->size, s->align_wp); \
696 SPEED_TMP_ALLOC_LIMBS (sp, s->size, s->align_wp); \
701 if ((s->r & 1) != 0) { xp = ap; } \
702 if ((s->r & 2) != 0) { yp = ap; } \
703 if ((s->r & 4) != 0) { xp = sp; } \
704 if ((s->r & 8) != 0) { yp = sp; } \
705 if ((s->r & 3) == 3 || (s->r & 12) == 12) \
711 /* initialize ap if operand overlap */ \
712 if (xp == ap || yp == ap) \
713 MPN_COPY (ap, s->xp, s->size); \
714 /* initialize sp if operand overlap */ \
715 if (xp == sp || yp == sp) \
716 MPN_COPY (sp, s->xp, s->size); \
718 speed_operand_src (s, xp, s->size); \
719 speed_operand_src (s, yp, s->size); \
720 speed_operand_dst (s, ap, s->size); \
721 speed_operand_dst (s, sp, s->size); \
722 speed_cache_fill (s); \
724 speed_starttime (); \
729 t = speed_endtime (); \
/* Instantiate SPEED_ROUTINE_MPN_BINARY_N_CALL for a function called as
   function (wp, xp, yp, s->size). */
735 #define SPEED_ROUTINE_MPN_BINARY_N(function) \
736 SPEED_ROUTINE_MPN_BINARY_N_CALL ((*function) (wp, xp, yp, s->size))
/* The same, for a function taking one extra final argument, passed as 0. */
738 #define SPEED_ROUTINE_MPN_BINARY_NC(function) \
739 SPEED_ROUTINE_MPN_BINARY_N_CALL ((*function) (wp, xp, yp, s->size, 0))
742 /* For mpn_lshift, mpn_rshift, mpn_mul_1, with r, or similar. */
743 #define SPEED_ROUTINE_MPN_UNARY_1_CALL(call) \
750 SPEED_RESTRICT_COND (s->size >= 1); \
753 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
755 speed_operand_src (s, s->xp, s->size); \
756 speed_operand_dst (s, wp, s->size); \
757 speed_cache_fill (s); \
759 speed_starttime (); \
764 t = speed_endtime (); \
/* Instantiate SPEED_ROUTINE_MPN_UNARY_1_CALL for a function called as
   function (wp, s->xp, s->size, s->r). */
770 #define SPEED_ROUTINE_MPN_UNARY_1(function) \
771 SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))
/* As SPEED_ROUTINE_MPN_UNARY_1, with an extra final argument passed as 0. */
773 #define SPEED_ROUTINE_MPN_UNARY_1C(function) \
774 SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r, 0))
/* In-place form: wp is passed as both destination and source. */
776 /* FIXME: wp is uninitialized here, should start it off from xp */
777 #define SPEED_ROUTINE_MPN_UNARY_1_INPLACE(function) \
778 SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, wp, s->size, s->r))
/* The next two expand exactly like SPEED_ROUTINE_MPN_UNARY_1; they exist as
   separate names for the divexact_1 and bdiv_q_1 style of function. */
780 #define SPEED_ROUTINE_MPN_DIVEXACT_1(function) \
781 SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))
783 #define SPEED_ROUTINE_MPN_BDIV_Q_1(function) \
784 SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r))
786 #define SPEED_ROUTINE_MPN_PI1_BDIV_Q_1_CALL(call) \
791 SPEED_RESTRICT_COND (s->size > 0); \
792 SPEED_RESTRICT_COND (s->r != 0); \
794 count_trailing_zeros (shift, s->r); \
795 binvert_limb (dinv, s->r >> shift); \
797 SPEED_ROUTINE_MPN_UNARY_1_CALL (call); \
/* Call function (wp, s->xp, s->size, s->r, dinv, shift).  dinv and shift
   come from SPEED_ROUTINE_MPN_PI1_BDIV_Q_1_CALL, which derives them from
   s->r with count_trailing_zeros and binvert_limb. */
799 #define SPEED_ROUTINE_MPN_PI1_BDIV_Q_1(function) \
800 SPEED_ROUTINE_MPN_PI1_BDIV_Q_1_CALL \
801 ((*function) (wp, s->xp, s->size, s->r, dinv, shift))
/* Same call shape as SPEED_ROUTINE_MPN_UNARY_1C: a final argument of 0. */
803 #define SPEED_ROUTINE_MPN_BDIV_DBM1C(function) \
804 SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->xp, s->size, s->r, 0))
/* divrem_1 style: function (wp, 0, s->xp, s->size, s->r), i.e. the second
   argument is 0 and s->size is the source size. */
806 #define SPEED_ROUTINE_MPN_DIVREM_1(function) \
807 SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, 0, s->xp, s->size, s->r))
/* As SPEED_ROUTINE_MPN_DIVREM_1, with an extra final argument passed as 0. */
809 #define SPEED_ROUTINE_MPN_DIVREM_1C(function) \
810 SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, 0, s->xp, s->size, s->r, 0))
/* The F forms swap the roles: s->size goes in the second argument and the
   source size is 0.
   NOTE(review): presumably the second argument is the fraction-limb count,
   as for SPEED_ROUTINE_MPN_PREINV_DIVREM_1F -- confirm. */
812 #define SPEED_ROUTINE_MPN_DIVREM_1F(function) \
813 SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->size, s->xp, 0, s->r))
815 #define SPEED_ROUTINE_MPN_DIVREM_1CF(function) \
816 SPEED_ROUTINE_MPN_UNARY_1_CALL ((*function) (wp, s->size, s->xp, 0, s->r, 0))
/* Frame for preinv_divrem_1 style routines.  Requires s->r != 0, sets
   `shift' with count_leading_zeros on s->r and `dinv' with invert_limb
   on s->r << shift (outside the timed region), then expands
   SPEED_ROUTINE_MPN_UNARY_1_CALL.  The s->size >= 0 condition here is
   superseded by the s->size >= 1 condition of that frame.  */
819 #define SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL(call) \
824 SPEED_RESTRICT_COND (s->size >= 0); \
825 SPEED_RESTRICT_COND (s->r != 0); \
827 count_leading_zeros (shift, s->r); \
828 invert_limb (dinv, s->r << shift); \
830 SPEED_ROUTINE_MPN_UNARY_1_CALL (call); \
/* (*function) (wp, 0, s->xp, s->size, s->r, dinv, shift): second
   argument 0, s->size source limbs.  */
833 #define SPEED_ROUTINE_MPN_PREINV_DIVREM_1(function) \
834 SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL \
835 ((*function) (wp, 0, s->xp, s->size, s->r, dinv, shift))
837 /* s->size limbs worth of fraction part */
/* Here s->size goes in the second position and the source size is 0.  */
838 #define SPEED_ROUTINE_MPN_PREINV_DIVREM_1F(function) \
839 SPEED_ROUTINE_MPN_PREINV_DIVREM_1_CALL \
840 ((*function) (wp, s->size, s->xp, 0, s->r, dinv, shift))
843 /* s->r is duplicated to form the multiplier, defaulting to
844 MP_BASES_BIG_BASE_10. Not sure if that's particularly useful, but at
845 least it provides some control. */
/* Measures function (wp, s->xp, s->size, yp) where yp holds N copies of
   s->r (or of MP_BASES_BIG_BASE_10 when s->r is 0).  Requires
   s->size >= N; the destination wp has wn = s->size + N-1 limbs.  */
846 #define SPEED_ROUTINE_MPN_UNARY_N(function,N) \
855 SPEED_RESTRICT_COND (s->size >= N); \
858 wn = s->size + N-1; \
859 SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp); \
860 for (i = 0; i < N; i++) \
861 yp[i] = (s->r != 0 ? s->r : MP_BASES_BIG_BASE_10); \
863 speed_operand_src (s, s->xp, s->size); \
864 speed_operand_src (s, yp, (mp_size_t) N); \
865 speed_operand_dst (s, wp, wn); \
866 speed_cache_fill (s); \
868 speed_starttime (); \
871 function (wp, s->xp, s->size, yp); \
873 t = speed_endtime (); \
/* Fixed-N instances of SPEED_ROUTINE_MPN_UNARY_N, for a multiplier of
   2 to 8 limbs.  */
879 #define SPEED_ROUTINE_MPN_UNARY_2(function) \
880 SPEED_ROUTINE_MPN_UNARY_N (function, 2)
881 #define SPEED_ROUTINE_MPN_UNARY_3(function) \
882 SPEED_ROUTINE_MPN_UNARY_N (function, 3)
883 #define SPEED_ROUTINE_MPN_UNARY_4(function) \
884 SPEED_ROUTINE_MPN_UNARY_N (function, 4)
885 #define SPEED_ROUTINE_MPN_UNARY_5(function) \
886 SPEED_ROUTINE_MPN_UNARY_N (function, 5)
887 #define SPEED_ROUTINE_MPN_UNARY_6(function) \
888 SPEED_ROUTINE_MPN_UNARY_N (function, 6)
889 #define SPEED_ROUTINE_MPN_UNARY_7(function) \
890 SPEED_ROUTINE_MPN_UNARY_N (function, 7)
891 #define SPEED_ROUTINE_MPN_UNARY_8(function) \
892 SPEED_ROUTINE_MPN_UNARY_N (function, 8)
895 /* For mpn_mul, mpn_mul_basecase, xsize=r, ysize=s->size. */
/* size1 is s->r, or s->size when s->r is 0.  Requires s->size >= 1 and
   size1 >= s->size.  Measures function (wp, xp, size1, s->yp, s->size)
   with a size1 + s->size limb destination.
   NOTE(review): xp is a fresh size1 limb allocation and nothing here
   copies data into it before it is used as a source - confirm this is
   intended.  */
896 #define SPEED_ROUTINE_MPN_MUL(function) \
904 size1 = (s->r == 0 ? s->size : s->r); \
906 SPEED_RESTRICT_COND (s->size >= 1); \
907 SPEED_RESTRICT_COND (size1 >= s->size); \
910 SPEED_TMP_ALLOC_LIMBS (wp, size1 + s->size, s->align_wp); \
911 SPEED_TMP_ALLOC_LIMBS (xp, size1, s->align_xp); \
913 speed_operand_src (s, xp, size1); \
914 speed_operand_src (s, s->yp, s->size); \
915 speed_operand_dst (s, wp, size1 + s->size); \
916 speed_cache_fill (s); \
918 speed_starttime (); \
921 function (wp, xp, size1, s->yp, s->size); \
923 t = speed_endtime (); \
/* Frame for an N x N multiply.  Requires s->size >= 1, allocates a
   2*s->size limb destination wp, registers s->xp and s->yp (s->size
   limbs each) as sources, and stores the time between speed_starttime
   and speed_endtime in t.  `call' is expected to be the expression
   measured.  */
930 #define SPEED_ROUTINE_MPN_MUL_N_CALL(call) \
937 SPEED_RESTRICT_COND (s->size >= 1); \
940 SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp); \
942 speed_operand_src (s, s->xp, s->size); \
943 speed_operand_src (s, s->yp, s->size); \
944 speed_operand_dst (s, wp, 2*s->size); \
945 speed_cache_fill (s); \
947 speed_starttime (); \
952 t = speed_endtime (); \
/* Measure function (wp, s->xp, s->yp, s->size) in the MUL_N_CALL frame.
   Note that the definition itself ends in a semicolon.  */
958 #define SPEED_ROUTINE_MPN_MUL_N(function) \
959 SPEED_ROUTINE_MPN_MUL_N_CALL (function (wp, s->xp, s->yp, s->size));
/* As SPEED_ROUTINE_MPN_MUL_N_CALL, but the destination wp is only
   s->size limbs.  Requires s->size >= 1.  */
961 #define SPEED_ROUTINE_MPN_MULLO_N_CALL(call) \
968 SPEED_RESTRICT_COND (s->size >= 1); \
971 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
973 speed_operand_src (s, s->xp, s->size); \
974 speed_operand_src (s, s->yp, s->size); \
975 speed_operand_dst (s, wp, s->size); \
976 speed_cache_fill (s); \
978 speed_starttime (); \
983 t = speed_endtime (); \
/* Measure function (wp, s->xp, s->yp, s->size) in the MULLO_N_CALL
   frame.  Note that the definition itself ends in a semicolon.  */
989 #define SPEED_ROUTINE_MPN_MULLO_N(function) \
990 SPEED_ROUTINE_MPN_MULLO_N_CALL (function (wp, s->xp, s->yp, s->size));
992 /* For mpn_mullo_basecase.  Both sources and the destination are s->size
   limbs; s->r is not used.  Requires s->size >= 1 and measures
   function (wp, s->xp, s->yp, s->size). */
993 #define SPEED_ROUTINE_MPN_MULLO_BASECASE(function) \
1000 SPEED_RESTRICT_COND (s->size >= 1); \
1003 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
1005 speed_operand_src (s, s->xp, s->size); \
1006 speed_operand_src (s, s->yp, s->size); \
1007 speed_operand_dst (s, wp, s->size); \
1008 speed_cache_fill (s); \
1010 speed_starttime (); \
1013 function (wp, s->xp, s->yp, s->size); \
1015 t = speed_endtime (); \
/* Frame for mulmod_bnm1 style calls with all three sizes equal to
   s->size.  Requires s->size >= 1; the scratch tp has
   mpn_mulmod_bnm1_itch (s->size, s->size, s->size) limbs and the
   destination wp has 2 * s->size limbs.  Both are registered as
   destination operands.  */
1021 #define SPEED_ROUTINE_MPN_MULMOD_BNM1_CALL(call) \
1029 SPEED_RESTRICT_COND (s->size >= 1); \
1031 itch = mpn_mulmod_bnm1_itch (s->size, s->size, s->size); \
1034 SPEED_TMP_ALLOC_LIMBS (wp, 2 * s->size, s->align_wp); \
1035 SPEED_TMP_ALLOC_LIMBS (tp, itch, s->align_wp2); \
1037 speed_operand_src (s, s->xp, s->size); \
1038 speed_operand_src (s, s->yp, s->size); \
1039 speed_operand_dst (s, wp, 2 * s->size); \
1040 speed_operand_dst (s, tp, itch); \
1041 speed_cache_fill (s); \
1043 speed_starttime (); \
1048 t = speed_endtime (); \
/* Like the BNM1 frame above but the modulus size is rounded:
   size = mpn_mulmod_bnm1_next_size (s->size).  The sources stay at
   s->size limbs, the destination wp has `size' limbs and the scratch has
   mpn_mulmod_bnm1_itch (size, size, size) limbs.  Measures
   function (wp, size, s->xp, s->size, s->yp, s->size, tp).  */
1053 #define SPEED_ROUTINE_MPN_MULMOD_BNM1_ROUNDED(function) \
1058 mp_size_t size, itch; \
1061 SPEED_RESTRICT_COND (s->size >= 1); \
1063 size = mpn_mulmod_bnm1_next_size (s->size); \
1064 itch = mpn_mulmod_bnm1_itch (size, size, size); \
1067 SPEED_TMP_ALLOC_LIMBS (wp, size, s->align_wp); \
1068 SPEED_TMP_ALLOC_LIMBS (tp, itch, s->align_wp2); \
1070 speed_operand_src (s, s->xp, s->size); \
1071 speed_operand_src (s, s->yp, s->size); \
1072 speed_operand_dst (s, wp, size); \
1073 speed_operand_dst (s, tp, itch); \
1074 speed_cache_fill (s); \
1076 speed_starttime (); \
1079 function (wp, size, s->xp, s->size, s->yp, s->size, tp); \
1081 t = speed_endtime (); \
/* Frame for multiplies needing scratch space.  `tsize' is the number of
   scratch limbs (allocated as tspace, aligned per s->align_wp2) and
   `minsize' the smallest s->size accepted.  The destination wp always has
   2*s->size limbs and both sources are registered with s->size limbs.
   `call' may refer to wp, tspace and s.  Note that tsize is expanded
   several times.  */
1087 #define SPEED_ROUTINE_MPN_MUL_N_TSPACE(call, tsize, minsize) \
1089 mp_ptr wp, tspace; \
1094 SPEED_RESTRICT_COND (s->size >= minsize); \
1097 SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp); \
1098 SPEED_TMP_ALLOC_LIMBS (tspace, tsize, s->align_wp2); \
1100 speed_operand_src (s, s->xp, s->size); \
1101 speed_operand_src (s, s->yp, s->size); \
1102 speed_operand_dst (s, wp, 2*s->size); \
1103 speed_operand_dst (s, tspace, tsize); \
1104 speed_cache_fill (s); \
1106 speed_starttime (); \
1111 t = speed_endtime (); \
/* Balanced Toom multiplies: both operands are s->size limbs, scratch
   comes from the matching mpn_toomNN_mul_itch (s->size, s->size), and the
   minimum size is the matching MPN_TOOMNN_MUL_MINSIZE.  */
1117 #define SPEED_ROUTINE_MPN_TOOM22_MUL_N(function) \
1118 SPEED_ROUTINE_MPN_MUL_N_TSPACE \
1119 (function (wp, s->xp, s->size, s->yp, s->size, tspace), \
1120 mpn_toom22_mul_itch (s->size, s->size), \
1121 MPN_TOOM22_MUL_MINSIZE)
1123 #define SPEED_ROUTINE_MPN_TOOM33_MUL_N(function) \
1124 SPEED_ROUTINE_MPN_MUL_N_TSPACE \
1125 (function (wp, s->xp, s->size, s->yp, s->size, tspace), \
1126 mpn_toom33_mul_itch (s->size, s->size), \
1127 MPN_TOOM33_MUL_MINSIZE)
1129 #define SPEED_ROUTINE_MPN_TOOM44_MUL_N(function) \
1130 SPEED_ROUTINE_MPN_MUL_N_TSPACE \
1131 (function (wp, s->xp, s->size, s->yp, s->size, tspace), \
1132 mpn_toom44_mul_itch (s->size, s->size), \
1133 MPN_TOOM44_MUL_MINSIZE)
1135 #define SPEED_ROUTINE_MPN_TOOM6H_MUL_N(function) \
1136 SPEED_ROUTINE_MPN_MUL_N_TSPACE \
1137 (function (wp, s->xp, s->size, s->yp, s->size, tspace), \
1138 mpn_toom6h_mul_itch (s->size, s->size), \
1139 MPN_TOOM6H_MUL_MINSIZE)
1141 #define SPEED_ROUTINE_MPN_TOOM8H_MUL_N(function) \
1142 SPEED_ROUTINE_MPN_MUL_N_TSPACE \
1143 (function (wp, s->xp, s->size, s->yp, s->size, tspace), \
1144 mpn_toom8h_mul_itch (s->size, s->size), \
1145 MPN_TOOM8H_MUL_MINSIZE)
/* Unbalanced Toom multiplies: the first operand is s->size limbs and the
   second is a fixed fraction of it (2/3, 1/2, 3/4 and 1/2 respectively,
   using integer division).  The same second size is given to the itch
   function.  */
1147 #define SPEED_ROUTINE_MPN_TOOM32_MUL(function) \
1148 SPEED_ROUTINE_MPN_MUL_N_TSPACE \
1149 (function (wp, s->xp, s->size, s->yp, 2*s->size/3, tspace), \
1150 mpn_toom32_mul_itch (s->size, 2*s->size/3), \
1151 MPN_TOOM32_MUL_MINSIZE)
1153 #define SPEED_ROUTINE_MPN_TOOM42_MUL(function) \
1154 SPEED_ROUTINE_MPN_MUL_N_TSPACE \
1155 (function (wp, s->xp, s->size, s->yp, s->size/2, tspace), \
1156 mpn_toom42_mul_itch (s->size, s->size/2), \
1157 MPN_TOOM42_MUL_MINSIZE)
1159 #define SPEED_ROUTINE_MPN_TOOM43_MUL(function) \
1160 SPEED_ROUTINE_MPN_MUL_N_TSPACE \
1161 (function (wp, s->xp, s->size, s->yp, s->size*3/4, tspace), \
1162 mpn_toom43_mul_itch (s->size, s->size*3/4), \
1163 MPN_TOOM43_MUL_MINSIZE)
1165 #define SPEED_ROUTINE_MPN_TOOM63_MUL(function) \
1166 SPEED_ROUTINE_MPN_MUL_N_TSPACE \
1167 (function (wp, s->xp, s->size, s->yp, s->size/2, tspace), \
1168 mpn_toom63_mul_itch (s->size, s->size/2), \
1169 MPN_TOOM63_MUL_MINSIZE)
/* Pairs A_FOR_B / B_FOR_A: each pair runs two different Toom variants
   with the same second operand size (17/24, 19/30 or 11/20 of s->size),
   presumably so the two can be compared at that ratio.  In each macro
   the itch function and MINSIZE are those of the variant named first.  */
1171 #define SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM43_MUL(function) \
1172 SPEED_ROUTINE_MPN_MUL_N_TSPACE \
1173 (function (wp, s->xp, s->size, s->yp, 17*s->size/24, tspace), \
1174 mpn_toom32_mul_itch (s->size, 17*s->size/24), \
1175 MPN_TOOM32_MUL_MINSIZE)
1176 #define SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM32_MUL(function) \
1177 SPEED_ROUTINE_MPN_MUL_N_TSPACE \
1178 (function (wp, s->xp, s->size, s->yp, 17*s->size/24, tspace), \
1179 mpn_toom43_mul_itch (s->size, 17*s->size/24), \
1180 MPN_TOOM43_MUL_MINSIZE)
1182 #define SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM53_MUL(function) \
1183 SPEED_ROUTINE_MPN_MUL_N_TSPACE \
1184 (function (wp, s->xp, s->size, s->yp, 19*s->size/30, tspace), \
1185 mpn_toom32_mul_itch (s->size, 19*s->size/30), \
1186 MPN_TOOM32_MUL_MINSIZE)
1187 #define SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM32_MUL(function) \
1188 SPEED_ROUTINE_MPN_MUL_N_TSPACE \
1189 (function (wp, s->xp, s->size, s->yp, 19*s->size/30, tspace), \
1190 mpn_toom53_mul_itch (s->size, 19*s->size/30), \
1191 MPN_TOOM53_MUL_MINSIZE)
1193 #define SPEED_ROUTINE_MPN_TOOM42_FOR_TOOM53_MUL(function) \
1194 SPEED_ROUTINE_MPN_MUL_N_TSPACE \
1195 (function (wp, s->xp, s->size, s->yp, 11*s->size/20, tspace), \
1196 mpn_toom42_mul_itch (s->size, 11*s->size/20), \
1197 MPN_TOOM42_MUL_MINSIZE)
1198 #define SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM42_MUL(function) \
1199 SPEED_ROUTINE_MPN_MUL_N_TSPACE \
1200 (function (wp, s->xp, s->size, s->yp, 11*s->size/20, tspace), \
1201 mpn_toom53_mul_itch (s->size, 11*s->size/20), \
1202 MPN_TOOM53_MUL_MINSIZE)
/* Frame for squaring.  Requires s->size >= 1, allocates a 2*s->size limb
   destination wp, registers s->xp as the only source, and stores the
   time between speed_starttime and speed_endtime in t.  */
1206 #define SPEED_ROUTINE_MPN_SQR_CALL(call) \
1213 SPEED_RESTRICT_COND (s->size >= 1); \
1216 SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp); \
1218 speed_operand_src (s, s->xp, s->size); \
1219 speed_operand_dst (s, wp, 2*s->size); \
1220 speed_cache_fill (s); \
1222 speed_starttime (); \
1227 t = speed_endtime (); \
/* Measure function (wp, s->xp, s->size) in the SQR_CALL frame.  */
1233 #define SPEED_ROUTINE_MPN_SQR(function) \
1234 SPEED_ROUTINE_MPN_SQR_CALL (function (wp, s->xp, s->size))
/* Same frame and argument layout as SPEED_ROUTINE_MPN_SQR.  */
1236 #define SPEED_ROUTINE_MPN_SQR_DIAGONAL(function) \
1237 SPEED_ROUTINE_MPN_SQR (function)
/* Squaring frame with scratch space: `tsize' limbs are allocated as
   tspace (aligned per s->align_wp2) and `minsize' is the smallest s->size
   accepted.  The destination wp has 2*s->size limbs.  `call' may refer to
   wp, tspace and s.  */
1240 #define SPEED_ROUTINE_MPN_SQR_TSPACE(call, tsize, minsize) \
1242 mp_ptr wp, tspace; \
1247 SPEED_RESTRICT_COND (s->size >= minsize); \
1250 SPEED_TMP_ALLOC_LIMBS (wp, 2*s->size, s->align_wp); \
1251 SPEED_TMP_ALLOC_LIMBS (tspace, tsize, s->align_wp2); \
1253 speed_operand_src (s, s->xp, s->size); \
1254 speed_operand_dst (s, wp, 2*s->size); \
1255 speed_operand_dst (s, tspace, tsize); \
1256 speed_cache_fill (s); \
1258 speed_starttime (); \
1263 t = speed_endtime (); \
/* Toom squarings: function (wp, s->xp, s->size, tspace) with scratch
   from the matching mpn_toomN_sqr_itch (s->size) and minimum size
   MPN_TOOMN_SQR_MINSIZE.  */
1269 #define SPEED_ROUTINE_MPN_TOOM2_SQR(function) \
1270 SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace), \
1271 mpn_toom2_sqr_itch (s->size), \
1272 MPN_TOOM2_SQR_MINSIZE)
1274 #define SPEED_ROUTINE_MPN_TOOM3_SQR(function) \
1275 SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace), \
1276 mpn_toom3_sqr_itch (s->size), \
1277 MPN_TOOM3_SQR_MINSIZE)
1280 #define SPEED_ROUTINE_MPN_TOOM4_SQR(function) \
1281 SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace), \
1282 mpn_toom4_sqr_itch (s->size), \
1283 MPN_TOOM4_SQR_MINSIZE)
1285 #define SPEED_ROUTINE_MPN_TOOM6_SQR(function) \
1286 SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace), \
1287 mpn_toom6_sqr_itch (s->size), \
1288 MPN_TOOM6_SQR_MINSIZE)
1290 #define SPEED_ROUTINE_MPN_TOOM8_SQR(function) \
1291 SPEED_ROUTINE_MPN_SQR_TSPACE (function (wp, s->xp, s->size, tspace), \
1292 mpn_toom8_sqr_itch (s->size), \
1293 MPN_TOOM8_SQR_MINSIZE)
/* Frame for routines that only read s->xp and produce no limb vector:
   no destination is allocated.  Accepts s->size >= 0, registers s->xp as
   source, and returns speed_endtime () directly.  */
1295 #define SPEED_ROUTINE_MPN_MOD_CALL(call) \
1299 SPEED_RESTRICT_COND (s->size >= 0); \
1301 speed_operand_src (s, s->xp, s->size); \
1302 speed_cache_fill (s); \
1304 speed_starttime (); \
1310 return speed_endtime (); \
/* Wrappers over SPEED_ROUTINE_MPN_MOD_CALL.  s->r is the limb parameter.
   The "C" variants add a trailing CNST_LIMB(0); MOD_34LSUB1 takes no
   limb parameter at all.  The two MODEXACT definitions end in a
   semicolon.  */
1313 #define SPEED_ROUTINE_MPN_MOD_1(function) \
1314 SPEED_ROUTINE_MPN_MOD_CALL ((*function) (s->xp, s->size, s->r))
1316 #define SPEED_ROUTINE_MPN_MOD_1C(function) \
1317 SPEED_ROUTINE_MPN_MOD_CALL ((*function)(s->xp, s->size, s->r, CNST_LIMB(0)))
1319 #define SPEED_ROUTINE_MPN_MODEXACT_1_ODD(function) \
1320 SPEED_ROUTINE_MPN_MOD_CALL (function (s->xp, s->size, s->r));
1322 #define SPEED_ROUTINE_MPN_MODEXACT_1C_ODD(function) \
1323 SPEED_ROUTINE_MPN_MOD_CALL (function (s->xp, s->size, s->r, CNST_LIMB(0)));
1325 #define SPEED_ROUTINE_MPN_MOD_34LSUB1(function) \
1326 SPEED_ROUTINE_MPN_MOD_CALL ((*function) (s->xp, s->size))
/* Measures (*function) (s->xp, s->size, s->r, inv).  s->r must have its
   high bit set (s->r & GMP_LIMB_HIGHBIT); inv is set by
   invert_limb (inv, s->r) before the timed region.  */
1328 #define SPEED_ROUTINE_MPN_PREINV_MOD_1(function) \
1333 SPEED_RESTRICT_COND (s->size >= 0); \
1334 SPEED_RESTRICT_COND (s->r & GMP_LIMB_HIGHBIT); \
1336 invert_limb (inv, s->r); \
1337 speed_operand_src (s, s->xp, s->size); \
1338 speed_cache_fill (s); \
1340 speed_starttime (); \
1343 (*function) (s->xp, s->size, s->r, inv); \
1346 return speed_endtime (); \
/* Requires s->size >= 2.  mpn_mod_1_1p_cps (inv, s->r) is run once
   before timing.  Inside the timed loop both pfunc (inv, s->r) and
   function (s->xp, s->size, s->r, inv) are executed, so the measurement
   includes the precomputation done by pfunc.  */
1349 #define SPEED_ROUTINE_MPN_MOD_1_1(function,pfunc) \
1354 SPEED_RESTRICT_COND (s->size >= 2); \
1356 mpn_mod_1_1p_cps (inv, s->r); \
1357 speed_operand_src (s, s->xp, s->size); \
1358 speed_cache_fill (s); \
1360 speed_starttime (); \
1363 pfunc (inv, s->r); \
1364 function (s->xp, s->size, s->r, inv); \
1365 } while (--i != 0); \
1367 return speed_endtime (); \
/* As SPEED_ROUTINE_MPN_MOD_1_1 but for an N-at-a-time variant: inv has
   N+3 limbs, s->size >= 1 is enough, and s->r must not exceed
   ~(mp_limb_t)0 / N.  Both pfunc and function run inside the timed
   loop.  */
1369 #define SPEED_ROUTINE_MPN_MOD_1_N(function,pfunc,N) \
1372 mp_limb_t inv[N+3]; \
1374 SPEED_RESTRICT_COND (s->size >= 1); \
1375 SPEED_RESTRICT_COND (s->r <= ~(mp_limb_t)0 / N); \
1377 speed_operand_src (s, s->xp, s->size); \
1378 speed_cache_fill (s); \
1380 speed_starttime (); \
1383 pfunc (inv, s->r); \
1384 function (s->xp, s->size, s->r, inv); \
1385 } while (--i != 0); \
1387 return speed_endtime (); \
1391 /* A division of 2*s->size by s->size limbs */
/* The dividend a is s->xp repeated twice, the divisor d is a copy of
   s->yp with its high bit forced on, and the top dividend limb is set
   to one less than the top divisor limb.  dinv is computed by invert_pi1
   from the two most significant divisor limbs, hence the divisor needs
   at least 2 limbs: with s->size == 1 the d[s->size-2] read would fall
   before the start of d.  `call' may refer to a, d, q, r, dinv and s.  */
1393 #define SPEED_ROUTINE_MPN_DC_DIVREM_CALL(call) \
1396 mp_ptr a, d, q, r; \
1401 SPEED_RESTRICT_COND (s->size >= 2); \
1404 SPEED_TMP_ALLOC_LIMBS (a, 2*s->size, s->align_xp); \
1405 SPEED_TMP_ALLOC_LIMBS (d, s->size, s->align_yp); \
1406 SPEED_TMP_ALLOC_LIMBS (q, s->size+1, s->align_wp); \
1407 SPEED_TMP_ALLOC_LIMBS (r, s->size, s->align_wp2); \
1409 MPN_COPY (a, s->xp, s->size); \
1410 MPN_COPY (a+s->size, s->xp, s->size); \
1412 MPN_COPY (d, s->yp, s->size); \
1414 /* normalize the data */ \
1415 d[s->size-1] |= GMP_NUMB_HIGHBIT; \
1416 a[2*s->size-1] = d[s->size-1] - 1; \
1418 invert_pi1 (dinv, d[s->size-1], d[s->size-2]); \
1420 speed_operand_src (s, a, 2*s->size); \
1421 speed_operand_src (s, d, s->size); \
1422 speed_operand_dst (s, q, s->size+1); \
1423 speed_operand_dst (s, r, s->size); \
1424 speed_cache_fill (s); \
1426 speed_starttime (); \
1431 t = speed_endtime (); \
1438 /* A remainder 2*s->size by s->size limbs */
/* d is built from s->yp.  a is built as (s->xp mod d) shifted left by
   GMP_LIMB_BITS * s->size bits, after which its low s->size limbs are
   overwritten with s->xp.  Measures function (r, a, d) and returns
   speed_endtime () directly.  */
1440 #define SPEED_ROUTINE_MPZ_MOD(function) \
1445 SPEED_RESTRICT_COND (s->size >= 1); \
1447 mpz_init_set_n (d, s->yp, s->size); \
1449 /* high part less than d, low part a duplicate copied in */ \
1450 mpz_init_set_n (a, s->xp, s->size); \
1451 mpz_mod (a, a, d); \
1452 mpz_mul_2exp (a, a, GMP_LIMB_BITS * s->size); \
1453 MPN_COPY (PTR(a), s->xp, s->size); \
1457 speed_operand_src (s, PTR(a), SIZ(a)); \
1458 speed_operand_src (s, PTR(d), SIZ(d)); \
1459 speed_cache_fill (s); \
1461 speed_starttime (); \
1464 function (r, a, d); \
1466 return speed_endtime (); \
/* Division with a precomputed inverse.  The dividend has size1 limbs
   (s->r, or 2*s->size when s->r is 0) and the divisor s->size limbs.
   DMIN is the smallest divisor size accepted and QMIN the smallest
   quotient size (size1 - s->size).  INV is the expression passed as the
   last argument of function; the local `inv' is set by invert_pi1 from
   the two high divisor limbs, so DMIN needs to be at least 2 for the
   dp[s->size-2] read to be in range.  The dividend is copied from ap to
   tp inside the timed loop on every iteration.  */
1469 #define SPEED_ROUTINE_MPN_PI1_DIV(function, INV, DMIN, QMIN) \
1472 mp_ptr dp, tp, ap, qp; \
1478 size1 = (s->r == 0 ? 2 * s->size : s->r); \
1480 SPEED_RESTRICT_COND (s->size >= DMIN); \
1481 SPEED_RESTRICT_COND (size1 - s->size >= QMIN); \
1484 SPEED_TMP_ALLOC_LIMBS (ap, size1, s->align_xp); \
1485 SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \
1486 SPEED_TMP_ALLOC_LIMBS (qp, size1 - s->size, s->align_wp); \
1487 SPEED_TMP_ALLOC_LIMBS (tp, size1, s->align_wp2); \
1489 /* we don't fill in dividend completely when size1 > s->size */ \
1490 MPN_COPY (ap, s->xp, s->size); \
1491 MPN_COPY (ap + size1 - s->size, s->xp, s->size); \
1493 MPN_COPY (dp, s->yp, s->size); \
1495 /* normalize the data */ \
1496 dp[s->size-1] |= GMP_NUMB_HIGHBIT; \
1497 ap[size1 - 1] = dp[s->size - 1] - 1; \
1499 invert_pi1 (inv, dp[s->size-1], dp[s->size-2]); \
1501 speed_operand_src (s, ap, size1); \
1502 speed_operand_dst (s, tp, size1); \
1503 speed_operand_src (s, dp, s->size); \
1504 speed_operand_dst (s, qp, size1 - s->size); \
1505 speed_cache_fill (s); \
1507 speed_starttime (); \
1510 MPN_COPY (tp, ap, size1); \
1511 function (qp, tp, size1, dp, s->size, INV); \
1512 } while (--i != 0); \
1513 t = speed_endtime (); \
1518 #define SPEED_ROUTINE_MPN_MU_DIV_Q(function,itchfn) \
1521 mp_ptr dp, tp, qp, scratch; \
1526 SPEED_RESTRICT_COND (s->size >= 2); \
1528 itch = itchfn (2 * s->size, s->size, 0); \
1530 SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \
1531 SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp); \
1532 SPEED_TMP_ALLOC_LIMBS (tp, 2 * s->size, s->align_xp); \
1533 SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2); \
1535 MPN_COPY (tp, s->xp, s->size); \
1536 MPN_COPY (tp+s->size, s->xp, s->size); \
1538 /* normalize the data */ \
1539 dp[s->size-1] |= GMP_NUMB_HIGHBIT; \
1540 tp[2*s->size-1] = dp[s->size-1] - 1; \
1542 speed_operand_dst (s, qp, s->size); \
1543 speed_operand_src (s, tp, 2 * s->size); \
1544 speed_operand_src (s, dp, s->size); \
1545 speed_operand_dst (s, scratch, itch); \
1546 speed_cache_fill (s); \
1548 speed_starttime (); \
1551 function (qp, tp, 2 * s->size, dp, s->size, scratch); \
1552 } while (--i != 0); \
1553 t = speed_endtime (); \
/* Quotient and remainder division.  The dividend has size1 limbs (s->r,
   or 2*s->size when s->r is 0) and the divisor s->size limbs.  Requires
   s->size >= 2 and size1 >= s->size.  Scratch has
   itchfn (size1, s->size, 0) limbs.  scratch and rp are both aligned per
   s->align_wp2.  Measures
   function (qp, rp, tp, size1, dp, s->size, scratch).  */
1558 #define SPEED_ROUTINE_MPN_MU_DIV_QR(function,itchfn) \
1561 mp_ptr dp, tp, qp, rp, scratch; \
1563 mp_size_t size1, itch; \
1566 size1 = (s->r == 0 ? 2 * s->size : s->r); \
1568 SPEED_RESTRICT_COND (s->size >= 2); \
1569 SPEED_RESTRICT_COND (size1 >= s->size); \
1571 itch = itchfn (size1, s->size, 0); \
1573 SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \
1574 SPEED_TMP_ALLOC_LIMBS (qp, size1 - s->size, s->align_wp); \
1575 SPEED_TMP_ALLOC_LIMBS (tp, size1, s->align_xp); \
1576 SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2); \
1577 SPEED_TMP_ALLOC_LIMBS (rp, s->size, s->align_wp2); /* alignment? */ \
1579 /* we don't fill in dividend completely when size1 > s->size */ \
1580 MPN_COPY (tp, s->xp, s->size); \
1581 MPN_COPY (tp + size1 - s->size, s->xp, s->size); \
1583 MPN_COPY (dp, s->yp, s->size); \
1585 /* normalize the data */ \
1586 dp[s->size-1] |= GMP_NUMB_HIGHBIT; \
1587 tp[size1 - 1] = dp[s->size - 1] - 1; \
1589 speed_operand_dst (s, qp, size1 - s->size); \
1590 speed_operand_dst (s, rp, s->size); \
1591 speed_operand_src (s, tp, size1); \
1592 speed_operand_src (s, dp, s->size); \
1593 speed_operand_dst (s, scratch, itch); \
1594 speed_cache_fill (s); \
1596 speed_starttime (); \
1599 function (qp, rp, tp, size1, dp, s->size, scratch); \
1600 } while (--i != 0); \
1601 t = speed_endtime (); \
/* As SPEED_ROUTINE_MPN_MU_DIV_QR, but with a precomputed s->size limb
   inverse ip obtained from mpn_invert (ip, dp, s->size, NULL) before the
   timed region.  Measures
   function (qp, rp, tp, size1, dp, s->size, ip, s->size, scratch).  */
1606 #define SPEED_ROUTINE_MPN_MUPI_DIV_QR(function,itchfn) \
1609 mp_ptr dp, tp, qp, rp, ip, scratch; \
1611 mp_size_t size1, itch; \
1614 size1 = (s->r == 0 ? 2 * s->size : s->r); \
1616 SPEED_RESTRICT_COND (s->size >= 2); \
1617 SPEED_RESTRICT_COND (size1 >= s->size); \
1619 itch = itchfn (size1, s->size, 0); \
1621 SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \
1622 SPEED_TMP_ALLOC_LIMBS (qp, size1 - s->size, s->align_wp); \
1623 SPEED_TMP_ALLOC_LIMBS (tp, size1, s->align_xp); \
1624 SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2); \
1625 SPEED_TMP_ALLOC_LIMBS (rp, s->size, s->align_wp2); /* alignment? */ \
1626 SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_wp2); /* alignment? */ \
1628 /* we don't fill in dividend completely when size1 > s->size */ \
1629 MPN_COPY (tp, s->xp, s->size); \
1630 MPN_COPY (tp + size1 - s->size, s->xp, s->size); \
1632 MPN_COPY (dp, s->yp, s->size); \
1634 /* normalize the data */ \
1635 dp[s->size-1] |= GMP_NUMB_HIGHBIT; \
1636 tp[size1 - 1] = dp[s->size-1] - 1; \
1638 mpn_invert (ip, dp, s->size, NULL); \
1640 speed_operand_dst (s, qp, size1 - s->size); \
1641 speed_operand_dst (s, rp, s->size); \
1642 speed_operand_src (s, tp, size1); \
1643 speed_operand_src (s, dp, s->size); \
1644 speed_operand_src (s, ip, s->size); \
1645 speed_operand_dst (s, scratch, itch); \
1646 speed_cache_fill (s); \
1648 speed_starttime (); \
1651 function (qp, rp, tp, size1, dp, s->size, ip, s->size, scratch); \
1652 } while (--i != 0); \
1653 t = speed_endtime (); \
/* bdiv_qr with a one-limb precomputed inverse.  The dividend ap is s->xp
   repeated twice (2*s->size limbs), the divisor dp is a copy of s->yp,
   and inv is set by binvert_limb (inv, dp[0]).  The dividend is copied
   to tp inside the timed loop before each
   function (qp, tp, 2*s->size, dp, s->size, inv).  */
1659 #define SPEED_ROUTINE_MPN_PI1_BDIV_QR(function) \
1662 mp_ptr dp, tp, ap, qp; \
1667 SPEED_RESTRICT_COND (s->size >= 1); \
1670 SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size, s->align_xp); \
1671 SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \
1672 SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp); \
1673 SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size, s->align_wp2); \
1675 MPN_COPY (ap, s->xp, s->size); \
1676 MPN_COPY (ap+s->size, s->xp, s->size); \
1678 /* divisor must be odd */ \
1679 MPN_COPY (dp, s->yp, s->size); \
1681 binvert_limb (inv, dp[0]); \
1684 speed_operand_src (s, ap, 2*s->size); \
1685 speed_operand_dst (s, tp, 2*s->size); \
1686 speed_operand_src (s, dp, s->size); \
1687 speed_operand_dst (s, qp, s->size); \
1688 speed_cache_fill (s); \
1690 speed_starttime (); \
1693 MPN_COPY (tp, ap, 2*s->size); \
1694 function (qp, tp, 2*s->size, dp, s->size, inv); \
1695 } while (--i != 0); \
1696 t = speed_endtime (); \
/* Quotient-only counterpart of SPEED_ROUTINE_MPN_PI1_BDIV_QR with an
   s->size limb dividend: s->xp is copied to tp inside the timed loop
   before each function (qp, tp, s->size, dp, s->size, inv).  */
1701 #define SPEED_ROUTINE_MPN_PI1_BDIV_Q(function) \
1704 mp_ptr dp, tp, qp; \
1709 SPEED_RESTRICT_COND (s->size >= 1); \
1712 SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \
1713 SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp); \
1714 SPEED_TMP_ALLOC_LIMBS (tp, s->size, s->align_wp2); \
1716 /* divisor must be odd */ \
1717 MPN_COPY (dp, s->yp, s->size); \
1719 binvert_limb (inv, dp[0]); \
1722 speed_operand_src (s, s->xp, s->size); \
1723 speed_operand_dst (s, tp, s->size); \
1724 speed_operand_src (s, dp, s->size); \
1725 speed_operand_dst (s, qp, s->size); \
1726 speed_cache_fill (s); \
1728 speed_starttime (); \
1731 MPN_COPY (tp, s->xp, s->size); \
1732 function (qp, tp, s->size, dp, s->size, inv); \
1733 } while (--i != 0); \
1734 t = speed_endtime (); \
/* bdiv_q with scratch space.  Requires s->size >= 2; the scratch has
   itchfn (s->size, s->size) limbs.  The dividend is s->xp itself (no
   copy is made) and the divisor dp is a copy of s->yp.  Measures
   function (qp, s->xp, s->size, dp, s->size, scratch).  */
1739 #define SPEED_ROUTINE_MPN_MU_BDIV_Q(function,itchfn) \
1742 mp_ptr dp, qp, scratch; \
1747 SPEED_RESTRICT_COND (s->size >= 2); \
1749 itch = itchfn (s->size, s->size); \
1751 SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \
1752 SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp); \
1753 SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2); \
1755 /* divisor must be odd */ \
1756 MPN_COPY (dp, s->yp, s->size); \
1759 speed_operand_dst (s, qp, s->size); \
1760 speed_operand_src (s, s->xp, s->size); \
1761 speed_operand_src (s, dp, s->size); \
1762 speed_operand_dst (s, scratch, itch); \
1763 speed_cache_fill (s); \
1765 speed_starttime (); \
1768 function (qp, s->xp, s->size, dp, s->size, scratch); \
1769 } while (--i != 0); \
1770 t = speed_endtime (); \
/* bdiv_qr with scratch space: 2*s->size limb dividend tp (s->xp
   repeated twice), s->size limb divisor dp copied from s->yp, scratch of
   itchfn (2 * s->size, s->size) limbs.  Requires s->size >= 2.  Measures
   function (qp, rp, tp, 2 * s->size, dp, s->size, scratch).  */
1775 #define SPEED_ROUTINE_MPN_MU_BDIV_QR(function,itchfn) \
1778 mp_ptr dp, tp, qp, rp, scratch; \
1783 SPEED_RESTRICT_COND (s->size >= 2); \
1785 itch = itchfn (2 * s->size, s->size); \
1787 SPEED_TMP_ALLOC_LIMBS (dp, s->size, s->align_yp); \
1788 SPEED_TMP_ALLOC_LIMBS (qp, s->size, s->align_wp); \
1789 SPEED_TMP_ALLOC_LIMBS (tp, 2 * s->size, s->align_xp); \
1790 SPEED_TMP_ALLOC_LIMBS (scratch, itch, s->align_wp2); \
1791 SPEED_TMP_ALLOC_LIMBS (rp, s->size, s->align_wp2); /* alignment? */ \
1793 MPN_COPY (tp, s->xp, s->size); \
1794 MPN_COPY (tp+s->size, s->xp, s->size); \
1796 /* divisor must be odd */ \
1797 MPN_COPY (dp, s->yp, s->size); \
1800 speed_operand_dst (s, qp, s->size); \
1801 speed_operand_dst (s, rp, s->size); \
1802 speed_operand_src (s, tp, 2 * s->size); \
1803 speed_operand_src (s, dp, s->size); \
1804 speed_operand_dst (s, scratch, itch); \
1805 speed_cache_fill (s); \
1807 speed_starttime (); \
1810 function (qp, rp, tp, 2 * s->size, dp, s->size, scratch); \
1811 } while (--i != 0); \
1812 t = speed_endtime (); \
/* Measures function (ip, up, s->size, tp), where up is a copy of s->xp
   with its high bit forced on and tp is scratch of itchfn (s->size)
   limbs.  Requires s->size >= 1.  Note that tp is registered as a
   destination operand of s->size limbs, not itchfn (s->size).  */
1818 #define SPEED_ROUTINE_MPN_INVERT(function,itchfn) \
1821 mp_ptr up, tp, ip; \
1825 SPEED_RESTRICT_COND (s->size >= 1); \
1828 SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp); \
1829 SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp); \
1830 SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp); \
1832 MPN_COPY (up, s->xp, s->size); \
1834 /* normalize the data */ \
1835 up[s->size-1] |= GMP_NUMB_HIGHBIT; \
1837 speed_operand_src (s, up, s->size); \
1838 speed_operand_dst (s, tp, s->size); \
1839 speed_operand_dst (s, ip, s->size); \
1840 speed_cache_fill (s); \
1842 speed_starttime (); \
1845 function (ip, up, s->size, tp); \
1847 t = speed_endtime (); \
/* Same setup and call shape as SPEED_ROUTINE_MPN_INVERT:
   function (ip, up, s->size, tp) on a copy of s->xp with the high bit
   forced on, scratch of itchfn (s->size) limbs, s->size >= 1.  */
1853 #define SPEED_ROUTINE_MPN_INVERTAPPR(function,itchfn) \
1856 mp_ptr up, tp, ip; \
1860 SPEED_RESTRICT_COND (s->size >= 1); \
1863 SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp); \
1864 SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp); \
1865 SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp); \
1867 MPN_COPY (up, s->xp, s->size); \
1869 /* normalize the data */ \
1870 up[s->size-1] |= GMP_NUMB_HIGHBIT; \
1872 speed_operand_src (s, up, s->size); \
1873 speed_operand_dst (s, tp, s->size); \
1874 speed_operand_dst (s, ip, s->size); \
1875 speed_cache_fill (s); \
1877 speed_starttime (); \
1880 function (ip, up, s->size, tp); \
1882 t = speed_endtime (); \
/* As SPEED_ROUTINE_MPN_INVERTAPPR, except that sizes below 3 limbs are
   rejected (s->size >= 3).  */
1888 #define SPEED_ROUTINE_MPN_NI_INVERTAPPR(function,itchfn) \
1891 mp_ptr up, tp, ip; \
1895 SPEED_RESTRICT_COND (s->size >= 3); \
1898 SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp); \
1899 SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp); \
1900 SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp); \
1902 MPN_COPY (up, s->xp, s->size); \
1904 /* normalize the data */ \
1905 up[s->size-1] |= GMP_NUMB_HIGHBIT; \
1907 speed_operand_src (s, up, s->size); \
1908 speed_operand_dst (s, tp, s->size); \
1909 speed_operand_dst (s, ip, s->size); \
1910 speed_cache_fill (s); \
1912 speed_starttime (); \
1915 function (ip, up, s->size, tp); \
1917 t = speed_endtime (); \
/* Measures function (ip, up, s->size, tp) with up a copy of s->xp and tp
   scratch of itchfn (s->size) limbs.  Requires s->size >= 1.  Unlike the
   INVERT macros, no high-bit normalisation of up appears here.  */
1923 #define SPEED_ROUTINE_MPN_BINVERT(function,itchfn) \
1926 mp_ptr up, tp, ip; \
1930 SPEED_RESTRICT_COND (s->size >= 1); \
1933 SPEED_TMP_ALLOC_LIMBS (ip, s->size, s->align_xp); \
1934 SPEED_TMP_ALLOC_LIMBS (up, s->size, s->align_yp); \
1935 SPEED_TMP_ALLOC_LIMBS (tp, itchfn (s->size), s->align_wp); \
1937 MPN_COPY (up, s->xp, s->size); \
1939 /* normalize the data */ \
1942 speed_operand_src (s, up, s->size); \
1943 speed_operand_dst (s, tp, s->size); \
1944 speed_operand_dst (s, ip, s->size); \
1945 speed_cache_fill (s); \
1947 speed_starttime (); \
1950 function (ip, up, s->size, tp); \
1952 t = speed_endtime (); \
/* Measures function (cp, tp, mp, s->size, inv).  ap and tp are allocated
   and registered with 2*s->size+1 limbs, but only 2*s->size limbs are
   filled (s->xp twice) and copied into tp on each timed iteration.  mp is
   a copy of s->yp and inv is set by binvert_limb (inv, mp[0]).  */
1958 #define SPEED_ROUTINE_REDC_1(function) \
1961 mp_ptr cp, mp, tp, ap; \
1966 SPEED_RESTRICT_COND (s->size >= 1); \
1969 SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp); \
1970 SPEED_TMP_ALLOC_LIMBS (mp, s->size, s->align_yp); \
1971 SPEED_TMP_ALLOC_LIMBS (cp, s->size, s->align_wp); \
1972 SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, s->align_wp2); \
1974 MPN_COPY (ap, s->xp, s->size); \
1975 MPN_COPY (ap+s->size, s->xp, s->size); \
1977 /* modulus must be odd */ \
1978 MPN_COPY (mp, s->yp, s->size); \
1980 binvert_limb (inv, mp[0]); \
1983 speed_operand_src (s, ap, 2*s->size+1); \
1984 speed_operand_dst (s, tp, 2*s->size+1); \
1985 speed_operand_src (s, mp, s->size); \
1986 speed_operand_dst (s, cp, s->size); \
1987 speed_cache_fill (s); \
1989 speed_starttime (); \
1992 MPN_COPY (tp, ap, 2*s->size); \
1993 function (cp, tp, mp, s->size, inv); \
1994 } while (--i != 0); \
1995 t = speed_endtime (); \
/* As SPEED_ROUTINE_REDC_1 but with a two-limb inverse invp, obtained
   from mpn_binvert (invp, mp, 2, tp) and then adjusted limb by limb
   (invp[0] negated, invp[1] complemented).  tp doubles as scratch for
   that mpn_binvert call.
   NOTE(review): mpn_binvert is asked for 2 limbs of mp while only
   s->size >= 1 is required - confirm whether size 1 should be
   rejected.  */
2000 #define SPEED_ROUTINE_REDC_2(function) \
2003 mp_ptr cp, mp, tp, ap; \
2004 mp_limb_t invp[2]; \
2008 SPEED_RESTRICT_COND (s->size >= 1); \
2011 SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp); \
2012 SPEED_TMP_ALLOC_LIMBS (mp, s->size, s->align_yp); \
2013 SPEED_TMP_ALLOC_LIMBS (cp, s->size, s->align_wp); \
2014 SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, s->align_wp2); \
2016 MPN_COPY (ap, s->xp, s->size); \
2017 MPN_COPY (ap+s->size, s->xp, s->size); \
2019 /* modulus must be odd */ \
2020 MPN_COPY (mp, s->yp, s->size); \
2022 mpn_binvert (invp, mp, 2, tp); \
2023 invp[0] = -invp[0]; invp[1] = ~invp[1]; \
2025 speed_operand_src (s, ap, 2*s->size+1); \
2026 speed_operand_dst (s, tp, 2*s->size+1); \
2027 speed_operand_src (s, mp, s->size); \
2028 speed_operand_dst (s, cp, s->size); \
2029 speed_cache_fill (s); \
2031 speed_starttime (); \
2034 MPN_COPY (tp, ap, 2*s->size); \
2035 function (cp, tp, mp, s->size, invp); \
2036 } while (--i != 0); \
2037 t = speed_endtime (); \
/* As SPEED_ROUTINE_REDC_1 but with a full s->size limb inverse invp from
   mpn_binvert (invp, mp, s->size, tp).  Only sizes above 8 limbs are
   accepted (s->size > 8).  invp is not registered as an operand.  */
2042 #define SPEED_ROUTINE_REDC_N(function) \
2045 mp_ptr cp, mp, tp, ap, invp; \
2049 SPEED_RESTRICT_COND (s->size > 8); \
2052 SPEED_TMP_ALLOC_LIMBS (ap, 2*s->size+1, s->align_xp); \
2053 SPEED_TMP_ALLOC_LIMBS (mp, s->size, s->align_yp); \
2054 SPEED_TMP_ALLOC_LIMBS (cp, s->size, s->align_wp); \
2055 SPEED_TMP_ALLOC_LIMBS (tp, 2*s->size+1, s->align_wp2); \
2056 SPEED_TMP_ALLOC_LIMBS (invp, s->size, s->align_wp2); /* align? */ \
2058 MPN_COPY (ap, s->xp, s->size); \
2059 MPN_COPY (ap+s->size, s->xp, s->size); \
2061 /* modulus must be odd */ \
2062 MPN_COPY (mp, s->yp, s->size); \
2064 mpn_binvert (invp, mp, s->size, tp); \
2066 speed_operand_src (s, ap, 2*s->size+1); \
2067 speed_operand_dst (s, tp, 2*s->size+1); \
2068 speed_operand_src (s, mp, s->size); \
2069 speed_operand_dst (s, cp, s->size); \
2070 speed_cache_fill (s); \
2072 speed_starttime (); \
2075 MPN_COPY (tp, ap, 2*s->size); \
2076 function (cp, tp, mp, s->size, invp); \
2077 } while (--i != 0); \
2078 t = speed_endtime (); \
/* Measures function (s->xp, s->size) on a single source operand.
   Requires s->size >= 1; returns speed_endtime () directly.  */
2085 #define SPEED_ROUTINE_MPN_POPCOUNT(function) \
2089 SPEED_RESTRICT_COND (s->size >= 1); \
2091 speed_operand_src (s, s->xp, s->size); \
2092 speed_cache_fill (s); \
2094 speed_starttime (); \
2097 function (s->xp, s->size); \
2100 return speed_endtime (); \
/* Measures function (s->xp, s->yp, s->size) on two source operands.
   Requires s->size >= 1; returns speed_endtime () directly.  */
2103 #define SPEED_ROUTINE_MPN_HAMDIST(function) \
2107 SPEED_RESTRICT_COND (s->size >= 1); \
2109 speed_operand_src (s, s->xp, s->size); \
2110 speed_operand_src (s, s->yp, s->size); \
2111 speed_cache_fill (s); \
2113 speed_starttime (); \
2116 function (s->xp, s->yp, s->size); \
2119 return speed_endtime (); \
/* Measures function (z, s->size): here s->size is the integer argument
   of the function rather than a limb count.  No operands are registered
   and the cache is not primed.  */
2123 #define SPEED_ROUTINE_MPZ_UI(function) \
2129 SPEED_RESTRICT_COND (s->size >= 0); \
2133 speed_starttime (); \
2136 function (z, s->size); \
2138 t = speed_endtime (); \
/* Aliases of SPEED_ROUTINE_MPZ_UI for one-result "ui" functions.  */
2144 #define SPEED_ROUTINE_MPZ_FAC_UI(function) SPEED_ROUTINE_MPZ_UI(function)
2145 #define SPEED_ROUTINE_MPZ_FIB_UI(function) SPEED_ROUTINE_MPZ_UI(function)
2146 #define SPEED_ROUTINE_MPZ_LUCNUM_UI(function) SPEED_ROUTINE_MPZ_UI(function)
/* As SPEED_ROUTINE_MPZ_UI, for functions with two results:
   function (z, z2, s->size).  */
2149 #define SPEED_ROUTINE_MPZ_2_UI(function) \
2155 SPEED_RESTRICT_COND (s->size >= 0); \
2160 speed_starttime (); \
2163 function (z, z2, s->size); \
2165 t = speed_endtime (); \
/* Aliases of SPEED_ROUTINE_MPZ_2_UI for two-result "ui" functions.  */
2172 #define SPEED_ROUTINE_MPZ_FIB2_UI(function) SPEED_ROUTINE_MPZ_2_UI(function)
2173 #define SPEED_ROUTINE_MPZ_LUCNUM2_UI(function) SPEED_ROUTINE_MPZ_2_UI(function)
/* Measures function (fp, f1p, s->size), with s->size as the integer
   argument.  Both result areas have MPN_FIB2_SIZE (s->size) limbs.  */
2176 #define SPEED_ROUTINE_MPN_FIB2_UI(function) \
2184 SPEED_RESTRICT_COND (s->size >= 0); \
2187 alloc = MPN_FIB2_SIZE (s->size); \
2188 SPEED_TMP_ALLOC_LIMBS (fp, alloc, s->align_xp); \
2189 SPEED_TMP_ALLOC_LIMBS (f1p, alloc, s->align_yp); \
2191 speed_starttime (); \
2194 function (fp, f1p, s->size); \
2196 t = speed_endtime (); \
2204 /* Calculate b^e mod m for random b and m of s->size limbs and random e of 6
2205 limbs. m is forced to odd so that redc can be used. e is limited in
2206 size so the calculation doesn't take too long. */
/* b comes from s->xp, m from s->yp (bit 0 set) and e from the first 6
   limbs of s->xp_block.  Requires s->size >= 1 and measures
   function (r, b, e, m).  */
2207 #define SPEED_ROUTINE_MPZ_POWM(function) \
2213 SPEED_RESTRICT_COND (s->size >= 1); \
2216 mpz_init_set_n (b, s->xp, s->size); \
2217 mpz_init_set_n (m, s->yp, s->size); \
2218 mpz_setbit (m, 0); /* force m to odd */ \
2219 mpz_init_set_n (e, s->xp_block, 6); \
2221 speed_starttime (); \
2224 function (r, b, e, m); \
2226 t = speed_endtime (); \
2235 /* (m-2)^e mod m, where e is set to (~ (unsigned long) 0) / 3, i.e. the
   bit pattern 0x55...55.  m is taken from s->xp (s->size limbs) and the
   base b is m - 2.  Requires s->size >= 1 and measures
   function (r, b, e, m). */
2236 #define SPEED_ROUTINE_MPZ_POWM_UI(function) \
2243 SPEED_RESTRICT_COND (s->size >= 1); \
2247 /* force m to odd */ \
2249 mpz_set_n (m, s->xp, s->size); \
2252 e = (~ (unsigned long) 0) / 3; \
2256 mpz_init_set (b, m); \
2257 mpz_sub_ui (b, b, 2); \
2258 /* printf ("%X\n", mpz_get_ui(m)); */ \
2260 speed_starttime (); \
2262 function (r, b, e, m); \
2264 t = speed_endtime (); \
/* Shared body for routines that read two sources (xp, yp) and write two
   destinations (wp, wp2), each of s->size limbs.  s->r selects how the
   sources overlap the destinations, per the branches visible below:
     r == 1   xp is wp
     r == 2   yp is wp2
     r == 3   xp is wp and yp is wp2
     r == 4   xp is wp2 and yp is wp
   Whenever a source pointer differs from s->xp / s->yp the original data is
   copied into it first.  All four operands are registered before
   speed_cache_fill().  The measurement is restricted to s->size >= 0.
   NOTE(review): the first branch of the if-chain, the fallback branch and
   the place where `call' is expanded are not visible in this listing.  */
2273 #define SPEED_ROUTINE_MPN_ADDSUB_CALL(call) \
2275 mp_ptr wp, wp2, xp, yp; \
2280 SPEED_RESTRICT_COND (s->size >= 0); \
2283 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
2284 SPEED_TMP_ALLOC_LIMBS (wp2, s->size, s->align_wp2); \
2289 else if (s->r == 1) { xp = wp; } \
2290 else if (s->r == 2) { yp = wp2; } \
2291 else if (s->r == 3) { xp = wp; yp = wp2; } \
2292 else if (s->r == 4) { xp = wp2; yp = wp; } \
2297 if (xp != s->xp) MPN_COPY (xp, s->xp, s->size); \
2298 if (yp != s->yp) MPN_COPY (yp, s->yp, s->size); \
2300 speed_operand_src (s, xp, s->size); \
2301 speed_operand_src (s, yp, s->size); \
2302 speed_operand_dst (s, wp, s->size); \
2303 speed_operand_dst (s, wp2, s->size); \
2304 speed_cache_fill (s); \
2306 speed_starttime (); \
2311 t = speed_endtime (); \
/* Instantiate SPEED_ROUTINE_MPN_ADDSUB_CALL with the call
   `function (wp, wp2, xp, yp, s->size)'.  Note the definition itself ends
   with a semicolon.  */
2317 #define SPEED_ROUTINE_MPN_ADDSUB_N(function) \
2318 SPEED_ROUTINE_MPN_ADDSUB_CALL \
2319 (function (wp, wp2, xp, yp, s->size));
/* Instantiate SPEED_ROUTINE_MPN_ADDSUB_CALL with the call
   `function (wp, wp2, xp, yp, s->size, 0)', i.e. the same as the _N form
   with an extra trailing argument fixed at 0.  Note the definition itself
   ends with a semicolon.  */
2321 #define SPEED_ROUTINE_MPN_ADDSUB_NC(function) \
2322 SPEED_ROUTINE_MPN_ADDSUB_CALL \
2323 (function (wp, wp2, xp, yp, s->size, 0));
2326 /* Doing an Nx1 gcd with the given r. */
/* Measure `function (xp, s->size, s->r)'.  xp is a private copy of the
   s->size limbs at s->xp; the result of refmpn_zero_p (xp, s->size) is
   OR-ed into xp[0] (presumably to keep the operand non-zero -- verify
   against refmpn_zero_p).  The measurement is restricted to s->size >= 1
   and s->r != 0.
   NOTE(review): the source operand registered for the cache fill is s->xp,
   whereas the timed call reads the copy xp -- confirm this is intended.  */
2327 #define SPEED_ROUTINE_MPN_GCD_1N(function) \
2334 SPEED_RESTRICT_COND (s->size >= 1); \
2335 SPEED_RESTRICT_COND (s->r != 0); \
2338 SPEED_TMP_ALLOC_LIMBS (xp, s->size, s->align_xp); \
2339 MPN_COPY (xp, s->xp, s->size); \
2340 xp[0] |= refmpn_zero_p (xp, s->size); \
2342 speed_operand_src (s, s->xp, s->size); \
2343 speed_cache_fill (s); \
2345 speed_starttime (); \
2348 function (xp, s->size, s->r); \
2350 t = speed_endtime (); \
2357 /* SPEED_BLOCK_SIZE many one-limb GCDs, with operands of s->size bits each. */
/* Shared body for measurements over single-limb operand pairs.  Here
   s->size is a bit count: it must lie in 1 .. mp_bits_per_limb.  px and py
   are SPEED_BLOCK_SIZE-limb copies of s->xp_block and s->yp_block.  Every
   px[i] is masked to its low s->size bits and every py[i] to its low s->r
   bits (or s->size bits when s->r is 0); a value that becomes 0 is bumped
   to 1.  s->time_divisor is set to SPEED_BLOCK_SIZE.
   NOTE(review): the points where `setup' and `call' are expanded, and the
   loop driven by j, are not visible in this listing.  */
2359 #define SPEED_ROUTINE_MPN_GCD_1_CALL(setup, call) \
2363 mp_limb_t x_mask, y_mask; \
2367 SPEED_RESTRICT_COND (s->size >= 1); \
2368 SPEED_RESTRICT_COND (s->size <= mp_bits_per_limb); \
2371 SPEED_TMP_ALLOC_LIMBS (px, SPEED_BLOCK_SIZE, s->align_xp); \
2372 SPEED_TMP_ALLOC_LIMBS (py, SPEED_BLOCK_SIZE, s->align_yp); \
2373 MPN_COPY (px, s->xp_block, SPEED_BLOCK_SIZE); \
2374 MPN_COPY (py, s->yp_block, SPEED_BLOCK_SIZE); \
2376 x_mask = MP_LIMB_T_LOWBITMASK (s->size); \
2377 y_mask = MP_LIMB_T_LOWBITMASK (s->r != 0 ? s->r : s->size); \
2378 for (i = 0; i < SPEED_BLOCK_SIZE; i++) \
2380 px[i] &= x_mask; px[i] += (px[i] == 0); \
2381 py[i] &= y_mask; py[i] += (py[i] == 0); \
2385 speed_operand_src (s, px, SPEED_BLOCK_SIZE); \
2386 speed_operand_src (s, py, SPEED_BLOCK_SIZE); \
2387 speed_cache_fill (s); \
2389 speed_starttime (); \
2393 j = SPEED_BLOCK_SIZE; \
2401 t = speed_endtime (); \
2405 s->time_divisor = SPEED_BLOCK_SIZE; \
/* Instantiate SPEED_ROUTINE_MPN_GCD_1_CALL with an empty setup argument and
   the call `function (&px[j-1], 1, py[j-1])': a one-limb operand at
   &px[j-1] against the single limb py[j-1].  */
2409 #define SPEED_ROUTINE_MPN_GCD_1(function) \
2410 SPEED_ROUTINE_MPN_GCD_1_CALL( , function (&px[j-1], 1, py[j-1]))
/* Instantiate SPEED_ROUTINE_MPN_GCD_1_CALL with a setup fragment and the
   call `function (px[j-1], py[j-1], 0)'.  The setup's stated requirement is
   x<y, y odd, y!=1; the part visible here replaces any py[i] equal to 1
   with 3.
   NOTE(review): the remaining setup statements are not visible in this
   listing.  */
2412 #define SPEED_ROUTINE_MPN_JACBASE(function) \
2413 SPEED_ROUTINE_MPN_GCD_1_CALL \
2415 /* require x<y, y odd, y!=1 */ \
2419 if (py[i]==1) py[i]=3; \
2421 function (px[j-1], py[j-1], 0))
2424 /* Run some GCDs of s->size limbs each. The number of different data values
2425 is decreased as s->size**2, since GCD is a quadratic algorithm.
2426 SPEED_ROUTINE_MPN_GCD runs more times than SPEED_ROUTINE_MPN_GCDEXT
2427 though, because the plain gcd is about twice as fast as gcdext. */
/* Shared body for multi-limb GCD style measurements.

   datafactor scales the number of operand pairs ("pieces") prepared:
   pieces = SPEED_BLOCK_SIZE * datafactor / s->size / s->size, capped at
   SPEED_BLOCK_SIZE / s->size and never less than 1.  With a single piece
   the data is copied from s->xp / s->yp, otherwise from s->xp_block /
   s->yp_block.

   Each pair is then adjusted: zero high limbs become 1, and the high limbs
   are arranged so that x's is strictly greater than y's (swapped when
   smaller, forced to 2 and 1 when equal).

   Inside the timed region each piece is copied into xtmp / ytmp, which are
   allocated with s->size+1 limbs, as are wp and wp2.  s->time_divisor is
   set to pieces.  The measurement is restricted to s->size >= 1.
   NOTE(review): the loop scaffolding, the expansion of `call' and the step
   that makes y odd are not visible in this listing.  */
2429 #define SPEED_ROUTINE_MPN_GCD_CALL(datafactor, call) \
2432 mp_size_t j, pieces, psize; \
2433 mp_ptr wp, wp2, xtmp, ytmp, px, py; \
2437 SPEED_RESTRICT_COND (s->size >= 1); \
2440 SPEED_TMP_ALLOC_LIMBS (xtmp, s->size+1, s->align_xp); \
2441 SPEED_TMP_ALLOC_LIMBS (ytmp, s->size+1, s->align_yp); \
2442 SPEED_TMP_ALLOC_LIMBS (wp, s->size+1, s->align_wp); \
2443 SPEED_TMP_ALLOC_LIMBS (wp2, s->size+1, s->align_wp2); \
2445 pieces = SPEED_BLOCK_SIZE * datafactor / s->size / s->size; \
2446 pieces = MIN (pieces, SPEED_BLOCK_SIZE / s->size); \
2447 pieces = MAX (pieces, 1); \
2449 psize = pieces * s->size; \
2450 px = TMP_ALLOC_LIMBS (psize); \
2451 py = TMP_ALLOC_LIMBS (psize); \
2452 MPN_COPY (px, pieces==1 ? s->xp : s->xp_block, psize); \
2453 MPN_COPY (py, pieces==1 ? s->yp : s->yp_block, psize); \
2455 /* Requirements: x >= y, y must be odd, high limbs != 0. \
2456 No need to ensure random numbers are really great. */ \
2457 for (j = 0; j < pieces; j++) \
2459 mp_ptr x = px + j * s->size; \
2460 mp_ptr y = py + j * s->size; \
2461 if (x[s->size - 1] == 0) x[s->size - 1] = 1; \
2462 if (y[s->size - 1] == 0) y[s->size - 1] = 1; \
2464 if (x[s->size - 1] < y[s->size - 1]) \
2465 MP_LIMB_T_SWAP (x[s->size - 1], y[s->size - 1]); \
2466 else if (x[s->size - 1] == y[s->size - 1]) \
2468 x[s->size - 1] = 2; \
2469 y[s->size - 1] = 1; \
2474 speed_operand_src (s, px, psize); \
2475 speed_operand_src (s, py, psize); \
2476 speed_operand_dst (s, xtmp, s->size); \
2477 speed_operand_dst (s, ytmp, s->size); \
2478 speed_operand_dst (s, wp, s->size); \
2479 speed_cache_fill (s); \
2481 speed_starttime (); \
2488 MPN_COPY (xtmp, px+(j - 1)*s->size, s->size); \
2489 MPN_COPY (ytmp, py+(j - 1)*s->size, s->size); \
2495 t = speed_endtime (); \
2499 s->time_divisor = pieces; \
/* Instantiate SPEED_ROUTINE_MPN_GCD_CALL with datafactor 8 and the call
   `function (wp, xtmp, s->size, ytmp, s->size)'.  */
2503 #define SPEED_ROUTINE_MPN_GCD(function) \
2504 SPEED_ROUTINE_MPN_GCD_CALL (8, function (wp, xtmp, s->size, ytmp, s->size))
/* Instantiate SPEED_ROUTINE_MPN_GCD_CALL with datafactor 4.  The call
   argument is a braced block that declares a local wp2size and invokes
   `function (wp, wp2, &wp2size, xtmp, s->size, ytmp, s->size)'.  */
2506 #define SPEED_ROUTINE_MPN_GCDEXT(function) \
2507 SPEED_ROUTINE_MPN_GCD_CALL \
2508 (4, { mp_size_t wp2size; \
2509 function (wp, wp2, &wp2size, xtmp, s->size, ytmp, s->size); })
/* Measure `function (wp, wp2, &wp2size, xtmp, s->size, ytmp, s->size)' on
   operands whose low limbs stay fixed while the top three limbs vary.

   xtmp and ytmp start as copies of s->xp and s->yp.  px and py hold
   SPEED_BLOCK_SIZE/3 three-limb pieces taken from s->xp_block and
   s->yp_block; each piece's top limb is made non-zero.  In the timed
   region a piece is written over the three limbs starting at
   xtmp[s->size-3] and ytmp[s->size-3], and the low bit of ytmp[0] is set so
   that y is odd.  s->time_divisor is set to pieces.

   The measurement is restricted to s->size >= 3: xth and yth point at index
   s->size-3, so any smaller size would store limbs in front of the start of
   xtmp and ytmp.
   NOTE(review): loop scaffolding and the condition guarding the
   MP_LIMB_T_SWAP are not visible in this listing.  */
2512 #define SPEED_ROUTINE_MPN_GCDEXT_ONE(function) \
2515 mp_size_t j, pieces, psize, wp2size; \
2516 mp_ptr wp, wp2, xtmp, ytmp, px, py; \
2520 SPEED_RESTRICT_COND (s->size >= 3); \
2524 SPEED_TMP_ALLOC_LIMBS (xtmp, s->size+1, s->align_xp); \
2525 SPEED_TMP_ALLOC_LIMBS (ytmp, s->size+1, s->align_yp); \
2526 MPN_COPY (xtmp, s->xp, s->size); \
2527 MPN_COPY (ytmp, s->yp, s->size); \
2529 SPEED_TMP_ALLOC_LIMBS (wp, s->size+1, s->align_wp); \
2530 SPEED_TMP_ALLOC_LIMBS (wp2, s->size+1, s->align_wp2); \
2532 pieces = SPEED_BLOCK_SIZE / 3; \
2533 psize = 3 * pieces; \
2534 px = TMP_ALLOC_LIMBS (psize); \
2535 py = TMP_ALLOC_LIMBS (psize); \
2536 MPN_COPY (px, s->xp_block, psize); \
2537 MPN_COPY (py, s->yp_block, psize); \
2539 /* x must have at least as many bits as y, \
2540 high limbs must be non-zero */ \
2541 for (j = 0; j < pieces; j++) \
2543 mp_ptr x = px+3*j; \
2544 mp_ptr y = py+3*j; \
2545 x[2] += (x[2] == 0); \
2546 y[2] += (y[2] == 0); \
2548 MP_LIMB_T_SWAP (x[2], y[2]); \
2551 speed_operand_src (s, px, psize); \
2552 speed_operand_src (s, py, psize); \
2553 speed_operand_dst (s, xtmp, s->size); \
2554 speed_operand_dst (s, ytmp, s->size); \
2555 speed_operand_dst (s, wp, s->size); \
2556 speed_cache_fill (s); \
2558 speed_starttime (); \
2564 mp_ptr xth = &xtmp[s->size-3]; \
2565 mp_ptr yth = &ytmp[s->size-3]; \
2569 xth[0] = x[0], xth[1] = x[1], xth[2] = x[2]; \
2570 yth[0] = y[0], yth[1] = y[1], yth[2] = y[2]; \
2572 ytmp[0] |= 1; /* y must be odd, */ \
2574 function (wp, wp2, &wp2size, xtmp, s->size, ytmp, s->size); \
2582 t = speed_endtime (); \
2586 s->time_divisor = pieces; \
/* Measurement body working on mpz operands a and b whose limb pointers are
   aimed at pre-built data rather than allocated per call.

   pieces = SPEED_BLOCK_SIZE / MAX (s->size, 1), never less than 1, and
   s->time_divisor is set to that count.  px and py hold pieces * s->size
   limbs, copied from s->xp / s->yp when there is a single piece and from
   s->xp_block / s->yp_block otherwise.  The high limb of every piece is
   forced non-zero.  In the timed region PTR(a) and PTR(b) are pointed at
   piece j-1 of px and py.
   NOTE(review): no size restriction is visible here, yet x[s->size-1] is
   accessed -- confirm s->size == 0 cannot reach this macro.  The call to
   `function' itself and the size fields of a and b are also not visible in
   this listing.  */
2590 #define SPEED_ROUTINE_MPZ_JACOBI(function) \
2594 mp_size_t j, pieces, psize; \
2600 pieces = SPEED_BLOCK_SIZE / MAX (s->size, 1); \
2601 pieces = MAX (pieces, 1); \
2602 s->time_divisor = pieces; \
2604 psize = pieces * s->size; \
2605 px = TMP_ALLOC_LIMBS (psize); \
2606 py = TMP_ALLOC_LIMBS (psize); \
2607 MPN_COPY (px, pieces==1 ? s->xp : s->xp_block, psize); \
2608 MPN_COPY (py, pieces==1 ? s->yp : s->yp_block, psize); \
2610 for (j = 0; j < pieces; j++) \
2612 mp_ptr x = px+j*s->size; \
2613 mp_ptr y = py+j*s->size; \
2618 /* high limbs non-zero */ \
2619 if (x[s->size-1] == 0) x[s->size-1] = 1; \
2620 if (y[s->size-1] == 0) y[s->size-1] = 1; \
2626 speed_operand_src (s, px, psize); \
2627 speed_operand_src (s, py, psize); \
2628 speed_cache_fill (s); \
2630 speed_starttime (); \
2637 PTR(a) = px+(j-1)*s->size; \
2638 PTR(b) = py+(j-1)*s->size; \
2644 t = speed_endtime (); \
/* Measure `function (wp, 0, xp, s->size, yp)': an s->size limb operand xp
   and a two-limb operand yp, with results written to wp.  xp is a private
   copy of s->xp because, per the inline comment, the source is destroyed.
   yp takes two limbs from s->yp_block and has GMP_NUMB_HIGHBIT set in
   yp[1] to satisfy the "divisor must be normalized" requirement.  The
   measurement is restricted to s->size >= 2.
   NOTE(review): the declaration of yp and the loop scaffolding are not
   visible in this listing.  */
2650 #define SPEED_ROUTINE_MPN_DIVREM_2(function) \
2658 SPEED_RESTRICT_COND (s->size >= 2); \
2661 SPEED_TMP_ALLOC_LIMBS (xp, s->size, s->align_xp); \
2662 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
2664 /* source is destroyed */ \
2665 MPN_COPY (xp, s->xp, s->size); \
2667 /* divisor must be normalized */ \
2668 MPN_COPY (yp, s->yp_block, 2); \
2669 yp[1] |= GMP_NUMB_HIGHBIT; \
2671 speed_operand_src (s, xp, s->size); \
2672 speed_operand_src (s, yp, 2); \
2673 speed_operand_dst (s, wp, s->size); \
2674 speed_cache_fill (s); \
2676 speed_starttime (); \
2679 function (wp, 0, xp, s->size, yp); \
2681 t = speed_endtime (); \
/* Measurement body that walks s->xp_block with j starting at
   SPEED_BLOCK_SIZE.  xp is set one limb below s->xp_block so that xp[j]
   with j in 1 .. SPEED_BLOCK_SIZE addresses the block.  On each step n is
   advanced by xp[j] << 1, making every input depend on the previous one
   (see the inline comment).  s->time_divisor is set to SPEED_BLOCK_SIZE.
   NOTE(review): the invocation of `function', the initial value of n and
   the statement that keeps n live are not visible in this listing.  */
2688 #define SPEED_ROUTINE_MODLIMB_INVERT(function) \
2695 xp = s->xp_block-1; \
2697 speed_operand_src (s, s->xp_block, SPEED_BLOCK_SIZE); \
2698 speed_cache_fill (s); \
2700 speed_starttime (); \
2704 j = SPEED_BLOCK_SIZE; \
2707 /* randomized but successively dependent */ \
2708 n += (xp[j] << 1); \
2715 t = speed_endtime (); \
2717 /* make sure the compiler won't optimize away n */ \
2720 s->time_divisor = SPEED_BLOCK_SIZE; \
/* Measure `function (wp, wp2, s->xp, s->size)': the input is read directly
   from s->xp and two s->size limb destinations, wp and wp2, are supplied.
   The measurement is restricted to s->size >= 1.
   NOTE(review): declarations and loop scaffolding are not visible in this
   listing.  */
2725 #define SPEED_ROUTINE_MPN_SQRTREM(function) \
2732 SPEED_RESTRICT_COND (s->size >= 1); \
2735 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
2736 SPEED_TMP_ALLOC_LIMBS (wp2, s->size, s->align_wp2); \
2738 speed_operand_src (s, s->xp, s->size); \
2739 speed_operand_dst (s, wp, s->size); \
2740 speed_operand_dst (s, wp2, s->size); \
2741 speed_cache_fill (s); \
2743 speed_starttime (); \
2746 function (wp, wp2, s->xp, s->size); \
2748 t = speed_endtime (); \
/* Measure `function (wp, wp2, s->xp, s->size, s->r)': same operand layout
   as the SQRTREM body, with s->r passed through as the final argument.  The
   measurement is restricted to s->size >= 1.
   NOTE(review): no check on s->r is visible here -- confirm whether r == 0
   is acceptable to the measured routine.  */
2754 #define SPEED_ROUTINE_MPN_ROOTREM(function) \
2761 SPEED_RESTRICT_COND (s->size >= 1); \
2764 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
2765 SPEED_TMP_ALLOC_LIMBS (wp2, s->size, s->align_wp2); \
2767 speed_operand_src (s, s->xp, s->size); \
2768 speed_operand_dst (s, wp, s->size); \
2769 speed_operand_dst (s, wp2, s->size); \
2770 speed_cache_fill (s); \
2772 speed_starttime (); \
2775 function (wp, wp2, s->xp, s->size, s->r); \
2777 t = speed_endtime (); \
2784 /* s->size controls the number of limbs in the input, s->r is the base, or
2785 decimal by default. */
/* Measure `function (wp, base, xp, s->size)'.  base is s->r, or 10 when
   s->r is 0, and must lie in 2 .. 256.  The output buffer wp has wn bytes,
   where wn comes from MPN_SIZEINBASE on the input.  xp is an
   s->size+1 limb scratch copy of s->xp that is refreshed by MPN_COPY
   immediately before the measured call (presumably because the routine
   clobbers its input -- verify).  An ASSERT_ALWAYS checks that mpn_get_str
   produces at most wn digits.  The measurement is restricted to
   s->size >= 1.
   NOTE(review): the conditional around the development-time check and the
   loop scaffolding are not visible in this listing.  */
2786 #define SPEED_ROUTINE_MPN_GET_STR(function) \
2788 unsigned char *wp; \
2796 SPEED_RESTRICT_COND (s->size >= 1); \
2798 base = s->r == 0 ? 10 : s->r; \
2799 SPEED_RESTRICT_COND (base >= 2 && base <= 256); \
2802 SPEED_TMP_ALLOC_LIMBS (xp, s->size + 1, s->align_xp); \
2804 MPN_SIZEINBASE (wn, s->xp, s->size, base); \
2805 wp = TMP_ALLOC (wn); \
2807 /* use this during development to guard against overflowing wp */ \
2809 MPN_COPY (xp, s->xp, s->size); \
2810 ASSERT_ALWAYS (mpn_get_str (wp, base, xp, s->size) <= wn); \
2813 speed_operand_src (s, s->xp, s->size); \
2814 speed_operand_dst (s, xp, s->size); \
2815 speed_operand_dst (s, (mp_ptr) wp, wn/BYTES_PER_MP_LIMB); \
2816 speed_cache_fill (s); \
2818 speed_starttime (); \
2822 MPN_COPY (xp, s->xp, s->size); \
2823 function (wp, base, xp, s->size); \
2826 t = speed_endtime (); \
2832 /* s->size controls the number of digits in the input, s->r is the base, or
2833 decimal by default. */
/* Shared body for string-to-limbs measurements.  base is s->r, or 10 when
   s->r is 0, and must lie in 2 .. 256.  The input xp is s->size bytes, each
   set to s->xp[i] % base so that every digit is valid for the base.  The
   destination wp has wn limbs, where wn is s->size divided by
   mp_bases[base].chars_per_bit_exactly, then by GMP_LIMB_BITS, plus 2.  An
   ASSERT_ALWAYS checks that mpn_set_str needs at most wn limbs.  The
   measurement is restricted to s->size >= 1.
   NOTE(review): the expansion of `call' and the conditional around the
   development-time check are not visible in this listing.  */
2834 #define SPEED_ROUTINE_MPN_SET_STR_CALL(call) \
2836 unsigned char *xp; \
2844 SPEED_RESTRICT_COND (s->size >= 1); \
2846 base = s->r == 0 ? 10 : s->r; \
2847 SPEED_RESTRICT_COND (base >= 2 && base <= 256); \
2851 xp = TMP_ALLOC (s->size); \
2852 for (i = 0; i < s->size; i++) \
2853 xp[i] = s->xp[i] % base; \
2855 wn = ((mp_size_t) (s->size / mp_bases[base].chars_per_bit_exactly)) \
2856 / GMP_LIMB_BITS + 2; \
2857 SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp); \
2859 /* use this during development to check wn is big enough */ \
2861 ASSERT_ALWAYS (mpn_set_str (wp, xp, s->size, base) <= wn); \
2864 speed_operand_src (s, (mp_ptr) xp, s->size/BYTES_PER_MP_LIMB); \
2865 speed_operand_dst (s, wp, wn); \
2866 speed_cache_fill (s); \
2868 speed_starttime (); \
2873 t = speed_endtime (); \
2880 /* Run an accel gcd find_a() function over various data values. A set of
2881 values is used in case some run particularly fast or slow. The size
2882 parameter is ignored, the amount of data tested is fixed. */
/* Measure `function (cp[j-1])' over SPEED_BLOCK_SIZE two-limb inputs held
   in the local array cp.  For each i, cp[i][0] is s->xp_block[i] with its
   low bit set and cp[i][1] is s->yp_block[i], bumped to 1 when it is zero.
   s->time_divisor is set to SPEED_BLOCK_SIZE.
   NOTE(review): the loop scaffolding around the call is not visible in this
   listing.  */
2884 #define SPEED_ROUTINE_MPN_GCD_FINDA(function) \
2887 mp_limb_t cp[SPEED_BLOCK_SIZE][2]; \
2893 /* low must be odd, high must be non-zero */ \
2894 for (i = 0; i < SPEED_BLOCK_SIZE; i++) \
2896 cp[i][0] = s->xp_block[i] | 1; \
2897 cp[i][1] = s->yp_block[i] + (s->yp_block[i] == 0); \
2900 speed_operand_src (s, &cp[0][0], 2*SPEED_BLOCK_SIZE); \
2901 speed_cache_fill (s); \
2903 speed_starttime (); \
2907 j = SPEED_BLOCK_SIZE; \
2910 function (cp[j-1]); \
2915 t = speed_endtime (); \
2919 s->time_divisor = SPEED_BLOCK_SIZE; \
2924 /* "call" should do "count_foo_zeros(c,n)".
2925 Give leading=1 if foo is leading zeros, leading=0 for trailing.
2926 Give zero=1 if n=0 is allowed in the call, zero=0 if not. */
/* Opening half of the count-zeros measurement body.  It allocates xp with
   SPEED_BLOCK_SIZE limbs, hands it to speed_routine_count_zeros_setup
   together with the leading and zero flags, registers xp as a source
   operand, starts the timer and opens a loop over i from 0 to
   SPEED_BLOCK_SIZE-1.  The loop body is supplied by whatever follows the
   expansion of this macro.
   NOTE(review): the action taken when the setup call returns zero is not
   visible in this listing.  */
2928 #define SPEED_ROUTINE_COUNT_ZEROS_A(leading, zero) \
2938 SPEED_TMP_ALLOC_LIMBS (xp, SPEED_BLOCK_SIZE, s->align_xp); \
2940 if (! speed_routine_count_zeros_setup (s, xp, leading, zero)) \
2942 speed_operand_src (s, xp, SPEED_BLOCK_SIZE); \
2943 speed_cache_fill (s); \
2946 speed_starttime (); \
2949 for (i = 0; i < SPEED_BLOCK_SIZE; i++) \
/* Closing half of the count-zeros measurement body.  It ends a do-loop
   counted down by j, stores the result of speed_endtime() in t and sets
   s->time_divisor to SPEED_BLOCK_SIZE.
   NOTE(review): the statement that keeps c live and the return of t are not
   visible in this listing.  */
2954 #define SPEED_ROUTINE_COUNT_ZEROS_B() \
2956 } while (--j != 0); \
2957 t = speed_endtime (); \
2959 /* don't let c go dead */ \
2962 s->time_divisor = SPEED_BLOCK_SIZE; \
/* Complete count-zeros measurement: expands SPEED_ROUTINE_COUNT_ZEROS_A
   with the leading and zero flags, then SPEED_ROUTINE_COUNT_ZEROS_B.
   NOTE(review): the line between the two halves, where `call' is expected
   to be expanded, is not visible in this listing.  */
2968 #define SPEED_ROUTINE_COUNT_ZEROS_C(call, leading, zero) \
2970 SPEED_ROUTINE_COUNT_ZEROS_A (leading, zero); \
2972 SPEED_ROUTINE_COUNT_ZEROS_B (); \
/* Leading-zeros wrappers around SPEED_ROUTINE_COUNT_ZEROS_C, both passing
   leading=1.  The _C form takes a ready-made call expression and a zero
   flag; the plain form takes a routine name and builds the call
   `fun (c, n)' with zero=0.  */
2975 #define SPEED_ROUTINE_COUNT_LEADING_ZEROS_C(call,zero) \
2976 SPEED_ROUTINE_COUNT_ZEROS_C (call, 1, zero)
2977 #define SPEED_ROUTINE_COUNT_LEADING_ZEROS(fun) \
2978 SPEED_ROUTINE_COUNT_ZEROS_C (fun (c, n), 1, 0)
/* Trailing-zeros wrapper around SPEED_ROUTINE_COUNT_ZEROS_C: passes the
   given call expression and zero flag through with leading=0.  */
2980 #define SPEED_ROUTINE_COUNT_TRAILING_ZEROS_C(call,zero) \
2981 SPEED_ROUTINE_COUNT_ZEROS_C (call, 0, zero)
/* Trailing-zeros measurement for a routine name: builds the call
   `fun (c, n)' and expands SPEED_ROUTINE_COUNT_ZEROS_C with leading=0 and
   zero=0.  The parameter must be spelled `fun' so that it is the
   identifier substituted into the call expression; with any other
   parameter name the argument would be dropped and a literal `fun' emitted
   instead.  */
2982 #define SPEED_ROUTINE_COUNT_TRAILING_ZEROS(fun) \
2983 SPEED_ROUTINE_COUNT_ZEROS_C (fun (c, n), 0, 0)
/* Shared body for limb-inversion measurements over s->xp_block.  xp is set
   one limb below s->xp_block and j starts at SPEED_BLOCK_SIZE.  Before each
   `call' the operand d has GMP_LIMB_HIGHBIT set; dinv starts at 0.
   s->time_divisor is set to SPEED_BLOCK_SIZE and the macro returns
   speed_endtime() directly.
   NOTE(review): how d is loaded from xp and how dinv is kept live are not
   visible in this listing.  */
2986 #define SPEED_ROUTINE_INVERT_LIMB_CALL(call) \
2989 mp_limb_t d, dinv=0; \
2990 mp_ptr xp = s->xp_block - 1; \
2992 s->time_divisor = SPEED_BLOCK_SIZE; \
2994 speed_starttime (); \
2998 j = SPEED_BLOCK_SIZE; \
3002 d |= GMP_LIMB_HIGHBIT; \
3003 do { call; } while (0); \
3009 /* don't let the compiler optimize everything away */ \
3012 return speed_endtime(); \
/* Measurement body that starts the timer with speed_starttime() and
   returns speed_endtime() directly.
   NOTE(review): everything between the two timer calls, including how
   `function' is invoked, is not visible in this listing -- confirm against
   the full header.  */
3019 #define SPEED_ROUTINE_MPN_BACK_TO_BACK(function) \
3022 speed_starttime (); \
3027 return speed_endtime (); \
/* Shared body for measurements that only write a destination: wp is
   allocated with s->size limbs and registered as the sole (destination)
   operand before the cache fill.  The measurement is restricted to
   s->size >= 0, and the result of speed_endtime() is stored in t.
   NOTE(review): the expansion of `call' and the loop scaffolding are not
   visible in this listing.  */
3031 #define SPEED_ROUTINE_MPN_ZERO_CALL(call) \
3038 SPEED_RESTRICT_COND (s->size >= 0); \
3041 SPEED_TMP_ALLOC_LIMBS (wp, s->size, s->align_wp); \
3042 speed_operand_dst (s, wp, s->size); \
3043 speed_cache_fill (s); \
3045 speed_starttime (); \
3050 t = speed_endtime (); \
/* Instantiate SPEED_ROUTINE_MPN_ZERO_CALL with the call
   `function (wp, s->size)'.  */
3056 #define SPEED_ROUTINE_MPN_ZERO(function) \
3057 SPEED_ROUTINE_MPN_ZERO_CALL (function (wp, s->size))