1 /* Speed measuring program.
3 Copyright 1999, 2000, 2001, 2002, 2003, 2005, 2006, 2008, 2009, 2010 Free
4 Software Foundation, Inc.
6 This file is part of the GNU MP Library.
8 The GNU MP Library is free software; you can redistribute it and/or modify
9 it under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or (at your
11 option) any later version.
13 The GNU MP Library is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16 License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
21 /* Usage message is in the code below, run with no arguments to print it.
22 See README for interesting applications.
24 To add a new routine foo(), create a speed_foo() function in the style of
25 the existing ones and add an entry in the routine[] array. Put FLAG_R if
26 speed_foo() wants an "r" parameter.
28 The routines don't have help messages or descriptions, but most have
29 suggestive names. See the source code for full details.
41 #include <unistd.h> /* for getpid, R_OK */
44 #if TIME_WITH_SYS_TIME
45 # include <sys/time.h> /* for struct timeval */
49 # include <sys/time.h>
55 #if HAVE_SYS_RESOURCE_H
56 #include <sys/resource.h> /* for getrusage() */
62 #include "longlong.h" /* for the benefit of speed-many.c */
69 extern int optind, opterr;
73 #define strtoul(p,e,b) (unsigned long) strtol(p,e,b)
76 #ifdef SPEED_EXTRA_PROTOS
79 #ifdef SPEED_EXTRA_PROTOS2
84 #define MPN_FILL(ptr, size, n) \
87 ASSERT ((size) >= 0); \
88 for (__i = 0; __i < (size); __i++) \
93 #if GMP_LIMB_BITS == 32
94 #define GMP_NUMB_0xAA (CNST_LIMB(0xAAAAAAAA) & GMP_NUMB_MASK)
96 #if GMP_LIMB_BITS == 64
97 #define GMP_NUMB_0xAA (CNST_LIMB(0xAAAAAAAAAAAAAAAA) & GMP_NUMB_MASK)
101 #define CMP_ABSOLUTE 1
103 #define CMP_DIFFERENCE 3
104 #define CMP_DIFFPREV 4
105 int option_cmp = CMP_ABSOLUTE;
107 #define UNIT_SECONDS 1
108 #define UNIT_CYCLES 2
109 #define UNIT_CYCLESPERLIMB 3
110 int option_unit = UNIT_SECONDS;
112 #define DATA_RANDOM 1
113 #define DATA_RANDOM2 2
118 int option_data = DATA_RANDOM;
120 int option_square = 0;
121 double option_factor = 0.0;
122 mp_size_t option_step = 1;
123 int option_gnuplot = 0;
124 char *option_gnuplot_basename;
125 struct size_array_t {
126 mp_size_t start, end;
127 } *size_array = NULL;
128 mp_size_t size_num = 0;
129 mp_size_t size_allocnum = 0;
130 int option_resource_usage = 0;
131 long option_seed = 123456789;
133 struct speed_params sp;
135 #define COLUMN_WIDTH 13 /* for the free-form output */
137 #define FLAG_R (1<<0) /* require ".r" */
138 #define FLAG_R_OPTIONAL (1<<1) /* optional ".r" */
139 #define FLAG_RSIZE (1<<2)
140 #define FLAG_NODATA (1<<3) /* don't alloc xp, yp */
142 const struct routine_t {
145 speed_function_t fun;
149 { "noop", speed_noop },
150 { "noop_wxs", speed_noop_wxs },
151 { "noop_wxys", speed_noop_wxys },
153 { "mpn_add_n", speed_mpn_add_n, FLAG_R_OPTIONAL },
154 { "mpn_sub_n", speed_mpn_sub_n, FLAG_R_OPTIONAL },
156 #if HAVE_NATIVE_mpn_add_n_sub_n
157 { "mpn_add_n_sub_n", speed_mpn_add_n_sub_n, FLAG_R_OPTIONAL },
160 { "mpn_addmul_1", speed_mpn_addmul_1, FLAG_R },
161 { "mpn_submul_1", speed_mpn_submul_1, FLAG_R },
162 #if HAVE_NATIVE_mpn_addmul_2
163 { "mpn_addmul_2", speed_mpn_addmul_2, FLAG_R_OPTIONAL },
165 #if HAVE_NATIVE_mpn_addmul_3
166 { "mpn_addmul_3", speed_mpn_addmul_3, FLAG_R_OPTIONAL },
168 #if HAVE_NATIVE_mpn_addmul_4
169 { "mpn_addmul_4", speed_mpn_addmul_4, FLAG_R_OPTIONAL },
171 #if HAVE_NATIVE_mpn_addmul_5
172 { "mpn_addmul_5", speed_mpn_addmul_5, FLAG_R_OPTIONAL },
174 #if HAVE_NATIVE_mpn_addmul_6
175 { "mpn_addmul_6", speed_mpn_addmul_6, FLAG_R_OPTIONAL },
177 #if HAVE_NATIVE_mpn_addmul_7
178 { "mpn_addmul_7", speed_mpn_addmul_7, FLAG_R_OPTIONAL },
180 #if HAVE_NATIVE_mpn_addmul_8
181 { "mpn_addmul_8", speed_mpn_addmul_8, FLAG_R_OPTIONAL },
183 { "mpn_mul_1", speed_mpn_mul_1, FLAG_R },
184 { "mpn_mul_1_inplace", speed_mpn_mul_1_inplace, FLAG_R },
185 #if HAVE_NATIVE_mpn_mul_2
186 { "mpn_mul_2", speed_mpn_mul_2, FLAG_R_OPTIONAL },
188 #if HAVE_NATIVE_mpn_mul_3
189 { "mpn_mul_3", speed_mpn_mul_3, FLAG_R_OPTIONAL },
191 #if HAVE_NATIVE_mpn_mul_4
192 { "mpn_mul_4", speed_mpn_mul_4, FLAG_R_OPTIONAL },
195 { "mpn_divrem_1", speed_mpn_divrem_1, FLAG_R },
196 { "mpn_divrem_1f", speed_mpn_divrem_1f, FLAG_R },
197 #if HAVE_NATIVE_mpn_divrem_1c
198 { "mpn_divrem_1c", speed_mpn_divrem_1c, FLAG_R },
199 { "mpn_divrem_1cf", speed_mpn_divrem_1cf,FLAG_R },
201 { "mpn_mod_1", speed_mpn_mod_1, FLAG_R_OPTIONAL },
202 #if HAVE_NATIVE_mpn_mod_1c
203 { "mpn_mod_1c", speed_mpn_mod_1c, FLAG_R_OPTIONAL },
205 { "mpn_preinv_divrem_1", speed_mpn_preinv_divrem_1, FLAG_R },
206 { "mpn_preinv_divrem_1f", speed_mpn_preinv_divrem_1f, FLAG_R },
207 { "mpn_preinv_mod_1", speed_mpn_preinv_mod_1, FLAG_R },
209 { "mpn_mod_1_1", speed_mpn_mod_1_1, FLAG_R_OPTIONAL },
210 { "mpn_mod_1s_2", speed_mpn_mod_1_2, FLAG_R_OPTIONAL },
211 { "mpn_mod_1s_3", speed_mpn_mod_1_3, FLAG_R_OPTIONAL },
212 { "mpn_mod_1s_4", speed_mpn_mod_1_4, FLAG_R_OPTIONAL },
214 { "mpn_divrem_1_div", speed_mpn_divrem_1_div, FLAG_R },
215 { "mpn_divrem_1_inv", speed_mpn_divrem_1_inv, FLAG_R },
216 { "mpn_divrem_1f_div", speed_mpn_divrem_1f_div, FLAG_R },
217 { "mpn_divrem_1f_inv", speed_mpn_divrem_1f_inv, FLAG_R },
218 { "mpn_mod_1_div", speed_mpn_mod_1_div, FLAG_R },
219 { "mpn_mod_1_inv", speed_mpn_mod_1_inv, FLAG_R },
221 { "mpn_divrem_2", speed_mpn_divrem_2, },
222 { "mpn_divrem_2_div", speed_mpn_divrem_2_div, },
223 { "mpn_divrem_2_inv", speed_mpn_divrem_2_inv, },
225 { "mpn_divexact_1", speed_mpn_divexact_1, FLAG_R },
226 { "mpn_divexact_by3", speed_mpn_divexact_by3 },
228 { "mpn_bdiv_q_1", speed_mpn_bdiv_q_1, FLAG_R_OPTIONAL },
229 { "mpn_pi1_bdiv_q_1", speed_mpn_pi1_bdiv_q_1, FLAG_R_OPTIONAL },
230 { "mpn_bdiv_dbm1c", speed_mpn_bdiv_dbm1c, FLAG_R_OPTIONAL },
232 #if HAVE_NATIVE_mpn_modexact_1_odd
233 { "mpn_modexact_1_odd", speed_mpn_modexact_1_odd, FLAG_R },
235 { "mpn_modexact_1c_odd", speed_mpn_modexact_1c_odd, FLAG_R },
237 #if GMP_NUMB_BITS % 4 == 0
238 { "mpn_mod_34lsub1", speed_mpn_mod_34lsub1 },
241 { "mpn_lshift", speed_mpn_lshift, FLAG_R },
242 { "mpn_lshiftc", speed_mpn_lshiftc, FLAG_R },
243 { "mpn_rshift", speed_mpn_rshift, FLAG_R },
245 { "mpn_and_n", speed_mpn_and_n, FLAG_R_OPTIONAL },
246 { "mpn_andn_n", speed_mpn_andn_n, FLAG_R_OPTIONAL },
247 { "mpn_nand_n", speed_mpn_nand_n, FLAG_R_OPTIONAL },
248 { "mpn_ior_n", speed_mpn_ior_n, FLAG_R_OPTIONAL },
249 { "mpn_iorn_n", speed_mpn_iorn_n, FLAG_R_OPTIONAL },
250 { "mpn_nior_n", speed_mpn_nior_n, FLAG_R_OPTIONAL },
251 { "mpn_xor_n", speed_mpn_xor_n, FLAG_R_OPTIONAL },
252 { "mpn_xnor_n", speed_mpn_xnor_n, FLAG_R_OPTIONAL },
253 { "mpn_com", speed_mpn_com },
255 { "mpn_popcount", speed_mpn_popcount },
256 { "mpn_hamdist", speed_mpn_hamdist },
258 { "mpn_matrix22_mul", speed_mpn_matrix22_mul },
260 { "mpn_hgcd", speed_mpn_hgcd },
261 { "mpn_hgcd_lehmer", speed_mpn_hgcd_lehmer },
263 { "mpn_gcd_1", speed_mpn_gcd_1, FLAG_R_OPTIONAL },
264 { "mpn_gcd_1N", speed_mpn_gcd_1N, FLAG_R_OPTIONAL },
266 { "mpn_gcd", speed_mpn_gcd },
268 { "mpn_gcd_binary", speed_mpn_gcd_binary },
269 { "mpn_gcd_accel", speed_mpn_gcd_accel },
270 { "find_a", speed_find_a, FLAG_NODATA },
273 { "mpn_gcdext", speed_mpn_gcdext },
274 { "mpn_gcdext_single", speed_mpn_gcdext_single },
275 { "mpn_gcdext_double", speed_mpn_gcdext_double },
276 { "mpn_gcdext_one_single", speed_mpn_gcdext_one_single },
277 { "mpn_gcdext_one_double", speed_mpn_gcdext_one_double },
279 { "mpn_gcdext_lehmer", speed_mpn_gcdext_lehmer },
281 { "mpz_jacobi", speed_mpz_jacobi },
282 { "mpn_jacobi_base", speed_mpn_jacobi_base },
283 { "mpn_jacobi_base_1", speed_mpn_jacobi_base_1 },
284 { "mpn_jacobi_base_2", speed_mpn_jacobi_base_2 },
285 { "mpn_jacobi_base_3", speed_mpn_jacobi_base_3 },
287 { "mpn_mul", speed_mpn_mul, FLAG_R_OPTIONAL },
288 { "mpn_mul_basecase", speed_mpn_mul_basecase,FLAG_R_OPTIONAL },
289 { "mpn_sqr_basecase", speed_mpn_sqr_basecase },
290 #if HAVE_NATIVE_mpn_sqr_diagonal
291 { "mpn_sqr_diagonal", speed_mpn_sqr_diagonal },
294 { "mpn_mul_n", speed_mpn_mul_n },
295 { "mpn_sqr", speed_mpn_sqr },
297 { "mpn_toom2_sqr", speed_mpn_toom2_sqr },
298 { "mpn_toom3_sqr", speed_mpn_toom3_sqr },
299 { "mpn_toom4_sqr", speed_mpn_toom4_sqr },
300 { "mpn_toom6_sqr", speed_mpn_toom6_sqr },
301 { "mpn_toom8_sqr", speed_mpn_toom8_sqr },
302 { "mpn_toom22_mul", speed_mpn_toom22_mul },
303 { "mpn_toom33_mul", speed_mpn_toom33_mul },
304 { "mpn_toom44_mul", speed_mpn_toom44_mul },
305 { "mpn_toom6h_mul", speed_mpn_toom6h_mul },
306 { "mpn_toom8h_mul", speed_mpn_toom8h_mul },
307 { "mpn_toom32_mul", speed_mpn_toom32_mul },
308 { "mpn_toom42_mul", speed_mpn_toom42_mul },
309 { "mpn_toom43_mul", speed_mpn_toom43_mul },
310 { "mpn_toom63_mul", speed_mpn_toom63_mul },
311 { "mpn_nussbaumer_mul", speed_mpn_nussbaumer_mul },
312 { "mpn_nussbaumer_mul_sqr",speed_mpn_nussbaumer_mul_sqr},
313 #if WANT_OLD_FFT_FULL
314 { "mpn_mul_fft_full", speed_mpn_mul_fft_full },
315 { "mpn_mul_fft_full_sqr", speed_mpn_mul_fft_full_sqr },
317 { "mpn_mul_fft", speed_mpn_mul_fft, FLAG_R_OPTIONAL },
318 { "mpn_mul_fft_sqr", speed_mpn_mul_fft_sqr, FLAG_R_OPTIONAL },
320 { "mpn_mullo_n", speed_mpn_mullo_n },
321 { "mpn_mullo_basecase", speed_mpn_mullo_basecase },
323 { "mpn_bc_mulmod_bnm1", speed_mpn_bc_mulmod_bnm1 },
324 { "mpn_mulmod_bnm1", speed_mpn_mulmod_bnm1 },
325 { "mpn_mulmod_bnm1_rounded", speed_mpn_mulmod_bnm1_rounded },
326 { "mpn_sqrmod_bnm1", speed_mpn_sqrmod_bnm1 },
328 { "mpn_invert", speed_mpn_invert },
329 { "mpn_invertappr", speed_mpn_invertappr },
330 { "mpn_ni_invertappr", speed_mpn_ni_invertappr },
331 { "mpn_binvert", speed_mpn_binvert },
333 { "mpn_sbpi1_div_qr", speed_mpn_sbpi1_div_qr, FLAG_R_OPTIONAL},
334 { "mpn_dcpi1_div_qr", speed_mpn_dcpi1_div_qr, FLAG_R_OPTIONAL},
335 { "mpn_mu_div_qr", speed_mpn_mu_div_qr, FLAG_R_OPTIONAL},
336 { "mpn_mupi_div_qr", speed_mpn_mupi_div_qr, FLAG_R_OPTIONAL},
337 { "mpn_sbpi1_divappr_q", speed_mpn_sbpi1_divappr_q, FLAG_R_OPTIONAL},
338 { "mpn_dcpi1_divappr_q", speed_mpn_dcpi1_divappr_q, FLAG_R_OPTIONAL},
340 { "mpn_sbpi1_bdiv_qr", speed_mpn_sbpi1_bdiv_qr },
341 { "mpn_dcpi1_bdiv_qr", speed_mpn_dcpi1_bdiv_qr },
342 { "mpn_sbpi1_bdiv_q", speed_mpn_sbpi1_bdiv_q },
343 { "mpn_dcpi1_bdiv_q", speed_mpn_dcpi1_bdiv_q },
345 { "mpn_get_str", speed_mpn_get_str, FLAG_R_OPTIONAL },
346 { "mpn_set_str", speed_mpn_set_str, FLAG_R_OPTIONAL },
347 { "mpn_set_str_basecase", speed_mpn_bc_set_str, FLAG_R_OPTIONAL },
349 { "mpn_sqrtrem", speed_mpn_sqrtrem },
350 { "mpn_rootrem", speed_mpn_rootrem, FLAG_R },
352 { "mpn_fib2_ui", speed_mpn_fib2_ui, FLAG_NODATA },
353 { "mpz_fib_ui", speed_mpz_fib_ui, FLAG_NODATA },
354 { "mpz_fib2_ui", speed_mpz_fib2_ui, FLAG_NODATA },
355 { "mpz_lucnum_ui", speed_mpz_lucnum_ui, FLAG_NODATA },
356 { "mpz_lucnum2_ui", speed_mpz_lucnum2_ui, FLAG_NODATA },
358 { "mpz_add", speed_mpz_add },
359 { "mpz_bin_uiui", speed_mpz_bin_uiui, FLAG_NODATA | FLAG_R_OPTIONAL },
360 { "mpz_fac_ui", speed_mpz_fac_ui, FLAG_NODATA },
361 { "mpz_powm", speed_mpz_powm },
362 { "mpz_powm_mod", speed_mpz_powm_mod },
363 { "mpz_powm_redc", speed_mpz_powm_redc },
364 { "mpz_powm_ui", speed_mpz_powm_ui, FLAG_R_OPTIONAL },
366 { "mpz_mod", speed_mpz_mod },
367 { "mpn_redc_1", speed_mpn_redc_1 },
368 { "mpn_redc_2", speed_mpn_redc_2 },
369 { "mpn_redc_n", speed_mpn_redc_n },
371 { "MPN_COPY", speed_MPN_COPY },
372 { "MPN_COPY_INCR", speed_MPN_COPY_INCR },
373 { "MPN_COPY_DECR", speed_MPN_COPY_DECR },
374 { "memcpy", speed_memcpy },
375 #if HAVE_NATIVE_mpn_copyi
376 { "mpn_copyi", speed_mpn_copyi },
378 #if HAVE_NATIVE_mpn_copyd
379 { "mpn_copyd", speed_mpn_copyd },
381 #if HAVE_NATIVE_mpn_addlsh1_n
382 { "mpn_addlsh1_n", speed_mpn_addlsh1_n },
384 #if HAVE_NATIVE_mpn_sublsh1_n
385 { "mpn_sublsh1_n", speed_mpn_sublsh1_n },
387 #if HAVE_NATIVE_mpn_rsblsh1_n
388 { "mpn_rsblsh1_n", speed_mpn_rsblsh1_n },
390 #if HAVE_NATIVE_mpn_addlsh2_n
391 { "mpn_addlsh2_n", speed_mpn_addlsh2_n },
393 #if HAVE_NATIVE_mpn_sublsh2_n
394 { "mpn_sublsh2_n", speed_mpn_sublsh2_n },
396 #if HAVE_NATIVE_mpn_rsblsh2_n
397 { "mpn_rsblsh2_n", speed_mpn_rsblsh2_n },
399 #if HAVE_NATIVE_mpn_rsh1add_n
400 { "mpn_rsh1add_n", speed_mpn_rsh1add_n },
402 #if HAVE_NATIVE_mpn_rsh1sub_n
403 { "mpn_rsh1sub_n", speed_mpn_rsh1sub_n },
406 { "MPN_ZERO", speed_MPN_ZERO },
408 { "binvert_limb", speed_binvert_limb, FLAG_NODATA },
409 { "binvert_limb_mul1", speed_binvert_limb_mul1, FLAG_NODATA },
410 { "binvert_limb_loop", speed_binvert_limb_loop, FLAG_NODATA },
411 { "binvert_limb_cond", speed_binvert_limb_cond, FLAG_NODATA },
412 { "binvert_limb_arith", speed_binvert_limb_arith, FLAG_NODATA },
414 { "malloc_free", speed_malloc_free },
415 { "malloc_realloc_free", speed_malloc_realloc_free },
416 { "gmp_allocate_free", speed_gmp_allocate_free },
417 { "gmp_allocate_reallocate_free", speed_gmp_allocate_reallocate_free },
418 { "mpz_init_clear", speed_mpz_init_clear },
419 { "mpq_init_clear", speed_mpq_init_clear },
420 { "mpf_init_clear", speed_mpf_init_clear },
421 { "mpz_init_realloc_clear", speed_mpz_init_realloc_clear },
423 { "umul_ppmm", speed_umul_ppmm, FLAG_R_OPTIONAL },
424 #if HAVE_NATIVE_mpn_umul_ppmm
425 { "mpn_umul_ppmm", speed_mpn_umul_ppmm, FLAG_R_OPTIONAL },
427 #if HAVE_NATIVE_mpn_umul_ppmm_r
428 { "mpn_umul_ppmm_r", speed_mpn_umul_ppmm_r, FLAG_R_OPTIONAL },
431 { "count_leading_zeros", speed_count_leading_zeros, FLAG_NODATA | FLAG_R_OPTIONAL },
432 { "count_trailing_zeros", speed_count_trailing_zeros, FLAG_NODATA | FLAG_R_OPTIONAL },
434 { "udiv_qrnnd", speed_udiv_qrnnd, FLAG_R_OPTIONAL },
435 { "udiv_qrnnd_preinv1", speed_udiv_qrnnd_preinv1, FLAG_R_OPTIONAL },
436 { "udiv_qrnnd_preinv2", speed_udiv_qrnnd_preinv2, FLAG_R_OPTIONAL },
437 { "udiv_qrnnd_c", speed_udiv_qrnnd_c, FLAG_R_OPTIONAL },
438 #if HAVE_NATIVE_mpn_udiv_qrnnd
439 { "mpn_udiv_qrnnd", speed_mpn_udiv_qrnnd, FLAG_R_OPTIONAL },
441 #if HAVE_NATIVE_mpn_udiv_qrnnd_r
442 { "mpn_udiv_qrnnd_r", speed_mpn_udiv_qrnnd_r, FLAG_R_OPTIONAL },
444 { "invert_limb", speed_invert_limb, FLAG_R_OPTIONAL },
446 { "operator_div", speed_operator_div, FLAG_R_OPTIONAL },
447 { "operator_mod", speed_operator_mod, FLAG_R_OPTIONAL },
449 { "gmp_randseed", speed_gmp_randseed, FLAG_R_OPTIONAL },
450 { "gmp_randseed_ui", speed_gmp_randseed_ui, FLAG_R_OPTIONAL | FLAG_NODATA },
451 { "mpz_urandomb", speed_mpz_urandomb, FLAG_R_OPTIONAL | FLAG_NODATA },
453 #ifdef SPEED_EXTRA_ROUTINES
456 #ifdef SPEED_EXTRA_ROUTINES2
457 SPEED_EXTRA_ROUTINES2
463 const struct routine_t *p;
471 struct choice_t *choice;
476 data_fill (mp_ptr ptr, mp_size_t size)
478 switch (option_data) {
480 mpn_random (ptr, size);
483 mpn_random2 (ptr, size);
486 MPN_ZERO (ptr, size);
489 MPN_FILL (ptr, size, GMP_NUMB_0xAA);
492 MPN_FILL (ptr, size, GMP_NUMB_MAX);
495 MPN_FILL (ptr, size, GMP_NUMB_MAX);
504 /* The code here handling the various combinations of output options isn't
505 too attractive, but it works and is fairly clean. */
507 #define SIZE_TO_DIVISOR(n) \
508 (option_square == 1 ? (n)*(n) \
509 : option_square == 2 ? (n)*((n)+1)/2 \
513 run_one (FILE *fp, struct speed_params *s, mp_size_t prev_size)
515 const char *first_open_fastest, *first_open_notfastest, *first_close;
516 int i, fastest, want_data;
522 /* allocate data, unless all routines are NODATA */
524 for (i = 0; i < num_choices; i++)
525 want_data |= ((choice[i].p->flag & FLAG_NODATA) == 0);
529 SPEED_TMP_ALLOC_LIMBS (sp.xp, s->size, s->align_xp);
530 SPEED_TMP_ALLOC_LIMBS (sp.yp, s->size, s->align_yp);
532 data_fill (s->xp, s->size);
533 data_fill (s->yp, s->size);
541 if (prev_size == -1 && option_cmp == CMP_DIFFPREV)
543 first_open_fastest = "(#";
544 first_open_notfastest = " (";
549 first_open_fastest = "#";
550 first_open_notfastest = " ";
556 for (i = 0; i < num_choices; i++)
559 choice[i].time = speed_measure (choice[i].p->fun, s);
560 choice[i].no_time = (choice[i].time == -1.0);
561 if (! choice[i].no_time)
562 choice[i].time *= choice[i].scale;
564 /* Apply the effect of CMP_DIFFPREV, but the new choice[i].prev_time
565 is before any differences. */
569 if (t != -1.0 && option_cmp == CMP_DIFFPREV && prev_size != -1)
571 if (choice[i].prev_time == -1.0)
572 choice[i].no_time = 1;
574 choice[i].time = choice[i].time - choice[i].prev_time;
576 choice[i].prev_time = t;
579 if (choice[i].no_time)
582 /* Look for the fastest after CMP_DIFFPREV has been applied, but
583 before CMP_RATIO or CMP_DIFFERENCE. There's only a fastest shown
584 if there's more than one routine. */
585 if (num_choices > 1 && (fastest == -1 || choice[i].time < fastest_time))
588 fastest_time = choice[i].time;
591 if (option_cmp == CMP_DIFFPREV)
593 /* Conversion for UNIT_CYCLESPERLIMB differs in CMP_DIFFPREV. */
594 if (option_unit == UNIT_CYCLES)
595 choice[i].time /= speed_cycletime;
596 else if (option_unit == UNIT_CYCLESPERLIMB)
599 choice[i].time /= speed_cycletime;
601 choice[i].time /= (speed_cycletime
602 * (SIZE_TO_DIVISOR(s->size)
603 - SIZE_TO_DIVISOR(prev_size)));
608 if (option_unit == UNIT_CYCLES)
609 choice[i].time /= speed_cycletime;
610 else if (option_unit == UNIT_CYCLESPERLIMB)
611 choice[i].time /= (speed_cycletime * SIZE_TO_DIVISOR(s->size));
613 if (option_cmp == CMP_RATIO && i > 0)
615 /* A ratio isn't affected by the units chosen. */
616 if (choice[0].no_time || choice[0].time == 0.0)
617 choice[i].no_time = 1;
619 choice[i].time /= choice[0].time;
621 else if (option_cmp == CMP_DIFFERENCE && i > 0)
623 if (choice[0].no_time)
625 choice[i].no_time = 1;
628 choice[i].time -= choice[0].time;
635 /* In CMP_DIFFPREV, don't print anything for the first size, start
636 with the second where an actual difference is available.
638 In CMP_RATIO, print the first column as 1.0.
640 The 9 decimals printed is much more than the expected precision of
641 the measurements actually. */
643 if (! (option_cmp == CMP_DIFFPREV && prev_size == -1))
645 fprintf (fp, "%-6ld ", s->size);
646 for (i = 0; i < num_choices; i++)
647 fprintf (fp, " %.9e",
648 choice[i].no_time ? 0.0
649 : (option_cmp == CMP_RATIO && i == 0) ? 1.0
656 fprintf (fp, "%-6ld ", s->size);
657 for (i = 0; i < num_choices; i++)
662 if (choice[i].no_time)
664 fprintf (fp, " %*s", COLUMN_WIDTH, "n/a");
667 {if (option_unit == UNIT_CYCLESPERLIMB
668 || (option_cmp == CMP_RATIO && i > 0))
670 else if (option_unit == UNIT_CYCLES)
675 sprintf (buf, "%s%.*f%s",
676 i == fastest ? first_open_fastest : first_open_notfastest,
677 decimals, choice[i].time, first_close);
678 fprintf (fp, " %*s", COLUMN_WIDTH, buf);
695 SPEED_TMP_ALLOC_LIMBS (sp.xp_block, SPEED_BLOCK_SIZE, sp.align_xp);
696 SPEED_TMP_ALLOC_LIMBS (sp.yp_block, SPEED_BLOCK_SIZE, sp.align_yp);
698 data_fill (sp.xp_block, SPEED_BLOCK_SIZE);
699 data_fill (sp.yp_block, SPEED_BLOCK_SIZE);
701 for (i = 0; i < size_num; i++)
703 sp.size = size_array[i].start;
709 if (option_data == DATA_2FD && sp.size >= 2)
710 sp.xp[sp.size-1] = 2;
712 run_one (fp, &sp, prev_size);
715 if (option_data == DATA_2FD && sp.size >= 2)
716 sp.xp[sp.size-1] = MP_LIMB_T_MAX;
718 if (option_factor != 0.0)
720 step = (mp_size_t) (sp.size * option_factor - sp.size);
726 if (step < option_step)
730 if (sp.size > size_array[i].end)
740 fopen_for_write (const char *filename)
743 if ((fp = fopen (filename, "w")) == NULL)
745 fprintf (stderr, "Cannot create %s\n", filename);
752 fclose_written (FILE *fp, const char *filename)
761 fprintf (stderr, "Error writing %s\n", filename);
768 run_gnuplot (int argc, char *argv[])
775 plot_filename = (char *) (*__gmp_allocate_func)
776 (strlen (option_gnuplot_basename) + 20);
777 data_filename = (char *) (*__gmp_allocate_func)
778 (strlen (option_gnuplot_basename) + 20);
780 sprintf (plot_filename, "%s.gnuplot", option_gnuplot_basename);
781 sprintf (data_filename, "%s.data", option_gnuplot_basename);
783 fp = fopen_for_write (plot_filename);
785 fprintf (fp, "# Generated with:\n");
787 for (i = 0; i < argc; i++)
788 fprintf (fp, " %s", argv[i]);
792 fprintf (fp, "reset\n");
794 /* Putting the key at the top left is usually good, and you can change it
795 interactively if it's not. */
796 fprintf (fp, "set key left\n");
798 /* designed to make it possible to see crossovers easily */
799 fprintf (fp, "set data style lines\n");
801 fprintf (fp, "plot ");
802 for (i = 0; i < num_choices; i++)
804 fprintf (fp, " \"%s\" using 1:%d", data_filename, i+2);
805 fprintf (fp, " title \"%s\"", choice[i].name);
807 if (i != num_choices-1)
808 fprintf (fp, ", \\");
812 fprintf (fp, "load \"-\"\n");
813 fclose_written (fp, plot_filename);
815 fp = fopen_for_write (data_filename);
817 /* Unbuffered so you can see where the program was up to if it crashes or
822 fclose_written (fp, data_filename);
826 /* Return a limb with n many one bits (starting from the least significant) */
828 #define LIMB_ONES(n) \
829 ((n) == GMP_LIMB_BITS ? MP_LIMB_T_MAX \
830 : (n) == 0 ? CNST_LIMB(0) \
831 : (CNST_LIMB(1) << (n)) - 1)
834 r_string (const char *s)
836 const char *s_orig = s;
839 if (strcmp (s, "aas") == 0)
840 return GMP_NUMB_0xAA;
848 set = mpz_set_str (z, s, 0);
850 l = (siz == 0 ? 0 : siz > 0 ? PTR(z)[0] : -PTR(z)[0]);
854 if (siz > 1 || siz < -1)
855 printf ("Warning, r parameter %s truncated to %d bits\n",
856 s_orig, GMP_LIMB_BITS);
861 if (s[0] == '0' && (s[1] == 'x' || s[1] == 'X'))
862 n = strtoul (s+2, (char **) &s, 16);
864 n = strtol (s, (char **) &s, 10);
866 if (strcmp (s, "bits") == 0)
869 if (n > GMP_LIMB_BITS)
871 fprintf (stderr, "%ld bit parameter invalid (max %d bits)\n",
876 return (l | (CNST_LIMB(1) << (n-1))) & LIMB_ONES(n);
878 else if (strcmp (s, "ones") == 0)
880 if (n > GMP_LIMB_BITS)
882 fprintf (stderr, "%ld bit parameter invalid (max %d bits)\n",
886 return LIMB_ONES (n);
890 fprintf (stderr, "invalid r parameter: %s\n", s_orig);
899 routine_find (struct choice_t *c, const char *s_orig)
906 s = strchr (s_orig, '*');
909 c->scale = atof(s_orig);
918 for (i = 0; i < numberof (routine); i++)
920 nlen = strlen (routine[i].name);
921 if (memcmp (s, routine[i].name, nlen) != 0)
926 /* match, with a .r parameter */
928 if (! (routine[i].flag & (FLAG_R|FLAG_R_OPTIONAL)))
931 "Choice %s bad: doesn't take a \".<r>\" parameter\n",
937 c->r = r_string (s + nlen + 1);
943 /* match, with no parameter */
945 if (routine[i].flag & FLAG_R)
948 "Choice %s bad: needs a \".<r>\" parameter\n",
959 fprintf (stderr, "Choice %s unrecognised\n", s_orig);
971 printf ("Usage: speed [-options] -s size <routine>...\n");
972 printf ("Measure the speed of some routines.\n");
973 printf ("Times are in seconds, accuracy is shown.\n");
975 printf (" -p num set precision as number of time units each routine must run\n");
976 printf (" -s size[-end][,size[-end]]... sizes to measure\n");
977 printf (" single sizes or ranges, sep with comma or use multiple -s\n");
978 printf (" -t step step through sizes by given amount\n");
979 printf (" -f factor step through sizes by given factor (eg. 1.05)\n");
980 printf (" -r show times as ratios of the first routine\n");
981 printf (" -d show times as difference from the first routine\n");
982 printf (" -D show times as difference from previous size shown\n");
983 printf (" -c show times in CPU cycles\n");
984 printf (" -C show times in cycles per limb\n");
985 printf (" -u print resource usage (memory) at end\n");
986 printf (" -P name output plot files \"name.gnuplot\" and \"name.data\"\n");
987 printf (" -a <type> use given data: random(default), random2, zeros, aas, ffs, 2fd\n");
988 printf (" -x, -y, -w, -W <align> specify data alignments, sources and dests\n");
989 printf (" -o addrs print addresses of data blocks\n");
991 printf ("If both -t and -f are used, it means step by the factor or the step, whichever\n");
992 printf ("is greater.\n");
993 printf ("If both -C and -D are used, it means cycles per however many limbs between a\n");
994 printf ("size and the previous size.\n");
996 printf ("After running with -P, plots can be viewed with Gnuplot or Quickplot.\n");
997 printf ("\"gnuplot name.gnuplot\" (use \"set logscale xy; replot\" at the prompt for\n");
998 printf ("a log/log plot).\n");
999 printf ("\"quickplot -s name.data\" (has interactive zooming, and note -s is important\n");
1000 printf ("when viewing more than one routine, it means same axis scales for all data).\n");
1002 printf ("The available routines are as follows.\n");
1005 for (i = 0; i < numberof (routine); i++)
1007 if (routine[i].flag & FLAG_R)
1008 printf ("\t%s.r\n", routine[i].name);
1009 else if (routine[i].flag & FLAG_R_OPTIONAL)
1010 printf ("\t%s (optional .r)\n", routine[i].name);
1012 printf ("\t%s\n", routine[i].name);
1015 printf ("Routines with a \".r\" need an extra parameter, for example mpn_lshift.6\n");
1016 printf ("r should be in decimal, or use 0xN for hexadecimal.\n");
1018 printf ("Special forms for r are \"<N>bits\" for a random N bit number, \"<N>ones\" for\n");
1019 printf ("N one bits, or \"aas\" for 0xAA..AA.\n");
1021 printf ("Times for sizes out of the range accepted by a routine are shown as 0.\n");
1022 printf ("The fastest routine at each size is marked with a # (free form output only).\n");
1024 printf ("%s", speed_time_string);
1026 printf ("Gnuplot home page http://www.gnuplot.info/\n");
1027 printf ("Quickplot home page http://quickplot.sourceforge.net/\n");
1031 check_align_option (const char *name, mp_size_t align)
1033 if (align < 0 || align > SPEED_TMP_ALLOC_ADJUST_MASK)
1035 fprintf (stderr, "Alignment request out of range: %s %ld\n",
1036 name, (long) align);
1037 fprintf (stderr, " should be 0 to %d (limbs), inclusive\n",
1038 SPEED_TMP_ALLOC_ADJUST_MASK);
1044 main (int argc, char *argv[])
1049 /* Unbuffered so output goes straight out when directed to a pipe or file
1050 and isn't lost on killing the program half way. */
1051 setbuf (stdout, NULL);
1055 opt = getopt(argc, argv, "a:CcDdEFf:o:p:P:rRs:t:ux:y:w:W:z");
1061 if (strcmp (optarg, "random") == 0) option_data = DATA_RANDOM;
1062 else if (strcmp (optarg, "random2") == 0) option_data = DATA_RANDOM2;
1063 else if (strcmp (optarg, "zeros") == 0) option_data = DATA_ZEROS;
1064 else if (strcmp (optarg, "aas") == 0) option_data = DATA_AAS;
1065 else if (strcmp (optarg, "ffs") == 0) option_data = DATA_FFS;
1066 else if (strcmp (optarg, "2fd") == 0) option_data = DATA_2FD;
1069 fprintf (stderr, "unrecognised data option: %s\n", optarg);
1074 if (option_unit != UNIT_SECONDS) goto bad_unit;
1075 option_unit = UNIT_CYCLESPERLIMB;
1078 if (option_unit != UNIT_SECONDS)
1081 fprintf (stderr, "cannot use more than one of -c, -C\n");
1084 option_unit = UNIT_CYCLES;
1087 if (option_cmp != CMP_ABSOLUTE) goto bad_cmp;
1088 option_cmp = CMP_DIFFPREV;
1091 if (option_cmp != CMP_ABSOLUTE)
1094 fprintf (stderr, "cannot use more than one of -d, -D, -r\n");
1097 option_cmp = CMP_DIFFERENCE;
1106 option_factor = atof (optarg);
1107 if (option_factor <= 1.0)
1109 fprintf (stderr, "-f factor must be > 1.0\n");
1114 speed_option_set (optarg);
1118 option_gnuplot_basename = optarg;
1121 speed_precision = atoi (optarg);
1124 option_seed = time (NULL);
1127 if (option_cmp != CMP_ABSOLUTE)
1129 option_cmp = CMP_RATIO;
1134 for (s = strtok (optarg, ","); s != NULL; s = strtok (NULL, ","))
1136 if (size_num == size_allocnum)
1138 size_array = (struct size_array_t *)
1139 __gmp_allocate_or_reallocate
1141 size_allocnum * sizeof(size_array[0]),
1142 (size_allocnum+10) * sizeof(size_array[0]));
1143 size_allocnum += 10;
1145 if (sscanf (s, "%ld-%ld",
1146 &size_array[size_num].start,
1147 &size_array[size_num].end) != 2)
1149 size_array[size_num].start = size_array[size_num].end
1153 if (size_array[size_num].start < 0
1154 || size_array[size_num].end < 0
1155 || size_array[size_num].start > size_array[size_num].end)
1157 fprintf (stderr, "invalid size parameter: %s\n", s);
1166 option_step = atol (optarg);
1167 if (option_step < 1)
1169 fprintf (stderr, "-t step must be >= 1\n");
1174 option_resource_usage = 1;
1180 sp.align_xp = atol (optarg);
1181 check_align_option ("-x", sp.align_xp);
1184 sp.align_yp = atol (optarg);
1185 check_align_option ("-y", sp.align_yp);
1188 sp.align_wp = atol (optarg);
1189 check_align_option ("-w", sp.align_wp);
1192 sp.align_wp2 = atol (optarg);
1193 check_align_option ("-W", sp.align_wp2);
1208 fprintf (stderr, "-s <size> must be specified\n");
1212 gmp_randinit_default (__gmp_rands);
1213 __gmp_rands_initialized = 1;
1214 gmp_randseed_ui (__gmp_rands, option_seed);
1216 choice = (struct choice_t *) (*__gmp_allocate_func)
1217 ((argc - optind) * sizeof(choice[0]));
1218 for ( ; optind < argc; optind++)
1221 routine_find (&c, argv[optind]);
1222 choice[num_choices] = c;
1226 if ((option_cmp == CMP_RATIO || option_cmp == CMP_DIFFERENCE) &&
1229 fprintf (stderr, "WARNING, -d or -r does nothing when only one routine requested\n");
1233 if (option_unit == UNIT_CYCLES || option_unit == UNIT_CYCLESPERLIMB)
1234 speed_cycletime_need_cycles ();
1236 speed_cycletime_need_seconds ();
1240 run_gnuplot (argc, argv);
1244 if (option_unit == UNIT_SECONDS)
1245 printf ("overhead %.9f secs", speed_measure (speed_noop, NULL));
1247 printf ("overhead %.2f cycles",
1248 speed_measure (speed_noop, NULL) / speed_cycletime);
1249 printf (", precision %d units of %.2e secs",
1250 speed_precision, speed_unittime);
1252 if (speed_cycletime == 1.0 || speed_cycletime == 0.0)
1253 printf (", CPU freq unknown\n");
1255 printf (", CPU freq %.2f MHz\n", 1e-6/speed_cycletime);
1258 for (i = 0; i < num_choices; i++)
1259 printf (" %*s", COLUMN_WIDTH, choice[i].name);
1265 if (option_resource_usage)
1269 /* This doesn't give data sizes on linux 2.0.x, only utime. */
1271 if (getrusage (RUSAGE_SELF, &r) != 0)
1272 perror ("getrusage");
1274 printf ("getrusage(): utime %ld.%06ld data %ld stack %ld maxresident %ld\n",
1275 r.ru_utime.tv_sec, r.ru_utime.tv_usec,
1276 r.ru_idrss, r.ru_isrss, r.ru_ixrss);
1279 printf ("getrusage() not available\n");
1285 sprintf (buf, "/proc/%d/status", getpid());
1286 if (access (buf, R_OK) == 0)
1288 sprintf (buf, "cat /proc/%d/status", getpid());