1 /* Shared speed subroutines.
3 Copyright 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2008, 2009, 2010
4 Free Software Foundation, Inc.
6 This file is part of the GNU MP Library.
8 The GNU MP Library is free software; you can redistribute it and/or modify
9 it under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or (at your
11 option) any later version.
13 The GNU MP Library is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16 License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
21 #define __GMP_NO_ATTRIBUTE_CONST_PURE
27 #include <stdlib.h> /* for qsort */
31 #include <sys/ioctl.h>
/* Global option flags for the speed/tune programs, set from "-o" strings
   by speed_option_set().  */
42 int speed_option_addrs = 0;   /* when set, speed_cache_fill() prints operand addresses */
43 int speed_option_verbose = 0; /* verbosity level; >=3 makes speed_measure() trace each attempt */
46 /* Provide __clz_tab even if it's not required, for the benefit of new code
47 being tested with many.pl. */
48 #ifndef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
49 #define COUNT_LEADING_ZEROS_NEED_CLZ_TAB
50 #include "mp_clz_tab.c"
51 #undef COUNT_LEADING_ZEROS_NEED_CLZ_TAB
64 fd = open ("/dev/wbinvd", O_RDWR);
66 perror ("open /dev/wbinvd");
75 #define WBINVDSIZE 1024*1024*2
77 static char *p = NULL;
81 p = malloc (WBINVDSIZE);
84 for (i = 0; i < WBINVDSIZE; i++)
89 for (i = 0; i < WBINVDSIZE; i++)
92 mpn_cache_fill_dummy (sum);
/* qsort() comparator ordering doubles ascending; used by speed_measure()
   to sort its t[] measurements.  NOTE(review): the equal case (return 0)
   and surrounding braces are elided in this excerpt -- confirm in the
   full source.  */
99 double_cmp_ptr (const double *p, const double *q)
101 if (*p > *q) return 1;
102 if (*p < *q) return -1;
107 /* Measure the speed of a given routine.
109 The routine is run with enough repetitions to make it take at least
110 speed_precision * speed_unittime. This aims to minimize the effects of a
111 limited accuracy time base and the overhead of the measuring itself.
113 Measurements are made looking for 4 results within TOLERANCE of each
114 other (or 3 for routines taking longer than 2 seconds). This aims to get
115 an accurate reading even if some runs are bloated by interrupts or task
116 switches or whatever.
118 The given (*fun)() is expected to run its function "s->reps" many times
119 and return the total elapsed time measured using speed_starttime() and
120 speed_endtime(). If the function doesn't support the given s->size or
121 s->r, -1.0 should be returned. See the various base routines below. */
/* Measure (*fun)(s), rerunning and growing s->reps until individual timings
   exceed speed_precision*speed_unittime, then demand several runs agreeing
   within TOLERANCE before returning the smallest.  (Many lines of this
   function are elided in this excerpt; comments below describe only what is
   visible.)  */
124 speed_measure (double (*fun) __GMP_PROTO ((struct speed_params *s)),
125 struct speed_params *s)
127 #define TOLERANCE 1.005 /* 0.5% */
128 const int max_zeros = 10;
130 struct speed_params s_dummy;
133 double t_unsorted[30];
137 /* Use dummy parameters if caller doesn't provide any. Only a few special
138 "fun"s will cope with this, speed_noop() is one. */
141 memset (&s_dummy, '\0', sizeof (s_dummy));
146 s->time_divisor = 1.0;
147 for (i = 0; i < numberof (t); i++)
156 if (speed_option_verbose >= 3)
/* trace each raw measurement when very verbose */
157 gmp_printf("size=%ld reps=%u r=%Md attempt=%d %.9f\n",
158 (long) s->size, s->reps, s->r, i, t[i]);
/* too many 0.0 timings means the clock never ticked -- give up */
163 if (zeros > max_zeros)
165 fprintf (stderr, "Fatal error: too many (%d) failed measurements (0.0)\n", zeros);
174 if (t[i] >= speed_unittime * speed_precision)
177 /* go to a value of reps to make t[i] >= precision */
178 reps_d = ceil (1.1 * s->reps
179 * speed_unittime * speed_precision
180 / MAX (t[i], speed_unittime));
/* sanity-check the scaled rep count before truncating to unsigned */
181 if (reps_d > 2e9 || reps_d < 1.0)
183 fprintf (stderr, "Fatal error: new reps bad: %.2f\n", reps_d);
184 fprintf (stderr, " (old reps %u, unittime %.4g, precision %d, t[i] %.4g)\n",
185 s->reps, speed_unittime, speed_precision, t[i]);
188 s->reps = (unsigned) reps_d;
/* keep an unsorted copy of t[] for the diagnostic dump below */
191 t_unsorted[i] = t[i];
193 if (speed_precision == 0)
196 /* require 3 values within TOLERANCE when >= 2 secs, 4 when below */
202 /* Look for e many t[]'s within TOLERANCE of each other to consider a
203 valid measurement. Return smallest among them. */
206 qsort (t, i+1, sizeof(t[0]), (qsort_function_t) double_cmp_ptr);
207 for (j = e-1; j < i; j++)
208 if (t[j] <= t[j-e+1] * TOLERANCE)
209 return t[j-e+1] / s->time_divisor;
/* no e consistent results -- dump everything measured and fail */
213 fprintf (stderr, "speed_measure() could not get %d results within %.1f%%\n",
214 e, (TOLERANCE-1.0)*100.0);
215 fprintf (stderr, " unsorted sorted\n");
216 fprintf (stderr, " %.12f %.12f is about 0.5%%\n",
217 t_unsorted[0]*(TOLERANCE-1.0), t[0]*(TOLERANCE-1.0));
218 for (i = 0; i < numberof (t); i++)
219 fprintf (stderr, " %.09f %.09f\n", t_unsorted[i], t[i]);
225 /* Read all of ptr,size to get it into the CPU memory cache.
227 A call to mpn_cache_fill_dummy() is used to make sure the compiler
228 doesn't optimize away the whole loop. Using "volatile mp_limb_t sum"
229 would work too, but the function call means we don't rely on every
230 compiler actually implementing volatile properly.
232 mpn_cache_fill_dummy() is in a separate source file to stop gcc thinking
/* Read every limb of {ptr,size} to pull it into the CPU data cache.
   The call to mpn_cache_fill_dummy() (in a separate translation unit)
   stops the compiler optimizing the otherwise-dead loop away.  */
236 mpn_cache_fill (mp_srcptr ptr, mp_size_t size)
241 for (i = 0; i < size; i++)
244 mpn_cache_fill_dummy(sum);
/* Prime {ptr,size} for writing: read it into cache via mpn_cache_fill(),
   then store fresh random limbs so the lines are also in a written/dirty
   state.  */
249 mpn_cache_fill_write (mp_ptr ptr, mp_size_t size)
251 mpn_cache_fill (ptr, size);
254 mpn_random (ptr, size);
260 for (i = 0; i < size; i++)
/* Register {ptr,size} as a source operand in s->src[], to be cache-primed
   by speed_cache_fill() before timing.  Aborts (to stderr) when the fixed
   s->src[] table is full.  */
267 speed_operand_src (struct speed_params *s, mp_ptr ptr, mp_size_t size)
269 if (s->src_num >= numberof (s->src))
271 fprintf (stderr, "speed_operand_src: no room left in s->src[]\n");
274 s->src[s->src_num].ptr = ptr;
275 s->src[s->src_num].size = size;
/* Register {ptr,size} as a destination operand in s->dst[], to be
   write-primed by speed_cache_fill() before timing.  Aborts (to stderr)
   when the fixed s->dst[] table is full.  */
281 speed_operand_dst (struct speed_params *s, mp_ptr ptr, mp_size_t size)
283 if (s->dst_num >= numberof (s->dst))
285 fprintf (stderr, "speed_operand_dst: no room left in s->dst[]\n");
288 s->dst[s->dst_num].ptr = ptr;
289 s->dst[s->dst_num].size = size;
/* Prime the cache for all operands registered in s->dst[] and s->src[]:
   destinations via mpn_cache_fill_write(), sources via mpn_cache_fill().
   Under -o addrs, print operand addresses -- but only when they differ
   from the previous call (tracked in static "prev") to avoid repeated
   output.  (Some lines are elided in this excerpt.)  */
295 speed_cache_fill (struct speed_params *s)
297 static struct speed_params prev;
300 /* FIXME: need a better way to get the format string for a pointer */
302 if (speed_option_addrs)
/* compare operand counts and pointers against the previous call */
306 different = (s->dst_num != prev.dst_num || s->src_num != prev.src_num);
307 for (i = 0; i < s->dst_num; i++)
308 different |= (s->dst[i].ptr != prev.dst[i].ptr);
309 for (i = 0; i < s->src_num; i++)
310 different |= (s->src[i].ptr != prev.src[i].ptr);
317 for (i = 0; i < s->dst_num; i++)
318 printf (" %08lX", (unsigned long) s->dst[i].ptr);
325 for (i = 0; i < s->src_num; i++)
326 printf (" %08lX", (unsigned long) s->src[i].ptr);
/* &different is a handy approximation to the current stack pointer */
329 printf (" (cf sp approx %08lX)\n", (unsigned long) &different);
333 memcpy (&prev, s, sizeof(prev));
/* now actually prime the caches */
338 for (i = 0; i < s->dst_num; i++)
339 mpn_cache_fill_write (s->dst[i].ptr, s->dst[i].size);
340 for (i = 0; i < s->src_num; i++)
341 mpn_cache_fill (s->src[i].ptr, s->src[i].size);
350 /* Miscellaneous options accepted by tune and speed programs under -o. */
/* Parse one "-o" option string: "addrs" sets speed_option_addrs,
   "verbose" increments speed_option_verbose, "verbose=N" sets it to N;
   anything else is reported as unrecognised.  */
353 speed_option_set (const char *s)
357 if (strcmp (s, "addrs") == 0)
359 speed_option_addrs = 1;
361 else if (strcmp (s, "verbose") == 0)
363 speed_option_verbose++;
365 else if (sscanf (s, "verbose=%d", &n) == 1)
367 speed_option_verbose = n;
371 printf ("Unrecognised -o option: %s\n", s);
377 /* The following are basic speed running routines for various gmp functions.
378 Many are very similar and use speed.h macros.
380 Each routine allocates its own destination space for the result of the
381 function, because only it can know what the function needs.
383 speed_starttime() and speed_endtime() are put tight around the code to be
384 measured. Any setups are done outside the timed portion.
386 Each routine is responsible for its own cache priming.
387 speed_cache_fill() is a good way to do this, see examples in speed.h.
388 One cache priming possibility, for CPUs with write-allocate cache, and
389 functions that don't take too long, is to do one dummy call before timing
390 so as to cache everything that gets used. But speed_measure() runs a
391 routine at least twice and will take the smaller time, so this might not
394 Data alignment will be important, for source, destination and temporary
395 workspace. A routine can align its destination and workspace. Programs
396 using the routines will ensure s->xp and s->yp are aligned. Aligning
397 onto a CACHE_LINE_SIZE boundary is suggested. s->align_wp and
398 s->align_wp2 should be respected where it makes sense to do so.
399 SPEED_TMP_ALLOC_LIMBS is a good way to do this.
401 A loop of the following form can be expected to turn into good assembler
402 code on most CPUs, thereby minimizing overhead in the measurement. It
403 can always be assumed s->reps >= 1.
410 Additional parameters might be added to "struct speed_params" in the
411 future. Routines should ignore anything they don't use.
413 s->size can be used creatively, and s->xp and s->yp can be ignored. For
414 example, speed_mpz_fac_ui() uses s->size as n for the factorial. s->r is
415 just a user-supplied parameter. speed_mpn_lshift() uses it as a shift,
416 speed_mpn_mul_1() uses it as a multiplier. */
419 /* MPN_COPY etc can be macros, so the _CALL forms are necessary */
421 speed_MPN_COPY (struct speed_params *s)
423 SPEED_ROUTINE_MPN_COPY (MPN_COPY);
426 speed_MPN_COPY_INCR (struct speed_params *s)
428 SPEED_ROUTINE_MPN_COPY (MPN_COPY_INCR);
431 speed_MPN_COPY_DECR (struct speed_params *s)
433 SPEED_ROUTINE_MPN_COPY (MPN_COPY_DECR);
435 #if HAVE_NATIVE_mpn_copyi
437 speed_mpn_copyi (struct speed_params *s)
439 SPEED_ROUTINE_MPN_COPY (mpn_copyi);
442 #if HAVE_NATIVE_mpn_copyd
444 speed_mpn_copyd (struct speed_params *s)
446 SPEED_ROUTINE_MPN_COPY (mpn_copyd);
450 speed_memcpy (struct speed_params *s)
452 SPEED_ROUTINE_MPN_COPY_BYTES (memcpy);
455 speed_mpn_com (struct speed_params *s)
457 SPEED_ROUTINE_MPN_COPY (mpn_com);
462 speed_mpn_addmul_1 (struct speed_params *s)
464 SPEED_ROUTINE_MPN_UNARY_1 (mpn_addmul_1);
467 speed_mpn_submul_1 (struct speed_params *s)
469 SPEED_ROUTINE_MPN_UNARY_1 (mpn_submul_1);
472 #if HAVE_NATIVE_mpn_addmul_2
474 speed_mpn_addmul_2 (struct speed_params *s)
476 SPEED_ROUTINE_MPN_UNARY_2 (mpn_addmul_2);
479 #if HAVE_NATIVE_mpn_addmul_3
481 speed_mpn_addmul_3 (struct speed_params *s)
483 SPEED_ROUTINE_MPN_UNARY_3 (mpn_addmul_3);
486 #if HAVE_NATIVE_mpn_addmul_4
488 speed_mpn_addmul_4 (struct speed_params *s)
490 SPEED_ROUTINE_MPN_UNARY_4 (mpn_addmul_4);
493 #if HAVE_NATIVE_mpn_addmul_5
495 speed_mpn_addmul_5 (struct speed_params *s)
497 SPEED_ROUTINE_MPN_UNARY_5 (mpn_addmul_5);
500 #if HAVE_NATIVE_mpn_addmul_6
502 speed_mpn_addmul_6 (struct speed_params *s)
504 SPEED_ROUTINE_MPN_UNARY_6 (mpn_addmul_6);
507 #if HAVE_NATIVE_mpn_addmul_7
509 speed_mpn_addmul_7 (struct speed_params *s)
511 SPEED_ROUTINE_MPN_UNARY_7 (mpn_addmul_7);
514 #if HAVE_NATIVE_mpn_addmul_8
516 speed_mpn_addmul_8 (struct speed_params *s)
518 SPEED_ROUTINE_MPN_UNARY_8 (mpn_addmul_8);
523 speed_mpn_mul_1 (struct speed_params *s)
525 SPEED_ROUTINE_MPN_UNARY_1 (mpn_mul_1);
528 speed_mpn_mul_1_inplace (struct speed_params *s)
530 SPEED_ROUTINE_MPN_UNARY_1_INPLACE (mpn_mul_1);
533 #if HAVE_NATIVE_mpn_mul_2
535 speed_mpn_mul_2 (struct speed_params *s)
537 SPEED_ROUTINE_MPN_UNARY_2 (mpn_mul_2);
540 #if HAVE_NATIVE_mpn_mul_3
542 speed_mpn_mul_3 (struct speed_params *s)
544 SPEED_ROUTINE_MPN_UNARY_3 (mpn_mul_3);
547 #if HAVE_NATIVE_mpn_mul_4
549 speed_mpn_mul_4 (struct speed_params *s)
551 SPEED_ROUTINE_MPN_UNARY_4 (mpn_mul_4);
557 speed_mpn_lshift (struct speed_params *s)
559 SPEED_ROUTINE_MPN_UNARY_1 (mpn_lshift);
562 speed_mpn_lshiftc (struct speed_params *s)
564 SPEED_ROUTINE_MPN_UNARY_1 (mpn_lshiftc);
567 speed_mpn_rshift (struct speed_params *s)
569 SPEED_ROUTINE_MPN_UNARY_1 (mpn_rshift);
573 /* The carry-in variants (if available) are good for measuring because they
574 won't skip a division if high<divisor. Alternately, use -1 as a divisor
575 with the plain _1 forms. */
577 speed_mpn_divrem_1 (struct speed_params *s)
579 SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1);
582 speed_mpn_divrem_1f (struct speed_params *s)
584 SPEED_ROUTINE_MPN_DIVREM_1F (mpn_divrem_1);
586 #if HAVE_NATIVE_mpn_divrem_1c
588 speed_mpn_divrem_1c (struct speed_params *s)
590 SPEED_ROUTINE_MPN_DIVREM_1C (mpn_divrem_1c);
593 speed_mpn_divrem_1cf (struct speed_params *s)
595 SPEED_ROUTINE_MPN_DIVREM_1CF (mpn_divrem_1c);
600 speed_mpn_divrem_1_div (struct speed_params *s)
602 SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_div);
605 speed_mpn_divrem_1f_div (struct speed_params *s)
607 SPEED_ROUTINE_MPN_DIVREM_1F (mpn_divrem_1_div);
610 speed_mpn_divrem_1_inv (struct speed_params *s)
612 SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_inv);
615 speed_mpn_divrem_1f_inv (struct speed_params *s)
617 SPEED_ROUTINE_MPN_DIVREM_1F (mpn_divrem_1_inv);
620 speed_mpn_mod_1_div (struct speed_params *s)
622 SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_div);
625 speed_mpn_mod_1_inv (struct speed_params *s)
627 SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_inv);
631 speed_mpn_preinv_divrem_1 (struct speed_params *s)
633 SPEED_ROUTINE_MPN_PREINV_DIVREM_1 (mpn_preinv_divrem_1);
636 speed_mpn_preinv_divrem_1f (struct speed_params *s)
638 SPEED_ROUTINE_MPN_PREINV_DIVREM_1F (mpn_preinv_divrem_1);
641 #if GMP_NUMB_BITS % 4 == 0
643 speed_mpn_mod_34lsub1 (struct speed_params *s)
645 SPEED_ROUTINE_MPN_MOD_34LSUB1 (mpn_mod_34lsub1);
650 speed_mpn_divrem_2 (struct speed_params *s)
652 SPEED_ROUTINE_MPN_DIVREM_2 (mpn_divrem_2);
655 speed_mpn_divrem_2_div (struct speed_params *s)
657 SPEED_ROUTINE_MPN_DIVREM_2 (mpn_divrem_2_div);
660 speed_mpn_divrem_2_inv (struct speed_params *s)
662 SPEED_ROUTINE_MPN_DIVREM_2 (mpn_divrem_2_inv);
666 speed_mpn_mod_1 (struct speed_params *s)
668 SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1);
670 #if HAVE_NATIVE_mpn_mod_1c
672 speed_mpn_mod_1c (struct speed_params *s)
674 SPEED_ROUTINE_MPN_MOD_1C (mpn_mod_1c);
678 speed_mpn_preinv_mod_1 (struct speed_params *s)
680 SPEED_ROUTINE_MPN_PREINV_MOD_1 (mpn_preinv_mod_1);
683 speed_mpn_mod_1_1 (struct speed_params *s)
685 SPEED_ROUTINE_MPN_MOD_1_1 (mpn_mod_1_1p,mpn_mod_1_1p_cps);
688 speed_mpn_mod_1_2 (struct speed_params *s)
690 SPEED_ROUTINE_MPN_MOD_1_N (mpn_mod_1s_2p,mpn_mod_1s_2p_cps,2);
693 speed_mpn_mod_1_3 (struct speed_params *s)
695 SPEED_ROUTINE_MPN_MOD_1_N (mpn_mod_1s_3p,mpn_mod_1s_3p_cps,3);
698 speed_mpn_mod_1_4 (struct speed_params *s)
700 SPEED_ROUTINE_MPN_MOD_1_N (mpn_mod_1s_4p,mpn_mod_1s_4p_cps,4);
704 speed_mpn_divexact_1 (struct speed_params *s)
706 SPEED_ROUTINE_MPN_DIVEXACT_1 (mpn_divexact_1);
710 speed_mpn_divexact_by3 (struct speed_params *s)
712 SPEED_ROUTINE_MPN_COPY (mpn_divexact_by3);
716 speed_mpn_bdiv_dbm1c (struct speed_params *s)
718 SPEED_ROUTINE_MPN_BDIV_DBM1C (mpn_bdiv_dbm1c);
722 speed_mpn_bdiv_q_1 (struct speed_params *s)
724 SPEED_ROUTINE_MPN_BDIV_Q_1 (mpn_bdiv_q_1);
728 speed_mpn_pi1_bdiv_q_1 (struct speed_params *s)
730 SPEED_ROUTINE_MPN_PI1_BDIV_Q_1 (mpn_pi1_bdiv_q_1);
733 #if HAVE_NATIVE_mpn_modexact_1_odd
735 speed_mpn_modexact_1_odd (struct speed_params *s)
737 SPEED_ROUTINE_MPN_MODEXACT_1_ODD (mpn_modexact_1_odd);
742 speed_mpn_modexact_1c_odd (struct speed_params *s)
744 SPEED_ROUTINE_MPN_MODEXACT_1C_ODD (mpn_modexact_1c_odd);
748 speed_mpz_mod (struct speed_params *s)
750 SPEED_ROUTINE_MPZ_MOD (mpz_mod);
754 speed_mpn_sbpi1_div_qr (struct speed_params *s)
756 SPEED_ROUTINE_MPN_PI1_DIV (mpn_sbpi1_div_qr, inv.inv32, 2,0);
759 speed_mpn_dcpi1_div_qr (struct speed_params *s)
761 SPEED_ROUTINE_MPN_PI1_DIV (mpn_dcpi1_div_qr, &inv, 6,3);
764 speed_mpn_sbpi1_divappr_q (struct speed_params *s)
766 SPEED_ROUTINE_MPN_PI1_DIV (mpn_sbpi1_divappr_q, inv.inv32, 2,0);
769 speed_mpn_dcpi1_divappr_q (struct speed_params *s)
771 SPEED_ROUTINE_MPN_PI1_DIV (mpn_dcpi1_divappr_q, &inv, 6,3);
774 speed_mpn_mu_div_qr (struct speed_params *s)
776 SPEED_ROUTINE_MPN_MU_DIV_QR (mpn_mu_div_qr, mpn_mu_div_qr_itch);
779 speed_mpn_mu_divappr_q (struct speed_params *s)
781 SPEED_ROUTINE_MPN_MU_DIV_Q (mpn_mu_divappr_q, mpn_mu_divappr_q_itch);
784 speed_mpn_mu_div_q (struct speed_params *s)
786 SPEED_ROUTINE_MPN_MU_DIV_Q (mpn_mu_div_q, mpn_mu_div_q_itch);
789 speed_mpn_mupi_div_qr (struct speed_params *s)
791 SPEED_ROUTINE_MPN_MUPI_DIV_QR (mpn_preinv_mu_div_qr, mpn_mu_div_qr_itch);
795 speed_mpn_sbpi1_bdiv_qr (struct speed_params *s)
797 SPEED_ROUTINE_MPN_PI1_BDIV_QR (mpn_sbpi1_bdiv_qr);
800 speed_mpn_dcpi1_bdiv_qr (struct speed_params *s)
802 SPEED_ROUTINE_MPN_PI1_BDIV_QR (mpn_dcpi1_bdiv_qr);
805 speed_mpn_sbpi1_bdiv_q (struct speed_params *s)
807 SPEED_ROUTINE_MPN_PI1_BDIV_Q (mpn_sbpi1_bdiv_q);
810 speed_mpn_dcpi1_bdiv_q (struct speed_params *s)
812 SPEED_ROUTINE_MPN_PI1_BDIV_Q (mpn_dcpi1_bdiv_q);
815 speed_mpn_mu_bdiv_q (struct speed_params *s)
817 SPEED_ROUTINE_MPN_MU_BDIV_Q (mpn_mu_bdiv_q, mpn_mu_bdiv_q_itch);
820 speed_mpn_mu_bdiv_qr (struct speed_params *s)
822 SPEED_ROUTINE_MPN_MU_BDIV_QR (mpn_mu_bdiv_qr, mpn_mu_bdiv_qr_itch);
826 speed_mpn_binvert (struct speed_params *s)
828 SPEED_ROUTINE_MPN_BINVERT (mpn_binvert, mpn_binvert_itch);
832 speed_mpn_invert (struct speed_params *s)
834 SPEED_ROUTINE_MPN_INVERT (mpn_invert, mpn_invert_itch);
838 speed_mpn_invertappr (struct speed_params *s)
840 SPEED_ROUTINE_MPN_INVERTAPPR (mpn_invertappr, mpn_invertappr_itch);
844 speed_mpn_ni_invertappr (struct speed_params *s)
846 SPEED_ROUTINE_MPN_INVERTAPPR (mpn_ni_invertappr, mpn_invertappr_itch);
850 speed_mpn_redc_1 (struct speed_params *s)
852 SPEED_ROUTINE_REDC_1 (mpn_redc_1);
855 speed_mpn_redc_2 (struct speed_params *s)
857 SPEED_ROUTINE_REDC_2 (mpn_redc_2);
860 speed_mpn_redc_n (struct speed_params *s)
862 SPEED_ROUTINE_REDC_N (mpn_redc_n);
867 speed_mpn_popcount (struct speed_params *s)
869 SPEED_ROUTINE_MPN_POPCOUNT (mpn_popcount);
872 speed_mpn_hamdist (struct speed_params *s)
874 SPEED_ROUTINE_MPN_HAMDIST (mpn_hamdist);
879 speed_mpn_add_n (struct speed_params *s)
881 SPEED_ROUTINE_MPN_BINARY_N (mpn_add_n);
884 speed_mpn_sub_n (struct speed_params *s)
886 SPEED_ROUTINE_MPN_BINARY_N (mpn_sub_n);
889 #if HAVE_NATIVE_mpn_add_n_sub_n
891 speed_mpn_add_n_sub_n (struct speed_params *s)
893 SPEED_ROUTINE_MPN_ADDSUB_N_CALL (mpn_add_n_sub_n (ap, sp, s->xp, s->yp, s->size));
897 #if HAVE_NATIVE_mpn_addlsh1_n
899 speed_mpn_addlsh1_n (struct speed_params *s)
901 SPEED_ROUTINE_MPN_BINARY_N (mpn_addlsh1_n);
904 #if HAVE_NATIVE_mpn_sublsh1_n
906 speed_mpn_sublsh1_n (struct speed_params *s)
908 SPEED_ROUTINE_MPN_BINARY_N (mpn_sublsh1_n);
911 #if HAVE_NATIVE_mpn_rsblsh1_n
913 speed_mpn_rsblsh1_n (struct speed_params *s)
915 SPEED_ROUTINE_MPN_BINARY_N (mpn_rsblsh1_n);
918 #if HAVE_NATIVE_mpn_addlsh2_n
920 speed_mpn_addlsh2_n (struct speed_params *s)
922 SPEED_ROUTINE_MPN_BINARY_N (mpn_addlsh2_n);
925 #if HAVE_NATIVE_mpn_sublsh2_n
927 speed_mpn_sublsh2_n (struct speed_params *s)
929 SPEED_ROUTINE_MPN_BINARY_N (mpn_sublsh2_n);
932 #if HAVE_NATIVE_mpn_rsblsh2_n
934 speed_mpn_rsblsh2_n (struct speed_params *s)
936 SPEED_ROUTINE_MPN_BINARY_N (mpn_rsblsh2_n);
939 #if HAVE_NATIVE_mpn_rsh1add_n
941 speed_mpn_rsh1add_n (struct speed_params *s)
943 SPEED_ROUTINE_MPN_BINARY_N (mpn_rsh1add_n);
946 #if HAVE_NATIVE_mpn_rsh1sub_n
948 speed_mpn_rsh1sub_n (struct speed_params *s)
950 SPEED_ROUTINE_MPN_BINARY_N (mpn_rsh1sub_n);
954 /* mpn_and_n etc can be macros and so have to be handled with
955 SPEED_ROUTINE_MPN_BINARY_N_CALL forms */
957 speed_mpn_and_n (struct speed_params *s)
959 SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_and_n (wp, s->xp, s->yp, s->size));
962 speed_mpn_andn_n (struct speed_params *s)
964 SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_andn_n (wp, s->xp, s->yp, s->size));
967 speed_mpn_nand_n (struct speed_params *s)
969 SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_nand_n (wp, s->xp, s->yp, s->size));
972 speed_mpn_ior_n (struct speed_params *s)
974 SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_ior_n (wp, s->xp, s->yp, s->size));
977 speed_mpn_iorn_n (struct speed_params *s)
979 SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_iorn_n (wp, s->xp, s->yp, s->size));
982 speed_mpn_nior_n (struct speed_params *s)
984 SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_nior_n (wp, s->xp, s->yp, s->size));
987 speed_mpn_xor_n (struct speed_params *s)
989 SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_xor_n (wp, s->xp, s->yp, s->size));
992 speed_mpn_xnor_n (struct speed_params *s)
994 SPEED_ROUTINE_MPN_BINARY_N_CALL (mpn_xnor_n (wp, s->xp, s->yp, s->size));
999 speed_mpn_mul_n (struct speed_params *s)
1001 SPEED_ROUTINE_MPN_MUL_N (mpn_mul_n);
1004 speed_mpn_sqr (struct speed_params *s)
1006 SPEED_ROUTINE_MPN_SQR (mpn_sqr);
1009 speed_mpn_mul_n_sqr (struct speed_params *s)
1011 SPEED_ROUTINE_MPN_SQR_CALL (mpn_mul_n (wp, s->xp, s->xp, s->size));
1015 speed_mpn_mul_basecase (struct speed_params *s)
1017 SPEED_ROUTINE_MPN_MUL(mpn_mul_basecase);
1020 speed_mpn_mul (struct speed_params *s)
1022 SPEED_ROUTINE_MPN_MUL(mpn_mul);
1025 speed_mpn_sqr_basecase (struct speed_params *s)
1027 /* FIXME: size restrictions on some versions of sqr_basecase */
1028 SPEED_ROUTINE_MPN_SQR (mpn_sqr_basecase);
1031 #if HAVE_NATIVE_mpn_sqr_diagonal
1033 speed_mpn_sqr_diagonal (struct speed_params *s)
1035 SPEED_ROUTINE_MPN_SQR (mpn_sqr_diagonal);
1040 speed_mpn_toom2_sqr (struct speed_params *s)
1042 SPEED_ROUTINE_MPN_TOOM2_SQR (mpn_toom2_sqr);
1045 speed_mpn_toom3_sqr (struct speed_params *s)
1047 SPEED_ROUTINE_MPN_TOOM3_SQR (mpn_toom3_sqr);
1050 speed_mpn_toom4_sqr (struct speed_params *s)
1052 SPEED_ROUTINE_MPN_TOOM4_SQR (mpn_toom4_sqr);
1055 speed_mpn_toom6_sqr (struct speed_params *s)
1057 SPEED_ROUTINE_MPN_TOOM6_SQR (mpn_toom6_sqr);
1060 speed_mpn_toom8_sqr (struct speed_params *s)
1062 SPEED_ROUTINE_MPN_TOOM8_SQR (mpn_toom8_sqr);
1065 speed_mpn_toom22_mul (struct speed_params *s)
1067 SPEED_ROUTINE_MPN_TOOM22_MUL_N (mpn_toom22_mul);
1070 speed_mpn_toom33_mul (struct speed_params *s)
1072 SPEED_ROUTINE_MPN_TOOM33_MUL_N (mpn_toom33_mul);
1075 speed_mpn_toom44_mul (struct speed_params *s)
1077 SPEED_ROUTINE_MPN_TOOM44_MUL_N (mpn_toom44_mul);
1080 speed_mpn_toom6h_mul (struct speed_params *s)
1082 SPEED_ROUTINE_MPN_TOOM6H_MUL_N (mpn_toom6h_mul);
1085 speed_mpn_toom8h_mul (struct speed_params *s)
1087 SPEED_ROUTINE_MPN_TOOM8H_MUL_N (mpn_toom8h_mul);
1091 speed_mpn_toom32_mul (struct speed_params *s)
1093 SPEED_ROUTINE_MPN_TOOM32_MUL (mpn_toom32_mul);
1096 speed_mpn_toom42_mul (struct speed_params *s)
1098 SPEED_ROUTINE_MPN_TOOM42_MUL (mpn_toom42_mul);
1101 speed_mpn_toom43_mul (struct speed_params *s)
1103 SPEED_ROUTINE_MPN_TOOM43_MUL (mpn_toom43_mul);
1106 speed_mpn_toom63_mul (struct speed_params *s)
1108 SPEED_ROUTINE_MPN_TOOM63_MUL (mpn_toom63_mul);
1111 speed_mpn_toom32_for_toom43_mul (struct speed_params *s)
1113 SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM43_MUL (mpn_toom32_mul);
1116 speed_mpn_toom43_for_toom32_mul (struct speed_params *s)
1118 SPEED_ROUTINE_MPN_TOOM43_FOR_TOOM32_MUL (mpn_toom43_mul);
1121 speed_mpn_toom32_for_toom53_mul (struct speed_params *s)
1123 SPEED_ROUTINE_MPN_TOOM32_FOR_TOOM53_MUL (mpn_toom32_mul);
1126 speed_mpn_toom53_for_toom32_mul (struct speed_params *s)
1128 SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM32_MUL (mpn_toom53_mul);
1131 speed_mpn_toom42_for_toom53_mul (struct speed_params *s)
1133 SPEED_ROUTINE_MPN_TOOM42_FOR_TOOM53_MUL (mpn_toom42_mul);
1136 speed_mpn_toom53_for_toom42_mul (struct speed_params *s)
1138 SPEED_ROUTINE_MPN_TOOM53_FOR_TOOM42_MUL (mpn_toom53_mul);
1142 speed_mpn_nussbaumer_mul (struct speed_params *s)
1144 SPEED_ROUTINE_MPN_MUL_N_CALL
1145 (mpn_nussbaumer_mul (wp, s->xp, s->size, s->yp, s->size));
1148 speed_mpn_nussbaumer_mul_sqr (struct speed_params *s)
1150 SPEED_ROUTINE_MPN_SQR_CALL
1151 (mpn_nussbaumer_mul (wp, s->xp, s->size, s->xp, s->size));
1154 #if WANT_OLD_FFT_FULL
1156 speed_mpn_mul_fft_full (struct speed_params *s)
1158 SPEED_ROUTINE_MPN_MUL_N_CALL
1159 (mpn_mul_fft_full (wp, s->xp, s->size, s->yp, s->size));
1162 speed_mpn_mul_fft_full_sqr (struct speed_params *s)
1164 SPEED_ROUTINE_MPN_SQR_CALL
1165 (mpn_mul_fft_full (wp, s->xp, s->size, s->xp, s->size));
1169 /* These are mod 2^N+1 multiplies and squares. If s->r is supplied it's
1170 used as k, otherwise the best k for the size is used. If s->size isn't a
1171 multiple of 2^k it's rounded up to make the effective operation size. */
1173 #define SPEED_ROUTINE_MPN_MUL_FFT_CALL(call, sqr) \
1182 SPEED_RESTRICT_COND (s->size >= 1); \
1187 k = mpn_fft_best_k (s->size, sqr); \
1190 pl = mpn_fft_next_size (s->size, k); \
1191 SPEED_TMP_ALLOC_LIMBS (wp, pl+1, s->align_wp); \
1193 speed_operand_src (s, s->xp, s->size); \
1195 speed_operand_src (s, s->yp, s->size); \
1196 speed_operand_dst (s, wp, pl+1); \
1197 speed_cache_fill (s); \
1199 speed_starttime (); \
1204 t = speed_endtime (); \
1211 speed_mpn_mul_fft (struct speed_params *s)
1213 SPEED_ROUTINE_MPN_MUL_FFT_CALL
1214 (mpn_mul_fft (wp, pl, s->xp, s->size, s->yp, s->size, k), 0);
1218 speed_mpn_mul_fft_sqr (struct speed_params *s)
1220 SPEED_ROUTINE_MPN_MUL_FFT_CALL
1221 (mpn_mul_fft (wp, pl, s->xp, s->size, s->xp, s->size, k), 1);
1225 speed_mpn_fft_mul (struct speed_params *s)
1227 SPEED_ROUTINE_MPN_MUL_N_CALL (mpn_fft_mul (wp, s->xp, s->size, s->yp, s->size));
1231 speed_mpn_fft_sqr (struct speed_params *s)
1233 SPEED_ROUTINE_MPN_SQR_CALL (mpn_fft_mul (wp, s->xp, s->size, s->xp, s->size));
1237 speed_mpn_mullo_n (struct speed_params *s)
1239 SPEED_ROUTINE_MPN_MULLO_N (mpn_mullo_n);
1242 speed_mpn_mullo_basecase (struct speed_params *s)
1244 SPEED_ROUTINE_MPN_MULLO_BASECASE (mpn_mullo_basecase);
1248 speed_mpn_mulmod_bnm1 (struct speed_params *s)
1250 SPEED_ROUTINE_MPN_MULMOD_BNM1_CALL (mpn_mulmod_bnm1 (wp, s->size, s->xp, s->size, s->yp, s->size, tp));
1254 speed_mpn_bc_mulmod_bnm1 (struct speed_params *s)
1256 SPEED_ROUTINE_MPN_MULMOD_BNM1_CALL (mpn_bc_mulmod_bnm1 (wp, s->xp, s->yp, s->size, tp));
1260 speed_mpn_mulmod_bnm1_rounded (struct speed_params *s)
1262 SPEED_ROUTINE_MPN_MULMOD_BNM1_ROUNDED (mpn_mulmod_bnm1);
1266 speed_mpn_sqrmod_bnm1 (struct speed_params *s)
1268 SPEED_ROUTINE_MPN_MULMOD_BNM1_CALL (mpn_sqrmod_bnm1 (wp, s->size, s->xp, s->size, tp));
1272 speed_mpn_matrix22_mul (struct speed_params *s)
1274 /* Speed params only includes 2 inputs, so we have to invent the
1287 SPEED_TMP_ALLOC_LIMBS (a, 4 * s->size, s->align_xp);
1288 SPEED_TMP_ALLOC_LIMBS (b, 4 * s->size, s->align_yp);
1289 SPEED_TMP_ALLOC_LIMBS (r, 8 * s->size + 4, s->align_wp);
1291 MPN_COPY (a, s->xp, s->size);
1292 mpn_random (a + s->size, 3 * s->size);
1293 MPN_COPY (b, s->yp, s->size);
1294 mpn_random (b + s->size, 3 * s->size);
1296 itch = mpn_matrix22_mul_itch (s->size, s->size);
1297 SPEED_TMP_ALLOC_LIMBS (tp, itch, s->align_wp2);
1299 speed_operand_src (s, a, 4 * s->size);
1300 speed_operand_src (s, b, 4 * s->size);
1301 speed_operand_dst (s, r, 8 * s->size + 4);
1302 speed_operand_dst (s, tp, itch);
1303 speed_cache_fill (s);
1309 mp_size_t sz = s->size;
1310 MPN_COPY (r + 0 * sz + 0, a + 0 * sz, sz);
1311 MPN_COPY (r + 2 * sz + 1, a + 1 * sz, sz);
1312 MPN_COPY (r + 4 * sz + 2, a + 2 * sz, sz);
1313 MPN_COPY (r + 6 * sz + 3, a + 3 * sz, sz);
1314 mpn_matrix22_mul (r, r + 2 * sz + 1, r + 4 * sz + 2, r + 6 * sz + 3, sz,
1315 b, b + 1 * sz, b + 2 * sz, b + 3 * sz, sz,
1319 t = speed_endtime();
1325 speed_mpn_hgcd (struct speed_params *s)
1328 mp_size_t hgcd_init_scratch = MPN_HGCD_MATRIX_INIT_ITCH (s->size);
1329 mp_size_t hgcd_scratch = mpn_hgcd_itch (s->size);
1334 struct hgcd_matrix hgcd;
1345 SPEED_TMP_ALLOC_LIMBS (ap, s->size + 1, s->align_xp);
1346 SPEED_TMP_ALLOC_LIMBS (bp, s->size + 1, s->align_yp);
1348 s->xp[s->size - 1] |= 1;
1349 s->yp[s->size - 1] |= 1;
1351 SPEED_TMP_ALLOC_LIMBS (tmp1, hgcd_init_scratch, s->align_wp);
1352 SPEED_TMP_ALLOC_LIMBS (wp, hgcd_scratch, s->align_wp);
1358 MPN_COPY (ap, s->xp, s->size);
1359 MPN_COPY (bp, s->yp, s->size);
1360 mpn_hgcd_matrix_init (&hgcd, s->size, tmp1);
1361 res = mpn_hgcd (ap, bp, s->size, &hgcd, wp);
1364 t = speed_endtime ();
1370 speed_mpn_hgcd_lehmer (struct speed_params *s)
1373 mp_size_t hgcd_init_scratch = MPN_HGCD_MATRIX_INIT_ITCH (s->size);
1374 mp_size_t hgcd_scratch = MPN_HGCD_LEHMER_ITCH (s->size);
1379 struct hgcd_matrix hgcd;
1390 SPEED_TMP_ALLOC_LIMBS (ap, s->size + 1, s->align_xp);
1391 SPEED_TMP_ALLOC_LIMBS (bp, s->size + 1, s->align_yp);
1393 s->xp[s->size - 1] |= 1;
1394 s->yp[s->size - 1] |= 1;
1396 SPEED_TMP_ALLOC_LIMBS (tmp1, hgcd_init_scratch, s->align_wp);
1397 SPEED_TMP_ALLOC_LIMBS (wp, hgcd_scratch, s->align_wp);
1403 MPN_COPY (ap, s->xp, s->size);
1404 MPN_COPY (bp, s->yp, s->size);
1405 mpn_hgcd_matrix_init (&hgcd, s->size, tmp1);
1406 res = mpn_hgcd_lehmer (ap, bp, s->size, &hgcd, wp);
1409 t = speed_endtime ();
1415 speed_mpn_gcd (struct speed_params *s)
1417 SPEED_ROUTINE_MPN_GCD (mpn_gcd);
1421 speed_mpn_gcdext (struct speed_params *s)
1423 SPEED_ROUTINE_MPN_GCDEXT (mpn_gcdext);
1427 speed_mpn_gcdext_lehmer (struct speed_params *s)
1429 SPEED_ROUTINE_MPN_GCDEXT (__gmpn_gcdext_lehmer);
1433 speed_mpn_gcdext_single (struct speed_params *s)
1435 SPEED_ROUTINE_MPN_GCDEXT (mpn_gcdext_single);
1438 speed_mpn_gcdext_double (struct speed_params *s)
1440 SPEED_ROUTINE_MPN_GCDEXT (mpn_gcdext_double);
1443 speed_mpn_gcdext_one_single (struct speed_params *s)
1445 SPEED_ROUTINE_MPN_GCDEXT_ONE (mpn_gcdext_one_single);
1448 speed_mpn_gcdext_one_double (struct speed_params *s)
1450 SPEED_ROUTINE_MPN_GCDEXT_ONE (mpn_gcdext_one_double);
1453 speed_mpn_gcd_1 (struct speed_params *s)
1455 SPEED_ROUTINE_MPN_GCD_1 (mpn_gcd_1);
1458 speed_mpn_gcd_1N (struct speed_params *s)
1460 SPEED_ROUTINE_MPN_GCD_1N (mpn_gcd_1);
1465 speed_mpz_jacobi (struct speed_params *s)
1467 SPEED_ROUTINE_MPZ_JACOBI (mpz_jacobi);
1470 speed_mpn_jacobi_base (struct speed_params *s)
1472 SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base);
1475 speed_mpn_jacobi_base_1 (struct speed_params *s)
1477 SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base_1);
1480 speed_mpn_jacobi_base_2 (struct speed_params *s)
1482 SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base_2);
1485 speed_mpn_jacobi_base_3 (struct speed_params *s)
1487 SPEED_ROUTINE_MPN_JACBASE (mpn_jacobi_base_3);
1492 speed_mpn_sqrtrem (struct speed_params *s)
1494 SPEED_ROUTINE_MPN_SQRTREM (mpn_sqrtrem);
1498 speed_mpn_rootrem (struct speed_params *s)
1500 SPEED_ROUTINE_MPN_ROOTREM (mpn_rootrem);
1505 speed_mpz_fac_ui (struct speed_params *s)
1507 SPEED_ROUTINE_MPZ_FAC_UI (mpz_fac_ui);
1512 speed_mpn_fib2_ui (struct speed_params *s)
1514 SPEED_ROUTINE_MPN_FIB2_UI (mpn_fib2_ui);
1517 speed_mpz_fib_ui (struct speed_params *s)
1519 SPEED_ROUTINE_MPZ_FIB_UI (mpz_fib_ui);
1522 speed_mpz_fib2_ui (struct speed_params *s)
1524 SPEED_ROUTINE_MPZ_FIB2_UI (mpz_fib2_ui);
1527 speed_mpz_lucnum_ui (struct speed_params *s)
1529 SPEED_ROUTINE_MPZ_LUCNUM_UI (mpz_lucnum_ui);
1532 speed_mpz_lucnum2_ui (struct speed_params *s)
1534 SPEED_ROUTINE_MPZ_LUCNUM2_UI (mpz_lucnum2_ui);
1539 speed_mpz_powm (struct speed_params *s)
1541 SPEED_ROUTINE_MPZ_POWM (mpz_powm);
1544 speed_mpz_powm_mod (struct speed_params *s)
1546 SPEED_ROUTINE_MPZ_POWM (mpz_powm_mod);
1549 speed_mpz_powm_redc (struct speed_params *s)
1551 SPEED_ROUTINE_MPZ_POWM (mpz_powm_redc);
1554 speed_mpz_powm_ui (struct speed_params *s)
1556 SPEED_ROUTINE_MPZ_POWM_UI (mpz_powm_ui);
1561 speed_binvert_limb (struct speed_params *s)
1563 SPEED_ROUTINE_MODLIMB_INVERT (binvert_limb);
1568 speed_noop (struct speed_params *s)
1577 return speed_endtime ();
1581 speed_noop_wxs (struct speed_params *s)
1589 wp = TMP_ALLOC_LIMBS (1);
1594 noop_wxs (wp, s->xp, s->size);
1596 t = speed_endtime ();
1603 speed_noop_wxys (struct speed_params *s)
1611 wp = TMP_ALLOC_LIMBS (1);
1616 noop_wxys (wp, s->xp, s->yp, s->size);
1618 t = speed_endtime ();
1625 #define SPEED_ROUTINE_ALLOC_FREE(variables, calls) \
1630 speed_starttime (); \
1637 return speed_endtime (); \
1641 /* Compare these to see how much malloc/free costs and then how much
1642 __gmp_default_allocate/free and mpz_init/clear add. mpz_init/clear or
1643 mpq_init/clear will be doing a 1 limb allocate, so use that as the size
1644 when including them in comparisons. */
/* Plain malloc/free of s->size limbs' worth of bytes.  */
1647 speed_malloc_free (struct speed_params *s)
1649 size_t bytes = s->size * BYTES_PER_MP_LIMB;
1650 SPEED_ROUTINE_ALLOC_FREE (void *p,
/* malloc of one limb then realloc up to the full size, as mpz growth
   patterns tend to do.  */
1656 speed_malloc_realloc_free (struct speed_params *s)
1658 size_t bytes = s->size * BYTES_PER_MP_LIMB;
1659 SPEED_ROUTINE_ALLOC_FREE (void *p,
1660 p = malloc (BYTES_PER_MP_LIMB);
1661 p = realloc (p, bytes);
/* Same cycles through GMP's installable memory functions, to see what
   they add over raw malloc/free.  */
1666 speed_gmp_allocate_free (struct speed_params *s)
1668 size_t bytes = s->size * BYTES_PER_MP_LIMB;
1669 SPEED_ROUTINE_ALLOC_FREE (void *p,
1670 p = (*__gmp_allocate_func) (bytes);
1671 (*__gmp_free_func) (p, bytes));
1675 speed_gmp_allocate_reallocate_free (struct speed_params *s)
1677 size_t bytes = s->size * BYTES_PER_MP_LIMB;
1678 SPEED_ROUTINE_ALLOC_FREE
1680 p = (*__gmp_allocate_func) (BYTES_PER_MP_LIMB);
1681 p = (*__gmp_reallocate_func) (p, bytes, BYTES_PER_MP_LIMB);
1682 (*__gmp_free_func) (p, bytes));
/* init/clear cycles for each of the mpz/mpq/mpf types; compare with the
   raw allocate/free measurements above (use size==1 limb there).  */
1686 speed_mpz_init_clear (struct speed_params *s)
1688 SPEED_ROUTINE_ALLOC_FREE (mpz_t z,
1694 speed_mpz_init_realloc_clear (struct speed_params *s)
1696 SPEED_ROUTINE_ALLOC_FREE (mpz_t z,
1698 _mpz_realloc (z, s->size);
1703 speed_mpq_init_clear (struct speed_params *s)
1705 SPEED_ROUTINE_ALLOC_FREE (mpq_t q,
1711 speed_mpf_init_clear (struct speed_params *s)
1713 SPEED_ROUTINE_ALLOC_FREE (mpf_t f,
1719 /* Compare this to mpn_add_n to see how much overhead mpz_add adds. Note
1720 that repeatedly calling mpz_add with the same data gives branch prediction
1721 in it an advantage. */
1724 speed_mpz_add (struct speed_params *s)
/* Operands are built from the random data in s->xp/s->yp.  */
1734 mpz_set_n (x, s->xp, s->size);
1735 mpz_set_n (y, s->yp, s->size);
1745 t = speed_endtime ();
1754 /* If r==0, calculate (size,size/2),
1755 otherwise calculate (size,r). */
/* Binomial coefficient timing: n comes from s->size, k from s->r (or
   size/2 by default, per the comment above).  */
1758 speed_mpz_bin_uiui (struct speed_params *s)
1775 mpz_bin_uiui (w, s->size, k);
1778 t = speed_endtime ();
1785 /* The multiplies are successively dependent so the latency is measured, not
1786 the issue rate. There's only 10 per loop so the code doesn't get too big
1787 since umul_ppmm is several instructions on some cpus.
1789 Putting the arguments as "h,l,l,h" gets slightly better code from gcc
1790 2.95.2 on x86, it puts only one mov between each mul, not two. That mov
1791 will probably show up as a bogus extra cycle though.
1793 The measuring function macros are split into three parts to avoid overflowing
1794 preprocessor expansion space if umul_ppmm is big.
1798 Don't blindly use this to set UMUL_TIME in gmp-mparam.h, check the code
1799 generated first, especially on CPUs with low latency multipliers.
1801 The default umul_ppmm doing h*l will be getting increasing numbers of
1802 high zero bits in the calculation. CPUs with data-dependent multipliers
1803 will want to use umul_ppmm.1 to get some randomization into the
1804 calculation. The extra xors and fetches will be a slowdown of course. */
/* Part A: declarations, time_divisor=10 (ten ops per loop iteration),
   and the first speed_starttime.  */
1806 #define SPEED_MACRO_UMUL_PPMM_A \
1812 s->time_divisor = 10; \
1819 speed_starttime (); \
/* Part B: end the first timed section, restart for the second.  */
1824 #define SPEED_MACRO_UMUL_PPMM_B \
1827 t = speed_endtime (); \
1831 speed_starttime (); \
/* Part C: end the second timed section and keep the results live.  */
1836 #define SPEED_MACRO_UMUL_PPMM_C \
1839 t = speed_endtime (); \
1842 /* stop the compiler optimizing away the whole calculation! */ \
/* Latency of the umul_ppmm macro: two chains of ten serially-dependent
   multiplies (see the commentary above the SPEED_MACRO_UMUL_PPMM macros).
   The first chain xors in random block data to vary the operands for
   CPUs with data-dependent multiply times; the second chain is bare.  */
1851 speed_umul_ppmm (struct speed_params *s)
1853 SPEED_MACRO_UMUL_PPMM_A;
1855 umul_ppmm (h, l, l, h); h ^= s->xp_block[0]; l ^= s->yp_block[0];
1856 umul_ppmm (h, l, l, h); h ^= s->xp_block[1]; l ^= s->yp_block[1];
1857 umul_ppmm (h, l, l, h); h ^= s->xp_block[2]; l ^= s->yp_block[2];
1858 umul_ppmm (h, l, l, h); h ^= s->xp_block[3]; l ^= s->yp_block[3];
1859 umul_ppmm (h, l, l, h); h ^= s->xp_block[4]; l ^= s->yp_block[4];
1860 umul_ppmm (h, l, l, h); h ^= s->xp_block[5]; l ^= s->yp_block[5];
1861 umul_ppmm (h, l, l, h); h ^= s->xp_block[6]; l ^= s->yp_block[6];
1862 umul_ppmm (h, l, l, h); h ^= s->xp_block[7]; l ^= s->yp_block[7];
1863 umul_ppmm (h, l, l, h); h ^= s->xp_block[8]; l ^= s->yp_block[8];
1864 umul_ppmm (h, l, l, h); h ^= s->xp_block[9]; l ^= s->yp_block[9];
1866 SPEED_MACRO_UMUL_PPMM_B;
1868 umul_ppmm (h, l, l, h);
1869 umul_ppmm (h, l, l, h);
1870 umul_ppmm (h, l, l, h);
1871 umul_ppmm (h, l, l, h);
1872 umul_ppmm (h, l, l, h);
1873 umul_ppmm (h, l, l, h);
1874 umul_ppmm (h, l, l, h);
1875 umul_ppmm (h, l, l, h);
1876 umul_ppmm (h, l, l, h);
1877 umul_ppmm (h, l, l, h);
1879 SPEED_MACRO_UMUL_PPMM_C;
/* Same measurement as speed_umul_ppmm, but for the native assembly
   mpn_umul_ppmm function (high limb returned, low stored through the
   pointer), where the port provides one.  */
1883 #if HAVE_NATIVE_mpn_umul_ppmm
1885 speed_mpn_umul_ppmm (struct speed_params *s)
1887 SPEED_MACRO_UMUL_PPMM_A;
1889 h = mpn_umul_ppmm (&l, h, l); h ^= s->xp_block[0]; l ^= s->yp_block[0];
1890 h = mpn_umul_ppmm (&l, h, l); h ^= s->xp_block[1]; l ^= s->yp_block[1];
1891 h = mpn_umul_ppmm (&l, h, l); h ^= s->xp_block[2]; l ^= s->yp_block[2];
1892 h = mpn_umul_ppmm (&l, h, l); h ^= s->xp_block[3]; l ^= s->yp_block[3];
1893 h = mpn_umul_ppmm (&l, h, l); h ^= s->xp_block[4]; l ^= s->yp_block[4];
1894 h = mpn_umul_ppmm (&l, h, l); h ^= s->xp_block[5]; l ^= s->yp_block[5];
1895 h = mpn_umul_ppmm (&l, h, l); h ^= s->xp_block[6]; l ^= s->yp_block[6];
1896 h = mpn_umul_ppmm (&l, h, l); h ^= s->xp_block[7]; l ^= s->yp_block[7];
1897 h = mpn_umul_ppmm (&l, h, l); h ^= s->xp_block[8]; l ^= s->yp_block[8];
1898 h = mpn_umul_ppmm (&l, h, l); h ^= s->xp_block[9]; l ^= s->yp_block[9];
1900 SPEED_MACRO_UMUL_PPMM_B;
1902 h = mpn_umul_ppmm (&l, h, l);
1903 h = mpn_umul_ppmm (&l, h, l);
1904 h = mpn_umul_ppmm (&l, h, l);
1905 h = mpn_umul_ppmm (&l, h, l);
1906 h = mpn_umul_ppmm (&l, h, l);
1907 h = mpn_umul_ppmm (&l, h, l);
1908 h = mpn_umul_ppmm (&l, h, l);
1909 h = mpn_umul_ppmm (&l, h, l);
1910 h = mpn_umul_ppmm (&l, h, l);
1911 h = mpn_umul_ppmm (&l, h, l);
1913 SPEED_MACRO_UMUL_PPMM_C;
/* As speed_mpn_umul_ppmm, for the "_r" calling convention variant
   (operands first, result pointer last), where available.  */
1917 #if HAVE_NATIVE_mpn_umul_ppmm_r
1919 speed_mpn_umul_ppmm_r (struct speed_params *s)
1921 SPEED_MACRO_UMUL_PPMM_A;
1923 h = mpn_umul_ppmm_r (h, l, &l); h ^= s->xp_block[0]; l ^= s->yp_block[0];
1924 h = mpn_umul_ppmm_r (h, l, &l); h ^= s->xp_block[1]; l ^= s->yp_block[1];
1925 h = mpn_umul_ppmm_r (h, l, &l); h ^= s->xp_block[2]; l ^= s->yp_block[2];
1926 h = mpn_umul_ppmm_r (h, l, &l); h ^= s->xp_block[3]; l ^= s->yp_block[3];
1927 h = mpn_umul_ppmm_r (h, l, &l); h ^= s->xp_block[4]; l ^= s->yp_block[4];
1928 h = mpn_umul_ppmm_r (h, l, &l); h ^= s->xp_block[5]; l ^= s->yp_block[5];
1929 h = mpn_umul_ppmm_r (h, l, &l); h ^= s->xp_block[6]; l ^= s->yp_block[6];
1930 h = mpn_umul_ppmm_r (h, l, &l); h ^= s->xp_block[7]; l ^= s->yp_block[7];
1931 h = mpn_umul_ppmm_r (h, l, &l); h ^= s->xp_block[8]; l ^= s->yp_block[8];
1932 h = mpn_umul_ppmm_r (h, l, &l); h ^= s->xp_block[9]; l ^= s->yp_block[9];
1934 SPEED_MACRO_UMUL_PPMM_B;
1936 h = mpn_umul_ppmm_r (h, l, &l);
1937 h = mpn_umul_ppmm_r (h, l, &l);
1938 h = mpn_umul_ppmm_r (h, l, &l);
1939 h = mpn_umul_ppmm_r (h, l, &l);
1940 h = mpn_umul_ppmm_r (h, l, &l);
1941 h = mpn_umul_ppmm_r (h, l, &l);
1942 h = mpn_umul_ppmm_r (h, l, &l);
1943 h = mpn_umul_ppmm_r (h, l, &l);
1944 h = mpn_umul_ppmm_r (h, l, &l);
1945 h = mpn_umul_ppmm_r (h, l, &l);
1947 SPEED_MACRO_UMUL_PPMM_C;
1952 /* The divisions are successively dependent so latency is measured, not
1953 issue rate. There's only 10 per loop so the code doesn't get too big,
1954 especially for udiv_qrnnd_preinv and preinv2norm, which are several
1957 Note that it's only the division which is measured here, there's no data
1958 fetching and no shifting if the divisor gets normalized.
1960 In speed_udiv_qrnnd with gcc 2.95.2 on x86 the parameters "q,r,r,q,d"
1961 generate x86 div instructions with nothing in between.
1963 The measuring function macros are in two parts to avoid overflowing
1964 preprocessor expansion space if udiv_qrnnd etc are big.
1968 Don't blindly use this to set UDIV_TIME in gmp-mparam.h, check the code
1971 CPUs with data-dependent divisions may want more attention paid to the
1972 randomness of the data used. Probably the measurement wanted is over
1973 uniformly distributed numbers, but what's here might not be giving that. */
/* Part A: declare q,r,d, set time_divisor=10, pick the divisor from s->r
   (or a default), normalize it if requested, precompute the inverse for
   the preinv variants, then start timing.  */
1975 #define SPEED_ROUTINE_UDIV_QRNND_A(normalize) \
1979 mp_limb_t q, r, d; \
1982 s->time_divisor = 10; \
1984 /* divisor from "r" parameter, or a default */ \
1987 d = mp_bases[10].big_base; \
1992 count_leading_zeros (norm, d); \
1994 invert_limb (dinv, d); \
2000 speed_starttime (); \
/* Part B: stop timing and keep q,r live so the chain isn't optimized out.  */
2005 #define SPEED_ROUTINE_UDIV_QRNND_B \
2008 t = speed_endtime (); \
2010 /* stop the compiler optimizing away the whole calculation! */ \
/* Latency of udiv_qrnnd and its variants: ten serially-dependent divides
   per loop, with all setup done by SPEED_ROUTINE_UDIV_QRNND_A.  The plain
   udiv_qrnnd case normalizes the divisor only if UDIV_NEEDS_NORMALIZATION;
   the preinv and plain-C variants always use a normalized divisor.  */
2018 speed_udiv_qrnnd (struct speed_params *s)
2020 SPEED_ROUTINE_UDIV_QRNND_A (UDIV_NEEDS_NORMALIZATION);
2022 udiv_qrnnd (q, r, r, q, d);
2023 udiv_qrnnd (q, r, r, q, d);
2024 udiv_qrnnd (q, r, r, q, d);
2025 udiv_qrnnd (q, r, r, q, d);
2026 udiv_qrnnd (q, r, r, q, d);
2027 udiv_qrnnd (q, r, r, q, d);
2028 udiv_qrnnd (q, r, r, q, d);
2029 udiv_qrnnd (q, r, r, q, d);
2030 udiv_qrnnd (q, r, r, q, d);
2031 udiv_qrnnd (q, r, r, q, d);
2033 SPEED_ROUTINE_UDIV_QRNND_B;
/* Division by pre-inverted limb, style 1.  */
2037 speed_udiv_qrnnd_preinv1 (struct speed_params *s)
2039 SPEED_ROUTINE_UDIV_QRNND_A (1);
2041 udiv_qrnnd_preinv1 (q, r, r, q, d, dinv);
2042 udiv_qrnnd_preinv1 (q, r, r, q, d, dinv);
2043 udiv_qrnnd_preinv1 (q, r, r, q, d, dinv);
2044 udiv_qrnnd_preinv1 (q, r, r, q, d, dinv);
2045 udiv_qrnnd_preinv1 (q, r, r, q, d, dinv);
2046 udiv_qrnnd_preinv1 (q, r, r, q, d, dinv);
2047 udiv_qrnnd_preinv1 (q, r, r, q, d, dinv);
2048 udiv_qrnnd_preinv1 (q, r, r, q, d, dinv);
2049 udiv_qrnnd_preinv1 (q, r, r, q, d, dinv);
2050 udiv_qrnnd_preinv1 (q, r, r, q, d, dinv);
2052 SPEED_ROUTINE_UDIV_QRNND_B;
/* Division by pre-inverted limb, style 2.  */
2056 speed_udiv_qrnnd_preinv2 (struct speed_params *s)
2058 SPEED_ROUTINE_UDIV_QRNND_A (1);
2060 udiv_qrnnd_preinv2 (q, r, r, q, d, dinv);
2061 udiv_qrnnd_preinv2 (q, r, r, q, d, dinv);
2062 udiv_qrnnd_preinv2 (q, r, r, q, d, dinv);
2063 udiv_qrnnd_preinv2 (q, r, r, q, d, dinv);
2064 udiv_qrnnd_preinv2 (q, r, r, q, d, dinv);
2065 udiv_qrnnd_preinv2 (q, r, r, q, d, dinv);
2066 udiv_qrnnd_preinv2 (q, r, r, q, d, dinv);
2067 udiv_qrnnd_preinv2 (q, r, r, q, d, dinv);
2068 udiv_qrnnd_preinv2 (q, r, r, q, d, dinv);
2069 udiv_qrnnd_preinv2 (q, r, r, q, d, dinv);
2071 SPEED_ROUTINE_UDIV_QRNND_B;
/* The generic C fallback from longlong.h.  */
2075 speed_udiv_qrnnd_c (struct speed_params *s)
2077 SPEED_ROUTINE_UDIV_QRNND_A (1);
2079 __udiv_qrnnd_c (q, r, r, q, d);
2080 __udiv_qrnnd_c (q, r, r, q, d);
2081 __udiv_qrnnd_c (q, r, r, q, d);
2082 __udiv_qrnnd_c (q, r, r, q, d);
2083 __udiv_qrnnd_c (q, r, r, q, d);
2084 __udiv_qrnnd_c (q, r, r, q, d);
2085 __udiv_qrnnd_c (q, r, r, q, d);
2086 __udiv_qrnnd_c (q, r, r, q, d);
2087 __udiv_qrnnd_c (q, r, r, q, d);
2088 __udiv_qrnnd_c (q, r, r, q, d);
2090 SPEED_ROUTINE_UDIV_QRNND_B;
/* Same latency measurement for the native assembly mpn_udiv_qrnnd
   (quotient returned, remainder stored through the pointer), and its
   "_r" argument-order variant, where the port provides them.  */
2093 #if HAVE_NATIVE_mpn_udiv_qrnnd
2095 speed_mpn_udiv_qrnnd (struct speed_params *s)
2097 SPEED_ROUTINE_UDIV_QRNND_A (1);
2099 q = mpn_udiv_qrnnd (&r, r, q, d);
2100 q = mpn_udiv_qrnnd (&r, r, q, d);
2101 q = mpn_udiv_qrnnd (&r, r, q, d);
2102 q = mpn_udiv_qrnnd (&r, r, q, d);
2103 q = mpn_udiv_qrnnd (&r, r, q, d);
2104 q = mpn_udiv_qrnnd (&r, r, q, d);
2105 q = mpn_udiv_qrnnd (&r, r, q, d);
2106 q = mpn_udiv_qrnnd (&r, r, q, d);
2107 q = mpn_udiv_qrnnd (&r, r, q, d);
2108 q = mpn_udiv_qrnnd (&r, r, q, d);
2110 SPEED_ROUTINE_UDIV_QRNND_B;
2114 #if HAVE_NATIVE_mpn_udiv_qrnnd_r
2116 speed_mpn_udiv_qrnnd_r (struct speed_params *s)
2118 SPEED_ROUTINE_UDIV_QRNND_A (1);
2120 q = mpn_udiv_qrnnd_r (r, q, d, &r);
2121 q = mpn_udiv_qrnnd_r (r, q, d, &r);
2122 q = mpn_udiv_qrnnd_r (r, q, d, &r);
2123 q = mpn_udiv_qrnnd_r (r, q, d, &r);
2124 q = mpn_udiv_qrnnd_r (r, q, d, &r);
2125 q = mpn_udiv_qrnnd_r (r, q, d, &r);
2126 q = mpn_udiv_qrnnd_r (r, q, d, &r);
2127 q = mpn_udiv_qrnnd_r (r, q, d, &r);
2128 q = mpn_udiv_qrnnd_r (r, q, d, &r);
2129 q = mpn_udiv_qrnnd_r (r, q, d, &r);
2131 SPEED_ROUTINE_UDIV_QRNND_B;
/* Time the invert_limb macro used by the preinv division schemes.  */
2137 speed_invert_limb (struct speed_params *s)
2139 SPEED_ROUTINE_INVERT_LIMB_CALL (invert_limb (dinv, d));
2143 /* xp[0] might not be particularly random, but should give an indication how
2144 "/" runs. Same for speed_operator_mod below. */
/* Latency of the C "/" operator on limbs; divisor from s->r or a default,
   ten dependent divisions per loop (time_divisor = 10).  */
2146 speed_operator_div (struct speed_params *s)
2152 s->time_divisor = 10;
2154 /* divisor from "r" parameter, or a default */
2157 d = mp_bases[10].big_base;
2178 t = speed_endtime ();
2180 /* stop the compiler optimizing away the whole calculation! */
/* Likewise for the C "%" operator.  */
2187 speed_operator_mod (struct speed_params *s)
2193 s->time_divisor = 10;
2195 /* divisor from "r" parameter, or a default */
2198 d = mp_bases[10].big_base;
2219 t = speed_endtime ();
2221 /* stop the compiler optimizing away the whole calculation! */
2228 /* r==0 measures on data with the values uniformly distributed. This will
2229 be typical for count_trailing_zeros in a GCD etc.
2231 r==1 measures on data with the resultant count uniformly distributed
2232 between 0 and GMP_LIMB_BITS-1. This is probably sensible for
2233 count_leading_zeros on the high limbs of divisors. */
/* Fill xp[SPEED_BLOCK_SIZE] with operands for the count-zeros loops.
   "leading" selects leading- vs trailing-zeros style data; "zero" says
   whether an all-zero operand is permitted by the macro under test.  */
2236 speed_routine_count_zeros_setup (struct speed_params *s,
2237 mp_ptr xp, int leading, int zero)
2244 /* Make uniformly distributed data. If zero isn't allowed then change
2245 it to 1 for leading, or 0x800..00 for trailing. */
2246 MPN_COPY (xp, s->xp_block, SPEED_BLOCK_SIZE);
2248 for (i = 0; i < SPEED_BLOCK_SIZE; i++)
2250 xp[i] = leading ? 1 : GMP_LIMB_HIGHBIT;
2254 /* Make counts uniformly distributed. A randomly chosen bit is set, and
2255 for leading the rest above it are cleared, or for trailing then the
2257 for (i = 0; i < SPEED_BLOCK_SIZE; i++)
/* Random bit position from yp_block; mask keeps only the side of the
   word that doesn't affect the count.  */
2259 mp_limb_t set = CNST_LIMB(1) << (s->yp_block[i] % GMP_LIMB_BITS);
2260 mp_limb_t keep_below = set-1;
2261 mp_limb_t keep_above = MP_LIMB_T_MAX ^ keep_below;
2262 mp_limb_t keep = (leading ? keep_below : keep_above);
2263 xp[i] = (s->xp_block[i] & keep) | set;
2271 /* Account for the effect of n^=c. */
2273 for (i = 0; i < SPEED_BLOCK_SIZE; i++)
2279 count_leading_zeros (c, n);
2281 count_trailing_zeros (c, n);
/* Time count_leading_zeros; a zero operand is permitted in the data only
   when the macro defines COUNT_LEADING_ZEROS_0 (a result for input 0).  */
2288 speed_count_leading_zeros (struct speed_params *s)
2290 #ifdef COUNT_LEADING_ZEROS_0
2291 #define COUNT_LEADING_ZEROS_0_ALLOWED 1
2293 #define COUNT_LEADING_ZEROS_0_ALLOWED 0
2296 SPEED_ROUTINE_COUNT_ZEROS_A (1, COUNT_LEADING_ZEROS_0_ALLOWED);
2297 count_leading_zeros (c, n);
2298 SPEED_ROUTINE_COUNT_ZEROS_B ();
/* Time count_trailing_zeros; zero operands never allowed.  */
2301 speed_count_trailing_zeros (struct speed_params *s)
2303 SPEED_ROUTINE_COUNT_ZEROS_A (0, 0);
2304 count_trailing_zeros (c, n);
2305 SPEED_ROUTINE_COUNT_ZEROS_B ();
/* Radix conversion and block-clear timing wrappers; the base for the
   set_str variants is supplied by the driver macro.  */
2310 speed_mpn_get_str (struct speed_params *s)
2312 SPEED_ROUTINE_MPN_GET_STR (mpn_get_str);
2316 speed_mpn_set_str (struct speed_params *s)
2318 SPEED_ROUTINE_MPN_SET_STR_CALL (mpn_set_str (wp, xp, s->size, base));
2321 speed_mpn_bc_set_str (struct speed_params *s)
2323 SPEED_ROUTINE_MPN_SET_STR_CALL (mpn_bc_set_str (wp, xp, s->size, base));
2327 speed_MPN_ZERO (struct speed_params *s)
2329 SPEED_ROUTINE_MPN_ZERO_CALL (MPN_ZERO (wp, s->size));
/* Initialize rstate according to s->r: the default generator, the
   Mersenne Twister, or an LC generator sized from s->r (whose return
   value reports success/failure).  */
2334 speed_randinit (struct speed_params *s, gmp_randstate_ptr rstate)
2337 gmp_randinit_default (rstate);
2339 gmp_randinit_mt (rstate);
2342 return gmp_randinit_lc_2exp_size (rstate, s->r);
/* Time gmp_randseed with an s->size-bit seed built from s->xp.  */
2348 speed_gmp_randseed (struct speed_params *s)
2350 gmp_randstate_t rstate;
2355 SPEED_RESTRICT_COND (s->size >= 1);
2356 SPEED_RESTRICT_COND (speed_randinit (s, rstate));
2358 /* s->size bits of seed */
2359 mpz_init_set_n (x, s->xp, s->size);
2360 mpz_fdiv_r_2exp (x, x, (unsigned long) s->size);
2363 gmp_randseed (rstate, x);
2368 gmp_randseed (rstate, x);
2370 t = speed_endtime ();
2372 gmp_randclear (rstate);
/* Time gmp_randseed_ui, cycling seeds from s->xp_block.  */
2378 speed_gmp_randseed_ui (struct speed_params *s)
2380 gmp_randstate_t rstate;
2384 SPEED_RESTRICT_COND (speed_randinit (s, rstate));
2387 gmp_randseed_ui (rstate, 123L);
2394 gmp_randseed_ui (rstate, (unsigned long) s->xp_block[j]);
2396 if (j >= SPEED_BLOCK_SIZE)
2400 t = speed_endtime ();
2402 gmp_randclear (rstate);
/* Time generation of s->size random bits with mpz_urandomb.  */
2407 speed_mpz_urandomb (struct speed_params *s)
2409 gmp_randstate_t rstate;
2414 SPEED_RESTRICT_COND (s->size >= 0);
2415 SPEED_RESTRICT_COND (speed_randinit (s, rstate));
2420 mpz_urandomb (z, rstate, (unsigned long) s->size);
2421 mpz_urandomb (z, rstate, (unsigned long) s->size);
2426 mpz_urandomb (z, rstate, (unsigned long) s->size);
2428 t = speed_endtime ();
2431 gmp_randclear (rstate);