1 /* Create tuned thresholds for various algorithms.
3 Copyright 1999, 2000, 2001, 2002, 2003, 2005, 2006, 2008, 2009, 2010 Free
4 Software Foundation, Inc.
6 This file is part of the GNU MP Library.
8 The GNU MP Library is free software; you can redistribute it and/or modify
9 it under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 3 of the License, or (at your
11 option) any later version.
13 The GNU MP Library is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16 License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
22 /* Usage: tuneup [-t] [-t] [-p precision]
24 -t turns on some diagnostic traces, a second -t turns on more traces.
28 The code here isn't a vision of loveliness, mainly because it's subject
29 to ongoing changes according to new things wanting to be tuned, and
30 practical requirements of systems tested.
32 Sometimes running the program twice produces slightly different results.
33 This is probably because there's so little separating algorithms near
34 their crossover, and on that basis it should make little or no difference
35 to the final speed of the relevant routines, but nothing has been done to
40 The thresholds are determined as follows. A crossover may not be a
41 single size but rather a range where it oscillates between method A or
42 method B faster. If the threshold is set making B used where A is faster
43 (or vice versa) that's bad. Badness is the percentage time lost and
44 total badness is the sum of this over all sizes measured. The threshold
45 is set to minimize total badness.
47 Suppose, as sizes increase, method B becomes faster than method A. The
48 effect of the rule is that, as you look at increasing sizes, isolated
49 points where B is faster are ignored, but when it's consistently faster,
50 or faster on balance, then the threshold is set there. The same result
51 is obtained thinking in the other direction of A becoming faster at
54 In practice the thresholds tend to be chosen to bring on the next
55 algorithm fairly quickly.
57 This rule is attractive because it's got a basis in reason and is fairly
58 easy to implement, but no work has been done to actually compare it in
59 absolute terms to other possibilities.
63 In a normal library build the thresholds are constants. To tune them
64 selected objects are recompiled with the thresholds as global variables
65 instead. #define TUNE_PROGRAM_BUILD does this, with help from code at
66 the end of gmp-impl.h, and rules in tune/Makefile.am.
68 MUL_TOOM22_THRESHOLD for example uses a recompiled mpn_mul_n. The
69 threshold is set to "size+1" to avoid karatsuba, or to "size" to use one
70 level, but recurse into the basecase.
72 MUL_TOOM33_THRESHOLD makes use of the tuned MUL_TOOM22_THRESHOLD value.
73 Other routines in turn will make use of both of those. Naturally the
74 dependants must be tuned first.
76 In a couple of cases, like DIVEXACT_1_THRESHOLD, there's no recompiling,
77 just a threshold based on comparing two routines (mpn_divrem_1 and
78 mpn_divexact_1), and no further use of the value determined.
80 Flags like USE_PREINV_MOD_1 or JACOBI_BASE_METHOD are even simpler, being
81 just comparisons between certain routines on representative data.
83 Shortcuts are applied when native (assembler) versions of routines exist.
84 For instance a native mpn_sqr_basecase is assumed to be always faster
85 than mpn_mul_basecase, with no measuring.
87 No attempt is made to tune within assembler routines, for instance
88 DIVREM_1_NORM_THRESHOLD. An assembler mpn_divrem_1 is expected to be
89 written and tuned all by hand. Assembler routines that might have hard
90 limits are recompiled though, to make them accept a bigger range of sizes
91 than normal, eg. mpn_sqr_basecase to compare against mpn_toom2_sqr.
95 The FFTs aren't subject to the same badness rule as the other thresholds,
96 so each k is probably being brought on a touch early. This isn't likely
97 to make a difference, and the simpler probing means fewer tests.
101 #define TUNE_PROGRAM_BUILD 1 /* for gmp-impl.h */
114 #include "gmp-impl.h"
115 #include "longlong.h"
120 #if !HAVE_DECL_OPTARG
122 extern int optind, opterr;
126 #define DEFAULT_MAX_SIZE 1000 /* limbs */
129 mp_size_t option_fft_max_size = 50000; /* limbs */
131 mp_size_t option_fft_max_size = 0;
133 int option_trace = 0;
134 int option_fft_trace = 0;
135 struct speed_params s;
144 /* This is not defined if mpn_sqr_basecase doesn't declare a limit. In that
145 case use zero here, which for params.max_size means no limit. */
146 #ifndef TUNE_SQR_TOOM2_MAX
147 #define TUNE_SQR_TOOM2_MAX 0
150 mp_size_t mul_toom22_threshold = MP_SIZE_T_MAX;
151 mp_size_t mul_toom33_threshold = MUL_TOOM33_THRESHOLD_LIMIT;
152 mp_size_t mul_toom44_threshold = MUL_TOOM44_THRESHOLD_LIMIT;
153 mp_size_t mul_toom6h_threshold = MUL_TOOM6H_THRESHOLD_LIMIT;
154 mp_size_t mul_toom8h_threshold = MUL_TOOM8H_THRESHOLD_LIMIT;
155 mp_size_t mul_toom32_to_toom43_threshold = MP_SIZE_T_MAX;
156 mp_size_t mul_toom32_to_toom53_threshold = MP_SIZE_T_MAX;
157 mp_size_t mul_toom42_to_toom53_threshold = MP_SIZE_T_MAX;
158 mp_size_t mul_toom42_to_toom63_threshold = MP_SIZE_T_MAX;
159 mp_size_t mul_fft_threshold = MP_SIZE_T_MAX;
160 mp_size_t mul_fft_modf_threshold = MP_SIZE_T_MAX;
161 mp_size_t sqr_basecase_threshold = MP_SIZE_T_MAX;
162 mp_size_t sqr_toom2_threshold
163 = (TUNE_SQR_TOOM2_MAX == 0 ? MP_SIZE_T_MAX : TUNE_SQR_TOOM2_MAX);
164 mp_size_t sqr_toom3_threshold = SQR_TOOM3_THRESHOLD_LIMIT;
165 mp_size_t sqr_toom4_threshold = SQR_TOOM4_THRESHOLD_LIMIT;
166 mp_size_t sqr_toom6_threshold = SQR_TOOM6_THRESHOLD_LIMIT;
167 mp_size_t sqr_toom8_threshold = SQR_TOOM8_THRESHOLD_LIMIT;
168 mp_size_t sqr_fft_threshold = MP_SIZE_T_MAX;
169 mp_size_t sqr_fft_modf_threshold = MP_SIZE_T_MAX;
170 mp_size_t mullo_basecase_threshold = MP_SIZE_T_MAX;
171 mp_size_t mullo_dc_threshold = MP_SIZE_T_MAX;
172 mp_size_t mullo_mul_n_threshold = MP_SIZE_T_MAX;
173 mp_size_t mulmod_bnm1_threshold = MP_SIZE_T_MAX;
174 mp_size_t sqrmod_bnm1_threshold = MP_SIZE_T_MAX;
175 mp_size_t div_sb_preinv_threshold = MP_SIZE_T_MAX;
176 mp_size_t dc_div_qr_threshold = MP_SIZE_T_MAX;
177 mp_size_t dc_divappr_q_threshold = MP_SIZE_T_MAX;
178 mp_size_t mu_div_qr_threshold = MP_SIZE_T_MAX;
179 mp_size_t mu_divappr_q_threshold = MP_SIZE_T_MAX;
180 mp_size_t mupi_div_qr_threshold = MP_SIZE_T_MAX;
181 mp_size_t mu_div_q_threshold = MP_SIZE_T_MAX;
182 mp_size_t dc_bdiv_qr_threshold = MP_SIZE_T_MAX;
183 mp_size_t dc_bdiv_q_threshold = MP_SIZE_T_MAX;
184 mp_size_t mu_bdiv_qr_threshold = MP_SIZE_T_MAX;
185 mp_size_t mu_bdiv_q_threshold = MP_SIZE_T_MAX;
186 mp_size_t inv_mulmod_bnm1_threshold = MP_SIZE_T_MAX;
187 mp_size_t inv_newton_threshold = MP_SIZE_T_MAX;
188 mp_size_t inv_appr_threshold = MP_SIZE_T_MAX;
189 mp_size_t binv_newton_threshold = MP_SIZE_T_MAX;
190 mp_size_t redc_1_to_redc_2_threshold = MP_SIZE_T_MAX;
191 mp_size_t redc_1_to_redc_n_threshold = MP_SIZE_T_MAX;
192 mp_size_t redc_2_to_redc_n_threshold = MP_SIZE_T_MAX;
193 mp_size_t powm_threshold = MP_SIZE_T_MAX;
194 mp_size_t matrix22_strassen_threshold = MP_SIZE_T_MAX;
195 mp_size_t hgcd_threshold = MP_SIZE_T_MAX;
196 mp_size_t gcd_accel_threshold = MP_SIZE_T_MAX;
197 mp_size_t gcd_dc_threshold = MP_SIZE_T_MAX;
198 mp_size_t gcdext_dc_threshold = MP_SIZE_T_MAX;
199 mp_size_t divrem_1_norm_threshold = MP_SIZE_T_MAX;
200 mp_size_t divrem_1_unnorm_threshold = MP_SIZE_T_MAX;
201 mp_size_t mod_1_norm_threshold = MP_SIZE_T_MAX;
202 mp_size_t mod_1_unnorm_threshold = MP_SIZE_T_MAX;
203 mp_size_t mod_1n_to_mod_1_1_threshold = MP_SIZE_T_MAX;
204 mp_size_t mod_1u_to_mod_1_1_threshold = MP_SIZE_T_MAX;
205 mp_size_t mod_1_1_to_mod_1_2_threshold = MP_SIZE_T_MAX;
206 mp_size_t mod_1_2_to_mod_1_4_threshold = MP_SIZE_T_MAX;
207 mp_size_t preinv_mod_1_to_mod_1_threshold = MP_SIZE_T_MAX;
208 mp_size_t divrem_2_threshold = MP_SIZE_T_MAX;
209 mp_size_t get_str_dc_threshold = MP_SIZE_T_MAX;
210 mp_size_t get_str_precompute_threshold = MP_SIZE_T_MAX;
211 mp_size_t set_str_dc_threshold = MP_SIZE_T_MAX;
212 mp_size_t set_str_precompute_threshold = MP_SIZE_T_MAX;
214 mp_size_t fft_modf_sqr_threshold = MP_SIZE_T_MAX;
215 mp_size_t fft_modf_mul_threshold = MP_SIZE_T_MAX;
219 speed_function_t function;
220 speed_function_t function2;
221 double step_factor; /* how much to step relatively */
222 int step; /* how much to step absolutely */
223 double function_fudge; /* multiplier for "function" speeds */
224 int stop_since_change;
229 mp_size_t check_size;
230 mp_size_t size_extra;
232 #define DATA_HIGH_LT_R 1
233 #define DATA_HIGH_GE_R 2
240 /* These are normally undefined when false, which suits "#if" fine.
241 But give them zero values so they can be used in plain C "if"s. */
242 #ifndef UDIV_PREINV_ALWAYS
243 #define UDIV_PREINV_ALWAYS 0
245 #ifndef HAVE_NATIVE_mpn_divexact_1
246 #define HAVE_NATIVE_mpn_divexact_1 0
248 #ifndef HAVE_NATIVE_mpn_divrem_1
249 #define HAVE_NATIVE_mpn_divrem_1 0
251 #ifndef HAVE_NATIVE_mpn_divrem_2
252 #define HAVE_NATIVE_mpn_divrem_2 0
254 #ifndef HAVE_NATIVE_mpn_mod_1
255 #define HAVE_NATIVE_mpn_mod_1 0
257 #ifndef HAVE_NATIVE_mpn_modexact_1_odd
258 #define HAVE_NATIVE_mpn_modexact_1_odd 0
260 #ifndef HAVE_NATIVE_mpn_preinv_divrem_1
261 #define HAVE_NATIVE_mpn_preinv_divrem_1 0
263 #ifndef HAVE_NATIVE_mpn_preinv_mod_1
264 #define HAVE_NATIVE_mpn_preinv_mod_1 0
266 #ifndef HAVE_NATIVE_mpn_sqr_basecase
267 #define HAVE_NATIVE_mpn_sqr_basecase 0
271 #define MAX3(a,b,c) MAX (MAX (a, b), c)
278 n |= GMP_NUMB_HIGHBIT;
282 #define GMP_NUMB_HALFMASK ((CNST_LIMB(1) << (GMP_NUMB_BITS/2)) - 1)
289 n &= GMP_NUMB_HALFMASK;
295 /* Add an entry to the end of the dat[] array, reallocing to make it bigger
298 add_dat (mp_size_t size, double d)
300 #define ALLOCDAT_STEP 500
302 ASSERT_ALWAYS (ndat <= allocdat);
304 if (ndat == allocdat)
306 dat = (struct dat_t *) __gmp_allocate_or_reallocate
307 (dat, allocdat * sizeof(dat[0]),
308 (allocdat+ALLOCDAT_STEP) * sizeof(dat[0]));
309 allocdat += ALLOCDAT_STEP;
312 dat[ndat].size = size;
318 /* Return the threshold size based on the data accumulated. */
320 analyze_dat (int final)
325 /* If the threshold is set at dat[0].size, any positive values are bad. */
327 for (j = 0; j < ndat; j++)
331 if (option_trace >= 2 && final)
334 printf ("x is the sum of the badness from setting thresh at given size\n");
335 printf (" (minimum x is sought)\n");
336 printf ("size=%ld first x=%.4f\n", (long) dat[j].size, x);
343 /* When stepping to the next dat[j].size, positive values are no longer
344 bad (so subtracted), negative values become bad (so add the absolute
345 value, meaning subtract). */
346 for (j = 0; j < ndat; x -= dat[j].d, j++)
348 if (option_trace >= 2 && final)
349 printf ("size=%ld x=%.4f\n", (long) dat[j].size, x);
362 /* Measuring for recompiled mpn/generic/divrem_1.c and mpn/generic/mod_1.c */
364 mp_limb_t mpn_divrem_1_tune
365 __GMP_PROTO ((mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t));
366 mp_limb_t mpn_mod_1_tune
367 __GMP_PROTO ((mp_srcptr, mp_size_t, mp_limb_t));
370 speed_mpn_mod_1_tune (struct speed_params *s)
372 SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_tune);
375 speed_mpn_divrem_1_tune (struct speed_params *s)
377 SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_tune);
382 tuneup_measure (speed_function_t fun,
383 const struct param_t *param,
384 struct speed_params *s)
386 static struct param_t dummy;
393 s->size += param->size_extra;
396 SPEED_TMP_ALLOC_LIMBS (s->xp, s->size, 0);
397 SPEED_TMP_ALLOC_LIMBS (s->yp, s->size, 0);
399 mpn_random (s->xp, s->size);
400 mpn_random (s->yp, s->size);
402 switch (param->data_high) {
404 s->xp[s->size-1] %= s->r;
405 s->yp[s->size-1] %= s->r;
408 s->xp[s->size-1] |= s->r;
409 s->yp[s->size-1] |= s->r;
413 t = speed_measure (fun, s);
415 s->size -= param->size_extra;
422 #define PRINT_WIDTH 31
425 print_define_start (const char *name)
427 printf ("#define %-*s ", PRINT_WIDTH, name);
433 print_define_end_remark (const char *name, mp_size_t value, const char *remark)
436 printf ("#define %-*s ", PRINT_WIDTH, name);
438 if (value == MP_SIZE_T_MAX)
439 printf ("MP_SIZE_T_MAX");
441 printf ("%5ld", (long) value);
444 printf (" /* %s */", remark);
450 print_define_end (const char *name, mp_size_t value)
453 if (value == MP_SIZE_T_MAX)
459 print_define_end_remark (name, value, remark);
463 print_define (const char *name, mp_size_t value)
465 print_define_start (name);
466 print_define_end (name, value);
470 print_define_remark (const char *name, mp_size_t value, const char *remark)
472 print_define_start (name);
473 print_define_end_remark (name, value, remark);
478 one (mp_size_t *threshold, struct param_t *param)
480 int since_positive, since_thresh_change;
481 int thresh_idx, new_thresh_idx;
483 #define DEFAULT(x,n) do { if (! (x)) (x) = (n); } while (0)
485 DEFAULT (param->function_fudge, 1.0);
486 DEFAULT (param->function2, param->function);
487 DEFAULT (param->step_factor, 0.01); /* small steps by default */
488 DEFAULT (param->step, 1); /* small steps by default */
489 DEFAULT (param->stop_since_change, 80);
490 DEFAULT (param->stop_factor, 1.2);
491 DEFAULT (param->min_size, 10);
492 DEFAULT (param->max_size, DEFAULT_MAX_SIZE);
494 if (param->check_size != 0)
497 s.size = param->check_size;
499 *threshold = s.size+1;
500 t1 = tuneup_measure (param->function, param, &s);
503 t2 = tuneup_measure (param->function2, param, &s);
504 if (t1 == -1.0 || t2 == -1.0)
506 printf ("Oops, can't run both functions at size %ld\n",
510 t1 *= param->function_fudge;
512 /* ask that t2 is at least 4% below t1 */
516 printf ("function2 never enough faster: t1=%.9f t2=%.9f\n", t1, t2);
517 *threshold = MP_SIZE_T_MAX;
518 if (! param->noprint)
519 print_define (param->name, *threshold);
523 if (option_trace >= 2)
524 printf ("function2 enough faster at size=%ld: t1=%.9f t2=%.9f\n",
525 (long) s.size, t1, t2);
528 if (! param->noprint || option_trace)
529 print_define_start (param->name);
533 since_thresh_change = 0;
536 if (option_trace >= 2)
538 printf (" algorithm-A algorithm-B ratio possible\n");
539 printf (" (seconds) (seconds) diff thresh\n");
542 for (s.size = param->min_size;
543 s.size < param->max_size;
544 s.size += MAX ((mp_size_t) floor (s.size * param->step_factor), param->step))
546 double ti, tiplus1, d;
549 FIXME: check minimum size requirements are met, possibly by just
550 checking for the -1 returns from the speed functions.
553 /* using method A at this size */
554 *threshold = s.size+1;
555 ti = tuneup_measure (param->function, param, &s);
558 ti *= param->function_fudge;
560 /* using method B at this size */
562 tiplus1 = tuneup_measure (param->function2, param, &s);
566 /* Calculate the fraction by which the one or the other routine is
569 d = (tiplus1 - ti) / tiplus1; /* negative */
571 d = (tiplus1 - ti) / ti; /* positive */
575 new_thresh_idx = analyze_dat (0);
577 if (option_trace >= 2)
578 printf ("size=%ld %.9f %.9f % .4f %c %ld\n",
579 (long) s.size, ti, tiplus1, d,
580 ti > tiplus1 ? '#' : ' ',
581 (long) dat[new_thresh_idx].size);
583 /* Stop if the last time method i was faster was more than a
584 certain number of measurements ago. */
585 #define STOP_SINCE_POSITIVE 200
589 if (++since_positive > STOP_SINCE_POSITIVE)
591 if (option_trace >= 1)
592 printf ("stopped due to since_positive (%d)\n",
593 STOP_SINCE_POSITIVE);
597 /* Stop if method A has become slower by a certain factor. */
598 if (ti >= tiplus1 * param->stop_factor)
600 if (option_trace >= 1)
601 printf ("stopped due to ti >= tiplus1 * factor (%.1f)\n",
606 /* Stop if the threshold implied hasn't changed in a certain
607 number of measurements. (It's this condition that usually
609 if (thresh_idx != new_thresh_idx)
610 since_thresh_change = 0, thresh_idx = new_thresh_idx;
612 if (++since_thresh_change > param->stop_since_change)
614 if (option_trace >= 1)
615 printf ("stopped due to since_thresh_change (%d)\n",
616 param->stop_since_change);
620 /* Stop if the threshold implied is more than a certain number of
622 #define STOP_SINCE_AFTER 500
623 if (ndat - thresh_idx > STOP_SINCE_AFTER)
625 if (option_trace >= 1)
626 printf ("stopped due to ndat - thresh_idx > amount (%d)\n",
631 /* Stop when the size limit is reached before the end of the
632 crossover, but only show this as an error for >= the default max
633 size. FIXME: Maybe should make it a param choice whether this is
635 if (s.size >= param->max_size && param->max_size >= DEFAULT_MAX_SIZE)
637 fprintf (stderr, "%s\n", param->name);
638 fprintf (stderr, "sizes %ld to %ld total %d measurements\n",
639 (long) dat[0].size, (long) dat[ndat-1].size, ndat);
640 fprintf (stderr, " max size reached before end of crossover\n");
645 if (option_trace >= 1)
646 printf ("sizes %ld to %ld total %d measurements\n",
647 (long) dat[0].size, (long) dat[ndat-1].size, ndat);
649 *threshold = dat[analyze_dat (1)].size;
651 if (param->min_is_always)
653 if (*threshold == param->min_size)
657 if (! param->noprint || option_trace)
658 print_define_end (param->name, *threshold);
662 /* Special probing for the fft thresholds. The size restrictions on the
663 FFTs mean the graph of time vs size has a step effect. See this for
666 ./speed -s 4096-16384 -t 128 -P foo mpn_mul_fft.8 mpn_mul_fft.9
669 The current approach is to compare routines at the midpoint of relevant
670 steps. Arguably a more sophisticated system of threshold data is wanted
671 if this step effect remains. */
674 const char *table_name;
675 const char *threshold_name;
676 const char *modf_threshold_name;
677 mp_size_t *p_threshold;
678 mp_size_t *p_modf_threshold;
679 mp_size_t first_size;
681 speed_function_t function;
682 speed_function_t mul_modf_function;
683 speed_function_t mul_function;
688 /* mpn_mul_fft requires pl a multiple of 2^k limbs, but with
689 N=pl*BIT_PER_MP_LIMB it internally also pads out so N/2^k is a multiple
693 fft_step_size (int k)
697 step = MAX ((mp_size_t) 1 << (k-1), GMP_LIMB_BITS) / GMP_LIMB_BITS;
698 step *= (mp_size_t) 1 << k;
702 printf ("Can't handle k=%d\n", k);
710 fft_next_size (mp_size_t pl, int k)
712 mp_size_t m = fft_step_size (k);
714 /* printf ("[k=%d %ld] %ld ->", k, m, pl); */
716 if (pl == 0 || (pl & (m-1)) != 0)
717 pl = (pl | (m-1)) + 1;
719 /* printf (" %ld\n", pl); */
723 #define NMAX_DEFAULT 1000000
728 mpn_mul_fft_lcm (size_t a, unsigned int k)
732 while (a % 2 == 0 && k > 0)
741 fftfill (mp_size_t pl, int k, int sqr)
744 mp_bitcnt_t N, Nprime, nprime, M;
746 N = pl * GMP_NUMB_BITS;
749 maxLK = mpn_mul_fft_lcm ((unsigned long) GMP_NUMB_BITS, k);
751 Nprime = (1 + (2 * M + k + 2) / maxLK) * maxLK;
752 nprime = Nprime / GMP_NUMB_BITS;
753 if (nprime >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
758 K2 = 1L << mpn_fft_best_k (nprime, sqr);
759 if ((nprime & (K2 - 1)) == 0)
761 nprime = (nprime + K2 - 1) & -K2;
762 Nprime = nprime * GMP_LIMB_BITS;
765 ASSERT_ALWAYS (nprime < pl);
771 compare_double (const void *ap, const void *bp)
773 double a = * (const double *) ap;
774 double b = * (const double *) bp;
785 median (double *times, int n)
787 qsort (times, n, sizeof (double), compare_double);
791 #define FFT_CACHE_SIZE 25
792 typedef struct fft_cache
798 fft_cache_t fft_cache[FFT_CACHE_SIZE];
801 cached_measure (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n, int k,
805 double t, ttab[MAX_REPS];
807 if (fft_cache[k].n == n)
808 return fft_cache[k].time;
810 for (i = 0; i < n_measurements; i++)
813 mpn_mul_fft (rp, n, ap, n, bp, n, k);
814 ttab[i] = speed_endtime ();
817 t = median (ttab, n_measurements);
819 fft_cache[k].time = t;
823 #define INSERT_FFTTAB(idx, nval, kval) \
825 fft_tab[idx].n = nval; \
826 fft_tab[idx].k = kval; \
827 fft_tab[idx+1].n = -1; /* sentinel */ \
828 fft_tab[idx+1].k = -1; \
832 fftmes (mp_size_t nmin, mp_size_t nmax, int initial_k, struct fft_param_t *p, int idx, int print)
834 mp_size_t n, n1, prev_n1;
835 int k, best_k, last_best_k, kmax;
839 mp_limb_t *ap, *bp, *rp;
842 struct fft_table_nk *fft_tab;
844 fft_tab = mpn_fft_table3[p->sqr];
846 for (k = 0; k < FFT_CACHE_SIZE; k++)
849 if (nmin < (p->sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
851 nmin = (p->sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD);
855 printf ("#define %s%*s", p->table_name, 38, "");
859 INSERT_FFTTAB (0, nmin, initial_k);
864 printf ("{%7u,%2u}", fft_tab[0].n, fft_tab[0].k);
871 ap = malloc (sizeof (mp_limb_t));
875 bp = malloc (sizeof (mp_limb_t));
876 rp = malloc (sizeof (mp_limb_t));
879 /* Round n to comply to initial k value */
880 n = (nmin + ((1ul << initial_k) - 1)) & (MP_SIZE_T_MAX << initial_k);
882 n_measurements = (18 - initial_k) | 1;
883 n_measurements = MAX (n_measurements, MIN_REPS);
884 n_measurements = MIN (n_measurements, MAX_REPS);
886 last_best_k = initial_k;
893 /* Assume the current best k is best until we hit its next FFT step. */
898 start_k = MAX (4, best_k - 4);
899 end_k = MIN (24, best_k + 4);
900 for (k = start_k; k <= end_k; k++)
902 n1 = mpn_fft_next_size (prev_n1, k);
904 eff = 200 * (n1 * GMP_NUMB_BITS >> k) / fftfill (n1, k, p->sqr);
906 if (eff < 70) /* avoid measuring too slow fft:s */
914 ap = realloc (ap, sizeof (mp_limb_t));
915 rp = realloc (rp, sizeof (mp_limb_t));
916 ap = bp = realloc (ap, alloc * sizeof (mp_limb_t));
917 mpn_random (ap, alloc);
918 rp = realloc (rp, alloc * sizeof (mp_limb_t));
922 ap = realloc (ap, sizeof (mp_limb_t));
923 bp = realloc (bp, sizeof (mp_limb_t));
924 rp = realloc (rp, sizeof (mp_limb_t));
925 ap = realloc (ap, alloc * sizeof (mp_limb_t));
926 mpn_random (ap, alloc);
927 bp = realloc (bp, alloc * sizeof (mp_limb_t));
928 mpn_random (bp, alloc);
929 rp = realloc (rp, alloc * sizeof (mp_limb_t));
933 t1 = cached_measure (rp, ap, bp, n1, k, n_measurements);
935 if (t1 * n_measurements > 0.3)
937 n_measurements = MAX (n_measurements, MIN_REPS);
946 n1 = mpn_fft_next_size (prev_n1, best_k);
948 if (last_best_k != best_k)
950 ASSERT_ALWAYS ((prev_n1 & ((1ul << last_best_k) - 1)) == 1);
952 if (idx >= FFT_TABLE3_SIZE)
954 printf ("FFT table exhausted, increase FFT_TABLE3_SIZE in gmp-impl.h\n");
957 INSERT_FFTTAB (idx, prev_n1 >> last_best_k, best_k);
964 printf ("{%7u,%2u}", fft_tab[idx].n, fft_tab[idx].k);
967 if (option_trace >= 2)
969 printf ("{%lu,%u}\n", prev_n1, best_k);
973 last_best_k = best_k;
980 prev_eff = fftfill (prev_n1, best_k, p->sqr);
981 n1 = mpn_fft_next_size (prev_n1 + 1, best_k);
982 eff = fftfill (n1, best_k, p->sqr);
991 kmax = sizeof (mp_size_t) * 4; /* GMP_MP_SIZE_T_BITS / 2 */
992 kmax = MIN (kmax, 25-1);
993 for (k = last_best_k + 1; k <= kmax; k++)
995 if (idx >= FFT_TABLE3_SIZE)
997 printf ("FFT table exhausted, increase FFT_TABLE3_SIZE in gmp-impl.h\n");
1000 INSERT_FFTTAB (idx, ((1ul << (2*k-2)) + 1) >> (k-1), k);
1007 printf ("{%7u,%2u}", fft_tab[idx].n, fft_tab[idx].k);
1025 fft (struct fft_param_t *p)
1028 int k, idx, initial_k;
1030 /*** Generate MUL_FFT_MODF_THRESHOLD / SQR_FFT_MODF_THRESHOLD ***/
1034 /* Use plain one() mechanism, for some reasonable initial values of k. The
1035 advantage is that we don't depend on mpn_fft_table3, which can therefore
1036 leave it completely uninitialized. */
1038 static struct param_t param;
1039 mp_size_t thres, best_thres;
1043 best_thres = MP_SIZE_T_MAX;
1046 for (k = 5; k <= 7; k++)
1048 param.name = p->modf_threshold_name;
1049 param.min_size = 100;
1050 param.max_size = 2000;
1051 param.function = p->mul_function;
1052 param.step_factor = 0.0;
1054 param.function2 = p->mul_modf_function;
1057 one (&thres, ¶m);
1058 if (thres < best_thres)
1065 *(p->p_modf_threshold) = best_thres;
1066 sprintf (buf, "k = %d", best_k);
1067 print_define_remark (p->modf_threshold_name, best_thres, buf);
1071 size = p->first_size;
1076 size = mpn_fft_next_size (size+1, mpn_fft_best_k (size+1, p->sqr));
1077 k = mpn_fft_best_k (size, p->sqr);
1079 if (size >= p->max_size)
1082 s.size = size + fft_step_size (k) / 2;
1084 tk = tuneup_measure (p->mul_modf_function, NULL, &s);
1088 tm = tuneup_measure (p->mul_function, NULL, &s);
1092 if (option_trace >= 2)
1093 printf ("at %ld size=%ld k=%d %.9f size=%ld modf %.9f\n",
1095 (long) size + fft_step_size (k) / 2, k, tk,
1100 *p->p_modf_threshold = s.size;
1101 print_define (p->modf_threshold_name, *p->p_modf_threshold);
1108 /*** Generate MUL_FFT_TABLE3 / SQR_FFT_TABLE3 ***/
1110 idx = fftmes (*p->p_modf_threshold, p->max_size, initial_k, p, 0, 1);
1111 printf ("#define %s_SIZE %d\n", p->table_name, idx);
1113 /*** Generate MUL_FFT_THRESHOLD / SQR_FFT_THRESHOLD ***/
1115 size = 2 * *p->p_modf_threshold; /* OK? */
1119 mp_size_t mulmod_size, mul_size;;
1121 if (size >= p->max_size)
1124 mulmod_size = mpn_mulmod_bnm1_next_size (2 * (size + 1)) / 2;
1125 mul_size = (size + mulmod_size) / 2; /* middle of step */
1127 s.size = mulmod_size;
1128 tk = tuneup_measure (p->function, NULL, &s);
1133 tm = tuneup_measure (p->mul_function, NULL, &s);
1137 if (option_trace >= 2)
1138 printf ("at %ld size=%ld %.9f size=%ld mul %.9f\n",
1140 (long) mulmod_size, tk,
1141 (long) mul_size, tm);
1147 *p->p_threshold = s.size;
1148 print_define (p->threshold_name, *p->p_threshold);
1156 /* Start karatsuba from 4, since the Cray t90 ieee code is much faster at 2,
1157 giving wrong results. */
1161 static struct param_t param;
1163 param.function = speed_mpn_mul_n;
1165 param.name = "MUL_TOOM22_THRESHOLD";
1166 param.min_size = MAX (4, MPN_TOOM22_MUL_MINSIZE);
1167 param.max_size = MUL_TOOM22_THRESHOLD_LIMIT-1;
1168 one (&mul_toom22_threshold, ¶m);
1170 param.name = "MUL_TOOM33_THRESHOLD";
1171 param.min_size = MAX (mul_toom22_threshold, MPN_TOOM33_MUL_MINSIZE);
1172 param.max_size = MUL_TOOM33_THRESHOLD_LIMIT-1;
1173 one (&mul_toom33_threshold, ¶m);
1175 param.name = "MUL_TOOM44_THRESHOLD";
1176 param.min_size = MAX (mul_toom33_threshold, MPN_TOOM44_MUL_MINSIZE);
1177 param.max_size = MUL_TOOM44_THRESHOLD_LIMIT-1;
1178 one (&mul_toom44_threshold, ¶m);
1180 param.name = "MUL_TOOM6H_THRESHOLD";
1181 param.min_size = MAX (mul_toom44_threshold, MPN_TOOM6H_MUL_MINSIZE);
1182 param.max_size = MUL_TOOM6H_THRESHOLD_LIMIT-1;
1183 one (&mul_toom6h_threshold, ¶m);
1185 param.name = "MUL_TOOM8H_THRESHOLD";
1186 param.min_size = MAX (mul_toom6h_threshold, MPN_TOOM8H_MUL_MINSIZE);
1187 param.max_size = MUL_TOOM8H_THRESHOLD_LIMIT-1;
1188 one (&mul_toom8h_threshold, ¶m);
1190 /* disabled until tuned */
1191 MUL_FFT_THRESHOLD = MP_SIZE_T_MAX;
1197 static struct param_t param;
1202 param.function = speed_mpn_toom32_for_toom43_mul;
1203 param.function2 = speed_mpn_toom43_for_toom32_mul;
1204 param.name = "MUL_TOOM32_TO_TOOM43_THRESHOLD";
1205 param.min_size = MPN_TOOM43_MUL_MINSIZE;
1206 one (&thres, ¶m);
1207 mul_toom32_to_toom43_threshold = 17*thres/24;
1208 print_define ("MUL_TOOM32_TO_TOOM43_THRESHOLD", mul_toom32_to_toom43_threshold);
1210 param.function = speed_mpn_toom32_for_toom53_mul;
1211 param.function2 = speed_mpn_toom53_for_toom32_mul;
1212 param.name = "MUL_TOOM32_TO_TOOM53_THRESHOLD";
1213 param.min_size = MPN_TOOM53_MUL_MINSIZE;
1214 one (&thres, ¶m);
1215 mul_toom32_to_toom53_threshold = 19*thres/30;
1216 print_define ("MUL_TOOM32_TO_TOOM53_THRESHOLD", mul_toom32_to_toom53_threshold);
1218 param.function = speed_mpn_toom42_for_toom53_mul;
1219 param.function2 = speed_mpn_toom53_for_toom42_mul;
1220 param.name = "MUL_TOOM42_TO_TOOM53_THRESHOLD";
1221 param.min_size = MPN_TOOM53_MUL_MINSIZE;
1222 one (&thres, ¶m);
1223 mul_toom42_to_toom53_threshold = 11*thres/20;
1224 print_define ("MUL_TOOM42_TO_TOOM53_THRESHOLD", mul_toom42_to_toom53_threshold);
1226 param.function = speed_mpn_toom42_mul;
1227 param.function2 = speed_mpn_toom63_mul;
1228 param.name = "MUL_TOOM42_TO_TOOM63_THRESHOLD";
1229 param.min_size = MPN_TOOM63_MUL_MINSIZE;
1230 one (&thres, ¶m);
1231 mul_toom42_to_toom63_threshold = thres/2;
1232 print_define ("MUL_TOOM42_TO_TOOM63_THRESHOLD", mul_toom42_to_toom63_threshold);
1239 static struct param_t param;
1241 param.function = speed_mpn_mullo_n;
1243 param.name = "MULLO_BASECASE_THRESHOLD";
1245 param.min_is_always = 1;
1246 param.max_size = MULLO_BASECASE_THRESHOLD_LIMIT-1;
1247 param.stop_factor = 1.5;
1249 one (&mullo_basecase_threshold, ¶m);
1251 param.name = "MULLO_DC_THRESHOLD";
1253 param.min_is_always = 0;
1254 param.max_size = 1000;
1255 one (&mullo_dc_threshold, ¶m);
1257 if (mullo_basecase_threshold >= mullo_dc_threshold)
1259 print_define ("MULLO_BASECASE_THRESHOLD", mullo_dc_threshold);
1260 print_define_remark ("MULLO_DC_THRESHOLD", 0, "never mpn_mullo_basecase");
1264 print_define ("MULLO_BASECASE_THRESHOLD", mullo_basecase_threshold);
1265 print_define ("MULLO_DC_THRESHOLD", mullo_dc_threshold);
1269 param.name = "MULLO_MUL_N_THRESHOLD";
1270 param.min_size = mullo_dc_threshold;
1271 param.max_size = 2 * mul_fft_threshold;
1273 param.step_factor = 0.03;
1274 one (&mullo_mul_n_threshold, ¶m);
1276 print_define_remark ("MULLO_MUL_N_THRESHOLD", MP_SIZE_T_MAX,
1277 "without FFT use mullo forever");
1282 tune_mulmod_bnm1 (void)
1284 static struct param_t param;
1286 param.name = "MULMOD_BNM1_THRESHOLD";
1287 param.function = speed_mpn_mulmod_bnm1;
1289 param.max_size = 100;
1290 one (&mulmod_bnm1_threshold, ¶m);
1294 tune_sqrmod_bnm1 (void)
1296 static struct param_t param;
1298 param.name = "SQRMOD_BNM1_THRESHOLD";
1299 param.function = speed_mpn_sqrmod_bnm1;
1301 param.max_size = 100;
1302 one (&sqrmod_bnm1_threshold, ¶m);
1306 /* Start the basecase from 3, since 1 is a special case, and if mul_basecase
1307 is faster only at size==2 then we don't want to bother with extra code
1308 just for that. Start karatsuba from 4 same as MUL above. */
1313 /* disabled until tuned */
1314 SQR_FFT_THRESHOLD = MP_SIZE_T_MAX;
1316 if (HAVE_NATIVE_mpn_sqr_basecase)
1318 print_define_remark ("SQR_BASECASE_THRESHOLD", 0, "always (native)");
1319 sqr_basecase_threshold = 0;
1323 static struct param_t param;
1324 param.name = "SQR_BASECASE_THRESHOLD";
1325 param.function = speed_mpn_sqr;
1327 param.min_is_always = 1;
1328 param.max_size = TUNE_SQR_TOOM2_MAX;
1330 one (&sqr_basecase_threshold, ¶m);
1334 static struct param_t param;
1335 param.name = "SQR_TOOM2_THRESHOLD";
1336 param.function = speed_mpn_sqr;
1337 param.min_size = MAX (4, MPN_TOOM2_SQR_MINSIZE);
1338 param.max_size = TUNE_SQR_TOOM2_MAX;
1340 one (&sqr_toom2_threshold, ¶m);
1342 if (! HAVE_NATIVE_mpn_sqr_basecase
1343 && sqr_toom2_threshold < sqr_basecase_threshold)
1345 /* Karatsuba becomes faster than mul_basecase before
1346 sqr_basecase does. Arrange for the expression
1347 "BELOW_THRESHOLD (un, SQR_TOOM2_THRESHOLD))" which
1348 selects mpn_sqr_basecase in mpn_sqr to be false, by setting
1349 SQR_TOOM2_THRESHOLD to zero, making
1350 SQR_BASECASE_THRESHOLD the toom2 threshold. */
1352 sqr_basecase_threshold = SQR_TOOM2_THRESHOLD;
1353 SQR_TOOM2_THRESHOLD = 0;
1355 print_define_remark ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold,
1357 print_define_remark ("SQR_TOOM2_THRESHOLD",SQR_TOOM2_THRESHOLD,
1358 "never sqr_basecase");
1362 if (! HAVE_NATIVE_mpn_sqr_basecase)
1363 print_define ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold);
1364 print_define ("SQR_TOOM2_THRESHOLD", SQR_TOOM2_THRESHOLD);
1369 static struct param_t param;
1370 mp_size_t toom3_start = MAX (sqr_toom2_threshold, sqr_basecase_threshold);
1372 param.function = speed_mpn_sqr;
1374 param.name = "SQR_TOOM3_THRESHOLD";
1375 param.min_size = MAX (toom3_start, MPN_TOOM3_SQR_MINSIZE);
1376 param.max_size = SQR_TOOM3_THRESHOLD_LIMIT-1;
1377 one (&sqr_toom3_threshold, ¶m);
1379 param.name = "SQR_TOOM4_THRESHOLD";
1380 param.min_size = MAX (sqr_toom3_threshold, MPN_TOOM4_SQR_MINSIZE);
1381 param.max_size = SQR_TOOM4_THRESHOLD_LIMIT-1;
1382 one (&sqr_toom4_threshold, ¶m);
1384 param.name = "SQR_TOOM6_THRESHOLD";
1385 param.min_size = MAX (sqr_toom4_threshold, MPN_TOOM6_SQR_MINSIZE);
1386 param.max_size = SQR_TOOM6_THRESHOLD_LIMIT-1;
1387 one (&sqr_toom6_threshold, ¶m);
1389 param.name = "SQR_TOOM8_THRESHOLD";
1390 param.min_size = MAX (sqr_toom6_threshold, MPN_TOOM8_SQR_MINSIZE);
1391 param.max_size = SQR_TOOM8_THRESHOLD_LIMIT-1;
1392 one (&sqr_toom8_threshold, ¶m);
1400 s.r = 0; /* clear to make speed function do 2n/n */
1402 static struct param_t param;
1403 param.name = "DC_DIV_QR_THRESHOLD";
1404 param.function = speed_mpn_sbpi1_div_qr;
1405 param.function2 = speed_mpn_dcpi1_div_qr;
1407 one (&dc_div_qr_threshold, ¶m);
1410 static struct param_t param;
1411 param.name = "DC_DIVAPPR_Q_THRESHOLD";
1412 param.function = speed_mpn_sbpi1_divappr_q;
1413 param.function2 = speed_mpn_dcpi1_divappr_q;
1415 one (&dc_divappr_q_threshold, ¶m);
1420 speed_mpn_sbordcpi1_div_qr (struct speed_params *s)
1422 if (s->size < DC_DIV_QR_THRESHOLD)
1423 return speed_mpn_sbpi1_div_qr (s);
1425 return speed_mpn_dcpi1_div_qr (s);
1431 s.r = 0; /* clear to make speed function do 2n/n */
1433 static struct param_t param;
1434 param.name = "MU_DIV_QR_THRESHOLD";
1435 param.function = speed_mpn_dcpi1_div_qr;
1436 param.function2 = speed_mpn_mu_div_qr;
1438 param.max_size = 5000;
1439 param.step_factor = 0.02;
1440 one (&mu_div_qr_threshold, ¶m);
1443 static struct param_t param;
1444 param.name = "MU_DIVAPPR_Q_THRESHOLD";
1445 param.function = speed_mpn_dcpi1_divappr_q;
1446 param.function2 = speed_mpn_mu_divappr_q;
1448 param.max_size = 5000;
1449 param.step_factor = 0.02;
1450 one (&mu_divappr_q_threshold, ¶m);
1453 static struct param_t param;
1454 param.name = "MUPI_DIV_QR_THRESHOLD";
1455 param.function = speed_mpn_sbordcpi1_div_qr;
1456 param.function2 = speed_mpn_mupi_div_qr;
1458 param.min_is_always = 1;
1459 param.max_size = 1000;
1460 param.step_factor = 0.02;
1461 one (&mupi_div_qr_threshold, ¶m);
1468 s.r = 0; /* clear to make speed function do 2n/n*/
1470 static struct param_t param;
1471 param.name = "DC_BDIV_QR_THRESHOLD";
1472 param.function = speed_mpn_sbpi1_bdiv_qr;
1473 param.function2 = speed_mpn_dcpi1_bdiv_qr;
1475 one (&dc_bdiv_qr_threshold, ¶m);
1478 static struct param_t param;
1479 param.name = "DC_BDIV_Q_THRESHOLD";
1480 param.function = speed_mpn_sbpi1_bdiv_q;
1481 param.function2 = speed_mpn_dcpi1_bdiv_q;
1483 one (&dc_bdiv_q_threshold, ¶m);
1490 s.r = 0; /* clear to make speed function do 2n/n*/
1492 static struct param_t param;
1493 param.name = "MU_BDIV_QR_THRESHOLD";
1494 param.function = speed_mpn_dcpi1_bdiv_qr;
1495 param.function2 = speed_mpn_mu_bdiv_qr;
1497 param.max_size = 5000;
1498 param.step_factor = 0.02;
1499 one (&mu_bdiv_qr_threshold, ¶m);
1502 static struct param_t param;
1503 param.name = "MU_BDIV_Q_THRESHOLD";
1504 param.function = speed_mpn_dcpi1_bdiv_q;
1505 param.function2 = speed_mpn_mu_bdiv_q;
1507 param.max_size = 5000;
1508 param.step_factor = 0.02;
1509 one (&mu_bdiv_q_threshold, ¶m);
1514 tune_invertappr (void)
1516 static struct param_t param;
1518 param.function = speed_mpn_ni_invertappr;
1519 param.name = "INV_MULMOD_BNM1_THRESHOLD";
1521 one (&inv_mulmod_bnm1_threshold, ¶m);
1523 param.function = speed_mpn_invertappr;
1524 param.name = "INV_NEWTON_THRESHOLD";
1526 one (&inv_newton_threshold, ¶m);
1532 static struct param_t param;
1534 param.function = speed_mpn_invert;
1535 param.name = "INV_APPR_THRESHOLD";
1537 one (&inv_appr_threshold, ¶m);
1543 static struct param_t param;
1545 param.function = speed_mpn_binvert;
1546 param.name = "BINV_NEWTON_THRESHOLD";
1547 param.min_size = 8; /* pointless with smaller operands */
1548 one (&binv_newton_threshold, ¶m);
1554 #define TUNE_REDC_2_MAX 100
1555 #if HAVE_NATIVE_mpn_addmul_2 || HAVE_NATIVE_mpn_redc_2
1556 #define WANT_REDC_2 1
1561 static struct param_t param;
1562 param.name = "REDC_1_TO_REDC_2_THRESHOLD";
1563 param.function = speed_mpn_redc_1;
1564 param.function2 = speed_mpn_redc_2;
1565 param.max_size = TUNE_REDC_2_MAX;
1567 one (&redc_1_to_redc_2_threshold, ¶m);
1570 static struct param_t param;
1571 param.name = "REDC_2_TO_REDC_N_THRESHOLD";
1572 param.function = speed_mpn_redc_2;
1573 param.function2 = speed_mpn_redc_n;
1574 param.min_size = 16;
1576 one (&redc_2_to_redc_n_threshold, ¶m);
1578 if (redc_1_to_redc_2_threshold >= TUNE_REDC_2_MAX - 1)
1580 /* Disable REDC_2. This is not supposed to happen. */
1581 print_define ("REDC_1_TO_REDC_2_THRESHOLD", REDC_2_TO_REDC_N_THRESHOLD);
1582 print_define_remark ("REDC_2_TO_REDC_N_THRESHOLD", 0, "anomaly: never REDC_2");
1586 print_define ("REDC_1_TO_REDC_2_THRESHOLD", REDC_1_TO_REDC_2_THRESHOLD);
1587 print_define ("REDC_2_TO_REDC_N_THRESHOLD", REDC_2_TO_REDC_N_THRESHOLD);
1591 static struct param_t param;
1592 param.name = "REDC_1_TO_REDC_N_THRESHOLD";
1593 param.function = speed_mpn_redc_1;
1594 param.function2 = speed_mpn_redc_n;
1595 param.min_size = 16;
1596 one (&redc_1_to_redc_n_threshold, ¶m);
1602 tune_matrix22_mul (void)
1604 static struct param_t param;
1605 param.name = "MATRIX22_STRASSEN_THRESHOLD";
1606 param.function = speed_mpn_matrix22_mul;
1608 one (&matrix22_strassen_threshold, ¶m);
1614 static struct param_t param;
1615 param.name = "HGCD_THRESHOLD";
1616 param.function = speed_mpn_hgcd;
1617 /* We seem to get strange results for small sizes */
1618 param.min_size = 30;
1619 one (&hgcd_threshold, ¶m);
1625 static struct param_t param;
1626 param.name = "GCD_DC_THRESHOLD";
1627 param.function = speed_mpn_gcd;
1628 param.min_size = hgcd_threshold;
1629 param.max_size = 3000;
1630 param.step_factor = 0.02;
1631 one (&gcd_dc_threshold, ¶m);
1635 tune_gcdext_dc (void)
1637 static struct param_t param;
1638 param.name = "GCDEXT_DC_THRESHOLD";
1639 param.function = speed_mpn_gcdext;
1640 param.min_size = hgcd_threshold;
1641 param.max_size = 3000;
1642 param.step_factor = 0.02;
1643 one (&gcdext_dc_threshold, ¶m);
1647 /* size_extra==1 reflects the fact that with high<divisor one division is
1648 always skipped. Forcing high<divisor while testing ensures consistency
1649 while stepping through sizes, ie. that size-1 divides will be done each
1652 min_size==2 and min_is_always are used so that if plain division is only
1653 better at size==1 then don't bother including that code just for that
1654 case, instead go with preinv always and get a size saving. */
1656 #define DIV_1_PARAMS \
1657 param.check_size = 256; \
1658 param.min_size = 2; \
1659 param.min_is_always = 1; \
1660 param.data_high = DATA_HIGH_LT_R; \
1661 param.size_extra = 1; \
1662 param.stop_factor = 2.0;
1665 double (*tuned_speed_mpn_divrem_1) __GMP_PROTO ((struct speed_params *));
1668 tune_divrem_1 (void)
1670 /* plain version by default */
1671 tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1;
1673 /* No support for tuning native assembler code, do that by hand and put
1674 the results in the .asm file, there's no need for such thresholds to
1675 appear in gmp-mparam.h. */
1676 if (HAVE_NATIVE_mpn_divrem_1)
1679 if (GMP_NAIL_BITS != 0)
1681 print_define_remark ("DIVREM_1_NORM_THRESHOLD", MP_SIZE_T_MAX,
1682 "no preinv with nails");
1683 print_define_remark ("DIVREM_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX,
1684 "no preinv with nails");
1688 if (UDIV_PREINV_ALWAYS)
1690 print_define_remark ("DIVREM_1_NORM_THRESHOLD", 0L, "preinv always");
1691 print_define ("DIVREM_1_UNNORM_THRESHOLD", 0L);
1695 tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1_tune;
1697 /* Tune for the integer part of mpn_divrem_1. This will very possibly be
1698 a bit out for the fractional part, but that's too bad, the integer part
1699 is more important. */
1701 static struct param_t param;
1702 param.name = "DIVREM_1_NORM_THRESHOLD";
1704 s.r = randlimb_norm ();
1705 param.function = speed_mpn_divrem_1_tune;
1706 one (&divrem_1_norm_threshold, ¶m);
1709 static struct param_t param;
1710 param.name = "DIVREM_1_UNNORM_THRESHOLD";
1712 s.r = randlimb_half ();
1713 param.function = speed_mpn_divrem_1_tune;
1714 one (&divrem_1_unnorm_threshold, ¶m);
1722 /* No support for tuning native assembler code, do that by hand and put
1723 the results in the .asm file, there's no need for such thresholds to
1724 appear in gmp-mparam.h. */
1725 if (HAVE_NATIVE_mpn_mod_1)
1728 if (GMP_NAIL_BITS != 0)
1730 print_define_remark ("MOD_1_NORM_THRESHOLD", MP_SIZE_T_MAX,
1731 "no preinv with nails");
1732 print_define_remark ("MOD_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX,
1733 "no preinv with nails");
1737 if (UDIV_PREINV_ALWAYS)
1739 print_define ("MOD_1_NORM_THRESHOLD", 0L);
1740 print_define ("MOD_1_UNNORM_THRESHOLD", 0L);
1745 static struct param_t param;
1746 param.name = "MOD_1_NORM_THRESHOLD";
1748 s.r = randlimb_norm ();
1749 param.function = speed_mpn_mod_1_tune;
1750 one (&mod_1_norm_threshold, ¶m);
1753 static struct param_t param;
1754 param.name = "MOD_1_UNNORM_THRESHOLD";
1756 s.r = randlimb_half ();
1757 param.function = speed_mpn_mod_1_tune;
1758 one (&mod_1_unnorm_threshold, ¶m);
1762 static struct param_t param;
1764 param.check_size = 256;
1766 s.r = randlimb_norm ();
1767 param.function = speed_mpn_mod_1_tune;
1769 param.name = "MOD_1N_TO_MOD_1_1_THRESHOLD";
1771 one (&mod_1n_to_mod_1_1_threshold, ¶m);
1774 static struct param_t param;
1776 param.check_size = 256;
1778 s.r = randlimb_norm () / 5;
1779 param.function = speed_mpn_mod_1_tune;
1782 param.name = "MOD_1U_TO_MOD_1_1_THRESHOLD";
1784 one (&mod_1u_to_mod_1_1_threshold, ¶m);
1786 param.name = "MOD_1_1_TO_MOD_1_2_THRESHOLD";
1787 param.min_size = mod_1u_to_mod_1_1_threshold;
1788 one (&mod_1_1_to_mod_1_2_threshold, ¶m);
1790 if (mod_1u_to_mod_1_1_threshold + 2 >= mod_1_1_to_mod_1_2_threshold)
1792 /* Disable mod_1_1, mod_1_2 is always faster. Measure when to switch
1793 (from mod_1_unnorm) to mod_1_2. */
1794 mod_1_1_to_mod_1_2_threshold = 0;
1796 /* This really measures mod_1u -> mod_1_2 */
1798 one (&mod_1u_to_mod_1_1_threshold, ¶m);
1800 print_define_remark ("MOD_1U_TO_MOD_1_1_THRESHOLD", mod_1u_to_mod_1_1_threshold, NULL);
1802 param.name = "MOD_1_2_TO_MOD_1_4_THRESHOLD";
1803 param.min_size = mod_1_1_to_mod_1_2_threshold;
1804 one (&mod_1_2_to_mod_1_4_threshold, ¶m);
1806 if (mod_1_1_to_mod_1_2_threshold + 2 >= mod_1_2_to_mod_1_4_threshold)
1808 /* Disable mod_1_2, mod_1_4 is always faster. Measure when to switch
1809 (from mod_1_unnorm or mod_1_1) to mod_1_4. */
1810 mod_1_2_to_mod_1_4_threshold = 0;
1813 one (&mod_1_1_to_mod_1_2_threshold, ¶m);
1815 print_define_remark ("MOD_1_1_TO_MOD_1_2_THRESHOLD", mod_1_1_to_mod_1_2_threshold, NULL);
1816 print_define_remark ("MOD_1_2_TO_MOD_1_4_THRESHOLD", mod_1_2_to_mod_1_4_threshold, NULL);
1820 static struct param_t param;
1822 param.check_size = 256;
1824 param.name = "PREINV_MOD_1_TO_MOD_1_THRESHOLD";
1825 s.r = randlimb_norm ();
1826 param.function = speed_mpn_preinv_mod_1;
1827 param.function2 = speed_mpn_mod_1_tune;
1828 one (&preinv_mod_1_to_mod_1_threshold, ¶m);
1833 /* A non-zero DIVREM_1_UNNORM_THRESHOLD (or DIVREM_1_NORM_THRESHOLD) would
1834 imply that udiv_qrnnd_preinv is worth using, but it seems most
1835 straightforward to compare mpn_preinv_divrem_1 and mpn_divrem_1_div
1839 tune_preinv_divrem_1 (void)
1841 static struct param_t param;
1842 speed_function_t divrem_1;
1843 const char *divrem_1_name;
1846 if (GMP_NAIL_BITS != 0)
1848 print_define_remark ("USE_PREINV_DIVREM_1", 0, "no preinv with nails");
1852 /* Any native version of mpn_preinv_divrem_1 is assumed to exist because
1853 it's faster than mpn_divrem_1. */
1854 if (HAVE_NATIVE_mpn_preinv_divrem_1)
1856 print_define_remark ("USE_PREINV_DIVREM_1", 1, "native");
1860 /* If udiv_qrnnd_preinv is the only division method then of course
1861 mpn_preinv_divrem_1 should be used. */
1862 if (UDIV_PREINV_ALWAYS)
1864 print_define_remark ("USE_PREINV_DIVREM_1", 1, "preinv always");
1868 /* If we've got an assembler version of mpn_divrem_1, then compare against
1869 that, not the mpn_divrem_1_div generic C. */
1870 if (HAVE_NATIVE_mpn_divrem_1)
1872 divrem_1 = speed_mpn_divrem_1;
1873 divrem_1_name = "mpn_divrem_1";
1877 divrem_1 = speed_mpn_divrem_1_div;
1878 divrem_1_name = "mpn_divrem_1_div";
1881 param.data_high = DATA_HIGH_LT_R; /* allow skip one division */
1882 s.size = 200; /* generous but not too big */
1883 /* Divisor, nonzero. Unnormalized so as to exercise the shift!=0 case,
1884 since in general that's probably most common, though in fact for a
1885 64-bit limb mp_bases[10].big_base is normalized. */
1886 s.r = urandom() & (GMP_NUMB_MASK >> 4);
1887 if (s.r == 0) s.r = 123;
1889 t1 = tuneup_measure (speed_mpn_preinv_divrem_1, ¶m, &s);
1890 t2 = tuneup_measure (divrem_1, ¶m, &s);
1891 if (t1 == -1.0 || t2 == -1.0)
1893 printf ("Oops, can't measure mpn_preinv_divrem_1 and %s at %ld\n",
1894 divrem_1_name, (long) s.size);
1897 if (option_trace >= 1)
1898 printf ("size=%ld, mpn_preinv_divrem_1 %.9f, %s %.9f\n",
1899 (long) s.size, t1, divrem_1_name, t2);
1901 print_define_remark ("USE_PREINV_DIVREM_1", (mp_size_t) (t1 < t2), NULL);
1907 tune_divrem_2 (void)
1909 static struct param_t param;
1911 /* No support for tuning native assembler code, do that by hand and put
1912 the results in the .asm file, and there's no need for such thresholds
1913 to appear in gmp-mparam.h. */
1914 if (HAVE_NATIVE_mpn_divrem_2)
1917 if (GMP_NAIL_BITS != 0)
1919 print_define_remark ("DIVREM_2_THRESHOLD", MP_SIZE_T_MAX,
1920 "no preinv with nails");
1924 if (UDIV_PREINV_ALWAYS)
1926 print_define_remark ("DIVREM_2_THRESHOLD", 0L, "preinv always");
1930 /* Tune for the integer part of mpn_divrem_2. This will very possibly be
1931 a bit out for the fractional part, but that's too bad, the integer part
1934 min_size must be >=2 since nsize>=2 is required, but is set to 4 to save
1935 code space if plain division is better only at size==2 or size==3. */
1936 param.name = "DIVREM_2_THRESHOLD";
1937 param.check_size = 256;
1939 param.min_is_always = 1;
1940 param.size_extra = 2; /* does qsize==nsize-2 divisions */
1941 param.stop_factor = 2.0;
1943 s.r = randlimb_norm ();
1944 param.function = speed_mpn_divrem_2;
1945 one (&divrem_2_threshold, ¶m);
1949 /* mpn_divexact_1 is vaguely expected to be used on smallish divisors, so
1950 tune for that. Its speed can differ on odd or even divisor, so take an
1951 average threshold for the two.
1953 mpn_divrem_1 can vary with high<divisor or not, whereas mpn_divexact_1
1954 might not vary that way, but don't test this since high<divisor isn't
1955 expected to occur often with small divisors. */
1958 tune_divexact_1 (void)
1960 static struct param_t param;
1961 mp_size_t thresh[2], average;
1964 /* Any native mpn_divexact_1 is assumed to incorporate all the speed of a
1965 full mpn_divrem_1. */
1966 if (HAVE_NATIVE_mpn_divexact_1)
1968 print_define_remark ("DIVEXACT_1_THRESHOLD", 0, "always (native)");
1972 ASSERT_ALWAYS (tuned_speed_mpn_divrem_1 != NULL);
1974 param.name = "DIVEXACT_1_THRESHOLD";
1975 param.data_high = DATA_HIGH_GE_R;
1976 param.check_size = 256;
1978 param.stop_factor = 1.5;
1979 param.function = tuned_speed_mpn_divrem_1;
1980 param.function2 = speed_mpn_divexact_1;
1983 print_define_start (param.name);
1985 for (low = 0; low <= 1; low++)
1987 s.r = randlimb_half();
1991 s.r &= ~CNST_LIMB(7);
1993 one (&thresh[low], ¶m);
1995 printf ("low=%d thresh %ld\n", low, (long) thresh[low]);
1997 if (thresh[low] == MP_SIZE_T_MAX)
1999 average = MP_SIZE_T_MAX;
2000 goto divexact_1_done;
2006 printf ("average of:");
2007 for (i = 0; i < numberof(thresh); i++)
2008 printf (" %ld", (long) thresh[i]);
2013 for (i = 0; i < numberof(thresh); i++)
2014 average += thresh[i];
2015 average /= numberof(thresh);
2017 /* If divexact turns out to be better as early as 3 limbs, then use it
2018 always, so as to reduce code size and conditional jumps. */
2023 print_define_end (param.name, average);
2027 /* The generic mpn_modexact_1_odd skips a divide step if high<divisor, the
2028 same as mpn_mod_1, but this might not be true of an assembler
2029 implementation. The threshold used is an average based on data where a
2030 divide can be skipped and where it can't.
2032 If modexact turns out to be better as early as 3 limbs, then use it
2033 always, so as to reduce code size and conditional jumps. */
2036 tune_modexact_1_odd (void)
2038 static struct param_t param;
2039 mp_size_t thresh_lt, thresh_ge, average;
2042 /* Any native mpn_modexact_1_odd is assumed to incorporate all the speed
2043 of a full mpn_mod_1. */
2044 if (HAVE_NATIVE_mpn_modexact_1_odd)
2046 print_define_remark ("BMOD_1_TO_MOD_1_THRESHOLD", MP_SIZE_T_MAX, "always bmod_1");
2051 param.name = "BMOD_1_TO_MOD_1_THRESHOLD";
2052 param.check_size = 256;
2054 param.stop_factor = 1.5;
2055 param.function = speed_mpn_modexact_1c_odd;
2056 param.function2 = speed_mpn_mod_1_tune;
2058 s.r = randlimb_half () | 1;
2060 print_define_start (param.name);
2062 param.data_high = DATA_HIGH_LT_R;
2063 one (&thresh_lt, ¶m);
2065 printf ("lt thresh %ld\n", (long) thresh_lt);
2067 average = thresh_lt;
2068 if (thresh_lt != MP_SIZE_T_MAX)
2070 param.data_high = DATA_HIGH_GE_R;
2071 one (&thresh_ge, ¶m);
2073 printf ("ge thresh %ld\n", (long) thresh_ge);
2075 if (thresh_ge != MP_SIZE_T_MAX)
2077 average = (thresh_ge + thresh_lt) / 2;
2083 print_define_end (param.name, average);
2088 tune_jacobi_base (void)
2090 static struct param_t param;
2094 s.size = GMP_LIMB_BITS * 3 / 4;
2096 t1 = tuneup_measure (speed_mpn_jacobi_base_1, ¶m, &s);
2097 if (option_trace >= 1)
2098 printf ("size=%ld, mpn_jacobi_base_1 %.9f\n", (long) s.size, t1);
2100 t2 = tuneup_measure (speed_mpn_jacobi_base_2, ¶m, &s);
2101 if (option_trace >= 1)
2102 printf ("size=%ld, mpn_jacobi_base_2 %.9f\n", (long) s.size, t2);
2104 t3 = tuneup_measure (speed_mpn_jacobi_base_3, ¶m, &s);
2105 if (option_trace >= 1)
2106 printf ("size=%ld, mpn_jacobi_base_3 %.9f\n", (long) s.size, t3);
2108 if (t1 == -1.0 || t2 == -1.0 || t3 == -1.0)
2110 printf ("Oops, can't measure all mpn_jacobi_base methods at %ld\n",
2115 if (t1 < t2 && t1 < t3)
2122 print_define ("JACOBI_BASE_METHOD", method);
2129 /* Tune for decimal, it being most common. Some rough testing suggests
2130 other bases are different, but not by very much. */
2133 static struct param_t param;
2134 GET_STR_PRECOMPUTE_THRESHOLD = 0;
2135 param.name = "GET_STR_DC_THRESHOLD";
2136 param.function = speed_mpn_get_str;
2138 param.max_size = GET_STR_THRESHOLD_LIMIT;
2139 one (&get_str_dc_threshold, ¶m);
2142 static struct param_t param;
2143 param.name = "GET_STR_PRECOMPUTE_THRESHOLD";
2144 param.function = speed_mpn_get_str;
2145 param.min_size = GET_STR_DC_THRESHOLD;
2146 param.max_size = GET_STR_THRESHOLD_LIMIT;
2147 one (&get_str_precompute_threshold, ¶m);
2153 speed_mpn_pre_set_str (struct speed_params *s)
2161 mp_ptr powtab_mem, tp;
2162 powers_t powtab[GMP_LIMB_BITS];
2167 SPEED_RESTRICT_COND (s->size >= 1);
2169 base = s->r == 0 ? 10 : s->r;
2170 SPEED_RESTRICT_COND (base >= 2 && base <= 256);
2174 str = TMP_ALLOC (s->size);
2175 for (i = 0; i < s->size; i++)
2176 str[i] = s->xp[i] % base;
2178 wn = ((mp_size_t) (s->size / mp_bases[base].chars_per_bit_exactly))
2179 / GMP_LIMB_BITS + 2;
2180 SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp);
2182 /* use this during development to check wn is big enough */
2184 ASSERT_ALWAYS (mpn_set_str (wp, str, s->size, base) <= wn);
2187 speed_operand_src (s, (mp_ptr) str, s->size/BYTES_PER_MP_LIMB);
2188 speed_operand_dst (s, wp, wn);
2189 speed_cache_fill (s);
2191 chars_per_limb = mp_bases[base].chars_per_limb;
2192 un = s->size / chars_per_limb + 1;
2193 powtab_mem = TMP_BALLOC_LIMBS (mpn_dc_set_str_powtab_alloc (un));
2194 mpn_set_str_compute_powtab (powtab, powtab_mem, un, base);
2195 tp = TMP_BALLOC_LIMBS (mpn_dc_set_str_itch (un));
2201 mpn_pre_set_str (wp, str, s->size, powtab, tp);
2204 t = speed_endtime ();
2213 s.r = 10; /* decimal */
2215 static struct param_t param;
2216 SET_STR_PRECOMPUTE_THRESHOLD = 0;
2217 param.step_factor = 0.01;
2218 param.name = "SET_STR_DC_THRESHOLD";
2219 param.function = speed_mpn_pre_set_str;
2220 param.min_size = 100;
2221 param.max_size = 50000;
2222 one (&set_str_dc_threshold, ¶m);
2225 static struct param_t param;
2226 param.step_factor = 0.02;
2227 param.name = "SET_STR_PRECOMPUTE_THRESHOLD";
2228 param.function = speed_mpn_set_str;
2229 param.min_size = SET_STR_DC_THRESHOLD;
2230 param.max_size = 100000;
2231 one (&set_str_precompute_threshold, ¶m);
2239 static struct fft_param_t param;
2241 if (option_fft_max_size == 0)
2244 param.table_name = "MUL_FFT_TABLE3";
2245 param.threshold_name = "MUL_FFT_THRESHOLD";
2246 param.p_threshold = &mul_fft_threshold;
2247 param.modf_threshold_name = "MUL_FFT_MODF_THRESHOLD";
2248 param.p_modf_threshold = &mul_fft_modf_threshold;
2249 param.first_size = MUL_TOOM33_THRESHOLD / 2;
2250 param.max_size = option_fft_max_size;
2251 param.function = speed_mpn_fft_mul;
2252 param.mul_modf_function = speed_mpn_mul_fft;
2253 param.mul_function = speed_mpn_mul_n;
2262 static struct fft_param_t param;
2264 if (option_fft_max_size == 0)
2267 param.table_name = "SQR_FFT_TABLE3";
2268 param.threshold_name = "SQR_FFT_THRESHOLD";
2269 param.p_threshold = &sqr_fft_threshold;
2270 param.modf_threshold_name = "SQR_FFT_MODF_THRESHOLD";
2271 param.p_modf_threshold = &sqr_fft_modf_threshold;
2272 param.first_size = SQR_TOOM3_THRESHOLD / 2;
2273 param.max_size = option_fft_max_size;
2274 param.function = speed_mpn_fft_sqr;
2275 param.mul_modf_function = speed_mpn_mul_fft_sqr;
2276 param.mul_function = speed_mpn_sqr;
2284 time_t start_time, end_time;
2288 SPEED_TMP_ALLOC_LIMBS (s.xp_block, SPEED_BLOCK_SIZE, 0);
2289 SPEED_TMP_ALLOC_LIMBS (s.yp_block, SPEED_BLOCK_SIZE, 0);
2291 mpn_random (s.xp_block, SPEED_BLOCK_SIZE);
2292 mpn_random (s.yp_block, SPEED_BLOCK_SIZE);
2294 fprintf (stderr, "Parameters for %s\n", GMP_MPARAM_H_SUGGEST);
2297 fprintf (stderr, "Using: %s\n", speed_time_string);
2299 fprintf (stderr, "speed_precision %d", speed_precision);
2300 if (speed_unittime == 1.0)
2301 fprintf (stderr, ", speed_unittime 1 cycle");
2303 fprintf (stderr, ", speed_unittime %.2e secs", speed_unittime);
2304 if (speed_cycletime == 1.0 || speed_cycletime == 0.0)
2305 fprintf (stderr, ", CPU freq unknown\n");
2307 fprintf (stderr, ", CPU freq %.2f MHz\n", 1e-6/speed_cycletime);
2309 fprintf (stderr, "DEFAULT_MAX_SIZE %d, fft_max_size %ld\n",
2310 DEFAULT_MAX_SIZE, (long) option_fft_max_size);
2311 fprintf (stderr, "\n");
2316 tp = localtime (&start_time);
2317 printf ("/* Generated by tuneup.c, %d-%02d-%02d, ",
2318 tp->tm_year+1900, tp->tm_mon+1, tp->tm_mday);
2321 /* gcc sub-minor version doesn't seem to come through as a define */
2322 printf ("gcc %d.%d */\n", __GNUC__, __GNUC_MINOR__);
2323 #define PRINTED_COMPILER
2325 #if defined (__SUNPRO_C)
2326 printf ("Sun C %d.%d */\n", __SUNPRO_C / 0x100, __SUNPRO_C % 0x100);
2327 #define PRINTED_COMPILER
2329 #if ! defined (__GNUC__) && defined (__sgi) && defined (_COMPILER_VERSION)
2330 /* gcc defines __sgi and _COMPILER_VERSION on irix 6, avoid that */
2331 printf ("MIPSpro C %d.%d.%d */\n",
2332 _COMPILER_VERSION / 100,
2333 _COMPILER_VERSION / 10 % 10,
2334 _COMPILER_VERSION % 10);
2335 #define PRINTED_COMPILER
2337 #if defined (__DECC) && defined (__DECC_VER)
2338 printf ("DEC C %d */\n", __DECC_VER);
2339 #define PRINTED_COMPILER
2341 #if ! defined (PRINTED_COMPILER)
2342 printf ("system compiler */\n");
2349 tune_preinv_divrem_1 ();
2352 tune_modexact_1_odd ();
2364 tune_mulmod_bnm1 ();
2365 tune_sqrmod_bnm1 ();
2393 tune_matrix22_mul ();
2397 tune_jacobi_base ();
2405 printf ("/* Tuneup completed successfully, took %ld seconds */\n",
2406 (long) (end_time - start_time));
2413 main (int argc, char *argv[])
2417 /* Unbuffered so if output is redirected to a file it isn't lost if the
2418 program is killed part way through. */
2419 setbuf (stdout, NULL);
2420 setbuf (stderr, NULL);
2422 while ((opt = getopt(argc, argv, "f:o:p:t")) != EOF)
2426 if (optarg[0] == 't')
2427 option_fft_trace = 2;
2429 option_fft_max_size = atol (optarg);
2432 speed_option_set (optarg);
2435 speed_precision = atoi (optarg);