tune/tuneup.c

   1 /* Create tuned thresholds for various algorithms.
   2
   3 Copyright 1999, 2000, 2001, 2002, 2003, 2005, 2006, 2008, 2009, 2010 Free
   4 Software Foundation, Inc.
   5
   6 This file is part of the GNU MP Library.
   7
   8 The GNU MP Library is free software; you can redistribute it and/or modify
   9 it under the terms of the GNU Lesser General Public License as published by
  10 the Free Software Foundation; either version 3 of the License, or (at your
  11 option) any later version.
  12
  13 The GNU MP Library is distributed in the hope that it will be useful, but
  14 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  15 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  16 License for more details.
  17
  18 You should have received a copy of the GNU Lesser General Public License
  19 along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
  20
  21
  22 /* Usage: tuneup [-t] [-t] [-p precision]
  23
  24    -t turns on some diagnostic traces, a second -t turns on more traces.
  25
  26    Notes:
  27
  28    The code here isn't a vision of loveliness, mainly because it's subject
  29    to ongoing changes according to new things wanting to be tuned, and
  30    practical requirements of systems tested.
  31
  32    Sometimes running the program twice produces slightly different results.
  33    This is probably because there's so little separating algorithms near
  34    their crossover, and on that basis it should make little or no difference
  35    to the final speed of the relevant routines, but nothing has been done to
  36    check that carefully.
  37
  38    Algorithm:
  39
  40    The thresholds are determined as follows.  A crossover may not be a
  41    single size but rather a range where it oscillates between method A or
  42    method B faster.  If the threshold is set making B used where A is faster
  43    (or vice versa) that's bad.  Badness is the percentage time lost and
  44    total badness is the sum of this over all sizes measured.  The threshold
  45    is set to minimize total badness.
  46
  47    Suppose, as sizes increase, method B becomes faster than method A.  The
  48    effect of the rule is that, as you look at increasing sizes, isolated
  49    points where B is faster are ignored, but when it's consistently faster,
  50    or faster on balance, then the threshold is set there.  The same result
  51    is obtained thinking in the other direction of A becoming faster at
  52    smaller sizes.
  53
  54    In practice the thresholds tend to be chosen to bring on the next
  55    algorithm fairly quickly.
  56
  57    This rule is attractive because it's got a basis in reason and is fairly
  58    easy to implement, but no work has been done to actually compare it in
  59    absolute terms to other possibilities.
  60
  61    Implementation:
  62
  63    In a normal library build the thresholds are constants.  To tune them
  64    selected objects are recompiled with the thresholds as global variables
  65    instead.  #define TUNE_PROGRAM_BUILD does this, with help from code at
  66    the end of gmp-impl.h, and rules in tune/Makefile.am.
  67
  68    MUL_TOOM22_THRESHOLD for example uses a recompiled mpn_mul_n.  The
  69    threshold is set to "size+1" to avoid karatsuba, or to "size" to use one
  70    level, but recurse into the basecase.
  71
  72    MUL_TOOM33_THRESHOLD makes use of the tuned MUL_TOOM22_THRESHOLD value.
  73    Other routines in turn will make use of both of those.  Naturally the
  74    dependants must be tuned first.
  75
  76    In a couple of cases, like DIVEXACT_1_THRESHOLD, there's no recompiling,
  77    just a threshold based on comparing two routines (mpn_divrem_1 and
  78    mpn_divexact_1), and no further use of the value determined.
  79
  80    Flags like USE_PREINV_MOD_1 or JACOBI_BASE_METHOD are even simpler, being
  81    just comparisons between certain routines on representative data.
  82
  83    Shortcuts are applied when native (assembler) versions of routines exist.
  84    For instance a native mpn_sqr_basecase is assumed to be always faster
  85    than mpn_mul_basecase, with no measuring.
  86
  87    No attempt is made to tune within assembler routines, for instance
  88    DIVREM_1_NORM_THRESHOLD.  An assembler mpn_divrem_1 is expected to be
  89    written and tuned all by hand.  Assembler routines that might have hard
  90    limits are recompiled though, to make them accept a bigger range of sizes
  91    than normal, eg. mpn_sqr_basecase to compare against mpn_toom2_sqr.
  92
  93    Limitations:
  94
  95    The FFTs aren't subject to the same badness rule as the other thresholds,
  96    so each k is probably being brought on a touch early.  This isn't likely
  97    to make a difference, and the simpler probing means fewer tests.
  98
  99 */
 100
 101 #define TUNE_PROGRAM_BUILD  1   /* for gmp-impl.h */
 102
 103 #include "config.h"
 104
 105 #include <math.h>
 106 #include <stdio.h>
 107 #include <stdlib.h>
 108 #include <time.h>
 109 #if HAVE_UNISTD_H
 110 #include <unistd.h>
 111 #endif
 112
 113 #include "gmp.h"
 114 #include "gmp-impl.h"
 115 #include "longlong.h"
 116
 117 #include "tests.h"
 118 #include "speed.h"
 119
 120 #if !HAVE_DECL_OPTARG
 121 extern char *optarg;
 122 extern int optind, opterr;
 123 #endif
 124
 125
 126 #define DEFAULT_MAX_SIZE   1000  /* limbs */
 127
 128 #if WANT_FFT
 129 mp_size_t  option_fft_max_size = 50000;  /* limbs */
 130 #else
 131 mp_size_t  option_fft_max_size = 0;
 132 #endif
 133 int        option_trace = 0;
 134 int        option_fft_trace = 0;
 135 struct speed_params  s;
 136
 137 struct dat_t {
 138   mp_size_t  size;
 139   double     d;
 140 } *dat = NULL;
 141 int  ndat = 0;
 142 int  allocdat = 0;
 143
 144 /* This is not defined if mpn_sqr_basecase doesn't declare a limit.  In that
 145    case use zero here, which for params.max_size means no limit.  */
 146 #ifndef TUNE_SQR_TOOM2_MAX
 147 #define TUNE_SQR_TOOM2_MAX  0
 148 #endif
 149
 150 mp_size_t  mul_toom22_threshold         = MP_SIZE_T_MAX;
 151 mp_size_t  mul_toom33_threshold         = MUL_TOOM33_THRESHOLD_LIMIT;
 152 mp_size_t  mul_toom44_threshold         = MUL_TOOM44_THRESHOLD_LIMIT;
 153 mp_size_t  mul_toom6h_threshold         = MUL_TOOM6H_THRESHOLD_LIMIT;
 154 mp_size_t  mul_toom8h_threshold         = MUL_TOOM8H_THRESHOLD_LIMIT;
 155 mp_size_t  mul_toom32_to_toom43_threshold = MP_SIZE_T_MAX;
 156 mp_size_t  mul_toom32_to_toom53_threshold = MP_SIZE_T_MAX;
 157 mp_size_t  mul_toom42_to_toom53_threshold = MP_SIZE_T_MAX;
 158 mp_size_t  mul_toom42_to_toom63_threshold = MP_SIZE_T_MAX;
 159 mp_size_t  mul_fft_threshold            = MP_SIZE_T_MAX;
 160 mp_size_t  mul_fft_modf_threshold       = MP_SIZE_T_MAX;
 161 mp_size_t  sqr_basecase_threshold       = MP_SIZE_T_MAX;
 162 mp_size_t  sqr_toom2_threshold
 163   = (TUNE_SQR_TOOM2_MAX == 0 ? MP_SIZE_T_MAX : TUNE_SQR_TOOM2_MAX);
 164 mp_size_t  sqr_toom3_threshold          = SQR_TOOM3_THRESHOLD_LIMIT;
 165 mp_size_t  sqr_toom4_threshold          = SQR_TOOM4_THRESHOLD_LIMIT;
 166 mp_size_t  sqr_toom6_threshold          = SQR_TOOM6_THRESHOLD_LIMIT;
 167 mp_size_t  sqr_toom8_threshold          = SQR_TOOM8_THRESHOLD_LIMIT;
 168 mp_size_t  sqr_fft_threshold            = MP_SIZE_T_MAX;
 169 mp_size_t  sqr_fft_modf_threshold       = MP_SIZE_T_MAX;
 170 mp_size_t  mullo_basecase_threshold     = MP_SIZE_T_MAX;
 171 mp_size_t  mullo_dc_threshold           = MP_SIZE_T_MAX;
 172 mp_size_t  mullo_mul_n_threshold        = MP_SIZE_T_MAX;
 173 mp_size_t  mulmod_bnm1_threshold        = MP_SIZE_T_MAX;
 174 mp_size_t  sqrmod_bnm1_threshold        = MP_SIZE_T_MAX;
 175 mp_size_t  div_sb_preinv_threshold      = MP_SIZE_T_MAX;
 176 mp_size_t  dc_div_qr_threshold          = MP_SIZE_T_MAX;
 177 mp_size_t  dc_divappr_q_threshold       = MP_SIZE_T_MAX;
 178 mp_size_t  mu_div_qr_threshold          = MP_SIZE_T_MAX;
 179 mp_size_t  mu_divappr_q_threshold       = MP_SIZE_T_MAX;
 180 mp_size_t  mupi_div_qr_threshold        = MP_SIZE_T_MAX;
 181 mp_size_t  mu_div_q_threshold           = MP_SIZE_T_MAX;
 182 mp_size_t  dc_bdiv_qr_threshold         = MP_SIZE_T_MAX;
 183 mp_size_t  dc_bdiv_q_threshold          = MP_SIZE_T_MAX;
 184 mp_size_t  mu_bdiv_qr_threshold         = MP_SIZE_T_MAX;
 185 mp_size_t  mu_bdiv_q_threshold          = MP_SIZE_T_MAX;
 186 mp_size_t  inv_mulmod_bnm1_threshold    = MP_SIZE_T_MAX;
 187 mp_size_t  inv_newton_threshold         = MP_SIZE_T_MAX;
 188 mp_size_t  inv_appr_threshold           = MP_SIZE_T_MAX;
 189 mp_size_t  binv_newton_threshold        = MP_SIZE_T_MAX;
 190 mp_size_t  redc_1_to_redc_2_threshold   = MP_SIZE_T_MAX;
 191 mp_size_t  redc_1_to_redc_n_threshold   = MP_SIZE_T_MAX;
 192 mp_size_t  redc_2_to_redc_n_threshold   = MP_SIZE_T_MAX;
 193 mp_size_t  powm_threshold               = MP_SIZE_T_MAX;
 194 mp_size_t  matrix22_strassen_threshold  = MP_SIZE_T_MAX;
 195 mp_size_t  hgcd_threshold               = MP_SIZE_T_MAX;
 196 mp_size_t  gcd_accel_threshold          = MP_SIZE_T_MAX;
 197 mp_size_t  gcd_dc_threshold             = MP_SIZE_T_MAX;
 198 mp_size_t  gcdext_dc_threshold          = MP_SIZE_T_MAX;
 199 mp_size_t  divrem_1_norm_threshold      = MP_SIZE_T_MAX;
 200 mp_size_t  divrem_1_unnorm_threshold    = MP_SIZE_T_MAX;
 201 mp_size_t  mod_1_norm_threshold         = MP_SIZE_T_MAX;
 202 mp_size_t  mod_1_unnorm_threshold       = MP_SIZE_T_MAX;
 203 mp_size_t  mod_1n_to_mod_1_1_threshold  = MP_SIZE_T_MAX;
 204 mp_size_t  mod_1u_to_mod_1_1_threshold  = MP_SIZE_T_MAX;
 205 mp_size_t  mod_1_1_to_mod_1_2_threshold = MP_SIZE_T_MAX;
 206 mp_size_t  mod_1_2_to_mod_1_4_threshold = MP_SIZE_T_MAX;
 207 mp_size_t  preinv_mod_1_to_mod_1_threshold = MP_SIZE_T_MAX;
 208 mp_size_t  divrem_2_threshold           = MP_SIZE_T_MAX;
 209 mp_size_t  get_str_dc_threshold         = MP_SIZE_T_MAX;
 210 mp_size_t  get_str_precompute_threshold = MP_SIZE_T_MAX;
 211 mp_size_t  set_str_dc_threshold         = MP_SIZE_T_MAX;
 212 mp_size_t  set_str_precompute_threshold = MP_SIZE_T_MAX;
 213
 214 mp_size_t  fft_modf_sqr_threshold = MP_SIZE_T_MAX;
 215 mp_size_t  fft_modf_mul_threshold = MP_SIZE_T_MAX;
 216
 217 struct param_t {
 218   const char        *name;
 219   speed_function_t  function;
 220   speed_function_t  function2;
 221   double            step_factor;    /* how much to step relatively */
 222   int               step;           /* how much to step absolutely */
 223   double            function_fudge; /* multiplier for "function" speeds */
 224   int               stop_since_change;
 225   double            stop_factor;
 226   mp_size_t         min_size;
 227   int               min_is_always;
 228   mp_size_t         max_size;
 229   mp_size_t         check_size;
 230   mp_size_t         size_extra;
 231
 232 #define DATA_HIGH_LT_R  1
 233 #define DATA_HIGH_GE_R  2
 234   int               data_high;
 235
 236   int               noprint;
 237 };
 238
 239
 240 /* These are normally undefined when false, which suits "#if" fine.
 241    But give them zero values so they can be used in plain C "if"s.  */
 242 #ifndef UDIV_PREINV_ALWAYS
 243 #define UDIV_PREINV_ALWAYS 0
 244 #endif
 245 #ifndef HAVE_NATIVE_mpn_divexact_1
 246 #define HAVE_NATIVE_mpn_divexact_1 0
 247 #endif
 248 #ifndef HAVE_NATIVE_mpn_divrem_1
 249 #define HAVE_NATIVE_mpn_divrem_1 0
 250 #endif
 251 #ifndef HAVE_NATIVE_mpn_divrem_2
 252 #define HAVE_NATIVE_mpn_divrem_2 0
 253 #endif
 254 #ifndef HAVE_NATIVE_mpn_mod_1
 255 #define HAVE_NATIVE_mpn_mod_1 0
 256 #endif
 257 #ifndef HAVE_NATIVE_mpn_modexact_1_odd
 258 #define HAVE_NATIVE_mpn_modexact_1_odd 0
 259 #endif
 260 #ifndef HAVE_NATIVE_mpn_preinv_divrem_1
 261 #define HAVE_NATIVE_mpn_preinv_divrem_1 0
 262 #endif
 263 #ifndef HAVE_NATIVE_mpn_preinv_mod_1
 264 #define HAVE_NATIVE_mpn_preinv_mod_1 0
 265 #endif
 266 #ifndef HAVE_NATIVE_mpn_sqr_basecase
 267 #define HAVE_NATIVE_mpn_sqr_basecase 0
 268 #endif
 269
 270
 271 #define MAX3(a,b,c)  MAX (MAX (a, b), c)
 272
 273 mp_limb_t
 274 randlimb_norm (void)
 275 {
 276   mp_limb_t  n;
 277   mpn_random (&n, 1);
 278   n |= GMP_NUMB_HIGHBIT;
 279   return n;
 280 }
 281
 282 #define GMP_NUMB_HALFMASK  ((CNST_LIMB(1) << (GMP_NUMB_BITS/2)) - 1)
 283
 284 mp_limb_t
 285 randlimb_half (void)
 286 {
 287   mp_limb_t  n;
 288   mpn_random (&n, 1);
 289   n &= GMP_NUMB_HALFMASK;
 290   n += (n==0);
 291   return n;
 292 }
 293
 294
 295 /* Add an entry to the end of the dat[] array, reallocing to make it bigger
 296    if necessary.  */
 297 void
 298 add_dat (mp_size_t size, double d)
 299 {
 300 #define ALLOCDAT_STEP  500
 301
 302   ASSERT_ALWAYS (ndat <= allocdat);
 303
 304   if (ndat == allocdat)
 305     {
 306       dat = (struct dat_t *) __gmp_allocate_or_reallocate
 307         (dat, allocdat * sizeof(dat[0]),
 308          (allocdat+ALLOCDAT_STEP) * sizeof(dat[0]));
 309       allocdat += ALLOCDAT_STEP;
 310     }
 311
 312   dat[ndat].size = size;
 313   dat[ndat].d = d;
 314   ndat++;
 315 }
 316
 317
 318 /* Return the threshold size based on the data accumulated. */
 319 mp_size_t
 320 analyze_dat (int final)
 321 {
 322   double  x, min_x;
 323   int     j, min_j;
 324
 325   /* If the threshold is set at dat[0].size, any positive values are bad. */
 326   x = 0.0;
 327   for (j = 0; j < ndat; j++)
 328     if (dat[j].d > 0.0)
 329       x += dat[j].d;
 330
 331   if (option_trace >= 2 && final)
 332     {
 333       printf ("\n");
 334       printf ("x is the sum of the badness from setting thresh at given size\n");
 335       printf ("  (minimum x is sought)\n");
 336       printf ("size=%ld  first x=%.4f\n", (long) dat[j].size, x);
 337     }
 338
 339   min_x = x;
 340   min_j = 0;
 341
 342
 343   /* When stepping to the next dat[j].size, positive values are no longer
 344      bad (so subtracted), negative values become bad (so add the absolute
 345      value, meaning subtract). */
 346   for (j = 0; j < ndat; x -= dat[j].d, j++)
 347     {
 348       if (option_trace >= 2 && final)
 349         printf ("size=%ld  x=%.4f\n", (long) dat[j].size, x);
 350
 351       if (x < min_x)
 352         {
 353           min_x = x;
 354           min_j = j;
 355         }
 356     }
 357
 358   return min_j;
 359 }
 360
 361
 362 /* Measuring for recompiled mpn/generic/divrem_1.c and mpn/generic/mod_1.c */
 363
 364 mp_limb_t mpn_divrem_1_tune
 365   __GMP_PROTO ((mp_ptr, mp_size_t, mp_srcptr, mp_size_t, mp_limb_t));
 366 mp_limb_t mpn_mod_1_tune
 367    __GMP_PROTO ((mp_srcptr, mp_size_t, mp_limb_t));
 368
 369 double
 370 speed_mpn_mod_1_tune (struct speed_params *s)
 371 {
 372   SPEED_ROUTINE_MPN_MOD_1 (mpn_mod_1_tune);
 373 }
 374 double
 375 speed_mpn_divrem_1_tune (struct speed_params *s)
 376 {
 377   SPEED_ROUTINE_MPN_DIVREM_1 (mpn_divrem_1_tune);
 378 }
 379
 380
 381 double
 382 tuneup_measure (speed_function_t fun,
 383                 const struct param_t *param,
 384                 struct speed_params *s)
 385 {
 386   static struct param_t  dummy;
 387   double   t;
 388   TMP_DECL;
 389
 390   if (! param)
 391     param = &dummy;
 392
 393   s->size += param->size_extra;
 394
 395   TMP_MARK;
 396   SPEED_TMP_ALLOC_LIMBS (s->xp, s->size, 0);
 397   SPEED_TMP_ALLOC_LIMBS (s->yp, s->size, 0);
 398
 399   mpn_random (s->xp, s->size);
 400   mpn_random (s->yp, s->size);
 401
 402   switch (param->data_high) {
 403   case DATA_HIGH_LT_R:
 404     s->xp[s->size-1] %= s->r;
 405     s->yp[s->size-1] %= s->r;
 406     break;
 407   case DATA_HIGH_GE_R:
 408     s->xp[s->size-1] |= s->r;
 409     s->yp[s->size-1] |= s->r;
 410     break;
 411   }
 412
 413   t = speed_measure (fun, s);
 414
 415   s->size -= param->size_extra;
 416
 417   TMP_FREE;
 418   return t;
 419 }
 420
 421
 422 #define PRINT_WIDTH  31
 423
 424 void
 425 print_define_start (const char *name)
 426 {
 427   printf ("#define %-*s  ", PRINT_WIDTH, name);
 428   if (option_trace)
 429     printf ("...\n");
 430 }
 431
 432 void
 433 print_define_end_remark (const char *name, mp_size_t value, const char *remark)
 434 {
 435   if (option_trace)
 436     printf ("#define %-*s  ", PRINT_WIDTH, name);
 437
 438   if (value == MP_SIZE_T_MAX)
 439     printf ("MP_SIZE_T_MAX");
 440   else
 441     printf ("%5ld", (long) value);
 442
 443   if (remark != NULL)
 444     printf ("  /* %s */", remark);
 445   printf ("\n");
 446   fflush (stdout);
 447 }
 448
 449 void
 450 print_define_end (const char *name, mp_size_t value)
 451 {
 452   const char  *remark;
 453   if (value == MP_SIZE_T_MAX)
 454     remark = "never";
 455   else if (value == 0)
 456     remark = "always";
 457   else
 458     remark = NULL;
 459   print_define_end_remark (name, value, remark);
 460 }
 461
 462 void
 463 print_define (const char *name, mp_size_t value)
 464 {
 465   print_define_start (name);
 466   print_define_end (name, value);
 467 }
 468
 469 void
 470 print_define_remark (const char *name, mp_size_t value, const char *remark)
 471 {
 472   print_define_start (name);
 473   print_define_end_remark (name, value, remark);
 474 }
 475
 476
 477 void
 478 one (mp_size_t *threshold, struct param_t *param)
 479 {
 480   int  since_positive, since_thresh_change;
 481   int  thresh_idx, new_thresh_idx;
 482
 483 #define DEFAULT(x,n)  do { if (! (x))  (x) = (n); } while (0)
 484
 485   DEFAULT (param->function_fudge, 1.0);
 486   DEFAULT (param->function2, param->function);
 487   DEFAULT (param->step_factor, 0.01);  /* small steps by default */
 488   DEFAULT (param->step, 1);            /* small steps by default */
 489   DEFAULT (param->stop_since_change, 80);
 490   DEFAULT (param->stop_factor, 1.2);
 491   DEFAULT (param->min_size, 10);
 492   DEFAULT (param->max_size, DEFAULT_MAX_SIZE);
 493
 494   if (param->check_size != 0)
 495     {
 496       double   t1, t2;
 497       s.size = param->check_size;
 498
 499       *threshold = s.size+1;
 500       t1 = tuneup_measure (param->function, param, &s);
 501
 502       *threshold = s.size;
 503       t2 = tuneup_measure (param->function2, param, &s);
 504       if (t1 == -1.0 || t2 == -1.0)
 505         {
 506           printf ("Oops, can't run both functions at size %ld\n",
 507                   (long) s.size);
 508           abort ();
 509         }
 510       t1 *= param->function_fudge;
 511
 512       /* ask that t2 is at least 4% below t1 */
 513       if (t1 < t2*1.04)
 514         {
 515           if (option_trace)
 516             printf ("function2 never enough faster: t1=%.9f t2=%.9f\n", t1, t2);
 517           *threshold = MP_SIZE_T_MAX;
 518           if (! param->noprint)
 519             print_define (param->name, *threshold);
 520           return;
 521         }
 522
 523       if (option_trace >= 2)
 524         printf ("function2 enough faster at size=%ld: t1=%.9f t2=%.9f\n",
 525                 (long) s.size, t1, t2);
 526     }
 527
 528   if (! param->noprint || option_trace)
 529     print_define_start (param->name);
 530
 531   ndat = 0;
 532   since_positive = 0;
 533   since_thresh_change = 0;
 534   thresh_idx = 0;
 535
 536   if (option_trace >= 2)
 537     {
 538       printf ("             algorithm-A  algorithm-B   ratio  possible\n");
 539       printf ("              (seconds)    (seconds)    diff    thresh\n");
 540     }
 541
 542   for (s.size = param->min_size;
 543        s.size < param->max_size;
 544        s.size += MAX ((mp_size_t) floor (s.size * param->step_factor), param->step))
 545     {
 546       double   ti, tiplus1, d;
 547
 548       /*
 549         FIXME: check minimum size requirements are met, possibly by just
 550         checking for the -1 returns from the speed functions.
 551       */
 552
 553       /* using method A at this size */
 554       *threshold = s.size+1;
 555       ti = tuneup_measure (param->function, param, &s);
 556       if (ti == -1.0)
 557         abort ();
 558       ti *= param->function_fudge;
 559
 560       /* using method B at this size */
 561       *threshold = s.size;
 562       tiplus1 = tuneup_measure (param->function2, param, &s);
 563       if (tiplus1 == -1.0)
 564         abort ();
 565
 566       /* Calculate the fraction by which the one or the other routine is
 567          slower.  */
 568       if (tiplus1 >= ti)
 569         d = (tiplus1 - ti) / tiplus1;  /* negative */
 570       else
 571         d = (tiplus1 - ti) / ti;       /* positive */
 572
 573       add_dat (s.size, d);
 574
 575       new_thresh_idx = analyze_dat (0);
 576
 577       if (option_trace >= 2)
 578         printf ("size=%ld  %.9f  %.9f  % .4f %c  %ld\n",
 579                 (long) s.size, ti, tiplus1, d,
 580                 ti > tiplus1 ? '#' : ' ',
 581                 (long) dat[new_thresh_idx].size);
 582
 583       /* Stop if the last time method i was faster was more than a
 584          certain number of measurements ago.  */
 585 #define STOP_SINCE_POSITIVE  200
 586       if (d >= 0)
 587         since_positive = 0;
 588       else
 589         if (++since_positive > STOP_SINCE_POSITIVE)
 590           {
 591             if (option_trace >= 1)
 592               printf ("stopped due to since_positive (%d)\n",
 593                       STOP_SINCE_POSITIVE);
 594             break;
 595           }
 596
 597       /* Stop if method A has become slower by a certain factor. */
 598       if (ti >= tiplus1 * param->stop_factor)
 599         {
 600           if (option_trace >= 1)
 601             printf ("stopped due to ti >= tiplus1 * factor (%.1f)\n",
 602                     param->stop_factor);
 603           break;
 604         }
 605
 606       /* Stop if the threshold implied hasn't changed in a certain
 607          number of measurements.  (It's this condition that usually
 608          stops the loop.) */
 609       if (thresh_idx != new_thresh_idx)
 610         since_thresh_change = 0, thresh_idx = new_thresh_idx;
 611       else
 612         if (++since_thresh_change > param->stop_since_change)
 613           {
 614             if (option_trace >= 1)
 615               printf ("stopped due to since_thresh_change (%d)\n",
 616                       param->stop_since_change);
 617             break;
 618           }
 619
 620       /* Stop if the threshold implied is more than a certain number of
 621          measurements ago.  */
 622 #define STOP_SINCE_AFTER   500
 623       if (ndat - thresh_idx > STOP_SINCE_AFTER)
 624         {
 625           if (option_trace >= 1)
 626             printf ("stopped due to ndat - thresh_idx > amount (%d)\n",
 627                     STOP_SINCE_AFTER);
 628           break;
 629         }
 630
 631       /* Stop when the size limit is reached before the end of the
 632          crossover, but only show this as an error for >= the default max
 633          size.  FIXME: Maybe should make it a param choice whether this is
 634          an error.  */
 635       if (s.size >= param->max_size && param->max_size >= DEFAULT_MAX_SIZE)
 636         {
 637           fprintf (stderr, "%s\n", param->name);
 638           fprintf (stderr, "sizes %ld to %ld total %d measurements\n",
 639                    (long) dat[0].size, (long) dat[ndat-1].size, ndat);
 640           fprintf (stderr, "    max size reached before end of crossover\n");
 641           break;
 642         }
 643     }
 644
 645   if (option_trace >= 1)
 646     printf ("sizes %ld to %ld total %d measurements\n",
 647             (long) dat[0].size, (long) dat[ndat-1].size, ndat);
 648
 649   *threshold = dat[analyze_dat (1)].size;
 650
 651   if (param->min_is_always)
 652     {
 653       if (*threshold == param->min_size)
 654         *threshold = 0;
 655     }
 656
 657   if (! param->noprint || option_trace)
 658     print_define_end (param->name, *threshold);
 659 }
 660
 661
 662 /* Special probing for the fft thresholds.  The size restrictions on the
 663    FFTs mean the graph of time vs size has a step effect.  See this for
 664    example using
 665
 666        ./speed -s 4096-16384 -t 128 -P foo mpn_mul_fft.8 mpn_mul_fft.9
 667        gnuplot foo.gnuplot
 668
 669    The current approach is to compare routines at the midpoint of relevant
 670    steps.  Arguably a more sophisticated system of threshold data is wanted
 671    if this step effect remains. */
 672
 673 struct fft_param_t {
 674   const char        *table_name;
 675   const char        *threshold_name;
 676   const char        *modf_threshold_name;
 677   mp_size_t         *p_threshold;
 678   mp_size_t         *p_modf_threshold;
 679   mp_size_t         first_size;
 680   mp_size_t         max_size;
 681   speed_function_t  function;
 682   speed_function_t  mul_modf_function;
 683   speed_function_t  mul_function;
 684   mp_size_t         sqr;
 685 };
 686
 687
 688 /* mpn_mul_fft requires pl a multiple of 2^k limbs, but with
 689    N=pl*BIT_PER_MP_LIMB it internally also pads out so N/2^k is a multiple
 690    of 2^(k-1) bits. */
 691
 692 mp_size_t
 693 fft_step_size (int k)
 694 {
 695   mp_size_t  step;
 696
 697   step = MAX ((mp_size_t) 1 << (k-1), GMP_LIMB_BITS) / GMP_LIMB_BITS;
 698   step *= (mp_size_t) 1 << k;
 699
 700   if (step <= 0)
 701     {
 702       printf ("Can't handle k=%d\n", k);
 703       abort ();
 704     }
 705
 706   return step;
 707 }
 708
 709 mp_size_t
 710 fft_next_size (mp_size_t pl, int k)
 711 {
 712   mp_size_t  m = fft_step_size (k);
 713
 714 /*    printf ("[k=%d %ld] %ld ->", k, m, pl); */
 715
 716   if (pl == 0 || (pl & (m-1)) != 0)
 717     pl = (pl | (m-1)) + 1;
 718
 719 /*    printf (" %ld\n", pl); */
 720   return pl;
 721 }
 722
 723 #define NMAX_DEFAULT 1000000
 724 #define MAX_REPS 25
 725 #define MIN_REPS 5
 726
 727 static inline size_t
 728 mpn_mul_fft_lcm (size_t a, unsigned int k)
 729 {
 730   unsigned int l = k;
 731
 732   while (a % 2 == 0 && k > 0)
 733     {
 734       a >>= 1;
 735       k--;
 736     }
 737   return a << l;
 738 }
 739
 740 mp_size_t
 741 fftfill (mp_size_t pl, int k, int sqr)
 742 {
 743   mp_size_t maxLK;
 744   mp_bitcnt_t N, Nprime, nprime, M;
 745
 746   N = pl * GMP_NUMB_BITS;
 747   M = N >> k;
 748
 749   maxLK = mpn_mul_fft_lcm ((unsigned long) GMP_NUMB_BITS, k);
 750
 751   Nprime = (1 + (2 * M + k + 2) / maxLK) * maxLK;
 752   nprime = Nprime / GMP_NUMB_BITS;
 753   if (nprime >= (sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
 754     {
 755       size_t K2;
 756       for (;;)
 757         {
 758           K2 = 1L << mpn_fft_best_k (nprime, sqr);
 759           if ((nprime & (K2 - 1)) == 0)
 760             break;
 761           nprime = (nprime + K2 - 1) & -K2;
 762           Nprime = nprime * GMP_LIMB_BITS;
 763         }
 764     }
 765   ASSERT_ALWAYS (nprime < pl);
 766
 767   return Nprime;
 768 }
 769
 770 static int
 771 compare_double (const void *ap, const void *bp)
 772 {
 773   double a = * (const double *) ap;
 774   double b = * (const double *) bp;
 775
 776   if (a < b)
 777     return -1;
 778   else if (a > b)
 779     return 1;
 780   else
 781     return 0;
 782 }
 783
 784 double
 785 median (double *times, int n)
 786 {
 787   qsort (times, n, sizeof (double), compare_double);
 788   return times[n/2];
 789 }
 790
 791 #define FFT_CACHE_SIZE 25
 792 typedef struct fft_cache
 793 {
 794   mp_size_t n;
 795   double time;
 796 } fft_cache_t;
 797
 798 fft_cache_t fft_cache[FFT_CACHE_SIZE];
 799
 800 double
 801 cached_measure (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n, int k,
 802                 int n_measurements)
 803 {
 804   int i;
 805   double t, ttab[MAX_REPS];
 806
 807   if (fft_cache[k].n == n)
 808     return fft_cache[k].time;
 809
 810   for (i = 0; i < n_measurements; i++)
 811     {
 812       speed_starttime ();
 813       mpn_mul_fft (rp, n, ap, n, bp, n, k);
 814       ttab[i] = speed_endtime ();
 815     }
 816
 817   t = median (ttab, n_measurements);
 818   fft_cache[k].n = n;
 819   fft_cache[k].time = t;
 820   return t;
 821 }
 822
 823 #define INSERT_FFTTAB(idx, nval, kval)                                  \
 824   do {                                                                  \
 825     fft_tab[idx].n = nval;                                              \
 826     fft_tab[idx].k = kval;                                              \
 827     fft_tab[idx+1].n = -1;      /* sentinel */                          \
 828     fft_tab[idx+1].k = -1;                                              \
 829   } while (0)
 830
 831 int
 832 fftmes (mp_size_t nmin, mp_size_t nmax, int initial_k, struct fft_param_t *p, int idx, int print)
 833 {
 834   mp_size_t n, n1, prev_n1;
 835   int k, best_k, last_best_k, kmax;
 836   int eff, prev_eff;
 837   double t0, t1;
 838   int n_measurements;
 839   mp_limb_t *ap, *bp, *rp;
 840   mp_size_t alloc;
 841   char *linepref;
 842   struct fft_table_nk *fft_tab;
 843
 844   fft_tab = mpn_fft_table3[p->sqr];
 845
 846   for (k = 0; k < FFT_CACHE_SIZE; k++)
 847     fft_cache[k].n = 0;
 848
 849   if (nmin < (p->sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD))
 850     {
 851       nmin = (p->sqr ? SQR_FFT_MODF_THRESHOLD : MUL_FFT_MODF_THRESHOLD);
 852     }
 853
 854   if (print)
 855     printf ("#define %s%*s", p->table_name, 38, "");
 856
 857   if (idx == 0)
 858     {
 859       INSERT_FFTTAB (0, nmin, initial_k);
 860
 861       if (print)
 862         {
 863           printf ("\\\n  { ");
 864           printf ("{%7u,%2u}", fft_tab[0].n, fft_tab[0].k);
 865           linepref = "    ";
 866         }
 867
 868       idx = 1;
 869     }
 870
 871   ap = malloc (sizeof (mp_limb_t));
 872   if (p->sqr)
 873     bp = ap;
 874   else
 875     bp = malloc (sizeof (mp_limb_t));
 876   rp = malloc (sizeof (mp_limb_t));
 877   alloc = 1;
 878
 879   /* Round n to comply to initial k value */
 880   n = (nmin + ((1ul << initial_k) - 1)) & (MP_SIZE_T_MAX << initial_k);
 881
 882   n_measurements = (18 - initial_k) | 1;
 883   n_measurements = MAX (n_measurements, MIN_REPS);
 884   n_measurements = MIN (n_measurements, MAX_REPS);
 885
 886   last_best_k = initial_k;
 887   best_k = initial_k;
 888
 889   while (n < nmax)
 890     {
 891       int start_k, end_k;
 892
 893       /* Assume the current best k is best until we hit its next FFT step.  */
 894       t0 = 99999;
 895
 896       prev_n1 = n + 1;
 897
 898       start_k = MAX (4, best_k - 4);
 899       end_k = MIN (24, best_k + 4);
 900       for (k = start_k; k <= end_k; k++)
 901         {
 902           n1 = mpn_fft_next_size (prev_n1, k);
 903
 904           eff = 200 * (n1 * GMP_NUMB_BITS >> k) / fftfill (n1, k, p->sqr);
 905
 906           if (eff < 70)         /* avoid measuring too slow fft:s */
 907             continue;
 908
 909           if (n1 > alloc)
 910             {
 911               alloc = n1;
 912               if (p->sqr)
 913                 {
 914                   ap = realloc (ap, sizeof (mp_limb_t));
 915                   rp = realloc (rp, sizeof (mp_limb_t));
 916                   ap = bp = realloc (ap, alloc * sizeof (mp_limb_t));
 917                   mpn_random (ap, alloc);
 918                   rp = realloc (rp, alloc * sizeof (mp_limb_t));
 919                 }
 920               else
 921                 {
 922                   ap = realloc (ap, sizeof (mp_limb_t));
 923                   bp = realloc (bp, sizeof (mp_limb_t));
 924                   rp = realloc (rp, sizeof (mp_limb_t));
 925                   ap = realloc (ap, alloc * sizeof (mp_limb_t));
 926                   mpn_random (ap, alloc);
 927                   bp = realloc (bp, alloc * sizeof (mp_limb_t));
 928                   mpn_random (bp, alloc);
 929                   rp = realloc (rp, alloc * sizeof (mp_limb_t));
 930                 }
 931             }
 932
 933           t1 = cached_measure (rp, ap, bp, n1, k, n_measurements);
 934
 935           if (t1 * n_measurements > 0.3)
 936             n_measurements -= 2;
 937           n_measurements = MAX (n_measurements, MIN_REPS);
 938
 939           if (t1 < t0)
 940             {
 941               best_k = k;
 942               t0 = t1;
 943             }
 944         }
 945
 946       n1 = mpn_fft_next_size (prev_n1, best_k);
 947
 948       if (last_best_k != best_k)
 949         {
 950           ASSERT_ALWAYS ((prev_n1 & ((1ul << last_best_k) - 1)) == 1);
 951
 952           if (idx >= FFT_TABLE3_SIZE)
 953             {
 954               printf ("FFT table exhausted, increase FFT_TABLE3_SIZE in gmp-impl.h\n");
 955               abort ();
 956             }
 957           INSERT_FFTTAB (idx, prev_n1 >> last_best_k, best_k);
 958
 959           if (print)
 960             {
 961               printf (", ");
 962               if (idx % 4 == 0)
 963                 printf ("\\\n    ");
 964               printf ("{%7u,%2u}", fft_tab[idx].n, fft_tab[idx].k);
 965             }
 966
 967           if (option_trace >= 2)
 968             {
 969               printf ("{%lu,%u}\n", prev_n1, best_k);
 970               fflush (stdout);
 971             }
 972
 973           last_best_k = best_k;
 974           idx++;
 975         }
 976
 977       for (;;)
 978         {
 979           prev_n1 = n1;
 980           prev_eff = fftfill (prev_n1, best_k, p->sqr);
 981           n1 = mpn_fft_next_size (prev_n1 + 1, best_k);
 982           eff = fftfill (n1, best_k, p->sqr);
 983
 984           if (eff != prev_eff)
 985             break;
 986         }
 987
 988       n = prev_n1;
 989     }
 990
 991   kmax = sizeof (mp_size_t) * 4;        /* GMP_MP_SIZE_T_BITS / 2 */
 992   kmax = MIN (kmax, 25-1);
 993   for (k = last_best_k + 1; k <= kmax; k++)
 994     {
 995       if (idx >= FFT_TABLE3_SIZE)
 996         {
 997           printf ("FFT table exhausted, increase FFT_TABLE3_SIZE in gmp-impl.h\n");
 998           abort ();
 999         }
1000       INSERT_FFTTAB (idx, ((1ul << (2*k-2)) + 1) >> (k-1), k);
1001
1002       if (print)
1003         {
1004           printf (", ");
1005           if (idx % 4 == 0)
1006             printf ("\\\n    ");
1007           printf ("{%7u,%2u}", fft_tab[idx].n, fft_tab[idx].k);
1008         }
1009
1010       idx++;
1011     }
1012
1013   if (print)
1014     printf (" }\n");
1015
1016   free (ap);
1017   if (! p->sqr)
1018     free (bp);
1019   free (rp);
1020
1021   return idx;
1022 }
1023
1024 void
1025 fft (struct fft_param_t *p)
1026 {
1027   mp_size_t  size;
1028   int        k, idx, initial_k;
1029
1030   /*** Generate MUL_FFT_MODF_THRESHOLD / SQR_FFT_MODF_THRESHOLD ***/
1031
1032 #if 1
1033   {
1034     /* Use plain one() mechanism, for some reasonable initial values of k.  The
1035        advantage is that we don't depend on mpn_fft_table3, which can therefore
1036        leave it completely uninitialized.  */
1037
1038     static struct param_t param;
1039     mp_size_t thres, best_thres;
1040     int best_k;
1041     char buf[20];
1042
1043     best_thres = MP_SIZE_T_MAX;
1044     best_k = -1;
1045
1046     for (k = 5; k <= 7; k++)
1047       {
1048         param.name = p->modf_threshold_name;
1049         param.min_size = 100;
1050         param.max_size = 2000;
1051         param.function  = p->mul_function;
1052         param.step_factor = 0.0;
1053         param.step = 4;
1054         param.function2 = p->mul_modf_function;
1055         param.noprint = 1;
1056         s.r = k;
1057         one (&thres, &param);
1058         if (thres < best_thres)
1059           {
1060             best_thres = thres;
1061             best_k = k;
1062           }
1063       }
1064
1065     *(p->p_modf_threshold) = best_thres;
1066     sprintf (buf, "k = %d", best_k);
1067     print_define_remark (p->modf_threshold_name, best_thres, buf);
1068     initial_k = best_k;
1069   }
1070 #else
1071   size = p->first_size;
1072   for (;;)
1073     {
1074       double  tk, tm;
1075
1076       size = mpn_fft_next_size (size+1, mpn_fft_best_k (size+1, p->sqr));
1077       k = mpn_fft_best_k (size, p->sqr);
1078
1079       if (size >= p->max_size)
1080         break;
1081
1082       s.size = size + fft_step_size (k) / 2;
1083       s.r = k;
1084       tk = tuneup_measure (p->mul_modf_function, NULL, &s);
1085       if (tk == -1.0)
1086         abort ();
1087
1088       tm = tuneup_measure (p->mul_function, NULL, &s);
1089       if (tm == -1.0)
1090         abort ();
1091
1092       if (option_trace >= 2)
1093         printf ("at %ld   size=%ld  k=%d  %.9f   size=%ld modf %.9f\n",
1094                 (long) size,
1095                 (long) size + fft_step_size (k) / 2, k, tk,
1096                 (long) s.size, tm);
1097
1098       if (tk < tm)
1099         {
1100           *p->p_modf_threshold = s.size;
1101           print_define (p->modf_threshold_name, *p->p_modf_threshold);
1102           break;
1103         }
1104     }
1105   initial_k = ?;
1106 #endif
1107
1108   /*** Generate MUL_FFT_TABLE3 / SQR_FFT_TABLE3 ***/
1109
1110   idx = fftmes (*p->p_modf_threshold, p->max_size, initial_k, p, 0, 1);
1111   printf ("#define %s_SIZE %d\n", p->table_name, idx);
1112
1113   /*** Generate MUL_FFT_THRESHOLD / SQR_FFT_THRESHOLD ***/
1114
1115   size = 2 * *p->p_modf_threshold;      /* OK? */
1116   for (;;)
1117     {
1118       double  tk, tm;
1119       mp_size_t mulmod_size, mul_size;;
1120
1121       if (size >= p->max_size)
1122         break;
1123
1124       mulmod_size = mpn_mulmod_bnm1_next_size (2 * (size + 1)) / 2;
1125       mul_size = (size + mulmod_size) / 2;      /* middle of step */
1126
1127       s.size = mulmod_size;
1128       tk = tuneup_measure (p->function, NULL, &s);
1129       if (tk == -1.0)
1130         abort ();
1131
1132       s.size = mul_size;
1133       tm = tuneup_measure (p->mul_function, NULL, &s);
1134       if (tm == -1.0)
1135         abort ();
1136
1137       if (option_trace >= 2)
1138         printf ("at %ld   size=%ld  %.9f   size=%ld mul %.9f\n",
1139                 (long) size,
1140                 (long) mulmod_size, tk,
1141                 (long) mul_size, tm);
1142
1143       size = mulmod_size;
1144
1145       if (tk < tm)
1146         {
1147           *p->p_threshold = s.size;
1148           print_define (p->threshold_name, *p->p_threshold);
1149           break;
1150         }
1151     }
1152 }
1153
1154
1155
1156 /* Start karatsuba from 4, since the Cray t90 ieee code is much faster at 2,
1157    giving wrong results.  */
1158 void
1159 tune_mul_n (void)
1160 {
1161   static struct param_t  param;
1162
1163   param.function = speed_mpn_mul_n;
1164
1165   param.name = "MUL_TOOM22_THRESHOLD";
1166   param.min_size = MAX (4, MPN_TOOM22_MUL_MINSIZE);
1167   param.max_size = MUL_TOOM22_THRESHOLD_LIMIT-1;
1168   one (&mul_toom22_threshold, &param);
1169
1170   param.name = "MUL_TOOM33_THRESHOLD";
1171   param.min_size = MAX (mul_toom22_threshold, MPN_TOOM33_MUL_MINSIZE);
1172   param.max_size = MUL_TOOM33_THRESHOLD_LIMIT-1;
1173   one (&mul_toom33_threshold, &param);
1174
1175   param.name = "MUL_TOOM44_THRESHOLD";
1176   param.min_size = MAX (mul_toom33_threshold, MPN_TOOM44_MUL_MINSIZE);
1177   param.max_size = MUL_TOOM44_THRESHOLD_LIMIT-1;
1178   one (&mul_toom44_threshold, &param);
1179
1180   param.name = "MUL_TOOM6H_THRESHOLD";
1181   param.min_size = MAX (mul_toom44_threshold, MPN_TOOM6H_MUL_MINSIZE);
1182   param.max_size = MUL_TOOM6H_THRESHOLD_LIMIT-1;
1183   one (&mul_toom6h_threshold, &param);
1184
1185   param.name = "MUL_TOOM8H_THRESHOLD";
1186   param.min_size = MAX (mul_toom6h_threshold, MPN_TOOM8H_MUL_MINSIZE);
1187   param.max_size = MUL_TOOM8H_THRESHOLD_LIMIT-1;
1188   one (&mul_toom8h_threshold, &param);
1189
1190   /* disabled until tuned */
1191   MUL_FFT_THRESHOLD = MP_SIZE_T_MAX;
1192 }
1193
1194 void
1195 tune_mul (void)
1196 {
1197   static struct param_t  param;
1198   mp_size_t thres;
1199
1200   param.noprint = 1;
1201
1202   param.function = speed_mpn_toom32_for_toom43_mul;
1203   param.function2 = speed_mpn_toom43_for_toom32_mul;
1204   param.name = "MUL_TOOM32_TO_TOOM43_THRESHOLD";
1205   param.min_size = MPN_TOOM43_MUL_MINSIZE;
1206   one (&thres, &param);
1207   mul_toom32_to_toom43_threshold = 17*thres/24;
1208   print_define ("MUL_TOOM32_TO_TOOM43_THRESHOLD", mul_toom32_to_toom43_threshold);
1209
1210   param.function = speed_mpn_toom32_for_toom53_mul;
1211   param.function2 = speed_mpn_toom53_for_toom32_mul;
1212   param.name = "MUL_TOOM32_TO_TOOM53_THRESHOLD";
1213   param.min_size = MPN_TOOM53_MUL_MINSIZE;
1214   one (&thres, &param);
1215   mul_toom32_to_toom53_threshold = 19*thres/30;
1216   print_define ("MUL_TOOM32_TO_TOOM53_THRESHOLD", mul_toom32_to_toom53_threshold);
1217
1218   param.function = speed_mpn_toom42_for_toom53_mul;
1219   param.function2 = speed_mpn_toom53_for_toom42_mul;
1220   param.name = "MUL_TOOM42_TO_TOOM53_THRESHOLD";
1221   param.min_size = MPN_TOOM53_MUL_MINSIZE;
1222   one (&thres, &param);
1223   mul_toom42_to_toom53_threshold = 11*thres/20;
1224   print_define ("MUL_TOOM42_TO_TOOM53_THRESHOLD", mul_toom42_to_toom53_threshold);
1225
1226   param.function = speed_mpn_toom42_mul;
1227   param.function2 = speed_mpn_toom63_mul;
1228   param.name = "MUL_TOOM42_TO_TOOM63_THRESHOLD";
1229   param.min_size = MPN_TOOM63_MUL_MINSIZE;
1230   one (&thres, &param);
1231   mul_toom42_to_toom63_threshold = thres/2;
1232   print_define ("MUL_TOOM42_TO_TOOM63_THRESHOLD", mul_toom42_to_toom63_threshold);
1233 }
1234
1235
1236 void
1237 tune_mullo (void)
1238 {
1239   static struct param_t  param;
1240
1241   param.function = speed_mpn_mullo_n;
1242
1243   param.name = "MULLO_BASECASE_THRESHOLD";
1244   param.min_size = 1;
1245   param.min_is_always = 1;
1246   param.max_size = MULLO_BASECASE_THRESHOLD_LIMIT-1;
1247   param.stop_factor = 1.5;
1248   param.noprint = 1;
1249   one (&mullo_basecase_threshold, &param);
1250
1251   param.name = "MULLO_DC_THRESHOLD";
1252   param.min_size = 8;
1253   param.min_is_always = 0;
1254   param.max_size = 1000;
1255   one (&mullo_dc_threshold, &param);
1256
1257   if (mullo_basecase_threshold >= mullo_dc_threshold)
1258     {
1259       print_define ("MULLO_BASECASE_THRESHOLD", mullo_dc_threshold);
1260       print_define_remark ("MULLO_DC_THRESHOLD", 0, "never mpn_mullo_basecase");
1261     }
1262   else
1263     {
1264       print_define ("MULLO_BASECASE_THRESHOLD", mullo_basecase_threshold);
1265       print_define ("MULLO_DC_THRESHOLD", mullo_dc_threshold);
1266     }
1267
1268 #if WANT_FFT
1269   param.name = "MULLO_MUL_N_THRESHOLD";
1270   param.min_size = mullo_dc_threshold;
1271   param.max_size = 2 * mul_fft_threshold;
1272   param.noprint = 0;
1273   param.step_factor = 0.03;
1274   one (&mullo_mul_n_threshold, &param);
1275 #else
1276   print_define_remark ("MULLO_MUL_N_THRESHOLD", MP_SIZE_T_MAX,
1277                            "without FFT use mullo forever");
1278 #endif
1279 }
1280
1281 void
1282 tune_mulmod_bnm1 (void)
1283 {
1284   static struct param_t  param;
1285
1286   param.name = "MULMOD_BNM1_THRESHOLD";
1287   param.function = speed_mpn_mulmod_bnm1;
1288   param.min_size = 4;
1289   param.max_size = 100;
1290   one (&mulmod_bnm1_threshold, &param);
1291 }
1292
1293 void
1294 tune_sqrmod_bnm1 (void)
1295 {
1296   static struct param_t  param;
1297
1298   param.name = "SQRMOD_BNM1_THRESHOLD";
1299   param.function = speed_mpn_sqrmod_bnm1;
1300   param.min_size = 4;
1301   param.max_size = 100;
1302   one (&sqrmod_bnm1_threshold, &param);
1303 }
1304
1305
1306 /* Start the basecase from 3, since 1 is a special case, and if mul_basecase
1307    is faster only at size==2 then we don't want to bother with extra code
1308    just for that.  Start karatsuba from 4 same as MUL above.  */
1309
1310 void
1311 tune_sqr (void)
1312 {
1313   /* disabled until tuned */
1314   SQR_FFT_THRESHOLD = MP_SIZE_T_MAX;
1315
1316   if (HAVE_NATIVE_mpn_sqr_basecase)
1317     {
1318       print_define_remark ("SQR_BASECASE_THRESHOLD", 0, "always (native)");
1319       sqr_basecase_threshold = 0;
1320     }
1321   else
1322     {
1323       static struct param_t  param;
1324       param.name = "SQR_BASECASE_THRESHOLD";
1325       param.function = speed_mpn_sqr;
1326       param.min_size = 3;
1327       param.min_is_always = 1;
1328       param.max_size = TUNE_SQR_TOOM2_MAX;
1329       param.noprint = 1;
1330       one (&sqr_basecase_threshold, &param);
1331     }
1332
1333   {
1334     static struct param_t  param;
1335     param.name = "SQR_TOOM2_THRESHOLD";
1336     param.function = speed_mpn_sqr;
1337     param.min_size = MAX (4, MPN_TOOM2_SQR_MINSIZE);
1338     param.max_size = TUNE_SQR_TOOM2_MAX;
1339     param.noprint = 1;
1340     one (&sqr_toom2_threshold, &param);
1341
1342     if (! HAVE_NATIVE_mpn_sqr_basecase
1343         && sqr_toom2_threshold < sqr_basecase_threshold)
1344       {
1345         /* Karatsuba becomes faster than mul_basecase before
1346            sqr_basecase does.  Arrange for the expression
1347            "BELOW_THRESHOLD (un, SQR_TOOM2_THRESHOLD))" which
1348            selects mpn_sqr_basecase in mpn_sqr to be false, by setting
1349            SQR_TOOM2_THRESHOLD to zero, making
1350            SQR_BASECASE_THRESHOLD the toom2 threshold.  */
1351
1352         sqr_basecase_threshold = SQR_TOOM2_THRESHOLD;
1353         SQR_TOOM2_THRESHOLD = 0;
1354
1355         print_define_remark ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold,
1356                              "toom2");
1357         print_define_remark ("SQR_TOOM2_THRESHOLD",SQR_TOOM2_THRESHOLD,
1358                              "never sqr_basecase");
1359       }
1360     else
1361       {
1362         if (! HAVE_NATIVE_mpn_sqr_basecase)
1363           print_define ("SQR_BASECASE_THRESHOLD", sqr_basecase_threshold);
1364         print_define ("SQR_TOOM2_THRESHOLD", SQR_TOOM2_THRESHOLD);
1365       }
1366   }
1367
1368   {
1369     static struct param_t  param;
1370     mp_size_t toom3_start = MAX (sqr_toom2_threshold, sqr_basecase_threshold);
1371
1372     param.function = speed_mpn_sqr;
1373
1374     param.name = "SQR_TOOM3_THRESHOLD";
1375     param.min_size = MAX (toom3_start, MPN_TOOM3_SQR_MINSIZE);
1376     param.max_size = SQR_TOOM3_THRESHOLD_LIMIT-1;
1377     one (&sqr_toom3_threshold, &param);
1378
1379     param.name = "SQR_TOOM4_THRESHOLD";
1380     param.min_size = MAX (sqr_toom3_threshold, MPN_TOOM4_SQR_MINSIZE);
1381     param.max_size = SQR_TOOM4_THRESHOLD_LIMIT-1;
1382     one (&sqr_toom4_threshold, &param);
1383
1384     param.name = "SQR_TOOM6_THRESHOLD";
1385     param.min_size = MAX (sqr_toom4_threshold, MPN_TOOM6_SQR_MINSIZE);
1386     param.max_size = SQR_TOOM6_THRESHOLD_LIMIT-1;
1387     one (&sqr_toom6_threshold, &param);
1388
1389     param.name = "SQR_TOOM8_THRESHOLD";
1390     param.min_size = MAX (sqr_toom6_threshold, MPN_TOOM8_SQR_MINSIZE);
1391     param.max_size = SQR_TOOM8_THRESHOLD_LIMIT-1;
1392     one (&sqr_toom8_threshold, &param);
1393   }
1394 }
1395
1396
1397 void
1398 tune_dc_div (void)
1399 {
1400   s.r = 0;              /* clear to make speed function do 2n/n */
1401   {
1402     static struct param_t  param;
1403     param.name = "DC_DIV_QR_THRESHOLD";
1404     param.function = speed_mpn_sbpi1_div_qr;
1405     param.function2 = speed_mpn_dcpi1_div_qr;
1406     param.min_size = 6;
1407     one (&dc_div_qr_threshold, &param);
1408   }
1409   {
1410     static struct param_t  param;
1411     param.name = "DC_DIVAPPR_Q_THRESHOLD";
1412     param.function = speed_mpn_sbpi1_divappr_q;
1413     param.function2 = speed_mpn_dcpi1_divappr_q;
1414     param.min_size = 6;
1415     one (&dc_divappr_q_threshold, &param);
1416   }
1417 }
1418
1419 static double
1420 speed_mpn_sbordcpi1_div_qr (struct speed_params *s)
1421 {
1422   if (s->size < DC_DIV_QR_THRESHOLD)
1423     return speed_mpn_sbpi1_div_qr (s);
1424   else
1425     return speed_mpn_dcpi1_div_qr (s);
1426 }
1427
1428 void
1429 tune_mu_div (void)
1430 {
1431   s.r = 0;              /* clear to make speed function do 2n/n */
1432   {
1433     static struct param_t  param;
1434     param.name = "MU_DIV_QR_THRESHOLD";
1435     param.function = speed_mpn_dcpi1_div_qr;
1436     param.function2 = speed_mpn_mu_div_qr;
1437     param.min_size = 6;
1438     param.max_size = 5000;
1439     param.step_factor = 0.02;
1440     one (&mu_div_qr_threshold, &param);
1441   }
1442   {
1443     static struct param_t  param;
1444     param.name = "MU_DIVAPPR_Q_THRESHOLD";
1445     param.function = speed_mpn_dcpi1_divappr_q;
1446     param.function2 = speed_mpn_mu_divappr_q;
1447     param.min_size = 6;
1448     param.max_size = 5000;
1449     param.step_factor = 0.02;
1450     one (&mu_divappr_q_threshold, &param);
1451   }
1452   {
1453     static struct param_t  param;
1454     param.name = "MUPI_DIV_QR_THRESHOLD";
1455     param.function = speed_mpn_sbordcpi1_div_qr;
1456     param.function2 = speed_mpn_mupi_div_qr;
1457     param.min_size = 6;
1458     param.min_is_always = 1;
1459     param.max_size = 1000;
1460     param.step_factor = 0.02;
1461     one (&mupi_div_qr_threshold, &param);
1462   }
1463 }
1464
1465 void
1466 tune_dc_bdiv (void)
1467 {
1468   s.r = 0;              /* clear to make speed function do 2n/n*/
1469   {
1470     static struct param_t  param;
1471     param.name = "DC_BDIV_QR_THRESHOLD";
1472     param.function = speed_mpn_sbpi1_bdiv_qr;
1473     param.function2 = speed_mpn_dcpi1_bdiv_qr;
1474     param.min_size = 4;
1475     one (&dc_bdiv_qr_threshold, &param);
1476   }
1477   {
1478     static struct param_t  param;
1479     param.name = "DC_BDIV_Q_THRESHOLD";
1480     param.function = speed_mpn_sbpi1_bdiv_q;
1481     param.function2 = speed_mpn_dcpi1_bdiv_q;
1482     param.min_size = 4;
1483     one (&dc_bdiv_q_threshold, &param);
1484   }
1485 }
1486
1487 void
1488 tune_mu_bdiv (void)
1489 {
1490   s.r = 0;              /* clear to make speed function do 2n/n*/
1491   {
1492     static struct param_t  param;
1493     param.name = "MU_BDIV_QR_THRESHOLD";
1494     param.function = speed_mpn_dcpi1_bdiv_qr;
1495     param.function2 = speed_mpn_mu_bdiv_qr;
1496     param.min_size = 4;
1497     param.max_size = 5000;
1498     param.step_factor = 0.02;
1499     one (&mu_bdiv_qr_threshold, &param);
1500   }
1501   {
1502     static struct param_t  param;
1503     param.name = "MU_BDIV_Q_THRESHOLD";
1504     param.function = speed_mpn_dcpi1_bdiv_q;
1505     param.function2 = speed_mpn_mu_bdiv_q;
1506     param.min_size = 4;
1507     param.max_size = 5000;
1508     param.step_factor = 0.02;
1509     one (&mu_bdiv_q_threshold, &param);
1510   }
1511 }
1512
1513 void
1514 tune_invertappr (void)
1515 {
1516   static struct param_t  param;
1517
1518   param.function = speed_mpn_ni_invertappr;
1519   param.name = "INV_MULMOD_BNM1_THRESHOLD";
1520   param.min_size = 4;
1521   one (&inv_mulmod_bnm1_threshold, &param);
1522
1523   param.function = speed_mpn_invertappr;
1524   param.name = "INV_NEWTON_THRESHOLD";
1525   param.min_size = 3;
1526   one (&inv_newton_threshold, &param);
1527 }
1528
1529 void
1530 tune_invert (void)
1531 {
1532   static struct param_t  param;
1533
1534   param.function = speed_mpn_invert;
1535   param.name = "INV_APPR_THRESHOLD";
1536   param.min_size = 3;
1537   one (&inv_appr_threshold, &param);
1538 }
1539
1540 void
1541 tune_binvert (void)
1542 {
1543   static struct param_t  param;
1544
1545   param.function = speed_mpn_binvert;
1546   param.name = "BINV_NEWTON_THRESHOLD";
1547   param.min_size = 8;           /* pointless with smaller operands */
1548   one (&binv_newton_threshold, &param);
1549 }
1550
1551 void
1552 tune_redc (void)
1553 {
1554 #define TUNE_REDC_2_MAX 100
1555 #if HAVE_NATIVE_mpn_addmul_2 || HAVE_NATIVE_mpn_redc_2
1556 #define WANT_REDC_2 1
1557 #endif
1558
1559 #if WANT_REDC_2
1560   {
1561     static struct param_t  param;
1562     param.name = "REDC_1_TO_REDC_2_THRESHOLD";
1563     param.function = speed_mpn_redc_1;
1564     param.function2 = speed_mpn_redc_2;
1565     param.max_size = TUNE_REDC_2_MAX;
1566     param.noprint = 1;
1567     one (&redc_1_to_redc_2_threshold, &param);
1568   }
1569   {
1570     static struct param_t  param;
1571     param.name = "REDC_2_TO_REDC_N_THRESHOLD";
1572     param.function = speed_mpn_redc_2;
1573     param.function2 = speed_mpn_redc_n;
1574     param.min_size = 16;
1575     param.noprint = 1;
1576     one (&redc_2_to_redc_n_threshold, &param);
1577   }
1578   if (redc_1_to_redc_2_threshold >= TUNE_REDC_2_MAX - 1)
1579     {
1580       /* Disable REDC_2.  This is not supposed to happen.  */
1581       print_define ("REDC_1_TO_REDC_2_THRESHOLD", REDC_2_TO_REDC_N_THRESHOLD);
1582       print_define_remark ("REDC_2_TO_REDC_N_THRESHOLD", 0, "anomaly: never REDC_2");
1583     }
1584   else
1585     {
1586       print_define ("REDC_1_TO_REDC_2_THRESHOLD", REDC_1_TO_REDC_2_THRESHOLD);
1587       print_define ("REDC_2_TO_REDC_N_THRESHOLD", REDC_2_TO_REDC_N_THRESHOLD);
1588     }
1589 #else
1590   {
1591     static struct param_t  param;
1592     param.name = "REDC_1_TO_REDC_N_THRESHOLD";
1593     param.function = speed_mpn_redc_1;
1594     param.function2 = speed_mpn_redc_n;
1595     param.min_size = 16;
1596     one (&redc_1_to_redc_n_threshold, &param);
1597   }
1598 #endif
1599 }
1600
1601 void
1602 tune_matrix22_mul (void)
1603 {
1604   static struct param_t  param;
1605   param.name = "MATRIX22_STRASSEN_THRESHOLD";
1606   param.function = speed_mpn_matrix22_mul;
1607   param.min_size = 2;
1608   one (&matrix22_strassen_threshold, &param);
1609 }
1610
1611 void
1612 tune_hgcd (void)
1613 {
1614   static struct param_t  param;
1615   param.name = "HGCD_THRESHOLD";
1616   param.function = speed_mpn_hgcd;
1617   /* We seem to get strange results for small sizes */
1618   param.min_size = 30;
1619   one (&hgcd_threshold, &param);
1620 }
1621
1622 void
1623 tune_gcd_dc (void)
1624 {
1625   static struct param_t  param;
1626   param.name = "GCD_DC_THRESHOLD";
1627   param.function = speed_mpn_gcd;
1628   param.min_size = hgcd_threshold;
1629   param.max_size = 3000;
1630   param.step_factor = 0.02;
1631   one (&gcd_dc_threshold, &param);
1632 }
1633
1634 void
1635 tune_gcdext_dc (void)
1636 {
1637   static struct param_t  param;
1638   param.name = "GCDEXT_DC_THRESHOLD";
1639   param.function = speed_mpn_gcdext;
1640   param.min_size = hgcd_threshold;
1641   param.max_size = 3000;
1642   param.step_factor = 0.02;
1643   one (&gcdext_dc_threshold, &param);
1644 }
1645
1646
1647 /* size_extra==1 reflects the fact that with high<divisor one division is
1648    always skipped.  Forcing high<divisor while testing ensures consistency
1649    while stepping through sizes, ie. that size-1 divides will be done each
1650    time.
1651
1652    min_size==2 and min_is_always are used so that if plain division is only
1653    better at size==1 then don't bother including that code just for that
1654    case, instead go with preinv always and get a size saving.  */
1655
1656 #define DIV_1_PARAMS                    \
1657   param.check_size = 256;               \
1658   param.min_size = 2;                   \
1659   param.min_is_always = 1;              \
1660   param.data_high = DATA_HIGH_LT_R;     \
1661   param.size_extra = 1;                 \
1662   param.stop_factor = 2.0;
1663
1664
1665 double (*tuned_speed_mpn_divrem_1) __GMP_PROTO ((struct speed_params *));
1666
1667 void
1668 tune_divrem_1 (void)
1669 {
1670   /* plain version by default */
1671   tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1;
1672
1673   /* No support for tuning native assembler code, do that by hand and put
1674      the results in the .asm file, there's no need for such thresholds to
1675      appear in gmp-mparam.h.  */
1676   if (HAVE_NATIVE_mpn_divrem_1)
1677     return;
1678
1679   if (GMP_NAIL_BITS != 0)
1680     {
1681       print_define_remark ("DIVREM_1_NORM_THRESHOLD", MP_SIZE_T_MAX,
1682                            "no preinv with nails");
1683       print_define_remark ("DIVREM_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX,
1684                            "no preinv with nails");
1685       return;
1686     }
1687
1688   if (UDIV_PREINV_ALWAYS)
1689     {
1690       print_define_remark ("DIVREM_1_NORM_THRESHOLD", 0L, "preinv always");
1691       print_define ("DIVREM_1_UNNORM_THRESHOLD", 0L);
1692       return;
1693     }
1694
1695   tuned_speed_mpn_divrem_1 = speed_mpn_divrem_1_tune;
1696
1697   /* Tune for the integer part of mpn_divrem_1.  This will very possibly be
1698      a bit out for the fractional part, but that's too bad, the integer part
1699      is more important. */
1700   {
1701     static struct param_t  param;
1702     param.name = "DIVREM_1_NORM_THRESHOLD";
1703     DIV_1_PARAMS;
1704     s.r = randlimb_norm ();
1705     param.function = speed_mpn_divrem_1_tune;
1706     one (&divrem_1_norm_threshold, &param);
1707   }
1708   {
1709     static struct param_t  param;
1710     param.name = "DIVREM_1_UNNORM_THRESHOLD";
1711     DIV_1_PARAMS;
1712     s.r = randlimb_half ();
1713     param.function = speed_mpn_divrem_1_tune;
1714     one (&divrem_1_unnorm_threshold, &param);
1715   }
1716 }
1717
1718
1719 void
1720 tune_mod_1 (void)
1721 {
1722   /* No support for tuning native assembler code, do that by hand and put
1723      the results in the .asm file, there's no need for such thresholds to
1724      appear in gmp-mparam.h.  */
1725   if (HAVE_NATIVE_mpn_mod_1)
1726     return;
1727
1728   if (GMP_NAIL_BITS != 0)
1729     {
1730       print_define_remark ("MOD_1_NORM_THRESHOLD", MP_SIZE_T_MAX,
1731                            "no preinv with nails");
1732       print_define_remark ("MOD_1_UNNORM_THRESHOLD", MP_SIZE_T_MAX,
1733                            "no preinv with nails");
1734       return;
1735     }
1736
1737   if (UDIV_PREINV_ALWAYS)
1738     {
1739       print_define ("MOD_1_NORM_THRESHOLD", 0L);
1740       print_define ("MOD_1_UNNORM_THRESHOLD", 0L);
1741     }
1742   else
1743     {
1744       {
1745         static struct param_t  param;
1746         param.name = "MOD_1_NORM_THRESHOLD";
1747         DIV_1_PARAMS;
1748         s.r = randlimb_norm ();
1749         param.function = speed_mpn_mod_1_tune;
1750         one (&mod_1_norm_threshold, &param);
1751       }
1752       {
1753         static struct param_t  param;
1754         param.name = "MOD_1_UNNORM_THRESHOLD";
1755         DIV_1_PARAMS;
1756         s.r = randlimb_half ();
1757         param.function = speed_mpn_mod_1_tune;
1758         one (&mod_1_unnorm_threshold, &param);
1759       }
1760     }
1761   {
1762     static struct param_t  param;
1763
1764     param.check_size = 256;
1765
1766     s.r = randlimb_norm ();
1767     param.function = speed_mpn_mod_1_tune;
1768
1769     param.name = "MOD_1N_TO_MOD_1_1_THRESHOLD";
1770     param.min_size = 2;
1771     one (&mod_1n_to_mod_1_1_threshold, &param);
1772   }
1773   {
1774     static struct param_t  param;
1775
1776     param.check_size = 256;
1777
1778     s.r = randlimb_norm () / 5;
1779     param.function = speed_mpn_mod_1_tune;
1780     param.noprint = 1;
1781
1782     param.name = "MOD_1U_TO_MOD_1_1_THRESHOLD";
1783     param.min_size = 2;
1784     one (&mod_1u_to_mod_1_1_threshold, &param);
1785
1786     param.name = "MOD_1_1_TO_MOD_1_2_THRESHOLD";
1787     param.min_size = mod_1u_to_mod_1_1_threshold;
1788     one (&mod_1_1_to_mod_1_2_threshold, &param);
1789
1790     if (mod_1u_to_mod_1_1_threshold + 2 >= mod_1_1_to_mod_1_2_threshold)
1791       {
1792         /* Disable mod_1_1, mod_1_2 is always faster.  Measure when to switch
1793            (from mod_1_unnorm) to mod_1_2.  */
1794         mod_1_1_to_mod_1_2_threshold = 0;
1795
1796         /* This really measures mod_1u -> mod_1_2 */
1797         param.min_size = 1;
1798         one (&mod_1u_to_mod_1_1_threshold, &param);
1799       }
1800     print_define_remark ("MOD_1U_TO_MOD_1_1_THRESHOLD", mod_1u_to_mod_1_1_threshold, NULL);
1801
1802     param.name = "MOD_1_2_TO_MOD_1_4_THRESHOLD";
1803     param.min_size = mod_1_1_to_mod_1_2_threshold;
1804     one (&mod_1_2_to_mod_1_4_threshold, &param);
1805
1806     if (mod_1_1_to_mod_1_2_threshold + 2 >= mod_1_2_to_mod_1_4_threshold)
1807       {
1808         /* Disable mod_1_2, mod_1_4 is always faster.  Measure when to switch
1809            (from mod_1_unnorm or mod_1_1) to mod_1_4.  */
1810         mod_1_2_to_mod_1_4_threshold = 0;
1811
1812         param.min_size = 1;
1813         one (&mod_1_1_to_mod_1_2_threshold, &param);
1814       }
1815     print_define_remark ("MOD_1_1_TO_MOD_1_2_THRESHOLD", mod_1_1_to_mod_1_2_threshold, NULL);
1816     print_define_remark ("MOD_1_2_TO_MOD_1_4_THRESHOLD", mod_1_2_to_mod_1_4_threshold, NULL);
1817   }
1818
1819   {
1820     static struct param_t  param;
1821
1822     param.check_size = 256;
1823
1824     param.name = "PREINV_MOD_1_TO_MOD_1_THRESHOLD";
1825     s.r = randlimb_norm ();
1826     param.function = speed_mpn_preinv_mod_1;
1827     param.function2 = speed_mpn_mod_1_tune;
1828     one (&preinv_mod_1_to_mod_1_threshold, &param);
1829   }
1830 }
1831
1832
1833 /* A non-zero DIVREM_1_UNNORM_THRESHOLD (or DIVREM_1_NORM_THRESHOLD) would
1834    imply that udiv_qrnnd_preinv is worth using, but it seems most
1835    straightforward to compare mpn_preinv_divrem_1 and mpn_divrem_1_div
1836    directly.  */
1837
1838 void
1839 tune_preinv_divrem_1 (void)
1840 {
1841   static struct param_t  param;
1842   speed_function_t  divrem_1;
1843   const char        *divrem_1_name;
1844   double            t1, t2;
1845
1846   if (GMP_NAIL_BITS != 0)
1847     {
1848       print_define_remark ("USE_PREINV_DIVREM_1", 0, "no preinv with nails");
1849       return;
1850     }
1851
1852   /* Any native version of mpn_preinv_divrem_1 is assumed to exist because
1853      it's faster than mpn_divrem_1.  */
1854   if (HAVE_NATIVE_mpn_preinv_divrem_1)
1855     {
1856       print_define_remark ("USE_PREINV_DIVREM_1", 1, "native");
1857       return;
1858     }
1859
1860   /* If udiv_qrnnd_preinv is the only division method then of course
1861      mpn_preinv_divrem_1 should be used.  */
1862   if (UDIV_PREINV_ALWAYS)
1863     {
1864       print_define_remark ("USE_PREINV_DIVREM_1", 1, "preinv always");
1865       return;
1866     }
1867
1868   /* If we've got an assembler version of mpn_divrem_1, then compare against
1869      that, not the mpn_divrem_1_div generic C.  */
1870   if (HAVE_NATIVE_mpn_divrem_1)
1871     {
1872       divrem_1 = speed_mpn_divrem_1;
1873       divrem_1_name = "mpn_divrem_1";
1874     }
1875   else
1876     {
1877       divrem_1 = speed_mpn_divrem_1_div;
1878       divrem_1_name = "mpn_divrem_1_div";
1879     }
1880
1881   param.data_high = DATA_HIGH_LT_R; /* allow skip one division */
1882   s.size = 200;                     /* generous but not too big */
1883   /* Divisor, nonzero.  Unnormalized so as to exercise the shift!=0 case,
1884      since in general that's probably most common, though in fact for a
1885      64-bit limb mp_bases[10].big_base is normalized.  */
1886   s.r = urandom() & (GMP_NUMB_MASK >> 4);
1887   if (s.r == 0) s.r = 123;
1888
1889   t1 = tuneup_measure (speed_mpn_preinv_divrem_1, &param, &s);
1890   t2 = tuneup_measure (divrem_1, &param, &s);
1891   if (t1 == -1.0 || t2 == -1.0)
1892     {
1893       printf ("Oops, can't measure mpn_preinv_divrem_1 and %s at %ld\n",
1894               divrem_1_name, (long) s.size);
1895       abort ();
1896     }
1897   if (option_trace >= 1)
1898     printf ("size=%ld, mpn_preinv_divrem_1 %.9f, %s %.9f\n",
1899             (long) s.size, t1, divrem_1_name, t2);
1900
1901   print_define_remark ("USE_PREINV_DIVREM_1", (mp_size_t) (t1 < t2), NULL);
1902 }
1903
1904
1905
1906 void
1907 tune_divrem_2 (void)
1908 {
1909   static struct param_t  param;
1910
1911   /* No support for tuning native assembler code, do that by hand and put
1912      the results in the .asm file, and there's no need for such thresholds
1913      to appear in gmp-mparam.h.  */
1914   if (HAVE_NATIVE_mpn_divrem_2)
1915     return;
1916
1917   if (GMP_NAIL_BITS != 0)
1918     {
1919       print_define_remark ("DIVREM_2_THRESHOLD", MP_SIZE_T_MAX,
1920                            "no preinv with nails");
1921       return;
1922     }
1923
1924   if (UDIV_PREINV_ALWAYS)
1925     {
1926       print_define_remark ("DIVREM_2_THRESHOLD", 0L, "preinv always");
1927       return;
1928     }
1929
1930   /* Tune for the integer part of mpn_divrem_2.  This will very possibly be
1931      a bit out for the fractional part, but that's too bad, the integer part
1932      is more important.
1933
1934      min_size must be >=2 since nsize>=2 is required, but is set to 4 to save
1935      code space if plain division is better only at size==2 or size==3. */
1936   param.name = "DIVREM_2_THRESHOLD";
1937   param.check_size = 256;
1938   param.min_size = 4;
1939   param.min_is_always = 1;
1940   param.size_extra = 2;      /* does qsize==nsize-2 divisions */
1941   param.stop_factor = 2.0;
1942
1943   s.r = randlimb_norm ();
1944   param.function = speed_mpn_divrem_2;
1945   one (&divrem_2_threshold, &param);
1946 }
1947
1948
1949 /* mpn_divexact_1 is vaguely expected to be used on smallish divisors, so
1950    tune for that.  Its speed can differ on odd or even divisor, so take an
1951    average threshold for the two.
1952
1953    mpn_divrem_1 can vary with high<divisor or not, whereas mpn_divexact_1
1954    might not vary that way, but don't test this since high<divisor isn't
1955    expected to occur often with small divisors.  */
1956
1957 void
1958 tune_divexact_1 (void)
1959 {
1960   static struct param_t  param;
1961   mp_size_t  thresh[2], average;
1962   int        low, i;
1963
1964   /* Any native mpn_divexact_1 is assumed to incorporate all the speed of a
1965      full mpn_divrem_1.  */
1966   if (HAVE_NATIVE_mpn_divexact_1)
1967     {
1968       print_define_remark ("DIVEXACT_1_THRESHOLD", 0, "always (native)");
1969       return;
1970     }
1971
1972   ASSERT_ALWAYS (tuned_speed_mpn_divrem_1 != NULL);
1973
1974   param.name = "DIVEXACT_1_THRESHOLD";
1975   param.data_high = DATA_HIGH_GE_R;
1976   param.check_size = 256;
1977   param.min_size = 2;
1978   param.stop_factor = 1.5;
1979   param.function  = tuned_speed_mpn_divrem_1;
1980   param.function2 = speed_mpn_divexact_1;
1981   param.noprint = 1;
1982
1983   print_define_start (param.name);
1984
1985   for (low = 0; low <= 1; low++)
1986     {
1987       s.r = randlimb_half();
1988       if (low == 0)
1989         s.r |= 1;
1990       else
1991         s.r &= ~CNST_LIMB(7);
1992
1993       one (&thresh[low], &param);
1994       if (option_trace)
1995         printf ("low=%d thresh %ld\n", low, (long) thresh[low]);
1996
1997       if (thresh[low] == MP_SIZE_T_MAX)
1998         {
1999           average = MP_SIZE_T_MAX;
2000           goto divexact_1_done;
2001         }
2002     }
2003
2004   if (option_trace)
2005     {
2006       printf ("average of:");
2007       for (i = 0; i < numberof(thresh); i++)
2008         printf (" %ld", (long) thresh[i]);
2009       printf ("\n");
2010     }
2011
2012   average = 0;
2013   for (i = 0; i < numberof(thresh); i++)
2014     average += thresh[i];
2015   average /= numberof(thresh);
2016
2017   /* If divexact turns out to be better as early as 3 limbs, then use it
2018      always, so as to reduce code size and conditional jumps.  */
2019   if (average <= 3)
2020     average = 0;
2021
2022  divexact_1_done:
2023   print_define_end (param.name, average);
2024 }
2025
2026
2027 /* The generic mpn_modexact_1_odd skips a divide step if high<divisor, the
2028    same as mpn_mod_1, but this might not be true of an assembler
2029    implementation.  The threshold used is an average based on data where a
2030    divide can be skipped and where it can't.
2031
2032    If modexact turns out to be better as early as 3 limbs, then use it
2033    always, so as to reduce code size and conditional jumps.  */
2034
2035 void
2036 tune_modexact_1_odd (void)
2037 {
2038   static struct param_t  param;
2039   mp_size_t  thresh_lt, thresh_ge, average;
2040
2041 #if 0
2042   /* Any native mpn_modexact_1_odd is assumed to incorporate all the speed
2043      of a full mpn_mod_1.  */
2044   if (HAVE_NATIVE_mpn_modexact_1_odd)
2045     {
2046       print_define_remark ("BMOD_1_TO_MOD_1_THRESHOLD", MP_SIZE_T_MAX, "always bmod_1");
2047       return;
2048     }
2049 #endif
2050
2051   param.name = "BMOD_1_TO_MOD_1_THRESHOLD";
2052   param.check_size = 256;
2053   param.min_size = 2;
2054   param.stop_factor = 1.5;
2055   param.function  = speed_mpn_modexact_1c_odd;
2056   param.function2 = speed_mpn_mod_1_tune;
2057   param.noprint = 1;
2058   s.r = randlimb_half () | 1;
2059
2060   print_define_start (param.name);
2061
2062   param.data_high = DATA_HIGH_LT_R;
2063   one (&thresh_lt, &param);
2064   if (option_trace)
2065     printf ("lt thresh %ld\n", (long) thresh_lt);
2066
2067   average = thresh_lt;
2068   if (thresh_lt != MP_SIZE_T_MAX)
2069     {
2070       param.data_high = DATA_HIGH_GE_R;
2071       one (&thresh_ge, &param);
2072       if (option_trace)
2073         printf ("ge thresh %ld\n", (long) thresh_ge);
2074
2075       if (thresh_ge != MP_SIZE_T_MAX)
2076         {
2077           average = (thresh_ge + thresh_lt) / 2;
2078           if (thresh_ge <= 3)
2079             average = 0;
2080         }
2081     }
2082
2083   print_define_end (param.name, average);
2084 }
2085
2086
2087 void
2088 tune_jacobi_base (void)
2089 {
2090   static struct param_t  param;
2091   double   t1, t2, t3;
2092   int      method;
2093
2094   s.size = GMP_LIMB_BITS * 3 / 4;
2095
2096   t1 = tuneup_measure (speed_mpn_jacobi_base_1, &param, &s);
2097   if (option_trace >= 1)
2098     printf ("size=%ld, mpn_jacobi_base_1 %.9f\n", (long) s.size, t1);
2099
2100   t2 = tuneup_measure (speed_mpn_jacobi_base_2, &param, &s);
2101   if (option_trace >= 1)
2102     printf ("size=%ld, mpn_jacobi_base_2 %.9f\n", (long) s.size, t2);
2103
2104   t3 = tuneup_measure (speed_mpn_jacobi_base_3, &param, &s);
2105   if (option_trace >= 1)
2106     printf ("size=%ld, mpn_jacobi_base_3 %.9f\n", (long) s.size, t3);
2107
2108   if (t1 == -1.0 || t2 == -1.0 || t3 == -1.0)
2109     {
2110       printf ("Oops, can't measure all mpn_jacobi_base methods at %ld\n",
2111               (long) s.size);
2112       abort ();
2113     }
2114
2115   if (t1 < t2 && t1 < t3)
2116     method = 1;
2117   else if (t2 < t3)
2118     method = 2;
2119   else
2120     method = 3;
2121
2122   print_define ("JACOBI_BASE_METHOD", method);
2123 }
2124
2125
2126 void
2127 tune_get_str (void)
2128 {
2129   /* Tune for decimal, it being most common.  Some rough testing suggests
2130      other bases are different, but not by very much.  */
2131   s.r = 10;
2132   {
2133     static struct param_t  param;
2134     GET_STR_PRECOMPUTE_THRESHOLD = 0;
2135     param.name = "GET_STR_DC_THRESHOLD";
2136     param.function = speed_mpn_get_str;
2137     param.min_size = 4;
2138     param.max_size = GET_STR_THRESHOLD_LIMIT;
2139     one (&get_str_dc_threshold, &param);
2140   }
2141   {
2142     static struct param_t  param;
2143     param.name = "GET_STR_PRECOMPUTE_THRESHOLD";
2144     param.function = speed_mpn_get_str;
2145     param.min_size = GET_STR_DC_THRESHOLD;
2146     param.max_size = GET_STR_THRESHOLD_LIMIT;
2147     one (&get_str_precompute_threshold, &param);
2148   }
2149 }
2150
2151
2152 double
2153 speed_mpn_pre_set_str (struct speed_params *s)
2154 {
2155   unsigned char *str;
2156   mp_ptr     wp;
2157   mp_size_t  wn;
2158   unsigned   i;
2159   int        base;
2160   double     t;
2161   mp_ptr powtab_mem, tp;
2162   powers_t powtab[GMP_LIMB_BITS];
2163   mp_size_t un;
2164   int chars_per_limb;
2165   TMP_DECL;
2166
2167   SPEED_RESTRICT_COND (s->size >= 1);
2168
2169   base = s->r == 0 ? 10 : s->r;
2170   SPEED_RESTRICT_COND (base >= 2 && base <= 256);
2171
2172   TMP_MARK;
2173
2174   str = TMP_ALLOC (s->size);
2175   for (i = 0; i < s->size; i++)
2176     str[i] = s->xp[i] % base;
2177
2178   wn = ((mp_size_t) (s->size / mp_bases[base].chars_per_bit_exactly))
2179     / GMP_LIMB_BITS + 2;
2180   SPEED_TMP_ALLOC_LIMBS (wp, wn, s->align_wp);
2181
2182   /* use this during development to check wn is big enough */
2183   /*
2184   ASSERT_ALWAYS (mpn_set_str (wp, str, s->size, base) <= wn);
2185   */
2186
2187   speed_operand_src (s, (mp_ptr) str, s->size/BYTES_PER_MP_LIMB);
2188   speed_operand_dst (s, wp, wn);
2189   speed_cache_fill (s);
2190
2191   chars_per_limb = mp_bases[base].chars_per_limb;
2192   un = s->size / chars_per_limb + 1;
2193   powtab_mem = TMP_BALLOC_LIMBS (mpn_dc_set_str_powtab_alloc (un));
2194   mpn_set_str_compute_powtab (powtab, powtab_mem, un, base);
2195   tp = TMP_BALLOC_LIMBS (mpn_dc_set_str_itch (un));
2196
2197   speed_starttime ();
2198   i = s->reps;
2199   do
2200     {
2201       mpn_pre_set_str (wp, str, s->size, powtab, tp);
2202     }
2203   while (--i != 0);
2204   t = speed_endtime ();
2205
2206   TMP_FREE;
2207   return t;
2208 }
2209
2210 void
2211 tune_set_str (void)
2212 {
2213   s.r = 10;  /* decimal */
2214   {
2215     static struct param_t  param;
2216     SET_STR_PRECOMPUTE_THRESHOLD = 0;
2217     param.step_factor = 0.01;
2218     param.name = "SET_STR_DC_THRESHOLD";
2219     param.function = speed_mpn_pre_set_str;
2220     param.min_size = 100;
2221     param.max_size = 50000;
2222     one (&set_str_dc_threshold, &param);
2223   }
2224   {
2225     static struct param_t  param;
2226     param.step_factor = 0.02;
2227     param.name = "SET_STR_PRECOMPUTE_THRESHOLD";
2228     param.function = speed_mpn_set_str;
2229     param.min_size = SET_STR_DC_THRESHOLD;
2230     param.max_size = 100000;
2231     one (&set_str_precompute_threshold, &param);
2232   }
2233 }
2234
2235
2236 void
2237 tune_fft_mul (void)
2238 {
2239   static struct fft_param_t  param;
2240
2241   if (option_fft_max_size == 0)
2242     return;
2243
2244   param.table_name          = "MUL_FFT_TABLE3";
2245   param.threshold_name      = "MUL_FFT_THRESHOLD";
2246   param.p_threshold         = &mul_fft_threshold;
2247   param.modf_threshold_name = "MUL_FFT_MODF_THRESHOLD";
2248   param.p_modf_threshold    = &mul_fft_modf_threshold;
2249   param.first_size          = MUL_TOOM33_THRESHOLD / 2;
2250   param.max_size            = option_fft_max_size;
2251   param.function            = speed_mpn_fft_mul;
2252   param.mul_modf_function   = speed_mpn_mul_fft;
2253   param.mul_function        = speed_mpn_mul_n;
2254   param.sqr = 0;
2255   fft (&param);
2256 }
2257
2258
2259 void
2260 tune_fft_sqr (void)
2261 {
2262   static struct fft_param_t  param;
2263
2264   if (option_fft_max_size == 0)
2265     return;
2266
2267   param.table_name          = "SQR_FFT_TABLE3";
2268   param.threshold_name      = "SQR_FFT_THRESHOLD";
2269   param.p_threshold         = &sqr_fft_threshold;
2270   param.modf_threshold_name = "SQR_FFT_MODF_THRESHOLD";
2271   param.p_modf_threshold    = &sqr_fft_modf_threshold;
2272   param.first_size          = SQR_TOOM3_THRESHOLD / 2;
2273   param.max_size            = option_fft_max_size;
2274   param.function            = speed_mpn_fft_sqr;
2275   param.mul_modf_function   = speed_mpn_mul_fft_sqr;
2276   param.mul_function        = speed_mpn_sqr;
2277   param.sqr = 1;
2278   fft (&param);
2279 }
2280
2281 void
2282 all (void)
2283 {
2284   time_t  start_time, end_time;
2285   TMP_DECL;
2286
2287   TMP_MARK;
2288   SPEED_TMP_ALLOC_LIMBS (s.xp_block, SPEED_BLOCK_SIZE, 0);
2289   SPEED_TMP_ALLOC_LIMBS (s.yp_block, SPEED_BLOCK_SIZE, 0);
2290
2291   mpn_random (s.xp_block, SPEED_BLOCK_SIZE);
2292   mpn_random (s.yp_block, SPEED_BLOCK_SIZE);
2293
2294   fprintf (stderr, "Parameters for %s\n", GMP_MPARAM_H_SUGGEST);
2295
2296   speed_time_init ();
2297   fprintf (stderr, "Using: %s\n", speed_time_string);
2298
2299   fprintf (stderr, "speed_precision %d", speed_precision);
2300   if (speed_unittime == 1.0)
2301     fprintf (stderr, ", speed_unittime 1 cycle");
2302   else
2303     fprintf (stderr, ", speed_unittime %.2e secs", speed_unittime);
2304   if (speed_cycletime == 1.0 || speed_cycletime == 0.0)
2305     fprintf (stderr, ", CPU freq unknown\n");
2306   else
2307     fprintf (stderr, ", CPU freq %.2f MHz\n", 1e-6/speed_cycletime);
2308
2309   fprintf (stderr, "DEFAULT_MAX_SIZE %d, fft_max_size %ld\n",
2310            DEFAULT_MAX_SIZE, (long) option_fft_max_size);
2311   fprintf (stderr, "\n");
2312
2313   time (&start_time);
2314   {
2315     struct tm  *tp;
2316     tp = localtime (&start_time);
2317     printf ("/* Generated by tuneup.c, %d-%02d-%02d, ",
2318             tp->tm_year+1900, tp->tm_mon+1, tp->tm_mday);
2319
2320 #ifdef __GNUC__
2321     /* gcc sub-minor version doesn't seem to come through as a define */
2322     printf ("gcc %d.%d */\n", __GNUC__, __GNUC_MINOR__);
2323 #define PRINTED_COMPILER
2324 #endif
2325 #if defined (__SUNPRO_C)
2326     printf ("Sun C %d.%d */\n", __SUNPRO_C / 0x100, __SUNPRO_C % 0x100);
2327 #define PRINTED_COMPILER
2328 #endif
2329 #if ! defined (__GNUC__) && defined (__sgi) && defined (_COMPILER_VERSION)
2330     /* gcc defines __sgi and _COMPILER_VERSION on irix 6, avoid that */
2331     printf ("MIPSpro C %d.%d.%d */\n",
2332             _COMPILER_VERSION / 100,
2333             _COMPILER_VERSION / 10 % 10,
2334             _COMPILER_VERSION % 10);
2335 #define PRINTED_COMPILER
2336 #endif
2337 #if defined (__DECC) && defined (__DECC_VER)
2338     printf ("DEC C %d */\n", __DECC_VER);
2339 #define PRINTED_COMPILER
2340 #endif
2341 #if ! defined (PRINTED_COMPILER)
2342     printf ("system compiler */\n");
2343 #endif
2344   }
2345   printf ("\n");
2346
2347   tune_divrem_1 ();
2348   tune_mod_1 ();
2349   tune_preinv_divrem_1 ();
2350   tune_divrem_2 ();
2351   tune_divexact_1 ();
2352   tune_modexact_1_odd ();
2353   printf("\n");
2354
2355   tune_mul_n ();
2356   printf("\n");
2357
2358   tune_mul ();
2359   printf("\n");
2360
2361   tune_sqr ();
2362   printf("\n");
2363
2364   tune_mulmod_bnm1 ();
2365   tune_sqrmod_bnm1 ();
2366   printf("\n");
2367
2368   tune_fft_mul ();
2369   printf("\n");
2370
2371   tune_fft_sqr ();
2372   printf ("\n");
2373
2374   tune_mullo ();
2375   printf("\n");
2376
2377   tune_dc_div ();
2378   tune_dc_bdiv ();
2379
2380   printf("\n");
2381   tune_invertappr ();
2382   tune_invert ();
2383   printf("\n");
2384
2385   tune_binvert ();
2386   tune_redc ();
2387   printf("\n");
2388
2389   tune_mu_div ();
2390   tune_mu_bdiv ();
2391   printf("\n");
2392
2393   tune_matrix22_mul ();
2394   tune_hgcd ();
2395   tune_gcd_dc ();
2396   tune_gcdext_dc ();
2397   tune_jacobi_base ();
2398   printf("\n");
2399
2400   tune_get_str ();
2401   tune_set_str ();
2402   printf("\n");
2403
2404   time (&end_time);
2405   printf ("/* Tuneup completed successfully, took %ld seconds */\n",
2406           (long) (end_time - start_time));
2407
2408   TMP_FREE;
2409 }
2410
2411
2412 int
2413 main (int argc, char *argv[])
2414 {
2415   int  opt;
2416
2417   /* Unbuffered so if output is redirected to a file it isn't lost if the
2418      program is killed part way through.  */
2419   setbuf (stdout, NULL);
2420   setbuf (stderr, NULL);
2421
2422   while ((opt = getopt(argc, argv, "f:o:p:t")) != EOF)
2423     {
2424       switch (opt) {
2425       case 'f':
2426         if (optarg[0] == 't')
2427           option_fft_trace = 2;
2428         else
2429           option_fft_max_size = atol (optarg);
2430         break;
2431       case 'o':
2432         speed_option_set (optarg);
2433         break;
2434       case 'p':
2435         speed_precision = atoi (optarg);
2436         break;
2437       case 't':
2438         option_trace++;
2439         break;
2440       case '?':
2441         exit(1);
2442       }
2443     }
2444
2445   all ();
2446   exit (0);
2447 }