gen-psqr.c

   1 /* Generate perfect square testing data.
   2
   3 Copyright 2002, 2003, 2004, 2012 Free Software Foundation, Inc.
   4
   5 This file is part of the GNU MP Library.
   6
   7 The GNU MP Library is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU Lesser General Public License as published by
   9 the Free Software Foundation; either version 3 of the License, or (at your
  10 option) any later version.
  11
  12 The GNU MP Library is distributed in the hope that it will be useful, but
  13 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  14 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  15 License for more details.
  16
  17 You should have received a copy of the GNU Lesser General Public License
  18 along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
  19
  20 #include <stdio.h>
  21 #include <stdlib.h>
  22
  23 #include "bootstrap.c"
  24
  25
  26 /* The aim of this program is to choose either mpn_mod_34lsub1 or mpn_mod_1
  27    (plus a PERFSQR_PP modulus), and generate tables indicating quadratic
  28    residues and non-residues modulo small factors of that modulus.
  29
  30    For the usual 32 or 64 bit cases mpn_mod_34lsub1 gets used.  That
  31    function exists specifically because 2^24-1 and 2^48-1 have nice sets of
  32    prime factors.  For other limb sizes it's considered, but if it doesn't
  33    have good factors then mpn_mod_1 will be used instead.
  34
  35    When mpn_mod_1 is used, the modulus PERFSQR_PP is created from a
  36    selection of small primes, chosen to fill PERFSQR_MOD_BITS of a limb,
  37    with that bit count chosen so (2*GMP_LIMB_BITS)*2^PERFSQR_MOD_BITS <=
  38    GMP_LIMB_MAX, allowing PERFSQR_MOD_IDX in mpn/generic/perfsqr.c to do its
  39    calculation within a single limb.
  40
  41    In either case primes can be combined to make divisors.  The table data
  42    then effectively indicates remainders which are quadratic residues mod
  43    all the primes.  This sort of combining reduces the number of steps
  44    needed after mpn_mod_34lsub1 or mpn_mod_1, saving code size and time.
  45    Nothing is gained or lost in terms of detections, the same total fraction
  46    of non-residues will be identified.
  47
  48    Nothing particularly sophisticated is attempted for combining factors to
  49    make divisors.  This is probably a kind of knapsack problem so it'd be
  50    too hard to attempt anything completely general.  For the usual 32 and 64
  51    bit limbs we get a good enough result just pairing the biggest and
  52    smallest which fit together, repeatedly.
  53
  54    Another aim is to get powerful combinations, ie. divisors which identify
  55    biggest fraction of non-residues, and have those run first.  Again for
  56    the usual 32 and 64 bits it seems good enough just to pair for big
  57    divisors then sort according to the resulting fraction of non-residues
  58    identified.
  59
  60    Also in this program, a table sq_res_0x100 of residues modulo 256 is
  61    generated.  This simply fills bits into limbs of the appropriate
  62    build-time GMP_LIMB_BITS each.
  63
  64 */
  65
  66
  67 /* Normally we aren't using const in gen*.c programs, so as not to have to
  68    bother figuring out if it works, but using it with f_cmp_divisor and
  69    f_cmp_fraction avoids warnings from the qsort calls. */
  70
  71 /* Same tests as gmp.h. */
  72 #if  defined (__STDC__)                                 \
  73   || defined (__cplusplus)                              \
  74   || defined (_AIX)                                     \
  75   || defined (__DECC)                                   \
  76   || (defined (__mips) && defined (_SYSTYPE_SVR4))      \
  77   || defined (_MSC_VER)                                 \
  78   || defined (_WIN32)
  79 #define HAVE_CONST        1
  80 #endif
  81
  82 #if ! HAVE_CONST
  83 #define const
  84 #endif
  85
  86
  87 mpz_t  *sq_res_0x100;          /* table of limbs */
  88 int    nsq_res_0x100;          /* elements in sq_res_0x100 array */
  89 int    sq_res_0x100_num;       /* squares in sq_res_0x100 */
  90 double sq_res_0x100_fraction;  /* sq_res_0x100_num / 256 */
  91
  92 int     mod34_bits;        /* 3*GMP_NUMB_BITS/4 */
  93 int     mod_bits;          /* bits from PERFSQR_MOD_34 or MOD_PP */
  94 int     max_divisor;       /* all divisors <= max_divisor */
  95 int     max_divisor_bits;  /* ceil(log2(max_divisor)) */
  96 double  total_fraction;    /* of squares */
  97 mpz_t   pp;                /* product of primes, or 0 if mod_34lsub1 used */
  98 mpz_t   pp_norm;           /* pp shifted so NUMB high bit set */
  99 mpz_t   pp_inverted;       /* invert_limb style inverse */
 100 mpz_t   mod_mask;          /* 2^mod_bits-1 */
 101 char    mod34_excuse[128]; /* why mod_34lsub1 not used (if it's not) */
 102
 103 /* raw list of divisors of 2^mod34_bits-1 or pp, just to show in a comment */
 104 struct rawfactor_t {
 105   int     divisor;
 106   int     multiplicity;
 107 };
 108 struct rawfactor_t  *rawfactor;
 109 int                 nrawfactor;
 110
 111 /* factors of 2^mod34_bits-1 or pp and associated data, after combining etc */
 112 struct factor_t {
 113   int     divisor;
 114   mpz_t   inverse;   /* 1/divisor mod 2^mod_bits */
 115   mpz_t   mask;      /* indicating squares mod divisor */
 116   double  fraction;  /* squares/total */
 117 };
 118 struct factor_t  *factor;
 119 int              nfactor;       /* entries in use in factor array */
 120 int              factor_alloc;  /* entries allocated to factor array */
 121
 122
 123 int
 124 f_cmp_divisor (const void *parg, const void *qarg)
 125 {
 126   const struct factor_t *p, *q;
 127   p = parg;
 128   q = qarg;
 129   if (p->divisor > q->divisor)
 130     return 1;
 131   else if (p->divisor < q->divisor)
 132     return -1;
 133   else
 134     return 0;
 135 }
 136
 137 int
 138 f_cmp_fraction (const void *parg, const void *qarg)
 139 {
 140   const struct factor_t *p, *q;
 141   p = parg;
 142   q = qarg;
 143   if (p->fraction > q->fraction)
 144     return 1;
 145   else if (p->fraction < q->fraction)
 146     return -1;
 147   else
 148     return 0;
 149 }
 150
 151 /* Remove array[idx] by copying the remainder down, and adjust narray
 152    accordingly.  */
 153 #define COLLAPSE_ELEMENT(array, idx, narray)                    \
 154   do {                                                          \
 155     memmove (&(array)[idx],                                     \
 156              &(array)[idx+1],                                   \
 157              ((narray)-((idx)+1)) * sizeof (array[0]));         \
 158     (narray)--;                                                 \
 159   } while (0)
 160
 161
 162 /* return n*2^p mod m */
 163 int
 164 mul_2exp_mod (int n, int p, int m)
 165 {
 166   int  i;
 167   for (i = 0; i < p; i++)
 168     n = (2 * n) % m;
 169   return n;
 170 }
 171
 172 /* return -n mod m */
 173 int
 174 neg_mod (int n, int m)
 175 {
 176   assert (n >= 0 && n < m);
 177   return (n == 0 ? 0 : m-n);
 178 }
 179
 180 /* Set "mask" to a value such that "mask & (1<<idx)" is non-zero if
 181    "-(idx<<mod_bits)" can be a square modulo m.  */
 182 void
 183 square_mask (mpz_t mask, int m)
 184 {
 185   int    p, i, r, idx;
 186
 187   p = mul_2exp_mod (1, mod_bits, m);
 188   p = neg_mod (p, m);
 189
 190   mpz_set_ui (mask, 0L);
 191   for (i = 0; i < m; i++)
 192     {
 193       r = (i * i) % m;
 194       idx = (r * p) % m;
 195       mpz_setbit (mask, (unsigned long) idx);
 196     }
 197 }
 198
 199 void
 200 generate_sq_res_0x100 (int limb_bits)
 201 {
 202   int  i, res;
 203
 204   nsq_res_0x100 = (0x100 + limb_bits - 1) / limb_bits;
 205   sq_res_0x100 = xmalloc (nsq_res_0x100 * sizeof (*sq_res_0x100));
 206
 207   for (i = 0; i < nsq_res_0x100; i++)
 208     mpz_init_set_ui (sq_res_0x100[i], 0L);
 209
 210   for (i = 0; i < 0x100; i++)
 211     {
 212       res = (i * i) % 0x100;
 213       mpz_setbit (sq_res_0x100[res / limb_bits],
 214                   (unsigned long) (res % limb_bits));
 215     }
 216
 217   sq_res_0x100_num = 0;
 218   for (i = 0; i < nsq_res_0x100; i++)
 219     sq_res_0x100_num += mpz_popcount (sq_res_0x100[i]);
 220   sq_res_0x100_fraction = (double) sq_res_0x100_num / 256.0;
 221 }
 222
 223 void
 224 generate_mod (int limb_bits, int nail_bits)
 225 {
 226   int    numb_bits = limb_bits - nail_bits;
 227   int    i, divisor;
 228
 229   mpz_init_set_ui (pp, 0L);
 230   mpz_init_set_ui (pp_norm, 0L);
 231   mpz_init_set_ui (pp_inverted, 0L);
 232
 233   /* no more than limb_bits many factors in a one limb modulus (and of
 234      course in reality nothing like that many) */
 235   factor_alloc = limb_bits;
 236   factor = xmalloc (factor_alloc * sizeof (*factor));
 237   rawfactor = xmalloc (factor_alloc * sizeof (*rawfactor));
 238
 239   if (numb_bits % 4 != 0)
 240     {
 241       strcpy (mod34_excuse, "GMP_NUMB_BITS % 4 != 0");
 242       goto use_pp;
 243     }
 244
 245   max_divisor = 2*limb_bits;
 246   max_divisor_bits = log2_ceil (max_divisor);
 247
 248   if (numb_bits / 4 < max_divisor_bits)
 249     {
 250       /* Wind back to one limb worth of max_divisor, if that will let us use
 251          mpn_mod_34lsub1.  */
 252       max_divisor = limb_bits;
 253       max_divisor_bits = log2_ceil (max_divisor);
 254
 255       if (numb_bits / 4 < max_divisor_bits)
 256         {
 257           strcpy (mod34_excuse, "GMP_NUMB_BITS / 4 too small");
 258           goto use_pp;
 259         }
 260     }
 261
 262   {
 263     /* Can use mpn_mod_34lsub1, find small factors of 2^mod34_bits-1. */
 264     mpz_t  m, q, r;
 265     int    multiplicity;
 266
 267     mod34_bits = (numb_bits / 4) * 3;
 268
 269     /* mpn_mod_34lsub1 returns a full limb value, PERFSQR_MOD_34 folds it at
 270        the mod34_bits mark, adding the two halves for a remainder of at most
 271        mod34_bits+1 many bits */
 272     mod_bits = mod34_bits + 1;
 273
 274     mpz_init_set_ui (m, 1L);
 275     mpz_mul_2exp (m, m, mod34_bits);
 276     mpz_sub_ui (m, m, 1L);
 277
 278     mpz_init (q);
 279     mpz_init (r);
 280
 281     for (i = 3; i <= max_divisor; i++)
 282       {
 283         if (! isprime (i))
 284           continue;
 285
 286         mpz_tdiv_qr_ui (q, r, m, (unsigned long) i);
 287         if (mpz_sgn (r) != 0)
 288           continue;
 289
 290         /* if a repeated prime is found it's used as an i^n in one factor */
 291         divisor = 1;
 292         multiplicity = 0;
 293         do
 294           {
 295             if (divisor > max_divisor / i)
 296               break;
 297             multiplicity++;
 298             mpz_set (m, q);
 299             mpz_tdiv_qr_ui (q, r, m, (unsigned long) i);
 300           }
 301         while (mpz_sgn (r) == 0);
 302
 303         assert (nrawfactor < factor_alloc);
 304         rawfactor[nrawfactor].divisor = i;
 305         rawfactor[nrawfactor].multiplicity = multiplicity;
 306         nrawfactor++;
 307       }
 308
 309     mpz_clear (m);
 310     mpz_clear (q);
 311     mpz_clear (r);
 312   }
 313
 314   if (nrawfactor <= 2)
 315     {
 316       mpz_t  new_pp;
 317
 318       sprintf (mod34_excuse, "only %d small factor%s",
 319                nrawfactor, nrawfactor == 1 ? "" : "s");
 320
 321     use_pp:
 322       /* reset to two limbs of max_divisor, in case the mpn_mod_34lsub1 code
 323          tried with just one */
 324       max_divisor = 2*limb_bits;
 325       max_divisor_bits = log2_ceil (max_divisor);
 326
 327       mpz_init (new_pp);
 328       nrawfactor = 0;
 329       mod_bits = MIN (numb_bits, limb_bits - max_divisor_bits);
 330
 331       /* one copy of each small prime */
 332       mpz_set_ui (pp, 1L);
 333       for (i = 3; i <= max_divisor; i++)
 334         {
 335           if (! isprime (i))
 336             continue;
 337
 338           mpz_mul_ui (new_pp, pp, (unsigned long) i);
 339           if (mpz_sizeinbase (new_pp, 2) > mod_bits)
 340             break;
 341           mpz_set (pp, new_pp);
 342
 343           assert (nrawfactor < factor_alloc);
 344           rawfactor[nrawfactor].divisor = i;
 345           rawfactor[nrawfactor].multiplicity = 1;
 346           nrawfactor++;
 347         }
 348
 349       /* Plus an extra copy of one or more of the primes selected, if that
 350          still fits in max_divisor and the total in mod_bits.  Usually only
 351          3 or 5 will be candidates */
 352       for (i = nrawfactor-1; i >= 0; i--)
 353         {
 354           if (rawfactor[i].divisor > max_divisor / rawfactor[i].divisor)
 355             continue;
 356           mpz_mul_ui (new_pp, pp, (unsigned long) rawfactor[i].divisor);
 357           if (mpz_sizeinbase (new_pp, 2) > mod_bits)
 358             continue;
 359           mpz_set (pp, new_pp);
 360
 361           rawfactor[i].multiplicity++;
 362         }
 363
 364       mod_bits = mpz_sizeinbase (pp, 2);
 365
 366       mpz_set (pp_norm, pp);
 367       while (mpz_sizeinbase (pp_norm, 2) < numb_bits)
 368         mpz_add (pp_norm, pp_norm, pp_norm);
 369
 370       mpz_preinv_invert (pp_inverted, pp_norm, numb_bits);
 371
 372       mpz_clear (new_pp);
 373     }
 374
 375   /* start the factor array */
 376   for (i = 0; i < nrawfactor; i++)
 377     {
 378       int  j;
 379       assert (nfactor < factor_alloc);
 380       factor[nfactor].divisor = 1;
 381       for (j = 0; j < rawfactor[i].multiplicity; j++)
 382         factor[nfactor].divisor *= rawfactor[i].divisor;
 383       nfactor++;
 384     }
 385
 386  combine:
 387   /* Combine entries in the factor array.  Combine the smallest entry with
 388      the biggest one that will fit with it (ie. under max_divisor), then
 389      repeat that with the new smallest entry. */
 390   qsort (factor, nfactor, sizeof (factor[0]), f_cmp_divisor);
 391   for (i = nfactor-1; i >= 1; i--)
 392     {
 393       if (factor[i].divisor <= max_divisor / factor[0].divisor)
 394         {
 395           factor[0].divisor *= factor[i].divisor;
 396           COLLAPSE_ELEMENT (factor, i, nfactor);
 397           goto combine;
 398         }
 399     }
 400
 401   total_fraction = 1.0;
 402   for (i = 0; i < nfactor; i++)
 403     {
 404       mpz_init (factor[i].inverse);
 405       mpz_invert_ui_2exp (factor[i].inverse,
 406                           (unsigned long) factor[i].divisor,
 407                           (unsigned long) mod_bits);
 408
 409       mpz_init (factor[i].mask);
 410       square_mask (factor[i].mask, factor[i].divisor);
 411
 412       /* fraction of possible squares */
 413       factor[i].fraction = (double) mpz_popcount (factor[i].mask)
 414         / factor[i].divisor;
 415
 416       /* total fraction of possible squares */
 417       total_fraction *= factor[i].fraction;
 418     }
 419
 420   /* best tests first (ie. smallest fraction) */
 421   qsort (factor, nfactor, sizeof (factor[0]), f_cmp_fraction);
 422 }
 423
 424 void
 425 print (int limb_bits, int nail_bits)
 426 {
 427   int    i;
 428   mpz_t  mhi, mlo;
 429
 430   printf ("/* This file generated by gen-psqr.c - DO NOT EDIT. */\n");
 431   printf ("\n");
 432
 433   printf ("#if GMP_LIMB_BITS != %d || GMP_NAIL_BITS != %d\n",
 434           limb_bits, nail_bits);
 435   printf ("Error, error, this data is for %d bit limb and %d bit nail\n",
 436           limb_bits, nail_bits);
 437   printf ("#endif\n");
 438   printf ("\n");
 439
 440   printf ("/* Non-zero bit indicates a quadratic residue mod 0x100.\n");
 441   printf ("   This test identifies %.2f%% as non-squares (%d/256). */\n",
 442           (1.0 - sq_res_0x100_fraction) * 100.0,
 443           0x100 - sq_res_0x100_num);
 444   printf ("static const mp_limb_t\n");
 445   printf ("sq_res_0x100[%d] = {\n", nsq_res_0x100);
 446   for (i = 0; i < nsq_res_0x100; i++)
 447     {
 448       printf ("  CNST_LIMB(0x");
 449       mpz_out_str (stdout, 16, sq_res_0x100[i]);
 450       printf ("),\n");
 451     }
 452   printf ("};\n");
 453   printf ("\n");
 454
 455   if (mpz_sgn (pp) != 0)
 456     {
 457       printf ("/* mpn_mod_34lsub1 not used due to %s */\n", mod34_excuse);
 458       printf ("/* PERFSQR_PP = ");
 459     }
 460   else
 461     printf ("/* 2^%d-1 = ", mod34_bits);
 462   for (i = 0; i < nrawfactor; i++)
 463     {
 464       if (i != 0)
 465         printf (" * ");
 466       printf ("%d", rawfactor[i].divisor);
 467       if (rawfactor[i].multiplicity != 1)
 468         printf ("^%d", rawfactor[i].multiplicity);
 469     }
 470   printf (" %s*/\n", mpz_sgn (pp) == 0 ? "... " : "");
 471
 472   printf ("#define PERFSQR_MOD_BITS  %d\n", mod_bits);
 473   if (mpz_sgn (pp) != 0)
 474     {
 475       printf ("#define PERFSQR_PP            CNST_LIMB(0x");
 476       mpz_out_str (stdout, 16, pp);
 477       printf (")\n");
 478       printf ("#define PERFSQR_PP_NORM       CNST_LIMB(0x");
 479       mpz_out_str (stdout, 16, pp_norm);
 480       printf (")\n");
 481       printf ("#define PERFSQR_PP_INVERTED   CNST_LIMB(0x");
 482       mpz_out_str (stdout, 16, pp_inverted);
 483       printf (")\n");
 484     }
 485   printf ("\n");
 486
 487   mpz_init (mhi);
 488   mpz_init (mlo);
 489
 490   printf ("/* This test identifies %.2f%% as non-squares. */\n",
 491           (1.0 - total_fraction) * 100.0);
 492   printf ("#define PERFSQR_MOD_TEST(up, usize) \\\n");
 493   printf ("  do {                              \\\n");
 494   printf ("    mp_limb_t  r;                   \\\n");
 495   if (mpz_sgn (pp) != 0)
 496     printf ("    PERFSQR_MOD_PP (r, up, usize);  \\\n");
 497   else
 498     printf ("    PERFSQR_MOD_34 (r, up, usize);  \\\n");
 499
 500   for (i = 0; i < nfactor; i++)
 501     {
 502       printf ("                                    \\\n");
 503       printf ("    /* %5.2f%% */                    \\\n",
 504               (1.0 - factor[i].fraction) * 100.0);
 505
 506       printf ("    PERFSQR_MOD_%d (r, CNST_LIMB(%2d), CNST_LIMB(0x",
 507               factor[i].divisor <= limb_bits ? 1 : 2,
 508               factor[i].divisor);
 509       mpz_out_str (stdout, 16, factor[i].inverse);
 510       printf ("), \\\n");
 511       printf ("                   CNST_LIMB(0x");
 512
 513       if ( factor[i].divisor <= limb_bits)
 514         {
 515           mpz_out_str (stdout, 16, factor[i].mask);
 516         }
 517       else
 518         {
 519           mpz_tdiv_r_2exp (mlo, factor[i].mask, (unsigned long) limb_bits);
 520           mpz_tdiv_q_2exp (mhi, factor[i].mask, (unsigned long) limb_bits);
 521           mpz_out_str (stdout, 16, mhi);
 522           printf ("), CNST_LIMB(0x");
 523           mpz_out_str (stdout, 16, mlo);
 524         }
 525       printf (")); \\\n");
 526     }
 527
 528   printf ("  } while (0)\n");
 529   printf ("\n");
 530
 531   printf ("/* Grand total sq_res_0x100 and PERFSQR_MOD_TEST, %.2f%% non-squares. */\n",
 532           (1.0 - (total_fraction * 44.0/256.0)) * 100.0);
 533   printf ("\n");
 534
 535   printf ("/* helper for tests/mpz/t-perfsqr.c */\n");
 536   printf ("#define PERFSQR_DIVISORS  { 256,");
 537   for (i = 0; i < nfactor; i++)
 538       printf (" %d,", factor[i].divisor);
 539   printf (" }\n");
 540
 541
 542   mpz_clear (mhi);
 543   mpz_clear (mlo);
 544 }
 545
 546 int
 547 main (int argc, char *argv[])
 548 {
 549   int  limb_bits, nail_bits;
 550
 551   if (argc != 3)
 552     {
 553       fprintf (stderr, "Usage: gen-psqr <limbbits> <nailbits>\n");
 554       exit (1);
 555     }
 556
 557   limb_bits = atoi (argv[1]);
 558   nail_bits = atoi (argv[2]);
 559
 560   if (limb_bits <= 0
 561       || nail_bits < 0
 562       || nail_bits >= limb_bits)
 563     {
 564       fprintf (stderr, "Invalid limb/nail bits: %d %d\n",
 565                limb_bits, nail_bits);
 566       exit (1);
 567     }
 568
 569   generate_sq_res_0x100 (limb_bits);
 570   generate_mod (limb_bits, nail_bits);
 571
 572   print (limb_bits, nail_bits);
 573
 574   return 0;
 575 }