gen-psqr.c

   1 /* Generate perfect square testing data.
   2
   3 Copyright 2002, 2003, 2004 Free Software Foundation, Inc.
   4
   5 This file is part of the GNU MP Library.
   6
   7 The GNU MP Library is free software; you can redistribute it and/or modify
   8 it under the terms of the GNU Lesser General Public License as published by
   9 the Free Software Foundation; either version 2.1 of the License, or (at your
  10 option) any later version.
  11
  12 The GNU MP Library is distributed in the hope that it will be useful, but
  13 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  14 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  15 License for more details.
  16
  17 You should have received a copy of the GNU Lesser General Public License
  18 along with the GNU MP Library; see the file COPYING.LIB.  If not, write to
  19 the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  20 MA 02110-1301, USA. */
  21
  22 #include <stdio.h>
  23 #include <stdlib.h>
  24
  25 #include "dumbmp.c"
  26
  27
  28 /* The aim of this program is to choose either mpn_mod_34lsub1 or mpn_mod_1
  29    (plus a PERFSQR_PP modulus), and generate tables indicating quadratic
  30    residues and non-residues modulo small factors of that modulus.
  31
  32    For the usual 32 or 64 bit cases mpn_mod_34lsub1 gets used.  That
  33    function exists specifically because 2^24-1 and 2^48-1 have nice sets of
  34    prime factors.  For other limb sizes it's considered, but if it doesn't
  35    have good factors then mpn_mod_1 will be used instead.
  36
  37    When mpn_mod_1 is used, the modulus PERFSQR_PP is created from a
  38    selection of small primes, chosen to fill PERFSQR_MOD_BITS of a limb,
  39    with that bit count chosen so (2*GMP_LIMB_BITS)*2^PERFSQR_MOD_BITS <=
  40    GMP_LIMB_MAX, allowing PERFSQR_MOD_IDX in mpn/generic/perfsqr.c to do its
  41    calculation within a single limb.
  42
  43    In either case primes can be combined to make divisors.  The table data
  44    then effectively indicates remainders which are quadratic residues mod
  45    all the primes.  This sort of combining reduces the number of steps
  46    needed after mpn_mod_34lsub1 or mpn_mod_1, saving code size and time.
  47    Nothing is gained or lost in terms of detections, the same total fraction
  48    of non-residues will be identified.
  49
  50    Nothing particularly sophisticated is attempted for combining factors to
  51    make divisors.  This is probably a kind of knapsack problem so it'd be
  52    too hard to attempt anything completely general.  For the usual 32 and 64
  53    bit limbs we get a good enough result just pairing the biggest and
  54    smallest which fit together, repeatedly.
  55
  56    Another aim is to get powerful combinations, ie. divisors which identify
  57    biggest fraction of non-residues, and have those run first.  Again for
  58    the usual 32 and 64 bits it seems good enough just to pair for big
  59    divisors then sort according to the resulting fraction of non-residues
  60    identified.
  61
  62    Also in this program, a table sq_res_0x100 of residues modulo 256 is
  63    generated.  This simply fills bits into limbs of the appropriate
  64    build-time GMP_LIMB_BITS each.
  65
  66 */
  67
  68
  69 /* Normally we aren't using const in gen*.c programs, so as not to have to
  70    bother figuring out if it works, but using it with f_cmp_divisor and
  71    f_cmp_fraction avoids warnings from the qsort calls. */
  72
  73 /* Same tests as gmp.h. */
  74 #if  defined (__STDC__)                                 \
  75   || defined (__cplusplus)                              \
  76   || defined (_AIX)                                     \
  77   || defined (__DECC)                                   \
  78   || (defined (__mips) && defined (_SYSTYPE_SVR4))      \
  79   || defined (_MSC_VER)                                 \
  80   || defined (_WIN32)
  81 #define HAVE_CONST        1
  82 #endif
  83
  84 #if ! HAVE_CONST
  85 #define const
  86 #endif
  87
  88
  89 mpz_t  *sq_res_0x100;          /* table of limbs */
  90 int    nsq_res_0x100;          /* elements in sq_res_0x100 array */
  91 int    sq_res_0x100_num;       /* squares in sq_res_0x100 */
  92 double sq_res_0x100_fraction;  /* sq_res_0x100_num / 256 */
  93
  94 int     mod34_bits;        /* 3*GMP_NUMB_BITS/4 */
  95 int     mod_bits;          /* bits from PERFSQR_MOD_34 or MOD_PP */
  96 int     max_divisor;       /* all divisors <= max_divisor */
  97 int     max_divisor_bits;  /* ceil(log2(max_divisor)) */
  98 double  total_fraction;    /* of squares */
  99 mpz_t   pp;                /* product of primes, or 0 if mod_34lsub1 used */
 100 mpz_t   pp_norm;           /* pp shifted so NUMB high bit set */
 101 mpz_t   pp_inverted;       /* invert_limb style inverse */
 102 mpz_t   mod_mask;          /* 2^mod_bits-1 */
 103 char    mod34_excuse[128]; /* why mod_34lsub1 not used (if it's not) */
 104
 105 /* raw list of divisors of 2^mod34_bits-1 or pp, just to show in a comment */
 106 struct rawfactor_t {
 107   int     divisor;
 108   int     multiplicity;
 109 };
 110 struct rawfactor_t  *rawfactor;
 111 int                 nrawfactor;
 112
 113 /* factors of 2^mod34_bits-1 or pp and associated data, after combining etc */
 114 struct factor_t {
 115   int     divisor;
 116   mpz_t   inverse;   /* 1/divisor mod 2^mod_bits */
 117   mpz_t   mask;      /* indicating squares mod divisor */
 118   double  fraction;  /* squares/total */
 119 };
 120 struct factor_t  *factor;
 121 int              nfactor;       /* entries in use in factor array */
 122 int              factor_alloc;  /* entries allocated to factor array */
 123
 124
 125 int
 126 f_cmp_divisor (const void *parg, const void *qarg)
 127 {
 128   const struct factor_t *p, *q;
 129   p = parg;
 130   q = qarg;
 131   if (p->divisor > q->divisor)
 132     return 1;
 133   else if (p->divisor < q->divisor)
 134     return -1;
 135   else
 136     return 0;
 137 }
 138
 139 int
 140 f_cmp_fraction (const void *parg, const void *qarg)
 141 {
 142   const struct factor_t *p, *q;
 143   p = parg;
 144   q = qarg;
 145   if (p->fraction > q->fraction)
 146     return 1;
 147   else if (p->fraction < q->fraction)
 148     return -1;
 149   else
 150     return 0;
 151 }
 152
 153 /* Remove array[idx] by copying the remainder down, and adjust narray
 154    accordingly.  */
 155 #define COLLAPSE_ELEMENT(array, idx, narray)                    \
 156   do {                                                          \
 157     mem_copyi ((char *) &(array)[idx],                          \
 158                (char *) &(array)[idx+1],                        \
 159                ((narray)-((idx)+1)) * sizeof (array[0]));       \
 160     (narray)--;                                                 \
 161   } while (0)
 162
 163
 164 /* return n*2^p mod m */
 165 int
 166 mul_2exp_mod (int n, int p, int m)
 167 {
 168   int  i;
 169   for (i = 0; i < p; i++)
 170     n = (2 * n) % m;
 171   return n;
 172 }
 173
 174 /* return -n mod m */
 175 int
 176 neg_mod (int n, int m)
 177 {
 178   ASSERT (n >= 0 && n < m);
 179   return (n == 0 ? 0 : m-n);
 180 }
 181
 182 /* Set "mask" to a value such that "mask & (1<<idx)" is non-zero if
 183    "-(idx<<mod_bits)" can be a square modulo m.  */
 184 void
 185 square_mask (mpz_t mask, int m)
 186 {
 187   int    p, i, r, idx;
 188
 189   p = mul_2exp_mod (1, mod_bits, m);
 190   p = neg_mod (p, m);
 191
 192   mpz_set_ui (mask, 0L);
 193   for (i = 0; i < m; i++)
 194     {
 195       r = (i * i) % m;
 196       idx = (r * p) % m;
 197       mpz_setbit (mask, (unsigned long) idx);
 198     }
 199 }
 200
 201 void
 202 generate_sq_res_0x100 (int limb_bits)
 203 {
 204   int  i, res;
 205
 206   nsq_res_0x100 = (0x100 + limb_bits - 1) / limb_bits;
 207   sq_res_0x100 = (mpz_t *) xmalloc (nsq_res_0x100 * sizeof (*sq_res_0x100));
 208
 209   for (i = 0; i < nsq_res_0x100; i++)
 210     mpz_init_set_ui (sq_res_0x100[i], 0L);
 211
 212   for (i = 0; i < 0x100; i++)
 213     {
 214       res = (i * i) % 0x100;
 215       mpz_setbit (sq_res_0x100[res / limb_bits],
 216                   (unsigned long) (res % limb_bits));
 217     }
 218
 219   sq_res_0x100_num = 0;
 220   for (i = 0; i < nsq_res_0x100; i++)
 221     sq_res_0x100_num += mpz_popcount (sq_res_0x100[i]);
 222   sq_res_0x100_fraction = (double) sq_res_0x100_num / 256.0;
 223 }
 224
 225 void
 226 generate_mod (int limb_bits, int nail_bits)
 227 {
 228   int    numb_bits = limb_bits - nail_bits;
 229   int    i, divisor;
 230
 231   mpz_init_set_ui (pp, 0L);
 232   mpz_init_set_ui (pp_norm, 0L);
 233   mpz_init_set_ui (pp_inverted, 0L);
 234
 235   /* no more than limb_bits many factors in a one limb modulus (and of
 236      course in reality nothing like that many) */
 237   factor_alloc = limb_bits;
 238   factor = (struct factor_t *) xmalloc (factor_alloc * sizeof (*factor));
 239   rawfactor = (struct rawfactor_t *)
 240     xmalloc (factor_alloc * sizeof (*rawfactor));
 241
 242   if (numb_bits % 4 != 0)
 243     {
 244       strcpy (mod34_excuse, "GMP_NUMB_BITS % 4 != 0");
 245       goto use_pp;
 246     }
 247
 248   max_divisor = 2*limb_bits;
 249   max_divisor_bits = log2_ceil (max_divisor);
 250
 251   if (numb_bits / 4 < max_divisor_bits)
 252     {
 253       /* Wind back to one limb worth of max_divisor, if that will let us use
 254          mpn_mod_34lsub1.  */
 255       max_divisor = limb_bits;
 256       max_divisor_bits = log2_ceil (max_divisor);
 257
 258       if (numb_bits / 4 < max_divisor_bits)
 259         {
 260           strcpy (mod34_excuse, "GMP_NUMB_BITS / 4 too small");
 261           goto use_pp;
 262         }
 263     }
 264
 265   {
 266     /* Can use mpn_mod_34lsub1, find small factors of 2^mod34_bits-1. */
 267     mpz_t  m, q, r;
 268     int    multiplicity;
 269
 270     mod34_bits = (numb_bits / 4) * 3;
 271
 272     /* mpn_mod_34lsub1 returns a full limb value, PERFSQR_MOD_34 folds it at
 273        the mod34_bits mark, adding the two halves for a remainder of at most
 274        mod34_bits+1 many bits */
 275     mod_bits = mod34_bits + 1;
 276
 277     mpz_init_set_ui (m, 1L);
 278     mpz_mul_2exp (m, m, mod34_bits);
 279     mpz_sub_ui (m, m, 1L);
 280
 281     mpz_init (q);
 282     mpz_init (r);
 283
 284     for (i = 3; i <= max_divisor; i++)
 285       {
 286         if (! isprime (i))
 287           continue;
 288
 289         mpz_tdiv_qr_ui (q, r, m, (unsigned long) i);
 290         if (mpz_sgn (r) != 0)
 291           continue;
 292
 293         /* if a repeated prime is found it's used as an i^n in one factor */
 294         divisor = 1;
 295         multiplicity = 0;
 296         do
 297           {
 298             if (divisor > max_divisor / i)
 299               break;
 300             multiplicity++;
 301             mpz_set (m, q);
 302             mpz_tdiv_qr_ui (q, r, m, (unsigned long) i);
 303           }
 304         while (mpz_sgn (r) == 0);
 305
 306         ASSERT (nrawfactor < factor_alloc);
 307         rawfactor[nrawfactor].divisor = i;
 308         rawfactor[nrawfactor].multiplicity = multiplicity;
 309         nrawfactor++;
 310       }
 311
 312     mpz_clear (m);
 313     mpz_clear (q);
 314     mpz_clear (r);
 315   }
 316
 317   if (nrawfactor <= 2)
 318     {
 319       mpz_t  new_pp;
 320
 321       sprintf (mod34_excuse, "only %d small factor%s",
 322                nrawfactor, nrawfactor == 1 ? "" : "s");
 323
 324     use_pp:
 325       /* reset to two limbs of max_divisor, in case the mpn_mod_34lsub1 code
 326          tried with just one */
 327       max_divisor = 2*limb_bits;
 328       max_divisor_bits = log2_ceil (max_divisor);
 329
 330       mpz_init (new_pp);
 331       nrawfactor = 0;
 332       mod_bits = MIN (numb_bits, limb_bits - max_divisor_bits);
 333
 334       /* one copy of each small prime */
 335       mpz_set_ui (pp, 1L);
 336       for (i = 3; i <= max_divisor; i++)
 337         {
 338           if (! isprime (i))
 339             continue;
 340
 341           mpz_mul_ui (new_pp, pp, (unsigned long) i);
 342           if (mpz_sizeinbase (new_pp, 2) > mod_bits)
 343             break;
 344           mpz_set (pp, new_pp);
 345
 346           ASSERT (nrawfactor < factor_alloc);
 347           rawfactor[nrawfactor].divisor = i;
 348           rawfactor[nrawfactor].multiplicity = 1;
 349           nrawfactor++;
 350         }
 351
 352       /* Plus an extra copy of one or more of the primes selected, if that
 353          still fits in max_divisor and the total in mod_bits.  Usually only
 354          3 or 5 will be candidates */
 355       for (i = nrawfactor-1; i >= 0; i--)
 356         {
 357           if (rawfactor[i].divisor > max_divisor / rawfactor[i].divisor)
 358             continue;
 359           mpz_mul_ui (new_pp, pp, (unsigned long) rawfactor[i].divisor);
 360           if (mpz_sizeinbase (new_pp, 2) > mod_bits)
 361             continue;
 362           mpz_set (pp, new_pp);
 363
 364           rawfactor[i].multiplicity++;
 365         }
 366
 367       mod_bits = mpz_sizeinbase (pp, 2);
 368
 369       mpz_set (pp_norm, pp);
 370       while (mpz_sizeinbase (pp_norm, 2) < numb_bits)
 371         mpz_add (pp_norm, pp_norm, pp_norm);
 372
 373       mpz_preinv_invert (pp_inverted, pp_norm, numb_bits);
 374
 375       mpz_clear (new_pp);
 376     }
 377
 378   /* start the factor array */
 379   for (i = 0; i < nrawfactor; i++)
 380     {
 381       int  j;
 382       ASSERT (nfactor < factor_alloc);
 383       factor[nfactor].divisor = 1;
 384       for (j = 0; j < rawfactor[i].multiplicity; j++)
 385         factor[nfactor].divisor *= rawfactor[i].divisor;
 386       nfactor++;
 387     }
 388
 389  combine:
 390   /* Combine entries in the factor array.  Combine the smallest entry with
 391      the biggest one that will fit with it (ie. under max_divisor), then
 392      repeat that with the new smallest entry. */
 393   qsort (factor, nfactor, sizeof (factor[0]), f_cmp_divisor);
 394   for (i = nfactor-1; i >= 1; i--)
 395     {
 396       if (factor[i].divisor <= max_divisor / factor[0].divisor)
 397         {
 398           factor[0].divisor *= factor[i].divisor;
 399           COLLAPSE_ELEMENT (factor, i, nfactor);
 400           goto combine;
 401         }
 402     }
 403
 404   total_fraction = 1.0;
 405   for (i = 0; i < nfactor; i++)
 406     {
 407       mpz_init (factor[i].inverse);
 408       mpz_invert_ui_2exp (factor[i].inverse,
 409                           (unsigned long) factor[i].divisor,
 410                           (unsigned long) mod_bits);
 411
 412       mpz_init (factor[i].mask);
 413       square_mask (factor[i].mask, factor[i].divisor);
 414
 415       /* fraction of possible squares */
 416       factor[i].fraction = (double) mpz_popcount (factor[i].mask)
 417         / factor[i].divisor;
 418
 419       /* total fraction of possible squares */
 420       total_fraction *= factor[i].fraction;
 421     }
 422
 423   /* best tests first (ie. smallest fraction) */
 424   qsort (factor, nfactor, sizeof (factor[0]), f_cmp_fraction);
 425 }
 426
 427 void
 428 print (int limb_bits, int nail_bits)
 429 {
 430   int    i;
 431   mpz_t  mhi, mlo;
 432
 433   printf ("/* This file generated by gen-psqr.c - DO NOT EDIT. */\n");
 434   printf ("\n");
 435
 436   printf ("#if GMP_LIMB_BITS != %d || GMP_NAIL_BITS != %d\n",
 437           limb_bits, nail_bits);
 438   printf ("Error, error, this data is for %d bit limb and %d bit nail\n",
 439           limb_bits, nail_bits);
 440   printf ("#endif\n");
 441   printf ("\n");
 442
 443   printf ("/* Non-zero bit indicates a quadratic residue mod 0x100.\n");
 444   printf ("   This test identifies %.2f%% as non-squares (%d/256). */\n",
 445           (1.0 - sq_res_0x100_fraction) * 100.0,
 446           0x100 - sq_res_0x100_num);
 447   printf ("static const mp_limb_t\n");
 448   printf ("sq_res_0x100[%d] = {\n", nsq_res_0x100);
 449   for (i = 0; i < nsq_res_0x100; i++)
 450     {
 451       printf ("  CNST_LIMB(0x");
 452       mpz_out_str (stdout, 16, sq_res_0x100[i]);
 453       printf ("),\n");
 454     }
 455   printf ("};\n");
 456   printf ("\n");
 457
 458   if (mpz_sgn (pp) != 0)
 459     {
 460       printf ("/* mpn_mod_34lsub1 not used due to %s */\n", mod34_excuse);
 461       printf ("/* PERFSQR_PP = ");
 462     }
 463   else
 464     printf ("/* 2^%d-1 = ", mod34_bits);
 465   for (i = 0; i < nrawfactor; i++)
 466     {
 467       if (i != 0)
 468         printf (" * ");
 469       printf ("%d", rawfactor[i].divisor);
 470       if (rawfactor[i].multiplicity != 1)
 471         printf ("^%d", rawfactor[i].multiplicity);
 472     }
 473   printf (" %s*/\n", mpz_sgn (pp) == 0 ? "... " : "");
 474
 475   printf ("#define PERFSQR_MOD_BITS  %d\n", mod_bits);
 476   if (mpz_sgn (pp) != 0)
 477     {
 478       printf ("#define PERFSQR_PP            CNST_LIMB(0x");
 479       mpz_out_str (stdout, 16, pp);
 480       printf (")\n");
 481       printf ("#define PERFSQR_PP_NORM       CNST_LIMB(0x");
 482       mpz_out_str (stdout, 16, pp_norm);
 483       printf (")\n");
 484       printf ("#define PERFSQR_PP_INVERTED   CNST_LIMB(0x");
 485       mpz_out_str (stdout, 16, pp_inverted);
 486       printf (")\n");
 487     }
 488   printf ("\n");
 489
 490   mpz_init (mhi);
 491   mpz_init (mlo);
 492
 493   printf ("/* This test identifies %.2f%% as non-squares. */\n",
 494           (1.0 - total_fraction) * 100.0);
 495   printf ("#define PERFSQR_MOD_TEST(up, usize) \\\n");
 496   printf ("  do {                              \\\n");
 497   printf ("    mp_limb_t  r;                   \\\n");
 498   if (mpz_sgn (pp) != 0)
 499     printf ("    PERFSQR_MOD_PP (r, up, usize);  \\\n");
 500   else
 501     printf ("    PERFSQR_MOD_34 (r, up, usize);  \\\n");
 502
 503   for (i = 0; i < nfactor; i++)
 504     {
 505       printf ("                                    \\\n");
 506       printf ("    /* %5.2f%% */                    \\\n",
 507               (1.0 - factor[i].fraction) * 100.0);
 508
 509       printf ("    PERFSQR_MOD_%d (r, CNST_LIMB(%2d), CNST_LIMB(0x",
 510               factor[i].divisor <= limb_bits ? 1 : 2,
 511               factor[i].divisor);
 512       mpz_out_str (stdout, 16, factor[i].inverse);
 513       printf ("), \\\n");
 514       printf ("                   CNST_LIMB(0x");
 515
 516       if ( factor[i].divisor <= limb_bits)
 517         {
 518           mpz_out_str (stdout, 16, factor[i].mask);
 519         }
 520       else
 521         {
 522           mpz_tdiv_r_2exp (mlo, factor[i].mask, (unsigned long) limb_bits);
 523           mpz_tdiv_q_2exp (mhi, factor[i].mask, (unsigned long) limb_bits);
 524           mpz_out_str (stdout, 16, mhi);
 525           printf ("), CNST_LIMB(0x");
 526           mpz_out_str (stdout, 16, mlo);
 527         }
 528       printf (")); \\\n");
 529     }
 530
 531   printf ("  } while (0)\n");
 532   printf ("\n");
 533
 534   printf ("/* Grand total sq_res_0x100 and PERFSQR_MOD_TEST, %.2f%% non-squares. */\n",
 535           (1.0 - (total_fraction * 44.0/256.0)) * 100.0);
 536   printf ("\n");
 537
 538   printf ("/* helper for tests/mpz/t-perfsqr.c */\n");
 539   printf ("#define PERFSQR_DIVISORS  { 256,");
 540   for (i = 0; i < nfactor; i++)
 541       printf (" %d,", factor[i].divisor);
 542   printf (" }\n");
 543
 544
 545   mpz_clear (mhi);
 546   mpz_clear (mlo);
 547 }
 548
 549 int
 550 main (int argc, char *argv[])
 551 {
 552   int  limb_bits, nail_bits;
 553
 554   if (argc != 3)
 555     {
 556       fprintf (stderr, "Usage: gen-psqr <limbbits> <nailbits>\n");
 557       exit (1);
 558     }
 559
 560   limb_bits = atoi (argv[1]);
 561   nail_bits = atoi (argv[2]);
 562
 563   if (limb_bits <= 0
 564       || nail_bits < 0
 565       || nail_bits >= limb_bits)
 566     {
 567       fprintf (stderr, "Invalid limb/nail bits: %d %d\n",
 568                limb_bits, nail_bits);
 569       exit (1);
 570     }
 571
 572   generate_sq_res_0x100 (limb_bits);
 573   generate_mod (limb_bits, nail_bits);
 574
 575   print (limb_bits, nail_bits);
 576
 577   return 0;
 578 }