mpn/generic/perfsqr.c

   1 /* mpn_perfect_square_p(u,usize) -- Return non-zero if U is a perfect square,
   2    zero otherwise.
   3
   4 Copyright 1991, 1993, 1994, 1996, 1997, 2000, 2001, 2002, 2005 Free Software
   5 Foundation, Inc.
   6
   7 This file is part of the GNU MP Library.
   8
   9 The GNU MP Library is free software; you can redistribute it and/or modify
  10 it under the terms of the GNU Lesser General Public License as published by
  11 the Free Software Foundation; either version 3 of the License, or (at your
  12 option) any later version.
  13
  14 The GNU MP Library is distributed in the hope that it will be useful, but
  15 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  16 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  17 License for more details.
  18
  19 You should have received a copy of the GNU Lesser General Public License
  20 along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
  21
  22 #include <stdio.h> /* for NULL */
  23 #include "gmp.h"
  24 #include "gmp-impl.h"
  25 #include "longlong.h"
  26
  27 #include "perfsqr.h"
  28
  29
  30 /* change this to "#define TRACE(x) x" for diagnostics */
  31 #define TRACE(x)
  32
  33
  34
  35 /* PERFSQR_MOD_* detects non-squares using residue tests.
  36
  37    A macro PERFSQR_MOD_TEST is setup by gen-psqr.c in perfsqr.h.  It takes
  38    {up,usize} modulo a selected modulus to get a remainder r.  For 32-bit or
  39    64-bit limbs this modulus will be 2^24-1 or 2^48-1 using PERFSQR_MOD_34,
  40    or for other limb or nail sizes a PERFSQR_PP is chosen and PERFSQR_MOD_PP
  41    used.  PERFSQR_PP_NORM and PERFSQR_PP_INVERTED are pre-calculated in this
  42    case too.
  43
  44    PERFSQR_MOD_TEST then makes various calls to PERFSQR_MOD_1 or
  45    PERFSQR_MOD_2 with divisors d which are factors of the modulus, and table
  46    data indicating residues and non-residues modulo those divisors.  The
  47    table data is in 1 or 2 limbs worth of bits respectively, per the size of
  48    each d.
  49
  50    A "modexact" style remainder is taken to reduce r modulo d.
  51    PERFSQR_MOD_IDX implements this, producing an index "idx" for use with
  52    the table data.  Notice there's just one multiplication by a constant
  53    "inv", for each d.
  54
  55    The modexact doesn't produce a true r%d remainder, instead idx satisfies
  56    "-(idx<<PERFSQR_MOD_BITS) == r mod d".  Because d is odd, this factor
  57    -2^PERFSQR_MOD_BITS is a one-to-one mapping between r and idx, and is
  58    accounted for by having the table data suitably permuted.
  59
  60    The remainder r fits within PERFSQR_MOD_BITS which is less than a limb.
  61    In fact the GMP_LIMB_BITS - PERFSQR_MOD_BITS spare bits are enough to fit
  62    each divisor d meaning the modexact multiply can take place entirely
  63    within one limb, giving the compiler the chance to optimize it, in a way
  64    that say umul_ppmm would not give.
  65
  66    There's no need for the divisors d to be prime, in fact gen-psqr.c makes
  67    a deliberate effort to combine factors so as to reduce the number of
  68    separate tests done on r.  But such combining is limited to d <=
  69    2*GMP_LIMB_BITS so that the table data fits in at most 2 limbs.
  70
  71    Alternatives:
  72
  73    It'd be possible to use bigger divisors d, and more than 2 limbs of table
  74    data, but this doesn't look like it would be of much help to the prime
  75    factors in the usual moduli 2^24-1 or 2^48-1.
  76
  77    The moduli 2^24-1 or 2^48-1 are nothing particularly special, they're
  78    just easy to calculate (see mpn_mod_34lsub1) and have a nice set of prime
  79    factors.  2^32-1 and 2^64-1 would be equally easy to calculate, but have
  80    fewer prime factors.
  81
  82    The nails case usually ends up using mpn_mod_1, which is a lot slower
  83    than mpn_mod_34lsub1.  Perhaps other such special moduli could be found
  84    for the nails case.  Two-term things like 2^30-2^15-1 might be
  85    candidates.  Or at worst some on-the-fly de-nailing would allow the plain
  86    2^24-1 to be used.  Currently nails are too preliminary to be worried
  87    about.
  88
  89 */
  90
  91 #define PERFSQR_MOD_MASK       ((CNST_LIMB(1) << PERFSQR_MOD_BITS) - 1)
  92
  93 #define MOD34_BITS  (GMP_NUMB_BITS / 4 * 3)
  94 #define MOD34_MASK  ((CNST_LIMB(1) << MOD34_BITS) - 1)
  95
  96 #define PERFSQR_MOD_34(r, up, usize)                            \
  97   do {                                                          \
  98     (r) = mpn_mod_34lsub1 (up, usize);                          \
  99     (r) = ((r) & MOD34_MASK) + ((r) >> MOD34_BITS);             \
 100   } while (0)
 101
 102 /* FIXME: The %= here isn't good, and might destroy any savings from keeping
 103    the PERFSQR_MOD_IDX stuff within a limb (rather than needing umul_ppmm).
 104    Maybe a new sort of mpn_preinv_mod_1 could accept an unnormalized divisor
 105    and a shift count, like mpn_preinv_divrem_1.  But mod_34lsub1 is our
 106    normal case, so lets not worry too much about mod_1.  */
 107 #define PERFSQR_MOD_PP(r, up, usize)                                    \
 108   do {                                                                  \
 109     if (BELOW_THRESHOLD (usize, PREINV_MOD_1_TO_MOD_1_THRESHOLD))       \
 110       {                                                                 \
 111         (r) = mpn_preinv_mod_1 (up, usize, PERFSQR_PP_NORM,             \
 112                                 PERFSQR_PP_INVERTED);                   \
 113         (r) %= PERFSQR_PP;                                              \
 114       }                                                                 \
 115     else                                                                \
 116       {                                                                 \
 117         (r) = mpn_mod_1 (up, usize, PERFSQR_PP);                        \
 118       }                                                                 \
 119   } while (0)
 120
 121 #define PERFSQR_MOD_IDX(idx, r, d, inv)                         \
 122   do {                                                          \
 123     mp_limb_t  q;                                               \
 124     ASSERT ((r) <= PERFSQR_MOD_MASK);                           \
 125     ASSERT ((((inv) * (d)) & PERFSQR_MOD_MASK) == 1);           \
 126     ASSERT (MP_LIMB_T_MAX / (d) >= PERFSQR_MOD_MASK);           \
 127                                                                 \
 128     q = ((r) * (inv)) & PERFSQR_MOD_MASK;                       \
 129     ASSERT (r == ((q * (d)) & PERFSQR_MOD_MASK));               \
 130     (idx) = (q * (d)) >> PERFSQR_MOD_BITS;                      \
 131   } while (0)
 132
 133 #define PERFSQR_MOD_1(r, d, inv, mask)                          \
 134   do {                                                          \
 135     unsigned   idx;                                             \
 136     ASSERT ((d) <= GMP_LIMB_BITS);                              \
 137     PERFSQR_MOD_IDX(idx, r, d, inv);                            \
 138     TRACE (printf ("  PERFSQR_MOD_1 d=%u r=%lu idx=%u\n",       \
 139                    d, r%d, idx));                               \
 140     if ((((mask) >> idx) & 1) == 0)                             \
 141       {                                                         \
 142         TRACE (printf ("  non-square\n"));                      \
 143         return 0;                                               \
 144       }                                                         \
 145   } while (0)
 146
 147 /* The expression "(int) idx - GMP_LIMB_BITS < 0" lets the compiler use the
 148    sign bit from "idx-GMP_LIMB_BITS", which might help avoid a branch.  */
 149 #define PERFSQR_MOD_2(r, d, inv, mhi, mlo)                      \
 150   do {                                                          \
 151     mp_limb_t  m;                                               \
 152     unsigned   idx;                                             \
 153     ASSERT ((d) <= 2*GMP_LIMB_BITS);                            \
 154                                                                 \
 155     PERFSQR_MOD_IDX (idx, r, d, inv);                           \
 156     TRACE (printf ("  PERFSQR_MOD_2 d=%u r=%lu idx=%u\n",       \
 157                    d, r%d, idx));                               \
 158     m = ((int) idx - GMP_LIMB_BITS < 0 ? (mlo) : (mhi));        \
 159     idx %= GMP_LIMB_BITS;                                       \
 160     if (((m >> idx) & 1) == 0)                                  \
 161       {                                                         \
 162         TRACE (printf ("  non-square\n"));                      \
 163         return 0;                                               \
 164       }                                                         \
 165   } while (0)
 166
 167
 168 int
 169 mpn_perfect_square_p (mp_srcptr up, mp_size_t usize)
 170 {
 171   ASSERT (usize >= 1);
 172
 173   TRACE (gmp_printf ("mpn_perfect_square_p %Nd\n", up, usize));
 174
 175   /* The first test excludes 212/256 (82.8%) of the perfect square candidates
 176      in O(1) time.  */
 177   {
 178     unsigned  idx = up[0] % 0x100;
 179     if (((sq_res_0x100[idx / GMP_LIMB_BITS]
 180           >> (idx % GMP_LIMB_BITS)) & 1) == 0)
 181       return 0;
 182   }
 183
 184 #if 0
 185   /* Check that we have even multiplicity of 2, and then check that the rest is
 186      a possible perfect square.  Leave disabled until we can determine this
 187      really is an improvement.  It it is, it could completely replace the
 188      simple probe above, since this should through out more non-squares, but at
 189      the expense of somewhat more cycles.  */
 190   {
 191     mp_limb_t lo;
 192     int cnt;
 193     lo = up[0];
 194     while (lo == 0)
 195       up++, lo = up[0], usize--;
 196     count_trailing_zeros (cnt, lo);
 197     if ((cnt & 1) != 0)
 198       return 0;                 /* return of not even multiplicity of 2 */
 199     lo >>= cnt;                 /* shift down to align lowest non-zero bit */
 200     lo >>= 1;                   /* shift away lowest non-zero bit */
 201     if ((lo & 3) != 0)
 202       return 0;
 203   }
 204 #endif
 205
 206
 207   /* The second test uses mpn_mod_34lsub1 or mpn_mod_1 to detect non-squares
 208      according to their residues modulo small primes (or powers of
 209      primes).  See perfsqr.h.  */
 210   PERFSQR_MOD_TEST (up, usize);
 211
 212
 213   /* For the third and last test, we finally compute the square root,
 214      to make sure we've really got a perfect square.  */
 215   {
 216     mp_ptr root_ptr;
 217     int res;
 218     TMP_DECL;
 219
 220     TMP_MARK;
 221     root_ptr = TMP_ALLOC_LIMBS ((usize + 1) / 2);
 222
 223     /* Iff mpn_sqrtrem returns zero, the square is perfect.  */
 224     res = ! mpn_sqrtrem (root_ptr, NULL, up, usize);
 225     TMP_FREE;
 226
 227     return res;
 228   }
 229 }