mpn/generic/mulmod_bnm1.c

   1 /* mulmod_bnm1.c -- multiplication mod B^n-1.
   2
   3    Contributed to the GNU project by Niels Möller, Torbjorn Granlund and
   4    Marco Bodrato.
   5
   6    THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   7    SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   8    GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
   9
  10 Copyright 2009, 2010 Free Software Foundation, Inc.
  11
  12 This file is part of the GNU MP Library.
  13
  14 The GNU MP Library is free software; you can redistribute it and/or modify
  15 it under the terms of the GNU Lesser General Public License as published by
  16 the Free Software Foundation; either version 3 of the License, or (at your
  17 option) any later version.
  18
  19 The GNU MP Library is distributed in the hope that it will be useful, but
  20 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  21 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  22 License for more details.
  23
  24 You should have received a copy of the GNU Lesser General Public License
  25 along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
  26
  27
  28 #include "gmp.h"
  29 #include "gmp-impl.h"
  30 #include "longlong.h"
  31
  32 /* Inputs are {ap,rn} and {bp,rn}; output is {rp,rn}, computation is
  33    mod B^rn - 1, and values are semi-normalised; zero is represented
  34    as either 0 or B^n - 1.  Needs a scratch of 2rn limbs at tp.
  35    tp==rp is allowed. */
  36 void
  37 mpn_bc_mulmod_bnm1 (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t rn,
  38                     mp_ptr tp)
  39 {
  40   mp_limb_t cy;
  41
  42   ASSERT (0 < rn);
  43
  44   mpn_mul_n (tp, ap, bp, rn);
  45   cy = mpn_add_n (rp, tp, tp + rn, rn);
  46   /* If cy == 1, then the value of rp is at most B^rn - 2, so there can
  47    * be no overflow when adding in the carry. */
  48   MPN_INCR_U (rp, rn, cy);
  49 }
  50
  51
  52 /* Inputs are {ap,rn+1} and {bp,rn+1}; output is {rp,rn+1}, in
  53    semi-normalised representation, computation is mod B^rn + 1. Needs
  54    a scratch area of 2rn + 2 limbs at tp; tp == rp is allowed.
  55    Output is normalised. */
  56 static void
  57 mpn_bc_mulmod_bnp1 (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t rn,
  58                     mp_ptr tp)
  59 {
  60   mp_limb_t cy;
  61
  62   ASSERT (0 < rn);
  63
  64   mpn_mul_n (tp, ap, bp, rn + 1);
  65   ASSERT (tp[2*rn+1] == 0);
  66   ASSERT (tp[2*rn] < GMP_NUMB_MAX);
  67   cy = tp[2*rn] + mpn_sub_n (rp, tp, tp+rn, rn);
  68   rp[rn] = 0;
  69   MPN_INCR_U (rp, rn+1, cy );
  70 }
  71
  72
  73 /* Computes {rp,MIN(rn,an+bn)} <- {ap,an}*{bp,bn} Mod(B^rn-1)
  74  *
  75  * The result is expected to be ZERO if and only if one of the operand
  76  * already is. Otherwise the class [0] Mod(B^rn-1) is represented by
  77  * B^rn-1. This should not be a problem if mulmod_bnm1 is used to
  78  * combine results and obtain a natural number when one knows in
  79  * advance that the final value is less than (B^rn-1).
  80  * Moreover it should not be a problem if mulmod_bnm1 is used to
  81  * compute the full product with an+bn <= rn, because this condition
  82  * implies (B^an-1)(B^bn-1) < (B^rn-1) .
  83  *
  84  * Requires 0 < bn <= an <= rn and an + bn > rn/2
  85  * Scratch need: rn + (need for recursive call OR rn + 4). This gives
  86  *
  87  * S(n) <= rn + MAX (rn + 4, S(n/2)) <= 2rn + 4
  88  */
  89 void
  90 mpn_mulmod_bnm1 (mp_ptr rp, mp_size_t rn, mp_srcptr ap, mp_size_t an, mp_srcptr bp, mp_size_t bn, mp_ptr tp)
  91 {
  92   ASSERT (0 < bn);
  93   ASSERT (bn <= an);
  94   ASSERT (an <= rn);
  95
  96   if ((rn & 1) != 0 || BELOW_THRESHOLD (rn, MULMOD_BNM1_THRESHOLD))
  97     {
  98       if (UNLIKELY (bn < rn))
  99         {
 100           if (UNLIKELY (an + bn <= rn))
 101             {
 102               mpn_mul (rp, ap, an, bp, bn);
 103             }
 104           else
 105             {
 106               mp_limb_t cy;
 107               mpn_mul (tp, ap, an, bp, bn);
 108               cy = mpn_add (rp, tp, rn, tp + rn, an + bn - rn);
 109               MPN_INCR_U (rp, rn, cy);
 110             }
 111         }
 112       else
 113         mpn_bc_mulmod_bnm1 (rp, ap, bp, rn, tp);
 114     }
 115   else
 116     {
 117       mp_size_t n;
 118       mp_limb_t cy;
 119       mp_limb_t hi;
 120
 121       n = rn >> 1;
 122
 123       /* We need at least an + bn >= n, to be able to fit one of the
 124          recursive products at rp. Requiring strict inequality makes
 125          the coded slightly simpler. If desired, we could avoid this
 126          restriction by initially halving rn as long as rn is even and
 127          an + bn <= rn/2. */
 128
 129       ASSERT (an + bn > n);
 130
 131       /* Compute xm = a*b mod (B^n - 1), xp = a*b mod (B^n + 1)
 132          and crt together as
 133
 134          x = -xp * B^n + (B^n + 1) * [ (xp + xm)/2 mod (B^n-1)]
 135       */
 136
 137 #define a0 ap
 138 #define a1 (ap + n)
 139 #define b0 bp
 140 #define b1 (bp + n)
 141
 142 #define xp  tp  /* 2n + 2 */
 143       /* am1  maybe in {xp, n} */
 144       /* bm1  maybe in {xp + n, n} */
 145 #define sp1 (tp + 2*n + 2)
 146       /* ap1  maybe in {sp1, n + 1} */
 147       /* bp1  maybe in {sp1 + n + 1, n + 1} */
 148
 149       {
 150         mp_srcptr am1, bm1;
 151         mp_size_t anm, bnm;
 152         mp_ptr so;
 153
 154         if (LIKELY (an > n))
 155           {
 156             am1 = xp;
 157             cy = mpn_add (xp, a0, n, a1, an - n);
 158             MPN_INCR_U (xp, n, cy);
 159             anm = n;
 160             if (LIKELY (bn > n))
 161               {
 162                 bm1 = xp + n;
 163                 cy = mpn_add (xp + n, b0, n, b1, bn - n);
 164                 MPN_INCR_U (xp + n, n, cy);
 165                 bnm = n;
 166                 so = xp + 2*n;
 167               }
 168             else
 169               {
 170                 so = xp + n;
 171                 bm1 = b0;
 172                 bnm = bn;
 173               }
 174           }
 175         else
 176           {
 177             so = xp;
 178             am1 = a0;
 179             anm = an;
 180             bm1 = b0;
 181             bnm = bn;
 182           }
 183
 184         mpn_mulmod_bnm1 (rp, n, am1, anm, bm1, bnm, so);
 185       }
 186
 187       {
 188         int       k;
 189         mp_srcptr ap1, bp1;
 190         mp_size_t anp, bnp;
 191
 192         if (LIKELY (an > n)) {
 193           ap1 = sp1;
 194           cy = mpn_sub (sp1, a0, n, a1, an - n);
 195           sp1[n] = 0;
 196           MPN_INCR_U (sp1, n + 1, cy);
 197           anp = n + ap1[n];
 198         } else {
 199           ap1 = a0;
 200           anp = an;
 201         }
 202
 203         if (LIKELY (bn > n)) {
 204           bp1 = sp1 + n + 1;
 205           cy = mpn_sub (sp1 + n + 1, b0, n, b1, bn - n);
 206           sp1[2*n+1] = 0;
 207           MPN_INCR_U (sp1 + n + 1, n + 1, cy);
 208           bnp = n + bp1[n];
 209         } else {
 210           bp1 = b0;
 211           bnp = bn;
 212         }
 213
 214         if (BELOW_THRESHOLD (n, MUL_FFT_MODF_THRESHOLD))
 215           k=0;
 216         else
 217           {
 218             int mask;
 219             k = mpn_fft_best_k (n, 0);
 220             mask = (1<<k) -1;
 221             while (n & mask) {k--; mask >>=1;};
 222           }
 223         if (k >= FFT_FIRST_K)
 224           xp[n] = mpn_mul_fft (xp, n, ap1, anp, bp1, bnp, k);
 225         else if (UNLIKELY (bp1 == b0))
 226           {
 227             ASSERT (anp + bnp <= 2*n+1);
 228             ASSERT (anp + bnp > n);
 229             ASSERT (anp >= bnp);
 230             mpn_mul (xp, ap1, anp, bp1, bnp);
 231             anp = anp + bnp - n;
 232             ASSERT (anp <= n || xp[2*n]==0);
 233             anp-= anp > n;
 234             cy = mpn_sub (xp, xp, n, xp + n, anp);
 235             xp[n] = 0;
 236             MPN_INCR_U (xp, n+1, cy);
 237           }
 238         else
 239           mpn_bc_mulmod_bnp1 (xp, ap1, bp1, n, xp);
 240       }
 241
 242       /* Here the CRT recomposition begins.
 243
 244          xm <- (xp + xm)/2 = (xp + xm)B^n/2 mod (B^n-1)
 245          Division by 2 is a bitwise rotation.
 246
 247          Assumes xp normalised mod (B^n+1).
 248
 249          The residue class [0] is represented by [B^n-1]; except when
 250          both input are ZERO.
 251       */
 252
 253 #if HAVE_NATIVE_mpn_rsh1add_n || HAVE_NATIVE_mpn_rsh1add_nc
 254 #if HAVE_NATIVE_mpn_rsh1add_nc
 255       cy = mpn_rsh1add_nc(rp, rp, xp, n, xp[n]); /* B^n = 1 */
 256       hi = cy << (GMP_NUMB_BITS - 1);
 257       cy = 0;
 258       /* next update of rp[n-1] will set cy = 1 only if rp[n-1]+=hi
 259          overflows, i.e. a further increment will not overflow again. */
 260 #else /* ! _nc */
 261       cy = xp[n] + mpn_rsh1add_n(rp, rp, xp, n); /* B^n = 1 */
 262       hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */
 263       cy >>= 1;
 264       /* cy = 1 only if xp[n] = 1 i.e. {xp,n} = ZERO, this implies that
 265          the rsh1add was a simple rshift: the top bit is 0. cy=1 => hi=0. */
 266 #endif
 267 #if GMP_NAIL_BITS == 0
 268       add_ssaaaa(cy, rp[n-1], cy, rp[n-1], 0, hi);
 269 #else
 270       cy += (hi & rp[n-1]) >> (GMP_NUMB_BITS-1);
 271       rp[n-1] ^= hi;
 272 #endif
 273 #else /* ! HAVE_NATIVE_mpn_rsh1add_n */
 274 #if HAVE_NATIVE_mpn_add_nc
 275       cy = mpn_add_nc(rp, rp, xp, n, xp[n]);
 276 #else /* ! _nc */
 277       cy = xp[n] + mpn_add_n(rp, rp, xp, n); /* xp[n] == 1 implies {xp,n} == ZERO */
 278 #endif
 279       cy += (rp[0]&1);
 280       mpn_rshift(rp, rp, n, 1);
 281       ASSERT (cy <= 2);
 282       hi = (cy<<(GMP_NUMB_BITS-1))&GMP_NUMB_MASK; /* (cy&1) << ... */
 283       cy >>= 1;
 284       /* We can have cy != 0 only if hi = 0... */
 285       ASSERT ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0);
 286       rp[n-1] |= hi;
 287       /* ... rp[n-1] + cy can not overflow, the following INCR is correct. */
 288 #endif
 289       ASSERT (cy <= 1);
 290       /* Next increment can not overflow, read the previous comments about cy. */
 291       ASSERT ((cy == 0) || ((rp[n-1] & GMP_NUMB_HIGHBIT) == 0));
 292       MPN_INCR_U(rp, n, cy);
 293
 294       /* Compute the highest half:
 295          ([(xp + xm)/2 mod (B^n-1)] - xp ) * B^n
 296        */
 297       if (UNLIKELY (an + bn < rn))
 298         {
 299           /* Note that in this case, the only way the result can equal
 300              zero mod B^{rn} - 1 is if one of the inputs is zero, and
 301              then the output of both the recursive calls and this CRT
 302              reconstruction is zero, not B^{rn} - 1. Which is good,
 303              since the latter representation doesn't fit in the output
 304              area.*/
 305           cy = mpn_sub_n (rp + n, rp, xp, an + bn - n);
 306
 307           /* FIXME: This subtraction of the high parts is not really
 308              necessary, we do it to get the carry out, and for sanity
 309              checking. */
 310           cy = xp[n] + mpn_sub_nc (xp + an + bn - n, rp + an + bn - n,
 311                                    xp + an + bn - n, rn - (an + bn), cy);
 312           ASSERT (an + bn == rn - 1 ||
 313                   mpn_zero_p (xp + an + bn - n + 1, rn - 1 - (an + bn)));
 314           cy = mpn_sub_1 (rp, rp, an + bn, cy);
 315           ASSERT (cy == (xp + an + bn - n)[0]);
 316         }
 317       else
 318         {
 319           cy = xp[n] + mpn_sub_n (rp + n, rp, xp, n);
 320           /* cy = 1 only if {xp,n+1} is not ZERO, i.e. {rp,n} is not ZERO.
 321              DECR will affect _at most_ the lowest n limbs. */
 322           MPN_DECR_U (rp, 2*n, cy);
 323         }
 324 #undef a0
 325 #undef a1
 326 #undef b0
 327 #undef b1
 328 #undef xp
 329 #undef sp1
 330     }
 331 }
 332
 333 mp_size_t
 334 mpn_mulmod_bnm1_next_size (mp_size_t n)
 335 {
 336   mp_size_t nh;
 337
 338   if (BELOW_THRESHOLD (n,     MULMOD_BNM1_THRESHOLD))
 339     return n;
 340   if (BELOW_THRESHOLD (n, 4 * (MULMOD_BNM1_THRESHOLD - 1) + 1))
 341     return (n + (2-1)) & (-2);
 342   if (BELOW_THRESHOLD (n, 8 * (MULMOD_BNM1_THRESHOLD - 1) + 1))
 343     return (n + (4-1)) & (-4);
 344
 345   nh = (n + 1) >> 1;
 346
 347   if (BELOW_THRESHOLD (nh, MUL_FFT_MODF_THRESHOLD))
 348     return (n + (8-1)) & (-8);
 349
 350   return 2 * mpn_fft_next_size (nh, mpn_fft_best_k (nh, 0));
 351 }