mpn/generic/toom33_mul.c

   1 /* mpn_toom33_mul -- Multiply {ap,an} and {p,bn} where an and bn are close in
   2    size.  Or more accurately, bn <= an < (3/2)bn.
   3
   4    Contributed to the GNU project by Torbjorn Granlund.
   5    Additional improvements by Marco Bodrato.
   6
   7    THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   8    SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   9    GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
  10
  11 Copyright 2006, 2007, 2008 Free Software Foundation, Inc.
  12
  13 This file is part of the GNU MP Library.
  14
  15 The GNU MP Library is free software; you can redistribute it and/or modify
  16 it under the terms of the GNU Lesser General Public License as published by
  17 the Free Software Foundation; either version 3 of the License, or (at your
  18 option) any later version.
  19
  20 The GNU MP Library is distributed in the hope that it will be useful, but
  21 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  22 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  23 License for more details.
  24
  25 You should have received a copy of the GNU Lesser General Public License
  26 along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
  27
  28
  29 #include "gmp.h"
  30 #include "gmp-impl.h"
  31
  32 /* Evaluate in: -1, 0, +1, +2, +inf
  33
  34   <-s--><--n--><--n--><--n-->
  35    ____ ______ ______ ______
  36   |_a3_|___a2_|___a1_|___a0_|
  37    |b3_|___b2_|___b1_|___b0_|
  38    <-t-><--n--><--n--><--n-->
  39
  40   v0  =  a0         * b0          #   A(0)*B(0)
  41   v1  = (a0+ a1+ a2)*(b0+ b1+ b2) #   A(1)*B(1)      ah  <= 2  bh <= 2
  42   vm1 = (a0- a1+ a2)*(b0- b1+ b2) #  A(-1)*B(-1)    |ah| <= 1  bh <= 1
  43   v2  = (a0+2a1+4a2)*(b0+2b1+4b2) #   A(2)*B(2)      ah  <= 6  bh <= 6
  44   vinf=          a2 *         b2  # A(inf)*B(inf)
  45 */
  46
  47 #if TUNE_PROGRAM_BUILD
  48 #define MAYBE_mul_basecase 1
  49 #define MAYBE_mul_toom33   1
  50 #else
  51 #define MAYBE_mul_basecase                                              \
  52   (MUL_TOOM33_THRESHOLD < 3 * MUL_TOOM22_THRESHOLD)
  53 #define MAYBE_mul_toom33                                                \
  54   (MUL_TOOM44_THRESHOLD >= 3 * MUL_TOOM33_THRESHOLD)
  55 #endif
  56
  57 /* FIXME: TOOM33_MUL_N_REC is not quite right for a balanced
  58    multiplication at the infinity point. We may have
  59    MAYBE_mul_basecase == 0, and still get s just below
  60    MUL_TOOM22_THRESHOLD. If MUL_TOOM33_THRESHOLD == 7, we can even get
  61    s == 1 and mpn_toom22_mul will crash.
  62 */
  63
  64 #define TOOM33_MUL_N_REC(p, a, b, n, ws)                                \
  65   do {                                                                  \
  66     if (MAYBE_mul_basecase                                              \
  67         && BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD))                   \
  68       mpn_mul_basecase (p, a, n, b, n);                                 \
  69     else if (! MAYBE_mul_toom33                                         \
  70              || BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD))              \
  71       mpn_toom22_mul (p, a, n, b, n, ws);                               \
  72     else                                                                \
  73       mpn_toom33_mul (p, a, n, b, n, ws);                               \
  74   } while (0)
  75
  76 void
  77 mpn_toom33_mul (mp_ptr pp,
  78                 mp_srcptr ap, mp_size_t an,
  79                 mp_srcptr bp, mp_size_t bn,
  80                 mp_ptr scratch)
  81 {
  82   mp_size_t n, s, t;
  83   int vm1_neg;
  84   mp_limb_t cy, vinf0;
  85   mp_ptr gp;
  86   mp_ptr as1, asm1, as2;
  87   mp_ptr bs1, bsm1, bs2;
  88
  89 #define a0  ap
  90 #define a1  (ap + n)
  91 #define a2  (ap + 2*n)
  92 #define b0  bp
  93 #define b1  (bp + n)
  94 #define b2  (bp + 2*n)
  95
  96   n = (an + 2) / (size_t) 3;
  97
  98   s = an - 2 * n;
  99   t = bn - 2 * n;
 100
 101   ASSERT (an >= bn);
 102
 103   ASSERT (0 < s && s <= n);
 104   ASSERT (0 < t && t <= n);
 105
 106   as1  = scratch + 4 * n + 4;
 107   asm1 = scratch + 2 * n + 2;
 108   as2 = pp + n + 1;
 109
 110   bs1 = pp;
 111   bsm1 = scratch + 3 * n + 3; /* we need 4n+4 <= 4n+s+t */
 112   bs2 = pp + 2 * n + 2;
 113
 114   gp = scratch;
 115
 116   vm1_neg = 0;
 117
 118   /* Compute as1 and asm1.  */
 119   cy = mpn_add (gp, a0, n, a2, s);
 120 #if HAVE_NATIVE_mpn_add_n_sub_n
 121   if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
 122     {
 123       cy = mpn_add_n_sub_n (as1, asm1, a1, gp, n);
 124       as1[n] = 0;
 125       asm1[n] = 0;
 126       vm1_neg = 1;
 127     }
 128   else
 129     {
 130       cy2 = mpn_add_n_sub_n (as1, asm1, gp, a1, n);
 131       as1[n] = cy + (cy2 >> 1);
 132       asm1[n] = cy - (cy & 1);
 133     }
 134 #else
 135   as1[n] = cy + mpn_add_n (as1, gp, a1, n);
 136   if (cy == 0 && mpn_cmp (gp, a1, n) < 0)
 137     {
 138       mpn_sub_n (asm1, a1, gp, n);
 139       asm1[n] = 0;
 140       vm1_neg = 1;
 141     }
 142   else
 143     {
 144       cy -= mpn_sub_n (asm1, gp, a1, n);
 145       asm1[n] = cy;
 146     }
 147 #endif
 148
 149   /* Compute as2.  */
 150 #if HAVE_NATIVE_mpn_rsblsh1_n
 151   cy = mpn_add_n (as2, a2, as1, s);
 152   if (s != n)
 153     cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
 154   cy += as1[n];
 155   cy = 2 * cy + mpn_rsblsh1_n (as2, a0, as2, n);
 156 #else
 157 #if HAVE_NATIVE_mpn_addlsh1_n
 158   cy  = mpn_addlsh1_n (as2, a1, a2, s);
 159   if (s != n)
 160     cy = mpn_add_1 (as2 + s, a1 + s, n - s, cy);
 161   cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
 162 #else
 163   cy = mpn_add_n (as2, a2, as1, s);
 164   if (s != n)
 165     cy = mpn_add_1 (as2 + s, as1 + s, n - s, cy);
 166   cy += as1[n];
 167   cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
 168   cy -= mpn_sub_n (as2, as2, a0, n);
 169 #endif
 170 #endif
 171   as2[n] = cy;
 172
 173   /* Compute bs1 and bsm1.  */
 174   cy = mpn_add (gp, b0, n, b2, t);
 175 #if HAVE_NATIVE_mpn_add_n_sub_n
 176   if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
 177     {
 178       cy = mpn_add_n_sub_n (bs1, bsm1, b1, gp, n);
 179       bs1[n] = 0;
 180       bsm1[n] = 0;
 181       vm1_neg ^= 1;
 182     }
 183   else
 184     {
 185       cy2 = mpn_add_n_sub_n (bs1, bsm1, gp, b1, n);
 186       bs1[n] = cy + (cy2 >> 1);
 187       bsm1[n] = cy - (cy & 1);
 188     }
 189 #else
 190   bs1[n] = cy + mpn_add_n (bs1, gp, b1, n);
 191   if (cy == 0 && mpn_cmp (gp, b1, n) < 0)
 192     {
 193       mpn_sub_n (bsm1, b1, gp, n);
 194       bsm1[n] = 0;
 195       vm1_neg ^= 1;
 196     }
 197   else
 198     {
 199       cy -= mpn_sub_n (bsm1, gp, b1, n);
 200       bsm1[n] = cy;
 201     }
 202 #endif
 203
 204   /* Compute bs2.  */
 205 #if HAVE_NATIVE_mpn_rsblsh1_n
 206   cy = mpn_add_n (bs2, b2, bs1, t);
 207   if (t != n)
 208     cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
 209   cy += bs1[n];
 210   cy = 2 * cy + mpn_rsblsh1_n (bs2, b0, bs2, n);
 211 #else
 212 #if HAVE_NATIVE_mpn_addlsh1_n
 213   cy  = mpn_addlsh1_n (bs2, b1, b2, t);
 214   if (t != n)
 215     cy = mpn_add_1 (bs2 + t, b1 + t, n - t, cy);
 216   cy = 2 * cy + mpn_addlsh1_n (bs2, b0, bs2, n);
 217 #else
 218   cy  = mpn_add_n (bs2, bs1, b2, t);
 219   if (t != n)
 220     cy = mpn_add_1 (bs2 + t, bs1 + t, n - t, cy);
 221   cy += bs1[n];
 222   cy = 2 * cy + mpn_lshift (bs2, bs2, n, 1);
 223   cy -= mpn_sub_n (bs2, bs2, b0, n);
 224 #endif
 225 #endif
 226   bs2[n] = cy;
 227
 228   ASSERT (as1[n] <= 2);
 229   ASSERT (bs1[n] <= 2);
 230   ASSERT (asm1[n] <= 1);
 231   ASSERT (bsm1[n] <= 1);
 232   ASSERT (as2[n] <= 6);
 233   ASSERT (bs2[n] <= 6);
 234
 235 #define v0    pp                                /* 2n */
 236 #define v1    (pp + 2 * n)                      /* 2n+1 */
 237 #define vinf  (pp + 4 * n)                      /* s+t */
 238 #define vm1   scratch                           /* 2n+1 */
 239 #define v2    (scratch + 2 * n + 1)             /* 2n+2 */
 240 #define scratch_out  (scratch + 5 * n + 5)
 241
 242   /* vm1, 2n+1 limbs */
 243 #ifdef SMALLER_RECURSION
 244   TOOM33_MUL_N_REC (vm1, asm1, bsm1, n, scratch_out);
 245   cy = 0;
 246   if (asm1[n] != 0)
 247     cy = bsm1[n] + mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
 248   if (bsm1[n] != 0)
 249     cy += mpn_add_n (vm1 + n, vm1 + n, asm1, n);
 250   vm1[2 * n] = cy;
 251 #else
 252   TOOM33_MUL_N_REC (vm1, asm1, bsm1, n + 1, scratch_out);
 253 #endif
 254
 255   TOOM33_MUL_N_REC (v2, as2, bs2, n + 1, scratch_out);  /* v2, 2n+1 limbs */
 256
 257   /* vinf, s+t limbs */
 258   if (s > t)  mpn_mul (vinf, a2, s, b2, t);
 259   else        TOOM33_MUL_N_REC (vinf, a2, b2, s, scratch_out);
 260
 261   vinf0 = vinf[0];                              /* v1 overlaps with this */
 262
 263 #ifdef SMALLER_RECURSION
 264   /* v1, 2n+1 limbs */
 265   TOOM33_MUL_N_REC (v1, as1, bs1, n, scratch_out);
 266   if (as1[n] == 1)
 267     {
 268       cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
 269     }
 270   else if (as1[n] != 0)
 271     {
 272 #if HAVE_NATIVE_mpn_addlsh1_n
 273       cy = 2 * bs1[n] + mpn_addlsh1_n (v1 + n, v1 + n, bs1, n);
 274 #else
 275       cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
 276 #endif
 277     }
 278   else
 279     cy = 0;
 280   if (bs1[n] == 1)
 281     {
 282       cy += mpn_add_n (v1 + n, v1 + n, as1, n);
 283     }
 284   else if (bs1[n] != 0)
 285     {
 286 #if HAVE_NATIVE_mpn_addlsh1_n
 287       cy += mpn_addlsh1_n (v1 + n, v1 + n, as1, n);
 288 #else
 289       cy += mpn_addmul_1 (v1 + n, as1, n, CNST_LIMB(2));
 290 #endif
 291     }
 292   v1[2 * n] = cy;
 293 #else
 294   cy = vinf[1];
 295   TOOM33_MUL_N_REC (v1, as1, bs1, n + 1, scratch_out);
 296   vinf[1] = cy;
 297 #endif
 298
 299   TOOM33_MUL_N_REC (v0, ap, bp, n, scratch_out);        /* v0, 2n limbs */
 300
 301   mpn_toom_interpolate_5pts (pp, v2, vm1, n, s + t, vm1_neg, vinf0);
 302 }