mpn/generic/mullo_n.c

   1 /* mpn_mullo_n -- multiply two n-limb numbers and return the low n limbs
   2    of their products.
   3
   4    Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
   5
   6    THIS IS (FOR NOW) AN INTERNAL FUNCTION.  IT IS ONLY SAFE TO REACH THIS
   7    FUNCTION THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST GUARANTEED
   8    THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
   9
  10 Copyright 2004, 2005, 2009, 2010 Free Software Foundation, Inc.
  11
  12 This file is part of the GNU MP Library.
  13
  14 The GNU MP Library is free software; you can redistribute it and/or modify
  15 it under the terms of the GNU Lesser General Public License as published by
  16 the Free Software Foundation; either version 3 of the License, or (at your
  17 option) any later version.
  18
  19 The GNU MP Library is distributed in the hope that it will be useful, but
  20 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  21 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  22 License for more details.
  23
  24 You should have received a copy of the GNU Lesser General Public License
  25 along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
  26
  27 #include "gmp.h"
  28 #include "gmp-impl.h"
  29
  30
  31 #ifndef MULLO_BASECASE_THRESHOLD
  32 #define MULLO_BASECASE_THRESHOLD 0      /* never use mpn_mul_basecase */
  33 #endif
  34
  35 #ifndef MULLO_DC_THRESHOLD
  36 #define MULLO_DC_THRESHOLD 3*MUL_TOOM22_THRESHOLD
  37 #endif
  38
  39 #ifndef MULLO_MUL_N_THRESHOLD
  40 #define MULLO_MUL_N_THRESHOLD MUL_FFT_THRESHOLD
  41 #endif
  42
  43 #if TUNE_PROGRAM_BUILD
  44 #define MAYBE_range_basecase 1
  45 #define MAYBE_range_toom22   1
  46 #else
  47 #define MAYBE_range_basecase                                           \
  48   ((MULLO_DC_THRESHOLD == 0 ? MULLO_BASECASE_THRESHOLD : MULLO_DC_THRESHOLD) < MUL_TOOM22_THRESHOLD*36/(36-11))
  49 #define MAYBE_range_toom22                                             \
  50   ((MULLO_DC_THRESHOLD == 0 ? MULLO_BASECASE_THRESHOLD : MULLO_DC_THRESHOLD) < MUL_TOOM33_THRESHOLD*36/(36-11) )
  51 #endif
  52
  53 /*  THINK: The DC strategy uses different constants in different Toom's
  54          ranges. Something smoother?
  55 */
  56
  57 /*
  58   Compute the least significant half of the product {xy,n}*{yp,n}, or
  59   formally {rp,n} = {xy,n}*{yp,n} Mod (B^n).
  60
  61   Above the given threshold, the Divide and Conquer strategy is used.
  62   The operands are split in two, and a full product plus two mullo
  63   are used to obtain the final result. The more natural strategy is to
  64   split in two halves, but this is far from optimal when a
  65   sub-quadratic multiplication is used.
  66
  67   Mulders suggests an unbalanced split in favour of the full product,
  68   split n = n1 + n2, where an = n1 <= n2 = (1-a)n; i.e. 0 < a <= 1/2.
  69
  70   To compute the value of a, we assume that the cost of mullo for a
  71   given size ML(n) is a fraction of the cost of a full product with
  72   same size M(n), and the cost M(n)=n^e for some exponent 1 < e <= 2;
  73   then we can write:
  74
  75   ML(n) = 2*ML(an) + M((1-a)n) => k*M(n) = 2*k*M(n)*a^e + M(n)*(1-a)^e
  76
  77   Given a value for e, want to minimise the value of k, i.e. the
  78   function k=(1-a)^e/(1-2*a^e).
  79
  80   With e=2, the exponent for schoolbook multiplication, the minimum is
  81   given by the values a=1-a=1/2.
  82
  83   With e=log(3)/log(2), the exponent for Karatsuba (aka toom22),
  84   Mulders compute (1-a) = 0.694... and we approximate a with 11/36.
  85
  86   Other possible approximations follow:
  87   e=log(5)/log(3) [Toom-3] -> a ~= 9/40
  88   e=log(7)/log(4) [Toom-4] -> a ~= 7/39
  89   e=log(11)/log(6) [Toom-6] -> a ~= 1/8
  90   e=log(15)/log(8) [Toom-8] -> a ~= 1/10
  91
  92   The values above where obtained with the following trivial commands
  93   in the gp-pari shell:
  94
  95 fun(e,a)=(1-a)^e/(1-2*a^e)
  96 mul(a,b,c)={local(m,x,p);if(b-c<1/10000,(b+c)/2,m=1;x=b;forstep(p=c,b,(b-c)/8,if(fun(a,p)<m,m=fun(a,p);x=p));mul(a,(b+x)/2,(c+x)/2))}
  97 contfracpnqn(contfrac(mul(log(2*2-1)/log(2),1/2,0),5))
  98 contfracpnqn(contfrac(mul(log(3*2-1)/log(3),1/2,0),5))
  99 contfracpnqn(contfrac(mul(log(4*2-1)/log(4),1/2,0),5))
 100 contfracpnqn(contfrac(mul(log(6*2-1)/log(6),1/2,0),3))
 101 contfracpnqn(contfrac(mul(log(8*2-1)/log(8),1/2,0),3))
 102
 103   ,
 104   |\
 105   | \
 106   +----,
 107   |    |
 108   |    |
 109   |    |\
 110   |    | \
 111   +----+--`
 112   ^ n2 ^n1^
 113
 114   For an actual implementation, the assumption that M(n)=n^e is
 115   incorrect, as a consequence also the assumption that ML(n)=k*M(n)
 116   with a constant k is wrong.
 117
 118   But theory suggest us two things:
 119   - the best the multiplication product is (lower e), the more k
 120     approaches 1, and a approaches 0.
 121
 122   - A value for a smaller than optimal is probably less bad than a
 123     bigger one: e.g. let e=log(3)/log(2), a=0.3058_ the optimal
 124     value, and k(a)=0.808_ the mul/mullo speed ratio. We get
 125     k(a+1/6)=0.929_ but k(a-1/6)=0.865_.
 126 */
 127
 128 static mp_size_t
 129 mpn_mullo_n_itch (mp_size_t n)
 130 {
 131   return 2*n;
 132 }
 133
 134 /*
 135     mpn_dc_mullo_n requires a scratch space of 2*n limbs at tp.
 136     It accepts tp == rp.
 137 */
 138 static void
 139 mpn_dc_mullo_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n, mp_ptr tp)
 140 {
 141   mp_size_t n2, n1;
 142   ASSERT (n >= 2);
 143   ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
 144   ASSERT (! MPN_OVERLAP_P (rp, n, yp, n));
 145   ASSERT (MPN_SAME_OR_SEPARATE2_P(rp, n, tp, 2*n));
 146
 147   /* Divide-and-conquer */
 148
 149   /* We need fractional approximation of the value 0 < a <= 1/2
 150      giving the minimum in the function k=(1-a)^e/(1-2*a^e).
 151   */
 152   if (MAYBE_range_basecase && BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD*36/(36-11)))
 153     n1 = n >> 1;
 154   else if (MAYBE_range_toom22 && BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD*36/(36-11)))
 155     n1 = n * 11 / (size_t) 36;  /* n1 ~= n*(1-.694...) */
 156   else if (BELOW_THRESHOLD (n, MUL_TOOM44_THRESHOLD*40/(40-9)))
 157     n1 = n * 9 / (size_t) 40;   /* n1 ~= n*(1-.775...) */
 158   else if (BELOW_THRESHOLD (n, MUL_TOOM8H_THRESHOLD*10/9))
 159     n1 = n * 7 / (size_t) 39;   /* n1 ~= n*(1-.821...) */
 160   /* n1 = n * 4 / (size_t) 31;  // n1 ~= n*(1-.871...) [TOOM66] */
 161   else
 162     n1 = n / (size_t) 10;               /* n1 ~= n*(1-.899...) [TOOM88] */
 163
 164   n2 = n - n1;
 165
 166   /* Split as x = x1 2^(n2 GMP_NUMB_BITS) + x0,
 167               y = y1 2^(n2 GMP_NUMB_BITS) + y0 */
 168
 169   /* x0 * y0 */
 170   mpn_mul_n (tp, xp, yp, n2);
 171   MPN_COPY (rp, tp, n2);
 172
 173   /* x1 * y0 * 2^(n2 GMP_NUMB_BITS) */
 174   if (BELOW_THRESHOLD (n1, MULLO_BASECASE_THRESHOLD))
 175     mpn_mul_basecase (tp + n, xp + n2, n1, yp, n1);
 176   else if (BELOW_THRESHOLD (n1, MULLO_DC_THRESHOLD))
 177     mpn_mullo_basecase (tp + n, xp + n2, yp, n1);
 178   else
 179     mpn_dc_mullo_n (tp + n, xp + n2, yp, n1, tp + n);
 180   mpn_add_n (rp + n2, tp + n2, tp + n, n1);
 181
 182   /* x0 * y1 * 2^(n2 GMP_NUMB_BITS) */
 183   if (BELOW_THRESHOLD (n1, MULLO_BASECASE_THRESHOLD))
 184     mpn_mul_basecase (tp + n, xp, n1, yp + n2, n1);
 185   else if (BELOW_THRESHOLD (n1, MULLO_DC_THRESHOLD))
 186     mpn_mullo_basecase (tp + n, xp, yp + n2, n1);
 187   else
 188     mpn_dc_mullo_n (tp + n, xp, yp + n2, n1, tp + n);
 189   mpn_add_n (rp + n2, rp + n2, tp + n, n1);
 190 }
 191
 192 /* Avoid zero allocations when MULLO_BASECASE_THRESHOLD is 0.  */
 193 #define MUL_BASECASE_ALLOC \
 194  (MULLO_BASECASE_THRESHOLD_LIMIT == 0 ? 1 : 2*MULLO_BASECASE_THRESHOLD_LIMIT)
 195
 196 /* FIXME: This function should accept a temporary area; dc_mullow_n
 197    accepts a pointer tp, and handle the case tp == rp, do the same here.
 198    Maybe recombine the two functions.
 199    THINK: If mpn_mul_basecase is always faster than mpn_mullo_basecase
 200           (typically thanks to mpn_addmul_2) should we unconditionally use
 201           mpn_mul_n?
 202 */
 203
 204 void
 205 mpn_mullo_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n)
 206 {
 207   ASSERT (n >= 1);
 208   ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
 209   ASSERT (! MPN_OVERLAP_P (rp, n, yp, n));
 210
 211   if (BELOW_THRESHOLD (n, MULLO_BASECASE_THRESHOLD))
 212     {
 213       /* Allocate workspace of fixed size on stack: fast! */
 214       mp_limb_t tp[MUL_BASECASE_ALLOC];
 215       mpn_mul_basecase (tp, xp, n, yp, n);
 216       MPN_COPY (rp, tp, n);
 217     }
 218   else if (BELOW_THRESHOLD (n, MULLO_DC_THRESHOLD))
 219     {
 220       mpn_mullo_basecase (rp, xp, yp, n);
 221     }
 222   else
 223     {
 224       mp_ptr tp;
 225       TMP_DECL;
 226       TMP_MARK;
 227       tp = TMP_ALLOC_LIMBS (mpn_mullo_n_itch (n));
 228       if (BELOW_THRESHOLD (n, MULLO_MUL_N_THRESHOLD))
 229         {
 230           mpn_dc_mullo_n (rp, xp, yp, n, tp);
 231         }
 232       else
 233         {
 234           /* For really large operands, use plain mpn_mul_n but throw away upper n
 235              limbs of result.  */
 236 #if !TUNE_PROGRAM_BUILD && (MULLO_MUL_N_THRESHOLD > MUL_FFT_THRESHOLD)
 237           mpn_fft_mul (tp, xp, n, yp, n);
 238 #else
 239           mpn_mul_n (tp, xp, yp, n);
 240 #endif
 241           MPN_COPY (rp, tp, n);
 242         }
 243       TMP_FREE;
 244     }
 245 }