1 /* mpn_mullo_n -- multiply two n-limb numbers and return the low n limbs
4 Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.
6 THIS IS (FOR NOW) AN INTERNAL FUNCTION. IT IS ONLY SAFE TO REACH THIS
7 FUNCTION THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST GUARANTEED
8 THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
10 Copyright 2004, 2005, 2009, 2010 Free Software Foundation, Inc.
12 This file is part of the GNU MP Library.
14 The GNU MP Library is free software; you can redistribute it and/or modify
15 it under the terms of the GNU Lesser General Public License as published by
16 the Free Software Foundation; either version 3 of the License, or (at your
17 option) any later version.
19 The GNU MP Library is distributed in the hope that it will be useful, but
20 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
21 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
22 License for more details.
24 You should have received a copy of the GNU Lesser General Public License
25 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
31 #ifndef MULLO_BASECASE_THRESHOLD
32 #define MULLO_BASECASE_THRESHOLD 0 /* never use mpn_mul_basecase */
35 #ifndef MULLO_DC_THRESHOLD
36 #define MULLO_DC_THRESHOLD 3*MUL_TOOM22_THRESHOLD
39 #ifndef MULLO_MUL_N_THRESHOLD
40 #define MULLO_MUL_N_THRESHOLD MUL_FFT_THRESHOLD
43 #if TUNE_PROGRAM_BUILD
44 #define MAYBE_range_basecase 1
45 #define MAYBE_range_toom22 1
47 #define MAYBE_range_basecase \
48 ((MULLO_DC_THRESHOLD == 0 ? MULLO_BASECASE_THRESHOLD : MULLO_DC_THRESHOLD) < MUL_TOOM22_THRESHOLD*36/(36-11))
49 #define MAYBE_range_toom22 \
50 ((MULLO_DC_THRESHOLD == 0 ? MULLO_BASECASE_THRESHOLD : MULLO_DC_THRESHOLD) < MUL_TOOM33_THRESHOLD*36/(36-11) )
53 /* THINK: The DC strategy uses different constants in different Toom's
54 ranges. Something smoother?
58 Compute the least significant half of the product {xy,n}*{yp,n}, or
59 formally {rp,n} = {xy,n}*{yp,n} Mod (B^n).
61 Above the given threshold, the Divide and Conquer strategy is used.
62 The operands are split in two, and a full product plus two mullo
63 are used to obtain the final result. The more natural strategy is to
64 split in two halves, but this is far from optimal when a
65 sub-quadratic multiplication is used.
67 Mulders suggests an unbalanced split in favour of the full product,
68 split n = n1 + n2, where an = n1 <= n2 = (1-a)n; i.e. 0 < a <= 1/2.
70 To compute the value of a, we assume that the cost of mullo for a
71 given size ML(n) is a fraction of the cost of a full product with
72 same size M(n), and the cost M(n)=n^e for some exponent 1 < e <= 2;
75 ML(n) = 2*ML(an) + M((1-a)n) => k*M(n) = 2*k*M(n)*a^e + M(n)*(1-a)^e
77 Given a value for e, want to minimise the value of k, i.e. the
78 function k=(1-a)^e/(1-2*a^e).
80 With e=2, the exponent for schoolbook multiplication, the minimum is
81 given by the values a=1-a=1/2.
83 With e=log(3)/log(2), the exponent for Karatsuba (aka toom22),
84 Mulders compute (1-a) = 0.694... and we approximate a with 11/36.
86 Other possible approximations follow:
87 e=log(5)/log(3) [Toom-3] -> a ~= 9/40
88 e=log(7)/log(4) [Toom-4] -> a ~= 7/39
89 e=log(11)/log(6) [Toom-6] -> a ~= 1/8
90 e=log(15)/log(8) [Toom-8] -> a ~= 1/10
92 The values above where obtained with the following trivial commands
95 fun(e,a)=(1-a)^e/(1-2*a^e)
96 mul(a,b,c)={local(m,x,p);if(b-c<1/10000,(b+c)/2,m=1;x=b;forstep(p=c,b,(b-c)/8,if(fun(a,p)<m,m=fun(a,p);x=p));mul(a,(b+x)/2,(c+x)/2))}
97 contfracpnqn(contfrac(mul(log(2*2-1)/log(2),1/2,0),5))
98 contfracpnqn(contfrac(mul(log(3*2-1)/log(3),1/2,0),5))
99 contfracpnqn(contfrac(mul(log(4*2-1)/log(4),1/2,0),5))
100 contfracpnqn(contfrac(mul(log(6*2-1)/log(6),1/2,0),3))
101 contfracpnqn(contfrac(mul(log(8*2-1)/log(8),1/2,0),3))
114 For an actual implementation, the assumption that M(n)=n^e is
115 incorrect, as a consequence also the assumption that ML(n)=k*M(n)
116 with a constant k is wrong.
118 But theory suggest us two things:
119 - the best the multiplication product is (lower e), the more k
120 approaches 1, and a approaches 0.
122 - A value for a smaller than optimal is probably less bad than a
123 bigger one: e.g. let e=log(3)/log(2), a=0.3058_ the optimal
124 value, and k(a)=0.808_ the mul/mullo speed ratio. We get
125 k(a+1/6)=0.929_ but k(a-1/6)=0.865_.
129 mpn_mullo_n_itch (mp_size_t n)
135 mpn_dc_mullo_n requires a scratch space of 2*n limbs at tp.
139 mpn_dc_mullo_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n, mp_ptr tp)
143 ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
144 ASSERT (! MPN_OVERLAP_P (rp, n, yp, n));
145 ASSERT (MPN_SAME_OR_SEPARATE2_P(rp, n, tp, 2*n));
147 /* Divide-and-conquer */
149 /* We need fractional approximation of the value 0 < a <= 1/2
150 giving the minimum in the function k=(1-a)^e/(1-2*a^e).
152 if (MAYBE_range_basecase && BELOW_THRESHOLD (n, MUL_TOOM22_THRESHOLD*36/(36-11)))
154 else if (MAYBE_range_toom22 && BELOW_THRESHOLD (n, MUL_TOOM33_THRESHOLD*36/(36-11)))
155 n1 = n * 11 / (size_t) 36; /* n1 ~= n*(1-.694...) */
156 else if (BELOW_THRESHOLD (n, MUL_TOOM44_THRESHOLD*40/(40-9)))
157 n1 = n * 9 / (size_t) 40; /* n1 ~= n*(1-.775...) */
158 else if (BELOW_THRESHOLD (n, MUL_TOOM8H_THRESHOLD*10/9))
159 n1 = n * 7 / (size_t) 39; /* n1 ~= n*(1-.821...) */
160 /* n1 = n * 4 / (size_t) 31; // n1 ~= n*(1-.871...) [TOOM66] */
162 n1 = n / (size_t) 10; /* n1 ~= n*(1-.899...) [TOOM88] */
166 /* Split as x = x1 2^(n2 GMP_NUMB_BITS) + x0,
167 y = y1 2^(n2 GMP_NUMB_BITS) + y0 */
170 mpn_mul_n (tp, xp, yp, n2);
171 MPN_COPY (rp, tp, n2);
173 /* x1 * y0 * 2^(n2 GMP_NUMB_BITS) */
174 if (BELOW_THRESHOLD (n1, MULLO_BASECASE_THRESHOLD))
175 mpn_mul_basecase (tp + n, xp + n2, n1, yp, n1);
176 else if (BELOW_THRESHOLD (n1, MULLO_DC_THRESHOLD))
177 mpn_mullo_basecase (tp + n, xp + n2, yp, n1);
179 mpn_dc_mullo_n (tp + n, xp + n2, yp, n1, tp + n);
180 mpn_add_n (rp + n2, tp + n2, tp + n, n1);
182 /* x0 * y1 * 2^(n2 GMP_NUMB_BITS) */
183 if (BELOW_THRESHOLD (n1, MULLO_BASECASE_THRESHOLD))
184 mpn_mul_basecase (tp + n, xp, n1, yp + n2, n1);
185 else if (BELOW_THRESHOLD (n1, MULLO_DC_THRESHOLD))
186 mpn_mullo_basecase (tp + n, xp, yp + n2, n1);
188 mpn_dc_mullo_n (tp + n, xp, yp + n2, n1, tp + n);
189 mpn_add_n (rp + n2, rp + n2, tp + n, n1);
192 /* Avoid zero allocations when MULLO_BASECASE_THRESHOLD is 0. */
193 #define MUL_BASECASE_ALLOC \
194 (MULLO_BASECASE_THRESHOLD_LIMIT == 0 ? 1 : 2*MULLO_BASECASE_THRESHOLD_LIMIT)
196 /* FIXME: This function should accept a temporary area; dc_mullow_n
197 accepts a pointer tp, and handle the case tp == rp, do the same here.
198 Maybe recombine the two functions.
199 THINK: If mpn_mul_basecase is always faster than mpn_mullo_basecase
200 (typically thanks to mpn_addmul_2) should we unconditionally use
205 mpn_mullo_n (mp_ptr rp, mp_srcptr xp, mp_srcptr yp, mp_size_t n)
208 ASSERT (! MPN_OVERLAP_P (rp, n, xp, n));
209 ASSERT (! MPN_OVERLAP_P (rp, n, yp, n));
211 if (BELOW_THRESHOLD (n, MULLO_BASECASE_THRESHOLD))
213 /* Allocate workspace of fixed size on stack: fast! */
214 mp_limb_t tp[MUL_BASECASE_ALLOC];
215 mpn_mul_basecase (tp, xp, n, yp, n);
216 MPN_COPY (rp, tp, n);
218 else if (BELOW_THRESHOLD (n, MULLO_DC_THRESHOLD))
220 mpn_mullo_basecase (rp, xp, yp, n);
227 tp = TMP_ALLOC_LIMBS (mpn_mullo_n_itch (n));
228 if (BELOW_THRESHOLD (n, MULLO_MUL_N_THRESHOLD))
230 mpn_dc_mullo_n (rp, xp, yp, n, tp);
234 /* For really large operands, use plain mpn_mul_n but throw away upper n
236 #if !TUNE_PROGRAM_BUILD && (MULLO_MUL_N_THRESHOLD > MUL_FFT_THRESHOLD)
237 mpn_fft_mul (tp, xp, n, yp, n);
239 mpn_mul_n (tp, xp, yp, n);
241 MPN_COPY (rp, tp, n);