1 /* mpn_mul -- Multiply two natural numbers.
3 Contributed to the GNU project by Torbjorn Granlund.
5 Copyright 1991, 1993, 1994, 1996, 1997, 1999, 2000, 2001, 2002, 2003, 2005,
6 2006, 2007, 2009, 2010 Free Software Foundation, Inc.
8 This file is part of the GNU MP Library.
10 The GNU MP Library is free software; you can redistribute it and/or modify
11 it under the terms of the GNU Lesser General Public License as published by
12 the Free Software Foundation; either version 3 of the License, or (at your
13 option) any later version.
15 The GNU MP Library is distributed in the hope that it will be useful, but
16 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
17 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
18 License for more details.
20 You should have received a copy of the GNU Lesser General Public License
21 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
27 #ifndef MUL_BASECASE_MAX_UN
28 #define MUL_BASECASE_MAX_UN 500
31 #define TOOM33_OK(an,bn) (6 + 2 * an < 3 * bn)
32 #define TOOM44_OK(an,bn) (12 + 3 * an < 4 * bn)
34 /* Multiply the natural numbers u (pointed to by UP, with UN limbs) and v
35 (pointed to by VP, with VN limbs), and store the result at PRODP. The
36 result is UN + VN limbs. Return the most significant limb of the result.
38 NOTE: The space pointed to by PRODP is overwritten before finished with U
39 and V, so overlap is an error.
43 2. PRODP != UP and PRODP != VP, i.e. the destination must be distinct from
44 the multiplier and the multiplicand. */
47 * The cutoff lines in the toomX2 and toomX3 code are now exactly between the
48 ideal lines of the surrounding algorithms. Is that optimal?
50 * The toomX3 code now uses a structure similar to the one of toomX2, except
51 that it loops longer in the unbalanced case. The result is that the
52 remaining area might have un < vn. Should we fix the toomX2 code in a
55 * The toomX3 code is used for the largest non-FFT unbalanced operands. It
56 therefore calls mpn_mul recursively for certain cases.
58 * Allocate static temp space using THRESHOLD variables (except for toom44
59 when !WANT_FFT). That way, we can typically have no TMP_ALLOC at all.
61 * We sort ToomX2 algorithms together, assuming the toom22, toom32, toom42
62 have the same vn threshold. This is not true, we should actually use
63 mul_basecase for slightly larger operands for toom32 than for toom22, and
64 even larger for toom42.
66 * That problem is even more prevalent for toomX3. We therefore use special
67 THRESHOLD variables there.
69 * Is our ITCH allocation correct?
72 #define ITCH (16*vn + 100)
75 mpn_mul (mp_ptr prodp,
76 mp_srcptr up, mp_size_t un,
77 mp_srcptr vp, mp_size_t vn)
81 ASSERT (! MPN_OVERLAP_P (prodp, un+vn, up, un));
82 ASSERT (! MPN_OVERLAP_P (prodp, un+vn, vp, vn));
87 mpn_sqr (prodp, up, un);
89 mpn_mul_n (prodp, up, vp, un);
91 else if (vn < MUL_TOOM22_THRESHOLD)
92 { /* plain schoolbook multiplication */
94 /* Unless un is very large, or else if have an applicable mpn_mul_N,
95 perform basecase multiply directly. */
96 if (un <= MUL_BASECASE_MAX_UN
97 #if HAVE_NATIVE_mpn_mul_2
103 mpn_mul_basecase (prodp, up, un, vp, vn);
106 /* We have un >> MUL_BASECASE_MAX_UN > vn. For better memory
107 locality, split up[] into MUL_BASECASE_MAX_UN pieces and multiply
108 these pieces with the vp[] operand. After each such partial
109 multiplication (but the last) we copy the most significant vn
110 limbs into a temporary buffer since that part would otherwise be
111 overwritten by the next multiplication. After the next
112 multiplication, we add it back. This illustrates the situation:
115 | |<------- un ------->|
116 _____________________|
118 /XX__________________/ |
119 _____________________ |
121 /XX__________________/ |
122 _____________________ |
124 /____________________/ |
125 ==================================================================
127 The parts marked with X are the parts whose sums are copied into
128 the temporary buffer. */
130 mp_limb_t tp[MUL_TOOM22_THRESHOLD_LIMIT];
132 ASSERT (MUL_TOOM22_THRESHOLD <= MUL_TOOM22_THRESHOLD_LIMIT);
134 mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
135 prodp += MUL_BASECASE_MAX_UN;
136 MPN_COPY (tp, prodp, vn); /* preserve high triangle */
137 up += MUL_BASECASE_MAX_UN;
138 un -= MUL_BASECASE_MAX_UN;
139 while (un > MUL_BASECASE_MAX_UN)
141 mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn);
142 cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
143 mpn_incr_u (prodp + vn, cy);
144 prodp += MUL_BASECASE_MAX_UN;
145 MPN_COPY (tp, prodp, vn); /* preserve high triangle */
146 up += MUL_BASECASE_MAX_UN;
147 un -= MUL_BASECASE_MAX_UN;
151 mpn_mul_basecase (prodp, up, un, vp, vn);
156 mpn_mul_basecase (prodp, vp, vn, up, un);
158 cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */
159 mpn_incr_u (prodp + vn, cy);
162 else if (BELOW_THRESHOLD (vn, MUL_TOOM33_THRESHOLD))
164 /* Use ToomX2 variants */
166 TMP_SDECL; TMP_SMARK;
168 scratch = TMP_SALLOC_LIMBS (ITCH);
170 /* FIXME: This condition (repeated in the loop below) leaves from a vn*vn
171 square to a (3vn-1)*vn rectangle. Leaving such a rectangle is hardly
172 wise; we would get better balance by slightly moving the bound. We
173 will sometimes end up with un < vn, like the the X3 arm below. */
179 /* The maximum ws usage is for the mpn_mul result. */
180 ws = TMP_SALLOC_LIMBS (4 * vn);
182 mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch);
189 mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch);
192 cy = mpn_add_n (prodp, prodp, ws, vn);
193 MPN_COPY (prodp + vn, ws + vn, 2 * vn);
194 mpn_incr_u (prodp + vn, cy);
201 mpn_toom22_mul (ws, up, un, vp, vn, scratch);
202 else if (4 * un < 7 * vn)
203 mpn_toom32_mul (ws, up, un, vp, vn, scratch);
205 mpn_toom42_mul (ws, up, un, vp, vn, scratch);
207 cy = mpn_add_n (prodp, prodp, ws, vn);
208 MPN_COPY (prodp + vn, ws + vn, un);
209 mpn_incr_u (prodp + vn, cy);
214 mpn_toom22_mul (prodp, up, un, vp, vn, scratch);
215 else if (4 * un < 7 * vn)
216 mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
218 mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
222 else if (BELOW_THRESHOLD ((un + vn) >> 1, MUL_FFT_THRESHOLD) ||
223 BELOW_THRESHOLD (3 * vn, MUL_FFT_THRESHOLD))
225 /* Handle the largest operands that are not in the FFT range. The 2nd
226 condition makes very unbalanced operands avoid the FFT code (except
227 perhaps as coefficient products of the Toom code. */
229 if (BELOW_THRESHOLD (vn, MUL_TOOM44_THRESHOLD) || !TOOM44_OK (un, vn))
231 /* Use ToomX3 variants */
233 TMP_SDECL; TMP_SMARK;
235 scratch = TMP_SALLOC_LIMBS (ITCH);
237 if (2 * un >= 5 * vn)
242 /* The maximum ws usage is for the mpn_mul result. */
243 ws = TMP_SALLOC_LIMBS (7 * vn >> 1);
245 if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
246 mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch);
248 mpn_toom63_mul (prodp, up, 2 * vn, vp, vn, scratch);
253 while (2 * un >= 5 * vn) /* un >= 2.5vn */
255 if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
256 mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch);
258 mpn_toom63_mul (ws, up, 2 * vn, vp, vn, scratch);
261 cy = mpn_add_n (prodp, prodp, ws, vn);
262 MPN_COPY (prodp + vn, ws + vn, 2 * vn);
263 mpn_incr_u (prodp + vn, cy);
267 /* vn / 2 <= un < 2.5vn */
270 mpn_mul (ws, vp, vn, up, un);
272 mpn_mul (ws, up, un, vp, vn);
274 cy = mpn_add_n (prodp, prodp, ws, vn);
275 MPN_COPY (prodp + vn, ws + vn, un);
276 mpn_incr_u (prodp + vn, cy);
281 mpn_toom33_mul (prodp, up, un, vp, vn, scratch);
282 else if (2 * un < 3 * vn)
284 if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM43_THRESHOLD))
285 mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
287 mpn_toom43_mul (prodp, up, un, vp, vn, scratch);
289 else if (6 * un < 11 * vn)
293 if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM53_THRESHOLD))
294 mpn_toom32_mul (prodp, up, un, vp, vn, scratch);
296 mpn_toom53_mul (prodp, up, un, vp, vn, scratch);
300 if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM53_THRESHOLD))
301 mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
303 mpn_toom53_mul (prodp, up, un, vp, vn, scratch);
308 if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD))
309 mpn_toom42_mul (prodp, up, un, vp, vn, scratch);
311 mpn_toom63_mul (prodp, up, un, vp, vn, scratch);
321 if (BELOW_THRESHOLD (vn, MUL_TOOM6H_THRESHOLD))
323 scratch = TMP_ALLOC_LIMBS (mpn_toom44_mul_itch (un, vn));
324 mpn_toom44_mul (prodp, up, un, vp, vn, scratch);
326 else if (BELOW_THRESHOLD (vn, MUL_TOOM8H_THRESHOLD))
328 scratch = TMP_ALLOC_LIMBS (mpn_toom6h_mul_itch (un, vn));
329 mpn_toom6h_mul (prodp, up, un, vp, vn, scratch);
333 scratch = TMP_ALLOC_LIMBS (mpn_toom8h_mul_itch (un, vn));
334 mpn_toom8h_mul (prodp, up, un, vp, vn, scratch);
347 /* The maximum ws usage is for the mpn_mul result. */
348 ws = TMP_BALLOC_LIMBS (9 * vn >> 1);
350 mpn_fft_mul (prodp, up, 3 * vn, vp, vn);
355 while (2 * un >= 7 * vn) /* un >= 3.5vn */
357 mpn_fft_mul (ws, up, 3 * vn, vp, vn);
360 cy = mpn_add_n (prodp, prodp, ws, vn);
361 MPN_COPY (prodp + vn, ws + vn, 3 * vn);
362 mpn_incr_u (prodp + vn, cy);
366 /* vn / 2 <= un < 3.5vn */
369 mpn_mul (ws, vp, vn, up, un);
371 mpn_mul (ws, up, un, vp, vn);
373 cy = mpn_add_n (prodp, prodp, ws, vn);
374 MPN_COPY (prodp + vn, ws + vn, un);
375 mpn_incr_u (prodp + vn, cy);
380 mpn_fft_mul (prodp, up, un, vp, vn);
383 return prodp[un + vn - 1]; /* historic */