/* Interpolation for the algorithm Toom-Cook 6.5-way.
3 Contributed to the GNU project by Marco Bodrato.
5 THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
6 SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
7 GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
9 Copyright 2009, 2010 Free Software Foundation, Inc.
11 This file is part of the GNU MP Library.
13 The GNU MP Library is free software; you can redistribute it and/or modify
14 it under the terms of the GNU Lesser General Public License as published by
15 the Free Software Foundation; either version 3 of the License, or (at your
16 option) any later version.
18 The GNU MP Library is distributed in the hope that it will be useful, but
19 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
20 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
21 License for more details.
23 You should have received a copy of the GNU Lesser General Public License
24 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
31 #if HAVE_NATIVE_mpn_sublsh_n
32 #define DO_mpn_sublsh_n(dst,src,n,s,ws) mpn_sublsh_n(dst,dst,src,n,s)
35 DO_mpn_sublsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
38 return mpn_submul_1(dst,src,n,CNST_LIMB(1) <<(s));
41 __cy = mpn_lshift(ws,src,n,s);
42 return __cy + mpn_sub_n(dst,dst,ws,n);
47 #if HAVE_NATIVE_mpn_addlsh_n
48 #define DO_mpn_addlsh_n(dst,src,n,s,ws) mpn_addlsh_n(dst,dst,src,n,s)
51 DO_mpn_addlsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
54 return mpn_addmul_1(dst,src,n,CNST_LIMB(1) <<(s));
57 __cy = mpn_lshift(ws,src,n,s);
58 return __cy + mpn_add_n(dst,dst,ws,n);
63 #if HAVE_NATIVE_mpn_subrsh
64 #define DO_mpn_subrsh(dst,nd,src,ns,s,ws) mpn_subrsh(dst,nd,src,ns,s)
66 /* FIXME: This is not a correct definition, it assumes no carry */
67 #define DO_mpn_subrsh(dst,nd,src,ns,s,ws) \
70 MPN_DECR_U (dst, nd, src[0] >> s); \
71 __cy = DO_mpn_sublsh_n (dst, src + 1, ns - 1, GMP_NUMB_BITS - s, ws); \
72 MPN_DECR_U (dst + ns - 1, nd - ns + 1, __cy); \
77 #if GMP_NUMB_BITS < 21
78 #error Not implemented: Both sublsh_n(,,,20) should be corrected.
81 #if GMP_NUMB_BITS < 16
82 #error Not implemented: divexact_by42525 needs splitting.
85 #if GMP_NUMB_BITS < 12
86 #error Not implemented: Hard to adapt...
89 /* FIXME: tuneup should decide the best variant */
90 #ifndef AORSMUL_FASTER_AORS_AORSLSH
91 #define AORSMUL_FASTER_AORS_AORSLSH 1
93 #ifndef AORSMUL_FASTER_AORS_2AORSLSH
94 #define AORSMUL_FASTER_AORS_2AORSLSH 1
96 #ifndef AORSMUL_FASTER_2AORSLSH
97 #define AORSMUL_FASTER_2AORSLSH 1
99 #ifndef AORSMUL_FASTER_3AORSLSH
100 #define AORSMUL_FASTER_3AORSLSH 1
104 ((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) | 0x39)
106 #define BINVERT_255 \
107 (GMP_NUMB_MAX - ((GMP_NUMB_MAX / 255) << (8 - GMP_NUMB_BITS % 8)))
109 /* FIXME: find some more general expressions for 2835^-1, 42525^-1 */
110 #if GMP_LIMB_BITS == 32
111 #define BINVERT_2835 (GMP_NUMB_MASK & CNST_LIMB(0x53E3771B))
112 #define BINVERT_42525 (GMP_NUMB_MASK & CNST_LIMB(0x9F314C35))
114 #if GMP_LIMB_BITS == 64
115 #define BINVERT_2835 (GMP_NUMB_MASK & CNST_LIMB(0x938CC70553E3771B))
116 #define BINVERT_42525 (GMP_NUMB_MASK & CNST_LIMB(0xE7B40D449F314C35))
120 #ifndef mpn_divexact_by255
121 #if GMP_NUMB_BITS % 8 == 0
122 #define mpn_divexact_by255(dst,src,size) \
123 (255 & 1 * mpn_bdiv_dbm1 (dst, src, size, __GMP_CAST (mp_limb_t, GMP_NUMB_MASK / 255)))
125 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
126 #define mpn_divexact_by255(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(255),BINVERT_255,0)
128 #define mpn_divexact_by255(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(255))
133 #ifndef mpn_divexact_by9x4
134 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
135 #define mpn_divexact_by9x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(9),BINVERT_9,2)
137 #define mpn_divexact_by9x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(9)<<2)
141 #ifndef mpn_divexact_by42525
142 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_42525)
143 #define mpn_divexact_by42525(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(42525),BINVERT_42525,0)
145 #define mpn_divexact_by42525(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(42525))
149 #ifndef mpn_divexact_by2835x4
150 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_2835)
151 #define mpn_divexact_by2835x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(2835),BINVERT_2835,2)
153 #define mpn_divexact_by2835x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(2835)<<2)
157 /* Interpolation for Toom-6.5 (or Toom-6), using the evaluation
158 points: infinity(6.5 only), +-4, +-2, +-1, +-1/4, +-1/2, 0. More precisely,
159 we want to compute f(2^(GMP_NUMB_BITS * n)) for a polynomial f of
   degree 11 (or 10), given the 12 (resp. 11) values:

     r0 = limit at infinity of f(x) / x^11,
     r1 = f(4), f(-4),
     r2 = f(2), f(-2),
     r3 = f(1), f(-1),
     r4 = f(1/4), f(-1/4),
     r5 = f(1/2), f(-1/2),
     r6 = f(0).
170 All couples of the form f(n),f(-n) must be already mixed with
171 toom_couple_handling(f(n),...,f(-n),...)
173 The result is stored in {pp, spt + 7*n (or 6*n)}.
174 At entry, r6 is stored at {pp, 2n},
175 r4 is stored at {pp + 3n, 3n + 1}.
176 r2 is stored at {pp + 7n, 3n + 1}.
177 r0 is stored at {pp +11n, spt}.
179 The other values are 3n+1 limbs each (with most significant limbs small).
   Negative intermediate results are stored in two's-complement form.
182 Inputs are destroyed.
186 mpn_toom_interpolate_12pts (mp_ptr pp, mp_ptr r1, mp_ptr r3, mp_ptr r5,
187 mp_size_t n, mp_size_t spt, int half, mp_ptr wsi)
195 #define r4 (pp + n3) /* 3n+1 */
196 #define r2 (pp + 7 * n) /* 3n+1 */
197 #define r0 (pp +11 * n) /* s+t <= 2*n */
199 /******************************* interpolation *****************************/
201 cy = mpn_sub_n (r3, r3, r0, spt);
202 MPN_DECR_U (r3 + spt, n3p1 - spt, cy);
204 cy = DO_mpn_sublsh_n (r2, r0, spt, 10, wsi);
205 MPN_DECR_U (r2 + spt, n3p1 - spt, cy);
206 DO_mpn_subrsh(r5, n3p1, r0, spt, 2, wsi);
208 cy = DO_mpn_sublsh_n (r1, r0, spt, 20, wsi);
209 MPN_DECR_U (r1 + spt, n3p1 - spt, cy);
210 DO_mpn_subrsh(r4, n3p1, r0, spt, 4, wsi);
213 r4[n3] -= DO_mpn_sublsh_n (r4 + n, pp, 2 * n, 20, wsi);
214 DO_mpn_subrsh(r1 + n, 2 * n + 1, pp, 2 * n, 4, wsi);
216 #if HAVE_NATIVE_mpn_add_n_sub_n
217 mpn_add_n_sub_n (r1, r4, r4, r1, n3p1);
219 ASSERT_NOCARRY(mpn_add_n (wsi, r1, r4, n3p1));
220 mpn_sub_n (r4, r4, r1, n3p1); /* can be negative */
221 MP_PTR_SWAP(r1, wsi);
224 r5[n3] -= DO_mpn_sublsh_n (r5 + n, pp, 2 * n, 10, wsi);
225 DO_mpn_subrsh(r2 + n, 2 * n + 1, pp, 2 * n, 2, wsi);
227 #if HAVE_NATIVE_mpn_add_n_sub_n
228 mpn_add_n_sub_n (r2, r5, r5, r2, n3p1);
230 mpn_sub_n (wsi, r5, r2, n3p1); /* can be negative */
231 ASSERT_NOCARRY(mpn_add_n (r2, r2, r5, n3p1));
232 MP_PTR_SWAP(r5, wsi);
235 r3[n3] -= mpn_sub_n (r3+n, r3+n, pp, 2 * n);
237 #if AORSMUL_FASTER_AORS_AORSLSH
238 mpn_submul_1 (r4, r5, n3p1, 257); /* can be negative */
240 mpn_sub_n (r4, r4, r5, n3p1); /* can be negative */
241 DO_mpn_sublsh_n (r4, r5, n3p1, 8, wsi); /* can be negative */
  /* A division by 2835x4 follows.  Warning: the operand can be negative! */
244 mpn_divexact_by2835x4(r4, r4, n3p1);
245 if ((r4[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS-3))) != 0)
246 r4[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS-2));
248 #if AORSMUL_FASTER_2AORSLSH
249 mpn_addmul_1 (r5, r4, n3p1, 60); /* can be negative */
251 DO_mpn_sublsh_n (r5, r4, n3p1, 2, wsi); /* can be negative */
252 DO_mpn_addlsh_n (r5, r4, n3p1, 6, wsi); /* can give a carry */
254 mpn_divexact_by255(r5, r5, n3p1);
256 ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r3, n3p1, 5, wsi));
258 #if AORSMUL_FASTER_3AORSLSH
259 ASSERT_NOCARRY(mpn_submul_1 (r1, r2, n3p1, 100));
261 ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 6, wsi));
262 ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 5, wsi));
263 ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 2, wsi));
265 ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r3, n3p1, 9, wsi));
266 mpn_divexact_by42525(r1, r1, n3p1);
268 #if AORSMUL_FASTER_AORS_2AORSLSH
269 ASSERT_NOCARRY(mpn_submul_1 (r2, r1, n3p1, 225));
271 ASSERT_NOCARRY(mpn_sub_n (r2, r2, r1, n3p1));
272 ASSERT_NOCARRY(DO_mpn_addlsh_n (r2, r1, n3p1, 5, wsi));
273 ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r1, n3p1, 8, wsi));
275 mpn_divexact_by9x4(r2, r2, n3p1);
277 ASSERT_NOCARRY(mpn_sub_n (r3, r3, r2, n3p1));
279 mpn_sub_n (r4, r2, r4, n3p1);
280 ASSERT_NOCARRY(mpn_rshift(r4, r4, n3p1, 1));
281 ASSERT_NOCARRY(mpn_sub_n (r2, r2, r4, n3p1));
283 mpn_add_n (r5, r5, r1, n3p1);
284 ASSERT_NOCARRY(mpn_rshift(r5, r5, n3p1, 1));
286 /* last interpolation steps... */
287 ASSERT_NOCARRY(mpn_sub_n (r3, r3, r1, n3p1));
288 ASSERT_NOCARRY(mpn_sub_n (r1, r1, r5, n3p1));
289 /* ... could be mixed with recomposition
290 ||H-r5|M-r5|L-r5| ||H-r1|M-r1|L-r1|
293 /***************************** recomposition *******************************/
295 pp[] prior to operations:
296 |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp
298 summation scheme for remaining operations:
299 |__12|n_11|n_10|n__9|n__8|n__7|n__6|n__5|n__4|n__3|n__2|n___|n___|pp
300 |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp
301 ||H r1|M r1|L r1| ||H r3|M r3|L r3| ||H_r5|M_r5|L_r5|
304 cy = mpn_add_n (pp + n, pp + n, r5, n);
305 cy = mpn_add_1 (pp + 2 * n, r5 + n, n, cy);
306 #if HAVE_NATIVE_mpn_add_nc
307 cy = r5[n3] + mpn_add_nc(pp + n3, pp + n3, r5 + 2 * n, n, cy);
309 MPN_INCR_U (r5 + 2 * n, n + 1, cy);
310 cy = r5[n3] + mpn_add_n (pp + n3, pp + n3, r5 + 2 * n, n);
312 MPN_INCR_U (pp + n3 + n, 2 * n + 1, cy);
314 pp[2 * n3]+= mpn_add_n (pp + 5 * n, pp + 5 * n, r3, n);
315 cy = mpn_add_1 (pp + 2 * n3, r3 + n, n, pp[2 * n3]);
316 #if HAVE_NATIVE_mpn_add_nc
317 cy = r3[n3] + mpn_add_nc(pp + 7 * n, pp + 7 * n, r3 + 2 * n, n, cy);
319 MPN_INCR_U (r3 + 2 * n, n + 1, cy);
320 cy = r3[n3] + mpn_add_n (pp + 7 * n, pp + 7 * n, r3 + 2 * n, n);
322 MPN_INCR_U (pp + 8 * n, 2 * n + 1, cy);
324 pp[10*n]+=mpn_add_n (pp + 9 * n, pp + 9 * n, r1, n);
326 cy = mpn_add_1 (pp + 10 * n, r1 + n, n, pp[10 * n]);
327 #if HAVE_NATIVE_mpn_add_nc
328 if (LIKELY (spt > n)) {
329 cy = r1[n3] + mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, n, cy);
330 MPN_INCR_U (pp + 4 * n3, spt - n, cy);
332 ASSERT_NOCARRY(mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt, cy));
335 MPN_INCR_U (r1 + 2 * n, n + 1, cy);
336 if (LIKELY (spt > n)) {
337 cy = r1[n3] + mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, n);
338 MPN_INCR_U (pp + 4 * n3, spt - n, cy);
340 ASSERT_NOCARRY(mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt));
344 ASSERT_NOCARRY(mpn_add_1 (pp + 10 * n, r1 + n, spt, pp[10 * n]));