/* mpn_mu_bdiv_q(qp,np,nn,dp,dn,scratch) -- Compute {np,nn} / {dp,dn} mod B^nn,
   storing the result in {qp,nn}.  Overlap allowed between Q and N; all other
   overlap disallowed.

   Contributed to the GNU project by Torbjorn Granlund.
   THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
Copyright 2005, 2006, 2007, 2009, 2010 Free Software Foundation, Inc.

This file is part of the GNU MP Library.

The GNU MP Library is free software; you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at your
option) any later version.

The GNU MP Library is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.

You should have received a copy of the GNU Lesser General Public License
along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */

/*
   The idea of the algorithm used herein is to compute a smaller inverted value
   than used in the standard Barrett algorithm, and thus save time in the
   Newton iterations, and pay just a small price when using the inverted value
   for developing quotient bits.  This algorithm was presented at ICMS 2006.
*/

#include "gmp.h"
#include "gmp-impl.h"
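
/* Expository sketch (added here, not part of the original comments): the
   function computes Q = N * D^{-1} mod B^nn block-wise.  With I = 1/D mod
   B^in, each block develops `in' new quotient limbs as q = r * I mod B^in and
   then replaces the running remainder r by (r - q * D) / B^in, an exact
   division since q was chosen so that q * D is congruent to r mod B^in.  */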

/* N = {np,nn}
   D = {dp,dn}

   Requirements: D odd
                 dn >= 2
                 nn >= 2
                 scratch space as determined by mpn_mu_bdiv_q_itch(nn,dn).

   Write quotient to Q = {qp,nn}.

   FIXME: When iterating, perhaps do the small step before loop, not after.
   FIXME: Try to avoid the scalar divisions when computing inverse size.
   FIXME: Trim allocation for (qn > dn) case, 3*dn might be possible.  In
          particular, when dn==in, tp and rp could use the same space.
   FIXME: Trim final quotient calculation to qn limbs of precision.
*/
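
/* Illustrative caller-side sketch (added for exposition; allocation via the
   usual TMP_* machinery is an assumption, not a requirement of this file):

     mp_ptr scratch;
     TMP_DECL;
     TMP_MARK;
     scratch = TMP_ALLOC_LIMBS (mpn_mu_bdiv_q_itch (nn, dn));
     mpn_mu_bdiv_q (qp, np, nn, dp, dn, scratch);
     TMP_FREE;
*/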

void
mpn_mu_bdiv_q (mp_ptr qp,
               mp_srcptr np, mp_size_t nn,
               mp_srcptr dp, mp_size_t dn,
               mp_ptr scratch)
{
  mp_size_t qn, in, tn, wn, b;
  int cy, c0;

  qn = nn;

  if (qn > dn)
    {
      /* |_______________________|   dividend
                        |________|   divisor  */

#define ip           scratch                      /* in */
#define rp           (scratch + in)               /* dn or rest >= binvert_itch(in) */
#define tp           (scratch + in + dn)          /* dn+in or next_size(dn) */
#define scratch_out  (scratch + in + dn + tn)     /* mulmod_bnm1_itch(next_size(dn)) */

      /* Compute an inverse size that is a nice partition of the quotient.  */
      b = (qn - 1) / dn + 1;    /* ceil(qn/dn), number of blocks */
      in = (qn - 1) / b + 1;    /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
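      /* Worked example (added for exposition): with qn = 10 and dn = 4,
         b = ceil(10/4) = 3 blocks and in = ceil(10/3) = 4, so the quotient is
         produced in chunks of at most 4 limbs (4 + 4 + 2 here).  */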

      /* Some notes on allocation:

         When in = dn, R dies when mpn_mullo returns; if in < dn, the low in
         limbs of R die at that point.  We could save memory by letting T live
         just under R, and let the upper part of T expand into R.  These changes
         should reduce itch to perhaps 3dn.  */

      mpn_binvert (ip, dp, in, rp);

      cy = 0;

      MPN_COPY (rp, np, dn);
      np += dn;
      mpn_mullo_n (qp, rp, ip, in);
      qn -= in;

      while (qn > in)
        {
          if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
            mpn_mul (tp, dp, dn, qp, in);  /* mulhi, need tp[dn+in-1...in] */
          else
            {
              tn = mpn_mulmod_bnm1_next_size (dn);
              mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
              wn = dn + in - tn;           /* number of wrapped limbs */
              if (wn > 0)
                {
                  c0 = mpn_sub_n (tp + tn, tp, rp, wn);
                  mpn_decr_u (tp + wn, c0);
                }
            }
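
          /* Expository note (added): mpn_mulmod_bnm1 returns D * {qp,in} mod
             (B^tn - 1), so the wn high limbs of the full product have been
             folded into the low limbs.  Since D * {qp,in} agrees with R mod
             B^in and wn <= in, those low limbs are known to equal rp;
             subtracting rp recovers the wrapped high limbs at tp + tn, and the
             borrow undoes the carry the fold-in propagated into limb wn.  */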

          qp += in;
          if (dn != in)
            {
              /* Subtract tp[dn-1...in] from partial remainder.  */
              cy += mpn_sub_n (rp, rp + in, tp + in, dn - in);
              if (cy == 2)
                {
                  mpn_incr_u (tp + dn, 1);
                  cy = 1;
                }
            }

          /* Subtract tp[dn+in-1...dn] from dividend.  */
          cy = mpn_sub_nc (rp + dn - in, np, tp + dn, in, cy);
          np += in;
          mpn_mullo_n (qp, rp, ip, in);
          qn -= in;
        }

      /* Generate last qn limbs.
         FIXME: It should be possible to limit precision here, since qn is
         typically somewhat smaller than dn.  No big gains expected.  */

      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
        mpn_mul (tp, dp, dn, qp, in);  /* mulhi, need tp[qn+in-1...in] */
      else
        {
          tn = mpn_mulmod_bnm1_next_size (dn);
          mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
          wn = dn + in - tn;           /* number of wrapped limbs */
          if (wn > 0)
            {
              c0 = mpn_sub_n (tp + tn, tp, rp, wn);
              mpn_decr_u (tp + wn, c0);
            }
        }

      qp += in;
      if (dn != in)
        {
          cy += mpn_sub_n (rp, rp + in, tp + in, dn - in);
          if (cy == 2)
            {
              mpn_incr_u (tp + dn, 1);
              cy = 1;
            }
        }

      mpn_sub_nc (rp + dn - in, np, tp + dn, qn - (dn - in), cy);
      mpn_mullo_n (qp, rp, ip, qn);

#undef ip
#undef rp
#undef tp
#undef scratch_out
    }
  else
    {
      /* |_______________________|   dividend
         |________________|   divisor  */

#define ip           scratch               /* in */
#define tp           (scratch + in)        /* qn+in or next_size(qn) or rest >= binvert_itch(in) */
#define scratch_out  (scratch + in + tn)   /* mulmod_bnm1_itch(next_size(qn)) */

      /* Compute half-sized inverse.  */
      in = qn - (qn >> 1);
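      /* Worked example (added for exposition): for qn = 7, in = 7 - 3 = 4, so
         the low 4 quotient limbs come from the first mpn_mullo_n below and the
         remaining 3 from the wrap-up step at the end of this branch.  */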

      mpn_binvert (ip, dp, in, tp);

      mpn_mullo_n (qp, np, ip, in);        /* low `in' quotient limbs */

      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
        mpn_mul (tp, dp, qn, qp, in);  /* mulhigh */
      else
        {
          tn = mpn_mulmod_bnm1_next_size (qn);
          mpn_mulmod_bnm1 (tp, tn, dp, qn, qp, in, scratch_out);
          wn = qn + in - tn;           /* number of wrapped limbs */
          if (wn > 0)
            {
              c0 = mpn_cmp (tp, np, wn) < 0;
              mpn_decr_u (tp + wn, c0);
            }
        }
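
      /* Expository note (added): the limbs of tp above position `in' now agree
         with D times the low `in' quotient limbs.  Since that product agrees
         with N mod B^in, the qn-in limbs of (N - D*Qlow) / B^in can be formed
         by a plain mpn_sub_n on the parts above limb `in', and multiplying
         them by the inverse yields the remaining quotient limbs.  */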

      mpn_sub_n (tp, np + in, tp + in, qn - in);
      mpn_mullo_n (qp + in, tp, ip, qn - in);  /* high qn-in quotient limbs */

#undef ip
#undef tp
#undef scratch_out
    }
}

mp_size_t
mpn_mu_bdiv_q_itch (mp_size_t nn, mp_size_t dn)
{
  mp_size_t qn, in, tn, itch_binvert, itch_out, itches;
  mp_size_t b;

  qn = nn;

  if (qn > dn)
    {
      b = (qn - 1) / dn + 1;    /* ceil(qn/dn), number of blocks */
      in = (qn - 1) / b + 1;    /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
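
      /* Expository note (added): the mpn_binvert scratch is only live before
         the rp/tp areas come into use, so the two requirements can share space
         and the return value below takes their MAX rather than their sum.  */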
      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
        {
          tn = dn + in;
          itch_out = 0;
        }
      else
        {
          tn = mpn_mulmod_bnm1_next_size (dn);
          itch_out = mpn_mulmod_bnm1_itch (tn, dn, in);
        }
      itch_binvert = mpn_binvert_itch (in);
      itches = dn + tn + itch_out;
      return in + MAX (itches, itch_binvert);
    }
  else
    {
      in = qn - (qn >> 1);
      if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
        {
          tn = qn + in;
          itch_out = 0;
        }
      else
        {
          tn = mpn_mulmod_bnm1_next_size (qn);
          itch_out = mpn_mulmod_bnm1_itch (tn, qn, in);
        }
      itch_binvert = mpn_binvert_itch (in);
      itches = tn + itch_out;
      return in + MAX (itches, itch_binvert);
    }
}