mpn/generic/mu_bdiv_q.c

   1 /* mpn_mu_bdiv_q(qp,np,nn,dp,dn,tp) -- Compute {np,nn} / {dp,dn} mod B^nn.
   2    storing the result in {qp,nn}.  Overlap allowed between Q and N; all other
   3    overlap disallowed.
   4
   5    Contributed to the GNU project by Torbjorn Granlund.
   6
   7    THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES.  IT IS ONLY
   8    SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   9    GUARANTEED THAT THEY WILL CHANGE OR DISAPPEAR IN A FUTURE GMP RELEASE.
  10
  11 Copyright 2005, 2006, 2007, 2009, 2010 Free Software Foundation, Inc.
  12
  13 This file is part of the GNU MP Library.
  14
  15 The GNU MP Library is free software; you can redistribute it and/or modify
  16 it under the terms of the GNU Lesser General Public License as published by
  17 the Free Software Foundation; either version 3 of the License, or (at your
  18 option) any later version.
  19
  20 The GNU MP Library is distributed in the hope that it will be useful, but
  21 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  22 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
  23 License for more details.
  24
  25 You should have received a copy of the GNU Lesser General Public License
  26 along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */
  27
  28
  29 /*
  30    The idea of the algorithm used herein is to compute a smaller inverted value
  31    than used in the standard Barrett algorithm, and thus save time in the
  32    Newton iterations, and pay just a small price when using the inverted value
  33    for developing quotient bits.  This algorithm was presented at ICMS 2006.
  34 */
  35
  36 #include "gmp.h"
  37 #include "gmp-impl.h"
  38
  39
  40 /* N = {np,nn}
  41    D = {dp,dn}
  42
  43    Requirements: N >= D
  44                  D >= 1
  45                  D odd
  46                  dn >= 2
  47                  nn >= 2
  48                  scratch space as determined by mpn_mu_bdiv_q_itch(nn,dn).
  49
  50    Write quotient to Q = {qp,nn}.
  51
  52    FIXME: When iterating, perhaps do the small step before loop, not after.
  53    FIXME: Try to avoid the scalar divisions when computing inverse size.
  54    FIXME: Trim allocation for (qn > dn) case, 3*dn might be possible.  In
  55           particular, when dn==in, tp and rp could use the same space.
  56    FIXME: Trim final quotient calculation to qn limbs of precision.
  57 */
  58 void
  59 mpn_mu_bdiv_q (mp_ptr qp,
  60                mp_srcptr np, mp_size_t nn,
  61                mp_srcptr dp, mp_size_t dn,
  62                mp_ptr scratch)
  63 {
  64   mp_size_t qn;
  65   mp_size_t in;
  66   int cy, c0;
  67   mp_size_t tn, wn;
  68
  69   qn = nn;
  70
  71   ASSERT (dn >= 2);
  72   ASSERT (qn >= 2);
  73
  74   if (qn > dn)
  75     {
  76       mp_size_t b;
  77
  78       /* |_______________________|   dividend
  79                         |________|   divisor  */
  80
  81 #define ip           scratch                    /* in */
  82 #define rp           (scratch + in)             /* dn or rest >= binvert_itch(in) */
  83 #define tp           (scratch + in + dn)        /* dn+in or next_size(dn) */
  84 #define scratch_out  (scratch + in + dn + tn)   /* mulmod_bnm1_itch(next_size(dn)) */
  85
  86       /* Compute an inverse size that is a nice partition of the quotient.  */
  87       b = (qn - 1) / dn + 1;    /* ceil(qn/dn), number of blocks */
  88       in = (qn - 1) / b + 1;    /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
  89
  90       /* Some notes on allocation:
  91
  92          When in = dn, R dies when mpn_mullo returns, if in < dn the low in
  93          limbs of R dies at that point.  We could save memory by letting T live
  94          just under R, and let the upper part of T expand into R. These changes
  95          should reduce itch to perhaps 3dn.
  96        */
  97
  98       mpn_binvert (ip, dp, in, rp);
  99
 100       cy = 0;
 101
 102       MPN_COPY (rp, np, dn);
 103       np += dn;
 104       mpn_mullo_n (qp, rp, ip, in);
 105       qn -= in;
 106
 107       while (qn > in)
 108         {
 109           if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
 110             mpn_mul (tp, dp, dn, qp, in);       /* mulhi, need tp[dn+in-1...in] */
 111           else
 112             {
 113               tn = mpn_mulmod_bnm1_next_size (dn);
 114               mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
 115               wn = dn + in - tn;                /* number of wrapped limbs */
 116               if (wn > 0)
 117                 {
 118                   c0 = mpn_sub_n (tp + tn, tp, rp, wn);
 119                   mpn_decr_u (tp + wn, c0);
 120                 }
 121             }
 122
 123           qp += in;
 124           if (dn != in)
 125             {
 126               /* Subtract tp[dn-1...in] from partial remainder.  */
 127               cy += mpn_sub_n (rp, rp + in, tp + in, dn - in);
 128               if (cy == 2)
 129                 {
 130                   mpn_incr_u (tp + dn, 1);
 131                   cy = 1;
 132                 }
 133             }
 134           /* Subtract tp[dn+in-1...dn] from dividend.  */
 135           cy = mpn_sub_nc (rp + dn - in, np, tp + dn, in, cy);
 136           np += in;
 137           mpn_mullo_n (qp, rp, ip, in);
 138           qn -= in;
 139         }
 140
 141       /* Generate last qn limbs.
 142          FIXME: It should be possible to limit precision here, since qn is
 143          typically somewhat smaller than dn.  No big gains expected.  */
 144
 145       if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
 146         mpn_mul (tp, dp, dn, qp, in);           /* mulhi, need tp[qn+in-1...in] */
 147       else
 148         {
 149           tn = mpn_mulmod_bnm1_next_size (dn);
 150           mpn_mulmod_bnm1 (tp, tn, dp, dn, qp, in, scratch_out);
 151           wn = dn + in - tn;                    /* number of wrapped limbs */
 152           if (wn > 0)
 153             {
 154               c0 = mpn_sub_n (tp + tn, tp, rp, wn);
 155               mpn_decr_u (tp + wn, c0);
 156             }
 157         }
 158
 159       qp += in;
 160       if (dn != in)
 161         {
 162           cy += mpn_sub_n (rp, rp + in, tp + in, dn - in);
 163           if (cy == 2)
 164             {
 165               mpn_incr_u (tp + dn, 1);
 166               cy = 1;
 167             }
 168         }
 169
 170       mpn_sub_nc (rp + dn - in, np, tp + dn, qn - (dn - in), cy);
 171       mpn_mullo_n (qp, rp, ip, qn);
 172
 173 #undef ip
 174 #undef rp
 175 #undef tp
 176 #undef scratch_out
 177    }
 178   else
 179     {
 180       /* |_______________________|   dividend
 181                 |________________|   divisor  */
 182
 183 #define ip           scratch            /* in */
 184 #define tp           (scratch + in)     /* qn+in or next_size(qn) or rest >= binvert_itch(in) */
 185 #define scratch_out  (scratch + in + tn)/* mulmod_bnm1_itch(next_size(qn)) */
 186
 187       /* Compute half-sized inverse.  */
 188       in = qn - (qn >> 1);
 189
 190       mpn_binvert (ip, dp, in, tp);
 191
 192       mpn_mullo_n (qp, np, ip, in);             /* low `in' quotient limbs */
 193
 194       if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
 195         mpn_mul (tp, dp, qn, qp, in);           /* mulhigh */
 196       else
 197         {
 198           tn = mpn_mulmod_bnm1_next_size (qn);
 199           mpn_mulmod_bnm1 (tp, tn, dp, qn, qp, in, scratch_out);
 200           wn = qn + in - tn;                    /* number of wrapped limbs */
 201           if (wn > 0)
 202             {
 203               c0 = mpn_cmp (tp, np, wn) < 0;
 204               mpn_decr_u (tp + wn, c0);
 205             }
 206         }
 207
 208       mpn_sub_n (tp, np + in, tp + in, qn - in);
 209       mpn_mullo_n (qp + in, tp, ip, qn - in);   /* high qn-in quotient limbs */
 210
 211 #undef ip
 212 #undef tp
 213 #undef scratch_out
 214     }
 215 }
 216
 217 mp_size_t
 218 mpn_mu_bdiv_q_itch (mp_size_t nn, mp_size_t dn)
 219 {
 220   mp_size_t qn, in, tn, itch_binvert, itch_out, itches;
 221   mp_size_t b;
 222
 223   qn = nn;
 224
 225   if (qn > dn)
 226     {
 227       b = (qn - 1) / dn + 1;    /* ceil(qn/dn), number of blocks */
 228       in = (qn - 1) / b + 1;    /* ceil(qn/b) = ceil(qn / ceil(qn/dn)) */
 229       if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
 230         {
 231           tn = dn + in;
 232           itch_out = 0;
 233         }
 234       else
 235         {
 236           tn = mpn_mulmod_bnm1_next_size (dn);
 237           itch_out = mpn_mulmod_bnm1_itch (tn, dn, in);
 238         }
 239       itch_binvert = mpn_binvert_itch (in);
 240       itches = dn + tn + itch_out;
 241       return in + MAX (itches, itch_binvert);
 242     }
 243   else
 244     {
 245       in = qn - (qn >> 1);
 246       if (BELOW_THRESHOLD (in, MUL_TO_MULMOD_BNM1_FOR_2NXN_THRESHOLD))
 247         {
 248           tn = qn + in;
 249           itch_out = 0;
 250         }
 251       else
 252         {
 253           tn = mpn_mulmod_bnm1_next_size (qn);
 254           itch_out = mpn_mulmod_bnm1_itch (tn, qn, in);
 255         }
 256       itch_binvert = mpn_binvert_itch (in);
 257       itches = tn + itch_out;
 258       return in + MAX (itches, itch_binvert);
 259     }
 260 }