mpn/generic/toom42_mulmid.c

   1 /* mpn_toom42_mulmid -- toom42 middle product
   2
   3    Contributed by David Harvey.
   4
   5    THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   6    SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   7    GUARANTEED THAT IT'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
   8
   9 Copyright 2011 Free Software Foundation, Inc.
  10
  11 This file is part of the GNU MP Library.
  12
  13 The GNU MP Library is free software; you can redistribute it and/or modify
  14 it under the terms of either:
  15
  16   * the GNU Lesser General Public License as published by the Free
  17     Software Foundation; either version 3 of the License, or (at your
  18     option) any later version.
  19
  20 or
  21
  22   * the GNU General Public License as published by the Free Software
  23     Foundation; either version 2 of the License, or (at your option) any
  24     later version.
  25
  26 or both in parallel, as here.
  27
  28 The GNU MP Library is distributed in the hope that it will be useful, but
  29 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  30 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  31 for more details.
  32
  33 You should have received copies of the GNU General Public License and the
  34 GNU Lesser General Public License along with the GNU MP Library.  If not,
  35 see https://www.gnu.org/licenses/.  */
  36
  37
  38 #include "gmp.h"
  39 #include "gmp-impl.h"
  40
  41
  42
  43 /*
  44   Middle product of {ap,2n-1} and {bp,n}, output written to {rp,n+2}.
  45
  46   Neither ap nor bp may overlap rp.
  47
  48   Must have n >= 4.
  49
  50   Amount of scratch space required is given by mpn_toom42_mulmid_itch().
  51
  52   FIXME: this code assumes that n is small compared to GMP_NUMB_MAX. The exact
  53   requirements should be clarified.
  54 */
  55 void
  56 mpn_toom42_mulmid (mp_ptr rp, mp_srcptr ap, mp_srcptr bp, mp_size_t n,
  57                    mp_ptr scratch)
  58 {
  59   mp_limb_t cy, e[12], zh, zl;
  60   mp_size_t m;
  61   int neg;
  62
  63   ASSERT (n >= 4);
  64   ASSERT (! MPN_OVERLAP_P (rp, n + 2, ap, 2*n - 1));
  65   ASSERT (! MPN_OVERLAP_P (rp, n + 2, bp, n));
  66
  67   ap += n & 1;   /* handle odd row and diagonal later */
  68   m = n / 2;
  69
  70   /* (e0h:e0l) etc are correction terms, in 2's complement */
  71 #define e0l (e[0])
  72 #define e0h (e[1])
  73 #define e1l (e[2])
  74 #define e1h (e[3])
  75 #define e2l (e[4])
  76 #define e2h (e[5])
  77 #define e3l (e[6])
  78 #define e3h (e[7])
  79 #define e4l (e[8])
  80 #define e4h (e[9])
  81 #define e5l (e[10])
  82 #define e5h (e[11])
  83
  84 #define s (scratch + 2)
  85 #define t (rp + m + 2)
  86 #define p0 rp
  87 #define p1 scratch
  88 #define p2 (rp + m)
  89 #define next_scratch (scratch + 3*m + 1)
  90
  91   /*
  92             rp                            scratch
  93   |---------|-----------|    |---------|---------|----------|
  94   0         m         2m+2   0         m         2m        3m+1
  95             <----p2---->       <-------------s------------->
  96   <----p0----><---t---->     <----p1---->
  97   */
  98
  99   /* compute {s,3m-1} = {a,3m-1} + {a+m,3m-1} and error terms e0, e1, e2, e3 */
 100   cy = mpn_add_err1_n (s, ap, ap + m, &e0l, bp + m, m - 1, 0);
 101   cy = mpn_add_err2_n (s + m - 1, ap + m - 1, ap + 2*m - 1, &e1l,
 102                        bp + m, bp, m, cy);
 103   mpn_add_err1_n (s + 2*m - 1, ap + 2*m - 1, ap + 3*m - 1, &e3l, bp, m, cy);
 104
 105   /* compute t = (-1)^neg * ({b,m} - {b+m,m}) and error terms e4, e5 */
 106   if (mpn_cmp (bp + m, bp, m) < 0)
 107     {
 108       ASSERT_NOCARRY (mpn_sub_err2_n (t, bp, bp + m, &e4l,
 109                                       ap + m - 1, ap + 2*m - 1, m, 0));
 110       neg = 1;
 111     }
 112   else
 113     {
 114       ASSERT_NOCARRY (mpn_sub_err2_n (t, bp + m, bp, &e4l,
 115                                       ap + m - 1, ap + 2*m - 1, m, 0));
 116       neg = 0;
 117     }
 118
 119   /* recursive middle products. The picture is:
 120
 121       b[2m-1]   A   A   A   B   B   B   -   -   -   -   -
 122       ...       -   A   A   A   B   B   B   -   -   -   -
 123       b[m]      -   -   A   A   A   B   B   B   -   -   -
 124       b[m-1]    -   -   -   C   C   C   D   D   D   -   -
 125       ...       -   -   -   -   C   C   C   D   D   D   -
 126       b[0]      -   -   -   -   -   C   C   C   D   D   D
 127                a[0]   ...  a[m]  ...  a[2m]    ...    a[4m-2]
 128   */
 129
 130   if (m < MULMID_TOOM42_THRESHOLD)
 131     {
 132       /* A + B */
 133       mpn_mulmid_basecase (p0, s, 2*m - 1, bp + m, m);
 134       /* accumulate high limbs of p0 into e1 */
 135       ADDC_LIMB (cy, e1l, e1l, p0[m]);
 136       e1h += p0[m + 1] + cy;
 137       /* (-1)^neg * (B - C)   (overwrites first m limbs of s) */
 138       mpn_mulmid_basecase (p1, ap + m, 2*m - 1, t, m);
 139       /* C + D   (overwrites t) */
 140       mpn_mulmid_basecase (p2, s + m, 2*m - 1, bp, m);
 141     }
 142   else
 143     {
 144       /* as above, but use toom42 instead */
 145       mpn_toom42_mulmid (p0, s, bp + m, m, next_scratch);
 146       ADDC_LIMB (cy, e1l, e1l, p0[m]);
 147       e1h += p0[m + 1] + cy;
 148       mpn_toom42_mulmid (p1, ap + m, t, m, next_scratch);
 149       mpn_toom42_mulmid (p2, s + m, bp, m, next_scratch);
 150     }
 151
 152   /* apply error terms */
 153
 154   /* -e0 at rp[0] */
 155   SUBC_LIMB (cy, rp[0], rp[0], e0l);
 156   SUBC_LIMB (cy, rp[1], rp[1], e0h + cy);
 157   if (UNLIKELY (cy))
 158     {
 159       cy = (m > 2) ? mpn_sub_1 (rp + 2, rp + 2, m - 2, 1) : 1;
 160       SUBC_LIMB (cy, e1l, e1l, cy);
 161       e1h -= cy;
 162     }
 163
 164   /* z = e1 - e2 + high(p0) */
 165   SUBC_LIMB (cy, zl, e1l, e2l);
 166   zh = e1h - e2h - cy;
 167
 168   /* z at rp[m] */
 169   ADDC_LIMB (cy, rp[m], rp[m], zl);
 170   zh = (zh + cy) & GMP_NUMB_MASK;
 171   ADDC_LIMB (cy, rp[m + 1], rp[m + 1], zh);
 172   cy -= (zh >> (GMP_NUMB_BITS - 1));
 173   if (UNLIKELY (cy))
 174     {
 175       if (cy == 1)
 176         mpn_add_1 (rp + m + 2, rp + m + 2, m, 1);
 177       else /* cy == -1 */
 178         mpn_sub_1 (rp + m + 2, rp + m + 2, m, 1);
 179     }
 180
 181   /* e3 at rp[2*m] */
 182   ADDC_LIMB (cy, rp[2*m], rp[2*m], e3l);
 183   rp[2*m + 1] = (rp[2*m + 1] + e3h + cy) & GMP_NUMB_MASK;
 184
 185   /* e4 at p1[0] */
 186   ADDC_LIMB (cy, p1[0], p1[0], e4l);
 187   ADDC_LIMB (cy, p1[1], p1[1], e4h + cy);
 188   if (UNLIKELY (cy))
 189     mpn_add_1 (p1 + 2, p1 + 2, m, 1);
 190
 191   /* -e5 at p1[m] */
 192   SUBC_LIMB (cy, p1[m], p1[m], e5l);
 193   p1[m + 1] = (p1[m + 1] - e5h - cy) & GMP_NUMB_MASK;
 194
 195   /* adjustment if p1 ends up negative */
 196   cy = (p1[m + 1] >> (GMP_NUMB_BITS - 1));
 197
 198   /* add (-1)^neg * (p1 - B^m * p1) to output */
 199   if (neg)
 200     {
 201       mpn_sub_1 (rp + m + 2, rp + m + 2, m, cy);
 202       mpn_add (rp, rp, 2*m + 2, p1, m + 2);             /* A + C */
 203       mpn_sub_n (rp + m, rp + m, p1, m + 2);            /* B + D */
 204     }
 205   else
 206     {
 207       mpn_add_1 (rp + m + 2, rp + m + 2, m, cy);
 208       mpn_sub (rp, rp, 2*m + 2, p1, m + 2);             /* A + C */
 209       mpn_add_n (rp + m, rp + m, p1, m + 2);            /* B + D */
 210     }
 211
 212   /* odd row and diagonal */
 213   if (n & 1)
 214     {
 215       /*
 216         Products marked E are already done. We need to do products marked O.
 217
 218         OOOOO----
 219         -EEEEO---
 220         --EEEEO--
 221         ---EEEEO-
 222         ----EEEEO
 223        */
 224
 225       /* first row of O's */
 226       cy = mpn_addmul_1 (rp, ap - 1, n, bp[n - 1]);
 227       ADDC_LIMB (rp[n + 1], rp[n], rp[n], cy);
 228
 229       /* O's on diagonal */
 230       /* FIXME: should probably define an interface "mpn_mulmid_diag_1"
 231          that can handle the sum below. Currently we're relying on
 232          mulmid_basecase being pretty fast for a diagonal sum like this,
 233          which is true at least for the K8 asm version, but surely false
 234          for the generic version. */
 235       mpn_mulmid_basecase (e, ap + n - 1, n - 1, bp, n - 1);
 236       mpn_add_n (rp + n - 1, rp + n - 1, e, 3);
 237     }
 238 }