/* Interpolation for the algorithm Toom-Cook 6.5-way.
3 Contributed to the GNU project by Marco Bodrato.
5 THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY
6 SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST
7 GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.
9 Copyright 2009, 2010 Free Software Foundation, Inc.
11 This file is part of the GNU MP Library.
13 The GNU MP Library is free software; you can redistribute it and/or modify
14 it under the terms of the GNU Lesser General Public License as published by
15 the Free Software Foundation; either version 3 of the License, or (at your
16 option) any later version.
18 The GNU MP Library is distributed in the hope that it will be useful, but
19 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
20 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
21 License for more details.
23 You should have received a copy of the GNU Lesser General Public License
24 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */
31 #if HAVE_NATIVE_mpn_sublsh_n
32 #define DO_mpn_sublsh_n(dst,src,n,s,ws) mpn_sublsh_n(dst,dst,src,n,s)
35 DO_mpn_sublsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
38 return mpn_submul_1(dst,src,n,CNST_LIMB(1) <<(s));
41 __cy = mpn_lshift(ws,src,n,s);
42 return __cy + mpn_sub_n(dst,dst,ws,n);
47 #if HAVE_NATIVE_mpn_addlsh_n
48 #define DO_mpn_addlsh_n(dst,src,n,s,ws) mpn_addlsh_n(dst,dst,src,n,s)
51 DO_mpn_addlsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws)
54 return mpn_addmul_1(dst,src,n,CNST_LIMB(1) <<(s));
57 __cy = mpn_lshift(ws,src,n,s);
58 return __cy + mpn_add_n(dst,dst,ws,n);
63 #if HAVE_NATIVE_mpn_subrsh
64 #define DO_mpn_subrsh(dst,nd,src,ns,s,ws) mpn_subrsh(dst,nd,src,ns,s)
66 /* FIXME: This is not a correct definition, it assumes no carry */
67 #define DO_mpn_subrsh(dst,nd,src,ns,s,ws) \
70 MPN_DECR_U (dst, nd, src[0] >> s); \
71 __cy = DO_mpn_sublsh_n (dst, src + 1, ns - 1, GMP_NUMB_BITS - s, ws); \
72 MPN_DECR_U (dst + ns - 1, nd - ns + 1, __cy); \
77 #if GMP_NUMB_BITS < 21
78 #error Not implemented: Both sublsh_n(,,,20) should be corrected.
81 #if GMP_NUMB_BITS < 16
82 #error Not implemented: divexact_by42525 needs splitting.
85 #if GMP_NUMB_BITS < 12
86 #error Not implemented: Hard to adapt...
89 /* FIXME: tuneup should decide the best variant */
90 #ifndef AORSMUL_FASTER_AORS_AORSLSH
91 #define AORSMUL_FASTER_AORS_AORSLSH 1
93 #ifndef AORSMUL_FASTER_AORS_2AORSLSH
94 #define AORSMUL_FASTER_AORS_2AORSLSH 1
96 #ifndef AORSMUL_FASTER_2AORSLSH
97 #define AORSMUL_FASTER_2AORSLSH 1
99 #ifndef AORSMUL_FASTER_3AORSLSH
100 #define AORSMUL_FASTER_3AORSLSH 1
104 ((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) | 0x39)
106 #define BINVERT_255 \
107 (GMP_NUMB_MAX - ((GMP_NUMB_MAX / 255) << (8 - GMP_NUMB_BITS % 8)))
109 /* FIXME: find some more general expressions for 2835^-1, 42525^-1 */
110 #if GMP_LIMB_BITS == 32
111 #define BINVERT_2835 (GMP_NUMB_MASK & CNST_LIMB(0x53E3771B))
112 #define BINVERT_42525 (GMP_NUMB_MASK & CNST_LIMB(0x9F314C35))
114 #if GMP_LIMB_BITS == 64
115 #define BINVERT_2835 (GMP_NUMB_MASK & CNST_LIMB(0x938CC70553E3771B))
116 #define BINVERT_42525 (GMP_NUMB_MASK & CNST_LIMB(0xE7B40D449F314C35))
120 #ifndef mpn_divexact_by255
121 #if GMP_NUMB_BITS % 8 == 0
122 #define mpn_divexact_by255(dst,src,size) \
123 (255 & 1 * mpn_bdiv_dbm1 (dst, src, size, __GMP_CAST (mp_limb_t, GMP_NUMB_MASK / 255)))
125 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
126 #define mpn_divexact_by255(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(255),BINVERT_255,0)
128 #define mpn_divexact_by255(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(255))
133 #ifndef mpn_divexact_by9x4
134 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1
135 #define mpn_divexact_by9x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(9),BINVERT_9,2)
137 #define mpn_divexact_by9x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(9)<<2)
141 #ifndef mpn_divexact_by42525
142 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_42525)
143 #define mpn_divexact_by42525(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(42525),BINVERT_42525,0)
145 #define mpn_divexact_by42525(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(42525))
149 #ifndef mpn_divexact_by2835x4
150 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_2835)
151 #define mpn_divexact_by2835x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(2835),BINVERT_2835,2)
153 #define mpn_divexact_by2835x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(2835)<<2)
157 /* Interpolation for Toom-6.5 (or Toom-6), using the evaluation
158 points: infinity(6.5 only), +-4, +-2, +-1, +-1/4, +-1/2, 0. More precisely,
159 we want to compute f(2^(GMP_NUMB_BITS * n)) for a polynomial f of
   degree 11 (or 10), given the 12 (resp. 11) values:

     r0 = limit at infinity of f(x) / x^11,
     r1 = f(4), f(-4),
     r2 = f(2), f(-2),
     r3 = f(1), f(-1),
     r4 = f(1/4), f(-1/4),
     r5 = f(1/2), f(-1/2),
     r6 = f(0).
170 All couples of the form f(n),f(-n) must be already mixed with
171 toom_couple_handling(f(n),...,f(-n),...)
173 The result is stored in {pp, spt + 7*n (or 6*n)}.
174 At entry, r6 is stored at {pp, 2n},
175 r4 is stored at {pp + 3n, 3n + 1}.
176 r2 is stored at {pp + 7n, 3n + 1}.
177 r0 is stored at {pp +11n, spt}.
179 The other values are 3n+1 limbs each (with most significant limbs small).
   Negative intermediate results are stored in two's-complement form.
182 Inputs are destroyed.
186 mpn_toom_interpolate_12pts (mp_ptr pp, mp_ptr r1, mp_ptr r3, mp_ptr r5,
187 mp_size_t n, mp_size_t spt, int half, mp_ptr wsi)
195 #define r4 (pp + n3) /* 3n+1 */
196 #define r2 (pp + 7 * n) /* 3n+1 */
197 #define r0 (pp +11 * n) /* s+t <= 2*n */
199 /******************************* interpolation *****************************/
201 cy = mpn_sub_n (r3, r3, r0, spt);
202 MPN_DECR_U (r3 + spt, n3p1 - spt, cy);
204 cy = DO_mpn_sublsh_n (r2, r0, spt, 10, wsi);
205 MPN_DECR_U (r2 + spt, n3p1 - spt, cy);
206 DO_mpn_subrsh(r5, n3p1, r0, spt, 2, wsi);
208 cy = DO_mpn_sublsh_n (r1, r0, spt, 20, wsi);
209 MPN_DECR_U (r1 + spt, n3p1 - spt, cy);
210 DO_mpn_subrsh(r4, n3p1, r0, spt, 4, wsi);
213 r4[n3] -= DO_mpn_sublsh_n (r4 + n, pp, 2 * n, 20, wsi);
214 DO_mpn_subrsh(r1 + n, 2 * n + 1, pp, 2 * n, 4, wsi);
216 #if HAVE_NATIVE_mpn_add_n_sub_n
217 mpn_add_n_sub_n (r1, r4, r4, r1, n3p1);
219 ASSERT_NOCARRY(mpn_add_n (wsi, r1, r4, n3p1));
220 mpn_sub_n (r4, r4, r1, n3p1); /* can be negative */
221 MP_PTR_SWAP(r1, wsi);
224 r5[n3] -= DO_mpn_sublsh_n (r5 + n, pp, 2 * n, 10, wsi);
225 DO_mpn_subrsh(r2 + n, 2 * n + 1, pp, 2 * n, 2, wsi);
227 #if HAVE_NATIVE_mpn_add_n_sub_n
228 mpn_add_n_sub_n (r2, r5, r5, r2, n3p1);
230 mpn_sub_n (wsi, r5, r2, n3p1); /* can be negative */
231 ASSERT_NOCARRY(mpn_add_n (r2, r2, r5, n3p1));
232 MP_PTR_SWAP(r5, wsi);
235 r3[n3] -= mpn_sub_n (r3+n, r3+n, pp, 2 * n);
237 #if AORSMUL_FASTER_AORS_AORSLSH
238 mpn_submul_1 (r4, r5, n3p1, 257); /* can be negative */
240 mpn_sub_n (r4, r4, r5, n3p1); /* can be negative */
241 DO_mpn_sublsh_n (r4, r5, n3p1, 8, wsi); /* can be negative */
  /* A division by 2835x4 follows.  Warning: the operand can be negative! */
244 mpn_divexact_by2835x4(r4, r4, n3p1);
245 if ((r4[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS-3))) != 0)
246 r4[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS-2));
248 #if AORSMUL_FASTER_2AORSLSH
249 mpn_addmul_1 (r5, r4, n3p1, 60); /* can be negative */
251 DO_mpn_sublsh_n (r5, r4, n3p1, 2, wsi); /* can be negative */
252 DO_mpn_addlsh_n (r5, r4, n3p1, 6, wsi); /* can give a carry */
254 mpn_divexact_by255(r5, r5, n3p1);
256 ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r3, n3p1, 5, wsi));
258 #if AORSMUL_FASTER_3AORSLSH
259 ASSERT_NOCARRY(mpn_submul_1 (r1, r2, n3p1, 100));
261 ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 6, wsi));
262 ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 5, wsi));
263 ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 2, wsi));
265 ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r3, n3p1, 9, wsi));
266 mpn_divexact_by42525(r1, r1, n3p1);
268 #if AORSMUL_FASTER_AORS_2AORSLSH
269 ASSERT_NOCARRY(mpn_submul_1 (r2, r1, n3p1, 225));
271 ASSERT_NOCARRY(mpn_sub_n (r2, r2, r1, n3p1));
272 ASSERT_NOCARRY(DO_mpn_addlsh_n (r2, r1, n3p1, 5, wsi));
273 ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r1, n3p1, 8, wsi));
275 mpn_divexact_by9x4(r2, r2, n3p1);
277 ASSERT_NOCARRY(mpn_sub_n (r3, r3, r2, n3p1));
279 mpn_sub_n (r4, r2, r4, n3p1);
280 ASSERT_NOCARRY(mpn_rshift(r4, r4, n3p1, 1));
281 ASSERT_NOCARRY(mpn_sub_n (r2, r2, r4, n3p1));
283 mpn_add_n (r5, r5, r1, n3p1);
284 ASSERT_NOCARRY(mpn_rshift(r5, r5, n3p1, 1));
286 /* last interpolation steps... */
287 ASSERT_NOCARRY(mpn_sub_n (r3, r3, r1, n3p1));
288 ASSERT_NOCARRY(mpn_sub_n (r1, r1, r5, n3p1));
289 /* ... could be mixed with recomposition
290 ||H-r5|M-r5|L-r5| ||H-r1|M-r1|L-r1|
293 /***************************** recomposition *******************************/
295 pp[] prior to operations:
296 |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp
298 summation scheme for remaining operations:
299 |__12|n_11|n_10|n__9|n__8|n__7|n__6|n__5|n__4|n__3|n__2|n___|n___|pp
300 |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp
301 ||H r1|M r1|L r1| ||H r3|M r3|L r3| ||H_r5|M_r5|L_r5|
304 cy = mpn_add_n (pp + n, pp + n, r5, n);
305 cy = mpn_add_1 (pp + 2 * n, r5 + n, n, cy);
306 #if HAVE_NATIVE_mpn_add_nc
307 cy = r5[n3] + mpn_add_nc(pp + n3, pp + n3, r5 + 2 * n, n, cy);
309 MPN_INCR_U (r5 + 2 * n, n + 1, cy);
310 cy = r5[n3] + mpn_add_n (pp + n3, pp + n3, r5 + 2 * n, n);
312 MPN_INCR_U (pp + n3 + n, 2 * n + 1, cy);
314 pp[2 * n3]+= mpn_add_n (pp + 5 * n, pp + 5 * n, r3, n);
315 cy = mpn_add_1 (pp + 2 * n3, r3 + n, n, pp[2 * n3]);
316 #if HAVE_NATIVE_mpn_add_nc
317 cy = r3[n3] + mpn_add_nc(pp + 7 * n, pp + 7 * n, r3 + 2 * n, n, cy);
319 MPN_INCR_U (r3 + 2 * n, n + 1, cy);
320 cy = r3[n3] + mpn_add_n (pp + 7 * n, pp + 7 * n, r3 + 2 * n, n);
322 MPN_INCR_U (pp + 8 * n, 2 * n + 1, cy);
324 pp[10*n]+=mpn_add_n (pp + 9 * n, pp + 9 * n, r1, n);
326 cy = mpn_add_1 (pp + 10 * n, r1 + n, n, pp[10 * n]);
327 #if HAVE_NATIVE_mpn_add_nc
328 if (LIKELY (spt > n)) {
329 cy = r1[n3] + mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, n, cy);
330 MPN_INCR_U (pp + 4 * n3, spt - n, cy);
332 ASSERT_NOCARRY(mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt, cy));
335 MPN_INCR_U (r1 + 2 * n, n + 1, cy);
336 if (LIKELY (spt > n)) {
337 cy = r1[n3] + mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, n);
338 MPN_INCR_U (pp + 4 * n3, spt - n, cy);
340 ASSERT_NOCARRY(mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt));
344 ASSERT_NOCARRY(mpn_add_1 (pp + 10 * n, r1 + n, spt, pp[10 * n]));