mpz/n_pow_ui.c

   1 /* mpz_n_pow_ui -- mpn raised to ulong.
   2
   3    THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY.  THEY'RE ALMOST
   4    CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN
   5    FUTURE GNU MP RELEASES.
   6
   7 Copyright 2001, 2002, 2005, 2012 Free Software Foundation, Inc.
   8
   9 This file is part of the GNU MP Library.
  10
  11 The GNU MP Library is free software; you can redistribute it and/or modify
  12 it under the terms of either:
  13
  14   * the GNU Lesser General Public License as published by the Free
  15     Software Foundation; either version 3 of the License, or (at your
  16     option) any later version.
  17
  18 or
  19
  20   * the GNU General Public License as published by the Free Software
  21     Foundation; either version 2 of the License, or (at your option) any
  22     later version.
  23
  24 or both in parallel, as here.
  25
  26 The GNU MP Library is distributed in the hope that it will be useful, but
  27 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
  28 or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  29 for more details.
  30
  31 You should have received copies of the GNU General Public License and the
  32 GNU Lesser General Public License along with the GNU MP Library.  If not,
  33 see https://www.gnu.org/licenses/.  */
  34
  35 #include "gmp.h"
  36 #include "gmp-impl.h"
  37 #include "longlong.h"
  38
  39
  40 /* Change this to "#define TRACE(x) x" for some traces. */
  41 #define TRACE(x)
  42
  43
  44 /* Use this to test the mul_2 code on a CPU without a native version of that
  45    routine.  */
  46 #if 0
  47 #define mpn_mul_2  refmpn_mul_2
  48 #define HAVE_NATIVE_mpn_mul_2  1
  49 #endif
  50
  51
  52 /* mpz_pow_ui and mpz_ui_pow_ui want to share almost all of this code.
  53    ui_pow_ui doesn't need the mpn_mul based powering loop or the tests on
  54    bsize==2 or >2, but separating that isn't easy because there's shared
  55    code both before and after (the size calculations and the powers of 2
  56    handling).
  57
  58    Alternatives:
  59
  60    It would work to just use the mpn_mul powering loop for 1 and 2 limb
  61    bases, but the current separate loop allows mul_1 and mul_2 to be done
  62    in-place, which might help cache locality a bit.  If mpn_mul was relaxed
  63    to allow source==dest when vn==1 or 2 then some pointer twiddling might
  64    let us get the same effect in one loop.
  65
  66    The initial powering for bsize==1 into blimb or blimb:blimb_low doesn't
  67    form the biggest possible power of b that fits, only the biggest power of
  68    2 power, ie. b^(2^n).  It'd be possible to choose a bigger power, perhaps
  69    using mp_bases[b].big_base for small b, and thereby get better value
  70    from mpn_mul_1 or mpn_mul_2 in the bignum powering.  It's felt that doing
  71    so would be more complicated than it's worth, and could well end up being
  72    a slowdown for small e.  For big e on the other hand the algorithm is
  73    dominated by mpn_sqr so there wouldn't much of a saving.  The current
  74    code can be viewed as simply doing the first few steps of the powering in
  75    a single or double limb where possible.
  76
  77    If r==b, and blow_twos==0, and r must be realloc'ed, then the temporary
  78    copy made of b is unnecessary.  We could just use the old alloc'ed block
  79    and free it at the end.  But arranging this seems like a lot more trouble
  80    than it's worth.  */
  81
  82
  83 /* floor(sqrt(GMP_NUMB_MAX)), ie. the biggest value that can be squared in
  84    a limb without overflowing.
  85    FIXME: This formula is an underestimate when GMP_NUMB_BITS is odd. */
  86
  87 #define GMP_NUMB_HALFMAX  (((mp_limb_t) 1 << GMP_NUMB_BITS/2) - 1)
  88
  89
  90 /* The following are for convenience, they update the size and check the
  91    alloc.  */
  92
  93 #define MPN_SQR(dst, alloc, src, size)          \
  94   do {                                          \
  95     ASSERT (2*(size) <= (alloc));               \
  96     mpn_sqr (dst, src, size);                   \
  97     (size) *= 2;                                \
  98     (size) -= ((dst)[(size)-1] == 0);           \
  99   } while (0)
 100
 101 #define MPN_MUL(dst, alloc, src, size, src2, size2)     \
 102   do {                                                  \
 103     mp_limb_t  cy;                                      \
 104     ASSERT ((size) + (size2) <= (alloc));               \
 105     cy = mpn_mul (dst, src, size, src2, size2);         \
 106     (size) += (size2) - (cy == 0);                      \
 107   } while (0)
 108
 109 #define MPN_MUL_2(ptr, size, alloc, mult)       \
 110   do {                                          \
 111     mp_limb_t  cy;                              \
 112     ASSERT ((size)+2 <= (alloc));               \
 113     cy = mpn_mul_2 (ptr, ptr, size, mult);      \
 114     (size)++;                                   \
 115     (ptr)[(size)] = cy;                         \
 116     (size) += (cy != 0);                        \
 117   } while (0)
 118
 119 #define MPN_MUL_1(ptr, size, alloc, limb)       \
 120   do {                                          \
 121     mp_limb_t  cy;                              \
 122     ASSERT ((size)+1 <= (alloc));               \
 123     cy = mpn_mul_1 (ptr, ptr, size, limb);      \
 124     (ptr)[size] = cy;                           \
 125     (size) += (cy != 0);                        \
 126   } while (0)
 127
 128 #define MPN_LSHIFT(ptr, size, alloc, shift)     \
 129   do {                                          \
 130     mp_limb_t  cy;                              \
 131     ASSERT ((size)+1 <= (alloc));               \
 132     cy = mpn_lshift (ptr, ptr, size, shift);    \
 133     (ptr)[size] = cy;                           \
 134     (size) += (cy != 0);                        \
 135   } while (0)
 136
 137 #define MPN_RSHIFT_OR_COPY(dst, src, size, shift)       \
 138   do {                                                  \
 139     if ((shift) == 0)                                   \
 140       MPN_COPY (dst, src, size);                        \
 141     else                                                \
 142       {                                                 \
 143         mpn_rshift (dst, src, size, shift);             \
 144         (size) -= ((dst)[(size)-1] == 0);               \
 145       }                                                 \
 146   } while (0)
 147
 148
 149 /* ralloc and talloc are only wanted for ASSERTs, after the initial space
 150    allocations.  Avoid writing values to them in a normal build, to ensure
 151    the compiler lets them go dead.  gcc already figures this out itself
 152    actually.  */
 153
 154 #define SWAP_RP_TP                                      \
 155   do {                                                  \
 156     MP_PTR_SWAP (rp, tp);                               \
 157     ASSERT_CODE (MP_SIZE_T_SWAP (ralloc, talloc));      \
 158   } while (0)
 159
 160
 161 void
 162 mpz_n_pow_ui (mpz_ptr r, mp_srcptr bp, mp_size_t bsize, unsigned long int e)
 163 {
 164   mp_ptr         rp;
 165   mp_size_t      rtwos_limbs, ralloc, rsize;
 166   int            rneg, i, cnt, btwos, r_bp_overlap;
 167   mp_limb_t      blimb, rl;
 168   mp_bitcnt_t    rtwos_bits;
 169 #if HAVE_NATIVE_mpn_mul_2
 170   mp_limb_t      blimb_low, rl_high;
 171 #else
 172   mp_limb_t      b_twolimbs[2];
 173 #endif
 174   TMP_DECL;
 175
 176   TRACE (printf ("mpz_n_pow_ui rp=0x%lX bp=0x%lX bsize=%ld e=%lu (0x%lX)\n",
 177                  PTR(r), bp, bsize, e, e);
 178          mpn_trace ("b", bp, bsize));
 179
 180   ASSERT (bsize == 0 || bp[ABS(bsize)-1] != 0);
 181   ASSERT (MPN_SAME_OR_SEPARATE2_P (PTR(r), ALLOC(r), bp, ABS(bsize)));
 182
 183   /* b^0 == 1, including 0^0 == 1 */
 184   if (e == 0)
 185     {
 186       PTR(r)[0] = 1;
 187       SIZ(r) = 1;
 188       return;
 189     }
 190
 191   /* 0^e == 0 apart from 0^0 above */
 192   if (bsize == 0)
 193     {
 194       SIZ(r) = 0;
 195       return;
 196     }
 197
 198   /* Sign of the final result. */
 199   rneg = (bsize < 0 && (e & 1) != 0);
 200   bsize = ABS (bsize);
 201   TRACE (printf ("rneg %d\n", rneg));
 202
 203   r_bp_overlap = (PTR(r) == bp);
 204
 205   /* Strip low zero limbs from b. */
 206   rtwos_limbs = 0;
 207   for (blimb = *bp; blimb == 0; blimb = *++bp)
 208     {
 209       rtwos_limbs += e;
 210       bsize--; ASSERT (bsize >= 1);
 211     }
 212   TRACE (printf ("trailing zero rtwos_limbs=%ld\n", rtwos_limbs));
 213
 214   /* Strip low zero bits from b. */
 215   count_trailing_zeros (btwos, blimb);
 216   blimb >>= btwos;
 217   rtwos_bits = e * btwos;
 218   rtwos_limbs += rtwos_bits / GMP_NUMB_BITS;
 219   rtwos_bits %= GMP_NUMB_BITS;
 220   TRACE (printf ("trailing zero btwos=%d rtwos_limbs=%ld rtwos_bits=%lu\n",
 221                  btwos, rtwos_limbs, rtwos_bits));
 222
 223   TMP_MARK;
 224
 225   rl = 1;
 226 #if HAVE_NATIVE_mpn_mul_2
 227   rl_high = 0;
 228 #endif
 229
 230   if (bsize == 1)
 231     {
 232     bsize_1:
 233       /* Power up as far as possible within blimb.  We start here with e!=0,
 234          but if e is small then we might reach e==0 and the whole b^e in rl.
 235          Notice this code works when blimb==1 too, reaching e==0.  */
 236
 237       while (blimb <= GMP_NUMB_HALFMAX)
 238         {
 239           TRACE (printf ("small e=0x%lX blimb=0x%lX rl=0x%lX\n",
 240                          e, blimb, rl));
 241           ASSERT (e != 0);
 242           if ((e & 1) != 0)
 243             rl *= blimb;
 244           e >>= 1;
 245           if (e == 0)
 246             goto got_rl;
 247           blimb *= blimb;
 248         }
 249
 250 #if HAVE_NATIVE_mpn_mul_2
 251       TRACE (printf ("single power, e=0x%lX b=0x%lX rl=0x%lX\n",
 252                      e, blimb, rl));
 253
 254       /* Can power b once more into blimb:blimb_low */
 255       bsize = 2;
 256       ASSERT (e != 0);
 257       if ((e & 1) != 0)
 258         {
 259           umul_ppmm (rl_high, rl, rl, blimb << GMP_NAIL_BITS);
 260           rl >>= GMP_NAIL_BITS;
 261         }
 262       e >>= 1;
 263       umul_ppmm (blimb, blimb_low, blimb, blimb << GMP_NAIL_BITS);
 264       blimb_low >>= GMP_NAIL_BITS;
 265
 266     got_rl:
 267       TRACE (printf ("double power e=0x%lX blimb=0x%lX:0x%lX rl=0x%lX:%lX\n",
 268                      e, blimb, blimb_low, rl_high, rl));
 269
 270       /* Combine left-over rtwos_bits into rl_high:rl to be handled by the
 271          final mul_1 or mul_2 rather than a separate lshift.
 272          - rl_high:rl mustn't be 1 (since then there's no final mul)
 273          - rl_high mustn't overflow
 274          - rl_high mustn't change to non-zero, since mul_1+lshift is
 275          probably faster than mul_2 (FIXME: is this true?)  */
 276
 277       if (rtwos_bits != 0
 278           && ! (rl_high == 0 && rl == 1)
 279           && (rl_high >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
 280         {
 281           mp_limb_t  new_rl_high = (rl_high << rtwos_bits)
 282             | (rl >> (GMP_NUMB_BITS-rtwos_bits));
 283           if (! (rl_high == 0 && new_rl_high != 0))
 284             {
 285               rl_high = new_rl_high;
 286               rl <<= rtwos_bits;
 287               rtwos_bits = 0;
 288               TRACE (printf ("merged rtwos_bits, rl=0x%lX:%lX\n",
 289                              rl_high, rl));
 290             }
 291         }
 292 #else
 293     got_rl:
 294       TRACE (printf ("small power e=0x%lX blimb=0x%lX rl=0x%lX\n",
 295                      e, blimb, rl));
 296
 297       /* Combine left-over rtwos_bits into rl to be handled by the final
 298          mul_1 rather than a separate lshift.
 299          - rl mustn't be 1 (since then there's no final mul)
 300          - rl mustn't overflow  */
 301
 302       if (rtwos_bits != 0
 303           && rl != 1
 304           && (rl >> (GMP_NUMB_BITS-rtwos_bits)) == 0)
 305         {
 306           rl <<= rtwos_bits;
 307           rtwos_bits = 0;
 308           TRACE (printf ("merged rtwos_bits, rl=0x%lX\n", rl));
 309         }
 310 #endif
 311     }
 312   else if (bsize == 2)
 313     {
 314       mp_limb_t  bsecond = bp[1];
 315       if (btwos != 0)
 316         blimb |= (bsecond << (GMP_NUMB_BITS - btwos)) & GMP_NUMB_MASK;
 317       bsecond >>= btwos;
 318       if (bsecond == 0)
 319         {
 320           /* Two limbs became one after rshift. */
 321           bsize = 1;
 322           goto bsize_1;
 323         }
 324
 325       TRACE (printf ("bsize==2 using b=0x%lX:%lX", bsecond, blimb));
 326 #if HAVE_NATIVE_mpn_mul_2
 327       blimb_low = blimb;
 328 #else
 329       bp = b_twolimbs;
 330       b_twolimbs[0] = blimb;
 331       b_twolimbs[1] = bsecond;
 332 #endif
 333       blimb = bsecond;
 334     }
 335   else
 336     {
 337       if (r_bp_overlap || btwos != 0)
 338         {
 339           mp_ptr tp = TMP_ALLOC_LIMBS (bsize);
 340           MPN_RSHIFT_OR_COPY (tp, bp, bsize, btwos);
 341           bp = tp;
 342           TRACE (printf ("rshift or copy bp,bsize, new bsize=%ld\n", bsize));
 343         }
 344 #if HAVE_NATIVE_mpn_mul_2
 345       /* in case 3 limbs rshift to 2 and hence use the mul_2 loop below */
 346       blimb_low = bp[0];
 347 #endif
 348       blimb = bp[bsize-1];
 349
 350       TRACE (printf ("big bsize=%ld  ", bsize);
 351              mpn_trace ("b", bp, bsize));
 352     }
 353
 354   /* At this point blimb is the most significant limb of the base to use.
 355
 356      Each factor of b takes (bsize*BPML-cnt) bits and there's e of them; +1
 357      limb to round up the division; +1 for multiplies all using an extra
 358      limb over the true size; +2 for rl at the end; +1 for lshift at the
 359      end.
 360
 361      The size calculation here is reasonably accurate.  The base is at least
 362      half a limb, so in 32 bits the worst case is 2^16+1 treated as 17 bits
 363      when it will power up as just over 16, an overestimate of 17/16 =
 364      6.25%.  For a 64-bit limb it's half that.
 365
 366      If e==0 then blimb won't be anything useful (though it will be
 367      non-zero), but that doesn't matter since we just end up with ralloc==5,
 368      and that's fine for 2 limbs of rl and 1 of lshift.  */
 369
 370   ASSERT (blimb != 0);
 371   count_leading_zeros (cnt, blimb);
 372   ralloc = (bsize*GMP_NUMB_BITS - cnt + GMP_NAIL_BITS) * e / GMP_NUMB_BITS + 5;
 373   TRACE (printf ("ralloc %ld, from bsize=%ld blimb=0x%lX cnt=%d\n",
 374                  ralloc, bsize, blimb, cnt));
 375   rp = MPZ_REALLOC (r, ralloc + rtwos_limbs);
 376
 377   /* Low zero limbs resulting from powers of 2. */
 378   MPN_ZERO (rp, rtwos_limbs);
 379   rp += rtwos_limbs;
 380
 381   if (e == 0)
 382     {
 383       /* Any e==0 other than via bsize==1 or bsize==2 is covered at the
 384          start. */
 385       rp[0] = rl;
 386       rsize = 1;
 387 #if HAVE_NATIVE_mpn_mul_2
 388       rp[1] = rl_high;
 389       rsize += (rl_high != 0);
 390 #endif
 391       ASSERT (rp[rsize-1] != 0);
 392     }
 393   else
 394     {
 395       mp_ptr     tp;
 396       mp_size_t  talloc;
 397
 398       /* In the mpn_mul_1 or mpn_mul_2 loops or in the mpn_mul loop when the
 399          low bit of e is zero, tp only has to hold the second last power
 400          step, which is half the size of the final result.  There's no need
 401          to round up the divide by 2, since ralloc includes a +2 for rl
 402          which not needed by tp.  In the mpn_mul loop when the low bit of e
 403          is 1, tp must hold nearly the full result, so just size it the same
 404          as rp.  */
 405
 406       talloc = ralloc;
 407 #if HAVE_NATIVE_mpn_mul_2
 408       if (bsize <= 2 || (e & 1) == 0)
 409         talloc /= 2;
 410 #else
 411       if (bsize <= 1 || (e & 1) == 0)
 412         talloc /= 2;
 413 #endif
 414       TRACE (printf ("talloc %ld\n", talloc));
 415       tp = TMP_ALLOC_LIMBS (talloc);
 416
 417       /* Go from high to low over the bits of e, starting with i pointing at
 418          the bit below the highest 1 (which will mean i==-1 if e==1).  */
 419       count_leading_zeros (cnt, (mp_limb_t) e);
 420       i = GMP_LIMB_BITS - cnt - 2;
 421
 422 #if HAVE_NATIVE_mpn_mul_2
 423       if (bsize <= 2)
 424         {
 425           mp_limb_t  mult[2];
 426
 427           /* Any bsize==1 will have been powered above to be two limbs. */
 428           ASSERT (bsize == 2);
 429           ASSERT (blimb != 0);
 430
 431           /* Arrange the final result ends up in r, not in the temp space */
 432           if ((i & 1) == 0)
 433             SWAP_RP_TP;
 434
 435           rp[0] = blimb_low;
 436           rp[1] = blimb;
 437           rsize = 2;
 438
 439           mult[0] = blimb_low;
 440           mult[1] = blimb;
 441
 442           for ( ; i >= 0; i--)
 443             {
 444               TRACE (printf ("mul_2 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
 445                              i, e, rsize, ralloc, talloc);
 446                      mpn_trace ("r", rp, rsize));
 447
 448               MPN_SQR (tp, talloc, rp, rsize);
 449               SWAP_RP_TP;
 450               if ((e & (1L << i)) != 0)
 451                 MPN_MUL_2 (rp, rsize, ralloc, mult);
 452             }
 453
 454           TRACE (mpn_trace ("mul_2 before rl, r", rp, rsize));
 455           if (rl_high != 0)
 456             {
 457               mult[0] = rl;
 458               mult[1] = rl_high;
 459               MPN_MUL_2 (rp, rsize, ralloc, mult);
 460             }
 461           else if (rl != 1)
 462             MPN_MUL_1 (rp, rsize, ralloc, rl);
 463         }
 464 #else
 465       if (bsize == 1)
 466         {
 467           /* Arrange the final result ends up in r, not in the temp space */
 468           if ((i & 1) == 0)
 469             SWAP_RP_TP;
 470
 471           rp[0] = blimb;
 472           rsize = 1;
 473
 474           for ( ; i >= 0; i--)
 475             {
 476               TRACE (printf ("mul_1 loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
 477                              i, e, rsize, ralloc, talloc);
 478                      mpn_trace ("r", rp, rsize));
 479
 480               MPN_SQR (tp, talloc, rp, rsize);
 481               SWAP_RP_TP;
 482               if ((e & (1L << i)) != 0)
 483                 MPN_MUL_1 (rp, rsize, ralloc, blimb);
 484             }
 485
 486           TRACE (mpn_trace ("mul_1 before rl, r", rp, rsize));
 487           if (rl != 1)
 488             MPN_MUL_1 (rp, rsize, ralloc, rl);
 489         }
 490 #endif
 491       else
 492         {
 493           int  parity;
 494
 495           /* Arrange the final result ends up in r, not in the temp space */
 496           ULONG_PARITY (parity, e);
 497           if (((parity ^ i) & 1) != 0)
 498             SWAP_RP_TP;
 499
 500           MPN_COPY (rp, bp, bsize);
 501           rsize = bsize;
 502
 503           for ( ; i >= 0; i--)
 504             {
 505               TRACE (printf ("mul loop i=%d e=0x%lX, rsize=%ld ralloc=%ld talloc=%ld\n",
 506                              i, e, rsize, ralloc, talloc);
 507                      mpn_trace ("r", rp, rsize));
 508
 509               MPN_SQR (tp, talloc, rp, rsize);
 510               SWAP_RP_TP;
 511               if ((e & (1L << i)) != 0)
 512                 {
 513                   MPN_MUL (tp, talloc, rp, rsize, bp, bsize);
 514                   SWAP_RP_TP;
 515                 }
 516             }
 517         }
 518     }
 519
 520   ASSERT (rp == PTR(r) + rtwos_limbs);
 521   TRACE (mpn_trace ("end loop r", rp, rsize));
 522   TMP_FREE;
 523
 524   /* Apply any partial limb factors of 2. */
 525   if (rtwos_bits != 0)
 526     {
 527       MPN_LSHIFT (rp, rsize, ralloc, (unsigned) rtwos_bits);
 528       TRACE (mpn_trace ("lshift r", rp, rsize));
 529     }
 530
 531   rsize += rtwos_limbs;
 532   SIZ(r) = (rneg ? -rsize : rsize);
 533 }