src/third_party/boringssl/src/crypto/bn/generic.c

   1 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
   2  * All rights reserved.
   3  *
   4  * This package is an SSL implementation written
   5  * by Eric Young (eay@cryptsoft.com).
   6  * The implementation was written so as to conform with Netscapes SSL.
   7  *
   8  * This library is free for commercial and non-commercial use as long as
   9  * the following conditions are aheared to.  The following conditions
  10  * apply to all code found in this distribution, be it the RC4, RSA,
  11  * lhash, DES, etc., code; not just the SSL code.  The SSL documentation
  12  * included with this distribution is covered by the same copyright terms
  13  * except that the holder is Tim Hudson (tjh@cryptsoft.com).
  14  *
  15  * Copyright remains Eric Young's, and as such any Copyright notices in
  16  * the code are not to be removed.
  17  * If this package is used in a product, Eric Young should be given attribution
  18  * as the author of the parts of the library used.
  19  * This can be in the form of a textual message at program startup or
  20  * in documentation (online or textual) provided with the package.
  21  *
  22  * Redistribution and use in source and binary forms, with or without
  23  * modification, are permitted provided that the following conditions
  24  * are met:
  25  * 1. Redistributions of source code must retain the copyright
  26  *    notice, this list of conditions and the following disclaimer.
  27  * 2. Redistributions in binary form must reproduce the above copyright
  28  *    notice, this list of conditions and the following disclaimer in the
  29  *    documentation and/or other materials provided with the distribution.
  30  * 3. All advertising materials mentioning features or use of this software
  31  *    must display the following acknowledgement:
  32  *    "This product includes cryptographic software written by
  33  *     Eric Young (eay@cryptsoft.com)"
  34  *    The word 'cryptographic' can be left out if the rouines from the library
  35  *    being used are not cryptographic related :-).
  36  * 4. If you include any Windows specific code (or a derivative thereof) from
  37  *    the apps directory (application code) you must include an acknowledgement:
  38  *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
  39  *
  40  * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
  41  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  42  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  43  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  44  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  45  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  46  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  47  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  48  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  49  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  50  * SUCH DAMAGE.
  51  *
  52  * The licence and distribution terms for any publically available version or
  53  * derivative of this code cannot be changed.  i.e. this code cannot simply be
  54  * copied and put under another distribution licence
  55  * [including the GNU Public Licence.] */
  56
  57 #include <openssl/bn.h>
  58
  59 #include <assert.h>
  60
  61 #include "internal.h"
  62
  63
  64 /* Generic implementations of most operations are needed for:
  65  * - Configurations without inline assembly.
  66  * - Architectures other than x86 or x86_64.
  67  * - Windows x84_64; x86_64-gcc.c does not build on MSVC. */
  68 #if defined(OPENSSL_NO_ASM) || \
  69     (!defined(OPENSSL_X86_64) && !defined(OPENSSL_X86)) || \
  70     (defined(OPENSSL_X86_64) && defined(OPENSSL_WINDOWS))
  71
  72 #if defined(OPENSSL_WINDOWS)
  73 #define alloca _alloca
  74 #else
  75 #include <alloca.h>
  76 #endif
  77
  78 #ifdef BN_LLONG
  79 #define mul_add(r, a, w, c)             \
  80   {                                     \
  81     BN_ULLONG t;                        \
  82     t = (BN_ULLONG)w * (a) + (r) + (c); \
  83     (r) = Lw(t);                        \
  84     (c) = Hw(t);                        \
  85   }
  86
  87 #define mul(r, a, w, c)           \
  88   {                               \
  89     BN_ULLONG t;                  \
  90     t = (BN_ULLONG)w * (a) + (c); \
  91     (r) = Lw(t);                  \
  92     (c) = Hw(t);                  \
  93   }
  94
  95 #define sqr(r0, r1, a)        \
  96   {                           \
  97     BN_ULLONG t;              \
  98     t = (BN_ULLONG)(a) * (a); \
  99     (r0) = Lw(t);             \
 100     (r1) = Hw(t);             \
 101   }
 102
 103 #elif defined(BN_UMULT_LOHI)
 104 #define mul_add(r, a, w, c)             \
 105   {                                     \
 106     BN_ULONG high, low, ret, tmp = (a); \
 107     ret = (r);                          \
 108     BN_UMULT_LOHI(low, high, w, tmp);   \
 109     ret += (c);                         \
 110     (c) = (ret < (c)) ? 1 : 0;          \
 111     (c) += high;                        \
 112     ret += low;                         \
 113     (c) += (ret < low) ? 1 : 0;         \
 114     (r) = ret;                          \
 115   }
 116
 117 #define mul(r, a, w, c)                \
 118   {                                    \
 119     BN_ULONG high, low, ret, ta = (a); \
 120     BN_UMULT_LOHI(low, high, w, ta);   \
 121     ret = low + (c);                   \
 122     (c) = high;                        \
 123     (c) += (ret < low) ? 1 : 0;        \
 124     (r) = ret;                         \
 125   }
 126
 127 #define sqr(r0, r1, a)               \
 128   {                                  \
 129     BN_ULONG tmp = (a);              \
 130     BN_UMULT_LOHI(r0, r1, tmp, tmp); \
 131   }
 132
 133 #elif defined(BN_UMULT_HIGH)
 134 #define mul_add(r, a, w, c)             \
 135   {                                     \
 136     BN_ULONG high, low, ret, tmp = (a); \
 137     ret = (r);                          \
 138     high = BN_UMULT_HIGH(w, tmp);       \
 139     ret += (c);                         \
 140     low = (w) * tmp;                    \
 141     (c) = (ret < (c)) ? 1 : 0;          \
 142     (c) += high;                        \
 143     ret += low;                         \
 144     (c) += (ret < low) ? 1 : 0;         \
 145     (r) = ret;                          \
 146   }
 147
 148 #define mul(r, a, w, c)                \
 149   {                                    \
 150     BN_ULONG high, low, ret, ta = (a); \
 151     low = (w) * ta;                    \
 152     high = BN_UMULT_HIGH(w, ta);       \
 153     ret = low + (c);                   \
 154     (c) = high;                        \
 155     (c) += (ret < low) ? 1 : 0;        \
 156     (r) = ret;                         \
 157   }
 158
 159 #define sqr(r0, r1, a)              \
 160   {                                 \
 161     BN_ULONG tmp = (a);             \
 162     (r0) = tmp * tmp;               \
 163     (r1) = BN_UMULT_HIGH(tmp, tmp); \
 164   }
 165
 166 #else
 167 /*************************************************************
 168  * No long long type
 169  */
 170
 171 #define LBITS(a) ((a) & BN_MASK2l)
 172 #define HBITS(a) (((a) >> BN_BITS4) & BN_MASK2l)
 173 #define L2HBITS(a) (((a) << BN_BITS4) & BN_MASK2)
 174
 175 #define LLBITS(a) ((a) & BN_MASKl)
 176 #define LHBITS(a) (((a) >> BN_BITS2) & BN_MASKl)
 177 #define LL2HBITS(a) ((BN_ULLONG)((a) & BN_MASKl) << BN_BITS2)
 178
 179 #define mul64(l, h, bl, bh)       \
 180   {                               \
 181     BN_ULONG m, m1, lt, ht;       \
 182                                   \
 183     lt = l;                       \
 184     ht = h;                       \
 185     m = (bh) * (lt);              \
 186     lt = (bl) * (lt);             \
 187     m1 = (bl) * (ht);             \
 188     ht = (bh) * (ht);             \
 189     m = (m + m1) & BN_MASK2;      \
 190     if (m < m1)                   \
 191       ht += L2HBITS((BN_ULONG)1); \
 192     ht += HBITS(m);               \
 193     m1 = L2HBITS(m);              \
 194     lt = (lt + m1) & BN_MASK2;    \
 195     if (lt < m1)                  \
 196       ht++;                       \
 197     (l) = lt;                     \
 198     (h) = ht;                     \
 199   }
 200
 201 #define sqr64(lo, ho, in)                    \
 202   {                                          \
 203     BN_ULONG l, h, m;                        \
 204                                              \
 205     h = (in);                                \
 206     l = LBITS(h);                            \
 207     h = HBITS(h);                            \
 208     m = (l) * (h);                           \
 209     l *= l;                                  \
 210     h *= h;                                  \
 211     h += (m & BN_MASK2h1) >> (BN_BITS4 - 1); \
 212     m = (m & BN_MASK2l) << (BN_BITS4 + 1);   \
 213     l = (l + m) & BN_MASK2;                  \
 214     if (l < m)                               \
 215       h++;                                   \
 216     (lo) = l;                                \
 217     (ho) = h;                                \
 218   }
 219
 220 #define mul_add(r, a, bl, bh, c) \
 221   {                              \
 222     BN_ULONG l, h;               \
 223                                  \
 224     h = (a);                     \
 225     l = LBITS(h);                \
 226     h = HBITS(h);                \
 227     mul64(l, h, (bl), (bh));     \
 228                                  \
 229     /* non-multiply part */      \
 230     l = (l + (c)) & BN_MASK2;    \
 231     if (l < (c))                 \
 232       h++;                       \
 233     (c) = (r);                   \
 234     l = (l + (c)) & BN_MASK2;    \
 235     if (l < (c))                 \
 236       h++;                       \
 237     (c) = h & BN_MASK2;          \
 238     (r) = l;                     \
 239   }
 240
 241 #define mul(r, a, bl, bh, c)  \
 242   {                           \
 243     BN_ULONG l, h;            \
 244                               \
 245     h = (a);                  \
 246     l = LBITS(h);             \
 247     h = HBITS(h);             \
 248     mul64(l, h, (bl), (bh));  \
 249                               \
 250     /* non-multiply part */   \
 251     l += (c);                 \
 252     if ((l & BN_MASK2) < (c)) \
 253       h++;                    \
 254     (c) = h & BN_MASK2;       \
 255     (r) = l & BN_MASK2;       \
 256   }
 257 #endif /* !BN_LLONG */
 258
 259 #if defined(BN_LLONG) || defined(BN_UMULT_HIGH)
 260
 261 BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
 262                           BN_ULONG w) {
 263   BN_ULONG c1 = 0;
 264
 265   assert(num >= 0);
 266   if (num <= 0) {
 267     return c1;
 268   }
 269
 270   while (num & ~3) {
 271     mul_add(rp[0], ap[0], w, c1);
 272     mul_add(rp[1], ap[1], w, c1);
 273     mul_add(rp[2], ap[2], w, c1);
 274     mul_add(rp[3], ap[3], w, c1);
 275     ap += 4;
 276     rp += 4;
 277     num -= 4;
 278   }
 279
 280   while (num) {
 281     mul_add(rp[0], ap[0], w, c1);
 282     ap++;
 283     rp++;
 284     num--;
 285   }
 286
 287   return c1;
 288 }
 289
 290 BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) {
 291   BN_ULONG c1 = 0;
 292
 293   assert(num >= 0);
 294   if (num <= 0) {
 295     return c1;
 296   }
 297
 298   while (num & ~3) {
 299     mul(rp[0], ap[0], w, c1);
 300     mul(rp[1], ap[1], w, c1);
 301     mul(rp[2], ap[2], w, c1);
 302     mul(rp[3], ap[3], w, c1);
 303     ap += 4;
 304     rp += 4;
 305     num -= 4;
 306   }
 307   while (num) {
 308     mul(rp[0], ap[0], w, c1);
 309     ap++;
 310     rp++;
 311     num--;
 312   }
 313   return c1;
 314 }
 315
 316 void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) {
 317   assert(n >= 0);
 318   if (n <= 0) {
 319     return;
 320   }
 321
 322   while (n & ~3) {
 323     sqr(r[0], r[1], a[0]);
 324     sqr(r[2], r[3], a[1]);
 325     sqr(r[4], r[5], a[2]);
 326     sqr(r[6], r[7], a[3]);
 327     a += 4;
 328     r += 8;
 329     n -= 4;
 330   }
 331   while (n) {
 332     sqr(r[0], r[1], a[0]);
 333     a++;
 334     r += 2;
 335     n--;
 336   }
 337 }
 338
 339 #else /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */
 340
 341 BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
 342                           BN_ULONG w) {
 343   BN_ULONG c = 0;
 344   BN_ULONG bl, bh;
 345
 346   assert(num >= 0);
 347   if (num <= 0) {
 348     return (BN_ULONG)0;
 349   }
 350
 351   bl = LBITS(w);
 352   bh = HBITS(w);
 353
 354   while (num & ~3) {
 355     mul_add(rp[0], ap[0], bl, bh, c);
 356     mul_add(rp[1], ap[1], bl, bh, c);
 357     mul_add(rp[2], ap[2], bl, bh, c);
 358     mul_add(rp[3], ap[3], bl, bh, c);
 359     ap += 4;
 360     rp += 4;
 361     num -= 4;
 362   }
 363   while (num) {
 364     mul_add(rp[0], ap[0], bl, bh, c);
 365     ap++;
 366     rp++;
 367     num--;
 368   }
 369   return c;
 370 }
 371
 372 BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) {
 373   BN_ULONG carry = 0;
 374   BN_ULONG bl, bh;
 375
 376   assert(num >= 0);
 377   if (num <= 0) {
 378     return (BN_ULONG)0;
 379   }
 380
 381   bl = LBITS(w);
 382   bh = HBITS(w);
 383
 384   while (num & ~3) {
 385     mul(rp[0], ap[0], bl, bh, carry);
 386     mul(rp[1], ap[1], bl, bh, carry);
 387     mul(rp[2], ap[2], bl, bh, carry);
 388     mul(rp[3], ap[3], bl, bh, carry);
 389     ap += 4;
 390     rp += 4;
 391     num -= 4;
 392   }
 393   while (num) {
 394     mul(rp[0], ap[0], bl, bh, carry);
 395     ap++;
 396     rp++;
 397     num--;
 398   }
 399   return carry;
 400 }
 401
 402 void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) {
 403   assert(n >= 0);
 404   if (n <= 0) {
 405     return;
 406   }
 407
 408   while (n & ~3) {
 409     sqr64(r[0], r[1], a[0]);
 410     sqr64(r[2], r[3], a[1]);
 411     sqr64(r[4], r[5], a[2]);
 412     sqr64(r[6], r[7], a[3]);
 413     a += 4;
 414     r += 8;
 415     n -= 4;
 416   }
 417   while (n) {
 418     sqr64(r[0], r[1], a[0]);
 419     a++;
 420     r += 2;
 421     n--;
 422   }
 423 }
 424
 425 #endif /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */
 426
 427 #if defined(BN_LLONG) && defined(BN_DIV2W)
 428
 429 BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
 430   return (BN_ULONG)(((((BN_ULLONG)h) << BN_BITS2) | l) / (BN_ULLONG)d);
 431 }
 432
 433 #else
 434
 435 /* Divide h,l by d and return the result. */
 436 BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
 437   BN_ULONG dh, dl, q, ret = 0, th, tl, t;
 438   int i, count = 2;
 439
 440   if (d == 0) {
 441     return BN_MASK2;
 442   }
 443
 444   i = BN_num_bits_word(d);
 445   assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i));
 446
 447   i = BN_BITS2 - i;
 448   if (h >= d) {
 449     h -= d;
 450   }
 451
 452   if (i) {
 453     d <<= i;
 454     h = (h << i) | (l >> (BN_BITS2 - i));
 455     l <<= i;
 456   }
 457   dh = (d & BN_MASK2h) >> BN_BITS4;
 458   dl = (d & BN_MASK2l);
 459   for (;;) {
 460     if ((h >> BN_BITS4) == dh) {
 461       q = BN_MASK2l;
 462     } else {
 463       q = h / dh;
 464     }
 465
 466     th = q * dh;
 467     tl = dl * q;
 468     for (;;) {
 469       t = h - th;
 470       if ((t & BN_MASK2h) ||
 471           ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4)))) {
 472         break;
 473       }
 474       q--;
 475       th -= dh;
 476       tl -= dl;
 477     }
 478     t = (tl >> BN_BITS4);
 479     tl = (tl << BN_BITS4) & BN_MASK2h;
 480     th += t;
 481
 482     if (l < tl) {
 483       th++;
 484     }
 485     l -= tl;
 486     if (h < th) {
 487       h += d;
 488       q--;
 489     }
 490     h -= th;
 491
 492     if (--count == 0) {
 493       break;
 494     }
 495
 496     ret = q << BN_BITS4;
 497     h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2;
 498     l = (l & BN_MASK2l) << BN_BITS4;
 499   }
 500
 501   ret |= q;
 502   return ret;
 503 }
 504
 505 #endif /* !defined(BN_LLONG) && defined(BN_DIV2W) */
 506
 507 #ifdef BN_LLONG
 508 BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
 509                       int n) {
 510   BN_ULLONG ll = 0;
 511
 512   assert(n >= 0);
 513   if (n <= 0) {
 514     return (BN_ULONG)0;
 515   }
 516
 517   while (n & ~3) {
 518     ll += (BN_ULLONG)a[0] + b[0];
 519     r[0] = (BN_ULONG)ll & BN_MASK2;
 520     ll >>= BN_BITS2;
 521     ll += (BN_ULLONG)a[1] + b[1];
 522     r[1] = (BN_ULONG)ll & BN_MASK2;
 523     ll >>= BN_BITS2;
 524     ll += (BN_ULLONG)a[2] + b[2];
 525     r[2] = (BN_ULONG)ll & BN_MASK2;
 526     ll >>= BN_BITS2;
 527     ll += (BN_ULLONG)a[3] + b[3];
 528     r[3] = (BN_ULONG)ll & BN_MASK2;
 529     ll >>= BN_BITS2;
 530     a += 4;
 531     b += 4;
 532     r += 4;
 533     n -= 4;
 534   }
 535   while (n) {
 536     ll += (BN_ULLONG)a[0] + b[0];
 537     r[0] = (BN_ULONG)ll & BN_MASK2;
 538     ll >>= BN_BITS2;
 539     a++;
 540     b++;
 541     r++;
 542     n--;
 543   }
 544   return (BN_ULONG)ll;
 545 }
 546
 547 #else /* !BN_LLONG */
 548
 549 BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
 550                       int n) {
 551   BN_ULONG c, l, t;
 552
 553   assert(n >= 0);
 554   if (n <= 0) {
 555     return (BN_ULONG)0;
 556   }
 557
 558   c = 0;
 559   while (n & ~3) {
 560     t = a[0];
 561     t = (t + c) & BN_MASK2;
 562     c = (t < c);
 563     l = (t + b[0]) & BN_MASK2;
 564     c += (l < t);
 565     r[0] = l;
 566     t = a[1];
 567     t = (t + c) & BN_MASK2;
 568     c = (t < c);
 569     l = (t + b[1]) & BN_MASK2;
 570     c += (l < t);
 571     r[1] = l;
 572     t = a[2];
 573     t = (t + c) & BN_MASK2;
 574     c = (t < c);
 575     l = (t + b[2]) & BN_MASK2;
 576     c += (l < t);
 577     r[2] = l;
 578     t = a[3];
 579     t = (t + c) & BN_MASK2;
 580     c = (t < c);
 581     l = (t + b[3]) & BN_MASK2;
 582     c += (l < t);
 583     r[3] = l;
 584     a += 4;
 585     b += 4;
 586     r += 4;
 587     n -= 4;
 588   }
 589   while (n) {
 590     t = a[0];
 591     t = (t + c) & BN_MASK2;
 592     c = (t < c);
 593     l = (t + b[0]) & BN_MASK2;
 594     c += (l < t);
 595     r[0] = l;
 596     a++;
 597     b++;
 598     r++;
 599     n--;
 600   }
 601   return (BN_ULONG)c;
 602 }
 603
 604 #endif /* !BN_LLONG */
 605
 606 BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
 607                       int n) {
 608   BN_ULONG t1, t2;
 609   int c = 0;
 610
 611   assert(n >= 0);
 612   if (n <= 0) {
 613     return (BN_ULONG)0;
 614   }
 615
 616   while (n & ~3) {
 617     t1 = a[0];
 618     t2 = b[0];
 619     r[0] = (t1 - t2 - c) & BN_MASK2;
 620     if (t1 != t2)
 621       c = (t1 < t2);
 622     t1 = a[1];
 623     t2 = b[1];
 624     r[1] = (t1 - t2 - c) & BN_MASK2;
 625     if (t1 != t2)
 626       c = (t1 < t2);
 627     t1 = a[2];
 628     t2 = b[2];
 629     r[2] = (t1 - t2 - c) & BN_MASK2;
 630     if (t1 != t2)
 631       c = (t1 < t2);
 632     t1 = a[3];
 633     t2 = b[3];
 634     r[3] = (t1 - t2 - c) & BN_MASK2;
 635     if (t1 != t2)
 636       c = (t1 < t2);
 637     a += 4;
 638     b += 4;
 639     r += 4;
 640     n -= 4;
 641   }
 642   while (n) {
 643     t1 = a[0];
 644     t2 = b[0];
 645     r[0] = (t1 - t2 - c) & BN_MASK2;
 646     if (t1 != t2)
 647       c = (t1 < t2);
 648     a++;
 649     b++;
 650     r++;
 651     n--;
 652   }
 653   return c;
 654 }
 655
 656 /* mul_add_c(a,b,c0,c1,c2)  -- c+=a*b for three word number c=(c2,c1,c0) */
 657 /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
 658 /* sqr_add_c(a,i,c0,c1,c2)  -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
 659 /* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
 660
 661 #ifdef BN_LLONG
 662 #define mul_add_c(a, b, c0, c1, c2) \
 663   t = (BN_ULLONG)a * b;             \
 664   t1 = (BN_ULONG)Lw(t);             \
 665   t2 = (BN_ULONG)Hw(t);             \
 666   c0 = (c0 + t1) & BN_MASK2;        \
 667   if ((c0) < t1)                    \
 668     t2++;                           \
 669   c1 = (c1 + t2) & BN_MASK2;        \
 670   if ((c1) < t2)                    \
 671     c2++;
 672
 673 #define mul_add_c2(a, b, c0, c1, c2)           \
 674   t = (BN_ULLONG)a * b;                        \
 675   tt = (t + t) & BN_MASK;                      \
 676   if (tt < t)                                  \
 677     c2++;                                      \
 678   t1 = (BN_ULONG)Lw(tt);                       \
 679   t2 = (BN_ULONG)Hw(tt);                       \
 680   c0 = (c0 + t1) & BN_MASK2;                   \
 681   if ((c0 < t1) && (((++t2) & BN_MASK2) == 0)) \
 682     c2++;                                      \
 683   c1 = (c1 + t2) & BN_MASK2;                   \
 684   if ((c1) < t2)                               \
 685     c2++;
 686
 687 #define sqr_add_c(a, i, c0, c1, c2) \
 688   t = (BN_ULLONG)a[i] * a[i];       \
 689   t1 = (BN_ULONG)Lw(t);             \
 690   t2 = (BN_ULONG)Hw(t);             \
 691   c0 = (c0 + t1) & BN_MASK2;        \
 692   if ((c0) < t1)                    \
 693     t2++;                           \
 694   c1 = (c1 + t2) & BN_MASK2;        \
 695   if ((c1) < t2)                    \
 696     c2++;
 697
 698 #define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
 699
 700 #elif defined(BN_UMULT_LOHI)
 701
 702 #define mul_add_c(a, b, c0, c1, c2) \
 703   {                                 \
 704     BN_ULONG ta = (a), tb = (b);    \
 705     BN_UMULT_LOHI(t1, t2, ta, tb);  \
 706     c0 += t1;                       \
 707     t2 += (c0 < t1) ? 1 : 0;        \
 708     c1 += t2;                       \
 709     c2 += (c1 < t2) ? 1 : 0;        \
 710   }
 711
 712 #define mul_add_c2(a, b, c0, c1, c2) \
 713   {                                  \
 714     BN_ULONG ta = (a), tb = (b), t0; \
 715     BN_UMULT_LOHI(t0, t1, ta, tb);   \
 716     t2 = t1 + t1;                    \
 717     c2 += (t2 < t1) ? 1 : 0;         \
 718     t1 = t0 + t0;                    \
 719     t2 += (t1 < t0) ? 1 : 0;         \
 720     c0 += t1;                        \
 721     t2 += (c0 < t1) ? 1 : 0;         \
 722     c1 += t2;                        \
 723     c2 += (c1 < t2) ? 1 : 0;         \
 724   }
 725
 726 #define sqr_add_c(a, i, c0, c1, c2) \
 727   {                                 \
 728     BN_ULONG ta = (a)[i];           \
 729     BN_UMULT_LOHI(t1, t2, ta, ta);  \
 730     c0 += t1;                       \
 731     t2 += (c0 < t1) ? 1 : 0;        \
 732     c1 += t2;                       \
 733     c2 += (c1 < t2) ? 1 : 0;        \
 734   }
 735
 736 #define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
 737
 738 #elif defined(BN_UMULT_HIGH)
 739
 740 #define mul_add_c(a, b, c0, c1, c2) \
 741   {                                 \
 742     BN_ULONG ta = (a), tb = (b);    \
 743     t1 = ta * tb;                   \
 744     t2 = BN_UMULT_HIGH(ta, tb);     \
 745     c0 += t1;                       \
 746     t2 += (c0 < t1) ? 1 : 0;        \
 747     c1 += t2;                       \
 748     c2 += (c1 < t2) ? 1 : 0;        \
 749   }
 750
 751 #define mul_add_c2(a, b, c0, c1, c2) \
 752   {                                  \
 753     BN_ULONG ta = (a), tb = (b), t0; \
 754     t1 = BN_UMULT_HIGH(ta, tb);      \
 755     t0 = ta * tb;                    \
 756     t2 = t1 + t1;                    \
 757     c2 += (t2 < t1) ? 1 : 0;         \
 758     t1 = t0 + t0;                    \
 759     t2 += (t1 < t0) ? 1 : 0;         \
 760     c0 += t1;                        \
 761     t2 += (c0 < t1) ? 1 : 0;         \
 762     c1 += t2;                        \
 763     c2 += (c1 < t2) ? 1 : 0;         \
 764   }
 765
 766 #define sqr_add_c(a, i, c0, c1, c2) \
 767   {                                 \
 768     BN_ULONG ta = (a)[i];           \
 769     t1 = ta * ta;                   \
 770     t2 = BN_UMULT_HIGH(ta, ta);     \
 771     c0 += t1;                       \
 772     t2 += (c0 < t1) ? 1 : 0;        \
 773     c1 += t2;                       \
 774     c2 += (c1 < t2) ? 1 : 0;        \
 775   }
 776
 777 #define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
 778
 779 #else /* !BN_LLONG */
 780 #define mul_add_c(a, b, c0, c1, c2) \
 781   t1 = LBITS(a);                    \
 782   t2 = HBITS(a);                    \
 783   bl = LBITS(b);                    \
 784   bh = HBITS(b);                    \
 785   mul64(t1, t2, bl, bh);            \
 786   c0 = (c0 + t1) & BN_MASK2;        \
 787   if ((c0) < t1)                    \
 788     t2++;                           \
 789   c1 = (c1 + t2) & BN_MASK2;        \
 790   if ((c1) < t2)                    \
 791     c2++;
 792
 793 #define mul_add_c2(a, b, c0, c1, c2)           \
 794   t1 = LBITS(a);                               \
 795   t2 = HBITS(a);                               \
 796   bl = LBITS(b);                               \
 797   bh = HBITS(b);                               \
 798   mul64(t1, t2, bl, bh);                       \
 799   if (t2 & BN_TBIT)                            \
 800     c2++;                                      \
 801   t2 = (t2 + t2) & BN_MASK2;                   \
 802   if (t1 & BN_TBIT)                            \
 803     t2++;                                      \
 804   t1 = (t1 + t1) & BN_MASK2;                   \
 805   c0 = (c0 + t1) & BN_MASK2;                   \
 806   if ((c0 < t1) && (((++t2) & BN_MASK2) == 0)) \
 807     c2++;                                      \
 808   c1 = (c1 + t2) & BN_MASK2;                   \
 809   if ((c1) < t2)                               \
 810     c2++;
 811
 812 #define sqr_add_c(a, i, c0, c1, c2) \
 813   sqr64(t1, t2, (a)[i]);            \
 814   c0 = (c0 + t1) & BN_MASK2;        \
 815   if ((c0) < t1)                    \
 816     t2++;                           \
 817   c1 = (c1 + t2) & BN_MASK2;        \
 818   if ((c1) < t2)                    \
 819     c2++;
 820
 821 #define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
 822 #endif /* !BN_LLONG */
 823
 824 void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
 825 #if defined(BN_LLONG)
 826   BN_ULLONG t;
 827 #elif !defined(BN_UMULT_LOHI) && !defined(BN_UMULT_HIGH)
 828   BN_ULONG bl, bh;
 829 #endif
 830   BN_ULONG t1, t2;
 831   BN_ULONG c1, c2, c3;
 832
 833   c1 = 0;
 834   c2 = 0;
 835   c3 = 0;
 836   mul_add_c(a[0], b[0], c1, c2, c3);
 837   r[0] = c1;
 838   c1 = 0;
 839   mul_add_c(a[0], b[1], c2, c3, c1);
 840   mul_add_c(a[1], b[0], c2, c3, c1);
 841   r[1] = c2;
 842   c2 = 0;
 843   mul_add_c(a[2], b[0], c3, c1, c2);
 844   mul_add_c(a[1], b[1], c3, c1, c2);
 845   mul_add_c(a[0], b[2], c3, c1, c2);
 846   r[2] = c3;
 847   c3 = 0;
 848   mul_add_c(a[0], b[3], c1, c2, c3);
 849   mul_add_c(a[1], b[2], c1, c2, c3);
 850   mul_add_c(a[2], b[1], c1, c2, c3);
 851   mul_add_c(a[3], b[0], c1, c2, c3);
 852   r[3] = c1;
 853   c1 = 0;
 854   mul_add_c(a[4], b[0], c2, c3, c1);
 855   mul_add_c(a[3], b[1], c2, c3, c1);
 856   mul_add_c(a[2], b[2], c2, c3, c1);
 857   mul_add_c(a[1], b[3], c2, c3, c1);
 858   mul_add_c(a[0], b[4], c2, c3, c1);
 859   r[4] = c2;
 860   c2 = 0;
 861   mul_add_c(a[0], b[5], c3, c1, c2);
 862   mul_add_c(a[1], b[4], c3, c1, c2);
 863   mul_add_c(a[2], b[3], c3, c1, c2);
 864   mul_add_c(a[3], b[2], c3, c1, c2);
 865   mul_add_c(a[4], b[1], c3, c1, c2);
 866   mul_add_c(a[5], b[0], c3, c1, c2);
 867   r[5] = c3;
 868   c3 = 0;
 869   mul_add_c(a[6], b[0], c1, c2, c3);
 870   mul_add_c(a[5], b[1], c1, c2, c3);
 871   mul_add_c(a[4], b[2], c1, c2, c3);
 872   mul_add_c(a[3], b[3], c1, c2, c3);
 873   mul_add_c(a[2], b[4], c1, c2, c3);
 874   mul_add_c(a[1], b[5], c1, c2, c3);
 875   mul_add_c(a[0], b[6], c1, c2, c3);
 876   r[6] = c1;
 877   c1 = 0;
 878   mul_add_c(a[0], b[7], c2, c3, c1);
 879   mul_add_c(a[1], b[6], c2, c3, c1);
 880   mul_add_c(a[2], b[5], c2, c3, c1);
 881   mul_add_c(a[3], b[4], c2, c3, c1);
 882   mul_add_c(a[4], b[3], c2, c3, c1);
 883   mul_add_c(a[5], b[2], c2, c3, c1);
 884   mul_add_c(a[6], b[1], c2, c3, c1);
 885   mul_add_c(a[7], b[0], c2, c3, c1);
 886   r[7] = c2;
 887   c2 = 0;
 888   mul_add_c(a[7], b[1], c3, c1, c2);
 889   mul_add_c(a[6], b[2], c3, c1, c2);
 890   mul_add_c(a[5], b[3], c3, c1, c2);
 891   mul_add_c(a[4], b[4], c3, c1, c2);
 892   mul_add_c(a[3], b[5], c3, c1, c2);
 893   mul_add_c(a[2], b[6], c3, c1, c2);
 894   mul_add_c(a[1], b[7], c3, c1, c2);
 895   r[8] = c3;
 896   c3 = 0;
 897   mul_add_c(a[2], b[7], c1, c2, c3);
 898   mul_add_c(a[3], b[6], c1, c2, c3);
 899   mul_add_c(a[4], b[5], c1, c2, c3);
 900   mul_add_c(a[5], b[4], c1, c2, c3);
 901   mul_add_c(a[6], b[3], c1, c2, c3);
 902   mul_add_c(a[7], b[2], c1, c2, c3);
 903   r[9] = c1;
 904   c1 = 0;
 905   mul_add_c(a[7], b[3], c2, c3, c1);
 906   mul_add_c(a[6], b[4], c2, c3, c1);
 907   mul_add_c(a[5], b[5], c2, c3, c1);
 908   mul_add_c(a[4], b[6], c2, c3, c1);
 909   mul_add_c(a[3], b[7], c2, c3, c1);
 910   r[10] = c2;
 911   c2 = 0;
 912   mul_add_c(a[4], b[7], c3, c1, c2);
 913   mul_add_c(a[5], b[6], c3, c1, c2);
 914   mul_add_c(a[6], b[5], c3, c1, c2);
 915   mul_add_c(a[7], b[4], c3, c1, c2);
 916   r[11] = c3;
 917   c3 = 0;
 918   mul_add_c(a[7], b[5], c1, c2, c3);
 919   mul_add_c(a[6], b[6], c1, c2, c3);
 920   mul_add_c(a[5], b[7], c1, c2, c3);
 921   r[12] = c1;
 922   c1 = 0;
 923   mul_add_c(a[6], b[7], c2, c3, c1);
 924   mul_add_c(a[7], b[6], c2, c3, c1);
 925   r[13] = c2;
 926   c2 = 0;
 927   mul_add_c(a[7], b[7], c3, c1, c2);
 928   r[14] = c3;
 929   r[15] = c1;
 930 }
 931
 932 void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
 933 #if defined(BN_LLONG)
 934   BN_ULLONG t;
 935 #elif !defined(BN_UMULT_LOHI) && !defined(BN_UMULT_HIGH)
 936   BN_ULONG bl, bh;
 937 #endif
 938   BN_ULONG t1, t2;
 939   BN_ULONG c1, c2, c3;
 940
 941   c1 = 0;
 942   c2 = 0;
 943   c3 = 0;
 944   mul_add_c(a[0], b[0], c1, c2, c3);
 945   r[0] = c1;
 946   c1 = 0;
 947   mul_add_c(a[0], b[1], c2, c3, c1);
 948   mul_add_c(a[1], b[0], c2, c3, c1);
 949   r[1] = c2;
 950   c2 = 0;
 951   mul_add_c(a[2], b[0], c3, c1, c2);
 952   mul_add_c(a[1], b[1], c3, c1, c2);
 953   mul_add_c(a[0], b[2], c3, c1, c2);
 954   r[2] = c3;
 955   c3 = 0;
 956   mul_add_c(a[0], b[3], c1, c2, c3);
 957   mul_add_c(a[1], b[2], c1, c2, c3);
 958   mul_add_c(a[2], b[1], c1, c2, c3);
 959   mul_add_c(a[3], b[0], c1, c2, c3);
 960   r[3] = c1;
 961   c1 = 0;
 962   mul_add_c(a[3], b[1], c2, c3, c1);
 963   mul_add_c(a[2], b[2], c2, c3, c1);
 964   mul_add_c(a[1], b[3], c2, c3, c1);
 965   r[4] = c2;
 966   c2 = 0;
 967   mul_add_c(a[2], b[3], c3, c1, c2);
 968   mul_add_c(a[3], b[2], c3, c1, c2);
 969   r[5] = c3;
 970   c3 = 0;
 971   mul_add_c(a[3], b[3], c1, c2, c3);
 972   r[6] = c1;
 973   r[7] = c2;
 974 }
 975
 976 void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) {
 977 #if defined(BN_LLONG)
 978   BN_ULLONG t, tt;
 979 #elif !defined(BN_UMULT_LOHI) && !defined(BN_UMULT_HIGH)
 980   BN_ULONG bl, bh;
 981 #endif
 982   BN_ULONG t1, t2;
 983   BN_ULONG c1, c2, c3;
 984
 985   c1 = 0;
 986   c2 = 0;
 987   c3 = 0;
 988   sqr_add_c(a, 0, c1, c2, c3);
 989   r[0] = c1;
 990   c1 = 0;
 991   sqr_add_c2(a, 1, 0, c2, c3, c1);
 992   r[1] = c2;
 993   c2 = 0;
 994   sqr_add_c(a, 1, c3, c1, c2);
 995   sqr_add_c2(a, 2, 0, c3, c1, c2);
 996   r[2] = c3;
 997   c3 = 0;
 998   sqr_add_c2(a, 3, 0, c1, c2, c3);
 999   sqr_add_c2(a, 2, 1, c1, c2, c3);
1000   r[3] = c1;
1001   c1 = 0;
1002   sqr_add_c(a, 2, c2, c3, c1);
1003   sqr_add_c2(a, 3, 1, c2, c3, c1);
1004   sqr_add_c2(a, 4, 0, c2, c3, c1);
1005   r[4] = c2;
1006   c2 = 0;
1007   sqr_add_c2(a, 5, 0, c3, c1, c2);
1008   sqr_add_c2(a, 4, 1, c3, c1, c2);
1009   sqr_add_c2(a, 3, 2, c3, c1, c2);
1010   r[5] = c3;
1011   c3 = 0;
1012   sqr_add_c(a, 3, c1, c2, c3);
1013   sqr_add_c2(a, 4, 2, c1, c2, c3);
1014   sqr_add_c2(a, 5, 1, c1, c2, c3);
1015   sqr_add_c2(a, 6, 0, c1, c2, c3);
1016   r[6] = c1;
1017   c1 = 0;
1018   sqr_add_c2(a, 7, 0, c2, c3, c1);
1019   sqr_add_c2(a, 6, 1, c2, c3, c1);
1020   sqr_add_c2(a, 5, 2, c2, c3, c1);
1021   sqr_add_c2(a, 4, 3, c2, c3, c1);
1022   r[7] = c2;
1023   c2 = 0;
1024   sqr_add_c(a, 4, c3, c1, c2);
1025   sqr_add_c2(a, 5, 3, c3, c1, c2);
1026   sqr_add_c2(a, 6, 2, c3, c1, c2);
1027   sqr_add_c2(a, 7, 1, c3, c1, c2);
1028   r[8] = c3;
1029   c3 = 0;
1030   sqr_add_c2(a, 7, 2, c1, c2, c3);
1031   sqr_add_c2(a, 6, 3, c1, c2, c3);
1032   sqr_add_c2(a, 5, 4, c1, c2, c3);
1033   r[9] = c1;
1034   c1 = 0;
1035   sqr_add_c(a, 5, c2, c3, c1);
1036   sqr_add_c2(a, 6, 4, c2, c3, c1);
1037   sqr_add_c2(a, 7, 3, c2, c3, c1);
1038   r[10] = c2;
1039   c2 = 0;
1040   sqr_add_c2(a, 7, 4, c3, c1, c2);
1041   sqr_add_c2(a, 6, 5, c3, c1, c2);
1042   r[11] = c3;
1043   c3 = 0;
1044   sqr_add_c(a, 6, c1, c2, c3);
1045   sqr_add_c2(a, 7, 5, c1, c2, c3);
1046   r[12] = c1;
1047   c1 = 0;
1048   sqr_add_c2(a, 7, 6, c2, c3, c1);
1049   r[13] = c2;
1050   c2 = 0;
1051   sqr_add_c(a, 7, c3, c1, c2);
1052   r[14] = c3;
1053   r[15] = c1;
1054 }
1055
1056 void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) {
1057 #if defined(BN_LLONG)
1058   BN_ULLONG t, tt;
1059 #elif !defined(BN_UMULT_LOHI) && !defined(BN_UMULT_HIGH)
1060   BN_ULONG bl, bh;
1061 #endif
1062   BN_ULONG t1, t2;
1063   BN_ULONG c1, c2, c3;
1064
1065   c1 = 0;
1066   c2 = 0;
1067   c3 = 0;
1068   sqr_add_c(a, 0, c1, c2, c3);
1069   r[0] = c1;
1070   c1 = 0;
1071   sqr_add_c2(a, 1, 0, c2, c3, c1);
1072   r[1] = c2;
1073   c2 = 0;
1074   sqr_add_c(a, 1, c3, c1, c2);
1075   sqr_add_c2(a, 2, 0, c3, c1, c2);
1076   r[2] = c3;
1077   c3 = 0;
1078   sqr_add_c2(a, 3, 0, c1, c2, c3);
1079   sqr_add_c2(a, 2, 1, c1, c2, c3);
1080   r[3] = c1;
1081   c1 = 0;
1082   sqr_add_c(a, 2, c2, c3, c1);
1083   sqr_add_c2(a, 3, 1, c2, c3, c1);
1084   r[4] = c2;
1085   c2 = 0;
1086   sqr_add_c2(a, 3, 2, c3, c1, c2);
1087   r[5] = c3;
1088   c3 = 0;
1089   sqr_add_c(a, 3, c1, c2, c3);
1090   r[6] = c1;
1091   r[7] = c2;
1092 }
1093
1094 #if defined(OPENSSL_NO_ASM) || (!defined(OPENSSL_ARM) && !defined(OPENSSL_X86_64))
1095 /* This is essentially reference implementation, which may or may not
1096  * result in performance improvement. E.g. on IA-32 this routine was
1097  * observed to give 40% faster rsa1024 private key operations and 10%
1098  * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only
1099  * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a
1100  * reference implementation, one to be used as starting point for
1101  * platform-specific assembler. Mentioned numbers apply to compiler
1102  * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and
1103  * can vary not only from platform to platform, but even for compiler
1104  * versions. Assembler vs. assembler improvement coefficients can
1105  * [and are known to] differ and are to be documented elsewhere. */
1106 int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
1107                 const BN_ULONG *np, const BN_ULONG *n0p, int num) {
1108   BN_ULONG c0, c1, ml, *tp, n0;
1109 #ifdef mul64
1110   BN_ULONG mh;
1111 #endif
1112   volatile BN_ULONG *vp;
1113   int i = 0, j;
1114
1115 #if 0 /* template for platform-specific implementation */
1116         if (ap==bp)     return bn_sqr_mont(rp,ap,np,n0p,num);
1117 #endif
1118   vp = tp = alloca((num + 2) * sizeof(BN_ULONG));
1119
1120   n0 = *n0p;
1121
1122   c0 = 0;
1123   ml = bp[0];
1124 #ifdef mul64
1125   mh = HBITS(ml);
1126   ml = LBITS(ml);
1127   for (j = 0; j < num; ++j)
1128     mul(tp[j], ap[j], ml, mh, c0);
1129 #else
1130   for (j = 0; j < num; ++j)
1131     mul(tp[j], ap[j], ml, c0);
1132 #endif
1133
1134   tp[num] = c0;
1135   tp[num + 1] = 0;
1136   goto enter;
1137
1138   for (i = 0; i < num; i++) {
1139     c0 = 0;
1140     ml = bp[i];
1141 #ifdef mul64
1142     mh = HBITS(ml);
1143     ml = LBITS(ml);
1144     for (j = 0; j < num; ++j)
1145       mul_add(tp[j], ap[j], ml, mh, c0);
1146 #else
1147     for (j = 0; j < num; ++j)
1148       mul_add(tp[j], ap[j], ml, c0);
1149 #endif
1150     c1 = (tp[num] + c0) & BN_MASK2;
1151     tp[num] = c1;
1152     tp[num + 1] = (c1 < c0 ? 1 : 0);
1153   enter:
1154     c1 = tp[0];
1155     ml = (c1 * n0) & BN_MASK2;
1156     c0 = 0;
1157 #ifdef mul64
1158     mh = HBITS(ml);
1159     ml = LBITS(ml);
1160     mul_add(c1, np[0], ml, mh, c0);
1161 #else
1162     mul_add(c1, ml, np[0], c0);
1163 #endif
1164     for (j = 1; j < num; j++) {
1165       c1 = tp[j];
1166 #ifdef mul64
1167       mul_add(c1, np[j], ml, mh, c0);
1168 #else
1169       mul_add(c1, ml, np[j], c0);
1170 #endif
1171       tp[j - 1] = c1 & BN_MASK2;
1172     }
1173     c1 = (tp[num] + c0) & BN_MASK2;
1174     tp[num - 1] = c1;
1175     tp[num] = tp[num + 1] + (c1 < c0 ? 1 : 0);
1176   }
1177
1178   if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
1179     c0 = bn_sub_words(rp, tp, np, num);
1180     if (tp[num] != 0 || c0 == 0) {
1181       for (i = 0; i < num + 2; i++)
1182         vp[i] = 0;
1183       return 1;
1184     }
1185   }
1186   for (i = 0; i < num; i++)
1187     rp[i] = tp[i], vp[i] = 0;
1188   vp[num] = 0;
1189   vp[num + 1] = 0;
1190   return 1;
1191 }
1192 #endif
1193
1194 #endif