1 /* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
4 * This package is an SSL implementation written
5 * by Eric Young (eay@cryptsoft.com).
6 * The implementation was written so as to conform with Netscapes SSL.
8 * This library is free for commercial and non-commercial use as long as
9 * the following conditions are aheared to. The following conditions
10 * apply to all code found in this distribution, be it the RC4, RSA,
11 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
12 * included with this distribution is covered by the same copyright terms
13 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
15 * Copyright remains Eric Young's, and as such any Copyright notices in
16 * the code are not to be removed.
17 * If this package is used in a product, Eric Young should be given attribution
18 * as the author of the parts of the library used.
19 * This can be in the form of a textual message at program startup or
20 * in documentation (online or textual) provided with the package.
22 * Redistribution and use in source and binary forms, with or without
23 * modification, are permitted provided that the following conditions
25 * 1. Redistributions of source code must retain the copyright
26 * notice, this list of conditions and the following disclaimer.
27 * 2. Redistributions in binary form must reproduce the above copyright
28 * notice, this list of conditions and the following disclaimer in the
29 * documentation and/or other materials provided with the distribution.
30 * 3. All advertising materials mentioning features or use of this software
31 * must display the following acknowledgement:
32 * "This product includes cryptographic software written by
33 * Eric Young (eay@cryptsoft.com)"
34 * The word 'cryptographic' can be left out if the rouines from the library
35 * being used are not cryptographic related :-).
36 * 4. If you include any Windows specific code (or a derivative thereof) from
37 * the apps directory (application code) you must include an acknowledgement:
38 * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
40 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
41 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
42 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
43 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
44 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
45 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
46 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
48 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
49 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
52 * The licence and distribution terms for any publically available version or
53 * derivative of this code cannot be changed. i.e. this code cannot simply be
54 * copied and put under another distribution licence
55 * [including the GNU Public Licence.] */
57 #include <openssl/bn.h>
64 /* Generic implementations of most operations are needed for:
65 * - Configurations without inline assembly.
66 * - Architectures other than x86 or x86_64.
67 * Windows x86_64; x86_64-gcc.c does not build on MSVC. */
/* Use the generic C implementations below when no usable assembly
 * back-end exists for this build configuration (see comment above). */
68 #if defined(OPENSSL_NO_ASM) || \
69 (!defined(OPENSSL_X86_64) && !defined(OPENSSL_X86)) || \
70 (defined(OPENSSL_X86_64) && defined(OPENSSL_WINDOWS))
/* MSVC spells the stack-allocation intrinsic with a leading underscore. */
72 #if defined(OPENSSL_WINDOWS)
73 #define alloca _alloca
/* Scalar single-word primitives, selected by available compiler support:
 *   mul_add(r, a, w, c): r = low word of (a*w + r + c); carry out left in c
 *   mul(r, a, w, c):     r = low word of (a*w + c);     carry out left in c
 *   sqr(r0, r1, a):      (r1:r0) = a * a as a double-word result
 * First branch: a double-width BN_ULLONG type is available, so the
 * product and carries fall out of one wide multiply. */
79 #define mul_add(r, a, w, c) \
82 t = (BN_ULLONG)w * (a) + (r) + (c); \
87 #define mul(r, a, w, c) \
90 t = (BN_ULLONG)w * (a) + (c); \
95 #define sqr(r0, r1, a) \
98 t = (BN_ULLONG)(a) * (a); \
/* Second branch: BN_UMULT_LOHI produces both product halves at once;
 * carries are propagated with unsigned "result < addend" compares. */
103 #elif defined(BN_UMULT_LOHI)
104 #define mul_add(r, a, w, c) \
106 BN_ULONG high, low, ret, tmp = (a); \
108 BN_UMULT_LOHI(low, high, w, tmp); \
110 (c) = (ret < (c)) ? 1 : 0; \
113 (c) += (ret < low) ? 1 : 0; \
117 #define mul(r, a, w, c) \
119 BN_ULONG high, low, ret, ta = (a); \
120 BN_UMULT_LOHI(low, high, w, ta); \
123 (c) += (ret < low) ? 1 : 0; \
127 #define sqr(r0, r1, a) \
129 BN_ULONG tmp = (a); \
130 BN_UMULT_LOHI(r0, r1, tmp, tmp); \
/* Third branch: only the high half comes from BN_UMULT_HIGH; the low
 * half is an ordinary truncating multiply (elided in this listing). */
133 #elif defined(BN_UMULT_HIGH)
134 #define mul_add(r, a, w, c) \
136 BN_ULONG high, low, ret, tmp = (a); \
138 high = BN_UMULT_HIGH(w, tmp); \
141 (c) = (ret < (c)) ? 1 : 0; \
144 (c) += (ret < low) ? 1 : 0; \
148 #define mul(r, a, w, c) \
150 BN_ULONG high, low, ret, ta = (a); \
152 high = BN_UMULT_HIGH(w, ta); \
155 (c) += (ret < low) ? 1 : 0; \
159 #define sqr(r0, r1, a) \
161 BN_ULONG tmp = (a); \
163 (r1) = BN_UMULT_HIGH(tmp, tmp); \
/* Last resort: synthesize double-word arithmetic from half words. */
167 /*************************************************************
/* Half-word extraction/placement helpers (BN_BITS4 is half a word,
 * BN_BITS2 a full word; the BN_MASK* constants mask the matching span). */
171 #define LBITS(a) ((a) & BN_MASK2l)
172 #define HBITS(a) (((a) >> BN_BITS4) & BN_MASK2l)
173 #define L2HBITS(a) (((a) << BN_BITS4) & BN_MASK2)
175 #define LLBITS(a) ((a) & BN_MASKl)
176 #define LHBITS(a) (((a) >> BN_BITS2) & BN_MASKl)
177 #define LL2HBITS(a) ((BN_ULLONG)((a) & BN_MASKl) << BN_BITS2)
/* mul64: double-word product via schoolbook cross-products of half words.
 * NOTE(review): operand roles (l/h in-out vs. bl/bh multiplier halves)
 * are inferred from the standard OpenSSL layout -- confirm against the
 * unelided source before relying on them. */
179 #define mul64(l, h, bl, bh) \
181 BN_ULONG m, m1, lt, ht; \
189 m = (m + m1) & BN_MASK2; \
191 ht += L2HBITS((BN_ULONG)1); \
194 lt = (lt + m1) & BN_MASK2; \
/* sqr64: (ho:lo) = in * in using the same half-word decomposition; the
 * cross term m is doubled by the shift-left-by-(BN_BITS4+1) below. */
201 #define sqr64(lo, ho, in) \
211 h += (m & BN_MASK2h1) >> (BN_BITS4 - 1); \
212 m = (m & BN_MASK2l) << (BN_BITS4 + 1); \
213 l = (l + m) & BN_MASK2; \
/* In this fallback, mul_add/mul take the multiplier already split into
 * its (bl, bh) half words, so the split is hoisted out of callers' loops. */
220 #define mul_add(r, a, bl, bh, c) \
227 mul64(l, h, (bl), (bh)); \
229 /* non-multiply part */ \
230 l = (l + (c)) & BN_MASK2; \
234 l = (l + (c)) & BN_MASK2; \
237 (c) = h & BN_MASK2; \
241 #define mul(r, a, bl, bh, c) \
248 mul64(l, h, (bl), (bh)); \
250 /* non-multiply part */ \
252 if ((l & BN_MASK2) < (c)) \
254 (c) = h & BN_MASK2; \
255 (r) = l & BN_MASK2; \
257 #endif /* !BN_LLONG */
259 #if defined(BN_LLONG) || defined(BN_UMULT_HIGH)
/* rp[i] += ap[i] * w for num words; returns the final carry word.
 * Unrolled four words per iteration with a single-word tail loop
 * (loop scaffolding elided in this listing). */
261 BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
271 mul_add(rp[0], ap[0], w, c1);
272 mul_add(rp[1], ap[1], w, c1);
273 mul_add(rp[2], ap[2], w, c1);
274 mul_add(rp[3], ap[3], w, c1);
281 mul_add(rp[0], ap[0], w, c1);
/* rp[i] = ap[i] * w for num words; returns the final carry word. */
290 BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) {
299 mul(rp[0], ap[0], w, c1);
300 mul(rp[1], ap[1], w, c1);
301 mul(rp[2], ap[2], w, c1);
302 mul(rp[3], ap[3], w, c1);
308 mul(rp[0], ap[0], w, c1);
/* For each of n input words: (r[2i+1]:r[2i]) = a[i]^2 (double-word). */
316 void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) {
323 sqr(r[0], r[1], a[0]);
324 sqr(r[2], r[3], a[1]);
325 sqr(r[4], r[5], a[2]);
326 sqr(r[6], r[7], a[3]);
332 sqr(r[0], r[1], a[0]);
339 #else /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */
/* Same three word-array primitives as the branch above, but built on the
 * half-word mul/mul_add/sqr64 macros: the multiplier w is pre-split into
 * (bl, bh) halves once (split elided here) and reused across the loop. */
341 BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
355 mul_add(rp[0], ap[0], bl, bh, c);
356 mul_add(rp[1], ap[1], bl, bh, c);
357 mul_add(rp[2], ap[2], bl, bh, c);
358 mul_add(rp[3], ap[3], bl, bh, c);
364 mul_add(rp[0], ap[0], bl, bh, c);
/* rp[i] = ap[i] * w; returns the final carry word. */
372 BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) {
385 mul(rp[0], ap[0], bl, bh, carry);
386 mul(rp[1], ap[1], bl, bh, carry);
387 mul(rp[2], ap[2], bl, bh, carry);
388 mul(rp[3], ap[3], bl, bh, carry);
394 mul(rp[0], ap[0], bl, bh, carry);
/* (r[2i+1]:r[2i]) = a[i]^2 using the half-word squaring macro. */
402 void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) {
409 sqr64(r[0], r[1], a[0]);
410 sqr64(r[2], r[3], a[1]);
411 sqr64(r[4], r[5], a[2]);
412 sqr64(r[6], r[7], a[3]);
418 sqr64(r[0], r[1], a[0]);
427 #if defined(BN_LLONG) && defined(BN_DIV2W)
/* Returns the one-word quotient of the double word (h:l) divided by d. */
429 BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
430 return (BN_ULONG)(((((BN_ULLONG)h) << BN_BITS2) | l) / (BN_ULLONG)d);
435 /* Divide h,l by d and return the result. */
/* Fallback: schoolbook division one half word at a time.  The assert
 * below constrains h relative to d's bit length so the quotient fits in
 * a single word -- callers must guarantee this. */
436 BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
437 BN_ULONG dh, dl, q, ret = 0, th, tl, t;
444 i = BN_num_bits_word(d);
445 assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i));
/* Normalization step: fold the top bits of l into h (the matching left
 * shift of d happens in lines elided from this listing). */
454 h = (h << i) | (l >> (BN_BITS2 - i));
457 dh = (d & BN_MASK2h) >> BN_BITS4;
458 dl = (d & BN_MASK2l);
460 if ((h >> BN_BITS4) == dh) {
/* Adjustment: while the trial quotient would make the partial
 * remainder go negative, decrement it (loop structure elided). */
470 if ((t & BN_MASK2h) ||
471 ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4)))) {
478 t = (tl >> BN_BITS4);
479 tl = (tl << BN_BITS4) & BN_MASK2h;
/* Shift the next half word of the dividend in and repeat. */
497 h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2;
498 l = (l & BN_MASK2l) << BN_BITS4;
505 #endif /* !defined(BN_LLONG) && defined(BN_DIV2W) */
/* r[i] = a[i] + b[i] for n words; returns the final carry (0 or 1).
 * BN_LLONG branch: the running sum lives in a double-width ll whose high
 * word carries into the next iteration; unrolled four words at a time
 * with a single-word tail (loop scaffolding elided in this listing). */
508 BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
518 ll += (BN_ULLONG)a[0] + b[0];
519 r[0] = (BN_ULONG)ll & BN_MASK2;
521 ll += (BN_ULLONG)a[1] + b[1];
522 r[1] = (BN_ULONG)ll & BN_MASK2;
524 ll += (BN_ULLONG)a[2] + b[2];
525 r[2] = (BN_ULONG)ll & BN_MASK2;
527 ll += (BN_ULLONG)a[3] + b[3];
528 r[3] = (BN_ULONG)ll & BN_MASK2;
536 ll += (BN_ULLONG)a[0] + b[0];
537 r[0] = (BN_ULONG)ll & BN_MASK2;
547 #else /* !BN_LLONG */
/* Single-width variant: carry c is re-derived after each word via
 * unsigned wraparound compares (the compare lines are elided here). */
549 BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
561 t = (t + c) & BN_MASK2;
563 l = (t + b[0]) & BN_MASK2;
567 t = (t + c) & BN_MASK2;
569 l = (t + b[1]) & BN_MASK2;
573 t = (t + c) & BN_MASK2;
575 l = (t + b[2]) & BN_MASK2;
579 t = (t + c) & BN_MASK2;
581 l = (t + b[3]) & BN_MASK2;
591 t = (t + c) & BN_MASK2;
593 l = (t + b[0]) & BN_MASK2;
604 #endif /* !BN_LLONG */
/* r[i] = a[i] - b[i] for n words; returns the final borrow (0 or 1).
 * Borrow c propagates word to word; its update after each subtraction
 * is elided in this listing. */
606 BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
619 r[0] = (t1 - t2 - c) & BN_MASK2;
624 r[1] = (t1 - t2 - c) & BN_MASK2;
629 r[2] = (t1 - t2 - c) & BN_MASK2;
634 r[3] = (t1 - t2 - c) & BN_MASK2;
645 r[0] = (t1 - t2 - c) & BN_MASK2;
/* Three-word accumulator macros for the comba routines below.  Each
 * keeps a running 3-word sum in (c2,c1,c0) and adds a one- or two-word
 * product into it, rippling carries upward. */
656 /* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
657 /* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
658 /* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
659 /* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) */
/* BN_LLONG branch: the product is formed in a double-width t and split
 * with Lw/Hw before accumulation. */
662 #define mul_add_c(a, b, c0, c1, c2) \
663 t = (BN_ULLONG)a * b; \
664 t1 = (BN_ULONG)Lw(t); \
665 t2 = (BN_ULONG)Hw(t); \
666 c0 = (c0 + t1) & BN_MASK2; \
669 c1 = (c1 + t2) & BN_MASK2; \
/* Doubled variant: tt = 2t may itself overflow a BN_ULLONG; the elided
 * lines account for that extra top bit before the split below. */
673 #define mul_add_c2(a, b, c0, c1, c2) \
674 t = (BN_ULLONG)a * b; \
675 tt = (t + t) & BN_MASK; \
678 t1 = (BN_ULONG)Lw(tt); \
679 t2 = (BN_ULONG)Hw(tt); \
680 c0 = (c0 + t1) & BN_MASK2; \
681 if ((c0 < t1) && (((++t2) & BN_MASK2) == 0)) \
683 c1 = (c1 + t2) & BN_MASK2; \
687 #define sqr_add_c(a, i, c0, c1, c2) \
688 t = (BN_ULLONG)a[i] * a[i]; \
689 t1 = (BN_ULONG)Lw(t); \
690 t2 = (BN_ULONG)Hw(t); \
691 c0 = (c0 + t1) & BN_MASK2; \
694 c1 = (c1 + t2) & BN_MASK2; \
698 #define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
/* BN_UMULT_LOHI branch: both product halves in one intrinsic; carries
 * come from unsigned "sum < addend" compares. */
700 #elif defined(BN_UMULT_LOHI)
702 #define mul_add_c(a, b, c0, c1, c2) \
704 BN_ULONG ta = (a), tb = (b); \
705 BN_UMULT_LOHI(t1, t2, ta, tb); \
707 t2 += (c0 < t1) ? 1 : 0; \
709 c2 += (c1 < t2) ? 1 : 0; \
/* Doubled variant: the product (t1:t0 low, t2:t1 high after the elided
 * shifts) is added twice, each pass rippling its own carries. */
712 #define mul_add_c2(a, b, c0, c1, c2) \
714 BN_ULONG ta = (a), tb = (b), t0; \
715 BN_UMULT_LOHI(t0, t1, ta, tb); \
717 c2 += (t2 < t1) ? 1 : 0; \
719 t2 += (t1 < t0) ? 1 : 0; \
721 t2 += (c0 < t1) ? 1 : 0; \
723 c2 += (c1 < t2) ? 1 : 0; \
726 #define sqr_add_c(a, i, c0, c1, c2) \
728 BN_ULONG ta = (a)[i]; \
729 BN_UMULT_LOHI(t1, t2, ta, ta); \
731 t2 += (c0 < t1) ? 1 : 0; \
733 c2 += (c1 < t2) ? 1 : 0; \
736 #define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
/* BN_UMULT_HIGH branch: high half from the intrinsic, low half from an
 * ordinary truncating multiply (elided). */
738 #elif defined(BN_UMULT_HIGH)
740 #define mul_add_c(a, b, c0, c1, c2) \
742 BN_ULONG ta = (a), tb = (b); \
744 t2 = BN_UMULT_HIGH(ta, tb); \
746 t2 += (c0 < t1) ? 1 : 0; \
748 c2 += (c1 < t2) ? 1 : 0; \
751 #define mul_add_c2(a, b, c0, c1, c2) \
753 BN_ULONG ta = (a), tb = (b), t0; \
754 t1 = BN_UMULT_HIGH(ta, tb); \
757 c2 += (t2 < t1) ? 1 : 0; \
759 t2 += (t1 < t0) ? 1 : 0; \
761 t2 += (c0 < t1) ? 1 : 0; \
763 c2 += (c1 < t2) ? 1 : 0; \
766 #define sqr_add_c(a, i, c0, c1, c2) \
768 BN_ULONG ta = (a)[i]; \
770 t2 = BN_UMULT_HIGH(ta, ta); \
772 t2 += (c0 < t1) ? 1 : 0; \
774 c2 += (c1 < t2) ? 1 : 0; \
777 #define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
/* Half-word fallback, built on mul64/sqr64 from above. */
779 #else /* !BN_LLONG */
780 #define mul_add_c(a, b, c0, c1, c2) \
785 mul64(t1, t2, bl, bh); \
786 c0 = (c0 + t1) & BN_MASK2; \
789 c1 = (c1 + t2) & BN_MASK2; \
/* Doubling is done by shifting (t2:t1) left one bit, with the elided
 * lines catching the bit that falls off the top into c2. */
793 #define mul_add_c2(a, b, c0, c1, c2) \
798 mul64(t1, t2, bl, bh); \
801 t2 = (t2 + t2) & BN_MASK2; \
804 t1 = (t1 + t1) & BN_MASK2; \
805 c0 = (c0 + t1) & BN_MASK2; \
806 if ((c0 < t1) && (((++t2) & BN_MASK2) == 0)) \
808 c1 = (c1 + t2) & BN_MASK2; \
812 #define sqr_add_c(a, i, c0, c1, c2) \
813 sqr64(t1, t2, (a)[i]); \
814 c0 = (c0 + t1) & BN_MASK2; \
817 c1 = (c1 + t2) & BN_MASK2; \
821 #define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
822 #endif /* !BN_LLONG */
/* r[0..15] = a[0..7] * b[0..7] via the comba (column-wise) method.
 * c1, c2, c3 rotate through the low/mid/high roles of the three-word
 * accumulator; each group below sums one output column k over all i+j==k,
 * and the r[k] store plus accumulator-clear between groups are elided
 * in this listing. */
824 void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
825 #if defined(BN_LLONG)
827 #elif !defined(BN_UMULT_LOHI) && !defined(BN_UMULT_HIGH)
/* column 0 */
836 mul_add_c(a[0], b[0], c1, c2, c3);
/* column 1 */
839 mul_add_c(a[0], b[1], c2, c3, c1);
840 mul_add_c(a[1], b[0], c2, c3, c1);
/* column 2 */
843 mul_add_c(a[2], b[0], c3, c1, c2);
844 mul_add_c(a[1], b[1], c3, c1, c2);
845 mul_add_c(a[0], b[2], c3, c1, c2);
/* column 3 */
848 mul_add_c(a[0], b[3], c1, c2, c3);
849 mul_add_c(a[1], b[2], c1, c2, c3);
850 mul_add_c(a[2], b[1], c1, c2, c3);
851 mul_add_c(a[3], b[0], c1, c2, c3);
/* column 4 */
854 mul_add_c(a[4], b[0], c2, c3, c1);
855 mul_add_c(a[3], b[1], c2, c3, c1);
856 mul_add_c(a[2], b[2], c2, c3, c1);
857 mul_add_c(a[1], b[3], c2, c3, c1);
858 mul_add_c(a[0], b[4], c2, c3, c1);
/* column 5 */
861 mul_add_c(a[0], b[5], c3, c1, c2);
862 mul_add_c(a[1], b[4], c3, c1, c2);
863 mul_add_c(a[2], b[3], c3, c1, c2);
864 mul_add_c(a[3], b[2], c3, c1, c2);
865 mul_add_c(a[4], b[1], c3, c1, c2);
866 mul_add_c(a[5], b[0], c3, c1, c2);
/* column 6 */
869 mul_add_c(a[6], b[0], c1, c2, c3);
870 mul_add_c(a[5], b[1], c1, c2, c3);
871 mul_add_c(a[4], b[2], c1, c2, c3);
872 mul_add_c(a[3], b[3], c1, c2, c3);
873 mul_add_c(a[2], b[4], c1, c2, c3);
874 mul_add_c(a[1], b[5], c1, c2, c3);
875 mul_add_c(a[0], b[6], c1, c2, c3);
/* column 7 */
878 mul_add_c(a[0], b[7], c2, c3, c1);
879 mul_add_c(a[1], b[6], c2, c3, c1);
880 mul_add_c(a[2], b[5], c2, c3, c1);
881 mul_add_c(a[3], b[4], c2, c3, c1);
882 mul_add_c(a[4], b[3], c2, c3, c1);
883 mul_add_c(a[5], b[2], c2, c3, c1);
884 mul_add_c(a[6], b[1], c2, c3, c1);
885 mul_add_c(a[7], b[0], c2, c3, c1);
/* column 8 */
888 mul_add_c(a[7], b[1], c3, c1, c2);
889 mul_add_c(a[6], b[2], c3, c1, c2);
890 mul_add_c(a[5], b[3], c3, c1, c2);
891 mul_add_c(a[4], b[4], c3, c1, c2);
892 mul_add_c(a[3], b[5], c3, c1, c2);
893 mul_add_c(a[2], b[6], c3, c1, c2);
894 mul_add_c(a[1], b[7], c3, c1, c2);
/* column 9 */
897 mul_add_c(a[2], b[7], c1, c2, c3);
898 mul_add_c(a[3], b[6], c1, c2, c3);
899 mul_add_c(a[4], b[5], c1, c2, c3);
900 mul_add_c(a[5], b[4], c1, c2, c3);
901 mul_add_c(a[6], b[3], c1, c2, c3);
902 mul_add_c(a[7], b[2], c1, c2, c3);
/* column 10 */
905 mul_add_c(a[7], b[3], c2, c3, c1);
906 mul_add_c(a[6], b[4], c2, c3, c1);
907 mul_add_c(a[5], b[5], c2, c3, c1);
908 mul_add_c(a[4], b[6], c2, c3, c1);
909 mul_add_c(a[3], b[7], c2, c3, c1);
/* column 11 */
912 mul_add_c(a[4], b[7], c3, c1, c2);
913 mul_add_c(a[5], b[6], c3, c1, c2);
914 mul_add_c(a[6], b[5], c3, c1, c2);
915 mul_add_c(a[7], b[4], c3, c1, c2);
/* column 12 */
918 mul_add_c(a[7], b[5], c1, c2, c3);
919 mul_add_c(a[6], b[6], c1, c2, c3);
920 mul_add_c(a[5], b[7], c1, c2, c3);
/* column 13 */
923 mul_add_c(a[6], b[7], c2, c3, c1);
924 mul_add_c(a[7], b[6], c2, c3, c1);
/* column 14 (column 15 is the leftover high accumulator word) */
927 mul_add_c(a[7], b[7], c3, c1, c2);
/* r[0..7] = a[0..3] * b[0..3] via the comba method; see bn_mul_comba8
 * for the accumulator rotation.  Stores of r[k] between column groups
 * are elided in this listing. */
932 void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
933 #if defined(BN_LLONG)
935 #elif !defined(BN_UMULT_LOHI) && !defined(BN_UMULT_HIGH)
/* column 0 */
944 mul_add_c(a[0], b[0], c1, c2, c3);
/* column 1 */
947 mul_add_c(a[0], b[1], c2, c3, c1);
948 mul_add_c(a[1], b[0], c2, c3, c1);
/* column 2 */
951 mul_add_c(a[2], b[0], c3, c1, c2);
952 mul_add_c(a[1], b[1], c3, c1, c2);
953 mul_add_c(a[0], b[2], c3, c1, c2);
/* column 3 */
956 mul_add_c(a[0], b[3], c1, c2, c3);
957 mul_add_c(a[1], b[2], c1, c2, c3);
958 mul_add_c(a[2], b[1], c1, c2, c3);
959 mul_add_c(a[3], b[0], c1, c2, c3);
/* column 4 */
962 mul_add_c(a[3], b[1], c2, c3, c1);
963 mul_add_c(a[2], b[2], c2, c3, c1);
964 mul_add_c(a[1], b[3], c2, c3, c1);
/* column 5 */
967 mul_add_c(a[2], b[3], c3, c1, c2);
968 mul_add_c(a[3], b[2], c3, c1, c2);
/* column 6 (column 7 is the leftover high accumulator word) */
971 mul_add_c(a[3], b[3], c1, c2, c3);
/* r[0..15] = a[0..7]^2 via the comba method.  Off-diagonal products
 * a[i]*a[j] (i != j) occur twice in a square, so they use the doubling
 * sqr_add_c2 macro; diagonal terms a[i]^2 use sqr_add_c.  r[k] stores
 * between column groups are elided in this listing. */
976 void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) {
977 #if defined(BN_LLONG)
979 #elif !defined(BN_UMULT_LOHI) && !defined(BN_UMULT_HIGH)
/* column 0 */
988 sqr_add_c(a, 0, c1, c2, c3);
/* column 1 */
991 sqr_add_c2(a, 1, 0, c2, c3, c1);
/* column 2 */
994 sqr_add_c(a, 1, c3, c1, c2);
995 sqr_add_c2(a, 2, 0, c3, c1, c2);
/* column 3 */
998 sqr_add_c2(a, 3, 0, c1, c2, c3);
999 sqr_add_c2(a, 2, 1, c1, c2, c3);
/* column 4 */
1002 sqr_add_c(a, 2, c2, c3, c1);
1003 sqr_add_c2(a, 3, 1, c2, c3, c1);
1004 sqr_add_c2(a, 4, 0, c2, c3, c1);
/* column 5 */
1007 sqr_add_c2(a, 5, 0, c3, c1, c2);
1008 sqr_add_c2(a, 4, 1, c3, c1, c2);
1009 sqr_add_c2(a, 3, 2, c3, c1, c2);
/* column 6 */
1012 sqr_add_c(a, 3, c1, c2, c3);
1013 sqr_add_c2(a, 4, 2, c1, c2, c3);
1014 sqr_add_c2(a, 5, 1, c1, c2, c3);
1015 sqr_add_c2(a, 6, 0, c1, c2, c3);
/* column 7 */
1018 sqr_add_c2(a, 7, 0, c2, c3, c1);
1019 sqr_add_c2(a, 6, 1, c2, c3, c1);
1020 sqr_add_c2(a, 5, 2, c2, c3, c1);
1021 sqr_add_c2(a, 4, 3, c2, c3, c1);
/* column 8 */
1024 sqr_add_c(a, 4, c3, c1, c2);
1025 sqr_add_c2(a, 5, 3, c3, c1, c2);
1026 sqr_add_c2(a, 6, 2, c3, c1, c2);
1027 sqr_add_c2(a, 7, 1, c3, c1, c2);
/* column 9 */
1030 sqr_add_c2(a, 7, 2, c1, c2, c3);
1031 sqr_add_c2(a, 6, 3, c1, c2, c3);
1032 sqr_add_c2(a, 5, 4, c1, c2, c3);
/* column 10 */
1035 sqr_add_c(a, 5, c2, c3, c1);
1036 sqr_add_c2(a, 6, 4, c2, c3, c1);
1037 sqr_add_c2(a, 7, 3, c2, c3, c1);
/* column 11 */
1040 sqr_add_c2(a, 7, 4, c3, c1, c2);
1041 sqr_add_c2(a, 6, 5, c3, c1, c2);
/* column 12 */
1044 sqr_add_c(a, 6, c1, c2, c3);
1045 sqr_add_c2(a, 7, 5, c1, c2, c3);
/* column 13 */
1048 sqr_add_c2(a, 7, 6, c2, c3, c1);
/* column 14 (column 15 is the leftover high accumulator word) */
1051 sqr_add_c(a, 7, c3, c1, c2);
/* r[0..7] = a[0..3]^2 via the comba method; doubled off-diagonal terms
 * use sqr_add_c2, diagonal squares use sqr_add_c (see bn_sqr_comba8).
 * r[k] stores between column groups are elided in this listing. */
1056 void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) {
1057 #if defined(BN_LLONG)
1059 #elif !defined(BN_UMULT_LOHI) && !defined(BN_UMULT_HIGH)
1063 BN_ULONG c1, c2, c3;
/* column 0 */
1068 sqr_add_c(a, 0, c1, c2, c3);
/* column 1 */
1071 sqr_add_c2(a, 1, 0, c2, c3, c1);
/* column 2 */
1074 sqr_add_c(a, 1, c3, c1, c2);
1075 sqr_add_c2(a, 2, 0, c3, c1, c2);
/* column 3 */
1078 sqr_add_c2(a, 3, 0, c1, c2, c3);
1079 sqr_add_c2(a, 2, 1, c1, c2, c3);
/* column 4 */
1082 sqr_add_c(a, 2, c2, c3, c1);
1083 sqr_add_c2(a, 3, 1, c2, c3, c1);
/* column 5 */
1086 sqr_add_c2(a, 3, 2, c3, c1, c2);
/* column 6 (column 7 is the leftover high accumulator word) */
1089 sqr_add_c(a, 3, c1, c2, c3);
1094 #if defined(OPENSSL_NO_ASM) || (!defined(OPENSSL_ARM) && !defined(OPENSSL_X86_64))
1095 /* This is essentially reference implementation, which may or may not
1096 * result in performance improvement. E.g. on IA-32 this routine was
1097 * observed to give 40% faster rsa1024 private key operations and 10%
1098 * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only
1099 * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a
1100 * reference implementation, one to be used as starting point for
1101 * platform-specific assembler. Mentioned numbers apply to compiler
1102 * generated code compiled with and without -DOPENSSL_BN_ASM_MONT and
1103 * can vary not only from platform to platform, but even for compiler
1104 * versions. Assembler vs. assembler improvement coefficients can
1105 * [and are known to] differ and are to be documented elsewhere. */
1106 int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
1107 const BN_ULONG *np, const BN_ULONG *n0p, int num) {
1108 BN_ULONG c0, c1, ml, *tp, n0;
1112 volatile BN_ULONG *vp;
1115 #if 0 /* template for platform-specific implementation */
1116 if (ap==bp) return bn_sqr_mont(rp,ap,np,n0p,num);
1118 vp = tp = alloca((num + 2) * sizeof(BN_ULONG));
1127 for (j = 0; j < num; ++j)
1128 mul(tp[j], ap[j], ml, mh, c0);
1130 for (j = 0; j < num; ++j)
1131 mul(tp[j], ap[j], ml, c0);
1138 for (i = 0; i < num; i++) {
1144 for (j = 0; j < num; ++j)
1145 mul_add(tp[j], ap[j], ml, mh, c0);
1147 for (j = 0; j < num; ++j)
1148 mul_add(tp[j], ap[j], ml, c0);
1150 c1 = (tp[num] + c0) & BN_MASK2;
1152 tp[num + 1] = (c1 < c0 ? 1 : 0);
1155 ml = (c1 * n0) & BN_MASK2;
1160 mul_add(c1, np[0], ml, mh, c0);
1162 mul_add(c1, ml, np[0], c0);
1164 for (j = 1; j < num; j++) {
1167 mul_add(c1, np[j], ml, mh, c0);
1169 mul_add(c1, ml, np[j], c0);
1171 tp[j - 1] = c1 & BN_MASK2;
1173 c1 = (tp[num] + c0) & BN_MASK2;
1175 tp[num] = tp[num + 1] + (c1 < c0 ? 1 : 0);
1178 if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
1179 c0 = bn_sub_words(rp, tp, np, num);
1180 if (tp[num] != 0 || c0 == 0) {
1181 for (i = 0; i < num + 2; i++)
1186 for (i = 0; i < num; i++)
1187 rp[i] = tp[i], vp[i] = 0;