1 dnl IA-64 mpn_mul_2 -- Multiply an n-limb number with a 2-limb number and
2 dnl store the result to a (n+1)-limb number.
4 dnl Copyright 2004 Free Software Foundation, Inc.
6 dnl This file is part of the GNU MP Library.
8 dnl The GNU MP Library is free software; you can redistribute it and/or modify
9 dnl it under the terms of the GNU Lesser General Public License as published
10 dnl by the Free Software Foundation; either version 3 of the License, or (at
11 dnl your option) any later version.
13 dnl The GNU MP Library is distributed in the hope that it will be useful, but
14 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16 dnl License for more details.
18 dnl You should have received a copy of the GNU Lesser General Public License
19 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
21 include(`../config.m4')
27 C Note that this is very similar to addmul_2.asm. If you change this file,
28 C please change that file too.
31 C * Clean up variable names, and try to decrease the number of distinct
33 C * Clean up feed-in code to not require zeroing several registers.
34 C * Make sure we don't depend on uninitialized predicate registers.
35 C * We currently cross-jump very aggressively, at the expense of a few cycles
36 C per operation. Consider changing that.
37 C * Could perhaps save a few cycles by using 1 c/l carry propagation in
39 C * Ultimately rewrite. The problem with this code is that it first uses a
40 C loaded u value in one xma pair, then leaves it live over several unrelated
41 C xma pairs, before it uses it again. It should actually be quite possible
42 C to just swap some aligned xma pairs around. But we should then schedule
43 C u loads further from the first use.
C Symbolic names for the registers used below.  Every family has four
C members, _0 to _3, that the unrolled code cycles through.  In the feed-in
C code and the main loop of this file they are used as follows:
C   fp0b_k = xma.l  result of u * v0 + f0
C   fp1a_k = xma.hu result of u * v0 + f0
C   fp1b_k = xma.l  result of u * v1 + fp1a_k
C   fp2a_k = xma.hu result of u * v1 + fp1a_k
C   pr0_k, pr1_k and acc1_k are the general registers that receive fp0b_k,
C   fp1b_k and fp2a_k through getf.sig.
C   u_k is a floating-point register loaded with ldf8 from [up].
C General registers r16 to r27.
59 define(`pr0_0',`r16') define(`pr0_1',`r17')
60 define(`pr0_2',`r18') define(`pr0_3',`r19')
62 define(`pr1_0',`r20') define(`pr1_1',`r21')
63 define(`pr1_2',`r22') define(`pr1_3',`r23')
65 define(`acc1_0',`r24') define(`acc1_1',`r25')
66 define(`acc1_2',`r26') define(`acc1_3',`r27')
C Floating-point registers f8 to f15 and f32 to f39 hold the products.
73 define(`fp0b_0',`f8') define(`fp0b_1',`f9')
74 define(`fp0b_2',`f10') define(`fp0b_3',`f11')
76 define(`fp1a_0',`f12') define(`fp1a_1',`f13')
77 define(`fp1a_2',`f14') define(`fp1a_3',`f15')
79 define(`fp1b_0',`f32') define(`fp1b_1',`f33')
80 define(`fp1b_2',`f34') define(`fp1b_3',`f35')
82 define(`fp2a_0',`f36') define(`fp2a_1',`f37')
83 define(`fp2a_2',`f38') define(`fp2a_3',`f39')
C Floating-point registers f44 to f47 hold the loaded u limbs.
85 define(`u_0',`f44') define(`u_1',`f45')
86 define(`u_2',`f46') define(`u_3',`f47')
C Entry sequence: adjust the pointer arguments, start the first loads, save
C ar.lc and dispatch on r14.
C The m4 open-quote at the start of the first addp4 line belongs to a macro
C call that is not visible in this excerpt (presumably a conditional for the
C 32-bit ABI -- TODO confirm).
98 ` addp4 rp = 0, rp C M I
99 addp4 up = 0, up C M I
100 addp4 vp = 0, vp C M I
C ux and uy are the first two loads from up and v0 is the first load from vp.
C Each load post-increments its pointer by 8 bytes.
105 ldf8 ux = [up], 8 C M
106 ldf8 v0 = [vp], 8 C M
C Keep the incoming ar.lc in r2 (presumably restored before return; the
C restore is outside this excerpt).
107 mov.i r2 = ar.lc C I0
114 ldf8 uy = [up], 8 C M
C NOTE(review): the instruction that sets r14 is not visible here; presumably
C it holds n mod 4 -- confirm.
119 cmp.eq p10, p0 = 1, r14 C M I
120 cmp.eq p11, p0 = 2, r14 C M I
124 cmp.eq p12, p0 = 3, r14 C M I
C r14 = 1, 2, 3 branch to .Lb01, .Lb10, .Lb11; any other value falls through
C to .Lb00.
127 (p10) br.dptk .Lb01 C B
128 (p11) br.dptk .Lb10 C B
129 (p12) br.dptk .Lb11 C B
C Feed-in for the fall-through case of the r14 dispatch.  Starts the product
C pipeline with ux in slot _3, uy in slot _0, u_1 in slot _1 and u_2 in
C slot _2.
C NOTE(review): the loads of v1 and u_2, and the branch that selects between
C the code here and the copy at .grt4, are not visible in this excerpt.
134 .Lb00: ldf8 u_1 = [up], 8
C r0 is never unequal to itself, so p8 and p12 are cleared while p9 and p13
C are set.  The main loop adds an extra 1 under p8 and p12, so this means no
C carry is pending.
138 cmp.ne p8, p9 = r0, r0
140 xma.l fp0b_3 = ux, v0, f0
141 cmp.ne p12, p13 = r0, r0
143 xma.hu fp1a_3 = ux, v0, f0
146 xma.l fp0b_0 = uy, v0, f0
147 xma.hu fp1a_0 = uy, v0, f0
C The first low product word goes straight into acc0.
149 getf.sig acc0 = fp0b_3
150 xma.l fp1b_3 = ux, v1, fp1a_3
151 xma.hu fp2a_3 = ux, v1, fp1a_3
153 xma.l fp0b_1 = u_1, v0, f0
154 xma.hu fp1a_1 = u_1, v0, f0
156 getf.sig pr0_0 = fp0b_0
157 xma.l fp1b_0 = uy, v1, fp1a_0
158 xma.hu fp2a_0 = uy, v1, fp1a_0
160 getf.sig pr1_3 = fp1b_3
161 getf.sig acc1_3 = fp2a_3
162 xma.l fp0b_2 = u_2, v0, f0
163 xma.hu fp1a_2 = u_2, v0, f0
C Second copy of the same steps, entered at .grt4.
166 .grt4: xma.l fp0b_0 = uy, v0, f0
167 xma.hu fp1a_0 = uy, v0, f0
169 getf.sig acc0 = fp0b_3
170 xma.l fp1b_3 = ux, v1, fp1a_3
172 xma.hu fp2a_3 = ux, v1, fp1a_3
174 xma.l fp0b_1 = u_1, v0, f0
175 xma.hu fp1a_1 = u_1, v0, f0
177 getf.sig pr0_0 = fp0b_0
178 xma.l fp1b_0 = uy, v1, fp1a_0
179 xma.hu fp2a_0 = uy, v1, fp1a_0
182 getf.sig pr1_3 = fp1b_3
184 getf.sig acc1_3 = fp2a_3
185 xma.l fp0b_2 = u_2, v0, f0
186 xma.hu fp1a_2 = u_2, v0, f0
C Feed-in for r14 = 1.  Starts the product pipeline with ux in slot _2, uy in
C slot _3, u_0 in slot _0 and u_1 in slot _1, and loads u_0, u_1 and u_2.
C NOTE(review): the load of v1 and the branch that selects between the code
C here and the copy at .grt5 are not visible in this excerpt.
191 .Lb01: ldf8 u_0 = [up], 8 C M
C r0 is never unequal to itself, so p6 and p10 are cleared while p7 and p11
C are set.  The main loop adds an extra 1 under p6 and p10, so this means no
C carry is pending.
195 cmp.ne p6, p7 = r0, r0 C M I
197 xma.l fp0b_2 = ux, v0, f0 C F
198 cmp.ne p10, p11 = r0, r0 C M I
199 ldf8 u_1 = [up], 8 C M
200 xma.hu fp1a_2 = ux, v0, f0 C F
202 xma.l fp0b_3 = uy, v0, f0 C F
203 xma.hu fp1a_3 = uy, v0, f0 C F
C The first low product word goes straight into acc0.
205 getf.sig acc0 = fp0b_2 C M
206 xma.l fp1b_2 = ux, v1,fp1a_2 C F
207 xma.hu fp2a_2 = ux, v1,fp1a_2 C F
208 ldf8 u_2 = [up], 8 C M
211 xma.l fp0b_0 = u_0, v0, f0 C F
212 xma.hu fp1a_0 = u_0, v0, f0 C F
214 getf.sig pr0_3 = fp0b_3 C M
215 xma.l fp1b_3 = uy, v1,fp1a_3 C F
216 xma.hu fp2a_3 = uy, v1,fp1a_3 C F
218 getf.sig pr1_2 = fp1b_2 C M
219 getf.sig acc1_2 = fp2a_2 C M
220 xma.l fp0b_1 = u_1, v0, f0 C F
221 xma.hu fp1a_1 = u_1, v0, f0 C F
C Second copy of the tail of the same steps, entered at .grt5.
224 .grt5: xma.l fp0b_0 = u_0, v0, f0
225 xma.hu fp1a_0 = u_0, v0, f0
227 getf.sig pr0_3 = fp0b_3
228 xma.l fp1b_3 = uy, v1, fp1a_3
229 xma.hu fp2a_3 = uy, v1, fp1a_3
232 getf.sig pr1_2 = fp1b_2
234 getf.sig acc1_2 = fp2a_2
235 xma.l fp0b_1 = u_1, v0, f0
236 xma.hu fp1a_1 = u_1, v0, f0
240 C We have two variants for n = 2. They turn out to run at exactly the same
241 C speed. But the first, odd variant might allow one cycle to be trimmed.
C First n = 2 variant.  All four 64 x 64 products of ux and uy with v0 and v1
C are formed with a zero addend (f0), the words are moved to r16 to r21 and
C then summed with explicit carries.
C NOTE(review): the label, the construct that selects between the two
C variants, and the adds that produce r24 and r26 are not visible in this
C excerpt.
249 xma.l fp0b_1 = ux, v0, f0 C 0
250 xma.hu fp1a_1 = ux, v0, f0 C 1
252 xma.l fp0b_2 = uy, v0, f0 C 1
253 xma.l fp1b_1 = ux, v1, f0 C 1
255 xma.hu fp1a_2 = uy, v0, f0 C 2
256 xma.hu fp2a_1 = ux, v1, f0 C 2
258 xma.l fp1b_2 = uy, v1, f0 C 2
259 xma.hu fp2a_2 = uy, v1, f0 C 3
261 getf.sig r16 = fp1a_1
C The low word of ux * v0 is stored directly from the FP register.
262 stf8 [rp] = fp0b_1, 8
264 getf.sig r17 = fp0b_2
266 getf.sig r18 = fp1b_1
268 getf.sig r19 = fp1a_2
270 getf.sig r20 = fp2a_1
272 getf.sig r21 = fp1b_2
C p6 and p8 are set when the compared sum is below the second operand,
C i.e. when the corresponding addition wrapped.
277 cmp.ltu p6, p7 = r24, r16
280 cmp.ltu p8, p9 = r26, r24
C Add with carry in: the form with the trailing 1 is used when the carry
C predicate is set.
283 (p6) add r25 = r19, r20, 1
284 (p7) add r25 = r19, r20
286 (p8) add r27 = r25, r21, 1
287 (p9) add r27 = r25, r21
C Carry out: after a + b + 1 the sum wrapped if it is <= an addend, after
C a + b if it is < an addend.
288 (p6) cmp.leu p10, p0 = r25, r19
289 (p7) cmp.ltu p10, p0 = r25, r19
292 (p8) cmp.leu p12, p0 = r27, r25
293 (p9) cmp.ltu p12, p0 = r27, r25
C Second n = 2 variant.  Here the high word of each v0 product is used as
C the addend of the matching v1 product, so fewer integer additions remain.
C NOTE(review): the instructions that compute s0 and r31 are not visible in
C this excerpt.
306 xma.l fp0b_1 = ux, v0, f0
307 xma.hu fp1a_1 = ux, v0, f0
309 xma.l fp0b_2 = uy, v0, f0
310 xma.hu fp1a_2 = uy, v0, f0
314 stf8 [rp] = fp0b_1, 8
315 xma.l fp1b_1 = ux, v1, fp1a_1
316 xma.hu fp2a_1 = ux, v1, fp1a_1
318 getf.sig acc0 = fp0b_2
319 xma.l fp1b_2 = uy, v1, fp1a_2
320 xma.hu fp2a_2 = uy, v1, fp1a_2
324 getf.sig pr1_1 = fp1b_1
326 getf.sig acc1_1 = fp2a_1
328 getf.sig pr1_2 = fp1b_2
C p8 is set when s0 is below pr1_1, i.e. when the addition that formed s0
C wrapped.
336 cmp.ltu p8, p9 = s0, pr1_1
339 .pred.rel "mutex", p8, p9
340 (p8) add acc0 = pr1_2, acc1_1, 1
341 (p9) add acc0 = pr1_2, acc1_1
C The carry out of the addition above is derived from r31 and pr1_2 rather
C than from the sum itself.
342 (p8) cmp.leu p10, p0 = r31, pr1_2
343 (p9) cmp.ltu p10, p0 = r31, pr1_2
C Feed-in entered at .grt2.  Starts the product pipeline with ux in slot _1,
C uy in slot _2, u_3 in slot _3 and u_0 in slot _0.
C NOTE(review): presumably reached from the r14 = 2 path when n is larger
C than 2 -- confirm.  The branch to this label and the loads of v1 and u_0
C are not visible in this excerpt.
351 .grt2: ldf8 u_3 = [up], 8
356 xma.l fp0b_1 = ux, v0, f0
358 xma.hu fp1a_1 = ux, v0, f0
360 xma.l fp0b_2 = uy, v0, f0
361 xma.hu fp1a_2 = uy, v0, f0
C The first low product word goes straight into acc0.
363 getf.sig acc0 = fp0b_1
364 xma.l fp1b_1 = ux, v1, fp1a_1
365 xma.hu fp2a_1 = ux, v1, fp1a_1
368 xma.l fp0b_3 = u_3, v0, f0
369 xma.hu fp1a_3 = u_3, v0, f0
371 getf.sig pr0_2 = fp0b_2
372 xma.l fp1b_2 = uy, v1, fp1a_2
373 xma.hu fp2a_2 = uy, v1, fp1a_2
376 getf.sig pr1_1 = fp1b_1
378 getf.sig acc1_1 = fp2a_1
379 xma.l fp0b_0 = u_0, v0, f0
C r0 is never unequal to itself, so p8 and p12 are cleared while p9 and p13
C are set.  The main loop adds an extra 1 under p8 and p12, so this means no
C carry is pending.
380 cmp.ne p8, p9 = r0, r0
381 cmp.ne p12, p13 = r0, r0
382 xma.hu fp1a_0 = u_0, v0, f0
C Feed-in for r14 = 3.  Starts the product pipeline with ux in slot _0, uy in
C slot _1, u_2 in slot _2 and (in the .grt3 copy) u_3 in slot _3.
C acc1_3 is zeroed because no product has written it yet on this path.
C NOTE(review): the loads of v1, u_2 and u_3 and the branch that selects
C between the code here and the copy at .grt3 are not visible in this excerpt.
387 .Lb11: mov acc1_3 = 0
C r0 is never unequal to itself, so p6 and p10 are cleared while p7 and p11
C are set.  The main loop adds an extra 1 under p6 and p10, so this means no
C carry is pending.
390 cmp.ne p6, p7 = r0, r0
395 xma.l fp0b_0 = ux, v0, f0
396 xma.hu fp1a_0 = ux, v0, f0
398 cmp.ne p10, p11 = r0, r0
399 xma.l fp0b_1 = uy, v0, f0
400 xma.hu fp1a_1 = uy, v0, f0
C The first low product word goes straight into acc0.
402 getf.sig acc0 = fp0b_0
403 xma.l fp1b_0 = ux, v1, fp1a_0
404 xma.hu fp2a_0 = ux, v1, fp1a_0
406 xma.l fp0b_2 = u_2, v0, f0
407 xma.hu fp1a_2 = u_2, v0, f0
409 getf.sig pr0_1 = fp0b_1
410 xma.l fp1b_1 = uy, v1, fp1a_1
411 xma.hu fp2a_1 = uy, v1, fp1a_1
413 getf.sig pr1_0 = fp1b_0
414 getf.sig acc1_0 = fp2a_0
C Second copy of the same steps, entered at .grt3, which also starts the
C u_3 products.
417 .grt3: xma.l fp0b_0 = ux, v0, f0
418 cmp.ne p10, p11 = r0, r0
420 xma.hu fp1a_0 = ux, v0, f0
422 xma.l fp0b_1 = uy, v0, f0
423 xma.hu fp1a_1 = uy, v0, f0
425 getf.sig acc0 = fp0b_0
426 xma.l fp1b_0 = ux, v1, fp1a_0
428 xma.hu fp2a_0 = ux, v1, fp1a_0
430 xma.l fp0b_2 = u_2, v0, f0
431 xma.hu fp1a_2 = u_2, v0, f0
433 getf.sig pr0_1 = fp0b_1
434 xma.l fp1b_1 = uy, v1, fp1a_1
435 xma.hu fp2a_1 = uy, v1, fp1a_1
438 getf.sig pr1_0 = fp1b_0
440 getf.sig acc1_0 = fp2a_0
441 xma.l fp0b_3 = u_3, v0, f0
442 xma.hu fp1a_3 = u_3, v0, f0
446 C *** MAIN LOOP START ***
C Main loop, unrolled four ways.  The .Loop label targeted by the br.cloop
C at the bottom is not visible in this excerpt.
C Every quarter performs the same steps on rotating registers:
C   1. getf.sig moves finished product words to general registers and new
C      xma pairs are started for later quarters.
C   2. s0 = pr1_x + acc0 + carry, carry held in p12/p13 or p10/p11.
C   3. acc0 = pr0_x + acc1_y + carry, carry held in p6/p7 or p8/p9.
C In each predicate pair the first register means carry and the second means
C no carry.  After a + b + 1 the carry out is detected with cmp.leu against
C an addend, after a + b with cmp.ltu.
C The .pred.rel mutex annotations tell the assembler that the two predicates
C of a pair are never both set.
C NOTE(review): the u_k loads and any store of s0 are not visible in this
C excerpt.
C Quarter 1: s0 uses pr1_0, the new acc0 uses pr0_2 and acc1_0.
449 .pred.rel "mutex", p12, p13
450 getf.sig pr0_3 = fp0b_3
451 xma.l fp1b_3 = u_3, v1, fp1a_3
452 (p12) add s0 = pr1_0, acc0, 1
453 (p13) add s0 = pr1_0, acc0
454 xma.hu fp2a_3 = u_3, v1, fp1a_3
456 .pred.rel "mutex", p8, p9
457 .pred.rel "mutex", p12, p13
459 getf.sig pr1_2 = fp1b_2
460 (p8) cmp.leu p6, p7 = acc0, pr0_1
461 (p9) cmp.ltu p6, p7 = acc0, pr0_1
462 (p12) cmp.leu p10, p11 = s0, pr1_0
463 (p13) cmp.ltu p10, p11 = s0, pr1_0
465 .pred.rel "mutex", p6, p7
466 getf.sig acc1_2 = fp2a_2
468 xma.l fp0b_1 = u_1, v0, f0
469 (p6) add acc0 = pr0_2, acc1_0, 1
470 (p7) add acc0 = pr0_2, acc1_0
471 xma.hu fp1a_1 = u_1, v0, f0
C Quarter 2: s0 uses pr1_1, the new acc0 uses pr0_3 and acc1_1.
474 .pred.rel "mutex", p10, p11
475 getf.sig pr0_0 = fp0b_0
476 xma.l fp1b_0 = u_0, v1, fp1a_0
477 (p10) add s0 = pr1_1, acc0, 1
478 (p11) add s0 = pr1_1, acc0
479 xma.hu fp2a_0 = u_0, v1, fp1a_0
481 .pred.rel "mutex", p6, p7
482 .pred.rel "mutex", p10, p11
484 getf.sig pr1_3 = fp1b_3
485 (p6) cmp.leu p8, p9 = acc0, pr0_2
486 (p7) cmp.ltu p8, p9 = acc0, pr0_2
487 (p10) cmp.leu p12, p13 = s0, pr1_1
488 (p11) cmp.ltu p12, p13 = s0, pr1_1
490 .pred.rel "mutex", p8, p9
491 getf.sig acc1_3 = fp2a_3
493 xma.l fp0b_2 = u_2, v0, f0
494 (p8) add acc0 = pr0_3, acc1_1, 1
495 (p9) add acc0 = pr0_3, acc1_1
496 xma.hu fp1a_2 = u_2, v0, f0
C Quarter 3: s0 uses pr1_2, the new acc0 uses pr0_0 and acc1_2.
499 .pred.rel "mutex", p12, p13
500 getf.sig pr0_1 = fp0b_1
501 xma.l fp1b_1 = u_1, v1, fp1a_1
502 (p12) add s0 = pr1_2, acc0, 1
503 (p13) add s0 = pr1_2, acc0
504 xma.hu fp2a_1 = u_1, v1, fp1a_1
506 .pred.rel "mutex", p8, p9
507 .pred.rel "mutex", p12, p13
509 getf.sig pr1_0 = fp1b_0
510 (p8) cmp.leu p6, p7 = acc0, pr0_3
511 (p9) cmp.ltu p6, p7 = acc0, pr0_3
512 (p12) cmp.leu p10, p11 = s0, pr1_2
513 (p13) cmp.ltu p10, p11 = s0, pr1_2
515 .pred.rel "mutex", p6, p7
516 getf.sig acc1_0 = fp2a_0
518 xma.l fp0b_3 = u_3, v0, f0
519 (p6) add acc0 = pr0_0, acc1_2, 1
520 (p7) add acc0 = pr0_0, acc1_2
521 xma.hu fp1a_3 = u_3, v0, f0
C Quarter 4: s0 uses pr1_3, the new acc0 uses pr0_1 and acc1_3.
524 .pred.rel "mutex", p10, p11
525 getf.sig pr0_2 = fp0b_2
526 xma.l fp1b_2 = u_2, v1, fp1a_2
527 (p10) add s0 = pr1_3, acc0, 1
528 (p11) add s0 = pr1_3, acc0
529 xma.hu fp2a_2 = u_2, v1, fp1a_2
531 .pred.rel "mutex", p6, p7
532 .pred.rel "mutex", p10, p11
534 getf.sig pr1_1 = fp1b_1
535 (p6) cmp.leu p8, p9 = acc0, pr0_0
536 (p7) cmp.ltu p8, p9 = acc0, pr0_0
537 (p10) cmp.leu p12, p13 = s0, pr1_3
538 (p11) cmp.ltu p12, p13 = s0, pr1_3
540 .pred.rel "mutex", p8, p9
541 getf.sig acc1_1 = fp2a_1
543 xma.l fp0b_0 = u_0, v0, f0
544 (p8) add acc0 = pr0_1, acc1_3, 1
545 (p9) add acc0 = pr0_1, acc1_3
546 xma.hu fp1a_0 = u_0, v0, f0
C Counted loop branch: repeats while ar.lc is nonzero, decrementing it.
547 .LL10: br.cloop.dptk .Loop C 12
549 C *** MAIN LOOP END ***
C Wind-down after the main loop.  The same add-with-carry steps as in the
C loop continue while the products already in flight drain.  No new u * v0
C products are started after fp0b_2 and fp1a_2, and the last u * v1 products
C are fp1b_2 and fp2a_2.
C In each predicate pair the first register means carry and the second means
C no carry.  After a + b + 1 the carry out is detected with cmp.leu against
C an addend, after a + b with cmp.ltu.
C NOTE(review): this block continues past the end of the excerpt, and any
C stores of s0 and acc0 and the return sequence are not visible here.
C Step 1: s0 uses pr1_0, the new acc0 uses pr0_2 and acc1_0.
552 .pred.rel "mutex", p12, p13
553 getf.sig pr0_3 = fp0b_3
554 xma.l fp1b_3 = u_3, v1, fp1a_3
555 (p12) add s0 = pr1_0, acc0, 1
556 (p13) add s0 = pr1_0, acc0
557 xma.hu fp2a_3 = u_3, v1, fp1a_3
559 .pred.rel "mutex", p8, p9
560 .pred.rel "mutex", p12, p13
561 getf.sig pr1_2 = fp1b_2
562 (p8) cmp.leu p6, p7 = acc0, pr0_1
563 (p9) cmp.ltu p6, p7 = acc0, pr0_1
564 (p12) cmp.leu p10, p11 = s0, pr1_0
565 (p13) cmp.ltu p10, p11 = s0, pr1_0
567 .pred.rel "mutex", p6, p7
568 getf.sig acc1_2 = fp2a_2
570 xma.l fp0b_1 = u_1, v0, f0
571 (p6) add acc0 = pr0_2, acc1_0, 1
572 (p7) add acc0 = pr0_2, acc1_0
573 xma.hu fp1a_1 = u_1, v0, f0
C Step 2: s0 uses pr1_1, the new acc0 uses pr0_3 and acc1_1.
576 .pred.rel "mutex", p10, p11
577 getf.sig pr0_0 = fp0b_0
578 xma.l fp1b_0 = u_0, v1, fp1a_0
579 (p10) add s0 = pr1_1, acc0, 1
580 (p11) add s0 = pr1_1, acc0
581 xma.hu fp2a_0 = u_0, v1, fp1a_0
583 .pred.rel "mutex", p6, p7
584 .pred.rel "mutex", p10, p11
585 getf.sig pr1_3 = fp1b_3
586 (p6) cmp.leu p8, p9 = acc0, pr0_2
587 (p7) cmp.ltu p8, p9 = acc0, pr0_2
588 (p10) cmp.leu p12, p13 = s0, pr1_1
589 (p11) cmp.ltu p12, p13 = s0, pr1_1
591 .pred.rel "mutex", p8, p9
592 getf.sig acc1_3 = fp2a_3
594 xma.l fp0b_2 = u_2, v0, f0
595 (p8) add acc0 = pr0_3, acc1_1, 1
596 (p9) add acc0 = pr0_3, acc1_1
597 xma.hu fp1a_2 = u_2, v0, f0
C Step 3: s0 uses pr1_2, the new acc0 uses pr0_0 and acc1_2.  From here on
C no further u * v0 products are started.
600 .pred.rel "mutex", p12, p13
601 getf.sig pr0_1 = fp0b_1
602 xma.l fp1b_1 = u_1, v1, fp1a_1
603 (p12) add s0 = pr1_2, acc0, 1
604 (p13) add s0 = pr1_2, acc0
605 xma.hu fp2a_1 = u_1, v1, fp1a_1
607 .pred.rel "mutex", p8, p9
608 .pred.rel "mutex", p12, p13
609 getf.sig pr1_0 = fp1b_0
610 (p8) cmp.leu p6, p7 = acc0, pr0_3
611 (p9) cmp.ltu p6, p7 = acc0, pr0_3
612 (p12) cmp.leu p10, p11 = s0, pr1_2
613 (p13) cmp.ltu p10, p11 = s0, pr1_2
615 .pred.rel "mutex", p6, p7
616 getf.sig acc1_0 = fp2a_0
618 (p6) add acc0 = pr0_0, acc1_2, 1
619 (p7) add acc0 = pr0_0, acc1_2
C Step 4: s0 uses pr1_3, the new acc0 uses pr0_1 and acc1_3.  The last
C u * v1 products (slot _2) are started here.
622 .pred.rel "mutex", p10, p11
623 getf.sig pr0_2 = fp0b_2
624 xma.l fp1b_2 = u_2, v1, fp1a_2
625 (p10) add s0 = pr1_3, acc0, 1
626 (p11) add s0 = pr1_3, acc0
627 xma.hu fp2a_2 = u_2, v1, fp1a_2
629 .pred.rel "mutex", p6, p7
630 .pred.rel "mutex", p10, p11
631 getf.sig pr1_1 = fp1b_1
632 (p6) cmp.leu p8, p9 = acc0, pr0_0
633 (p7) cmp.ltu p8, p9 = acc0, pr0_0
634 (p10) cmp.leu p12, p13 = s0, pr1_3
635 (p11) cmp.ltu p12, p13 = s0, pr1_3
637 .pred.rel "mutex", p8, p9
638 getf.sig acc1_1 = fp2a_1
640 (p8) add acc0 = pr0_1, acc1_3, 1
641 (p9) add acc0 = pr0_1, acc1_3
C Step 5: s0 uses pr1_0, the new acc0 uses pr0_2 and acc1_0.  Only integer
C work and the final getf.sig moves remain.
643 .pred.rel "mutex", p12, p13
644 (p12) add s0 = pr1_0, acc0, 1
645 (p13) add s0 = pr1_0, acc0
647 .pred.rel "mutex", p8, p9
648 .pred.rel "mutex", p12, p13
649 getf.sig pr1_2 = fp1b_2
650 (p8) cmp.leu p6, p7 = acc0, pr0_1
651 (p9) cmp.ltu p6, p7 = acc0, pr0_1
652 (p12) cmp.leu p10, p11 = s0, pr1_0
653 (p13) cmp.ltu p10, p11 = s0, pr1_0
655 .pred.rel "mutex", p6, p7
656 getf.sig acc1_2 = fp2a_2
658 (p6) add acc0 = pr0_2, acc1_0, 1
659 (p7) add acc0 = pr0_2, acc1_0
C Step 6: s0 uses pr1_1.
661 .pred.rel "mutex", p10, p11
662 (p10) add s0 = pr1_1, acc0, 1
663 (p11) add s0 = pr1_1, acc0
665 .pred.rel "mutex", p6, p7
666 .pred.rel "mutex", p10, p11
667 (p6) cmp.leu p8, p9 = acc0, pr0_2
668 (p7) cmp.ltu p8, p9 = acc0, pr0_2
669 (p10) cmp.leu p12, p13 = s0, pr1_1
670 (p11) cmp.ltu p12, p13 = s0, pr1_1
C Last acc0 sum: there is no further u * v0 low word, so pr1_2 is added to
C acc1_1 with the p8 carry.
672 .pred.rel "mutex", p8, p9
674 (p8) add acc0 = pr1_2, acc1_1, 1
675 (p9) add acc0 = pr1_2, acc1_1
C p10 records the carry out of that sum.  The carry still pending in p12 is
C then folded into acc0 as a plain increment.
677 .pred.rel "mutex", p8, p9
678 (p8) cmp.leu p10, p11 = acc0, pr1_2
679 (p9) cmp.ltu p10, p11 = acc0, pr1_2
680 (p12) add acc0 = 1, acc0
C If acc0 is zero after the increment, the increment wrapped, so p10 is set
C as well.
683 (p12) cmp.eq.or p10, p0 = 0, acc0
686 .pred.rel "mutex", p10, p11