1 dnl IA-64 mpn_addmul_2 -- Multiply an n-limb number with a 2-limb number and
2 dnl add the result to an (n+1)-limb number.
4 dnl Copyright 2004, 2005 Free Software Foundation, Inc.
6 dnl This file is part of the GNU MP Library.
8 dnl The GNU MP Library is free software; you can redistribute it and/or modify
9 dnl it under the terms of the GNU Lesser General Public License as published
10 dnl by the Free Software Foundation; either version 3 of the License, or (at
11 dnl your option) any later version.
13 dnl The GNU MP Library is distributed in the hope that it will be useful, but
14 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16 dnl License for more details.
18 dnl You should have received a copy of the GNU Lesser General Public License
19 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
21 include(`../config.m4')
27 C Note that this is very similar to mul_2.asm. If you change this file,
28 C please change that file too.
31 C * Clean up variable names, and try to decrease the number of distinct registers in use.
33 C * Cleanup feed-in code to not require zeroing several registers.
34 C * Make sure we don't depend on uninitialized predicate registers.
35 C * We currently cross-jump very aggressively, at the expense of a few cycles
36 C per operation. Consider changing that.
37 C * Could perhaps save a few cycles by using 1 c/l carry propagation in the wind-down code.
39 C * Ultimately rewrite. The problem with this code is that it first uses a
40 C loaded u value in one xma pair, then leaves it live over several unrelated
41 C xma pairs, before it uses it again. It should actually be quite possible
42 C to just swap some aligned xma pairs around. But we should then schedule
43 C u loads further from the first use.
C Symbolic register names.  Each family has four members, suffixed _0 to _3.
C
C General registers r16-r27.  In the code below these are written by getf.sig
C from the floating-point product registers:
C   pr0_N  receives fp0b_N
C   pr1_N  receives fp1b_N
C   acc1_N receives fp2a_N
59 define(`pr0_0',`r16') define(`pr0_1',`r17')
60 define(`pr0_2',`r18') define(`pr0_3',`r19')
62 define(`pr1_0',`r20') define(`pr1_1',`r21')
63 define(`pr1_2',`r22') define(`pr1_3',`r23')
65 define(`acc1_0',`r24') define(`acc1_1',`r25')
66 define(`acc1_2',`r26') define(`acc1_3',`r27')
C Floating-point registers written by the xma instructions:
C   fp0b_N = xma.l  of u, v0, r
C   fp1a_N = xma.hu of u, v0, r
C   fp1b_N = xma.l  of u, v1, fp1a_N
C   fp2a_N = xma.hu of u, v1, fp1a_N
73 define(`fp0b_0',`f8') define(`fp0b_1',`f9')
74 define(`fp0b_2',`f10') define(`fp0b_3',`f11')
76 define(`fp1a_0',`f12') define(`fp1a_1',`f13')
77 define(`fp1a_2',`f14') define(`fp1a_3',`f15')
79 define(`fp1b_0',`f32') define(`fp1b_1',`f33')
80 define(`fp1b_2',`f34') define(`fp1b_3',`f35')
82 define(`fp2a_0',`f36') define(`fp2a_1',`f37')
83 define(`fp2a_2',`f38') define(`fp2a_3',`f39')
C Floating-point registers filled by ldf8 loads:
C   r_N is loaded through the srp pointer
C   u_N is loaded through the up pointer
85 define(`r_0',`f40') define(`r_1',`f41')
86 define(`r_2',`f42') define(`r_3',`f43')
88 define(`u_0',`f44') define(`u_1',`f45')
89 define(`u_2',`f46') define(`u_3',`f47')
97 PROLOGUE(mpn_addmul_2)
103 ` addp4 rp = 0, rp C M I
104 addp4 up = 0, up C M I
105 addp4 vp = 0, vp C M I
110 ldf8 ux = [up], 8 C M
111 ldf8 v0 = [vp], 8 C M
112 mov.i r2 = ar.lc C I0
114 ldf8 rx = [rp], 8 C M
119 ldf8 uy = [up], 8 C M
123 ldf8 ry = [rp], -8 C M
124 cmp.eq p10, p0 = 1, r14 C M I
125 cmp.eq p11, p0 = 2, r14 C M I
128 add srp = 16, rp C M I
129 cmp.eq p12, p0 = 3, r14 C M I
132 (p10) br.dptk .Lb01 C B
133 (p11) br.dptk .Lb10 C B
134 (p12) br.dptk .Lb11 C B
139 .Lb00: ldf8 r_1 = [srp], 8
144 cmp.ne p8, p9 = r0, r0
147 xma.l fp0b_3 = ux, v0, rx
148 cmp.ne p12, p13 = r0, r0
150 xma.hu fp1a_3 = ux, v0, rx
153 xma.l fp0b_0 = uy, v0, ry
154 xma.hu fp1a_0 = uy, v0, ry
156 getf.sig acc0 = fp0b_3
157 xma.l fp1b_3 = ux, v1, fp1a_3
158 xma.hu fp2a_3 = ux, v1, fp1a_3
160 xma.l fp0b_1 = u_1, v0, r_1
161 xma.hu fp1a_1 = u_1, v0, r_1
163 getf.sig pr0_0 = fp0b_0
164 xma.l fp1b_0 = uy, v1, fp1a_0
165 xma.hu fp2a_0 = uy, v1, fp1a_0
167 getf.sig pr1_3 = fp1b_3
168 getf.sig acc1_3 = fp2a_3
169 xma.l fp0b_2 = u_2, v0, r_2
170 xma.hu fp1a_2 = u_2, v0, r_2
173 .grt4: xma.l fp0b_0 = uy, v0, ry
174 xma.hu fp1a_0 = uy, v0, ry
177 getf.sig acc0 = fp0b_3
178 xma.l fp1b_3 = ux, v1, fp1a_3
180 xma.hu fp2a_3 = ux, v1, fp1a_3
182 xma.l fp0b_1 = u_1, v0, r_1
183 xma.hu fp1a_1 = u_1, v0, r_1
186 getf.sig pr0_0 = fp0b_0
187 xma.l fp1b_0 = uy, v1, fp1a_0
188 xma.hu fp2a_0 = uy, v1, fp1a_0
191 getf.sig pr1_3 = fp1b_3
193 getf.sig acc1_3 = fp2a_3
194 xma.l fp0b_2 = u_2, v0, r_2
195 xma.hu fp1a_2 = u_2, v0, r_2
200 .Lb01: ldf8 r_0 = [srp], 8 C M
201 ldf8 u_0 = [up], 8 C M
205 cmp.ne p6, p7 = r0, r0 C M I
207 ldf8 r_1 = [srp], 8 C M
208 xma.l fp0b_2 = ux, v0, rx C F
209 cmp.ne p10, p11 = r0, r0 C M I
210 ldf8 u_1 = [up], 8 C M
211 xma.hu fp1a_2 = ux, v0, rx C F
213 xma.l fp0b_3 = uy, v0, ry C F
214 xma.hu fp1a_3 = uy, v0, ry C F
216 getf.sig acc0 = fp0b_2 C M
217 ldf8 r_2 = [srp], 8 C M
218 xma.l fp1b_2 = ux, v1,fp1a_2 C F
219 xma.hu fp2a_2 = ux, v1,fp1a_2 C F
220 ldf8 u_2 = [up], 8 C M
223 xma.l fp0b_0 = u_0, v0, r_0 C F
224 xma.hu fp1a_0 = u_0, v0, r_0 C F
226 getf.sig pr0_3 = fp0b_3 C M
227 xma.l fp1b_3 = uy, v1,fp1a_3 C F
228 xma.hu fp2a_3 = uy, v1,fp1a_3 C F
230 getf.sig pr1_2 = fp1b_2 C M
231 getf.sig acc1_2 = fp2a_2 C M
232 xma.l fp0b_1 = u_1, v0, r_1 C F
233 xma.hu fp1a_1 = u_1, v0, r_1 C F
236 .grt5: xma.l fp0b_0 = u_0, v0, r_0
237 xma.hu fp1a_0 = u_0, v0, r_0
239 getf.sig pr0_3 = fp0b_3
241 xma.l fp1b_3 = uy, v1, fp1a_3
242 xma.hu fp2a_3 = uy, v1, fp1a_3
245 getf.sig pr1_2 = fp1b_2
247 getf.sig acc1_2 = fp2a_2
248 xma.l fp0b_1 = u_1, v0, r_1
249 xma.hu fp1a_1 = u_1, v0, r_1
259 xma.l fp0b_1 = ux, v0, rx
260 xma.hu fp1a_1 = ux, v0, rx
262 xma.l fp0b_2 = uy, v0, ry
263 xma.hu fp1a_2 = uy, v0, ry
267 stf8 [rp] = fp0b_1, 8
268 xma.l fp1b_1 = ux, v1, fp1a_1
269 xma.hu fp2a_1 = ux, v1, fp1a_1
271 getf.sig acc0 = fp0b_2
272 xma.l fp1b_2 = uy, v1, fp1a_2
273 xma.hu fp2a_2 = uy, v1, fp1a_2
277 getf.sig pr1_1 = fp1b_1
279 getf.sig acc1_1 = fp2a_1
281 getf.sig pr1_2 = fp1b_2
289 cmp.ltu p8, p9 = s0, pr1_1
292 .pred.rel "mutex", p8, p9
293 (p8) add acc0 = pr1_2, acc1_1, 1
294 (p9) add acc0 = pr1_2, acc1_1
295 (p8) cmp.leu p10, p0 = r31, pr1_2
296 (p9) cmp.ltu p10, p0 = r31, pr1_2
304 .grt2: ldf8 r_3 = [srp], 8
309 xma.l fp0b_1 = ux, v0, rx
312 xma.hu fp1a_1 = ux, v0, rx
315 xma.l fp0b_2 = uy, v0, ry
316 xma.hu fp1a_2 = uy, v0, ry
318 getf.sig acc0 = fp0b_1
320 xma.l fp1b_1 = ux, v1, fp1a_1
321 xma.hu fp2a_1 = ux, v1, fp1a_1
324 xma.l fp0b_3 = u_3, v0, r_3
325 xma.hu fp1a_3 = u_3, v0, r_3
327 getf.sig pr0_2 = fp0b_2
329 xma.l fp1b_2 = uy, v1, fp1a_2
330 xma.hu fp2a_2 = uy, v1, fp1a_2
333 getf.sig pr1_1 = fp1b_1
335 getf.sig acc1_1 = fp2a_1
336 xma.l fp0b_0 = u_0, v0, r_0
337 cmp.ne p8, p9 = r0, r0
338 cmp.ne p12, p13 = r0, r0
339 xma.hu fp1a_0 = u_0, v0, r_0
344 .Lb11: mov acc1_3 = 0
347 cmp.ne p6, p7 = r0, r0
353 xma.l fp0b_0 = ux, v0, rx
354 xma.hu fp1a_0 = ux, v0, rx
356 cmp.ne p10, p11 = r0, r0
357 xma.l fp0b_1 = uy, v0, ry
358 xma.hu fp1a_1 = uy, v0, ry
360 getf.sig acc0 = fp0b_0
361 xma.l fp1b_0 = ux, v1, fp1a_0
362 xma.hu fp2a_0 = ux, v1, fp1a_0
364 xma.l fp0b_2 = u_2, v0, r_2
365 xma.hu fp1a_2 = u_2, v0, r_2
367 getf.sig pr0_1 = fp0b_1
368 xma.l fp1b_1 = uy, v1, fp1a_1
369 xma.hu fp2a_1 = uy, v1, fp1a_1
371 getf.sig pr1_0 = fp1b_0
372 getf.sig acc1_0 = fp2a_0
375 .grt3: ldf8 r_3 = [srp], 8
376 xma.l fp0b_0 = ux, v0, rx
377 cmp.ne p10, p11 = r0, r0
379 xma.hu fp1a_0 = ux, v0, rx
381 xma.l fp0b_1 = uy, v0, ry
382 xma.hu fp1a_1 = uy, v0, ry
384 getf.sig acc0 = fp0b_0
386 xma.l fp1b_0 = ux, v1, fp1a_0
388 xma.hu fp2a_0 = ux, v1, fp1a_0
390 xma.l fp0b_2 = u_2, v0, r_2
391 xma.hu fp1a_2 = u_2, v0, r_2
393 getf.sig pr0_1 = fp0b_1
395 xma.l fp1b_1 = uy, v1, fp1a_1
396 xma.hu fp2a_1 = uy, v1, fp1a_1
399 getf.sig pr1_0 = fp1b_0
401 getf.sig acc1_0 = fp2a_0
402 xma.l fp0b_3 = u_3, v0, r_3
403 xma.hu fp1a_3 = u_3, v0, r_3
407 C *** MAIN LOOP START ***
410 .pred.rel "mutex", p12, p13
411 getf.sig pr0_3 = fp0b_3
413 xma.l fp1b_3 = u_3, v1, fp1a_3
414 (p12) add s0 = pr1_0, acc0, 1
415 (p13) add s0 = pr1_0, acc0
416 xma.hu fp2a_3 = u_3, v1, fp1a_3
418 .pred.rel "mutex", p8, p9
419 .pred.rel "mutex", p12, p13
421 getf.sig pr1_2 = fp1b_2
422 (p8) cmp.leu p6, p7 = acc0, pr0_1
423 (p9) cmp.ltu p6, p7 = acc0, pr0_1
424 (p12) cmp.leu p10, p11 = s0, pr1_0
425 (p13) cmp.ltu p10, p11 = s0, pr1_0
427 .pred.rel "mutex", p6, p7
428 getf.sig acc1_2 = fp2a_2
430 xma.l fp0b_1 = u_1, v0, r_1
431 (p6) add acc0 = pr0_2, acc1_0, 1
432 (p7) add acc0 = pr0_2, acc1_0
433 xma.hu fp1a_1 = u_1, v0, r_1
436 .pred.rel "mutex", p10, p11
437 getf.sig pr0_0 = fp0b_0
439 xma.l fp1b_0 = u_0, v1, fp1a_0
440 (p10) add s0 = pr1_1, acc0, 1
441 (p11) add s0 = pr1_1, acc0
442 xma.hu fp2a_0 = u_0, v1, fp1a_0
444 .pred.rel "mutex", p6, p7
445 .pred.rel "mutex", p10, p11
447 getf.sig pr1_3 = fp1b_3
448 (p6) cmp.leu p8, p9 = acc0, pr0_2
449 (p7) cmp.ltu p8, p9 = acc0, pr0_2
450 (p10) cmp.leu p12, p13 = s0, pr1_1
451 (p11) cmp.ltu p12, p13 = s0, pr1_1
453 .pred.rel "mutex", p8, p9
454 getf.sig acc1_3 = fp2a_3
456 xma.l fp0b_2 = u_2, v0, r_2
457 (p8) add acc0 = pr0_3, acc1_1, 1
458 (p9) add acc0 = pr0_3, acc1_1
459 xma.hu fp1a_2 = u_2, v0, r_2
462 .pred.rel "mutex", p12, p13
463 getf.sig pr0_1 = fp0b_1
465 xma.l fp1b_1 = u_1, v1, fp1a_1
466 (p12) add s0 = pr1_2, acc0, 1
467 (p13) add s0 = pr1_2, acc0
468 xma.hu fp2a_1 = u_1, v1, fp1a_1
470 .pred.rel "mutex", p8, p9
471 .pred.rel "mutex", p12, p13
473 getf.sig pr1_0 = fp1b_0
474 (p8) cmp.leu p6, p7 = acc0, pr0_3
475 (p9) cmp.ltu p6, p7 = acc0, pr0_3
476 (p12) cmp.leu p10, p11 = s0, pr1_2
477 (p13) cmp.ltu p10, p11 = s0, pr1_2
479 .pred.rel "mutex", p6, p7
480 getf.sig acc1_0 = fp2a_0
482 xma.l fp0b_3 = u_3, v0, r_3
483 (p6) add acc0 = pr0_0, acc1_2, 1
484 (p7) add acc0 = pr0_0, acc1_2
485 xma.hu fp1a_3 = u_3, v0, r_3
488 .pred.rel "mutex", p10, p11
489 getf.sig pr0_2 = fp0b_2
491 xma.l fp1b_2 = u_2, v1, fp1a_2
492 (p10) add s0 = pr1_3, acc0, 1
493 (p11) add s0 = pr1_3, acc0
494 xma.hu fp2a_2 = u_2, v1, fp1a_2
496 .pred.rel "mutex", p6, p7
497 .pred.rel "mutex", p10, p11
499 getf.sig pr1_1 = fp1b_1
500 (p6) cmp.leu p8, p9 = acc0, pr0_0
501 (p7) cmp.ltu p8, p9 = acc0, pr0_0
502 (p10) cmp.leu p12, p13 = s0, pr1_3
503 (p11) cmp.ltu p12, p13 = s0, pr1_3
505 .pred.rel "mutex", p8, p9
506 getf.sig acc1_1 = fp2a_1
508 xma.l fp0b_0 = u_0, v0, r_0
509 (p8) add acc0 = pr0_1, acc1_3, 1
510 (p9) add acc0 = pr0_1, acc1_3
511 xma.hu fp1a_0 = u_0, v0, r_0
512 .LL10: br.cloop.dptk .Loop C 12
514 C *** MAIN LOOP END ***
517 .pred.rel "mutex", p12, p13
518 getf.sig pr0_3 = fp0b_3
519 xma.l fp1b_3 = u_3, v1, fp1a_3
520 (p12) add s0 = pr1_0, acc0, 1
521 (p13) add s0 = pr1_0, acc0
522 xma.hu fp2a_3 = u_3, v1, fp1a_3
524 .pred.rel "mutex", p8, p9
525 .pred.rel "mutex", p12, p13
526 getf.sig pr1_2 = fp1b_2
527 (p8) cmp.leu p6, p7 = acc0, pr0_1
528 (p9) cmp.ltu p6, p7 = acc0, pr0_1
529 (p12) cmp.leu p10, p11 = s0, pr1_0
530 (p13) cmp.ltu p10, p11 = s0, pr1_0
532 .pred.rel "mutex", p6, p7
533 getf.sig acc1_2 = fp2a_2
535 xma.l fp0b_1 = u_1, v0, r_1
536 (p6) add acc0 = pr0_2, acc1_0, 1
537 (p7) add acc0 = pr0_2, acc1_0
538 xma.hu fp1a_1 = u_1, v0, r_1
541 .pred.rel "mutex", p10, p11
542 getf.sig pr0_0 = fp0b_0
543 xma.l fp1b_0 = u_0, v1, fp1a_0
544 (p10) add s0 = pr1_1, acc0, 1
545 (p11) add s0 = pr1_1, acc0
546 xma.hu fp2a_0 = u_0, v1, fp1a_0
548 .pred.rel "mutex", p6, p7
549 .pred.rel "mutex", p10, p11
550 getf.sig pr1_3 = fp1b_3
551 (p6) cmp.leu p8, p9 = acc0, pr0_2
552 (p7) cmp.ltu p8, p9 = acc0, pr0_2
553 (p10) cmp.leu p12, p13 = s0, pr1_1
554 (p11) cmp.ltu p12, p13 = s0, pr1_1
556 .pred.rel "mutex", p8, p9
557 getf.sig acc1_3 = fp2a_3
559 xma.l fp0b_2 = u_2, v0, r_2
560 (p8) add acc0 = pr0_3, acc1_1, 1
561 (p9) add acc0 = pr0_3, acc1_1
562 xma.hu fp1a_2 = u_2, v0, r_2
565 .pred.rel "mutex", p12, p13
566 getf.sig pr0_1 = fp0b_1
567 xma.l fp1b_1 = u_1, v1, fp1a_1
568 (p12) add s0 = pr1_2, acc0, 1
569 (p13) add s0 = pr1_2, acc0
570 xma.hu fp2a_1 = u_1, v1, fp1a_1
572 .pred.rel "mutex", p8, p9
573 .pred.rel "mutex", p12, p13
574 getf.sig pr1_0 = fp1b_0
575 (p8) cmp.leu p6, p7 = acc0, pr0_3
576 (p9) cmp.ltu p6, p7 = acc0, pr0_3
577 (p12) cmp.leu p10, p11 = s0, pr1_2
578 (p13) cmp.ltu p10, p11 = s0, pr1_2
580 .pred.rel "mutex", p6, p7
581 getf.sig acc1_0 = fp2a_0
583 (p6) add acc0 = pr0_0, acc1_2, 1
584 (p7) add acc0 = pr0_0, acc1_2
587 .pred.rel "mutex", p10, p11
588 getf.sig pr0_2 = fp0b_2
589 xma.l fp1b_2 = u_2, v1, fp1a_2
590 (p10) add s0 = pr1_3, acc0, 1
591 (p11) add s0 = pr1_3, acc0
592 xma.hu fp2a_2 = u_2, v1, fp1a_2
594 .pred.rel "mutex", p6, p7
595 .pred.rel "mutex", p10, p11
596 getf.sig pr1_1 = fp1b_1
597 (p6) cmp.leu p8, p9 = acc0, pr0_0
598 (p7) cmp.ltu p8, p9 = acc0, pr0_0
599 (p10) cmp.leu p12, p13 = s0, pr1_3
600 (p11) cmp.ltu p12, p13 = s0, pr1_3
602 .pred.rel "mutex", p8, p9
603 getf.sig acc1_1 = fp2a_1
605 (p8) add acc0 = pr0_1, acc1_3, 1
606 (p9) add acc0 = pr0_1, acc1_3
609 .pred.rel "mutex", p12, p13
610 (p12) add s0 = pr1_0, acc0, 1
611 (p13) add s0 = pr1_0, acc0
613 .pred.rel "mutex", p8, p9
614 .pred.rel "mutex", p12, p13
615 getf.sig pr1_2 = fp1b_2
616 (p8) cmp.leu p6, p7 = acc0, pr0_1
617 (p9) cmp.ltu p6, p7 = acc0, pr0_1
618 (p12) cmp.leu p10, p11 = s0, pr1_0
619 (p13) cmp.ltu p10, p11 = s0, pr1_0
621 .pred.rel "mutex", p6, p7
622 getf.sig acc1_2 = fp2a_2
624 (p6) add acc0 = pr0_2, acc1_0, 1
625 (p7) add acc0 = pr0_2, acc1_0
627 .pred.rel "mutex", p10, p11
628 (p10) add s0 = pr1_1, acc0, 1
629 (p11) add s0 = pr1_1, acc0
631 .pred.rel "mutex", p6, p7
632 .pred.rel "mutex", p10, p11
633 (p6) cmp.leu p8, p9 = acc0, pr0_2
634 (p7) cmp.ltu p8, p9 = acc0, pr0_2
635 (p10) cmp.leu p12, p13 = s0, pr1_1
636 (p11) cmp.ltu p12, p13 = s0, pr1_1
638 .pred.rel "mutex", p8, p9
640 (p8) add acc0 = pr1_2, acc1_1, 1
641 (p9) add acc0 = pr1_2, acc1_1
643 .pred.rel "mutex", p8, p9
644 (p8) cmp.leu p10, p11 = acc0, pr1_2
645 (p9) cmp.ltu p10, p11 = acc0, pr1_2
646 (p12) add acc0 = 1, acc0
649 (p12) cmp.eq.or p10, p0 = 0, acc0
652 .pred.rel "mutex", p10, p11