1 dnl IA-64 mpn_addmul_1 -- Multiply a limb vector with a limb and add the
2 dnl result to a second limb vector.
4 dnl Copyright 2000, 2001, 2002, 2003, 2004, 2005, 2007 Free Software
7 dnl This file is part of the GNU MP Library.
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of the GNU Lesser General Public License as published
11 dnl by the Free Software Foundation; either version 3 of the License, or (at
12 dnl your option) any later version.
14 dnl The GNU MP Library is distributed in the hope that it will be useful, but
15 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
17 dnl License for more details.
19 dnl You should have received a copy of the GNU Lesser General Public License
20 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
22 include(`../config.m4')
29 C * Further optimize feed-in and wind-down code, both for speed and code size.
30 C * Handle low limb input and results specially, using a common stf8 in the epilogue.
32 C * Use 1 c/l carry propagation scheme in wind-down code.
33 C * Use extra pointer registers for `up' and rp to speed up feed-in loads.
34 C * Work out final differences with mul_1.asm. That function is 300 bytes
35 C smaller than this due to better loop scheduling and thus simpler feed-in code.
C Entry of mpn_addmul_1: argument setup followed by a dispatch to one of four
C feed-in paths.  The operand names rp, up, n and vl are symbolic names defined
C on lines that are not visible in this excerpt.
C The two addp4 instructions pass rp and up through 32-bit pointer addition
C with a zero offset.  NOTE(review): the m4 quote character that opens the first
C of them is not closed in the visible lines, so they presumably sit inside a
C conditional whose header is not shown here -- confirm.
45 PROLOGUE(mpn_addmul_1)
51 ` addp4 rp = 0, rp C M I
52 addp4 up = 0, up C M I
C r15 = n - 1 and r31 = r15 >> 2; r31 becomes the loop count in ar.lc below.
C f6 receives the multiplier vl so that the xma instructions can use it.
C p10, p11 and p12 are set when r14 equals 0, 2 and 3 respectively.  r14 is
C written on a line not visible here.  NOTE(review): presumably n mod 4 --
C confirm.
57 adds r15 = -1, n C M I
68 setf.sig f6 = vl C M2 M3
69 cmp.eq p10, p0 = 0, r14 C M I
70 shr.u r31 = r15, 2 C I0
73 cmp.eq p11, p0 = 2, r14 C M I
74 cmp.eq p12, p0 = 3, r14 C M I
C r0 != r0 is always false, so each cmp.ne clears its first predicate and sets
C its second: p6 = p8 = 0 and p7 = p9 = 1.  Later in the routine p6 and p8
C guard the adds that include an extra 1, so this starts with no carry pending.
79 cmp.ne p6, p7 = r0, r0 C M I
80 mov.i ar.lc = r31 C I0
81 cmp.ne p8, p9 = r0, r0 C M I
C Dispatch on the predicates computed above.  When none of the three branches
C is taken, execution continues with the code that follows.
84 (p10) br.dptk .Lb00 C B
85 (p11) br.dptk .Lb10 C B
86 (p12) br.dptk .Lb11 C B
C Feed-in path reached by falling through the dispatch branches.
C br.cloop branches to .grt1 while ar.lc is nonzero, decrementing it; otherwise
C execution falls through to the short case directly below.
90 .Lb01: br.cloop.dptk .grt1 C B
C Short case: a single multiply-add.  xma.l and xma.hu produce the low and
C high 64 bits of f7 * f6 + f8.  The low half is stored through r20 and the
C high half is moved to r8 before the return.  f7, f8 and r20 are set up on
C lines not visible in this excerpt.
92 xma.l f39 = f7, f6, f8 C F
93 xma.hu f43 = f7, f6, f8 C F
95 getf.sig r8 = f43 C M2
96 stf8 [r20] = f39 C M2 M3
98 br.ret.sptk.many b0 C B
C Longer case.  NOTE(review): the .grt1 label and the loads that fill f32-f35
C and f44-f47 are not visible in this excerpt.
C Every xma.l / xma.hu pair below computes the low and high 64 bits of
C (first source * f6 + last source) into the register sets f36-f39 (low) and
C f40-f43 (high).
108 xma.l f39 = f7, f6, f8
110 xma.hu f43 = f7, f6, f8
116 xma.l f36 = f32, f6, f44
117 xma.hu f40 = f32, f6, f44
120 xma.l f37 = f33, f6, f45
121 xma.hu f41 = f33, f6, f45
125 xma.l f38 = f34, f6, f46
126 xma.hu f42 = f34, f6, f46
130 xma.l f39 = f35, f6, f47
131 xma.hu f43 = f35, f6, f47
139 xma.l f36 = f32, f6, f44
140 xma.hu f40 = f32, f6, f44
143 xma.l f37 = f33, f6, f45
145 xma.hu f41 = f33, f6, f45
151 xma.l f38 = f34, f6, f46
153 xma.hu f42 = f34, f6, f46
159 xma.l f39 = f35, f6, f47
161 xma.hu f43 = f35, f6, f47
C Feed-in path taken when p11 is set (r14 equals 2 at the dispatch).
C The ldf8 loads one 8-byte limb from up into f35 and advances up by 8.
C The xma.l / xma.hu pairs that follow compute the low and high 64 bits of
C (first source * f6 + last source).  The first pair of each group uses f7 and
C f8, which are loaded on lines not visible in this excerpt.
C NOTE(review): the branches, labels, remaining loads and stores of this path
C are also not visible here.
169 .Lb10: ldf8 f35 = [up], 8
173 xma.l f38 = f7, f6, f8
174 xma.hu f42 = f7, f6, f8
176 xma.l f39 = f35, f6, f47
177 xma.hu f43 = f35, f6, f47
190 xma.l f38 = f7, f6, f8
192 xma.hu f42 = f7, f6, f8
195 xma.l f39 = f35, f6, f47
197 xma.hu f43 = f35, f6, f47
204 xma.l f36 = f32, f6, f44
205 xma.hu f40 = f32, f6, f44
209 xma.l f37 = f33, f6, f45
210 xma.hu f41 = f33, f6, f45
214 xma.l f38 = f34, f6, f46
215 xma.hu f42 = f34, f6, f46
219 xma.l f39 = f35, f6, f47
220 xma.hu f43 = f35, f6, f47
225 xma.l f36 = f32, f6, f44
226 xma.hu f40 = f32, f6, f44
232 xma.l f37 = f33, f6, f45
234 xma.hu f41 = f33, f6, f45
240 xma.l f38 = f34, f6, f46
242 xma.hu f42 = f34, f6, f46
C Feed-in path taken when p12 is set (r14 equals 3 at the dispatch).
C The ldf8 loads one 8-byte limb from up into f34 and advances up by 8.
C The xma.l / xma.hu pairs that follow compute the low and high 64 bits of
C (first source * f6 + last source).  The first pair of each group uses f7 and
C f8, which are loaded on lines not visible in this excerpt.
C NOTE(review): the branches, labels and remaining loads of this path are also
C not visible here.
249 .Lb11: ldf8 f34 = [up], 8
257 xma.l f37 = f7, f6, f8
258 xma.hu f41 = f7, f6, f8
259 xma.l f38 = f34, f6, f46
260 xma.hu f42 = f34, f6, f46
261 xma.l f39 = f35, f6, f47
262 xma.hu f43 = f35, f6, f47
274 xma.l f37 = f7, f6, f8
276 xma.hu f41 = f7, f6, f8
279 xma.l f38 = f34, f6, f46
281 xma.hu f42 = f34, f6, f46
284 xma.l f39 = f35, f6, f47
286 xma.hu f43 = f35, f6, f47
C The low half in f37 is both copied to r25 and stored through r20, which is
C then advanced by 8.  Both instructions carry a FIXME marker from the
C original author.
289 getf.sig r25 = f37 C FIXME
294 stf8 [r20] = f37, 8 C FIXME
295 xma.l f36 = f32, f6, f44
297 xma.hu f40 = f32, f6, f44
300 xma.l f37 = f33, f6, f45
302 xma.hu f41 = f33, f6, f45
305 xma.l f38 = f34, f6, f46
307 xma.hu f42 = f34, f6, f46
312 xma.l f36 = f32, f6, f44
314 xma.hu f40 = f32, f6, f44
320 xma.l f37 = f33, f6, f45
322 xma.hu f41 = f33, f6, f45
C Feed-in path taken when p10 is set (r14 equals 0 at the dispatch).
C The ldf8 loads one 8-byte limb from up into f33 and advances up by 8.
C The xma.l / xma.hu pairs that follow compute the low and high 64 bits of
C (first source * f6 + last source).  The first pair uses f7 and f8, which are
C loaded on lines not visible in this excerpt.
C NOTE(review): the branches, labels and remaining loads of this path are also
C not visible here.
329 .Lb00: ldf8 f33 = [up], 8
336 xma.l f36 = f7, f6, f8
338 xma.hu f40 = f7, f6, f8
341 xma.l f37 = f33, f6, f45
342 xma.hu f41 = f33, f6, f45
343 xma.l f38 = f34, f6, f46
344 xma.hu f42 = f34, f6, f46
348 xma.l f39 = f35, f6, f47
350 xma.hu f43 = f35, f6, f47
360 xma.l f37 = f33, f6, f45
362 xma.hu f41 = f33, f6, f45
365 xma.l f38 = f34, f6, f46
367 xma.hu f42 = f34, f6, f46
C The low half in f36 is copied to r24 here and stored through r20 a few
C instructions later, advancing r20 by 8.  Both instructions carry a FIXME
C marker from the original author.
370 getf.sig r24 = f36 C FIXME
371 xma.l f39 = f35, f6, f47
374 xma.hu f43 = f35, f6, f47
382 stf8 [r20] = f36, 8 C FIXME
383 xma.l f36 = f32, f6, f44
386 xma.hu f40 = f32, f6, f44
388 xma.l f37 = f33, f6, f45
390 xma.hu f41 = f33, f6, f45
395 xma.l f36 = f32, f6, f44
397 xma.hu f40 = f32, f6, f44
404 C *** MAIN LOOP START ***
C Each pass through the loop stores four result limbs through r20 and issues
C four loads each from up and from rp that feed later xma instructions.
C The same pattern is applied to every limb:
C   xma.l / xma.hu    low and high 64 bits of (up limb * f6 + rp limb)
C   getf.sig          move those halves into integer registers
C   add               low half + high half of the preceding product, with an
C                     extra 1 when the carry predicate is set
C   cmp.leu / cmp.ltu derive the carry out of that add
C   st8               store the sum and advance r20 by 8
C The carry alternates between the predicate pairs p6/p7 and p8/p9.  p6 (or
C p8) set means a carry is pending: it selects the add with the extra 1 and the
C cmp.leu test, because with a carry in the sum has wrapped when it is less
C than or equal to the addend.  p7 (or p9) selects the plain add and the
C cmp.ltu test, because without a carry in the sum has wrapped only when it is
C strictly less than the addend.  The .pred.rel directives declare each pair
C as mutually exclusive to the assembler.
C NOTE(review): the .Loop label targeted by the closing br.cloop is not visible
C in this excerpt.
405 ALIGN(32) C insn fed cycle #
407 .pred.rel "mutex", p6, p7 C num by i1 i2
408 getf.sig r29 = f41 C 00 16 0 0
409 xma.l f36 = f32, f6, f44 C 01 06,15 0 0
410 (p6) add r14 = r30, r27, 1 C 02 0 0
411 ldf8 f47 = [rp], 8 C 03 0 0
412 xma.hu f40 = f32, f6, f44 C 04 06,15 0 0
413 (p7) add r14 = r30, r27 C 05 0 0
415 .pred.rel "mutex", p6, p7
416 ldf8 f32 = [up], 8 C 06 1 1
417 (p6) cmp.leu p8, p9 = r14, r27 C 07 1 1
418 (p7) cmp.ltu p8, p9 = r14, r27 C 08 1 1
419 getf.sig r26 = f38 C 09 25 2 1
420 st8 [r20] = r14, 8 C 10 2 1
C Second limb of the pass: carry in arrives in p8/p9, carry out goes to p6/p7.
424 .pred.rel "mutex", p8, p9
425 getf.sig r30 = f42 C 12 28 3 2
426 xma.l f37 = f33, f6, f45 C 13 18,27 3 2
427 (p8) add r16 = r31, r24, 1 C 14 3 2
428 ldf8 f44 = [rp], 8 C 15 3 2
429 xma.hu f41 = f33, f6, f45 C 16 18,27 3 2
430 (p9) add r16 = r31, r24 C 17 3 2
432 .pred.rel "mutex", p8, p9
433 ldf8 f33 = [up], 8 C 18 4 3
434 (p8) cmp.leu p6, p7 = r16, r24 C 19 4 3
435 (p9) cmp.ltu p6, p7 = r16, r24 C 20 4 3
436 getf.sig r27 = f39 C 21 37 5 3
437 st8 [r20] = r16, 8 C 22 5 3
C Third limb of the pass: carry in arrives in p6/p7, carry out goes to p8/p9.
441 .pred.rel "mutex", p6, p7
442 getf.sig r31 = f43 C 24 40 6 4
443 xma.l f38 = f34, f6, f46 C 25 30,39 6 4
444 (p6) add r14 = r28, r25, 1 C 26 6 4
445 ldf8 f45 = [rp], 8 C 27 6 4
446 xma.hu f42 = f34, f6, f46 C 28 30,39 6 4
447 (p7) add r14 = r28, r25 C 29 6 4
449 .pred.rel "mutex", p6, p7
450 ldf8 f34 = [up], 8 C 30 7 5
451 (p6) cmp.leu p8, p9 = r14, r25 C 31 7 5
452 (p7) cmp.ltu p8, p9 = r14, r25 C 32 7 5
453 getf.sig r24 = f36 C 33 01 8 5
454 st8 [r20] = r14, 8 C 34 8 5
C Fourth limb of the pass: carry in arrives in p8/p9, carry out goes to p6/p7,
C which is where the first limb of the next pass expects it.
458 .pred.rel "mutex", p8, p9
459 getf.sig r28 = f40 C 36 04 9 6
460 xma.l f39 = f35, f6, f47 C 37 42,03 9 6
461 (p8) add r16 = r29, r26, 1 C 38 9 6
462 ldf8 f46 = [rp], 8 C 39 9 6
463 xma.hu f43 = f35, f6, f47 C 40 42,03 9 6
464 (p9) add r16 = r29, r26 C 41 9 6
466 .pred.rel "mutex", p8, p9
467 ldf8 f35 = [up], 8 C 42 10 7
468 (p8) cmp.leu p6, p7 = r16, r26 C 43 10 7
469 (p9) cmp.ltu p6, p7 = r16, r26 C 44 10 7
470 getf.sig r25 = f37 C 45 13 11 7
471 st8 [r20] = r16, 8 C 46 11 7
C br.cloop repeats the loop while ar.lc is nonzero, decrementing it each time.
472 br.cloop.dptk .Loop C 47 11 7
473 C *** MAIN LOOP END ***
C Wind-down code after the main loop.  It continues the alternating carry
C scheme of the loop: p6 or p8 set selects the add with the extra 1 and the
C cmp.leu carry test, p7 or p9 selects the plain add and the cmp.ltu carry
C test.  No further loads appear in the visible lines.
C NOTE(review): the labels, getf.sig moves and stores that belong to this
C section are not visible in this excerpt.
C First group: four more multiply-adds are issued alongside the carry adds.
476 .pred.rel "mutex", p6, p7
478 xma.l f36 = f32, f6, f44 C
479 (p6) add r14 = r30, r27, 1 C
481 xma.hu f40 = f32, f6, f44 C
482 (p7) add r14 = r30, r27 C
484 .pred.rel "mutex", p6, p7
485 (p6) cmp.leu p8, p9 = r14, r27 C
486 (p7) cmp.ltu p8, p9 = r14, r27 C
490 .pred.rel "mutex", p8, p9
492 xma.l f37 = f33, f6, f45 C
493 (p8) add r16 = r31, r24, 1 C
494 xma.hu f41 = f33, f6, f45 C
495 (p9) add r16 = r31, r24 C
497 .pred.rel "mutex", p8, p9
498 (p8) cmp.leu p6, p7 = r16, r24 C
499 (p9) cmp.ltu p6, p7 = r16, r24 C
504 .pred.rel "mutex", p6, p7
506 xma.l f38 = f34, f6, f46 C
507 (p6) add r14 = r28, r25, 1 C
508 xma.hu f42 = f34, f6, f46 C
509 (p7) add r14 = r28, r25 C
511 .pred.rel "mutex", p6, p7
512 (p6) cmp.leu p8, p9 = r14, r25 C
513 (p7) cmp.ltu p8, p9 = r14, r25 C
518 .pred.rel "mutex", p8, p9
520 xma.l f39 = f35, f6, f47 C
521 (p8) add r16 = r29, r26, 1 C
522 xma.hu f43 = f35, f6, f47 C
523 (p9) add r16 = r29, r26 C
525 .pred.rel "mutex", p8, p9
526 (p8) cmp.leu p6, p7 = r16, r26 C
527 (p9) cmp.ltu p6, p7 = r16, r26 C
C Second group: carry adds only, no further multiplies are issued.
532 .pred.rel "mutex", p6, p7
534 (p6) add r14 = r30, r27, 1 C
535 (p7) add r14 = r30, r27 C
537 .pred.rel "mutex", p6, p7
538 (p6) cmp.leu p8, p9 = r14, r27 C
539 (p7) cmp.ltu p8, p9 = r14, r27 C
544 .pred.rel "mutex", p8, p9
546 (p8) add r16 = r31, r24, 1 C
547 (p9) add r16 = r31, r24 C
549 .pred.rel "mutex", p8, p9
550 (p8) cmp.leu p6, p7 = r16, r24 C
551 (p9) cmp.ltu p6, p7 = r16, r24 C
556 .pred.rel "mutex", p6, p7
558 (p6) add r14 = r28, r25, 1 C
559 (p7) add r14 = r28, r25 C
561 .pred.rel "mutex", p6, p7
563 (p6) cmp.leu p8, p9 = r14, r25 C
564 (p7) cmp.ltu p8, p9 = r14, r25 C
567 .pred.rel "mutex", p8, p9
568 (p8) add r16 = r29, r26, 1 C
569 (p9) add r16 = r29, r26 C
571 .pred.rel "mutex", p8, p9
573 (p8) cmp.leu p6, p7 = r16, r26 C
574 (p9) cmp.ltu p6, p7 = r16, r26 C
577 .pred.rel "mutex", p6, p7
578 (p6) add r14 = r30, r27, 1 C
579 (p7) add r14 = r30, r27 C
581 .pred.rel "mutex", p6, p7
583 (p6) cmp.leu p8, p9 = r14, r27 C
584 (p7) cmp.ltu p8, p9 = r14, r27 C
C Exit: when the last add produced a carry (p8 set), r8 is incremented by 1.
C ar.lc is then written from r2 and the routine returns through b0.
C NOTE(review): the instructions that set r8 and r2 on this path are not
C visible here; r2 presumably holds the value ar.lc had on entry -- confirm.
586 (p8) add r8 = 1, r8 C M I
587 mov.i ar.lc = r2 C I0
588 br.ret.sptk.many b0 C B