1 dnl IA-64 mpn_add_n/mpn_sub_n -- mpn addition and subtraction.
3 dnl Copyright 2003, 2004, 2005 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of the GNU Lesser General Public License as published
9 dnl by the Free Software Foundation; either version 3 of the License, or (at
10 dnl your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful, but
13 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15 dnl License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 include(`../config.m4')
27 C * Consider using special code for small n, using something like
28 C "switch (8 * (n >= 8) + (n mod 8))" to enter it and feed-in code.
C Select the entry-point name from the build-time OPERATION_* symbol: func
C becomes mpn_add_n or mpn_sub_n.
C NOTE(review): ADDSUB, PRED, LIM and INCR, used by the code below, are not
C defined in the lines visible here -- presumably these ifdef bodies supply
C them per operation; confirm.
36 ifdef(`OPERATION_add_n',`
41 define(func, mpn_add_n)
43 ifdef(`OPERATION_sub_n',`
48 define(func, mpn_sub_n)
51 C Some useful aliases for registers we use
C u0-u7 and v0-v7 each name a distinct register.  w0-w3 and w4-w7 name the
C same four registers (r22, r9, r8, r23), so a result limb must be stored
C before the alias four limbs later is written.  w2/w6 is r8, which is also
C the register written with the return value at the end of the routine.
52 define(`u0',`r14') define(`u1',`r15') define(`u2',`r16') define(`u3',`r17')
53 define(`u4',`r18') define(`u5',`r19') define(`u6',`r20') define(`u7',`r21')
54 define(`v0',`r24') define(`v1',`r25') define(`v2',`r26') define(`v3',`r27')
55 define(`v4',`r28') define(`v5',`r29') define(`v6',`r30') define(`v7',`r31')
56 define(`w0',`r22') define(`w1',`r9') define(`w2',`r8') define(`w3',`r23')
57 define(`w4',`r22') define(`w5',`r9') define(`w6',`r8') define(`w7',`r23')
60 MULFUNC_PROLOGUE(mpn_add_n mpn_sub_n)
C Entry sequence: adjust the three pointers, preload the first limb of each
C operand (r11 from vp, r10 from up), then dispatch to a feed-in block.
C NOTE(review): addp4 is normally wanted only for a 32-bit pointer ABI; any
C guard around these three lines is not visible here -- confirm.
68 addp4 rp = 0, rp C M I
69 addp4 up = 0, up C M I
70 addp4 vp = 0, vp C M I
75 ld8 r11 = [vp], 8 C M01
76 ld8 r10 = [up], 8 C M01
C p15 = (8 < n), p14 = the complement.  The feed-in blocks branch on these
C to choose between their short path and their longer .grtN path.
81 cmp.lt p15, p14 = 8, n C M I
C Dispatch on r14: the values 1..7 branch to .Lb001 ... .Lb111, and any
C other value falls through to .Lb000.
C NOTE(review): r14 is presumably n mod 8, but the instruction that sets it
C is not visible here.  r14 is also aliased as u0 and is reloaded later.
86 cmp.eq p6, p0 = 1, r14 C M I
87 cmp.eq p7, p0 = 2, r14 C M I
88 cmp.eq p8, p0 = 3, r14 C M I
91 (p6) br.dptk .Lb001 C B
92 (p7) br.dptk .Lb010 C B
93 (p8) br.dptk .Lb011 C B
97 cmp.eq p9, p0 = 4, r14 C M I
98 cmp.eq p10, p0 = 5, r14 C M I
99 cmp.eq p11, p0 = 6, r14 C M I
102 (p9) br.dptk .Lb100 C B
103 (p10) br.dptk .Lb101 C B
104 (p11) br.dptk .Lb110 C B
108 cmp.eq p12, p0 = 7, r14 C M I
109 add n = -1, n C loop count M I
110 (p12) br.dptk .Lb111 C B
C Feed-in block reached by falling through the dispatch (r14 not in 1..7).
C Loads seven more limb pairs into u2..u7,u0 / v2..v7,v0 and sets
C rpx = rp + 8 as the second store pointer.  The preloaded first limbs
C (r10, r11) produce w1; w2 and w3 follow, with their carry-out predicates
C in p7, p8 and p9.  The carry in p7 is folded into w2 (and into p8) before
C the block branches.
114 .Lb000: ld8 v2 = [vp], 8 C M01
115 ld8 u2 = [up], 8 C M01
116 add rpx = 8, rp C M I
118 ld8 v3 = [vp], 8 C M01
119 ld8 u3 = [up], 8 C M01
120 ADDSUB w1 = r10, r11 C M I
122 ld8 v4 = [vp], 8 C M01
123 ld8 u4 = [up], 8 C M01
124 cmp.PRED p7, p0 = w1, r10 C M I
126 ld8 v5 = [vp], 8 C M01
127 ld8 u5 = [up], 8 C M01
128 ADDSUB w2 = u2, v2 C M I
130 ld8 v6 = [vp], 8 C M01
131 ld8 u6 = [up], 8 C M01
132 cmp.PRED p8, p0 = w2, u2 C M I
134 ld8 v7 = [vp], 8 C M01
135 ld8 u7 = [up], 8 C M01
136 ADDSUB w3 = u3, v3 C M I
138 ld8 v0 = [vp], 8 C M01
139 ld8 u0 = [up], 8 C M01
140 cmp.PRED p9, p0 = w3, u3 C M I
141 (p7) cmp.eq.or p8, p0 = LIM, w2 C M I
142 (p7) add w2 = INCR, w2 C M I
C When the entry test 8 < n was false (p14), go straight to the wind-down
C code at .Lcj8; no further loads are issued on that path.
143 (p14) br.cond.dptk .Lcj8 C B
C Otherwise preload the next three limb pairs for the main loop.
C NOTE(review): no branch out of .grt8 is visible here; presumably control
C joins the main loop at .LL000 -- confirm.
146 .grt8: ld8 v1 = [vp], 8 C M01
147 ld8 u1 = [up], 8 C M01
151 ld8 v2 = [vp], 8 C M01
153 ld8 u2 = [up], 8 C M01
157 ld8 v3 = [vp], 8 C M01
158 ld8 u3 = [up], 8 C M01
C Feed-in block for r14 = 1.  Sets rpx = rp + 16 and forms w0 from the
C preloaded first limbs (r10, r11).
162 .Lb001: add rpx = 16, rp C M I
163 ADDSUB w0 = r10, r11 C M I
164 (p15) br.cond.dpnt .grt1 C B
C Short path (p15 false): only the carry-out of w0 is computed, into p6.
C NOTE(review): no branch is visible after this compare; presumably control
C continues at .Lcj1, which stores w0 and consumes p6 -- confirm.
166 cmp.PRED p6, p0 = w0, r10 C M I
C Long path (8 < n at entry): load eight limb pairs into u1..u7,u0 and
C v1..v7,v0, compute w1 and w2 together with the p6 and p7 carries, then
C enter the main loop through the counted-loop branch.
170 .grt1: ld8 v1 = [vp], 8 C M01
171 ld8 u1 = [up], 8 C M01
174 ld8 v2 = [vp], 8 C M01
175 ld8 u2 = [up], 8 C M01
C Comparing r0 with itself for inequality is always false, so this clears
C p9.  The first instructions at .Loop are predicated on p9, and there is
C no earlier limb whose carry could feed into w0.
176 cmp.ne p9, p0 = r0, r0 C read near Loop
178 ld8 v3 = [vp], 8 C M01
179 ld8 u3 = [up], 8 C M01
182 ld8 v4 = [vp], 8 C M01
183 ld8 u4 = [up], 8 C M01
184 cmp.PRED p6, p0 = w0, r10 C M I
186 ld8 v5 = [vp], 8 C M01
187 ld8 u5 = [up], 8 C M01
188 ADDSUB w1 = u1, v1 C M I
190 ld8 v6 = [vp], 8 C M01
191 ld8 u6 = [up], 8 C M01
192 cmp.PRED p7, p0 = w1, u1 C M I
194 ld8 v7 = [vp], 8 C M01
195 ld8 u7 = [up], 8 C M01
196 ADDSUB w2 = u2, v2 C M I
199 ld8 v0 = [vp], 8 C M01
201 ld8 u0 = [up], 8 C M01
202 br.cloop.dptk .Loop C B
C Feed-in block for r14 = 2.  Loads the second limb pair into u0/v0, sets
C rpx = rp + 24 and forms w7 from the preloaded first limbs (r10, r11).
205 .Lb010: ld8 v0 = [vp], 8 C M01
206 ld8 u0 = [up], 8 C M01
207 add rpx = 24, rp C M I
208 ADDSUB w7 = r10, r11 C M I
209 (p15) br.cond.dpnt .grt2 C B
C Short path (p15 false): carry-out of w7 goes to p9 and w0 is formed from
C the second limb pair.
C NOTE(review): no branch is visible after these two instructions;
C presumably control continues in the wind-down code -- confirm.
211 cmp.PRED p9, p0 = w7, r10 C M I
212 ADDSUB w0 = u0, v0 C M I
C Long path (8 < n at entry): preload seven more limb pairs (u1..u7 and
C v1..v7) while computing p9 and w0 as on the short path.
C NOTE(review): no branch out of .grt2 is visible here; confirm where it
C joins the main loop.
215 .grt2: ld8 v1 = [vp], 8 C M01
216 ld8 u1 = [up], 8 C M01
219 ld8 v2 = [vp], 8 C M01
220 ld8 u2 = [up], 8 C M01
222 ld8 v3 = [vp], 8 C M01
223 ld8 u3 = [up], 8 C M01
226 ld8 v4 = [vp], 8 C M01
227 ld8 u4 = [up], 8 C M01
229 ld8 v5 = [vp], 8 C M01
230 ld8 u5 = [up], 8 C M01
231 cmp.PRED p9, p0 = w7, r10 C M I
233 ld8 v6 = [vp], 8 C M01
234 ld8 u6 = [up], 8 C M01
235 ADDSUB w0 = u0, v0 C M I
238 ld8 v7 = [vp], 8 C M01
240 ld8 u7 = [up], 8 C M01
C Feed-in block for r14 = 3.  Loads two more limb pairs (u7/v7 and u0/v0)
C and forms w6 from the preloaded first limbs (r10, r11).
243 .Lb011: ld8 v7 = [vp], 8 C M01
244 ld8 u7 = [up], 8 C M01
245 ADDSUB w6 = r10, r11 C M I
247 ld8 v0 = [vp], 8 C M01
248 ld8 u0 = [up], 8 C M01
249 (p15) br.cond.dpnt .grt3 C B
C Short path (p15 false): carry of w6 to p8, form w7, store w6 through rp,
C carry of w7 to p9.
C NOTE(review): on this path the visible code neither folds p8 into w7 nor
C branches away; the instructions doing so are presumably not shown here --
C confirm.
251 cmp.PRED p8, p0 = w6, r10 C M I
252 ADDSUB w7 = u7, v7 C M I
254 st8 [rp] = w6, 8 C M23
255 cmp.PRED p9, p0 = w7, u7 C M I
C Long path (8 < n at entry): preload u1..u7 / v1..v7 and set rpx = rp + 32.
C The carry from w6 (p8) is folded into w7 and into p9, w6 is stored, and
C w0 is started from u0/v0.
C NOTE(review): no branch out of .grt3 is visible here; confirm where it
C joins the main loop.
258 .grt3: ld8 v1 = [vp], 8 C M01
259 ld8 u1 = [up], 8 C M01
260 add rpx = 32, rp C M I
262 ld8 v2 = [vp], 8 C M01
263 ld8 u2 = [up], 8 C M01
266 ld8 v3 = [vp], 8 C M01
267 ld8 u3 = [up], 8 C M01
268 cmp.PRED p8, p0 = w6, r10 C M I
270 ld8 v4 = [vp], 8 C M01
271 ld8 u4 = [up], 8 C M01
273 ADDSUB w7 = u7, v7 C M I
277 ld8 v5 = [vp], 8 C M01
278 ld8 u5 = [up], 8 C M01
279 cmp.PRED p9, p0 = w7, u7 C M I
282 ld8 v6 = [vp], 8 C M01
284 ld8 u6 = [up], 8 C M01
285 (p8) cmp.eq.or p9, p0 = LIM, w7 C M I
287 ld8 v7 = [vp], 8 C M01
288 ld8 u7 = [up], 8 C M01
289 (p8) add w7 = INCR, w7 C M I
290 st8 [rp] = w6, 8 C M23
291 ADDSUB w0 = u0, v0 C M I
C Feed-in block for r14 = 4.  Loads three more limb pairs (u6/v6, u7/v7,
C u0/v0), sets rpx = rp + 8 and forms w5 from the preloaded first limbs
C (r10, r11).
294 .Lb100: ld8 v6 = [vp], 8 C M01
295 ld8 u6 = [up], 8 C M01
296 add rpx = 8, rp C M I
298 ld8 v7 = [vp], 8 C M01
299 ld8 u7 = [up], 8 C M01
300 ADDSUB w5 = r10, r11 C M I
302 ld8 v0 = [vp], 8 C M01
303 ld8 u0 = [up], 8 C M01
304 (p15) br.cond.dpnt .grt4 C B
C Short path (p15 false): compute w6 and w7 with the carry-outs of w5 and
C w6 in p7 and p8.
C NOTE(review): no branch is visible after these instructions; presumably
C control continues in the wind-down code -- confirm.
306 cmp.PRED p7, p0 = w5, r10 C M I
307 ADDSUB w6 = u6, v6 C M I
309 cmp.PRED p8, p0 = w6, u6 C M I
310 ADDSUB w7 = u7, v7 C M I
C Long path (8 < n at entry): preload u1..u7 / v1..v7 while computing w6
C and w7 with carries in p7, p8 and p9.  The carry in p7 is folded into w6
C (and into p8) at the end.
C NOTE(review): no branch out of .grt4 is visible here; confirm where it
C joins the main loop.
313 .grt4: ld8 v1 = [vp], 8 C M01
314 ld8 u1 = [up], 8 C M01
316 cmp.PRED p7, p0 = w5, r10 C M I
318 ld8 v2 = [vp], 8 C M01
319 ld8 u2 = [up], 8 C M01
320 ADDSUB w6 = u6, v6 C M I
322 ld8 v3 = [vp], 8 C M01
323 ld8 u3 = [up], 8 C M01
324 cmp.PRED p8, p0 = w6, u6 C M I
326 ld8 v4 = [vp], 8 C M01
327 ld8 u4 = [up], 8 C M01
330 ld8 v5 = [vp], 8 C M01
331 ld8 u5 = [up], 8 C M01
332 ADDSUB w7 = u7, v7 C M I
335 ld8 v6 = [vp], 8 C M01
337 ld8 u6 = [up], 8 C M01
338 cmp.PRED p9, p0 = w7, u7 C M I
340 ld8 v7 = [vp], 8 C M01
341 ld8 u7 = [up], 8 C M01
342 (p7) cmp.eq.or p8, p0 = LIM, w6 C M I
343 (p7) add w6 = INCR, w6 C M I
C Feed-in block for r14 = 5.  Loads four more limb pairs (u5/v5, u6/v6,
C u7/v7, u0/v0) and sets rpx = rp + 16.  The preloaded first limbs
C (r10, r11) produce w4 with its carry-out in p6, and w5 is formed from
C u5/v5.
346 .Lb101: ld8 v5 = [vp], 8 C M01
347 ld8 u5 = [up], 8 C M01
348 add rpx = 16, rp C M I
350 ld8 v6 = [vp], 8 C M01
351 ld8 u6 = [up], 8 C M01
352 ADDSUB w4 = r10, r11 C M I
354 ld8 v7 = [vp], 8 C M01
355 ld8 u7 = [up], 8 C M01
356 cmp.PRED p6, p0 = w4, r10 C M I
358 ld8 v0 = [vp], 8 C M01
359 ld8 u0 = [up], 8 C M01
360 ADDSUB w5 = u5, v5 C M I
362 (p15) br.cond.dpnt .grt5 C B
C Short path (p15 false): carry-out of w5 to p7, then form w6.
C NOTE(review): no branch is visible after these instructions; presumably
C control continues in the wind-down code -- confirm.
364 cmp.PRED p7, p0 = w5, u5 C M I
365 ADDSUB w6 = u6, v6 C M I
C Long path (8 < n at entry): preload u1..u5 / v1..v5 while computing p7
C and w6 as on the short path.
C NOTE(review): no branch out of .grt5 is visible here; confirm where it
C joins the main loop.
368 .grt5: ld8 v1 = [vp], 8 C M01
369 ld8 u1 = [up], 8 C M01
371 ld8 v2 = [vp], 8 C M01
372 ld8 u2 = [up], 8 C M01
375 ld8 v3 = [vp], 8 C M01
376 ld8 u3 = [up], 8 C M01
377 cmp.PRED p7, p0 = w5, u5 C M I
379 ld8 v4 = [vp], 8 C M01
380 ld8 u4 = [up], 8 C M01
381 ADDSUB w6 = u6, v6 C M I
384 ld8 v5 = [vp], 8 C M01
386 ld8 u5 = [up], 8 C M01
C Feed-in block for r14 = 6.  Loads five more limb pairs (u4..u7,u0 and
C v4..v7,v0) and sets rpx = rp + 24.  The preloaded first limbs (r10, r11)
C produce w3 with its carry-out in p9, and w4 is formed from u4/v4.
389 .Lb110: ld8 v4 = [vp], 8 C M01
390 ld8 u4 = [up], 8 C M01
391 add rpx = 24, rp C M I
393 ld8 v5 = [vp], 8 C M01
394 ld8 u5 = [up], 8 C M01
395 ADDSUB w3 = r10, r11 C M I
397 ld8 v6 = [vp], 8 C M01
398 ld8 u6 = [up], 8 C M01
401 ld8 v7 = [vp], 8 C M01
402 ld8 u7 = [up], 8 C M01
403 cmp.PRED p9, p0 = w3, r10 C M I
405 ld8 v0 = [vp], 8 C M01
406 ld8 u0 = [up], 8 C M01
407 ADDSUB w4 = u4, v4 C M I
C When the entry test 8 < n was false (p14), go to the wind-down code at
C .Lcj67, which stores w3 and continues the carry chain from w4.
408 (p14) br.cond.dptk .Lcj67 C B
C Otherwise preload u1..u3 / v1..v3 for the main loop.  p9 and w4 are
C computed again here with the same operands as above.
C NOTE(review): no branch out of .grt6 is visible here; confirm where it
C joins the main loop.
411 .grt6: ld8 v1 = [vp], 8 C M01
412 ld8 u1 = [up], 8 C M01
414 cmp.PRED p9, p0 = w3, r10 C M I
418 ld8 v2 = [vp], 8 C M01
419 ld8 u2 = [up], 8 C M01
420 ADDSUB w4 = u4, v4 C M I
423 ld8 v3 = [vp], 8 C M01
425 ld8 u3 = [up], 8 C M01
C Feed-in block for r14 = 7.  Loads six more limb pairs (u3..u7,u0 and
C v3..v7,v0) and sets rpx = rp + 32.  The preloaded first limbs (r10, r11)
C produce w2 with its carry-out in p8; w3 is formed from u3/v3 with its
C carry-out in p9.
428 .Lb111: ld8 v3 = [vp], 8 C M01
429 ld8 u3 = [up], 8 C M01
430 add rpx = 32, rp C M I
432 ld8 v4 = [vp], 8 C M01
433 ld8 u4 = [up], 8 C M01
434 ADDSUB w2 = r10, r11 C M I
436 ld8 v5 = [vp], 8 C M01
437 ld8 u5 = [up], 8 C M01
438 cmp.PRED p8, p0 = w2, r10 C M I
440 ld8 v6 = [vp], 8 C M01
441 ld8 u6 = [up], 8 C M01
442 ADDSUB w3 = u3, v3 C M I
444 ld8 v7 = [vp], 8 C M01
445 ld8 u7 = [up], 8 C M01
446 cmp.PRED p9, p0 = w3, u3 C M I
448 ld8 v0 = [vp], 8 C M01
449 ld8 u0 = [up], 8 C M01
450 (p15) br.cond.dpnt .grt7 C B
C Short path (p15 false): store w2, fold its carry (p8) into w3 and into
C p9, then form w4.
C NOTE(review): no branch is visible after these instructions; presumably
C control continues in the wind-down code -- confirm.
452 st8 [rp] = w2, 8 C M23
453 (p8) cmp.eq.or p9, p0 = LIM, w3 C M I
454 (p8) add w3 = INCR, w3 C M I
455 ADDSUB w4 = u4, v4 C M I
C Long path (8 < n at entry): the same carry fold, store and w4 computation
C as the short path, interleaved with preloading u1..u3 / v1..v3.
C NOTE(review): no branch out of .grt7 is visible here; confirm where it
C joins the main loop.
458 .grt7: ld8 v1 = [vp], 8 C M01
459 ld8 u1 = [up], 8 C M01
461 (p8) cmp.eq.or p9, p0 = LIM, w3 C M I
466 ld8 v2 = [vp], 8 C M01
468 ld8 u2 = [up], 8 C M01
469 (p8) add w3 = INCR, w3 C M I
472 ld8 v3 = [vp], 8 C M01
473 ld8 u3 = [up], 8 C M01
475 st8 [rp] = w2, 8 C M23
476 ADDSUB w4 = u4, v4 C M I
479 C *** MAIN LOOP START ***
C Each pass stores eight result limbs (w0..w7) and loads eight new limb
C pairs.  The same three-step pattern repeats for every limb k:
C   ADDSUB   wk = uk, vk                raw result
C   cmp.PRED pk = wk, uk                carry-out of the raw result
C   (pj) cmp.eq.or pk = LIM, wk         incoming carry pj also carries out
C   (pj) add wk = INCR, wk              apply the incoming carry
C where pj is the predicate of the previous limb; p6..p9 rotate through
C the chain.
C Stores alternate between two pointers.  rp advances 8, 16, 8, 8, 16, 8
C and rpx advances 32 twice, so each moves 64 bytes per pass and the 16-byte
C steps of rp skip the slots written through rpx.
C The labels .LL000, .LL11x, .LL101, .LL100 and .LL01x sit in the middle of
C the loop body.
C NOTE(review): they are presumably entry points for the feed-in blocks,
C but no branch to them is visible in this chunk -- confirm.
481 .Loop: ld8 v1 = [vp], 8 C M01
482 cmp.PRED p7, p0 = w1, u1 C M I
483 (p9) cmp.eq.or p6, p0 = LIM, w0 C M I
484 ld8 u1 = [up], 8 C M01
485 (p9) add w0 = INCR, w0 C M I
486 ADDSUB w2 = u2, v2 C M I
488 ld8 v2 = [vp], 8 C M01
489 cmp.PRED p8, p0 = w2, u2 C M I
490 (p6) cmp.eq.or p7, p0 = LIM, w1 C M I
491 ld8 u2 = [up], 8 C M01
492 (p6) add w1 = INCR, w1 C M I
493 ADDSUB w3 = u3, v3 C M I
495 st8 [rp] = w0, 8 C M23
496 ld8 v3 = [vp], 8 C M01
497 cmp.PRED p9, p0 = w3, u3 C M I
498 (p7) cmp.eq.or p8, p0 = LIM, w2 C M I
499 ld8 u3 = [up], 8 C M01
500 (p7) add w2 = INCR, w2 C M I
502 .LL000: st8 [rp] = w1, 16 C M23
503 st8 [rpx] = w2, 32 C M23
504 (p8) cmp.eq.or p9, p0 = LIM, w3 C M I
506 (p8) add w3 = INCR, w3 C M I
507 ADDSUB w4 = u4, v4 C M I
509 .LL11x: st8 [rp] = w3, 8 C M23
510 ld8 v4 = [vp], 8 C M01
511 cmp.PRED p6, p0 = w4, u4 C M I
512 ld8 u4 = [up], 8 C M01
513 ADDSUB w5 = u5, v5 C M I
515 ld8 v5 = [vp], 8 C M01
516 cmp.PRED p7, p0 = w5, u5 C M I
517 (p9) cmp.eq.or p6, p0 = LIM, w4 C M I
518 ld8 u5 = [up], 8 C M01
519 (p9) add w4 = INCR, w4 C M I
520 ADDSUB w6 = u6, v6 C M I
522 .LL101: ld8 v6 = [vp], 8 C M01
523 cmp.PRED p8, p0 = w6, u6 C M I
524 (p6) cmp.eq.or p7, p0 = LIM, w5 C M I
525 ld8 u6 = [up], 8 C M01
526 (p6) add w5 = INCR, w5 C M I
527 ADDSUB w7 = u7, v7 C M I
529 st8 [rp] = w4, 8 C M23
530 ld8 v7 = [vp], 8 C M01
531 cmp.PRED p9, p0 = w7, u7 C M I
532 (p7) cmp.eq.or p8, p0 = LIM, w6 C M I
533 ld8 u7 = [up], 8 C M01
534 (p7) add w6 = INCR, w6 C M I
536 .LL100: st8 [rp] = w5, 16 C M23
537 st8 [rpx] = w6, 32 C M23
538 (p8) cmp.eq.or p9, p0 = LIM, w7 C M I
540 (p8) add w7 = INCR, w7 C M I
541 ADDSUB w0 = u0, v0 C M I
543 .LL01x: st8 [rp] = w7, 8 C M23
544 ld8 v0 = [vp], 8 C M01
545 cmp.PRED p6, p0 = w0, u0 C M I
546 ld8 u0 = [up], 8 C M01
547 ADDSUB w1 = u1, v1 C M I
C Counted-loop branch: repeats while the loop counter (ar.lc) is non-zero.
548 br.cloop.dptk .Loop C B
550 C *** MAIN LOOP END ***
C Wind-down code: the same carry-chain steps as the main loop body, but
C with no further loads.  It finishes and stores the limbs already held in
C u/v registers.  Falling out of the loop enters at the top.  The .Lcj*
C labels are later entry points; within this chunk .Lcj8 and .Lcj67 are
C targeted directly by the .Lb000 and .Lb110 feed-in blocks.
552 cmp.PRED p7, p0 = w1, u1 C M I
553 (p9) cmp.eq.or p6, p0 = LIM, w0 C M I
554 (p9) add w0 = INCR, w0 C M I
555 ADDSUB w2 = u2, v2 C M I
557 .Lcj9: cmp.PRED p8, p0 = w2, u2 C M I
558 (p6) cmp.eq.or p7, p0 = LIM, w1 C M I
559 st8 [rp] = w0, 8 C M23
560 (p6) add w1 = INCR, w1 C M I
561 ADDSUB w3 = u3, v3 C M I
563 cmp.PRED p9, p0 = w3, u3 C M I
564 (p7) cmp.eq.or p8, p0 = LIM, w2 C M I
565 (p7) add w2 = INCR, w2 C M I
567 .Lcj8: st8 [rp] = w1, 16 C M23
568 st8 [rpx] = w2, 32 C M23
569 (p8) cmp.eq.or p9, p0 = LIM, w3 C M I
570 (p8) add w3 = INCR, w3 C M I
571 ADDSUB w4 = u4, v4 C M I
573 .Lcj67: st8 [rp] = w3, 8 C M23
574 cmp.PRED p6, p0 = w4, u4 C M I
575 ADDSUB w5 = u5, v5 C M I
577 cmp.PRED p7, p0 = w5, u5 C M I
578 (p9) cmp.eq.or p6, p0 = LIM, w4 C M I
579 (p9) add w4 = INCR, w4 C M I
580 ADDSUB w6 = u6, v6 C M I
582 .Lcj5: cmp.PRED p8, p0 = w6, u6 C M I
583 (p6) cmp.eq.or p7, p0 = LIM, w5 C M I
584 st8 [rp] = w4, 8 C M23
585 (p6) add w5 = INCR, w5 C M I
586 ADDSUB w7 = u7, v7 C M I
588 .Lcj4: cmp.PRED p9, p0 = w7, u7 C M I
589 (p7) cmp.eq.or p8, p0 = LIM, w6 C M I
590 (p7) add w6 = INCR, w6 C M I
592 st8 [rp] = w5, 16 C M23
593 st8 [rpx] = w6, 32 C M23
595 (p8) cmp.eq.or p9, p0 = LIM, w7 C M I
596 (p8) add w7 = INCR, w7 C M I
597 ADDSUB w0 = u0, v0 C M I
599 .Lcj2: st8 [rp] = w7, 8 C M23
600 cmp.PRED p6, p0 = w0, u0 C M I
602 (p9) cmp.eq.or p6, p0 = LIM, w0 C M I
603 (p9) add w0 = INCR, w0 C M I
C Final limb and return: store w0, restore the loop-count register from r2
C and return through b0.  r8 is set to 1 when the last limb produced a
C carry-out (p6).
C NOTE(review): r8 is also w2/w6 and is used as a result register above.
C No instruction clearing r8 for the p6-false case, and no save of ar.lc
C into r2, is visible in this chunk -- confirm both exist.
606 .Lcj1: st8 [rp] = w0, 8 C M23
607 mov.i ar.lc = r2 C I0
608 (p6) mov r8 = 1 C M I
609 br.ret.sptk.many b0 C B