1 dnl IA-64 mpn_mul_1, mpn_mul_1c -- Multiply a limb vector with a limb and
2 dnl store the result in a second limb vector.
4 dnl Copyright 2000, 2001, 2002, 2003, 2004, 2006, 2007 Free Software
7 dnl This file is part of the GNU MP Library.
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of the GNU Lesser General Public License as published
11 dnl by the Free Software Foundation; either version 3 of the License, or (at
12 dnl your option) any later version.
14 dnl The GNU MP Library is distributed in the hope that it will be useful, but
15 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
17 dnl License for more details.
19 dnl You should have received a copy of the GNU Lesser General Public License
20 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
22 include(`../config.m4')
29 C * Further optimize feed-in and wind-down code, both for speed and code size.
30 C * Handle low limb input and results specially, using a common stf8 in the epilogue.
32 C * Use 1 c/l carry propagation scheme in wind-down code.
33 C * Use extra pointer register for `up' to speed up feed-in loads.
34 C * Work out final differences with addmul_1.asm.
C Register alias: cy names r36. In the visible code it is read only by the
C setf.sig f9 = cy instruction of the carry-in entry sequence.
41 define(`cy', `r36') C for mpn_mul_1c
C Entry sequence, residue dispatch and single-limb return path of the
C limb-vector by limb multiply.
C NOTE(review): this view is missing lines. The function prologue, bundle
C templates, stop bits and the instructions that set f7, f9 and r14 are not
C shown, so the comments below describe only the visible instructions.
C
C addp4 widens the rp and up pointers from 32 to 64 bits. The leading m4
C quote suggests these two lines belong to a 32-bit ABI conditional whose
C opening and closing lines are not visible -- TODO confirm.
50 ` addp4 rp = 0, rp C M I
51 addp4 up = 0, up C M I
C r15 = n - 1
56 adds r15 = -1, n C M I
C f6 = multiplier limb vl, placed in an FP register for use by xma.
C r31 = (n - 1) / 4, installed as the loop count in ar.lc further down.
68 setf.sig f6 = vl C M2 M3
69 shr.u r31 = r15, 2 C I0
C Pick the feed-in path from r14: 0 selects .Lb00, 2 selects .Lb10 and
C 3 selects .Lb11. Presumably r14 = n mod 4, computed on a line that is
C not visible -- TODO confirm.
70 cmp.eq p10, p0 = 0, r14 C M I
73 cmp.eq p11, p0 = 2, r14 C M I
74 cmp.eq p12, p0 = 3, r14 C M I
C r0 != r0 is always false, so these clear p6 and p8 and set p7 and p9.
C In the carry chain p6 and p8 select the add-with-plus-one forms, so this
C starts the chain with no carry pending.
79 cmp.ne p6, p7 = r0, r0 C M I
80 mov.i ar.lc = r31 C I0
81 cmp.ne p8, p9 = r0, r0 C M I
84 (p10) br.dptk .Lb00 C B
85 (p11) br.dptk .Lb10 C B
86 (p12) br.dptk .Lb11 C B
C Remaining residue. br.cloop branches and decrements ar.lc while ar.lc is
C nonzero, so .grt1 is taken when (n - 1) / 4 > 0. Otherwise fall through
C to the single product below.
91 br.cloop.dptk .grt1 C B
C f39 = low 64 bits and f43 = high 64 bits of f7 * f6 + f9.
C Presumably f7 is the first source limb and f9 the carry-in -- both are
C set on lines not visible in this function.
93 xma.l f39 = f7, f6, f9 C F
94 xma.hu f43 = f7, f6, f9 C F
C The high limb goes to r8, the integer return register, and the low limb
C is stored at rp.
96 getf.sig r8 = f43 C M2
97 stf8 [rp] = f39 C M2 M3
99 br.ret.sptk.many b0 C B
C Feed-in products for the path that follows the single-limb return.
C NOTE(review): presumably this is the .grt1 target of the br.cloop above.
C The label line, the loads into f32-f35 and any bundle or stop markers are
C not visible here -- TODO confirm against the full file.
C
C First limb: f39 = low and f43 = high of f7 * f6 + f9, so the f9 addend is
C folded into the very first product.
107 xma.l f39 = f7, f6, f9
108 xma.hu f43 = f7, f6, f9
C Each following pair multiplies one source limb f32..f35 by f6. xma.l
C writes the low 64 bits to f36..f39 and xma.hu writes the high 64 bits to
C f40..f43. The addend is f0, which always reads as zero.
113 xma.l f36 = f32, f6, f0
114 xma.hu f40 = f32, f6, f0
117 xma.l f37 = f33, f6, f0
118 xma.hu f41 = f33, f6, f0
122 xma.l f38 = f34, f6, f0
123 xma.hu f42 = f34, f6, f0
127 xma.l f39 = f35, f6, f0
128 xma.hu f43 = f35, f6, f0
C A second round of four products, writing the same destination registers.
135 xma.l f36 = f32, f6, f0
136 xma.hu f40 = f32, f6, f0
140 xma.l f37 = f33, f6, f0
141 xma.hu f41 = f33, f6, f0
145 xma.l f38 = f34, f6, f0
148 xma.hu f42 = f34, f6, f0
152 xma.l f39 = f35, f6, f0
155 xma.hu f43 = f35, f6, f0
C Feed-in path reached when r14 == 2 (branch on p11 in the dispatch code).
C The ldf8 loads one limb from up into f35 and advances up by 8 bytes.
C NOTE(review): further loads, branches and inner labels of this path are
C not visible in this view.
159 .Lb10: ldf8 f35 = [up], 8
C First limb product f7 * f6 + f9 lands in the f38 (low) and f42 (high) slot.
163 xma.l f38 = f7, f6, f9
164 xma.hu f42 = f7, f6, f9
C Here the second product takes f42, the high half of the first product,
C as its addend, so the carry is folded in by the multiply-add itself.
C Presumably this is the short path for exactly two limbs -- TODO confirm.
167 xma.l f39 = f35, f6, f42
168 xma.hu f43 = f35, f6, f42
C Same two leading products again, but with a zero addend (f0) on the
C second one. Presumably the longer path, where the carry is propagated by
C the integer carry chain instead -- TODO confirm.
180 xma.l f38 = f7, f6, f9
181 xma.hu f42 = f7, f6, f9
184 xma.l f39 = f35, f6, f0
185 xma.hu f43 = f35, f6, f0
C Rounds of plain limb products: low halves to f36..f39 and high halves to
C f40..f43.
191 xma.l f36 = f32, f6, f0
192 xma.hu f40 = f32, f6, f0
196 xma.l f37 = f33, f6, f0
197 xma.hu f41 = f33, f6, f0
201 xma.l f38 = f34, f6, f0
202 xma.hu f42 = f34, f6, f0
206 xma.l f39 = f35, f6, f0
207 xma.hu f43 = f35, f6, f0
212 xma.l f36 = f32, f6, f0
213 xma.hu f40 = f32, f6, f0
217 xma.l f37 = f33, f6, f0
220 xma.hu f41 = f33, f6, f0
224 xma.l f38 = f34, f6, f0
227 xma.hu f42 = f34, f6, f0
C Feed-in path reached when r14 == 3 (branch on p12 in the dispatch code).
C The ldf8 loads one limb from up into f34 and advances up by 8 bytes.
C NOTE(review): further loads, branches and inner labels of this path are
C not visible in this view.
231 .Lb11: ldf8 f34 = [up], 8
C First limb product f7 * f6 + f9 lands in the f37 (low) and f41 (high)
C slot, followed by two plain products with a zero addend (f0).
238 xma.l f37 = f7, f6, f9
239 xma.hu f41 = f7, f6, f9
240 xma.l f38 = f34, f6, f0
241 xma.hu f42 = f34, f6, f0
242 xma.l f39 = f35, f6, f0
243 xma.hu f43 = f35, f6, f0
C The same three products appear a second time. Presumably one copy serves
C a short three-limb case and the other the longer case -- TODO confirm.
255 xma.l f37 = f7, f6, f9
256 xma.hu f41 = f7, f6, f9
259 xma.l f38 = f34, f6, f0
260 xma.hu f42 = f34, f6, f0
264 xma.l f39 = f35, f6, f0
265 xma.hu f43 = f35, f6, f0
C Rounds of plain limb products: low halves to f36..f38 and high halves to
C f40..f42.
272 xma.l f36 = f32, f6, f0
274 xma.hu f40 = f32, f6, f0
277 xma.l f37 = f33, f6, f0
279 xma.hu f41 = f33, f6, f0
283 xma.l f38 = f34, f6, f0
284 xma.hu f42 = f34, f6, f0
289 xma.l f36 = f32, f6, f0
290 xma.hu f40 = f32, f6, f0
294 xma.l f37 = f33, f6, f0
297 xma.hu f41 = f33, f6, f0
C Feed-in path reached when r14 == 0 (branch on p10 in the dispatch code).
C The ldf8 loads one limb from up into f33 and advances up by 8 bytes.
C NOTE(review): further loads, branches and inner labels of this path are
C not visible in this view.
301 .Lb00: ldf8 f33 = [up], 8
C First limb product f7 * f6 + f9 lands in the f36 (low) and f40 (high)
C slot. The following products use a zero addend (f0).
307 xma.l f36 = f7, f6, f9
308 xma.hu f40 = f7, f6, f9
311 xma.l f37 = f33, f6, f0
312 xma.hu f41 = f33, f6, f0
313 xma.l f38 = f34, f6, f0
314 xma.hu f42 = f34, f6, f0
318 xma.l f39 = f35, f6, f0
320 xma.hu f43 = f35, f6, f0
C Products for f33..f35 appear a second time. Presumably this copy belongs
C to a different size case of the path -- TODO confirm.
330 xma.l f37 = f33, f6, f0
331 xma.hu f41 = f33, f6, f0
335 xma.l f38 = f34, f6, f0
336 xma.hu f42 = f34, f6, f0
340 xma.l f39 = f35, f6, f0
344 xma.hu f43 = f35, f6, f0
C Further plain limb products feeding the f36/f40 and f37/f41 slots.
349 xma.l f36 = f32, f6, f0
351 xma.hu f40 = f32, f6, f0
355 xma.l f37 = f33, f6, f0
356 xma.hu f41 = f33, f6, f0
361 xma.l f36 = f32, f6, f0
362 xma.hu f40 = f32, f6, f0
C Main loop body: four limb products per pass, interleaved with a
C four-step integer carry chain.
C
C Multiply side: each xma.l and xma.hu pair computes the low (f36..f39) and
C high (f40..f43) 64 bits of one source limb f32..f35 times f6, with a zero
C addend (f0).
C
C Carry side: each step forms r24 = low + high, plus 1 when a carry is
C pending, and then tests for carry-out by comparing r24 with the low
C addend. After an add with carry-in the test is unsigned r24 <= addend;
C after a plain add it is unsigned r24 < addend. The result is written to
C the other predicate pair, so p6/p7 and p8/p9 alternate from step to step.
C The register pairings are r18+r21, r19+r22, r16+r23 and r17+r20.
C
C The .pred.rel mutex directives tell the assembler that the two named
C predicates are never both true, so the paired predicated instructions
C that write the same target do not conflict.
C
C NOTE(review): presumably r16..r19 and r20..r23 receive the low and high
C product halves via getf.sig, and r24 is stored to rp, on lines that are
C not visible here, along with the loads, the loop label and the loop
C branch -- TODO confirm against the full file.
366 C *** MAIN LOOP START ***
369 .pred.rel "mutex",p6,p7
371 xma.l f36 = f32, f6, f0
C Carry-out test for the preceding r17 + r20 sum.
372 (p6) cmp.leu p8, p9 = r24, r17
374 xma.hu f40 = f32, f6, f0
375 (p7) cmp.ltu p8, p9 = r24, r17
378 .pred.rel "mutex",p8,p9
380 (p8) add r24 = r18, r21, 1
383 (p9) add r24 = r18, r21
386 .pred.rel "mutex",p8,p9
388 xma.l f37 = f33, f6, f0
C Carry-out test for the r18 + r21 sum.
389 (p8) cmp.leu p6, p7 = r24, r18
391 xma.hu f41 = f33, f6, f0
392 (p9) cmp.ltu p6, p7 = r24, r18
395 .pred.rel "mutex",p6,p7
397 (p6) add r24 = r19, r22, 1
400 (p7) add r24 = r19, r22
403 .pred.rel "mutex",p6,p7
405 xma.l f38 = f34, f6, f0
C Carry-out test for the r19 + r22 sum.
406 (p6) cmp.leu p8, p9 = r24, r19
408 xma.hu f42 = f34, f6, f0
409 (p7) cmp.ltu p8, p9 = r24, r19
412 .pred.rel "mutex",p8,p9
414 (p8) add r24 = r16, r23, 1
417 (p9) add r24 = r16, r23
420 .pred.rel "mutex",p8,p9
422 xma.l f39 = f35, f6, f0
C Carry-out test for the r16 + r23 sum.
423 (p8) cmp.leu p6, p7 = r24, r16
425 xma.hu f43 = f35, f6, f0
426 (p9) cmp.ltu p6, p7 = r24, r16
429 .pred.rel "mutex",p6,p7
C This sum is tested at the top of the next pass, or by the code that
C follows the loop.
431 (p6) add r24 = r17, r20, 1
434 (p7) add r24 = r17, r20
436 C *** MAIN LOOP END ***
C Code following the main loop: one more round of four products with the
C same carry chain, then carry-chain steps only, with no further multiplies.
C
C The carry steps work exactly as in the loop. r24 = low + high, plus 1 when
C the pending-carry predicate is set, followed by an unsigned compare of r24
C with the low addend: <= after an add with carry-in, < after a plain add.
C The result goes to the other predicate pair.
C
C NOTE(review): presumably this is the wind-down code mentioned in the TODO
C list. Its entry labels, the getf.sig moves, the stores of r24, the final
C return value and the return branch are not visible in this view -- TODO
C confirm against the full file.
440 .pred.rel "mutex",p6,p7
442 xma.l f36 = f32, f6, f0
443 (p6) cmp.leu p8, p9 = r24, r17
445 xma.hu f40 = f32, f6, f0
446 (p7) cmp.ltu p8, p9 = r24, r17
448 .pred.rel "mutex",p8,p9
450 (p8) add r24 = r18, r21, 1
451 (p9) add r24 = r18, r21
453 .pred.rel "mutex",p8,p9
455 xma.l f37 = f33, f6, f0
456 (p8) cmp.leu p6, p7 = r24, r18
458 xma.hu f41 = f33, f6, f0
459 (p9) cmp.ltu p6, p7 = r24, r18
462 .pred.rel "mutex",p6,p7
464 (p6) add r24 = r19, r22, 1
465 (p7) add r24 = r19, r22
467 .pred.rel "mutex",p6,p7
469 xma.l f38 = f34, f6, f0
470 (p6) cmp.leu p8, p9 = r24, r19
472 xma.hu f42 = f34, f6, f0
473 (p7) cmp.ltu p8, p9 = r24, r19
476 .pred.rel "mutex",p8,p9
478 (p8) add r24 = r16, r23, 1
479 (p9) add r24 = r16, r23
481 .pred.rel "mutex",p8,p9
483 xma.l f39 = f35, f6, f0
484 (p8) cmp.leu p6, p7 = r24, r16
486 xma.hu f43 = f35, f6, f0
487 (p9) cmp.ltu p6, p7 = r24, r16
490 .pred.rel "mutex",p6,p7
492 (p6) add r24 = r17, r20, 1
493 (p7) add r24 = r17, r20
C From here on only carry-chain steps remain. No more products are started.
495 .pred.rel "mutex",p6,p7
496 (p6) cmp.leu p8, p9 = r24, r17
497 (p7) cmp.ltu p8, p9 = r24, r17
502 .pred.rel "mutex",p8,p9
504 (p8) add r24 = r18, r21, 1
505 (p9) add r24 = r18, r21
507 .pred.rel "mutex",p8,p9
508 (p8) cmp.leu p6, p7 = r24, r18
509 (p9) cmp.ltu p6, p7 = r24, r18
514 .pred.rel "mutex",p6,p7
516 (p6) add r24 = r19, r22, 1
517 (p7) add r24 = r19, r22
519 .pred.rel "mutex",p6,p7
521 (p6) cmp.leu p8, p9 = r24, r19
522 (p7) cmp.ltu p8, p9 = r24, r19
525 .pred.rel "mutex",p8,p9
526 (p8) add r24 = r16, r23, 1
527 (p9) add r24 = r16, r23
529 .pred.rel "mutex",p8,p9
531 (p8) cmp.leu p6, p7 = r24, r16
532 (p9) cmp.ltu p6, p7 = r24, r16
535 .pred.rel "mutex",p6,p7
536 (p6) add r24 = r17, r20, 1
537 (p7) add r24 = r17, r20
539 .pred.rel "mutex",p6,p7
C Final carry-out test. p8 is set when the last sum carried.
541 (p6) cmp.leu p8, p9 = r24, r17
542 (p7) cmp.ltu p8, p9 = r24, r17
544 .pred.rel "mutex",p8,p9
C Entry sequence of the carry-in variant. It uses cy, which the alias
C definition marks as belonging to mpn_mul_1c.
C NOTE(review): this view ends before the sequence is complete. The
C prologue, the rest of the setup and the transfer into the shared code are
C not visible -- TODO confirm against the full file.
C
C addp4 widens the rp and up pointers from 32 to 64 bits. The leading m4
C quote suggests a 32-bit ABI conditional whose opening and closing lines
C are not visible -- TODO confirm.
556 ` addp4 rp = 0, rp C M I
557 addp4 up = 0, up C M I
C r15 = n - 1
562 adds r15 = -1, n C M I
C f9 = carry-in limb cy. The first-limb products use f9 as their addend.
563 setf.sig f9 = cy C M2 M3
C Copy ar.lc into r2. Presumably it is restored from r2 before returning,
C on a line not visible here -- TODO confirm.
564 mov.i r2 = ar.lc C I0
C f7 = limb at up, then up advances by 8 bytes.
567 ldf8 f7 = [up], 8 C M