1 dnl IA-64 mpn_submul_1 -- Multiply a limb vector with a limb and subtract the
2 dnl result from a second limb vector.
4 dnl Copyright 2000, 2001, 2002, 2003, 2004 Free Software Foundation, Inc.
6 dnl This file is part of the GNU MP Library.
8 dnl The GNU MP Library is free software; you can redistribute it and/or modify
9 dnl it under the terms of the GNU Lesser General Public License as published
10 dnl by the Free Software Foundation; either version 3 of the License, or (at
11 dnl your option) any later version.
13 dnl The GNU MP Library is distributed in the hope that it will be useful, but
14 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16 dnl License for more details.
18 dnl You should have received a copy of the GNU Lesser General Public License
19 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
21 include(`../config.m4')
25 C Itanium 2: 2.25 (alignment dependent, sometimes it seems to need 3 c/l)
28 C * Optimize feed-in and wind-down code, both for speed and code size.
29 C * Handle low limb input and results specially, using a common stf8 in the
31 C * Delay r8, r10 initialization, put cmp-p6 in 1st bundle and br .Ldone in
32 C 2nd bundle. This will allow the bbb bundle to be one cycle earlier and
42 PROLOGUE(mpn_submul_1)
48 ` addp4 rp = 0, rp C M I
49 addp4 up = 0, up C M I
56 sub vl = r0, vl C M I negate vl
61 add r19 = -1, n C M I n - 1
65 cmp.eq p6, p0 = 0, vl C M I
66 mov r8 = 0 C M I zero cylimb
70 setf.sig f6 = vl C M2 M3
72 shr.u r19 = r19, 2 C I0
77 cmp.eq p10, p0 = 0, r14 C M I
78 (p6) br.spnt .Ldone C B vl == 0
81 cmp.eq p11, p0 = 2, r14 C M I
82 cmp.eq p12, p0 = 3, r14 C M I
86 (p10) br.dptk .Lb00 C B
87 (p11) br.dptk .Lb10 C B
88 (p12) br.dptk .Lb11 C B
92 .Lb01: br.cloop.dptk .grt1
94 xma.l f39 = f7, f6, f8
95 xma.hu f43 = f7, f6, f8
97 getf.sig r27 = f39 C lo
98 getf.sig r31 = f43 C hi
102 .grt1: ldf8 f44 = [rp], 8
109 xma.l f39 = f7, f6, f8
111 xma.hu f43 = f7, f6, f8
114 xma.l f36 = f32, f6, f44
116 xma.hu f40 = f32, f6, f44
120 getf.sig r27 = f39 C lo
121 xma.l f37 = f33, f6, f45
123 xma.hu f41 = f33, f6, f45
125 getf.sig r31 = f43 C hi
126 getf.sig r24 = f36 C lo
127 xma.l f38 = f34, f6, f46
129 xma.hu f42 = f34, f6, f46
131 getf.sig r28 = f40 C hi
132 getf.sig r25 = f37 C lo
133 xma.l f39 = f35, f6, f47
135 xma.hu f43 = f35, f6, f47
137 getf.sig r29 = f41 C hi
138 getf.sig r26 = f38 C lo
142 .grt5: ldf8 f44 = [rp], 8
145 getf.sig r27 = f39 C lo
146 xma.l f37 = f33, f6, f45
148 xma.hu f41 = f33, f6, f45
151 getf.sig r31 = f43 C hi
154 getf.sig r24 = f36 C lo
155 xma.l f38 = f34, f6, f46
157 xma.hu f42 = f34, f6, f46
160 getf.sig r28 = f40 C hi
163 getf.sig r25 = f37 C lo
164 xma.l f39 = f35, f6, f47
166 xma.hu f43 = f35, f6, f47
169 getf.sig r29 = f41 C hi
172 getf.sig r26 = f38 C lo
173 xma.l f36 = f32, f6, f44
175 xma.hu f40 = f32, f6, f44
180 .Lb10: ldf8 f47 = [rp], 8
184 xma.l f38 = f7, f6, f8
185 xma.hu f42 = f7, f6, f8
187 xma.l f39 = f35, f6, f47
188 xma.hu f43 = f35, f6, f47
190 getf.sig r26 = f38 C lo
191 getf.sig r30 = f42 C hi
194 getf.sig r27 = f39 C lo
195 getf.sig r31 = f43 C hi
199 .grt2: ldf8 f44 = [rp], 8
204 xma.l f38 = f7, f6, f8
205 xma.hu f42 = f7, f6, f8
209 xma.l f39 = f35, f6, f47
210 xma.hu f43 = f35, f6, f47
215 getf.sig r26 = f38 C lo
216 xma.l f36 = f32, f6, f44
218 xma.hu f40 = f32, f6, f44
221 getf.sig r30 = f42 C hi
223 getf.sig r27 = f39 C lo
224 xma.l f37 = f33, f6, f45
226 xma.hu f41 = f33, f6, f45
228 getf.sig r31 = f43 C hi
229 getf.sig r24 = f36 C lo
230 xma.l f38 = f34, f6, f46
232 xma.hu f42 = f34, f6, f46
234 getf.sig r28 = f40 C hi
235 getf.sig r25 = f37 C lo
236 xma.l f39 = f35, f6, f47
238 xma.hu f43 = f35, f6, f47
241 .grt6: ldf8 f44 = [rp], 8
242 getf.sig r30 = f42 C hi
245 getf.sig r27 = f39 C lo
246 xma.l f37 = f33, f6, f45
248 xma.hu f41 = f33, f6, f45
251 getf.sig r31 = f43 C hi
254 getf.sig r24 = f36 C lo
255 xma.l f38 = f34, f6, f46
257 xma.hu f42 = f34, f6, f46
260 getf.sig r28 = f40 C hi
263 getf.sig r25 = f37 C lo
264 xma.l f39 = f35, f6, f47
266 xma.hu f43 = f35, f6, f47
270 .Lb11: ldf8 f46 = [rp], 8
277 xma.l f37 = f7, f6, f8
278 xma.hu f41 = f7, f6, f8
280 xma.l f38 = f34, f6, f46
281 xma.hu f42 = f34, f6, f46
283 getf.sig r25 = f37 C lo
284 xma.l f39 = f35, f6, f47
285 xma.hu f43 = f35, f6, f47
287 getf.sig r29 = f41 C hi
290 getf.sig r26 = f38 C lo
291 getf.sig r30 = f42 C hi
294 getf.sig r27 = f39 C lo
295 getf.sig r31 = f43 C hi
299 .grt3: ldf8 f44 = [rp], 8
300 xma.l f37 = f7, f6, f8
302 xma.hu f41 = f7, f6, f8
305 xma.l f38 = f34, f6, f46
307 xma.hu f42 = f34, f6, f46
312 getf.sig r25 = f37 C lo
313 xma.l f39 = f35, f6, f47
315 xma.hu f43 = f35, f6, f47
318 getf.sig r29 = f41 C hi
321 getf.sig r26 = f38 C lo
322 xma.l f36 = f32, f6, f44
324 xma.hu f40 = f32, f6, f44
328 getf.sig r30 = f42 C hi
329 getf.sig r27 = f39 C lo
330 xma.l f37 = f33, f6, f45
332 xma.hu f41 = f33, f6, f45
334 getf.sig r31 = f43 C hi
335 getf.sig r24 = f36 C lo
336 xma.l f38 = f34, f6, f46
338 xma.hu f42 = f34, f6, f46
341 .grt7: ldf8 f44 = [rp], 8
342 getf.sig r30 = f42 C hi
345 getf.sig r27 = f39 C lo
346 xma.l f37 = f33, f6, f45
348 xma.hu f41 = f33, f6, f45
351 getf.sig r31 = f43 C hi
354 getf.sig r24 = f36 C lo
355 xma.l f38 = f34, f6, f46
357 xma.hu f42 = f34, f6, f46
361 .Lb00: ldf8 f45 = [rp], 8
368 xma.l f36 = f7, f6, f8
370 xma.hu f40 = f7, f6, f8
373 xma.l f37 = f33, f6, f45
374 xma.hu f41 = f33, f6, f45
376 getf.sig r24 = f36 C lo
377 xma.l f38 = f34, f6, f46
379 xma.hu f42 = f34, f6, f46
381 getf.sig r28 = f40 C hi
382 xma.l f39 = f35, f6, f47
383 getf.sig r25 = f37 C lo
385 xma.hu f43 = f35, f6, f47
387 getf.sig r29 = f41 C hi
388 getf.sig r26 = f38 C lo
391 getf.sig r30 = f42 C hi
392 getf.sig r27 = f39 C lo
396 .grt4: ldf8 f44 = [rp], 8
397 xma.l f37 = f33, f6, f45
399 xma.hu f41 = f33, f6, f45
403 xma.l f38 = f34, f6, f46
404 getf.sig r24 = f36 C lo
406 xma.hu f42 = f34, f6, f46
409 getf.sig r28 = f40 C hi
411 xma.l f39 = f35, f6, f47
412 getf.sig r25 = f37 C lo
414 xma.hu f43 = f35, f6, f47
417 getf.sig r29 = f41 C hi
420 getf.sig r26 = f38 C lo
421 xma.l f36 = f32, f6, f44
423 xma.hu f40 = f32, f6, f44
427 getf.sig r30 = f42 C hi
428 getf.sig r27 = f39 C lo
429 xma.l f37 = f33, f6, f45
431 xma.hu f41 = f33, f6, f45
434 .grt8: ldf8 f44 = [rp], 8
435 getf.sig r30 = f42 C hi
438 getf.sig r27 = f39 C lo
439 xma.l f37 = f33, f6, f45
441 xma.hu f41 = f33, f6, f45
448 cmp.ltu p6, p0 = r27, r8 C lo cmp
449 sub r14 = r27, r8 C lo sub
452 getf.sig r30 = f42 C hi
454 sub r8 = r20, r31 C hi sub
458 getf.sig r27 = f39 C lo
460 xma.l f37 = f33, f6, f45
464 xma.hu f41 = f33, f6, f45
469 .LL00: ldf8 f45 = [rp], 8
470 cmp.ltu p6, p0 = r24, r8
474 getf.sig r31 = f43 C hi
480 getf.sig r24 = f36 C lo
482 xma.l f38 = f34, f6, f46
486 xma.hu f42 = f34, f6, f46
491 .LL11: ldf8 f46 = [rp], 8
492 cmp.ltu p6, p0 = r25, r8
496 getf.sig r28 = f40 C hi
502 getf.sig r25 = f37 C lo
504 xma.l f39 = f35, f6, f47
508 xma.hu f43 = f35, f6, f47
513 .LL10: ldf8 f47 = [rp], 8
514 cmp.ltu p6, p0 = r26, r8
518 getf.sig r29 = f41 C hi
524 getf.sig r26 = f38 C lo
526 xma.l f36 = f32, f6, f44
530 xma.hu f40 = f32, f6, f44
537 cmp.ltu p6, p0 = r27, r8
544 xma.l f37 = f33, f6, f45
546 xma.hu f41 = f33, f6, f45
550 cmp.ltu p6, p0 = r24, r8
557 xma.l f38 = f34, f6, f46
559 xma.hu f42 = f34, f6, f46
563 cmp.ltu p6, p0 = r25, r8
570 xma.l f39 = f35, f6, f47
572 xma.hu f43 = f35, f6, f47
576 cmp.ltu p6, p0 = r26, r8
587 cmp.ltu p6, p0 = r27, r8
598 cmp.ltu p6, p0 = r24, r8
607 cmp.ltu p6, p0 = r25, r8
615 cmp.ltu p6, p0 = r26, r8
623 cmp.ltu p6, p0 = r27, r8
631 .Ldone: mov ar.lc = r2