1 dnl HP-PA 2.0 64-bit mpn_addmul_1 -- Multiply a limb vector with a limb and
2 dnl add the result to a second limb vector.
4 dnl Copyright 1998, 1999, 2000, 2002, 2003 Free Software Foundation, Inc.
6 dnl This file is part of the GNU MP Library.
8 dnl The GNU MP Library is free software; you can redistribute it and/or modify
9 dnl it under the terms of the GNU Lesser General Public License as published
10 dnl by the Free Software Foundation; either version 3 of the License, or (at
11 dnl your option) any later version.
13 dnl The GNU MP Library is distributed in the hope that it will be useful, but
14 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16 dnl License for more details.
18 dnl You should have received a copy of the GNU Lesser General Public License
19 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
21 include(`../config.m4')
25 C 8500,8600,8700: 6.375
27 C The feed-in and wind-down code has not yet been scheduled. Many cycles
28 C could be saved there per call.
31 C The main loop "BIG" is 4-way unrolled, mainly to allow
32 C effective use of ADD,DC. Delays in moving data via the cache from the FP
33 C registers to the IU registers have demanded a deep software pipeline, and
34 C a lot of stack slots for partial products in flight.
38 C do 0, 1, 2, or 3 limbs
39 C if done, restore-some-regs and return
45 C HP-PA stack grows upwards. We could allocate 8 fewer slots by using the
46 C slots marked FREE, as well as some slots in the caller's "frame marker".
83 C -38/-138 vlimb home slot. For 2.0N, the vlimb arg will arrive here.
86 include(`../config.m4')
C Register names for the scalar multiplier and the running carry limb.
C Both vlimb and climb expand to %r23.
92 define(`vlimb',`%r23') C
94 define(`climb',`%r23') C
C NOTE(review): only the opening line of the following ifdef is present in
C this view; its branches are not visible here.
96 ifdef(`HAVE_ABI_2_0w',
C mpn_addmul_1 entry point. Per the file header this multiplies a limb vector
C by a single limb and adds the result to a second limb vector.
C
C Overview of what the visible code does:
C   - saves %r3 and advances %r30 by 0x100 (std,ma here, ldd,mb at the end),
C   - moves vlimb into %fr8 through its stack home slot and zeroes climb,
C   - handles the n mod 4 leftover limbs one at a time, then
C   - processes the remaining limbs four per iteration in a software pipeline.
C Each 64x64 product is assembled from four 32x32 xmpyu products that travel
C from the FP registers to the integer registers through stack slots.
C
C NOTE(review): the numbers embedded at the start of each line are not
C contiguous, so some lines are absent from this view. In particular the
C branch-target labels, the loads into %fr4..%fr7, the loads of r000..r192 and
C the stores of s000..s192 are not visible. The comments below describe only
C the visible instructions.
100 PROLOGUE(mpn_addmul_1)
102 ifdef(`HAVE_ABI_2_0w',
103 ` std vlimb, -0x38(%r30) C store vlimb into "home" slot
C Save %r3 at the address in %r30, then advance %r30 by 0x100 (std,ma is a
C post-modify store). From here on the vlimb home slot is at -0x138(%r30).
105 std,ma %r3, 0x100(%r30)
108 ldo 0(%r0), climb C clear climb
109 fldd -0x138(%r30), %fr8 C put vlimb in fp register
C Register names for the one-limb-at-a-time code below.
C   p032a1, p032a2  the two mid partial products, reloaded from -0x78, -0x70
C   p000a, p064a    low and high partial products, from -0x80 and -0x68
C   m032, m096      sum of the two mid products and the carry out of that sum
C   ma000, ma064    the mid sum realigned by 32 bits:
C                   ma000 = m032 << 32, ma064 = (m096 << 32) | (m032 >> 32)
C   s000            running sum for the limb being finished
C   r000            addend for s000 (see the disabled copy of the loop body)
C m032 and ma064 both expand to %r20. r000 uses %r3, which was saved above.
111 define(`p032a1',`%r1') C
112 define(`p032a2',`%r19') C
114 define(`m032',`%r20') C
115 define(`m096',`%r21') C
117 define(`p000a',`%r22') C
118 define(`p064a',`%r29') C
120 define(`s000',`%r31') C
122 define(`ma000',`%r4') C
123 define(`ma064',`%r20') C
125 define(`r000',`%r3') C
C %r5 = n mod 4 (the low two bits of n). If that is zero, branch to L(BIG).
127 extrd,u n, 63, 2, %r5
128 cmpb,= %r5, %r0, L(BIG)
C First leftover limb. %fr8 holds vlimb; %fr4 is expected to hold the source
C limb (its load is not visible in this view). With L = upper 32 bits and
C R = lower 32 bits the four products are:
C   %fr22 = vlimb.R * u.L  (mid)     %fr23 = vlimb.L * u.R  (mid)
C   %fr24 = vlimb.R * u.R  (low)     %fr25 = vlimb.L * u.L  (high)
133 xmpyu %fr8R, %fr4L, %fr22
134 xmpyu %fr8L, %fr4R, %fr23
135 fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
136 xmpyu %fr8R, %fr4R, %fr24
137 xmpyu %fr8L, %fr4L, %fr25
138 fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
139 fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
C Decrement the leftover count and branch if it is still non-zero. The fstd
C that follows the branch sits in its delay slot.
140 addib,<> -1, %r5, L(two_or_more)
141 fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
C Fall-through: the leftover count reached zero. Fetch the four partial
C products of that single limb into integer registers.
143 ldd -0x78(%r30), p032a1
144 ldd -0x70(%r30), p032a2
145 ldd -0x80(%r30), p000a
147 ldd -0x68(%r30), p064a
C Second leftover limb: issue its multiplies while reloading the products of
C the previous limb. Each slot is read (ldd) before the new product is
C written to it (fstd).
152 xmpyu %fr8R, %fr4L, %fr22
153 xmpyu %fr8L, %fr4R, %fr23
154 ldd -0x78(%r30), p032a1
155 fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
156 xmpyu %fr8R, %fr4R, %fr24
157 xmpyu %fr8L, %fr4L, %fr25
158 ldd -0x70(%r30), p032a2
159 fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
160 ldd -0x80(%r30), p000a
161 fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
162 ldd -0x68(%r30), p064a
C Decrement the leftover count again and branch if a third limb remains.
163 addib,<> -1, %r5, L(three_or_more)
164 fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
C Combine the two mid products: m032 = sum, m096 = carry out of the sum,
C then split the 65-bit result across ma000 (m032 << 32) and ma064
C (m032 >> 32 in the low half, m096 in the high half).
166 add p032a1, p032a2, m032
167 add,dc %r0, %r0, m096
168 depd,z m032, 31, 32, ma000
169 extrd,u m032, 31, 32, ma064
172 depd m096, 31, 32, ma064
C Same mid-product combination as above, on another path.
176 add p032a1, p032a2, m032
177 add,dc %r0, %r0, m096
178 depd,z m032, 31, 32, ma000
179 extrd,u m032, 31, 32, ma064
181 C addib,= -1, %r5, L(0_out)
182 depd m096, 31, 32, ma064
C The lines down to the commented-out branch to L(loop0) are a disabled
C (fully commented-out) copy of a loop body.
184 C xmpyu %fr8R, %fr4L, %fr22
185 C xmpyu %fr8L, %fr4R, %fr23
186 C ldd -0x78(%r30), p032a1
187 C fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
189 C xmpyu %fr8R, %fr4R, %fr24
190 C xmpyu %fr8L, %fr4L, %fr25
191 C ldd -0x70(%r30), p032a2
192 C fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
195 C add climb, p000a, s000
196 C ldd -0x80(%r30), p000a
197 C fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
199 C add,dc p064a, %r0, climb
201 C ldd -0x68(%r30), p064a
202 C fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
204 C add ma000, s000, s000
205 C add,dc ma064, climb, climb
208 C add r000, s000, s000
209 C add,dc %r0, climb, climb
212 C add p032a1, p032a2, m032
213 C add,dc %r0, %r0, m096
215 C depd,z m032, 31, 32, ma000
216 C extrd,u m032, 31, 32, ma064
218 C addib,<> -1, %r5, L(loop0)
219 C depd m096, 31, 32, ma064
C Third leftover limb: start its multiplies, reload the products of the
C second limb, and finish the sum for the first limb.
222 xmpyu %fr8R, %fr4L, %fr22
223 xmpyu %fr8L, %fr4R, %fr23
224 ldd -0x78(%r30), p032a1
225 fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
226 xmpyu %fr8R, %fr4R, %fr24
227 xmpyu %fr8L, %fr4L, %fr25
228 ldd -0x70(%r30), p032a2
229 fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
C s000 = climb + low product. p000a is consumed before it is reloaded.
231 add climb, p000a, s000
232 ldd -0x80(%r30), p000a
233 fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
C climb = high product + carry. p064a is consumed before it is reloaded.
234 add,dc p064a, %r0, climb
235 ldd -0x68(%r30), p064a
236 fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
C Add the realigned mid sum into s000 and climb.
237 add ma000, s000, s000
238 add,dc ma064, climb, climb
C Fold one more carry into climb. NOTE(review): the add that produces this
C carry is on a line not visible here; the disabled copy above pairs this
C instruction with add r000, s000, s000.
240 add,dc %r0, climb, climb
C Mid-product combination for the next limb to be finished.
242 add p032a1, p032a2, m032
243 add,dc %r0, %r0, m096
244 depd,z m032, 31, 32, ma000
245 extrd,u m032, 31, 32, ma064
247 depd m096, 31, 32, ma064
C Wind-down: no new multiplies. Reload the last stored products while
C finishing the limb whose products are already in integer registers.
249 ldd -0x78(%r30), p032a1
250 ldd -0x70(%r30), p032a2
252 add climb, p000a, s000
253 ldd -0x80(%r30), p000a
254 add,dc p064a, %r0, climb
255 ldd -0x68(%r30), p064a
256 add ma000, s000, s000
257 add,dc ma064, climb, climb
259 add,dc %r0, climb, climb
C Final leftover limb: combine its mid products and accumulate.
262 add p032a1, p032a2, m032
263 add,dc %r0, %r0, m096
264 depd,z m032, 31, 32, ma000
265 extrd,u m032, 31, 32, ma064
267 depd m096, 31, 32, ma064
269 add climb, p000a, s000
270 add,dc p064a, %r0, climb
271 add ma000, s000, s000
272 add,dc ma064, climb, climb
274 add,dc %r0, climb, climb
C Branch to L(done) when 4 >= n. This point is reached only when n mod 4 is
C non-zero, so the branch is taken when fewer than four limbs were requested.
277 cmpib,>= 4, n, L(done)
280 C 4-way unrolled code.
C Register names are redefined for the 4-way code. The letters a, b, c, d in
C the names refer to the four limbs of a group, multiplied from %fr4, %fr5,
C %fr6 and %fr7 respectively.
C   pNNNx1, pNNNx2  the two mid partial products of limb x
C   mNNN            carry-chained sums of the mid product pairs (m288 = carry)
C   p000a .. p256d  low and high partial products of the four limbs
C   sNNN            the four result limbs being accumulated
C   maNNN           the mid sums realigned by 32 bits
C   rNNN            addends accumulated into sNNN (loads not visible here)
C Several names expand to the same register, e.g. p032a1, p000a and r000 are
C all %r1 and m032 and ma064 are both %r4; presumably their uses do not
C overlap in time -- TODO confirm against the complete file.
284 define(`p032a1',`%r1') C
285 define(`p032a2',`%r19') C
286 define(`p096b1',`%r20') C
287 define(`p096b2',`%r21') C
288 define(`p160c1',`%r22') C
289 define(`p160c2',`%r29') C
290 define(`p224d1',`%r31') C
291 define(`p224d2',`%r3') C
293 define(`m032',`%r4') C
294 define(`m096',`%r5') C
295 define(`m160',`%r6') C
296 define(`m224',`%r7') C
297 define(`m288',`%r8') C
299 define(`p000a',`%r1') C
300 define(`p064a',`%r19') C
301 define(`p064b',`%r20') C
302 define(`p128b',`%r21') C
303 define(`p128c',`%r22') C
304 define(`p192c',`%r29') C
305 define(`p192d',`%r31') C
306 define(`p256d',`%r3') C
308 define(`s000',`%r10') C
309 define(`s064',`%r11') C
310 define(`s128',`%r12') C
311 define(`s192',`%r13') C
313 define(`ma000',`%r9') C
314 define(`ma064',`%r4') C
315 define(`ma128',`%r5') C
316 define(`ma192',`%r6') C
317 define(`ma256',`%r7') C
319 define(`r000',`%r1') C
320 define(`r064',`%r19') C
321 define(`r128',`%r20') C
322 define(`r192',`%r21') C
C Save %r10..%r13 (used as s000..s192) in the frame; they are reloaded from
C the same slots near the end of the function.
328 std %r10, -0xc8(%r30)
329 std %r11, -0xc0(%r30)
330 std %r12, -0xb8(%r30)
331 std %r13, -0xb0(%r30)
C Convert n from a limb count to a count of 4-limb groups.
333 ifdef(`HAVE_ABI_2_0w',
334 ` extrd,u n, 61, 62, n C right shift 2
335 ',` extrd,u n, 61, 30, n C right shift 2, zero extend
C First group: sixteen 32x32 products (two mid, one low, one high per limb).
C Mid products are stored as soon as they are produced, and %fr22..%fr27 are
C reused for low and high products once their mid products have been stored.
343 xmpyu %fr8R, %fr4L, %fr22
344 xmpyu %fr8L, %fr4R, %fr23
345 xmpyu %fr8R, %fr5L, %fr24
346 xmpyu %fr8L, %fr5R, %fr25
347 xmpyu %fr8R, %fr6L, %fr26
348 xmpyu %fr8L, %fr6R, %fr27
349 fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
350 xmpyu %fr8R, %fr7L, %fr28
351 xmpyu %fr8L, %fr7R, %fr29
352 fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
353 xmpyu %fr8R, %fr4R, %fr30
354 xmpyu %fr8L, %fr4L, %fr31
355 fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31
356 xmpyu %fr8R, %fr5R, %fr22
357 xmpyu %fr8L, %fr5L, %fr23
358 fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29
359 xmpyu %fr8R, %fr6R, %fr24
360 xmpyu %fr8L, %fr6L, %fr25
361 fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51
362 xmpyu %fr8R, %fr7R, %fr26
363 fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49
C Decrement the group count and branch if another group follows. The xmpyu
C after the branch sits in its delay slot.
364 addib,<> -1, n, L(8_or_more)
365 xmpyu %fr8L, %fr7L, %fr27
C Fall-through (this was the only group): store the remaining products, then
C reload the eight mid products into integer registers.
366 fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
367 fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
368 fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
369 fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
370 fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
371 fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
372 fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
373 fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
374 fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
375 fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
376 ldd -0x78(%r30), p032a1
377 ldd -0x70(%r30), p032a2
378 ldd -0x38(%r30), p096b1
379 ldd -0x30(%r30), p096b2
380 ldd -0x58(%r30), p160c1
381 ldd -0x50(%r30), p160c2
382 ldd -0x18(%r30), p224d1
383 ldd -0x10(%r30), p224d2
C Same stores of the remaining first-group products, on the path where more
C groups follow.
388 fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
389 fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
391 fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
392 fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
393 fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
394 fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
395 fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
396 fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
397 fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
398 fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
C Second group: multiplies interleaved with loads of the mid products of the
C previous group. Each mid slot is loaded before the new product is stored.
403 xmpyu %fr8R, %fr4L, %fr22
404 ldd -0x78(%r30), p032a1
405 xmpyu %fr8L, %fr4R, %fr23
406 xmpyu %fr8R, %fr5L, %fr24
407 ldd -0x70(%r30), p032a2
408 xmpyu %fr8L, %fr5R, %fr25
409 xmpyu %fr8R, %fr6L, %fr26
410 ldd -0x38(%r30), p096b1
411 xmpyu %fr8L, %fr6R, %fr27
412 fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
413 xmpyu %fr8R, %fr7L, %fr28
414 ldd -0x30(%r30), p096b2
415 xmpyu %fr8L, %fr7R, %fr29
416 fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
417 xmpyu %fr8R, %fr4R, %fr30
418 ldd -0x58(%r30), p160c1
419 xmpyu %fr8L, %fr4L, %fr31
420 fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31
421 xmpyu %fr8R, %fr5R, %fr22
422 ldd -0x50(%r30), p160c2
423 xmpyu %fr8L, %fr5L, %fr23
424 fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29
425 xmpyu %fr8R, %fr6R, %fr24
426 ldd -0x18(%r30), p224d1
427 xmpyu %fr8L, %fr6L, %fr25
428 fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51
429 xmpyu %fr8R, %fr7R, %fr26
430 ldd -0x10(%r30), p224d2
431 fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49
C Decrement the group count and branch to L(end2) if it reached zero.
432 addib,= -1, n, L(end2)
433 xmpyu %fr8L, %fr7L, %fr27
C Pipelined body. While the products of the newest group are stored, the
C previous group is summed:
C   1. carry-chain the four mid product pairs into m032..m224, m288 = carry
C   2. realign the mid sums by 32 bits into ma000..ma256
C   3. carry-chain climb and the low and high products into s000..s192
C   4. add the realigned mid sums, then r000..r192, keeping carries in climb
C Low and high product slots are loaded before being overwritten.
435 add p032a1, p032a2, m032
436 ldd -0x80(%r30), p000a
437 add,dc p096b1, p096b2, m096
438 fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
440 add,dc p160c1, p160c2, m160
441 ldd -0x68(%r30), p064a
442 add,dc p224d1, p224d2, m224
443 fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
445 add,dc %r0, %r0, m288
446 ldd -0x40(%r30), p064b
448 fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
C ma000 = m032 << 32; each following maNNN takes the upper half of one mid
C sum in its low 32 bits and the lower half of the next in its high 32 bits.
450 depd,z m032, 31, 32, ma000
451 ldd -0x28(%r30), p128b
452 extrd,u m032, 31, 32, ma064
453 fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
455 depd m096, 31, 32, ma064
456 ldd -0x60(%r30), p128c
457 extrd,u m096, 31, 32, ma128
458 fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
460 depd m160, 31, 32, ma128
461 ldd -0x48(%r30), p192c
462 extrd,u m160, 31, 32, ma192
463 fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
465 depd m224, 31, 32, ma192
466 ldd -0x20(%r30), p192d
467 extrd,u m224, 31, 32, ma256
468 fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
470 depd m288, 31, 32, ma256
471 ldd -0x88(%r30), p256d
C Low/high chain: the incoming carry limb enters at s000 and the new carry
C limb is the high product of limb d plus the carry out of the chain.
472 add climb, p000a, s000
473 fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
475 add,dc p064a, p064b, s064
477 add,dc p128b, p128c, s128
478 fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
480 add,dc p192c, p192d, s192
482 add,dc p256d, %r0, climb
483 fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
486 add ma000, s000, s000 C accum mid 0
488 add,dc ma064, s064, s064 C accum mid 1
490 add,dc ma128, s128, s128 C accum mid 2
492 add,dc ma192, s192, s192 C accum mid 3
495 add,dc ma256, climb, climb
497 add r000, s000, s000 C accum rlimb 0
500 add,dc r064, s064, s064 C accum rlimb 1
501 add,dc r128, s128, s128 C accum rlimb 2
504 add,dc r192, s192, s192 C accum rlimb 3
505 add,dc %r0, climb, climb
C Multiplies for the next group, interleaved with loads of the mid products
C that were stored for the group before it.
508 xmpyu %fr8R, %fr4L, %fr22
509 ldd -0x78(%r30), p032a1
510 xmpyu %fr8L, %fr4R, %fr23
513 xmpyu %fr8R, %fr5L, %fr24
514 ldd -0x70(%r30), p032a2
515 xmpyu %fr8L, %fr5R, %fr25
518 xmpyu %fr8R, %fr6L, %fr26
519 ldd -0x38(%r30), p096b1
520 xmpyu %fr8L, %fr6R, %fr27
521 fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
523 xmpyu %fr8R, %fr7L, %fr28
524 ldd -0x30(%r30), p096b2
525 xmpyu %fr8L, %fr7R, %fr29
526 fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
528 xmpyu %fr8R, %fr4R, %fr30
529 ldd -0x58(%r30), p160c1
530 xmpyu %fr8L, %fr4L, %fr31
531 fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31
533 xmpyu %fr8R, %fr5R, %fr22
534 ldd -0x50(%r30), p160c2
535 xmpyu %fr8L, %fr5L, %fr23
536 fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29
538 xmpyu %fr8R, %fr6R, %fr24
539 ldd -0x18(%r30), p224d1
540 xmpyu %fr8L, %fr6L, %fr25
541 fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51
543 xmpyu %fr8R, %fr7R, %fr26
544 ldd -0x10(%r30), p224d2
545 fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49
546 xmpyu %fr8L, %fr7L, %fr27
C Decrement the group count and repeat while it is non-zero.
548 addib,<> -1, n, L(loop)
C Wind-down, first stage: no new multiplies. Store the products of the last
C multiplied group while summing the group before it, as in the body above.
552 add p032a1, p032a2, m032
553 ldd -0x80(%r30), p000a
554 add,dc p096b1, p096b2, m096
555 fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
556 add,dc p160c1, p160c2, m160
557 ldd -0x68(%r30), p064a
558 add,dc p224d1, p224d2, m224
559 fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
560 add,dc %r0, %r0, m288
561 ldd -0x40(%r30), p064b
562 fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
563 depd,z m032, 31, 32, ma000
564 ldd -0x28(%r30), p128b
565 extrd,u m032, 31, 32, ma064
566 fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
567 depd m096, 31, 32, ma064
568 ldd -0x60(%r30), p128c
569 extrd,u m096, 31, 32, ma128
570 fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
571 depd m160, 31, 32, ma128
572 ldd -0x48(%r30), p192c
573 extrd,u m160, 31, 32, ma192
574 fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
575 depd m224, 31, 32, ma192
576 ldd -0x20(%r30), p192d
577 extrd,u m224, 31, 32, ma256
578 fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
579 depd m288, 31, 32, ma256
580 ldd -0x88(%r30), p256d
581 add climb, p000a, s000
582 fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
583 add,dc p064a, p064b, s064
585 add,dc p128b, p128c, s128
586 fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
587 add,dc p192c, p192d, s192
589 add,dc p256d, %r0, climb
590 fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
592 add ma000, s000, s000 C accum mid 0
594 add,dc ma064, s064, s064 C accum mid 1
595 add,dc ma128, s128, s128 C accum mid 2
596 add,dc ma192, s192, s192 C accum mid 3
597 add,dc ma256, climb, climb
598 add r000, s000, s000 C accum rlimb 0
599 add,dc r064, s064, s064 C accum rlimb 1
600 add,dc r128, s128, s128 C accum rlimb 2
602 add,dc r192, s192, s192 C accum rlimb 3
603 add,dc %r0, climb, climb
C Reload the eight mid products of the final group.
605 ldd -0x78(%r30), p032a1
607 ldd -0x70(%r30), p032a2
609 ldd -0x38(%r30), p096b1
610 ldd -0x30(%r30), p096b2
611 ldd -0x58(%r30), p160c1
612 ldd -0x50(%r30), p160c2
613 ldd -0x18(%r30), p224d1
614 ldd -0x10(%r30), p224d2
C Wind-down, last stage: sum the final group. Nothing is stored to the
C product slots any more; they are only read.
618 add p032a1, p032a2, m032
619 ldd -0x80(%r30), p000a
620 add,dc p096b1, p096b2, m096
621 add,dc p160c1, p160c2, m160
622 ldd -0x68(%r30), p064a
623 add,dc p224d1, p224d2, m224
624 add,dc %r0, %r0, m288
625 ldd -0x40(%r30), p064b
626 depd,z m032, 31, 32, ma000
627 ldd -0x28(%r30), p128b
628 extrd,u m032, 31, 32, ma064
629 depd m096, 31, 32, ma064
630 ldd -0x60(%r30), p128c
631 extrd,u m096, 31, 32, ma128
632 depd m160, 31, 32, ma128
633 ldd -0x48(%r30), p192c
634 extrd,u m160, 31, 32, ma192
635 depd m224, 31, 32, ma192
636 ldd -0x20(%r30), p192d
637 extrd,u m224, 31, 32, ma256
638 depd m288, 31, 32, ma256
639 ldd -0x88(%r30), p256d
640 add climb, p000a, s000
641 add,dc p064a, p064b, s064
643 add,dc p128b, p128c, s128
644 add,dc p192c, p192d, s192
646 add,dc p256d, %r0, climb
648 add ma000, s000, s000 C accum mid 0
650 add,dc ma064, s064, s064 C accum mid 1
651 add,dc ma128, s128, s128 C accum mid 2
652 add,dc ma192, s192, s192 C accum mid 3
653 add,dc ma256, climb, climb
654 add r000, s000, s000 C accum rlimb 0
655 add,dc r064, s064, s064 C accum rlimb 1
656 add,dc r128, s128, s128 C accum rlimb 2
658 add,dc r192, s192, s192 C accum rlimb 3
659 add,dc %r0, climb, climb
C Restore %r10..%r13 from the slots they were saved to before the 4-way code.
664 ldd -0xb0(%r30), %r13
665 ldd -0xb8(%r30), %r12
666 ldd -0xc0(%r30), %r11
667 ldd -0xc8(%r30), %r10
C Return value handling. In the non-2.0w branch climb is split into 32-bit
C halves: %r29 receives the low half and %r28 the high half. The 2.0w branch
C is not visible in this view.
673 ifdef(`HAVE_ABI_2_0w',
675 ',` extrd,u climb, 63, 32, %r29
676 extrd,u climb, 31, 32, %r28
C Step %r30 back by 0x100 (pre-modify load) and restore %r3 from that address,
C undoing the std,ma at entry.
681 ldd,mb -0x100(%r30), %r3
682 EPILOGUE(mpn_addmul_1)