1 dnl HP-PA 2.0 64-bit mpn_mul_1 -- Multiply a limb vector with a limb and store
2 dnl the result in a second limb vector.
4 dnl Copyright 1998, 1999, 2000, 2002, 2003 Free Software Foundation, Inc.
6 dnl This file is part of the GNU MP Library.
8 dnl The GNU MP Library is free software; you can redistribute it and/or modify
9 dnl it under the terms of the GNU Lesser General Public License as published
10 dnl by the Free Software Foundation; either version 3 of the License, or (at
11 dnl your option) any later version.
13 dnl The GNU MP Library is distributed in the hope that it will be useful, but
14 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16 dnl License for more details.
18 dnl You should have received a copy of the GNU Lesser General Public License
19 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
21 include(`../config.m4')
25 C 8500,8600,8700: 5.625
27 C The feed-in and wind-down code has not yet been scheduled. Many cycles
28 C could be saved there per call.
31 C The main loop "BIG" is 4-way unrolled, mainly to allow
32 C effective use of ADD,DC. Delays in moving data via the cache from the FP
33 C registers to the IU registers have demanded a deep software pipeline and
34 C a lot of stack slots for partial products in flight.
38 C do 0, 1, 2, or 3 limbs
39 C if done, restore-some-regs and return
45 C HP-PA stack grows upwards. We could allocate 8 fewer slots by using the
46 C slots marked FREE, as well as some slots in the caller's "frame marker".
83 C -38/-138 vlimb home slot. For 2.0N, the vlimb arg will arrive here.
86 include(`../config.m4')
C Argument and carry names. vlimb and climb map to the same register, so the
C multiplier has to be copied elsewhere before climb is first written.
92 define(`vlimb',`%r23') C multiplier limb argument
94 define(`climb',`%r23') C running carry limb; reuses the vlimb register
96 ifdef(`HAVE_ABI_2_0w',
102 ifdef(`HAVE_ABI_2_0w',
103 ` std vlimb, -0x38(%r30) C store vlimb into "home" slot
105 std,ma %r3, 0x100(%r30) C save %r3 at the old %r30, then advance %r30 by 0x100
108 ldo 0(%r0), climb C clear climb
109 fldd -0x138(%r30), %fr8 C put vlimb in fp register (home slot is at -0x138 after the 0x100 bump)
C Register names used by the code that follows, up to the point where the
C same names are redefined for the 4-way unrolled code.
111 define(`p032a1',`%r1') C mid product fetched from slot -0x78
112 define(`p032a2',`%r19') C mid product fetched from slot -0x70
114 define(`m032',`%r20') C sum of the two mid products
115 define(`m096',`%r21') C carry out of that sum
117 define(`p000a',`%r22') C low product fetched from slot -0x80
118 define(`p064a',`%r29') C high product fetched from slot -0x68
120 define(`s000',`%r31') C carry-in plus low product plus shifted mid sum
122 define(`ma000',`%r4') C low 32 bits of m032 moved to the upper half
123 define(`ma064',`%r20') C upper 32 bits of m032 with m096 above them; same register as m032
125 C define(`r000',`%r3') C FIXME don't save r3 for n < 4.
C Split off the n mod 4 limbs that are handled before the 4-way unrolled code.
127 extrd,u n, 63, 2, %r5 C %r5 = low two bits of n
128 cmpb,= %r5, %r0, L(BIG) C no leftover limbs: go to BIG
C Form the four 32x32 partial products of %fr8 (vlimb) and %fr4 and spill them.
C NOTE(review): the load of %fr4 is not among the visible lines; presumably it
C holds the current source limb -- confirm.
133 xmpyu %fr8R, %fr4L, %fr22
134 xmpyu %fr8L, %fr4R, %fr23
135 fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
136 xmpyu %fr8R, %fr4R, %fr24
137 xmpyu %fr8L, %fr4L, %fr25
138 fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
139 fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
140 addib,<> -1, %r5, L(two_or_more) C decrement the leftover count; branch while it is nonzero
141 fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
C Fall-through: the leftover count reached zero, so only one limb was pending.
C Fetch its partial products back into integer registers.
143 ldd -0x78(%r30), p032a1
144 ldd -0x70(%r30), p032a2
145 ldd -0x80(%r30), p000a
147 ldd -0x68(%r30), p064a
C Multiply another limb while fetching the partial products spilled for the
C previous one. Each ldd reads a slot just before an fstd overwrites it.
C NOTE(review): presumably the two_or_more path; the label is not visible here.
152 xmpyu %fr8R, %fr4L, %fr22
153 xmpyu %fr8L, %fr4R, %fr23
154 ldd -0x78(%r30), p032a1
155 fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
156 xmpyu %fr8R, %fr4R, %fr24
157 xmpyu %fr8L, %fr4L, %fr25
158 ldd -0x70(%r30), p032a2
159 fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
160 ldd -0x80(%r30), p000a
161 fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
162 ldd -0x68(%r30), p064a
163 addib,<> -1, %r5, L(three_or_more) C decrement the leftover count; branch while it is nonzero
164 fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
C Fold the two fetched mid products into ma000 and ma064.
166 add p032a1, p032a2, m032 C m032 = sum of the mid products
167 add,dc %r0, %r0, m096 C m096 = carry out of that sum
168 depd,z m032, 31, 32, ma000 C ma000 = low half of m032 in the upper 32 bits, zero below
169 extrd,u m032, 31, 32, ma064 C ma064 = upper half of m032 in the lower 32 bits
171 depd m096, 31, 32, ma064 C insert m096 into the upper half of ma064
C Fold the two fetched mid products into ma000 and ma064.
C NOTE(review): presumably the three_or_more path; the label is not visible here.
175 add p032a1, p032a2, m032 C m032 = sum of the mid products
176 add,dc %r0, %r0, m096 C m096 = carry out of that sum
177 depd,z m032, 31, 32, ma000 C ma000 = low half of m032 in the upper 32 bits, zero below
178 extrd,u m032, 31, 32, ma064 C ma064 = upper half of m032 in the lower 32 bits
179 C addib,= -1, %r5, L(0_out)
180 depd m096, 31, 32, ma064 C insert m096 into the upper half of ma064
C Everything from here to the end of this block is commented out: a disabled
C loop (loop0) with the same multiply, fetch and accumulate pattern.
182 C xmpyu %fr8R, %fr4L, %fr22
183 C xmpyu %fr8L, %fr4R, %fr23
184 C ldd -0x78(%r30), p032a1
185 C fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
187 C xmpyu %fr8R, %fr4R, %fr24
188 C xmpyu %fr8L, %fr4L, %fr25
189 C ldd -0x70(%r30), p032a2
190 C fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
193 C add climb, p000a, s000
194 C ldd -0x80(%r30), p000a
195 C fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
197 C add,dc p064a, %r0, climb
199 C ldd -0x68(%r30), p064a
200 C fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
202 C add ma000, s000, s000
203 C add,dc ma064, climb, climb
208 C add p032a1, p032a2, m032
209 C add,dc %r0, %r0, m096
211 C depd,z m032, 31, 32, ma000
212 C extrd,u m032, 31, 32, ma064
213 C addib,<> -1, %r5, L(loop0)
214 C depd m096, 31, 32, ma064
C Pipeline step: multiply a new limb, fetch the products spilled by the
C previous step, and accumulate the limb whose products are already in
C integer registers.
C NOTE(review): no store of s000 is among the visible lines; presumably it is
C written to the destination on a line not shown -- confirm.
217 xmpyu %fr8R, %fr4L, %fr22
218 xmpyu %fr8L, %fr4R, %fr23
219 ldd -0x78(%r30), p032a1
220 fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
221 xmpyu %fr8R, %fr4R, %fr24
222 xmpyu %fr8L, %fr4L, %fr25
223 ldd -0x70(%r30), p032a2
224 fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
226 add climb, p000a, s000 C s000 = carry-in + low product (uses p000a before it is reloaded)
227 ldd -0x80(%r30), p000a
228 fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
229 add,dc p064a, %r0, climb C climb = high product + carry (uses p064a before it is reloaded)
230 ldd -0x68(%r30), p064a
231 fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
232 add ma000, s000, s000 C add the low part of the shifted mid sum
233 add,dc ma064, climb, climb C add the high part of the shifted mid sum, with carry
C Fold the mid products fetched above.
235 add p032a1, p032a2, m032 C m032 = sum of the mid products
236 add,dc %r0, %r0, m096 C m096 = carry out of that sum
237 depd,z m032, 31, 32, ma000 C ma000 = low half of m032 in the upper 32 bits, zero below
238 extrd,u m032, 31, 32, ma064 C ma064 = upper half of m032 in the lower 32 bits
239 depd m096, 31, 32, ma064 C insert m096 into the upper half of ma064
C Wind-down: no further multiplies. Fetch the last spilled products and
C accumulate the limb already held in registers.
241 ldd -0x78(%r30), p032a1
242 ldd -0x70(%r30), p032a2
244 add climb, p000a, s000 C s000 = carry-in + low product
245 ldd -0x80(%r30), p000a
246 add,dc p064a, %r0, climb C climb = high product + carry
247 ldd -0x68(%r30), p064a
248 add ma000, s000, s000 C add the low part of the shifted mid sum
249 add,dc ma064, climb, climb C add the high part of the shifted mid sum, with carry
C Fold and accumulate the final fetched limb.
252 add p032a1, p032a2, m032 C m032 = sum of the mid products
253 add,dc %r0, %r0, m096 C m096 = carry out of that sum
254 depd,z m032, 31, 32, ma000 C ma000 = low half of m032 in the upper 32 bits, zero below
255 extrd,u m032, 31, 32, ma064 C ma064 = upper half of m032 in the lower 32 bits
256 depd m096, 31, 32, ma064 C insert m096 into the upper half of ma064
258 add climb, p000a, s000 C s000 = carry-in + low product
259 add,dc p064a, %r0, climb C climb = high product + carry
260 add ma000, s000, s000 C add the low part of the shifted mid sum
261 add,dc ma064, climb, climb C add the high part of the shifted mid sum, with carry
264 cmpib,>= 4, n, L(done) C branch to done when 4 >= n
267 C 4-way unrolled code.
C The register names are redefined here. Several names share one register:
C each product register is reloaded with a low or high product only after its
C mid product has been consumed, and each ma register overwrites the m sum it
C is derived from.
271 define(`p032a1',`%r1') C mid product of %fr4, from slot -0x78
272 define(`p032a2',`%r19') C mid product of %fr4, from slot -0x70
273 define(`p096b1',`%r20') C mid product of %fr5, from slot -0x38
274 define(`p096b2',`%r21') C mid product of %fr5, from slot -0x30
275 define(`p160c1',`%r22') C mid product of %fr6, from slot -0x58
276 define(`p160c2',`%r29') C mid product of %fr6, from slot -0x50
277 define(`p224d1',`%r31') C mid product of %fr7, from slot -0x18
278 define(`p224d2',`%r3') C mid product of %fr7, from slot -0x10
280 define(`m032',`%r4') C p032a1 + p032a2
281 define(`m096',`%r5') C p096b1 + p096b2 + carry
282 define(`m160',`%r6') C p160c1 + p160c2 + carry
283 define(`m224',`%r7') C p224d1 + p224d2 + carry
284 define(`m288',`%r8') C carry out of the mid sum chain
286 define(`p000a',`%r1') C low product of %fr4, from slot -0x80
287 define(`p064a',`%r19') C high product of %fr4, from slot -0x68
288 define(`p064b',`%r20') C low product of %fr5, from slot -0x40
289 define(`p128b',`%r21') C high product of %fr5, from slot -0x28
290 define(`p128c',`%r22') C low product of %fr6, from slot -0x60
291 define(`p192c',`%r29') C high product of %fr6, from slot -0x48
292 define(`p192d',`%r31') C low product of %fr7, from slot -0x20
293 define(`p256d',`%r3') C high product of %fr7, from slot -0x88
295 define(`s000',`%r10') C accumulated sum, word 0
296 define(`s064',`%r11') C accumulated sum, word 1
297 define(`s128',`%r12') C accumulated sum, word 2
298 define(`s192',`%r13') C accumulated sum, word 3
300 define(`ma000',`%r9') C low half of m032 moved to the upper 32 bits
301 define(`ma064',`%r4') C upper half of m032 below, low half of m096 above
302 define(`ma128',`%r5') C upper half of m096 below, low half of m160 above
303 define(`ma192',`%r6') C upper half of m160 below, low half of m224 above
304 define(`ma256',`%r7') C upper half of m224 below, low half of m288 above
C Save %r10-%r13, which the 4-way code uses for s000-s192. They are reloaded
C from the same slots near the end of the routine. Then turn n into a count of
C four-limb groups.
310 std %r10, -0xc8(%r30)
311 std %r11, -0xc0(%r30)
312 std %r12, -0xb8(%r30)
313 std %r13, -0xb0(%r30)
315 ifdef(`HAVE_ABI_2_0w',
316 ` extrd,u n, 61, 62, n C right shift 2
317 ',` extrd,u n, 61, 30, n C right shift 2, zero extend
C Feed-in for the 4-way code: form all sixteen 32x32 partial products of
C %fr8 (vlimb) with %fr4-%fr7 and spill them to the stack. %fr22-%fr27 first
C hold mid products and are reused for low and high products once those mid
C products have been stored.
C NOTE(review): the loads of %fr4-%fr7 are not among the visible lines;
C presumably they hold four consecutive source limbs -- confirm.
325 xmpyu %fr8R, %fr4L, %fr22
326 xmpyu %fr8L, %fr4R, %fr23
327 xmpyu %fr8R, %fr5L, %fr24
328 xmpyu %fr8L, %fr5R, %fr25
329 xmpyu %fr8R, %fr6L, %fr26
330 xmpyu %fr8L, %fr6R, %fr27
331 fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
332 xmpyu %fr8R, %fr7L, %fr28
333 xmpyu %fr8L, %fr7R, %fr29
334 fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
335 xmpyu %fr8R, %fr4R, %fr30
336 xmpyu %fr8L, %fr4L, %fr31
337 fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31
338 xmpyu %fr8R, %fr5R, %fr22
339 xmpyu %fr8L, %fr5L, %fr23
340 fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29
341 xmpyu %fr8R, %fr6R, %fr24
342 xmpyu %fr8L, %fr6L, %fr25
343 fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51
344 xmpyu %fr8R, %fr7R, %fr26
345 fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49
346 addib,<> -1, n, L(8_or_more) C decrement the group count; branch while it is nonzero
347 xmpyu %fr8L, %fr7L, %fr27
C Fall-through: the group count reached zero. Spill the remaining products
C and fetch the mid products into integer registers.
348 fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
349 fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
350 fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
351 fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
352 fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
353 fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
354 fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
355 fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
356 fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
357 fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
358 ldd -0x78(%r30), p032a1
359 ldd -0x70(%r30), p032a2
360 ldd -0x38(%r30), p096b1
361 ldd -0x30(%r30), p096b2
362 ldd -0x58(%r30), p160c1
363 ldd -0x50(%r30), p160c2
364 ldd -0x18(%r30), p224d1
365 ldd -0x10(%r30), p224d2
C The same ten spills again.
C NOTE(review): presumably the start of the 8_or_more path; neither the label
C nor the control transfer that ends the preceding path is visible here.
370 fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
371 fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
373 fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
374 fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
375 fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
376 fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
377 fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
378 fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
379 fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
380 fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
C Multiply another group of four limbs while fetching the mid products spilled
C for the previous group. Each mid-product slot is read by an ldd before the
C fstd that overwrites it.
385 xmpyu %fr8R, %fr4L, %fr22
386 ldd -0x78(%r30), p032a1
387 xmpyu %fr8L, %fr4R, %fr23
388 xmpyu %fr8R, %fr5L, %fr24
389 ldd -0x70(%r30), p032a2
390 xmpyu %fr8L, %fr5R, %fr25
391 xmpyu %fr8R, %fr6L, %fr26
392 ldd -0x38(%r30), p096b1
393 xmpyu %fr8L, %fr6R, %fr27
394 fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
395 xmpyu %fr8R, %fr7L, %fr28
396 ldd -0x30(%r30), p096b2
397 xmpyu %fr8L, %fr7R, %fr29
398 fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
399 xmpyu %fr8R, %fr4R, %fr30
400 ldd -0x58(%r30), p160c1
401 xmpyu %fr8L, %fr4L, %fr31
402 fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31
403 xmpyu %fr8R, %fr5R, %fr22
404 ldd -0x50(%r30), p160c2
405 xmpyu %fr8L, %fr5L, %fr23
406 fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29
407 xmpyu %fr8R, %fr6R, %fr24
408 ldd -0x18(%r30), p224d1
409 xmpyu %fr8L, %fr6L, %fr25
410 fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51
411 xmpyu %fr8R, %fr7R, %fr26
412 ldd -0x10(%r30), p224d2
413 fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49
414 addib,= -1, n, L(end2) C decrement the group count; branch to end2 when it reaches zero
415 xmpyu %fr8L, %fr7L, %fr27
C Fold the fetched mid products through one carry chain (m032..m288), fetch
C the low and high products of the previous group, and spill the products just
C computed. The interleaved ldd and fstd instructions are loads and stores, not
C arithmetic, and sit between the links of the add,dc chains.
417 add p032a1, p032a2, m032 C start of the mid sum carry chain
418 ldd -0x80(%r30), p000a
419 add,dc p096b1, p096b2, m096
420 fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
422 add,dc p160c1, p160c2, m160
423 ldd -0x68(%r30), p064a
424 add,dc p224d1, p224d2, m224
425 fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
427 add,dc %r0, %r0, m288 C m288 = carry out of the mid sum chain
428 ldd -0x40(%r30), p064b
430 fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
432 depd,z m032, 31, 32, ma000 C ma000 = low half of m032 in the upper 32 bits, zero below
433 ldd -0x28(%r30), p128b
434 extrd,u m032, 31, 32, ma064 C ma064 = upper half of m032 in the lower 32 bits
435 fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
437 depd m096, 31, 32, ma064 C insert the low half of m096 into the upper half of ma064
438 ldd -0x60(%r30), p128c
439 extrd,u m096, 31, 32, ma128
440 fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
442 depd m160, 31, 32, ma128
443 ldd -0x48(%r30), p192c
444 extrd,u m160, 31, 32, ma192
445 fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
447 depd m224, 31, 32, ma192
448 ldd -0x20(%r30), p192d
449 extrd,u m224, 31, 32, ma256
450 fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
452 depd m288, 31, 32, ma256
453 ldd -0x88(%r30), p256d
454 add climb, p000a, s000 C start of the low/high product carry chain, with carry-in limb
455 fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
457 add,dc p064a, p064b, s064
458 add,dc p128b, p128c, s128
459 fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
461 add,dc p192c, p192d, s192
462 add,dc p256d, %r0, climb C climb = top high product + carry
463 fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
465 add ma000, s000, s000 C accum mid 0
467 add,dc ma064, s064, s064 C accum mid 1
470 add,dc ma128, s128, s128 C accum mid 2
472 add,dc ma192, s192, s192 C accum mid 3
475 add,dc ma256, climb, climb C fold the top of the mid sums into the carry limb
C Steady-state step: multiply the next group of four limbs while fetching the
C mid products spilled for the previous group.
C NOTE(review): presumably the body of L(loop); the label, the loads of
C %fr4-%fr7 and the stores of s000-s192 are not among the visible lines.
479 xmpyu %fr8R, %fr4L, %fr22
480 ldd -0x78(%r30), p032a1
481 xmpyu %fr8L, %fr4R, %fr23
484 xmpyu %fr8R, %fr5L, %fr24
485 ldd -0x70(%r30), p032a2
486 xmpyu %fr8L, %fr5R, %fr25
489 xmpyu %fr8R, %fr6L, %fr26
490 ldd -0x38(%r30), p096b1
491 xmpyu %fr8L, %fr6R, %fr27
492 fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
494 xmpyu %fr8R, %fr7L, %fr28
495 ldd -0x30(%r30), p096b2
496 xmpyu %fr8L, %fr7R, %fr29
497 fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
499 xmpyu %fr8R, %fr4R, %fr30
500 ldd -0x58(%r30), p160c1
501 xmpyu %fr8L, %fr4L, %fr31
502 fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31
504 xmpyu %fr8R, %fr5R, %fr22
505 ldd -0x50(%r30), p160c2
506 xmpyu %fr8L, %fr5L, %fr23
507 fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29
509 xmpyu %fr8R, %fr6R, %fr24
510 ldd -0x18(%r30), p224d1
511 xmpyu %fr8L, %fr6L, %fr25
512 fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51
514 xmpyu %fr8R, %fr7R, %fr26
515 ldd -0x10(%r30), p224d2
516 fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49
517 xmpyu %fr8L, %fr7L, %fr27
519 addib,<> -1, n, L(loop) C decrement the group count; branch back while it is nonzero
C Fold the fetched mid products, fetch the low and high products of the
C previous group, spill the products just computed, and accumulate.
523 add p032a1, p032a2, m032 C start of the mid sum carry chain
524 ldd -0x80(%r30), p000a
525 add,dc p096b1, p096b2, m096
526 fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
527 add,dc p160c1, p160c2, m160
528 ldd -0x68(%r30), p064a
529 add,dc p224d1, p224d2, m224
530 fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
531 add,dc %r0, %r0, m288 C m288 = carry out of the mid sum chain
532 ldd -0x40(%r30), p064b
533 fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
534 depd,z m032, 31, 32, ma000 C ma000 = low half of m032 in the upper 32 bits, zero below
535 ldd -0x28(%r30), p128b
536 extrd,u m032, 31, 32, ma064 C ma064 = upper half of m032 in the lower 32 bits
537 fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
538 depd m096, 31, 32, ma064 C insert the low half of m096 into the upper half of ma064
539 ldd -0x60(%r30), p128c
540 extrd,u m096, 31, 32, ma128
541 fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
542 depd m160, 31, 32, ma128
543 ldd -0x48(%r30), p192c
544 extrd,u m160, 31, 32, ma192
545 fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
546 depd m224, 31, 32, ma192
547 ldd -0x20(%r30), p192d
548 extrd,u m224, 31, 32, ma256
549 fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
550 depd m288, 31, 32, ma256
551 ldd -0x88(%r30), p256d
552 add climb, p000a, s000 C start of the low/high product carry chain, with carry-in limb
553 fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
554 add,dc p064a, p064b, s064
555 add,dc p128b, p128c, s128
556 fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
557 add,dc p192c, p192d, s192
558 add,dc p256d, %r0, climb C climb = top high product + carry
559 fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
560 add ma000, s000, s000 C accum mid 0
561 add,dc ma064, s064, s064 C accum mid 1
562 add,dc ma128, s128, s128 C accum mid 2
563 add,dc ma192, s192, s192 C accum mid 3
564 add,dc ma256, climb, climb C fold the top of the mid sums into the carry limb
C Wind-down: no further multiplies or spills. Fetch the last spilled mid
C products, then fold and accumulate them with the last low and high products.
C NOTE(review): the stores of s000-s192 are not among the visible lines.
567 ldd -0x78(%r30), p032a1
569 ldd -0x70(%r30), p032a2
571 ldd -0x38(%r30), p096b1
572 ldd -0x30(%r30), p096b2
573 ldd -0x58(%r30), p160c1
574 ldd -0x50(%r30), p160c2
575 ldd -0x18(%r30), p224d1
576 ldd -0x10(%r30), p224d2
580 add p032a1, p032a2, m032 C start of the mid sum carry chain
581 ldd -0x80(%r30), p000a
582 add,dc p096b1, p096b2, m096
583 add,dc p160c1, p160c2, m160
584 ldd -0x68(%r30), p064a
585 add,dc p224d1, p224d2, m224
586 add,dc %r0, %r0, m288 C m288 = carry out of the mid sum chain
587 ldd -0x40(%r30), p064b
588 depd,z m032, 31, 32, ma000 C ma000 = low half of m032 in the upper 32 bits, zero below
589 ldd -0x28(%r30), p128b
590 extrd,u m032, 31, 32, ma064 C ma064 = upper half of m032 in the lower 32 bits
591 depd m096, 31, 32, ma064 C insert the low half of m096 into the upper half of ma064
592 ldd -0x60(%r30), p128c
593 extrd,u m096, 31, 32, ma128
594 depd m160, 31, 32, ma128
595 ldd -0x48(%r30), p192c
596 extrd,u m160, 31, 32, ma192
597 depd m224, 31, 32, ma192
598 ldd -0x20(%r30), p192d
599 extrd,u m224, 31, 32, ma256
600 depd m288, 31, 32, ma256
601 ldd -0x88(%r30), p256d
602 add climb, p000a, s000 C start of the low/high product carry chain, with carry-in limb
603 add,dc p064a, p064b, s064
604 add,dc p128b, p128c, s128
605 add,dc p192c, p192d, s192
606 add,dc p256d, %r0, climb C climb = top high product + carry
607 add ma000, s000, s000 C accum mid 0
608 add,dc ma064, s064, s064 C accum mid 1
609 add,dc ma128, s128, s128 C accum mid 2
610 add,dc ma192, s192, s192 C accum mid 3
611 add,dc ma256, climb, climb C fold the top of the mid sums into the carry limb
C Reload the registers saved for the 4-way code, split the carry limb into two
C 32-bit halves when HAVE_ABI_2_0w is not defined, and undo the 0x100 stack
C adjustment made on entry.
617 ldd -0xb0(%r30), %r13
618 ldd -0xb8(%r30), %r12
619 ldd -0xc0(%r30), %r11
620 ldd -0xc8(%r30), %r10
626 ifdef(`HAVE_ABI_2_0w',
628 ',` extrd,u climb, 63, 32, %r29 C %r29 = low 32 bits of climb
629 extrd,u climb, 31, 32, %r28 C %r28 = high 32 bits of climb
634 ldd,mb -0x100(%r30), %r3 C step %r30 back by 0x100, then reload %r3 from there