1 dnl HP-PA 2.0 64-bit mpn_submul_1 -- Multiply a limb vector with a limb and
2 dnl subtract the result from a second limb vector.
4 dnl Copyright 1998, 1999, 2000, 2002, 2003 Free Software Foundation, Inc.
6 dnl This file is part of the GNU MP Library.
8 dnl The GNU MP Library is free software; you can redistribute it and/or modify
9 dnl it under the terms of the GNU Lesser General Public License as published
10 dnl by the Free Software Foundation; either version 3 of the License, or (at
11 dnl your option) any later version.
13 dnl The GNU MP Library is distributed in the hope that it will be useful, but
14 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16 dnl License for more details.
18 dnl You should have received a copy of the GNU Lesser General Public License
19 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
21 include(`../config.m4')
27 C The feed-in and wind-down code has not yet been scheduled. Many cycles
28 C could be saved there per call.
31 C The main loop "BIG" is 4-way unrolled, mainly to allow effective use of
32 C ADD,DC. Delays in moving data via the cache from the FP registers to the
33 C IU registers have demanded a deep software pipeline and a lot of stack
34 C slots for partial products in flight.
38 C do 0, 1, 2, or 3 limbs
39 C if done, restore-some-regs and return
45 C HP-PA stack grows upwards. We could allocate 8 fewer slots by using the
46 C slots marked FREE, as well as some slots in the caller's "frame marker".
83 C -38/-138 vlimb home slot. For 2.0N, the vlimb arg will arrive here.
86 include(`../config.m4')
C Entry of mpn_submul_1. vlimb and climb both name %r23: the multiplier limb
C is fetched into a floating-point register below, after which %r23 is
C cleared and reused as the running carry limb climb.
92 define(`vlimb',`%r23') C
94 define(`climb',`%r23') C
96 ifdef(`HAVE_ABI_2_0w',
100 PROLOGUE(mpn_submul_1)
102 ifdef(`HAVE_ABI_2_0w',
103 ` std vlimb, -0x38(%r30) C store vlimb into "home" slot
C Store %r3 and advance the stack pointer %r30 by 0x100. The home slot that
C was at -0x38 is therefore addressed as -0x138 from here on.
C NOTE(review): %r4 and %r5 are written further down but no save of them is
C visible in this view - confirm against the full file.
105 std,ma %r3, 0x100(%r30)
108 ldo 0(%r0), climb C clear climb
109 fldd -0x138(%r30), %fr8 C put vlimb in fp register
C Register aliases for the code that handles the leftover limbs ahead of the
C 4-way unrolled part. Note that m032 and ma064 both name %r20.
111 define(`p032a1',`%r1') C
112 define(`p032a2',`%r19') C
114 define(`m032',`%r20') C
115 define(`m096',`%r21') C
117 define(`p000a',`%r22') C
118 define(`p064a',`%r29') C
120 define(`s000',`%r31') C
122 define(`ma000',`%r4') C
123 define(`ma064',`%r20') C
125 define(`r000',`%r3') C
C Extract the two least significant bits of n into %r5 as the leftover limb
C count and branch to L(BIG) when that count is zero.
C NOTE(review): the definition of n is not visible in this view.
127 extrd,u n, 63, 2, %r5
128 cmpb,= %r5, %r0, L(BIG)
C First leftover limb. Multiply the two 32-bit halves of vlimb in %fr8 by the
C two halves of the limb in %fr4. The two cross products are the mid products,
C R times R is the low product and L times L is the high product. All four
C are spilled to stack slots so they can be reloaded into integer registers.
C NOTE(review): the load of %fr4 is not visible in this view - confirm
C against the full file.
133 xmpyu %fr8R, %fr4L, %fr22
134 xmpyu %fr8L, %fr4R, %fr23
135 fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
136 xmpyu %fr8R, %fr4R, %fr24
137 xmpyu %fr8L, %fr4L, %fr25
138 fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
139 fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
C Count down the leftover limbs and continue at L(two_or_more) if any remain.
C The fstd that follows the branch sits in its delay slot.
140 addib,<> -1, %r5, L(two_or_more)
141 fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
C Fall-through, exactly one leftover limb: reload its four partial products
C into integer registers.
143 ldd -0x78(%r30), p032a1
144 ldd -0x70(%r30), p032a2
145 ldd -0x80(%r30), p000a
147 ldd -0x68(%r30), p064a
C Second leftover limb. Start its four multiplies while reloading the partial
C products of the previous limb. Each ldd of a stack slot comes before the
C fstd that overwrites the same slot with the new product.
C NOTE(review): presumably the L(two_or_more) entry; the label and the load
C of %fr4 are not visible in this view.
152 xmpyu %fr8R, %fr4L, %fr22
153 xmpyu %fr8L, %fr4R, %fr23
154 ldd -0x78(%r30), p032a1
155 fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
156 xmpyu %fr8R, %fr4R, %fr24
157 xmpyu %fr8L, %fr4L, %fr25
158 ldd -0x70(%r30), p032a2
159 fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
160 ldd -0x80(%r30), p000a
161 fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
162 ldd -0x68(%r30), p064a
C Continue at L(three_or_more) if a third leftover limb remains. The fstd is
C in the branch delay slot.
163 addib,<> -1, %r5, L(three_or_more)
164 fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
C Combine the two mid products of the previous limb. m032 is their sum and
C m096 catches the carry out. ma000 becomes the low 32 bits of m032 placed in
C the upper half above zeros. ma064 becomes the upper 32 bits of m032 in its
C lower half, then the low 32 bits of m096 are deposited into its upper half.
166 add p032a1, p032a2, m032
167 add,dc %r0, %r0, m096
168 depd,z m032, 31, 32, ma000
169 extrd,u m032, 31, 32, ma064
172 depd m096, 31, 32, ma064
C Mid product combination for the previous limb: m032 is the sum of the two
C mid products, m096 the carry out, and ma000 and ma064 the sum realigned to
C limb boundaries.
C NOTE(review): presumably the L(three_or_more) path; the label is not
C visible in this view.
176 add p032a1, p032a2, m032
177 add,dc %r0, %r0, m096
178 depd,z m032, 31, 32, ma000
179 extrd,u m032, 31, 32, ma064
C From here to the end of this section every instruction prefixed with C is
C disabled code for a loop that branches back to L(loop0). The only live
C instruction among them is the depd into ma064 directly below.
181 C addib,= -1, %r5, L(0_out)
182 depd m096, 31, 32, ma064
184 C xmpyu %fr8R, %fr4L, %fr22
185 C xmpyu %fr8L, %fr4R, %fr23
186 C ldd -0x78(%r30), p032a1
187 C fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
189 C xmpyu %fr8R, %fr4R, %fr24
190 C xmpyu %fr8L, %fr4L, %fr25
191 C ldd -0x70(%r30), p032a2
192 C fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
195 C add climb, p000a, s000
196 C ldd -0x80(%r30), p000a
197 C fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
199 C add,dc p064a, %r0, climb
201 C ldd -0x68(%r30), p064a
202 C fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
204 C add ma000, s000, s000
205 C add,dc ma064, climb, climb
208 C sub r000, s000, s000
209 C sub,db %r0, climb, climb
210 C sub %r0, climb, climb
213 C add p032a1, p032a2, m032
214 C add,dc %r0, %r0, m096
216 C depd,z m032, 31, 32, ma000
217 C extrd,u m032, 31, 32, ma064
219 C addib,<> -1, %r5, L(loop0)
220 C depd m096, 31, 32, ma064
C Wind-down for the leftover limbs. Multiply the last leftover limb, then
C fold each pending set of partial products into s000 and the carry limb
C climb, one limb at a time.
C NOTE(review): the labels that separate the three accumulate sequences, the
C load of %fr4, the loads of the destination limb and the stores of s000 are
C not visible in this view.
223 xmpyu %fr8R, %fr4L, %fr22
224 xmpyu %fr8L, %fr4R, %fr23
225 ldd -0x78(%r30), p032a1
226 fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
227 xmpyu %fr8R, %fr4R, %fr24
228 xmpyu %fr8L, %fr4L, %fr25
229 ldd -0x70(%r30), p032a2
230 fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
C s000 = climb + low product, climb = high product + carry, then add the
C realigned mid product halves ma000 and ma064 with carry. The interleaved
C ldd and fstd move the next set of products through the same stack slots.
232 add climb, p000a, s000
233 ldd -0x80(%r30), p000a
234 fstd %fr24, -0x80(%r30) C low product to -0x80..-0x79
235 add,dc p064a, %r0, climb
236 ldd -0x68(%r30), p064a
237 fstd %fr25, -0x68(%r30) C high product to -0x68..-0x61
238 add ma000, s000, s000
239 add,dc ma064, climb, climb
C Negate climb twice, the first time with borrow, so that a pending borrow
C ends up added to climb.
C NOTE(review): the sub that produces that borrow is not visible here.
241 sub,db %r0, climb, climb
242 sub %r0, climb, climb
C Combine the mid products of the next limb into ma000 and ma064.
244 add p032a1, p032a2, m032
245 add,dc %r0, %r0, m096
246 depd,z m032, 31, 32, ma000
247 extrd,u m032, 31, 32, ma064
249 depd m096, 31, 32, ma064
C No multiplies remain: reload the mid products that were stored last.
251 ldd -0x78(%r30), p032a1
252 ldd -0x70(%r30), p032a2
C Accumulate one limb as above while reloading the last low and high products.
254 add climb, p000a, s000
255 ldd -0x80(%r30), p000a
256 add,dc p064a, %r0, climb
257 ldd -0x68(%r30), p064a
258 add ma000, s000, s000
259 add,dc ma064, climb, climb
261 sub,db %r0, climb, climb
262 sub %r0, climb, climb
C Combine the mid products of the final leftover limb.
265 add p032a1, p032a2, m032
266 add,dc %r0, %r0, m096
267 depd,z m032, 31, 32, ma000
268 extrd,u m032, 31, 32, ma064
270 depd m096, 31, 32, ma064
C Accumulate the final leftover limb.
272 add climb, p000a, s000
273 add,dc p064a, %r0, climb
274 add ma000, s000, s000
275 add,dc ma064, climb, climb
277 sub,db %r0, climb, climb
278 sub %r0, climb, climb
C Branch to L(done) when the comparison 4 >= n holds, skipping the 4-way code.
281 cmpib,>= 4, n, L(done)
284 C 4-way unrolled code.
C The aliases are redefined for the 4-way code. Several names share one
C register: for example p032a1, p000a and r000 are all %r1, and m032 and
C ma064 are both %r4.
C
C Mid products reloaded from the stack, two per limb.
288 define(`p032a1',`%r1') C
289 define(`p032a2',`%r19') C
290 define(`p096b1',`%r20') C
291 define(`p096b2',`%r21') C
292 define(`p160c1',`%r22') C
293 define(`p160c2',`%r29') C
294 define(`p224d1',`%r31') C
295 define(`p224d2',`%r3') C
C Sums of the mid product pairs. m288 receives the final carry of that chain.
297 define(`m032',`%r4') C
298 define(`m096',`%r5') C
299 define(`m160',`%r6') C
300 define(`m224',`%r7') C
301 define(`m288',`%r8') C
C Low and high products reloaded from the stack.
303 define(`p000a',`%r1') C
304 define(`p064a',`%r19') C
305 define(`p064b',`%r20') C
306 define(`p128b',`%r21') C
307 define(`p128c',`%r22') C
308 define(`p192c',`%r29') C
309 define(`p192d',`%r31') C
310 define(`p256d',`%r3') C
C Per-limb result sums.
312 define(`s000',`%r10') C
313 define(`s064',`%r11') C
314 define(`s128',`%r12') C
315 define(`s192',`%r13') C
C Mid product sums realigned to limb boundaries.
317 define(`ma000',`%r9') C
318 define(`ma064',`%r4') C
319 define(`ma128',`%r5') C
320 define(`ma192',`%r6') C
321 define(`ma256',`%r7') C
C Minuends of the final subtract chain, marked accum rlimb in the code below.
323 define(`r000',`%r1') C
324 define(`r064',`%r19') C
325 define(`r128',`%r20') C
326 define(`r192',`%r21') C
C Save %r10-%r13, which the 4-way code uses as s000-s192. They are reloaded
C from the same stack slots at the end of the function.
C NOTE(review): %r6-%r9 are also written by the 4-way code but no save of
C them is visible in this view - confirm against the full file.
332 std %r10, -0xc8(%r30)
333 std %r11, -0xc0(%r30)
334 std %r12, -0xb8(%r30)
335 std %r13, -0xb0(%r30)
C Turn n into the number of 4-limb groups. The two ABI variants differ only
C in how many bits are extracted.
337 ifdef(`HAVE_ABI_2_0w',
338 ` extrd,u n, 61, 62, n C right shift 2
339 ',` extrd,u n, 61, 30, n C right shift 2, zero extend
C Feed-in for the 4-way code. Form all sixteen partial products of vlimb and
C the four limbs in %fr4-%fr7. The cross products go to %fr22-%fr29 and are
C stored as mid products. %fr22-%fr27 are reused for low and high products,
C each one only after its mid product has been stored.
C NOTE(review): the loads of %fr4-%fr7 are not visible in this view.
347 xmpyu %fr8R, %fr4L, %fr22
348 xmpyu %fr8L, %fr4R, %fr23
349 xmpyu %fr8R, %fr5L, %fr24
350 xmpyu %fr8L, %fr5R, %fr25
351 xmpyu %fr8R, %fr6L, %fr26
352 xmpyu %fr8L, %fr6R, %fr27
353 fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
354 xmpyu %fr8R, %fr7L, %fr28
355 xmpyu %fr8L, %fr7R, %fr29
356 fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
357 xmpyu %fr8R, %fr4R, %fr30
358 xmpyu %fr8L, %fr4L, %fr31
359 fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31
360 xmpyu %fr8R, %fr5R, %fr22
361 xmpyu %fr8L, %fr5L, %fr23
362 fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29
363 xmpyu %fr8R, %fr6R, %fr24
364 xmpyu %fr8L, %fr6L, %fr25
365 fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51
366 xmpyu %fr8R, %fr7R, %fr26
367 fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49
C Count down the groups and continue at L(8_or_more) if another group
C follows. The last xmpyu is in the branch delay slot.
368 addib,<> -1, n, L(8_or_more)
369 xmpyu %fr8L, %fr7L, %fr27
C Fall-through, a single group: store the remaining products, then reload the
C eight mid products into integer registers.
370 fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
371 fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
372 fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
373 fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
374 fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
375 fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
376 fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
377 fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
378 fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
379 fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
380 ldd -0x78(%r30), p032a1
381 ldd -0x70(%r30), p032a2
382 ldd -0x38(%r30), p096b1
383 ldd -0x30(%r30), p096b2
384 ldd -0x58(%r30), p160c1
385 ldd -0x50(%r30), p160c2
386 ldd -0x18(%r30), p224d1
387 ldd -0x10(%r30), p224d2
C Store the remaining products of the current group, same slots as above.
C NOTE(review): presumably the L(8_or_more) entry; its label and the loads
C of the next four limbs are not visible in this view.
392 fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
393 fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
395 fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
396 fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
397 fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
398 fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
399 fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
400 fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
401 fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
402 fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
C Multiply the next group of four limbs while reloading the mid products of
C the previous group into integer registers. Each ldd of a stack slot comes
C before the fstd that overwrites that slot with a new product.
407 xmpyu %fr8R, %fr4L, %fr22
408 ldd -0x78(%r30), p032a1
409 xmpyu %fr8L, %fr4R, %fr23
410 xmpyu %fr8R, %fr5L, %fr24
411 ldd -0x70(%r30), p032a2
412 xmpyu %fr8L, %fr5R, %fr25
413 xmpyu %fr8R, %fr6L, %fr26
414 ldd -0x38(%r30), p096b1
415 xmpyu %fr8L, %fr6R, %fr27
416 fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
417 xmpyu %fr8R, %fr7L, %fr28
418 ldd -0x30(%r30), p096b2
419 xmpyu %fr8L, %fr7R, %fr29
420 fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
421 xmpyu %fr8R, %fr4R, %fr30
422 ldd -0x58(%r30), p160c1
423 xmpyu %fr8L, %fr4L, %fr31
424 fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31
425 xmpyu %fr8R, %fr5R, %fr22
426 ldd -0x50(%r30), p160c2
427 xmpyu %fr8L, %fr5L, %fr23
428 fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29
429 xmpyu %fr8R, %fr6R, %fr24
430 ldd -0x18(%r30), p224d1
431 xmpyu %fr8L, %fr6L, %fr25
432 fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51
433 xmpyu %fr8R, %fr7R, %fr26
434 ldd -0x10(%r30), p224d2
435 fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49
C Branch to L(end2) when the group just multiplied is the last one. The
C final xmpyu is in the branch delay slot.
436 addib,= -1, n, L(end2)
437 xmpyu %fr8L, %fr7L, %fr27
C Main loop body. The first part combines and accumulates the products of the
C previous group while storing the products of the group just multiplied. The
C second part multiplies the next group. Integer adds, ldd and fstd are
C interleaved throughout.
C NOTE(review): presumably the L(loop) entry; the label, the loads and stores
C of the destination limbs and the loads of %fr4-%fr7 are not visible in
C this view.
C
C Sum the mid product pairs as one carry chain m032..m224, with the final
C carry captured in m288.
439 add p032a1, p032a2, m032
440 ldd -0x80(%r30), p000a
441 add,dc p096b1, p096b2, m096
442 fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
444 add,dc p160c1, p160c2, m160
445 ldd -0x68(%r30), p064a
446 add,dc p224d1, p224d2, m224
447 fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
449 add,dc %r0, %r0, m288
450 ldd -0x40(%r30), p064b
452 fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
C Realign the mid sums to limb boundaries. ma000 is the low 32 bits of m032
C above zeros. Every other maNNN takes the upper 32 bits of one sum in its
C lower half and the low 32 bits of the next sum in its upper half. ma064
C through ma256 share registers with m032 through m224, so this is done in
C place.
454 depd,z m032, 31, 32, ma000
455 ldd -0x28(%r30), p128b
456 extrd,u m032, 31, 32, ma064
457 fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
459 depd m096, 31, 32, ma064
460 ldd -0x60(%r30), p128c
461 extrd,u m096, 31, 32, ma128
462 fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
464 depd m160, 31, 32, ma128
465 ldd -0x48(%r30), p192c
466 extrd,u m160, 31, 32, ma192
467 fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
469 depd m224, 31, 32, ma192
470 ldd -0x20(%r30), p192d
471 extrd,u m224, 31, 32, ma256
472 fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
474 depd m288, 31, 32, ma256
475 ldd -0x88(%r30), p256d
C Add the incoming climb and the low and high products as one carry chain.
C The top of the chain becomes the new climb.
476 add climb, p000a, s000
477 fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
479 add,dc p064a, p064b, s064
481 add,dc p128b, p128c, s128
482 fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
484 add,dc p192c, p192d, s192
486 add,dc p256d, %r0, climb
487 fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
C Add the realigned mid sums as a second carry chain.
490 add ma000, s000, s000 C accum mid 0
492 add,dc ma064, s064, s064 C accum mid 1
494 add,dc ma128, s128, s128 C accum mid 2
496 add,dc ma192, s192, s192 C accum mid 3
499 add,dc ma256, climb, climb
C Subtract the four sums from r000-r192 as one borrow chain. The last two
C instructions negate climb twice, the first time with borrow, which adds
C the final borrow to climb.
501 sub r000, s000, s000 C accum rlimb 0
504 sub,db r064, s064, s064 C accum rlimb 1
505 sub,db r128, s128, s128 C accum rlimb 2
508 sub,db r192, s192, s192 C accum rlimb 3
509 sub,db %r0, climb, climb
510 sub %r0, climb, climb
C Second part: multiply the next group while reloading the mid products that
C were stored in the first part.
513 xmpyu %fr8R, %fr4L, %fr22
514 ldd -0x78(%r30), p032a1
515 xmpyu %fr8L, %fr4R, %fr23
518 xmpyu %fr8R, %fr5L, %fr24
519 ldd -0x70(%r30), p032a2
520 xmpyu %fr8L, %fr5R, %fr25
523 xmpyu %fr8R, %fr6L, %fr26
524 ldd -0x38(%r30), p096b1
525 xmpyu %fr8L, %fr6R, %fr27
526 fstd %fr22, -0x78(%r30) C mid product to -0x78..-0x71
528 xmpyu %fr8R, %fr7L, %fr28
529 ldd -0x30(%r30), p096b2
530 xmpyu %fr8L, %fr7R, %fr29
531 fstd %fr23, -0x70(%r30) C mid product to -0x70..-0x69
533 xmpyu %fr8R, %fr4R, %fr30
534 ldd -0x58(%r30), p160c1
535 xmpyu %fr8L, %fr4L, %fr31
536 fstd %fr24, -0x38(%r30) C mid product to -0x38..-0x31
538 xmpyu %fr8R, %fr5R, %fr22
539 ldd -0x50(%r30), p160c2
540 xmpyu %fr8L, %fr5L, %fr23
541 fstd %fr25, -0x30(%r30) C mid product to -0x30..-0x29
543 xmpyu %fr8R, %fr6R, %fr24
544 ldd -0x18(%r30), p224d1
545 xmpyu %fr8L, %fr6L, %fr25
546 fstd %fr26, -0x58(%r30) C mid product to -0x58..-0x51
548 xmpyu %fr8R, %fr7R, %fr26
549 ldd -0x10(%r30), p224d2
550 fstd %fr27, -0x50(%r30) C mid product to -0x50..-0x49
551 xmpyu %fr8L, %fr7L, %fr27
C Count down the groups and loop while any remain.
C NOTE(review): the instruction in the delay slot of this branch is not
C visible in this view.
553 addib,<> -1, n, L(loop)
C Wind-down, part one. Same combine and accumulate sequence as the first part
C of the loop body, still storing the products of the last multiplied group,
C but with no further multiplies.
C NOTE(review): presumably the L(end2) entry; the label and the loads and
C stores of the destination limbs are not visible in this view.
C
C Mid sums as one carry chain, then realigned in place into ma000..ma256.
557 add p032a1, p032a2, m032
558 ldd -0x80(%r30), p000a
559 add,dc p096b1, p096b2, m096
560 fstd %fr28, -0x18(%r30) C mid product to -0x18..-0x11
561 add,dc p160c1, p160c2, m160
562 ldd -0x68(%r30), p064a
563 add,dc p224d1, p224d2, m224
564 fstd %fr29, -0x10(%r30) C mid product to -0x10..-0x09
565 add,dc %r0, %r0, m288
566 ldd -0x40(%r30), p064b
567 fstd %fr30, -0x80(%r30) C low product to -0x80..-0x79
568 depd,z m032, 31, 32, ma000
569 ldd -0x28(%r30), p128b
570 extrd,u m032, 31, 32, ma064
571 fstd %fr31, -0x68(%r30) C high product to -0x68..-0x61
572 depd m096, 31, 32, ma064
573 ldd -0x60(%r30), p128c
574 extrd,u m096, 31, 32, ma128
575 fstd %fr22, -0x40(%r30) C low product to -0x40..-0x39
576 depd m160, 31, 32, ma128
577 ldd -0x48(%r30), p192c
578 extrd,u m160, 31, 32, ma192
579 fstd %fr23, -0x28(%r30) C high product to -0x28..-0x21
580 depd m224, 31, 32, ma192
581 ldd -0x20(%r30), p192d
582 extrd,u m224, 31, 32, ma256
583 fstd %fr24, -0x60(%r30) C low product to -0x60..-0x59
584 depd m288, 31, 32, ma256
585 ldd -0x88(%r30), p256d
C Carry chain over climb and the low and high products.
586 add climb, p000a, s000
587 fstd %fr25, -0x48(%r30) C high product to -0x48..-0x41
588 add,dc p064a, p064b, s064
590 add,dc p128b, p128c, s128
591 fstd %fr26, -0x20(%r30) C low product to -0x20..-0x19
592 add,dc p192c, p192d, s192
594 add,dc p256d, %r0, climb
595 fstd %fr27, -0x88(%r30) C high product to -0x88..-0x81
C Add the realigned mid sums, subtract from r000-r192 with borrow, then add
C the final borrow to climb by negating it twice.
597 add ma000, s000, s000 C accum mid 0
599 add,dc ma064, s064, s064 C accum mid 1
600 add,dc ma128, s128, s128 C accum mid 2
601 add,dc ma192, s192, s192 C accum mid 3
602 add,dc ma256, climb, climb
603 sub r000, s000, s000 C accum rlimb 0
604 sub,db r064, s064, s064 C accum rlimb 1
605 sub,db r128, s128, s128 C accum rlimb 2
607 sub,db r192, s192, s192 C accum rlimb 3
608 sub,db %r0, climb, climb
609 sub %r0, climb, climb
C Wind-down, part two. Reload the mid products of the last group, then run
C the combine and accumulate sequence once more. No new products are stored
C and no multiplies are issued here.
C NOTE(review): the loads and stores of the destination limbs are not visible
C in this view.
611 ldd -0x78(%r30), p032a1
613 ldd -0x70(%r30), p032a2
615 ldd -0x38(%r30), p096b1
616 ldd -0x30(%r30), p096b2
617 ldd -0x58(%r30), p160c1
618 ldd -0x50(%r30), p160c2
619 ldd -0x18(%r30), p224d1
620 ldd -0x10(%r30), p224d2
C Mid sums as one carry chain, then realigned in place into ma000..ma256,
C interleaved with the reloads of the low and high products.
624 add p032a1, p032a2, m032
625 ldd -0x80(%r30), p000a
626 add,dc p096b1, p096b2, m096
627 add,dc p160c1, p160c2, m160
628 ldd -0x68(%r30), p064a
629 add,dc p224d1, p224d2, m224
630 add,dc %r0, %r0, m288
631 ldd -0x40(%r30), p064b
632 depd,z m032, 31, 32, ma000
633 ldd -0x28(%r30), p128b
634 extrd,u m032, 31, 32, ma064
635 depd m096, 31, 32, ma064
636 ldd -0x60(%r30), p128c
637 extrd,u m096, 31, 32, ma128
638 depd m160, 31, 32, ma128
639 ldd -0x48(%r30), p192c
640 extrd,u m160, 31, 32, ma192
641 depd m224, 31, 32, ma192
642 ldd -0x20(%r30), p192d
643 extrd,u m224, 31, 32, ma256
644 depd m288, 31, 32, ma256
645 ldd -0x88(%r30), p256d
C Carry chain over climb and the low and high products.
646 add climb, p000a, s000
647 add,dc p064a, p064b, s064
649 add,dc p128b, p128c, s128
650 add,dc p192c, p192d, s192
652 add,dc p256d, %r0, climb
C Add the realigned mid sums, subtract from r000-r192 with borrow, then add
C the final borrow to climb by negating it twice.
654 add ma000, s000, s000 C accum mid 0
656 add,dc ma064, s064, s064 C accum mid 1
657 add,dc ma128, s128, s128 C accum mid 2
658 add,dc ma192, s192, s192 C accum mid 3
659 add,dc ma256, climb, climb
660 sub r000, s000, s000 C accum rlimb 0
661 sub,db r064, s064, s064 C accum rlimb 1
662 sub,db r128, s128, s128 C accum rlimb 2
664 sub,db r192, s192, s192 C accum rlimb 3
665 sub,db %r0, climb, climb
666 sub %r0, climb, climb
C Function exit. Reload %r13-%r10 from the stack slots they were saved in
C before the 4-way code.
671 ldd -0xb0(%r30), %r13
672 ldd -0xb8(%r30), %r12
673 ldd -0xc0(%r30), %r11
674 ldd -0xc8(%r30), %r10
C Hand back climb. In the variant without HAVE_ABI_2_0w the low 32 bits of
C climb go to %r29 and the high 32 bits to %r28, presumably the return value
C registers of that ABI - confirm. The 2.0w variant is not visible in this
C view. The final ldd,mb steps %r30 back by 0x100 and reloads %r3, undoing
C the std,ma at function entry.
680 ifdef(`HAVE_ABI_2_0w',
682 ',` extrd,u climb, 63, 32, %r29
683 extrd,u climb, 31, 32, %r28
688 ldd,mb -0x100(%r30), %r3
689 EPILOGUE(mpn_submul_1)