1 dnl SPARC v9 64-bit mpn_addmul_1 -- Multiply a limb vector with a limb and add
2 dnl the result to a second limb vector.
4 dnl Copyright 1998, 2000, 2001, 2002, 2003, 2004 Free Software Foundation,
7 dnl This file is part of the GNU MP Library.
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of the GNU Lesser General Public License as published
11 dnl by the Free Software Foundation; either version 3 of the License, or (at
12 dnl your option) any later version.
14 dnl The GNU MP Library is distributed in the hope that it will be useful, but
15 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
17 dnl License for more details.
19 dnl You should have received a copy of the GNU Lesser General Public License
20 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
22 include(`../config.m4')
28 C Algorithm: We use eight floating-point multiplies per limb product, with the
29 C invariant v operand split into four 16-bit pieces, and the up operand split
30 C into 32-bit pieces. We sum pairs of 48-bit partial products using
31 C floating-point add, then convert the four 49-bit product-sums and transfer
32 C them to the integer unit.
34 C Possible optimizations:
35 C 0. Rewrite to use algorithm of mpn_addmul_2.
36 C 1. Align the stack area where we transfer the four 49-bit product-sums
37 C to a 32-byte boundary. That would minimize the cache collision.
38 C (UltraSPARC-1/2 use a direct-mapped cache.) (Perhaps even better would
39 C be to align the area to map to the area immediately before up?)
40 C 2. Sum the 4 49-bit quantities using 32-bit operations, as in the
41 C development version of mpn_addmul_2. This would save many integer instructions.
42 C 3. Unrolling. Questionable if it is worth the code expansion, given that
43 C it could only save 1 cycle/limb.
44 C 4. Specialize for particular v values. If its upper 32 bits are zero, we
45 C could save many operations, in the FPU (fmuld), but more so in the IEU
46 C since we'll be summing 48-bit quantities, which might be simpler.
47 C 5. Ideally, we should schedule the f2/f3 and f4/f5 RAW dependencies further
48 C apart, and the i00,i16,i32,i48 RAW dependencies closer together. The latter
49 C distance should be no greater than needed to cover L2 cache latency, and
50 C also not so great that i16 needs to be copied.
51 C 6. Avoid performing mem+fa+fm in the same cycle, at least not when we want
52 C to get high IEU bandwidth. (12 of the 14 cycles will be free for 2 IEU
55 C Instruction classification (as per UltraSPARC-1/2 functional units):
59 C 10 ISHIFT + 14 IADDLOG
61 C 55 insns in total (plus one mov insn that should be optimized out)
63 C The loop executes 56 instructions in 14 cycles on UltraSPARC-1/2, i.e., we
64 C sustain the peak execution rate of 4 instructions/cycle.
73 REGISTER(%g2,#scratch)
74 REGISTER(%g3,#scratch)
76 define(`p00', `%f8') define(`p16',`%f10') define(`p32',`%f12') define(`p48',`%f14')
77 define(`r32',`%f16') define(`r48',`%f18') define(`r64',`%f20') define(`r80',`%f22')
78 define(`v00',`%f24') define(`v16',`%f26') define(`v32',`%f28') define(`v48',`%f30')
79 define(`u00',`%f32') define(`u32', `%f34')
80 define(`a00',`%f36') define(`a16',`%f38') define(`a32',`%f40') define(`a48',`%f42')
83 define(`i00',`%l0') define(`i16',`%l1') define(`i32',`%l2') define(`i48',`%l3')
84 define(`xffffffff',`%l7')
87 PROLOGUE(mpn_addmul_1)
89 C Initialization. (1) Split v operand into four 16-bit chunks and store them
90 C as IEEE double in fp registers. (2) Clear upper 32 bits of fp register pairs
91 C f2 and f4. (3) Store masks in registers aliased to `xffff' and `xffffffff'.
95 srlx %g4, 48, xffff C store mask in register `xffff'
100 stx %g3, [%sp+2223+8]
103 stx %g2, [%sp+2223+16]
105 stx %g3, [%sp+2223+24]
106 srlx %g4, 32, xffffffff C store mask in register `xffffffff'
117 ldd [%sp+2223+0], v00
118 ldd [%sp+2223+8], v16
119 ldd [%sp+2223+16], v32
120 ldd [%sp+2223+24], v48
121 ld [%sp+2223+0],%f2 C zero f2
122 ld [%sp+2223+0],%f4 C zero f4
123 ld [%i5+%i2], %f3 C read low 32 bits of up[i]
124 ld [%i1+%i2], %f5 C read high 32 bits of up[i]
130 C Start real work. (We sneakingly read f3 and f5 above...)
131 C The software pipeline is very deep, requiring 4 feed-in stages.
141 bnz,pt %icc, .L_two_or_more
145 fmuld u32, v32, r64 C FIXME not urgent
149 fmuld u32, v48, r80 C FIXME not urgent
153 std a00, [%sp+2223+0]
154 std a16, [%sp+2223+8]
155 std a32, [%sp+2223+16]
156 std a48, [%sp+2223+24]
160 ldx [%i0+%i2], rlimb C read rp[i]
162 ldx [%sp+2223+0], i00
163 ldx [%sp+2223+8], i16
164 ldx [%sp+2223+16], i32
165 ldx [%sp+2223+24], i48
166 std a00, [%sp+2223+0]
167 std a16, [%sp+2223+8]
170 srlx rlimb, 32, %g4 C HI(rlimb)
171 and rlimb, xffffffff, %g5 C LO(rlimb)
172 add i00, %g5, %g5 C i00+ now in g5
173 ldx [%sp+2223+0], i00
174 srlx i16, 48, %l4 C (i16 >> 48)
176 ldx [%sp+2223+8], i16
177 srlx i48, 16, %l5 C (i48 >> 16)
178 add i32, %g4, %g4 C i32+ now in g4
179 sllx i48, 32, %l6 C (i48 << 32)
180 srlx %g4, 32, %o3 C (i32 >> 32)
181 add %l5, %l4, %o1 C hi64- in %o1
182 std a00, [%sp+2223+0]
183 sllx %g4, 16, %o2 C (i32 << 16)
184 add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
185 std a16, [%sp+2223+8]
186 sllx %o1, 48, %o3 C (hi64 << 48)
187 add %g2, %o2, %o2 C mi64- in %o2
188 add %l6, %o2, %o2 C mi64- in %o2
189 sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
190 add cy, %g5, %o4 C x = prev(i00) + cy
195 ld [%i5+%i2], %f3 C read low 32 bits of up[i]
196 fmuld u32, v32, r64 C FIXME not urgent
198 ld [%i1+%i2], %f5 C read high 32 bits of up[i]
201 fmuld u32, v48, r80 C FIXME not urgent
207 std a00, [%sp+2223+0]
209 std a16, [%sp+2223+8]
211 std a32, [%sp+2223+16]
213 std a48, [%sp+2223+24]
219 bnz,pt %icc, .L_three_or_more
223 fmuld u32, v32, r64 C FIXME not urgent
226 ldx [%i0+%i2], rlimb C read rp[i]
228 fmuld u32, v48, r80 C FIXME not urgent
230 ldx [%sp+2223+0], i00
232 ldx [%sp+2223+8], i16
233 ldx [%sp+2223+16], i32
234 ldx [%sp+2223+24], i48
236 std a00, [%sp+2223+0]
237 std a16, [%sp+2223+8]
238 std a32, [%sp+2223+16]
239 std a48, [%sp+2223+24]
243 srlx rlimb, 32, %g4 C HI(rlimb)
244 and rlimb, xffffffff, %g5 C LO(rlimb)
245 ldx [%i0+%i2], rlimb C read rp[i]
246 add i00, %g5, %g5 C i00+ now in g5
248 ldx [%sp+2223+0], i00
249 srlx i16, 48, %l4 C (i16 >> 48)
251 ldx [%sp+2223+8], i16
252 srlx i48, 16, %l5 C (i48 >> 16)
253 add i32, %g4, %g4 C i32+ now in g4
254 ldx [%sp+2223+16], i32
255 sllx i48, 32, %l6 C (i48 << 32)
256 ldx [%sp+2223+24], i48
257 srlx %g4, 32, %o3 C (i32 >> 32)
258 add %l5, %l4, %o1 C hi64- in %o1
259 std a00, [%sp+2223+0]
260 sllx %g4, 16, %o2 C (i32 << 16)
261 add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
262 std a16, [%sp+2223+8]
263 sllx %o1, 48, %o3 C (hi64 << 48)
264 add %g2, %o2, %o2 C mi64- in %o2
265 add %l6, %o2, %o2 C mi64- in %o2
266 sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
267 add cy, %g5, %o4 C x = prev(i00) + cy
272 ld [%i5+%i2], %f3 C read low 32 bits of up[i]
273 fmuld u32, v32, r64 C FIXME not urgent
275 ld [%i1+%i2], %f5 C read high 32 bits of up[i]
277 ldx [%i0+%i2], rlimb C read rp[i]
279 fmuld u32, v48, r80 C FIXME not urgent
281 ldx [%sp+2223+0], i00
283 ldx [%sp+2223+8], i16
285 ldx [%sp+2223+16], i32
287 ldx [%sp+2223+24], i48
289 std a00, [%sp+2223+0]
291 std a16, [%sp+2223+8]
293 std a32, [%sp+2223+16]
295 std a48, [%sp+2223+24]
301 bnz,pt %icc, .L_four_or_more
305 fmuld u32, v32, r64 C FIXME not urgent
308 srlx rlimb, 32, %g4 C HI(rlimb)
309 and rlimb, xffffffff, %g5 C LO(rlimb)
310 ldx [%i0+%i2], rlimb C read rp[i]
312 add i00, %g5, %g5 C i00+ now in g5
313 fmuld u32, v48, r80 C FIXME not urgent
315 ldx [%sp+2223+0], i00
317 srlx i16, 48, %l4 C (i16 >> 48)
319 ldx [%sp+2223+8], i16
320 srlx i48, 16, %l5 C (i48 >> 16)
321 add i32, %g4, %g4 C i32+ now in g4
322 ldx [%sp+2223+16], i32
323 sllx i48, 32, %l6 C (i48 << 32)
324 ldx [%sp+2223+24], i48
326 srlx %g4, 32, %o3 C (i32 >> 32)
327 add %l5, %l4, %o1 C hi64- in %o1
328 std a00, [%sp+2223+0]
329 sllx %g4, 16, %o2 C (i32 << 16)
330 add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
331 std a16, [%sp+2223+8]
332 sllx %o1, 48, %o3 C (hi64 << 48)
333 add %g2, %o2, %o2 C mi64- in %o2
334 std a32, [%sp+2223+16]
335 add %l6, %o2, %o2 C mi64- in %o2
336 std a48, [%sp+2223+24]
337 sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
338 add cy, %g5, %o4 C x = prev(i00) + cy
343 ld [%i5+%i2], %f3 C read low 32 bits of up[i]
344 fmuld u32, v32, r64 C FIXME not urgent
346 ld [%i1+%i2], %f5 C read high 32 bits of up[i]
348 srlx rlimb, 32, %g4 C HI(rlimb)
349 and rlimb, xffffffff, %g5 C LO(rlimb)
350 ldx [%i0+%i2], rlimb C read rp[i]
352 add i00, %g5, %g5 C i00+ now in g5
353 fmuld u32, v48, r80 C FIXME not urgent
355 ldx [%sp+2223+0], i00
357 srlx i16, 48, %l4 C (i16 >> 48)
359 ldx [%sp+2223+8], i16
361 srlx i48, 16, %l5 C (i48 >> 16)
362 add i32, %g4, %g4 C i32+ now in g4
363 ldx [%sp+2223+16], i32
365 sllx i48, 32, %l6 C (i48 << 32)
366 ldx [%sp+2223+24], i48
368 srlx %g4, 32, %o3 C (i32 >> 32)
369 add %l5, %l4, %o1 C hi64- in %o1
370 std a00, [%sp+2223+0]
372 sllx %g4, 16, %o2 C (i32 << 16)
373 add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
374 std a16, [%sp+2223+8]
376 sllx %o1, 48, %o3 C (hi64 << 48)
377 add %g2, %o2, %o2 C mi64- in %o2
378 std a32, [%sp+2223+16]
380 add %l6, %o2, %o2 C mi64- in %o2
381 std a48, [%sp+2223+24]
384 sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
387 add cy, %g5, %o4 C x = prev(i00) + cy
399 srlx %o4, 16, %o5 C (x >> 16)
400 ld [%i5+%i2], %f3 C read low 32 bits of up[i]
401 fmuld u32, v32, r64 C FIXME not urgent
404 add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
405 and %o4, xffff, %o5 C (x & 0xffff)
406 ld [%i1+%i2], %f5 C read high 32 bits of up[i]
409 srlx rlimb, 32, %g4 C HI(rlimb)
410 and rlimb, xffffffff, %g5 C LO(rlimb)
411 ldx [%i0+%i2], rlimb C read rp[i]
414 srlx %o2, 48, %o7 C (mi64 >> 48)
415 add i00, %g5, %g5 C i00+ now in g5
416 fmuld u32, v48, r80 C FIXME not urgent
419 sllx %o2, 16, %i3 C (mi64 << 16)
420 add %o7, %o1, cy C new cy
421 ldx [%sp+2223+0], i00
424 srlx i16, 48, %l4 C (i16 >> 48)
426 ldx [%sp+2223+8], i16
429 srlx i48, 16, %l5 C (i48 >> 16)
430 add i32, %g4, %g4 C i32+ now in g4
431 ldx [%sp+2223+16], i32
434 sllx i48, 32, %l6 C (i48 << 32)
436 ldx [%sp+2223+24], i48
439 srlx %g4, 32, %o3 C (i32 >> 32)
440 add %l5, %l4, %o1 C hi64- in %o1
441 std a00, [%sp+2223+0]
444 sllx %g4, 16, %o2 C (i32 << 16)
445 add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
446 std a16, [%sp+2223+8]
449 sllx %o1, 48, %o3 C (hi64 << 48)
450 add %g2, %o2, %o2 C mi64- in %o2
451 std a32, [%sp+2223+16]
454 add %l6, %o2, %o2 C mi64- in %o2
455 std a48, [%sp+2223+24]
459 sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
464 add cy, %g5, %o4 C x = prev(i00) + cy
471 srlx %o4, 16, %o5 C (x >> 16)
472 fmuld u32, v32, r64 C FIXME not urgent
474 add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
475 and %o4, xffff, %o5 C (x & 0xffff)
477 srlx rlimb, 32, %g4 C HI(rlimb)
478 and rlimb, xffffffff, %g5 C LO(rlimb)
479 ldx [%i0+%i2], rlimb C read rp[i]
481 srlx %o2, 48, %o7 C (mi64 >> 48)
482 add i00, %g5, %g5 C i00+ now in g5
483 fmuld u32, v48, r80 C FIXME not urgent
485 sllx %o2, 16, %i3 C (mi64 << 16)
486 add %o7, %o1, cy C new cy
487 ldx [%sp+2223+0], i00
489 srlx i16, 48, %l4 C (i16 >> 48)
491 ldx [%sp+2223+8], i16
492 srlx i48, 16, %l5 C (i48 >> 16)
493 add i32, %g4, %g4 C i32+ now in g4
494 ldx [%sp+2223+16], i32
495 sllx i48, 32, %l6 C (i48 << 32)
497 ldx [%sp+2223+24], i48
499 srlx %g4, 32, %o3 C (i32 >> 32)
500 add %l5, %l4, %o1 C hi64- in %o1
501 std a00, [%sp+2223+0]
502 sllx %g4, 16, %o2 C (i32 << 16)
503 add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
504 std a16, [%sp+2223+8]
505 sllx %o1, 48, %o3 C (hi64 << 48)
506 add %g2, %o2, %o2 C mi64- in %o2
507 std a32, [%sp+2223+16]
508 add %l6, %o2, %o2 C mi64- in %o2
509 std a48, [%sp+2223+24]
510 sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
512 add cy, %g5, %o4 C x = prev(i00) + cy
515 srlx %o4, 16, %o5 C (x >> 16)
516 add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
517 and %o4, xffff, %o5 C (x & 0xffff)
519 srlx rlimb, 32, %g4 C HI(rlimb)
520 and rlimb, xffffffff, %g5 C LO(rlimb)
521 ldx [%i0+%i2], rlimb C read rp[i]
522 srlx %o2, 48, %o7 C (mi64 >> 48)
523 add i00, %g5, %g5 C i00+ now in g5
525 sllx %o2, 16, %i3 C (mi64 << 16)
526 add %o7, %o1, cy C new cy
527 ldx [%sp+2223+0], i00
528 srlx i16, 48, %l4 C (i16 >> 48)
530 ldx [%sp+2223+8], i16
531 srlx i48, 16, %l5 C (i48 >> 16)
532 add i32, %g4, %g4 C i32+ now in g4
533 ldx [%sp+2223+16], i32
534 sllx i48, 32, %l6 C (i48 << 32)
536 ldx [%sp+2223+24], i48
537 srlx %g4, 32, %o3 C (i32 >> 32)
538 add %l5, %l4, %o1 C hi64- in %o1
539 std a00, [%sp+2223+0]
540 sllx %g4, 16, %o2 C (i32 << 16)
541 add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
542 std a16, [%sp+2223+8]
543 sllx %o1, 48, %o3 C (hi64 << 48)
544 add %g2, %o2, %o2 C mi64- in %o2
545 add %l6, %o2, %o2 C mi64- in %o2
546 sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
548 add cy, %g5, %o4 C x = prev(i00) + cy
551 srlx %o4, 16, %o5 C (x >> 16)
552 add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
553 and %o4, xffff, %o5 C (x & 0xffff)
554 srlx rlimb, 32, %g4 C HI(rlimb)
555 and rlimb, xffffffff, %g5 C LO(rlimb)
556 srlx %o2, 48, %o7 C (mi64 >> 48)
557 add i00, %g5, %g5 C i00+ now in g5
558 sllx %o2, 16, %i3 C (mi64 << 16)
559 add %o7, %o1, cy C new cy
560 ldx [%sp+2223+0], i00
561 srlx i16, 48, %l4 C (i16 >> 48)
563 ldx [%sp+2223+8], i16
564 srlx i48, 16, %l5 C (i48 >> 16)
565 add i32, %g4, %g4 C i32+ now in g4
566 sllx i48, 32, %l6 C (i48 << 32)
568 srlx %g4, 32, %o3 C (i32 >> 32)
569 add %l5, %l4, %o1 C hi64- in %o1
570 sllx %g4, 16, %o2 C (i32 << 16)
571 add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
572 sllx %o1, 48, %o3 C (hi64 << 48)
573 add %g2, %o2, %o2 C mi64- in %o2
574 add %l6, %o2, %o2 C mi64- in %o2
575 sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
577 add cy, %g5, %o4 C x = prev(i00) + cy
580 srlx %o4, 16, %o5 C (x >> 16)
581 add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
582 and %o4, xffff, %o5 C (x & 0xffff)
583 srlx %o2, 48, %o7 C (mi64 >> 48)
584 sllx %o2, 16, %i3 C (mi64 << 16)
585 add %o7, %o1, cy C new cy
596 EPILOGUE(mpn_addmul_1)