1 dnl SPARC v9 64-bit mpn_mul_1 -- Multiply a limb vector with a limb and store
2 dnl the result in a second limb vector.
4 dnl Copyright 1998, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
6 dnl This file is part of the GNU MP Library.
8 dnl The GNU MP Library is free software; you can redistribute it and/or modify
9 dnl it under the terms of the GNU Lesser General Public License as published
10 dnl by the Free Software Foundation; either version 3 of the License, or (at
11 dnl your option) any later version.
13 dnl The GNU MP Library is distributed in the hope that it will be useful, but
14 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16 dnl License for more details.
18 dnl You should have received a copy of the GNU Lesser General Public License
19 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
21 include(`../config.m4')
27 C Algorithm: We use eight floating-point multiplies per limb product, with the
28 C invariant v operand split into four 16-bit pieces, and the s1 operand split
29 C into 32-bit pieces. We sum pairs of 48-bit partial products using
30 C floating-point add, then convert the four 49-bit product-sums and transfer
31 C them to the integer unit.
33 C Possible optimizations:
34 C 1. Align the stack area where we transfer the four 49-bit product-sums
35 C to a 32-byte boundary. That would minimize the cache collision.
36 C (UltraSPARC-1/2 use a direct-mapped cache.) (Perhaps even better would
37 C be to align the area to map to the area immediately before s1?)
38 C 2. Sum the four 49-bit quantities using 32-bit operations, as in the
39 C development version of mpn_addmul_2. This would save many integer instructions.
40 C 3. Unrolling. Questionable if it is worth the code expansion, given that
41 C it could only save 1 cycle/limb.
42 C 4. Specialize for particular v values. If its upper 32 bits are zero, we
43 C could save many operations, in the FPU (fmuld), but more so in the IEU
44 C since we'll be summing 48-bit quantities, which might be simpler.
45 C 5. Ideally, we should schedule the f2/f3 and f4/f5 RAW further apart, and
46 C the i00,i16,i32,i48 RAW closer together. The distance for the latter should
47 C be no greater than needed for L2 cache latency, and also not so great
48 C that i16 needs to be copied.
49 C 6. Avoid performing mem+fa+fm in the same cycle, at least not when we want
50 C to get high IEU bandwidth. (12 of the 14 cycles will be free for 2 IEU ops.)
53 C Instruction classification (as per UltraSPARC-1/2 functional units):
57 C 9 ISHIFT + 10? IADDLOG
59 C 49 insns in total (plus three mov insns that should be optimized out)
61 C The loop executes 53 instructions in 14 cycles on UltraSPARC-1/2, i.e., we
62 C sustain 3.79 instructions/cycle.
C Mark %g2 and %g3 as scratch registers. NOTE(review): REGISTER is not defined
C in the visible part of this file; presumably it is supplied by the included
C config.m4 -- confirm.
71 REGISTER(%g2,#scratch)
72 REGISTER(%g3,#scratch)
C m4 aliases for the double-precision floating-point registers used below.
C   p00..p48, r32, r48: not referenced in the visible code -- TODO confirm role.
C   r64, r80: destinations of the fmuld products u32*v32 and u32*v48.
74 define(`p00', `%f8') define(`p16',`%f10') define(`p32',`%f12') define(`p48',`%f14')
75 define(`r32',`%f16') define(`r48',`%f18') define(`r64',`%f20') define(`r80',`%f22')
C   v00..v48: loaded with ldd from the stack scratch slots %sp+2223+0..24
C   during initialization; per the header comment these hold the four 16-bit
C   pieces of v as IEEE doubles.
76 define(`v00',`%f24') define(`v16',`%f26') define(`v32',`%f28') define(`v48',`%f30')
C   u00, u32: u32 is the first operand of the fmuld instructions; the header
C   comment says the s1 operand is split into 32-bit pieces.
77 define(`u00',`%f32') define(`u32', `%f34')
C   a00..a48: written with std to the stack scratch slots, from which the
C   integer unit reloads them.
78 define(`a00',`%f36') define(`a16',`%f38') define(`a32',`%f40') define(`a48',`%f42')
C Integer-side aliases: i00..i48 receive the scratch slots via ldx.
81 define(`i00',`%l0') define(`i16',`%l1') define(`i32',`%l2') define(`i48',`%l3')
C Mask register, set by the initialization code with srlx %g4, 32.
C NOTE(review): the code below also uses the names xffff and cy, whose
C definitions are not visible in this excerpt.
82 define(`xffffffff',`%l7')
87 C Initialization. (1) Split v operand into four 16-bit chunks and store them
88 C as IEEE double in fp registers. (2) Clear upper 32 bits of fp register pairs
89 C f2 and f4. (3) Store masks in registers aliased to `xffff' and `xffffffff'.
93 srlx %g4, 48, xffff C store mask in register `xffff'
101 stx %g2, [%sp+2223+16]
103 stx %g3, [%sp+2223+24]
104 srlx %g4, 32, xffffffff C store mask in register `xffffffff'
115 ldd [%sp+2223+0], v00
116 ldd [%sp+2223+8], v16
117 ldd [%sp+2223+16], v32
118 ldd [%sp+2223+24], v48
119 ld [%sp+2223+0],%f2 C zero f2
120 ld [%sp+2223+0],%f4 C zero f4
121 ld [%i5+%i2], %f3 C read low 32 bits of up[i]
122 ld [%i1+%i2], %f5 C read high 32 bits of up[i]
128 C Start real work. (We sneakily read f3 and f5 above...)
129 C The software pipeline is very deep, requiring 4 feed-in stages.
139 bnz,pt %icc, .L_two_or_more
143 fmuld u32, v32, r64 C FIXME not urgent
147 fmuld u32, v48, r80 C FIXME not urgent
151 std a00, [%sp+2223+0]
152 std a16, [%sp+2223+8]
153 std a32, [%sp+2223+16]
154 std a48, [%sp+2223+24]
159 ldx [%sp+2223+0], i00
160 ldx [%sp+2223+8], i16
161 ldx [%sp+2223+16], i32
162 ldx [%sp+2223+24], i48
163 std a00, [%sp+2223+0]
164 std a16, [%sp+2223+8]
167 mov i00, %g5 C i00+ now in g5
168 ldx [%sp+2223+0], i00
169 srlx i16, 48, %l4 C (i16 >> 48)
171 ldx [%sp+2223+8], i16
172 srlx i48, 16, %l5 C (i48 >> 16)
173 mov i32, %g4 C i32+ now in g4
174 sllx i48, 32, %l6 C (i48 << 32)
175 srlx %g4, 32, %o3 C (i32 >> 32)
176 add %l5, %l4, %o1 C hi64- in %o1
177 std a00, [%sp+2223+0]
178 sllx %g4, 16, %o2 C (i32 << 16)
179 add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
180 std a16, [%sp+2223+8]
181 sllx %o1, 48, %o3 C (hi64 << 48)
182 add %g2, %o2, %o2 C mi64- in %o2
183 add %l6, %o2, %o2 C mi64- in %o2
184 sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
185 add cy, %g5, %o4 C x = prev(i00) + cy
190 ld [%i5+%i2], %f3 C read low 32 bits of up[i]
191 fmuld u32, v32, r64 C FIXME not urgent
193 ld [%i1+%i2], %f5 C read high 32 bits of up[i]
196 fmuld u32, v48, r80 C FIXME not urgent
202 std a00, [%sp+2223+0]
204 std a16, [%sp+2223+8]
206 std a32, [%sp+2223+16]
208 std a48, [%sp+2223+24]
214 bnz,pt %icc, .L_three_or_more
218 fmuld u32, v32, r64 C FIXME not urgent
222 fmuld u32, v48, r80 C FIXME not urgent
224 ldx [%sp+2223+0], i00
226 ldx [%sp+2223+8], i16
227 ldx [%sp+2223+16], i32
228 ldx [%sp+2223+24], i48
230 std a00, [%sp+2223+0]
231 std a16, [%sp+2223+8]
232 std a32, [%sp+2223+16]
233 std a48, [%sp+2223+24]
237 mov i00, %g5 C i00+ now in g5
239 ldx [%sp+2223+0], i00
240 srlx i16, 48, %l4 C (i16 >> 48)
242 ldx [%sp+2223+8], i16
243 srlx i48, 16, %l5 C (i48 >> 16)
244 mov i32, %g4 C i32+ now in g4
245 ldx [%sp+2223+16], i32
246 sllx i48, 32, %l6 C (i48 << 32)
247 ldx [%sp+2223+24], i48
248 srlx %g4, 32, %o3 C (i32 >> 32)
249 add %l5, %l4, %o1 C hi64- in %o1
250 std a00, [%sp+2223+0]
251 sllx %g4, 16, %o2 C (i32 << 16)
252 add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
253 std a16, [%sp+2223+8]
254 sllx %o1, 48, %o3 C (hi64 << 48)
255 add %g2, %o2, %o2 C mi64- in %o2
256 add %l6, %o2, %o2 C mi64- in %o2
257 sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
258 add cy, %g5, %o4 C x = prev(i00) + cy
263 ld [%i5+%i2], %f3 C read low 32 bits of up[i]
264 fmuld u32, v32, r64 C FIXME not urgent
266 ld [%i1+%i2], %f5 C read high 32 bits of up[i]
269 fmuld u32, v48, r80 C FIXME not urgent
271 ldx [%sp+2223+0], i00
273 ldx [%sp+2223+8], i16
275 ldx [%sp+2223+16], i32
277 ldx [%sp+2223+24], i48
279 std a00, [%sp+2223+0]
281 std a16, [%sp+2223+8]
283 std a32, [%sp+2223+16]
285 std a48, [%sp+2223+24]
291 bnz,pt %icc, .L_four_or_more
295 fmuld u32, v32, r64 C FIXME not urgent
299 mov i00, %g5 C i00+ now in g5
300 fmuld u32, v48, r80 C FIXME not urgent
302 ldx [%sp+2223+0], i00
304 srlx i16, 48, %l4 C (i16 >> 48)
306 ldx [%sp+2223+8], i16
307 srlx i48, 16, %l5 C (i48 >> 16)
308 mov i32, %g4 C i32+ now in g4
309 ldx [%sp+2223+16], i32
310 sllx i48, 32, %l6 C (i48 << 32)
311 ldx [%sp+2223+24], i48
313 srlx %g4, 32, %o3 C (i32 >> 32)
314 add %l5, %l4, %o1 C hi64- in %o1
315 std a00, [%sp+2223+0]
316 sllx %g4, 16, %o2 C (i32 << 16)
317 add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
318 std a16, [%sp+2223+8]
319 sllx %o1, 48, %o3 C (hi64 << 48)
320 add %g2, %o2, %o2 C mi64- in %o2
321 std a32, [%sp+2223+16]
322 add %l6, %o2, %o2 C mi64- in %o2
323 std a48, [%sp+2223+24]
324 sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
325 add cy, %g5, %o4 C x = prev(i00) + cy
330 ld [%i5+%i2], %f3 C read low 32 bits of up[i]
331 fmuld u32, v32, r64 C FIXME not urgent
333 ld [%i1+%i2], %f5 C read high 32 bits of up[i]
336 mov i00, %g5 C i00+ now in g5
337 fmuld u32, v48, r80 C FIXME not urgent
339 ldx [%sp+2223+0], i00
341 srlx i16, 48, %l4 C (i16 >> 48)
343 ldx [%sp+2223+8], i16
345 srlx i48, 16, %l5 C (i48 >> 16)
346 mov i32, %g4 C i32+ now in g4
347 ldx [%sp+2223+16], i32
349 sllx i48, 32, %l6 C (i48 << 32)
350 ldx [%sp+2223+24], i48
352 srlx %g4, 32, %o3 C (i32 >> 32)
353 add %l5, %l4, %o1 C hi64- in %o1
354 std a00, [%sp+2223+0]
356 sllx %g4, 16, %o2 C (i32 << 16)
357 add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
358 std a16, [%sp+2223+8]
360 sllx %o1, 48, %o3 C (hi64 << 48)
361 add %g2, %o2, %o2 C mi64- in %o2
362 std a32, [%sp+2223+16]
364 add %l6, %o2, %o2 C mi64- in %o2
365 std a48, [%sp+2223+24]
368 sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
371 add cy, %g5, %o4 C x = prev(i00) + cy
383 srlx %o4, 16, %o5 C (x >> 16)
384 ld [%i5+%i2], %f3 C read low 32 bits of up[i]
385 fmuld u32, v32, r64 C FIXME not urgent
388 add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
389 and %o4, xffff, %o5 C (x & 0xffff)
390 ld [%i1+%i2], %f5 C read high 32 bits of up[i]
395 srlx %o2, 48, %o7 C (mi64 >> 48)
396 mov i00, %g5 C i00+ now in g5
397 fmuld u32, v48, r80 C FIXME not urgent
400 sllx %o2, 16, %i3 C (mi64 << 16)
401 add %o7, %o1, cy C new cy
402 ldx [%sp+2223+0], i00
405 srlx i16, 48, %l4 C (i16 >> 48)
407 ldx [%sp+2223+8], i16
410 srlx i48, 16, %l5 C (i48 >> 16)
411 mov i32, %g4 C i32+ now in g4
412 ldx [%sp+2223+16], i32
415 sllx i48, 32, %l6 C (i48 << 32)
417 ldx [%sp+2223+24], i48
420 srlx %g4, 32, %o3 C (i32 >> 32)
421 add %l5, %l4, %o1 C hi64- in %o1
422 std a00, [%sp+2223+0]
425 sllx %g4, 16, %o2 C (i32 << 16)
426 add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
427 std a16, [%sp+2223+8]
430 sllx %o1, 48, %o3 C (hi64 << 48)
431 add %g2, %o2, %o2 C mi64- in %o2
432 std a32, [%sp+2223+16]
435 add %l6, %o2, %o2 C mi64- in %o2
436 std a48, [%sp+2223+24]
440 sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
445 add cy, %g5, %o4 C x = prev(i00) + cy
452 srlx %o4, 16, %o5 C (x >> 16)
453 fmuld u32, v32, r64 C FIXME not urgent
455 add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
456 and %o4, xffff, %o5 C (x & 0xffff)
459 srlx %o2, 48, %o7 C (mi64 >> 48)
460 mov i00, %g5 C i00+ now in g5
461 fmuld u32, v48, r80 C FIXME not urgent
463 sllx %o2, 16, %i3 C (mi64 << 16)
464 add %o7, %o1, cy C new cy
465 ldx [%sp+2223+0], i00
467 srlx i16, 48, %l4 C (i16 >> 48)
469 ldx [%sp+2223+8], i16
470 srlx i48, 16, %l5 C (i48 >> 16)
471 mov i32, %g4 C i32+ now in g4
472 ldx [%sp+2223+16], i32
473 sllx i48, 32, %l6 C (i48 << 32)
475 ldx [%sp+2223+24], i48
477 srlx %g4, 32, %o3 C (i32 >> 32)
478 add %l5, %l4, %o1 C hi64- in %o1
479 std a00, [%sp+2223+0]
480 sllx %g4, 16, %o2 C (i32 << 16)
481 add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
482 std a16, [%sp+2223+8]
483 sllx %o1, 48, %o3 C (hi64 << 48)
484 add %g2, %o2, %o2 C mi64- in %o2
485 std a32, [%sp+2223+16]
486 add %l6, %o2, %o2 C mi64- in %o2
487 std a48, [%sp+2223+24]
488 sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
490 add cy, %g5, %o4 C x = prev(i00) + cy
493 srlx %o4, 16, %o5 C (x >> 16)
494 add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
495 and %o4, xffff, %o5 C (x & 0xffff)
497 srlx %o2, 48, %o7 C (mi64 >> 48)
498 mov i00, %g5 C i00+ now in g5
500 sllx %o2, 16, %i3 C (mi64 << 16)
501 add %o7, %o1, cy C new cy
502 ldx [%sp+2223+0], i00
503 srlx i16, 48, %l4 C (i16 >> 48)
505 ldx [%sp+2223+8], i16
506 srlx i48, 16, %l5 C (i48 >> 16)
507 mov i32, %g4 C i32+ now in g4
508 ldx [%sp+2223+16], i32
509 sllx i48, 32, %l6 C (i48 << 32)
511 ldx [%sp+2223+24], i48
512 srlx %g4, 32, %o3 C (i32 >> 32)
513 add %l5, %l4, %o1 C hi64- in %o1
514 std a00, [%sp+2223+0]
515 sllx %g4, 16, %o2 C (i32 << 16)
516 add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
517 std a16, [%sp+2223+8]
518 sllx %o1, 48, %o3 C (hi64 << 48)
519 add %g2, %o2, %o2 C mi64- in %o2
520 add %l6, %o2, %o2 C mi64- in %o2
521 sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
523 add cy, %g5, %o4 C x = prev(i00) + cy
526 srlx %o4, 16, %o5 C (x >> 16)
527 add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
528 and %o4, xffff, %o5 C (x & 0xffff)
529 srlx %o2, 48, %o7 C (mi64 >> 48)
530 mov i00, %g5 C i00+ now in g5
531 sllx %o2, 16, %i3 C (mi64 << 16)
532 add %o7, %o1, cy C new cy
533 ldx [%sp+2223+0], i00
534 srlx i16, 48, %l4 C (i16 >> 48)
536 ldx [%sp+2223+8], i16
537 srlx i48, 16, %l5 C (i48 >> 16)
538 mov i32, %g4 C i32+ now in g4
539 sllx i48, 32, %l6 C (i48 << 32)
541 srlx %g4, 32, %o3 C (i32 >> 32)
542 add %l5, %l4, %o1 C hi64- in %o1
543 sllx %g4, 16, %o2 C (i32 << 16)
544 add %o3, %o1, %o1 C hi64 in %o1 1st ASSIGNMENT
545 sllx %o1, 48, %o3 C (hi64 << 48)
546 add %g2, %o2, %o2 C mi64- in %o2
547 add %l6, %o2, %o2 C mi64- in %o2
548 sub %o2, %o3, %o2 C mi64 in %o2 1st ASSIGNMENT
550 add cy, %g5, %o4 C x = prev(i00) + cy
553 srlx %o4, 16, %o5 C (x >> 16)
554 add %o5, %o2, %o2 C mi64 in %o2 2nd ASSIGNMENT
555 and %o4, xffff, %o5 C (x & 0xffff)
556 srlx %o2, 48, %o7 C (mi64 >> 48)
557 sllx %o2, 16, %i3 C (mi64 << 16)
558 add %o7, %o1, cy C new cy