1 dnl SPARC v9 64-bit mpn_addmul_2 -- Multiply an n limb number with 2-limb
2 dnl number and add the result to a n limb vector.
4 dnl Copyright 2002, 2003 Free Software Foundation, Inc.
6 dnl This file is part of the GNU MP Library.
8 dnl The GNU MP Library is free software; you can redistribute it and/or modify
9 dnl it under the terms of the GNU Lesser General Public License as published
10 dnl by the Free Software Foundation; either version 3 of the License, or (at
11 dnl your option) any later version.
13 dnl The GNU MP Library is distributed in the hope that it will be useful, but
14 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16 dnl License for more details.
18 dnl You should have received a copy of the GNU Lesser General Public License
19 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
21 include(`../config.m4')
27 C Algorithm: We use 16 floating-point multiplies per limb product, with the
28 C 2-limb v operand split into eight 16-bit pieces, and the n-limb u operand
29 C split into 32-bit pieces. We sum four 48-bit partial products using
30 C floating-point add, then convert the resulting four 50-bit quantities and
31 C transfer them to the integer unit.
33 C Possible optimizations:
34 C 1. Align the stack area where we transfer the four 50-bit product-sums
35 C to a 32-byte boundary. That would minimize the cache collision.
36 C (UltraSPARC-1/2 use a direct-mapped cache.) (Perhaps even better would
37 C be to align the area to map to the area immediately before up?)
38 C 2. Perform two of the fp->int conversions with integer instructions. We
39 C can get almost ten free IEU slots, if we clean up bookkeeping and the
40 C silly carry-limb code.
41 C 3. For an mpn_addmul_1 based on this, we need to fix the silly carry-limb code.
44 C OSP (Overlapping software pipeline) version of mpn_mul_basecase:
45 C Operand swap will require 8 LDDA and 8 FXTOD, which will mean 8 cycles.
51 C Instruction classification (as per UltraSPARC functional units).
52 C Assuming silly carry code is fixed. Includes bookkeeping.
54 C               mpn_addmul_X    mpn_mul_X
   C               1       2       1       2
56 C               ==========      ==========
64 C TOTAL IEU     17      17      16      16
67 C IEU cycles    8.5     8.5     8       8
68 C MEM cycles    12      12      10      10
69 C ISSUE cycles  12      16      11.25   15.25
70 C FPU cycles    10      18      10      18
71 C cycles/loop   12      18      12      18
72 C cycles/limb   12      9       12      9
C Register declarations and m4 aliases used by mpn_addmul_2 below.
C
C %g2 and %g3 are declared to the assembler as scratch registers.
83 REGISTER(%g2,#scratch)
84 REGISTER(%g3,#scratch)
C Floating-point aliases.  The three-digit suffix (000, 016, ... 112) appears
C to be the bit position of the 16-bit v piece involved -- NOTE(review):
C confirm.
C
C p*: destinations of the fmuld partial products.  p096 and p112 exist in an
C "a" and a "b" copy: the "a" copies are written by the u00 multiplies and the
C "b" copies by the u32 multiplies.
94 define(`p000', `%f8') define(`p016',`%f10')
95 define(`p032',`%f12') define(`p048',`%f14')
96 define(`p064',`%f16') define(`p080',`%f18')
97 define(`p096a',`%f20') define(`p112a',`%f22')
98 define(`p096b',`%f56') define(`p112b',`%f58')
C out*: values spilled to the stack scratch area with std; out000 is the
C fdtox result of a000 where that conversion is visible.
100 define(`out000',`%f0') define(`out016',`%f6')
C v*: the eight pieces of the 2-limb v operand, loaded once during
C initialization and used as the second operand of every fmuld.
102 define(`v000',`%f24') define(`v016',`%f26')
103 define(`v032',`%f28') define(`v048',`%f30')
104 define(`v064',`%f44') define(`v080',`%f46')
105 define(`v096',`%f48') define(`v112',`%f50')
C u00/u32: first operand of every fmuld.  NOTE(review): no visible instruction
C writes these registers; presumably they are converted from the u00_hi/u00_lo
C and u32_hi/u32_lo pairs defined below -- confirm.
107 define(`u00',`%f32') define(`u32', `%f34')
C a*: accumulators written by faddd (a000..a048 are also written directly by
C fmuld when the pipeline is started).
109 define(`a000',`%f36') define(`a016',`%f38')
110 define(`a032',`%f40') define(`a048',`%f42')
111 define(`a064',`%f60') define(`a080',`%f62')
C Single-precision halves of the %f2 and %f4 register pairs.  The *_hi halves
C are loaded once during initialization (commented there as being zeroed); the
C *_lo halves receive the low/high 32 bits of up[i].
113 define(`u00_hi',`%f2') define(`u32_hi',`%f4')
114 define(`u00_lo',`%f3') define(`u32_lo',`%f5')
C Integer aliases.
C rlimb shares %g3 with the scratch declaration above.
117 define(`rlimb',`%g3')
C i00/i16: 64-bit values read back from the stack scratch area with ldx.
118 define(`i00',`%l0') define(`i16',`%l1')
C r00/r32: low and high 32 bits of rp[i], read with lduw.
119 define(`r00',`%l2') define(`r32',`%l3')
C Masks produced by shifting %g4 right by 32 (xffffffff) and by 48 (xffff).
120 define(`xffffffff',`%l7')
121 define(`xffff',`%o0')
C mpn_addmul_2: per the file header, multiply an n-limb number by a 2-limb
C number and add the result to an n-limb vector.
C
C Registers as used by the instructions shown here:
C   %i0  pointer into rp; biased by -8 before the pipeline starts and advanced
C        by 8 per limb
C   %i1  pointer into up; advanced by 8 per limb
C   %i2  limb counter; decremented per limb and tested with brnz
C   %i3  pointer to the 2-limb v operand (vp[0] at +0, vp[1] at +8)
C   %g4  shifted right to form the masks, and added into cy
C   [%sp+2223+0 .. +56]  stack scratch slots used to move 64-bit values
C        between the integer and floating-point register files
C NOTE(review): 2223 is presumably the SPARC V9 stack bias (2047) plus 176
C bytes of save/argument area -- confirm against the ABI.
C NOTE(review): `cy' is used below but its define is not among the aliases
C visible in this listing -- confirm where it is defined.
C NOTE(review): the embedded source line numbers skip, and nothing shown here
C writes u00/u32, out016, %g2 or %g4, defines .L_2_or_more/.L_3_or_more, or
C branches back to .Loop.  Instructions therefore appear to be missing from
C this listing; the comments below describe only what is shown.
124 PROLOGUE(mpn_addmul_2)
126 C Initialization. (1) Split v operand into eight 16-bit chunks and store them
127 C as IEEE double in fp registers. (2) Clear upper 32 bits of fp register pairs
128 C f2 and f4. (3) Store masks in registers aliased to `xffff' and `xffffffff'.
129 C This code could be better scheduled.
C NOTE(review): `srlx %g4, 32, xffffffff' appears twice (here and after the
C stx sequence below); presumably the two copies belong to alternative
C initialization variants whose selecting conditional is not visible.
136 srlx %g4, 32, xffffffff C store mask in register `xffffffff'
C Load the v pieces at 2-byte steps from vp through the alternate address
C space selected by %asi.  Offsets +6,+4,+2,+0 cover vp[0] and +14,+12,+10,+8
C cover vp[1], least significant piece first (big-endian limb layout).
C NOTE(review): the %asi setup is not visible here.
137 ldda [%i3+6] %asi, v000
138 ldda [%i3+4] %asi, v016
139 ldda [%i3+2] %asi, v032
140 ldda [%i3+0] %asi, v048
142 ldda [%i3+14] %asi, v064
144 ldda [%i3+12] %asi, v080
146 ldda [%i3+10] %asi, v096
148 ldda [%i3+8] %asi, v112
C Integer-side variant: fetch both v limbs, then park eight 64-bit values in
C the stack scratch slots, alternating between %g2 and %g3 as the source.
C NOTE(review): the instructions that compute %g2/%g3 between these stores are
C not visible; presumably each value is one 16-bit piece of v isolated with
C `xffff' -- confirm.
157 ldx [%i3+0], %l0 C vp[0]
158 srlx %g4, 48, xffff C store mask in register `xffff'
159 ldx [%i3+8], %l1 C vp[1]
162 stx %g2, [%sp+2223+0]
165 stx %g3, [%sp+2223+8]
168 stx %g2, [%sp+2223+16]
170 stx %g3, [%sp+2223+24]
172 stx %g2, [%sp+2223+32]
175 stx %g3, [%sp+2223+40]
178 stx %g2, [%sp+2223+48]
180 stx %g3, [%sp+2223+56]
182 srlx %g4, 32, xffffffff C store mask in register `xffffffff'
C Reload the eight scratch slots into the v000..v112 fp registers.
184 ldd [%sp+2223+0], v000
185 ldd [%sp+2223+8], v016
186 ldd [%sp+2223+16], v032
187 ldd [%sp+2223+24], v048
189 ldd [%sp+2223+32], v064
191 ldd [%sp+2223+40], v080
193 ldd [%sp+2223+48], v096
195 ldd [%sp+2223+56], v112
C Both loads read the first 32-bit word of scratch slot 0 into the upper
C halves of the %f2 and %f4 pairs; the existing comments rely on that word
C being zero.
197 ld [%sp+2223+0], u00_hi C zero u00_hi
199 ld [%sp+2223+0], u32_hi C zero u32_hi
203 C Initialization done.
C Bias rp by -8.  The first-half rp loads below use a +8 displacement and rp
C is advanced by 8 before the second-half loads, which use +0.
207 add %i0, -8, %i0 C BOOKKEEPING
209 C Start software pipeline.
C First limb: read both 32-bit halves of up[0] and start the eight products of
C u00 with every v piece.  The four lowest products go straight into the
C accumulators a000..a048; the others go to p064, p080, p096a and p112a.
211 ld [%i1+4], u00_lo C read low 32 bits of up[i]
214 ld [%i1+0], u32_lo C read high 32 bits of up[i]
215 fmuld u00, v000, a000
216 fmuld u00, v016, a016
217 fmuld u00, v032, a032
218 fmuld u00, v048, a048
219 add %i2, -1, %i2 C BOOKKEEPING
220 fmuld u00, v064, p064
221 add %i1, 8, %i1 C BOOKKEEPING
223 fmuld u00, v080, p080
224 fmuld u00, v096, p096a
C Branch away if limbs remain (predicted taken).  The fmuld after the branch
C sits in its delay slot and executes on both paths.
225 brnz,pt %i2, .L_2_or_more
226 fmuld u00, v112, p112a
C Fall-through path: the count reached zero after the first limb.  Convert the
C lowest accumulator to integer format, form the eight u32 products, fold them
C into the accumulators with faddd, and spill out000/out016 to scratch.
228 .L1: fdtox a000, out000
229 fmuld u32, v000, p000
231 fmuld u32, v016, p016
233 fmuld u32, v032, p032
235 fmuld u32, v048, p048
236 std out000, [%sp+2223+16]
237 faddd p000, a032, a000
238 fmuld u32, v064, p064
239 std out016, [%sp+2223+24]
241 faddd p016, a048, a016
242 fmuld u32, v080, p080
243 faddd p032, a064, a032
244 fmuld u32, v096, p096b
245 faddd p048, a080, a048
246 fmuld u32, v112, p112b
250 faddd p064, p096a, a064
251 faddd p080, p112a, a080
252 std out000, [%sp+2223+0]
254 std out016, [%sp+2223+8]
C NOTE(review): presumably the .L_2_or_more path starts here; its label is not
C visible in this listing.  It fetches the next up limb while finishing the
C u32 products of the previous one, then starts the u00 products of the new
C limb.
257 ld [%i1+4], u00_lo C read low 32 bits of up[i]
259 fmuld u32, v000, p000
261 fmuld u32, v016, p016
263 fmuld u32, v032, p032
265 fmuld u32, v048, p048
266 std out000, [%sp+2223+16]
267 faddd p000, a032, a000
268 fmuld u32, v064, p064
269 std out016, [%sp+2223+24]
271 faddd p016, a048, a016
272 fmuld u32, v080, p080
273 faddd p032, a064, a032
274 fmuld u32, v096, p096b
275 faddd p048, a080, a048
276 fmuld u32, v112, p112b
278 ld [%i1+0], u32_lo C read high 32 bits of up[i]
280 fmuld u00, v000, p000
282 fmuld u00, v016, p016
283 faddd p064, p096a, a064
284 fmuld u00, v032, p032
285 faddd p080, p112a, a080
286 fmuld u00, v048, p048
287 add %i2, -1, %i2 C BOOKKEEPING
288 std out000, [%sp+2223+0]
289 faddd p000, a032, a000
290 fmuld u00, v064, p064
291 add %i1, 8, %i1 C BOOKKEEPING
292 std out016, [%sp+2223+8]
294 faddd p016, a048, a016
295 fmuld u00, v080, p080
296 faddd p032, a064, a032
297 fmuld u00, v096, p096a
298 faddd p048, a080, a048
C Branch away if limbs remain; the fmuld below is the delay-slot instruction.
299 brnz,pt %i2, .L_3_or_more
300 fmuld u00, v112, p112a
309 C . |_______i00__| 50
310 C |_______i16__| . 50
C Steady-state body, in two halves.
C First half: load the low 32 bits of the next up limb, multiply u32 by all
C eight v pieces, read the low half of rp[i] and the values parked in scratch
C slots +16/+24, then overwrite those slots with the new out000/out016.
C Every faddd adds a fresh product to the accumulator named 32 bits higher
C (a000 = p000 + a032, ..., a048 = p048 + a080), while a064/a080 are rebuilt
C from the p064/p096 and p080/p112 products.
316 .Loop: ld [%i1+4], u00_lo C read low 32 bits of up[i]
317 and %g2, xffffffff, %g2
319 fmuld u32, v000, p000
321 lduw [%i0+4+8], r00 C read low 32 bits of rp[i]
324 fmuld u32, v016, p016
327 ldx [%sp+2223+16], i00
328 faddd p064, p096b, a064
329 fmuld u32, v032, p032
331 add %g4, cy, cy C new cy
332 ldx [%sp+2223+24], i16
333 faddd p080, p112b, a080
334 fmuld u32, v048, p048
337 std out000, [%sp+2223+16]
338 faddd p000, a032, a000
339 fmuld u32, v064, p064
342 add %i0, 8, %i0 C BOOKKEEPING
343 std out016, [%sp+2223+24]
348 faddd p016, a048, a016
349 fmuld u32, v080, p080
353 faddd p032, a064, a032
354 fmuld u32, v096, p096b
358 faddd p048, a080, a048
359 fmuld u32, v112, p112b
C Second half: same pattern with u00, the high half of rp[i], and scratch
C slots +0/+8; the limb counter and the up pointer are updated here.
361 ld [%i1+0], u32_lo C read high 32 bits of up[i]
362 and %g2, xffffffff, %g2
364 fmuld u00, v000, p000
366 lduw [%i0+0], r32 C read high 32 bits of rp[i]
369 fmuld u00, v016, p016
372 ldx [%sp+2223+0], i00
373 faddd p064, p096a, a064
374 fmuld u00, v032, p032
376 add %g4, cy, cy C new cy
377 ldx [%sp+2223+8], i16
378 faddd p080, p112a, a080
379 fmuld u00, v048, p048
381 add %i2, -1, %i2 C BOOKKEEPING
382 std out000, [%sp+2223+0]
383 faddd p000, a032, a000
384 fmuld u00, v064, p064
387 add %i1, 8, %i1 C BOOKKEEPING
388 std out016, [%sp+2223+8]
393 faddd p016, a048, a016
394 fmuld u00, v080, p080
398 faddd p032, a064, a032
399 fmuld u00, v096, p096a
402 faddd p048, a080, a048
404 fmuld u00, v112, p112a
C Pipeline drain, stage 1: no further up limbs are loaded.  The u32 products
C of the last limb are still formed and folded in; the second half issues no
C multiplies and only performs the remaining faddd, loads and spills.
408 .Lend: and %g2, xffffffff, %g2
410 fmuld u32, v000, p000
411 lduw [%i0+4+8], r00 C read low 32 bits of rp[i]
414 fmuld u32, v016, p016
416 ldx [%sp+2223+16], i00
417 faddd p064, p096b, a064
418 fmuld u32, v032, p032
419 add %g4, cy, cy C new cy
420 ldx [%sp+2223+24], i16
421 faddd p080, p112b, a080
422 fmuld u32, v048, p048
423 std out000, [%sp+2223+16]
424 faddd p000, a032, a000
425 fmuld u32, v064, p064
427 add %i0, 8, %i0 C BOOKKEEPING
428 std out016, [%sp+2223+24]
431 faddd p016, a048, a016
432 fmuld u32, v080, p080
435 faddd p032, a064, a032
436 fmuld u32, v096, p096b
438 faddd p048, a080, a048
439 fmuld u32, v112, p112b
441 and %g2, xffffffff, %g2
443 lduw [%i0+0], r32 C read high 32 bits of rp[i]
447 ldx [%sp+2223+0], i00
448 faddd p064, p096a, a064
449 add %g4, cy, cy C new cy
450 ldx [%sp+2223+8], i16
451 faddd p080, p112a, a080
452 std out000, [%sp+2223+0]
454 std out016, [%sp+2223+8]
C Pipeline drain, stage 2: no floating-point arithmetic is shown from here on.
C The rp halves and the parked scratch values are still read, and the last
C out000/out016 values are spilled.
462 .L_wd2: and %g2, xffffffff, %g2
464 lduw [%i0+4+8], r00 C read low 32 bits of rp[i]
468 ldx [%sp+2223+16], i00
469 add %g4, cy, cy C new cy
470 ldx [%sp+2223+24], i16
471 std out000, [%sp+2223+16]
473 add %i0, 8, %i0 C BOOKKEEPING
474 std out016, [%sp+2223+24]
481 and %g2, xffffffff, %g2
483 lduw [%i0+0], r32 C read high 32 bits of rp[i]
487 ldx [%sp+2223+0], i00
488 add %g4, cy, cy C new cy
489 ldx [%sp+2223+8], i16
490 std out000, [%sp+2223+0]
492 std out016, [%sp+2223+8]
C Pipeline drain, stage 3: no rp loads are shown any more.  The first two
C groups read the scratch value at +16 and then +0 into rlimb rather than i00;
C the final group reads slot +16 into i00 again.
500 .L_wd3: and %g2, xffffffff, %g2
505 ldx [%sp+2223+16], rlimb
506 add %g4, cy, cy C new cy
507 ldx [%sp+2223+24], i16
508 std out000, [%sp+2223+16]
509 add %i0, 8, %i0 C BOOKKEEPING
510 std out016, [%sp+2223+24]
517 and %g2, xffffffff, %g2
520 ldx [%sp+2223+0], rlimb
521 add %g4, cy, cy C new cy
522 ldx [%sp+2223+8], i16
529 and %g2, xffffffff, %g2
532 ldx [%sp+2223+16], i00
533 add %g4, cy, cy C new cy
534 ldx [%sp+2223+24], i16
540 EPILOGUE(mpn_addmul_2)