1 dnl SPARC v9 32-bit mpn_addmul_1 -- Multiply a limb vector with a limb and add
2 dnl the result to a second limb vector.
4 dnl Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc.
6 dnl This file is part of the GNU MP Library.
8 dnl The GNU MP Library is free software; you can redistribute it and/or modify
9 dnl it under the terms of the GNU Lesser General Public License as published
10 dnl by the Free Software Foundation; either version 3 of the License, or (at
11 dnl your option) any later version.
13 dnl The GNU MP Library is distributed in the hope that it will be useful, but
14 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16 dnl License for more details.
18 dnl You should have received a copy of the GNU Lesser General Public License
19 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
21 include(`../config.m4')
23 C Algorithm: We use two floating-point multiplies per limb product, with the
24 C invariant v operand split into two 16-bit pieces, and the u operand split
25 C into 32-bit pieces. We convert the two 48-bit products and transfer them to the integer unit.
32 C Possible optimizations:
33 C 1. Combine 32-bit memory operations into 64-bit operations. Since we're
34 C memory bandwidth limited, this could save 1.5 cycles/limb.
35 C 2. Unroll the inner loop. Since we already use alternate temporary areas,
36 C it is very straightforward to unroll, using an exit branch midway.
37 C Unrolling would allow deeper scheduling which could improve speed for the L2 cache case.
39 C 3. For mpn_mul_1: Use more alternating temp areas. The std and ldx
40 C instructions are not scheduled far enough apart with just two temp areas.
41 C 4. Specialize for particular v values. If the upper 16 bits of v are zero, we
42 C could save many operations.
C mpn_addmul_1 entry point.  Per the file header, the routine multiplies a
C limb vector by a single limb and adds the result to a second limb vector.
C
C Register roles, as established by the instruction comments below:
C   %o0   rp, pointer into the limb vector that is read and accumulated into
C   %o1   up, pointer into the source limb vector
C   %o5   stack-relative scratch pointer, rounded down to a multiple of 32
C   %g1   first the constant 0xffff, later p0 reloaded from the scratch area
C   %g2   p16 reloaded from the scratch area
C   %g3   cy, the running carry (bits 32 and up of the previous sum)
C   %g4   p, the sum p0 + (p16 << 16) + cy + rp[i]
C   %g5   rp[i], loaded zero-extended with lduw
C   %f11  the up[i] word just loaded
C
C NOTE(review): this excerpt shows only part of the routine.  The instructions
C that set the condition codes, perform the floating-point multiplies, write
C the scratch area, store results to rp and return are not visible here, so
C the notes below describe only what the visible lines do.
53 PROLOGUE(mpn_addmul_1)
C %g1 = 0xffff, assembled from its %hi and %lo parts.  Presumably the mask
C used to split v into 16-bit pieces as described in the algorithm note; the
C instructions that consume it are not visible in this excerpt.
55 sethi %hi(0xffff), %g1
57 or %g1, %lo(0xffff), %g1
65 ld [%sp+104], %f10 C zero f10
69 define(`fanop', `fitod %f18, %f0') C A quasi nop running in the FA pipe
C %o5 = (%sp + 160) with the low five bits cleared: base of the scratch area.
71 add %sp, 160, %o5 C point in scratch area
72 and %o5, -32, %o5 C align at 0 (mod 32) in scratch area
C Start-up stage for the first limb.  The branch goes to .L_two_or_more when
C the 32-bit integer condition codes say not-equal; the instruction that sets
C those codes is not visible in this excerpt.
75 ld [%o1], %f11 C read up[i]
76 add %o1, 4, %o1 C up++
77 bne,pt %icc, .L_two_or_more
C Reload the two partial products from scratch offsets 16 and 24 and fetch
C the current rp limb.  Presumably this is the path for a single limb.
86 ldx [%o5+16], %g2 C p16
87 ldx [%o5+24], %g1 C p0
88 lduw [%o0], %g5 C read rp[i]
C Start-up stage for the second limb: same load/advance/branch pattern.
95 ld [%o1], %f11 C read up[i]
98 add %o1, 4, %o1 C up++
99 bne,pt %icc, .L_three_or_more
112 lduw [%o0], %g5 C read rp[i]
113 ldx [%o5+16], %g2 C p16
114 ldx [%o5+24], %g1 C p0
C Start-up stage for the third limb.
121 ld [%o1], %f11 C read up[i]
128 add %o1, 4, %o1 C up++
129 bne,pt %icc, .L_four_or_more
139 ldx [%o5+16], %g2 C p16
141 ldx [%o5+24], %g1 C p0
144 lduw [%o0], %g5 C read rp[i]
C Start-up stage for the fourth limb.
151 ld [%o1], %f11 C read up[i]
158 add %o1, 4, %o1 C up++
159 bne,pt %icc, .L_five_or_more
163 ldx [%o5+16], %g2 C p16
165 ldx [%o5+24], %g1 C p0
170 add %o1, 4, %o1 C up++
171 lduw [%o0], %g5 C read rp[i]
C Start-up stage for the fifth limb; no conditional branch is visible here.
178 ld [%o1], %f11 C read up[i]
180 ldx [%o5+16], %g2 C p16
182 ldx [%o5+24], %g1 C p0
187 add %o1, 4, %o1 C up++
188 lduw [%o0], %g5 C read rp[i]
C Steady-state accumulate step.  NOTE(review): this appears to be the main
C loop body, but the loop label and back-branch are not visible in this
C excerpt.  Each pass forms p = p0 + (p16 << 16) + cy + rp[i] in %g4, takes
C the new carry as p >> 32, reloads p16 and p0 from scratch offsets 0 and 8,
C and advances both rp and up by one 4-byte limb.  Toggling bit 4 of %o5
C switches between the two 16-byte scratch slots on every pass.
198 ld [%o1], %f11 C read up[i]
201 sllx %g2, 16, %g4 C (p16 << 16)
202 add %o0, 4, %o0 C rp++
203 ldx [%o5+0], %g2 C p16
207 add %g1, %g4, %g4 C p = p0 + (p16 << 16)
208 ldx [%o5+8], %g1 C p0
212 add %g3, %g4, %g4 C p += cy
217 add %g5, %g4, %g4 C p += rp[i]
221 xor %o5, 16, %o5 C alternate scratch variables
222 add %o1, 4, %o1 C up++
226 srlx %g4, 32, %g3 C new cy
227 lduw [%o0], %g5 C read rp[i]
C Wind-down stages .L5 down to .L1.  Each stage repeats the accumulate step
C for one more limb.  In the visible lines rp is no longer advanced; the rp
C limbs are instead read at growing fixed offsets +4, +8, +12 and +16 from
C %o0.  Presumably the start-up paths above join this chain at the label
C matching the number of limbs still pending; those branches are not visible
C in this excerpt.
232 .L5: fdtox %f16, %f14
233 sllx %g2, 16, %g4 C (p16 << 16)
234 ldx [%o5+0], %g2 C p16
236 add %g1, %g4, %g4 C p = p0 + (p16 << 16)
237 ldx [%o5+8], %g1 C p0
238 add %g4, %g3, %g4 C p += cy
241 add %g5, %g4, %g4 C p += rp[i]
246 srlx %g4, 32, %g3 C new cy
247 lduw [%o0+4], %g5 C read rp[i]
249 .L4: fdtox %f16, %f14
250 sllx %g2, 16, %g4 C (p16 << 16)
251 ldx [%o5+0], %g2 C p16
253 add %g1, %g4, %g4 C p = p0 + (p16 << 16)
254 ldx [%o5+8], %g1 C p0
255 add %g3, %g4, %g4 C p += cy
257 add %g5, %g4, %g4 C p += rp[i]
261 srlx %g4, 32, %g3 C new cy
262 lduw [%o0+8], %g5 C read rp[i]
C From .L3 on, no floating-point conversion is visible at the stage entry.
264 .L3: sllx %g2, 16, %g4 C (p16 << 16)
265 ldx [%o5+0], %g2 C p16
266 add %g1, %g4, %g4 C p = p0 + (p16 << 16)
267 ldx [%o5+8], %g1 C p0
268 add %g3, %g4, %g4 C p += cy
269 add %g5, %g4, %g4 C p += rp[i]
272 srlx %g4, 32, %g3 C new cy
273 lduw [%o0+12], %g5 C read rp[i]
275 .L2: sllx %g2, 16, %g4 C (p16 << 16)
276 ldx [%o5+0], %g2 C p16
277 add %g1, %g4, %g4 C p = p0 + (p16 << 16)
278 ldx [%o5+8], %g1 C p0
279 add %g3, %g4, %g4 C p += cy
280 add %g5, %g4, %g4 C p += rp[i]
282 srlx %g4, 32, %g3 C new cy
283 lduw [%o0+16], %g5 C read rp[i]
C Last stage: no further products are reloaded.  The final carry is left in
C %g3; presumably it becomes the return value, but the return sequence is
C not visible in this excerpt.
285 .L1: sllx %g2, 16, %g4 C (p16 << 16)
286 add %g1, %g4, %g4 C p = p0 + (p16 << 16)
287 add %g3, %g4, %g4 C p += cy
288 add %g5, %g4, %g4 C p += rp[i]
290 srlx %g4, 32, %g3 C new cy
295 EPILOGUE(mpn_addmul_1)