1 dnl SPARC v9 32-bit mpn_mul_1 -- Multiply a limb vector with a limb and store
2 dnl the result in a second limb vector.
4 dnl Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc.
6 dnl This file is part of the GNU MP Library.
8 dnl The GNU MP Library is free software; you can redistribute it and/or modify
9 dnl it under the terms of the GNU Lesser General Public License as published
10 dnl by the Free Software Foundation; either version 3 of the License, or (at
11 dnl your option) any later version.
13 dnl The GNU MP Library is distributed in the hope that it will be useful, but
14 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16 dnl License for more details.
18 dnl You should have received a copy of the GNU Lesser General Public License
19 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
21 include(`../config.m4')
23 C Algorithm: We use two floating-point multiplies per limb product, with the
24 C invariant v operand split into two 16-bit pieces, and the u operand split
25 C into 32-bit pieces. We convert the two 48-bit products and transfer them to
C the integer unit through a scratch area in memory.
32 C Possible optimizations:
33 C 1. Combine 32-bit memory operations into 64-bit operations. Since we're
34 C memory bandwidth limited, this could save 1.5 cycles/limb.
35 C 2. Unroll the inner loop. Since we already use alternate temporary areas,
36 C it is very straightforward to unroll, using an exit branch midway.
37 C Unrolling would allow deeper scheduling which could improve speed for the L2
C cache case.
39 C 3. For mpn_mul_1: Use more alternating temp areas. The std and ldx
40 C instructions aren't scheduled far enough apart with just two temp areas.
41 C 4. Specialize for particular v values. If its upper 16 bits are zero, we
42 C could save many operations.
C Visible part of the mpn_mul_1 body: constant setup, scratch pointer setup,
C the per-size lead-in fragments and one step of the main loop. Instructions
C that sit between the lines below in the full file are not shown here, so the
C statement order is left exactly as found.
C Register roles, taken from the existing line comments: o0 = rp, o1 = up,
C o5 = scratch pointer, g2 = p16, g1 = p0, g4 = p, g3 = cy.
55 sethi %hi(0xffff), %g1 C g1 = upper 22 bits of the constant 0xffff
57 or %g1, %lo(0xffff), %g1 C g1 = 0xffff, a 16-bit mask (its use is not visible here)
65 ld [%sp+104], %f10 C zero f10; f10 is the upper half of the pair f10:f11 that receives up[i]
69 define(`fanop', `fitod %f18, %f0') C A quasi nop running in the FA pipe
71 add %sp, 160, %o5 C point in scratch area
72 and %o5, -32, %o5 C align at 0 (mod 32) in scratch area
C First lead-in fragment. The instruction that sets icc for the branch is not
C part of this excerpt.
75 ld [%o1], %f11 C read up[i]
76 add %o1, 4, %o1 C up++
77 bne,pt %icc, .L_two_or_more C taken, and predicted taken, on icc not-equal; target name suggests n >= 2
C From here on g1 holds p0, so the 0xffff constant built above is overwritten.
86 ldx [%o5+16], %g2 C p16
87 ldx [%o5+24], %g1 C p0
C Second lead-in fragment, same shape as the first.
94 ld [%o1], %f11 C read up[i]
97 add %o1, 4, %o1 C up++
98 bne,pt %icc, .L_three_or_more C taken, and predicted taken, on icc not-equal; target name suggests n >= 3
111 ldx [%o5+16], %g2 C p16
112 ldx [%o5+24], %g1 C p0
C Third lead-in fragment.
119 ld [%o1], %f11 C read up[i]
126 add %o1, 4, %o1 C up++
127 bne,pt %icc, .L_four_or_more C taken, and predicted taken, on icc not-equal; target name suggests n >= 4
137 ldx [%o5+16], %g2 C p16
139 ldx [%o5+24], %g1 C p0
C Fourth lead-in fragment.
148 ld [%o1], %f11 C read up[i]
155 add %o1, 4, %o1 C up++
156 bne,pt %icc, .L_five_or_more C taken, and predicted taken, on icc not-equal; target name suggests n >= 5
160 ldx [%o5+16], %g2 C p16
162 ldx [%o5+24], %g1 C p0
C Presumably the lead-in for five or more limbs; its label is not visible here.
167 add %o1, 4, %o1 C up++
174 ld [%o1], %f11 C read up[i]
176 ldx [%o5+16], %g2 C p16
178 ldx [%o5+24], %g1 C p0
C Presumably the steady-state loop step; the loop label and the loop branch are
C not part of this excerpt. It fetches the next up[i], combines the p16 and p0
C loaded earlier with the carry, reloads the next pair and flips the scratch
C pointer to the other half.
183 add %o1, 4, %o1 C up++
193 ld [%o1], %f11 C read up[i]
196 sllx %g2, 16, %g4 C (p16 << 16)
197 add %o0, 4, %o0 C rp++
198 ldx [%o5+0], %g2 C p16
202 add %g1, %g4, %g4 C p = p0 + (p16 << 16)
203 ldx [%o5+8], %g1 C p0
207 add %g3, %g4, %g4 C p += cy
211 srlx %g4, 32, %g3 C new cy = bits 63..32 of p
212 add %o1, 4, %o1 C up++
216 xor %o5, 16, %o5 C alternate scratch variables: flip bit 4 of o5 to switch 16-byte halves
C .L5: presumably part of the wind-down sequence. Forms p = p0 + (p16 << 16) + cy
C from the pair already in g1/g2, reloads the next pair from the current
C scratch half and derives the new carry.
222 .L5: fdtox %f16, %f14 C convert the double in f16 to a 64-bit integer in f14
223 sllx %g2, 16, %g4 C (p16 << 16)
224 ldx [%o5+0], %g2 C next p16
226 add %g1, %g4, %g4 C p = p0 + (p16 << 16)
227 ldx [%o5+8], %g1 C next p0
228 add %g4, %g3, %g4 C p += cy
235 srlx %g4, 32, %g3 C new cy = bits 63..32 of p
C .L4: presumably part of the wind-down sequence. Forms p = p0 + (p16 << 16) + cy
C from the pair already in g1/g2, reloads the next pair from the current
C scratch half and derives the new carry.
237 .L4: fdtox %f16, %f14 C convert the double in f16 to a 64-bit integer in f14
238 sllx %g2, 16, %g4 C (p16 << 16)
239 ldx [%o5+0], %g2 C next p16
241 add %g1, %g4, %g4 C p = p0 + (p16 << 16)
242 ldx [%o5+8], %g1 C next p0
243 add %g3, %g4, %g4 C p += cy
248 srlx %g4, 32, %g3 C new cy = bits 63..32 of p
C .L3: presumably part of the wind-down sequence. Same integer steps as the
C stages labelled .L5 and .L4, but no floating-point conversion is visible here.
250 .L3: sllx %g2, 16, %g4 C (p16 << 16)
251 ldx [%o5+0], %g2 C next p16
252 add %g1, %g4, %g4 C p = p0 + (p16 << 16)
253 ldx [%o5+8], %g1 C next p0
254 add %g3, %g4, %g4 C p += cy
257 srlx %g4, 32, %g3 C new cy = bits 63..32 of p
C .L2: presumably part of the wind-down sequence. Forms p = p0 + (p16 << 16) + cy
C from the pair already in g1/g2 and reloads one more pair from the current
C scratch half.
259 .L2: sllx %g2, 16, %g4 C (p16 << 16)
260 ldx [%o5+0], %g2 C next p16
261 add %g1, %g4, %g4 C p = p0 + (p16 << 16)
262 ldx [%o5+8], %g1 C next p0
263 add %g3, %g4, %g4 C p += cy
265 srlx %g4, 32, %g3 C new cy = bits 63..32 of p
C .L1: combines the p16/p0 pair already in g2/g1 with the carry. Unlike the
C stages labelled .L5 to .L2, no further pair is loaded here.
267 .L1: sllx %g2, 16, %g4 C (p16 << 16)
268 add %g1, %g4, %g4 C p = p0 + (p16 << 16)
269 add %g3, %g4, %g4 C p += cy
271 srlx %g4, 32, %g3 C new cy = bits 63..32 of p