1 dnl SPARC v9 32-bit mpn_submul_1 -- Multiply a limb vector with a limb and
2 dnl subtract the result from a second limb vector.
4 dnl Copyright 1998, 2000, 2001, 2003 Free Software Foundation, Inc.
6 dnl This file is part of the GNU MP Library.
8 dnl The GNU MP Library is free software; you can redistribute it and/or modify
9 dnl it under the terms of the GNU Lesser General Public License as published
10 dnl by the Free Software Foundation; either version 3 of the License, or (at
11 dnl your option) any later version.
13 dnl The GNU MP Library is distributed in the hope that it will be useful, but
14 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
16 dnl License for more details.
18 dnl You should have received a copy of the GNU Lesser General Public License
19 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
21 include(`../config.m4')
23 C Algorithm: We use two floating-point multiplies per limb product, with the
24 C invariant v operand split into two 16-bit pieces, and the u operand split
25 C into 32-bit pieces. We convert the two 48-bit products and transfer them to
32 C Possible optimizations:
33 C 1. Combine 32-bit memory operations into 64-bit operations. Since we're
34 C memory bandwidth limited, this could save 1.5 cycles/limb.
35 C 2. Unroll the inner loop. Since we already use alternate temporary areas,
36 C it is very straightforward to unroll, using an exit branch midway.
37 C Unrolling would allow deeper scheduling which could improve speed for L2
39 C 3. For mpn_mul_1: Use more alternating temp areas. The std'es and ldx'es
40 C aren't scheduled sufficiently far apart with just two temp areas.
41 C 4. Specialize for particular v values. If its upper 16 bits are zero, we
42 C could save many operations.
C mpn_submul_1 -- per the file header, multiply a limb vector with a limb and
C subtract the result from a second limb vector.
C
C Register roles, as used by the instructions below:
C   %o0   rp; 32-bit limbs are read from it with lduw
C   %o1   up; 32-bit limbs are loaded from it into %f11
C   %o5   pointer into a stack scratch area aligned to 0 (mod 32); xor with
C         16 alternates between its two 16-byte halves
C   %g2   p16 and %g1 p0, the two partial products read back with ldx
C   %g4   p, the combined product p0 + (p16 << 16), then rp[i] - p - cy
C   %g3   cy, the borrow carried from one limb to the next
C   %g5   rp[i]
C
C Borrow handling: after the 64-bit subtract, srlx by 32 leaves the upper half
C of the difference in %g3.  The next step negates it (sub %g0, %g3, %g3) and
C clears its upper 32 bits (srl by 0) before adding it into the next product.
53 PROLOGUE(mpn_submul_1)
55 sethi %hi(0xffff), %g1 C upper bits of the constant 0xffff
57 or %g1, %lo(0xffff), %g1 C %g1 = 0xffff
65 ld [%sp+104], %f10 C zero f10 (assumes [%sp+104] holds 0 -- TODO confirm)
69 define(`fanop', `fitod %f18, %f0') C A quasi nop running in the FA pipe
71 add %sp, 160, %o5 C point in scratch area
72 and %o5, -32, %o5 C align at 0 (mod 32) in scratch area
75 ld [%o1], %f11 C read up[i]
76 add %o1, 4, %o1 C up++
77 bne,pt %icc, .L_two_or_more
86 ldx [%o5+16], %g2 C p16
87 ldx [%o5+24], %g1 C p0
88 lduw [%o0], %g5 C read rp[i]
95 ld [%o1], %f11 C read up[i]
98 add %o1, 4, %o1 C up++
99 bne,pt %icc, .L_three_or_more
112 lduw [%o0], %g5 C read rp[i]
113 ldx [%o5+16], %g2 C p16
114 ldx [%o5+24], %g1 C p0
121 ld [%o1], %f11 C read up[i]
128 add %o1, 4, %o1 C up++
129 bne,pt %icc, .L_four_or_more
139 ldx [%o5+16], %g2 C p16
141 ldx [%o5+24], %g1 C p0
144 lduw [%o0], %g5 C read rp[i]
151 ld [%o1], %f11 C read up[i]
158 add %o1, 4, %o1 C up++
159 bne,pt %icc, .L_five_or_more
163 ldx [%o5+16], %g2 C p16
165 ldx [%o5+24], %g1 C p0
170 add %o1, 4, %o1 C up++
171 lduw [%o0], %g5 C read rp[i]
178 ld [%o1], %f11 C read up[i]
180 ldx [%o5+16], %g2 C p16
182 ldx [%o5+24], %g1 C p0
187 add %o1, 4, %o1 C up++
188 lduw [%o0], %g5 C read rp[i]
C Main loop: one limb per iteration.
196 .Loop: sub %g0, %g3, %g3 C cy = -cy
198 ld [%o1], %f11 C read up[i]
201 sllx %g2, 16, %g4 C (p16 << 16)
202 add %o0, 4, %o0 C rp++
203 ldx [%o5+0], %g2 C p16
206 srl %g3, 0, %g3 C zero most significant 32 bits
207 add %g1, %g4, %g4 C p = p0 + (p16 << 16)
208 ldx [%o5+8], %g1 C p0
212 add %g3, %g4, %g4 C p += cy
217 sub %g5, %g4, %g4 C p = rp[i] - p
221 xor %o5, 16, %o5 C alternate scratch variables
222 add %o1, 4, %o1 C up++
226 srlx %g4, 32, %g3 C new cy
227 lduw [%o0], %g5 C read rp[i]
C Wind-down stages .L5 to .L1.  No more up[] limbs are loaded and %o0 is not
C advanced in the lines below; rp is read at increasing offsets instead.
232 .L5: sub %g0, %g3, %g3 C cy = -cy
234 sllx %g2, 16, %g4 C (p16 << 16)
235 ldx [%o5+0], %g2 C p16
237 srl %g3, 0, %g3 C zero most significant 32 bits
238 add %g1, %g4, %g4 C p = p0 + (p16 << 16)
239 ldx [%o5+8], %g1 C p0
240 add %g4, %g3, %g4 C p += cy
243 sub %g5, %g4, %g4 C p = rp[i] - p
248 srlx %g4, 32, %g3 C new cy
249 lduw [%o0+4], %g5 C read rp[i]
252 .L4: fdtox %f16, %f14 C convert double in f16 to 64-bit integer in f14
253 sllx %g2, 16, %g4 C (p16 << 16)
254 ldx [%o5+0], %g2 C p16
256 srl %g3, 0, %g3 C zero most significant 32 bits
257 add %g1, %g4, %g4 C p = p0 + (p16 << 16)
258 ldx [%o5+8], %g1 C p0
259 add %g3, %g4, %g4 C p += cy
261 sub %g5, %g4, %g4 C p = rp[i] - p
265 srlx %g4, 32, %g3 C new cy
266 lduw [%o0+8], %g5 C read rp[i]
269 .L3: sllx %g2, 16, %g4 C (p16 << 16)
270 ldx [%o5+0], %g2 C p16
271 srl %g3, 0, %g3 C zero most significant 32 bits
272 add %g1, %g4, %g4 C p = p0 + (p16 << 16)
273 ldx [%o5+8], %g1 C p0
274 add %g3, %g4, %g4 C p += cy
275 sub %g5, %g4, %g4 C p = rp[i] - p
278 srlx %g4, 32, %g3 C new cy
279 lduw [%o0+12], %g5 C read rp[i]
282 .L2: sllx %g2, 16, %g4 C (p16 << 16)
283 ldx [%o5+0], %g2 C p16
284 srl %g3, 0, %g3 C zero most significant 32 bits
285 add %g1, %g4, %g4 C p = p0 + (p16 << 16)
286 ldx [%o5+8], %g1 C p0
287 add %g3, %g4, %g4 C p += cy
288 sub %g5, %g4, %g4 C p = rp[i] - p
290 srlx %g4, 32, %g3 C new cy
291 lduw [%o0+16], %g5 C read rp[i]
C Final limb: nothing further is loaded from the scratch area or from rp.
294 .L1: sllx %g2, 16, %g4 C (p16 << 16)
295 srl %g3, 0, %g3 C zero most significant 32 bits
296 add %g1, %g4, %g4 C p = p0 + (p16 << 16)
297 add %g3, %g4, %g4 C p += cy
298 sub %g5, %g4, %g4 C p = rp[i] - p
300 srlx %g4, 32, %g3 C new cy
305 EPILOGUE(mpn_submul_1)