1 dnl Alpha ev6 nails mpn_addmul_1.
3 dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or
8 dnl modify it under the terms of the GNU Lesser General Public License as
9 dnl published by the Free Software Foundation; either version 3 of the
10 dnl License, or (at your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful,
13 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 dnl Lesser General Public License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 include(`../config.m4')
28 C * Reroll loop for 3.75 c/l with current 4-way unrolling.
29 C * The loop is overscheduled wrt loads and wrt multiplies, in particular
31 C * Use an FP loop count and multiple exit points; that would simplify feed-in lp0
32 C and would work since the loop structure is really regular.
dnl numb_mask names register r6. At function entry it is loaded with -1(r31)
dnl and then shifted right by NAIL_BITS; the code ANDs it with acc0/acc1 to
dnl produce the value in r28 that is stored through rp.
40 define(`numb_mask',`r6')
dnl Short local aliases for GMP_NAIL_BITS and GMP_NUMB_BITS (both defined
dnl outside this view). NAIL_BITS is the shift count applied to vl0 (sll) and
dnl to the mulq results m0a..m3a (srl); NUMB_BITS is the srl count applied to
dnl acc0/acc1 to form t1.
67 define(`NAIL_BITS',`GMP_NAIL_BITS')
68 define(`NUMB_BITS',`GMP_NUMB_BITS')
70 dnl This declaration is munged by configure
74 PROLOGUE(mpn_addmul_1)
75 sll vl0, NAIL_BITS, vl0
76 lda numb_mask, -1(r31)
77 srl numb_mask, NAIL_BITS, numb_mask
86 L(3m4): ldq ul3, 0(up)
101 srl m3a,NAIL_BITS, t0
105 srl m0a,NAIL_BITS, t0
107 srl acc1,NUMB_BITS, t1
110 L(ge3): ldq ul2, 0(up)
114 srl m3a,NAIL_BITS, t0
122 srl m0a,NAIL_BITS, t0
126 srl acc1,NUMB_BITS, t1
144 srl m2a,NAIL_BITS, t0
150 srl m3a,NAIL_BITS, t0
152 srl acc0,NUMB_BITS, t1
155 L(ge4): ldq rl2, 0(rp)
156 srl m2a,NAIL_BITS, t0
163 srl m3a,NAIL_BITS, t0
168 srl acc0,NUMB_BITS, t1
183 srl m0a,NAIL_BITS, t0
187 srl m1a,NAIL_BITS, t0
189 srl acc0,NUMB_BITS, t1
192 L(ge2): ldq ul2, 0(up)
200 srl m0a,NAIL_BITS, t0
207 srl m1a,NAIL_BITS, t0
213 srl acc0,NUMB_BITS, t1
227 srl m1a,NAIL_BITS, t0
229 and acc1,numb_mask, r28
230 srl acc1,NUMB_BITS, t1
235 L(ge1): ldq ul2, 0(up)
246 srl m1a,NAIL_BITS, t0
255 srl m2a,NAIL_BITS, t0
258 srl acc1,NUMB_BITS, t1
261 L(ge5): ldq ul2, 0(up)
265 L(top): mulq vl0, ul0, m0a C U1
266 addq t0, m0b, acc1 C L0
267 srl acc0,NUMB_BITS, t1 C U0
268 stq r28, -24(rp) C L1
270 L(el2): umulh vl0, ul0, m0b C U1
271 and acc0,numb_mask, r28 C L0
272 addq rl1, acc1, acc1 C U0
276 addq t1, acc1, acc1 C L0
277 srl m2a,NAIL_BITS, t0 C U0
280 mulq vl0, ul1, m1a C U1
281 addq t0, m1b, acc0 C L0
282 srl acc1,NUMB_BITS, t1 C U0
283 stq r28, -16(rp) C L1
285 L(el1): umulh vl0, ul1, m1b C U1
286 and acc1,numb_mask, r28 C L0
287 addq rl2, acc0, acc0 C U0
291 addq t1, acc0, acc0 C L0
292 srl m3a,NAIL_BITS, t0 C U0
295 mulq vl0, ul2, m2a C U1
296 addq t0, m2b, acc1 C L0
297 srl acc0,NUMB_BITS, t1 C U0
300 L(el0): umulh vl0, ul2, m2b C U1
301 and acc0,numb_mask, r28 C L0
302 addq rl3, acc1, acc1 C U0
306 addq t1, acc1, acc1 C L0
307 srl m0a,NAIL_BITS, t0 C U0
310 mulq vl0, ul3, m3a C U1
311 addq t0, m3b, acc0 C L0
312 srl acc1,NUMB_BITS, t1 C U0
315 L(el3): umulh vl0, ul3, m3b C U1
316 and acc1,numb_mask, r28 C L0
317 addq rl0, acc0, acc0 C U0
321 addq t1, acc0, acc0 C L0
322 srl m1a,NAIL_BITS, t0 C U0
330 L(end): mulq vl0, ul0, m0a
332 srl acc0,NUMB_BITS, t1
334 L(ta6): umulh vl0, ul0, m0b
335 and acc0,numb_mask, r28
339 srl m2a,NAIL_BITS, t0
342 srl acc1,NUMB_BITS, t1
344 L(ta5): umulh vl0, ul1, m1b
345 and acc1,numb_mask, r28
349 srl m3a,NAIL_BITS, t0
351 srl acc0,NUMB_BITS, t1
355 L(ta4): and acc0,numb_mask, r28
359 srl m0a,NAIL_BITS, t0
361 srl acc1,NUMB_BITS, t1
365 L(ta3): and acc1,numb_mask, r28
369 srl m1a,NAIL_BITS, t0
371 srl acc0,NUMB_BITS, t1
375 L(ta2): and acc0,numb_mask, r28
378 srl acc1,NUMB_BITS, t1
380 and acc1,numb_mask, r28