1 dnl Alpha ev6 nails mpn_mul_1.
3 dnl Copyright 2002, 2005, 2006 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or
8 dnl modify it under the terms of the GNU Lesser General Public License as
9 dnl published by the Free Software Foundation; either version 3 of the
10 dnl License, or (at your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful,
13 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 dnl Lesser General Public License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 include(`../config.m4')
28 C * Reroll loop for 3.0 c/l with current 4-way unrolling.
29 C * The loop is overscheduled wrt loads and wrt multiplies, in particular
31 C * Use an FP loop count and multiple exit points; that would simplify feed-in lp0
32 C and would work since the loop structure is really regular.
C Symbolic names used by the code below.
C  - numb_mask is an alias for register r6.
C  - NAIL_BITS and NUMB_BITS are short aliases for GMP_NAIL_BITS and
C    GMP_NUMB_BITS, whose values are defined outside this view.
C NOTE(review): the other register aliases used below (vl0, up, rp, ulN,
C mNa, mNb, accN, t0, t1) are not defined in the visible lines.
40 define(`numb_mask',`r6')
67 define(`NAIL_BITS',`GMP_NAIL_BITS')
68 define(`NUMB_BITS',`GMP_NUMB_BITS')
70 dnl This declaration is munged by configure
C Setup, as far as visible here:
C  - vl0 is shifted left by NAIL_BITS in place, so the mulq/umulh
C    instructions below all multiply by the pre-shifted value.
C  - numb_mask is built as all-ones (r31 reads as zero on Alpha, so the
C    lda yields -1) shifted right by NAIL_BITS: NAIL_BITS zero bits on top,
C    ones below.
C NOTE(review): presumably NUMB_BITS + NAIL_BITS equals the 64-bit register
C width, in which case the pre-shift makes umulh deliver the high limb
C directly and leaves the low limb in the mulq result shifted up by
C NAIL_BITS -- confirm against the GMP_NUMB_BITS and GMP_NAIL_BITS values.
75 sll vl0, NAIL_BITS, vl0
76 lda numb_mask, -1(r31)
77 srl numb_mask, NAIL_BITS, numb_mask
C Feed-in code ahead of L(top): labels L(3m4), L(ge3), L(ge4), L(ge2),
C L(ge1) and L(ge5).  Only some instructions of each path are visible in
C this view; the branches that select a path and most loads, multiplies,
C adds and stores are not shown.
C NOTE(review): the label names suggest a dispatch on n mod 4 and on size
C thresholds -- confirm against the branch code.
C
C The visible instructions are of three kinds, the same ones used in the
C main loop:
C   srl mXa, NAIL_BITS, t0     shift a mulq result right by NAIL_BITS
C   srl accX, NUMB_BITS, t1    take the bits of an accumulator above
C                              NUMB_BITS (added to the next accumulator in
C                              the main loop; presumably the same here)
C   and accX, numb_mask, r28   keep only the bits selected by numb_mask
C Each path starts by loading a source limb from 0(up).
86 L(3m4): ldq ul3, 0(up)
100 srl m3a,NAIL_BITS, t0
102 srl m0a,NAIL_BITS, t0
104 srl acc1,NUMB_BITS, t1
107 L(ge3): ldq ul2, 0(up)
110 srl m3a,NAIL_BITS, t0
116 srl m0a,NAIL_BITS, t0
120 srl acc1,NUMB_BITS, t1
137 srl m2a,NAIL_BITS, t0
141 srl m3a,NAIL_BITS, t0
143 srl acc0,NUMB_BITS, t1
146 L(ge4): srl m2a,NAIL_BITS, t0
151 srl m3a,NAIL_BITS, t0
156 srl acc0,NUMB_BITS, t1
170 srl m0a,NAIL_BITS, t0
172 srl m1a,NAIL_BITS, t0
174 srl acc0,NUMB_BITS, t1
177 L(ge2): ldq ul2, 0(up)
184 srl m0a,NAIL_BITS, t0
189 srl m1a,NAIL_BITS, t0
195 srl acc0,NUMB_BITS, t1
208 srl m1a,NAIL_BITS, t0
210 and acc1,numb_mask, r28
211 srl acc1,NUMB_BITS, t1
216 L(ge1): ldq ul2, 0(up)
226 srl m1a,NAIL_BITS, t0
233 srl m2a,NAIL_BITS, t0
236 srl acc1,NUMB_BITS, t1
239 L(ge5): ldq ul2, 0(up)
C Main loop, 4-way unrolled over ul0..ul3.  For each source limb ulN the
C product with the pre-shifted vl0 is formed as a pair: mNa from mulq (low
C 64 bits) and mNb from umulh (high 64 bits).
C
C Per result limb the visible steps are:
C   srl  mXa, NAIL_BITS, t0     t0 = mulq result shifted right by NAIL_BITS
C   addq t0, mYb, acc           acc = t0 + umulh result of another product
C   srl  accP, NUMB_BITS, t1    t1 = bits of the previous accumulator
C                               above NUMB_BITS
C   addq t1, acc, acc           fold those bits into this accumulator
C   and  accP, numb_mask, r28   r28 = previous accumulator masked
C   stq  r28, ofs(rp)           store r28 to the destination
C acc0 and acc1 alternate between the roles of acc and accP.
C
C Statement order matters: each addq t0, mNb, acc sits before the umulh
C that overwrites the same mNb, so it consumes the value from the earlier
C multiply.
C
C L(el2), L(el1), L(el0) and L(el3) are extra labels inside the loop body;
C presumably entry points used by the feed-in code, but the branches are
C not visible in this view.  Loads, pointer updates, the loop branch and
C two of the four stores are likewise not visible here.
C NOTE(review): the trailing U0/U1/L0/L1 tags look like EV6 issue-slot
C assignments for each instruction -- confirm.
243 L(top): mulq vl0, ul0, m0a C U1
244 addq t0, m0b, acc1 C L0
245 srl acc0,NUMB_BITS, t1 C U0
246 stq r28, -24(rp) C L1
248 L(el2): umulh vl0, ul0, m0b C U1
249 and acc0,numb_mask, r28 C L0
254 addq t1, acc1, acc1 C L0
255 srl m2a,NAIL_BITS, t0 C U0
258 mulq vl0, ul1, m1a C U1
259 addq t0, m1b, acc0 C L0
260 srl acc1,NUMB_BITS, t1 C U0
261 stq r28, -16(rp) C L1
263 L(el1): umulh vl0, ul1, m1b C U1
264 and acc1,numb_mask, r28 C L0
269 addq t1, acc0, acc0 C L0
270 srl m3a,NAIL_BITS, t0 C U0
273 mulq vl0, ul2, m2a C U1
274 addq t0, m2b, acc1 C L0
275 srl acc0,NUMB_BITS, t1 C U0
278 L(el0): umulh vl0, ul2, m2b C U1
279 and acc0,numb_mask, r28 C L0
284 addq t1, acc1, acc1 C L0
285 srl m0a,NAIL_BITS, t0 C U0
288 mulq vl0, ul3, m3a C U1
289 addq t0, m3b, acc0 C L0
290 srl acc1,NUMB_BITS, t1 C U0
293 L(el3): umulh vl0, ul3, m3b C U1
294 and acc1,numb_mask, r28 C L0
299 addq t1, acc0, acc0 C L0
300 srl m1a,NAIL_BITS, t0 C U0
C Wind-down code after the main loop: labels L(end), L(ta6), L(ta5),
C L(ta4), L(ta3) and L(ta2).  It uses the same shift, mask and carry steps
C as the loop body:
C   srl mXa, NAIL_BITS, t0     shift a mulq result right by NAIL_BITS
C   srl accX, NUMB_BITS, t1    bits of an accumulator above NUMB_BITS
C   and accX, numb_mask, r28   masked accumulator
C In the visible lines the last multiplies are the mulq at L(end) and the
C umulh instructions at L(ta6) and L(ta5); from L(ta4) on only shift and
C mask steps appear.
C NOTE(review): the adds, stores, the branches that target the L(taN)
C labels and the function return are not visible in this view; presumably
C the L(taN) labels are exits for the shorter feed-in paths -- confirm.
308 L(end): mulq vl0, ul0, m0a
310 srl acc0,NUMB_BITS, t1
312 L(ta6): umulh vl0, ul0, m0b
313 and acc0,numb_mask, r28
315 srl m2a,NAIL_BITS, t0
318 srl acc1,NUMB_BITS, t1
320 L(ta5): umulh vl0, ul1, m1b
321 and acc1,numb_mask, r28
323 srl m3a,NAIL_BITS, t0
325 srl acc0,NUMB_BITS, t1
328 L(ta4): and acc0,numb_mask, r28
330 srl m0a,NAIL_BITS, t0
332 srl acc1,NUMB_BITS, t1
336 L(ta3): and acc1,numb_mask, r28
338 srl m1a,NAIL_BITS, t0
340 srl acc0,NUMB_BITS, t1
344 L(ta2): and acc0,numb_mask, r28
346 srl acc1,NUMB_BITS, t1
348 and acc1,numb_mask, r28