1 dnl Alpha ev6 mpn_addmul_1 and mpn_submul_1.
3 dnl Copyright 2000, 2003, 2004, 2005, 2008 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of the GNU Lesser General Public License as published
9 dnl by the Free Software Foundation; either version 3 of the License, or (at
10 dnl your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful, but
13 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15 dnl License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 include(`../config.m4')
33 dnl This code was written in cooperation with ev6 pipeline expert Steve Root.
35 dnl The stores can issue a cycle late so we have paired no-ops to 'catch'
36 dnl them, so that further disturbance to the schedule is damped.
38 dnl We couldn't pair the loads, because the entangled schedule of the carries
39 dnl has to happen on one side {0} of the machine.
41 dnl This is a great schedule for the d_cache, a poor schedule for the b_cache.
42 dnl The lockup on U0 means that any stall can't be recovered from. Consider a
43 dnl ldq in L1, say that load gets stalled because it collides with a fill from
44 dnl the b_cache. On the next cycle, this load gets priority. It first looks
45 dnl at L0, and goes there. The instruction we intended for L0 gets to look at
46 dnl L1, which is NOT where we want it. It either stalls 1, because it can't
47 dnl go in L0, or goes there, and causes a further instruction to stall.
49 dnl So for b_cache, we're likely going to want to put one or more cycles back
50 dnl into the code! And, of course, put in lds prefetch for the rp[] operand.
51 dnl At a place where we have an mt followed by a bookkeeping, put the
52 dnl bookkeeping in upper, and the prefetch into lower.
54 dnl Note, the ldq's and stq's are at the end of the quadpacks. Note, we'd
55 dnl like not to have an ldq or an stq to precede a conditional branch in a
56 dnl quadpack. The conditional branch moves the retire pointer one cycle
dnl Variant selection for mpn_addmul_1, active when OPERATION_addmul_1 is
dnl defined.
dnl   ADDSUB  the accumulate instruction, here addq.
dnl   CMPCY   carry detect.  CMPCY(a,b) expands to cmpult b,a and the call
dnl           site appends the destination register.  Callers pass the first
dnl           ADDSUB operand as a and the ADDSUB result as b, so the
dnl           destination becomes 1 exactly when the unsigned sum wrapped
dnl           around, i.e. when result < operand.
dnl   func    the entry point name for this variant (presumably consumed by
dnl           the function prologue, which is outside this view).
59 ifdef(`OPERATION_addmul_1',`
60 define(`ADDSUB', `addq')
61 define(`CMPCY', `cmpult $2,$1')
62 define(`func', `mpn_addmul_1')
dnl Variant selection for mpn_submul_1, active when OPERATION_submul_1 is
dnl defined.
dnl   ADDSUB  the accumulate instruction, here subq.
dnl   CMPCY   borrow detect.  CMPCY(a,b) expands to cmpult a,b and the call
dnl           site appends the destination register.  Callers pass the first
dnl           ADDSUB operand (the minuend) as a and the ADDSUB result as b,
dnl           so the destination becomes 1 exactly when the unsigned
dnl           subtraction wrapped around, i.e. when minuend < result.
dnl   func    the entry point name for this variant (presumably consumed by
dnl           the function prologue, which is outside this view).
64 ifdef(`OPERATION_submul_1',`
65 define(`ADDSUB', `subq')
66 define(`CMPCY', `cmpult $1,$2')
67 define(`func', `mpn_submul_1')
70 MULFUNC_PROLOGUE(mpn_addmul_1 mpn_submul_1)
80 $1mod8: ldq r5, 0(rp) C
84 CMPCY( r5, r23), r20 C
90 $L1: lda r8, 0(r31) C zero carry reg
91 lda r24, 0(r31) C zero carry reg
105 $7mod8: ldq r5, 0(rp) C
110 CMPCY( r5, r23), r20 C
115 $6mod8: ldq r1, 8(up) C
123 lda up, 48(up) C L1 bookkeeping
126 lda rp, -32(rp) C L1 bookkeeping
128 ADDSUB r4, r25, r25 C lo + acc
132 $ent1: lda up, 8(up) C
136 $0mod8: ldq r1, 8(up) C
147 ADDSUB r4, r2, r2 C lo + acc
152 $3mod8: ldq r5, 0(rp) C
157 CMPCY( r5, r23), r20 C
162 $2mod8: ldq r1, 8(up) C
171 lda up, 16(up) C L1 bookkeeping
174 lda rp, 0(rp) C L1 bookkeeping
176 ADDSUB r4, r25, r25 C lo + acc
180 $5mod8: ldq r5, 0(rp) C
185 CMPCY( r5, r23), r20 C
190 $4mod8: ldq r1, 8(up) C
198 lda up, 32(up) C L1 bookkeeping
201 lda rp, 16(rp) C L1 bookkeeping
203 ADDSUB r4, r2, r2 C lo + acc
205 CMPCY( r4, r2), r20 C L0 lo add => carry
206 ADDSUB r2, r8, r22 C U0 hi add => answer
210 bis r31, r31, r31 C U1 mt
211 CMPCY( r2, r22), r21 C L0 hi add => carry
212 addq r6, r20, r6 C U0 hi mul + carry
215 bis r31, r31, r31 C U1 mt
216 ADDSUB r5, r7, r7 C L0 lo + acc
217 addq r6, r21, r6 C U0 hi mul + carry
220 umulh v0, r1, r8 C U1
221 CMPCY( r5, r7), r20 C L0 lo add => carry
222 ADDSUB r7, r6, r23 C U0 hi add => answer
226 CMPCY( r7, r23), r21 C L0 hi add => carry
227 addq r24, r20, r24 C U0 hi mul + carry
230 umulh v0, r0, r6 C U1
231 ADDSUB r4, r25, r25 C U0 lo + acc
232 stq r22, -16(rp) C L0
235 bis r31, r31, r31 C L0 st slosh
237 bis r31, r31, r31 C L1 st slosh
238 addq r24, r21, r24 C U0 hi mul + carry
240 CMPCY( r4, r25), r20 C L0 lo add => carry
241 bis r31, r31, r31 C U1 mt
242 lda r18, -8(r18) C L1 bookkeeping
243 ADDSUB r25, r24, r22 C U0 hi add => answer
245 bis r31, r31, r31 C U1 mt
246 CMPCY( r25, r22), r21 C L0 hi add => carry
247 addq r3, r20, r3 C U0 hi mul + carry
250 bis r31, r31, r31 C U1 mt
251 ADDSUB r5, r28, r28 C L0 lo + acc
252 addq r3, r21, r3 C U0 hi mul + carry
255 umulh v0, r1, r24 C U1
256 CMPCY( r5, r28), r20 C L0 lo add => carry
257 ADDSUB r28, r3, r23 C U0 hi add => answer
260 mulq v0, r0, r25 C U1
261 CMPCY( r28, r23), r21 C L0 hi add => carry
262 addq r8, r20, r8 C U0 hi mul + carry
265 umulh v0, r0, r3 C U1
266 ADDSUB r4, r2, r2 C U0 lo + acc
270 bis r31, r31, r31 C L0 st slosh
271 mulq v0, r1, r28 C U1
272 bis r31, r31, r31 C L1 st slosh
273 addq r8, r21, r8 C U0 hi mul + carry
275 CMPCY( r4, r2), r20 C L0 lo add => carry
276 bis r31, r31, r31 C U1 mt
277 lda up, 64(up) C L1 bookkeeping
278 ADDSUB r2, r8, r22 C U0 hi add => answer
280 bis r31, r31, r31 C U1 mt
281 CMPCY( r2, r22), r21 C L0 hi add => carry
282 addq r6, r20, r6 C U0 hi mul + carry
285 bis r31, r31, r31 C U1 mt
286 ADDSUB r5, r7, r7 C L0 lo + acc
287 addq r6, r21, r6 C U0 hi mul + carry
290 umulh v0, r1, r8 C U1
291 CMPCY( r5, r7), r20 C L0 lo add => carry
292 ADDSUB r7, r6, r23 C U0 hi add => answer
296 CMPCY( r7, r23), r21 C L0 hi add => carry
297 addq r24, r20, r24 C U0 hi mul + carry
300 umulh v0, r0, r6 C U1
301 ADDSUB r4, r25, r25 C U0 lo + acc
305 bis r31, r31, r31 C L0 st slosh
307 bis r31, r31, r31 C L1 st slosh
308 addq r24, r21, r24 C U0 hi mul + carry
310 CMPCY( r4, r25), r20 C L0 lo add => carry
311 bis r31, r31, r31 C U1 mt
312 lda rp, 64(rp) C L1 bookkeeping
313 ADDSUB r25, r24, r22 C U0 hi add => answer
315 bis r31, r31, r31 C U1 mt
316 CMPCY( r25, r22), r21 C L0 hi add => carry
317 addq r3, r20, r3 C U0 hi mul + carry
320 bis r31, r31, r31 C U1 mt
321 ADDSUB r5, r28, r28 C L0 lo + acc
322 addq r3, r21, r3 C U0 hi mul + carry
325 umulh v0, r1, r24 C U1
326 CMPCY( r5, r28), r20 C L0 lo add => carry
327 ADDSUB r28, r3, r23 C U0 hi add => answer
330 mulq v0, r0, r25 C U1
331 CMPCY( r28, r23), r21 C L0 hi add => carry
332 addq r8, r20, r8 C U0 hi mul + carry
335 umulh v0, r0, r3 C U1
336 ADDSUB r4, r2, r2 C U0 lo + acc
337 stq r22, -32(rp) C L0
338 stq r23, -24(rp) C L1
340 bis r31, r31, r31 C L0 st slosh
341 mulq v0, r1, r28 C U1
342 bis r31, r31, r31 C L1 st slosh
343 addq r8, r21, r8 C U0 hi mul + carry
345 CMPCY( r4, r2), r20 C L0 lo add => carry
346 ADDSUB r2, r8, r22 C U0 hi add => answer
347 ldl r31, 256(up) C prefetch up[]
348 bgt r18, $Loop C U1 bookkeeping
350 $Lend: CMPCY( r2, r22), r21 C
356 CMPCY( r5, r7), r20 C
358 CMPCY(r7, r23), r21 C
361 ADDSUB r4, r25, r25 C
368 $n23: ldq r4, 0(rp) C
371 ADDSUB r4, r25, r25 C
372 L(x): CMPCY( r4, r25), r20 C
373 ADDSUB r25, r24, r22 C
374 CMPCY( r25, r22), r21 C
376 ADDSUB r5, r28, r28 C
378 CMPCY( r5, r28), r20 C
379 ADDSUB r28, r3, r23 C
380 CMPCY( r28, r23), r21 C