1 dnl Alpha mpn_divexact_by3c -- mpn division by 3, expecting no remainder.
3 dnl Copyright 2004, 2005, 2009 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of the GNU Lesser General Public License as published
9 dnl by the Free Software Foundation; either version 3 of the License, or (at
10 dnl your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful, but
13 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15 dnl License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 include(`../config.m4')
25 C EV6: 6.3 Note that mpn_bdiv_dbm1c is faster
28 C * Remove the unops, they benefit just ev6, which no longer uses this file.
29 C * Try prefetch for destination, using lds.
30 C * Improve feed-in code, by moving initial mulq earlier; make initial load
31 C to u0/u0 to save some copying.
32 C * Combine u0 and u2, u1 and u3.
C Constant table.  The function entry code loads these three quads from byte
C offsets 0, 8 and 16 into the registers of the same name, so their order
C must not change.
C   0xAAAAAAAAAAAAAAAB = (2^65+1)/3, the inverse of 3 modulo 2^64
C                        (3 * 0xAAAAAAAAAAAAAAAB = 2^65+1 = 1 mod 2^64)
C   0x5555555555555555 = (2^64-1)/3
C   0xAAAAAAAAAAAAAAAA = 2*(2^64-1)/3
43 .quad 0xAAAAAAAAAAAAAAAB
44 .quad 0x5555555555555555
45 .quad 0xAAAAAAAAAAAAAAAA
C Register assignments.
C
C The three x... names hold the magic constants loaded at function entry;
C each register is named after the value it carries.  xAAAAAAAAAAAAAAAB is
C the multiplier of every mulq, the other two are used as masks and as
C unsigned comparison thresholds.
48 define(`xAAAAAAAAAAAAAAAB', `r20')
49 define(`x5555555555555555', `r21')
50 define(`xAAAAAAAAAAAAAAAA', `r22')
C u0-u3 are the multiplicands of the mulq by xAAAAAAAAAAAAAAAB.  Note that u0
C is r0, which the entry code also uses as the base address for the constant
C loads, so r0 serves both purposes.
51 define(`u0', `r0') define(`u1', `r1')
52 define(`u2', `r2') define(`u3', `r3')
C l0 is formed by and-ing a mask with x5555555555555555 (and, in the initial
C computation, replaced by xAAAAAAAAAAAAAAAA via cmovne).  x is the value
C that is compared against the two thresholds.
53 define(`l0', `r25') define(`x', `r8')
C q0/q1 receive the mulq products, alternating from one step to the next.
54 define(`q0', `r4') define(`q1', `r5')
C p6/p7 receive the unsigned comparison results
C   p6 = (x5555555555555555 < x),  p7 = (xAAAAAAAAAAAAAAAA < x).
55 define(`p6', `r6') define(`p7', `r7')
C t0/t1 are scratch values that get masked with x5555555555555555.
56 define(`t0', `r23') define(`t1', `r24')
C cymask is and-ed with x5555555555555555 to form l0.  It lives in r28, the
C same register the entry code uses for the early load of the first limb.
57 define(`cymask',`r28')
60 PROLOGUE(mpn_divexact_by3c,gp)
62 ldq r28, 0(up) C load first limb early
64 C Put magic constants in registers
66 ldq xAAAAAAAAAAAAAAAB, 0(r0)
67 ldq x5555555555555555, 8(r0)
68 ldq xAAAAAAAAAAAAAAAA, 16(r0)
70 C Compute initial l0 value
74 and p6, x5555555555555555, l0
75 cmovne p7, xAAAAAAAAAAAAAAAA, l0
77 C Feed-in depending on (n mod 4)
89 mulq r28, xAAAAAAAAAAAAAAAB, q0
96 mulq r28, xAAAAAAAAAAAAAAAB, q1
100 $Lb01: lda rp, -8(rp)
101 mulq r28, xAAAAAAAAAAAAAAAB, q0
109 mulq r28, xAAAAAAAAAAAAAAAB, q1
116 cmpult u3, cy, cy C L0
117 mulq u0, xAAAAAAAAAAAAAAAB, q0 C U1
124 cmpult x5555555555555555, x, p6 C U0
126 cmpult xAAAAAAAAAAAAAAAA, x, p7 C U1
132 and cymask, x5555555555555555, l0 C U1
134 and t0, x5555555555555555, t0
136 and t1, x5555555555555555, t1
147 cmpult u0, cy, cy C L0
148 mulq u1, xAAAAAAAAAAAAAAAB, q1 C U1
155 cmpult x5555555555555555, x, p6 C U0
157 cmpult xAAAAAAAAAAAAAAAA, x, p7 C U1
163 and cymask, x5555555555555555, l0 C U1
165 and t0, x5555555555555555, t0
167 and t1, x5555555555555555, t1
178 cmpult u1, cy, cy C L0
179 mulq u2, xAAAAAAAAAAAAAAAB, q0 C U1
186 cmpult x5555555555555555, x, p6 C U0
188 cmpult xAAAAAAAAAAAAAAAA, x, p7 C U1
194 and cymask, x5555555555555555, l0 C U1
196 and t0, x5555555555555555, t0
198 and t1, x5555555555555555, t1
209 cmpult u2, cy, cy C L0
210 mulq u3, xAAAAAAAAAAAAAAAB, q1 C U1
217 cmpult x5555555555555555, x, p6 C U0
219 cmpult xAAAAAAAAAAAAAAAA, x, p7 C U1
220 lda n, -4(n) C L1 bookkeeping
225 and cymask, x5555555555555555, l0 C U1
227 and t0, x5555555555555555, t0
229 and t1, x5555555555555555, t1
239 ldl r31, 256(up) C prefetch
243 C *** MAIN LOOP END ***
246 cmpult u3, cy, cy C L0
247 mulq u0, xAAAAAAAAAAAAAAAB, q0 C U1
254 cmpult x5555555555555555, x, p6 C U0
256 cmpult xAAAAAAAAAAAAAAAA, x, p7 C U1
262 and cymask, x5555555555555555, l0 C U1
264 and t0, x5555555555555555, t0
266 and t1, x5555555555555555, t1
276 cmpult u0, cy, cy C L0
278 cmpult x5555555555555555, x, p6 C U0
279 cmpult xAAAAAAAAAAAAAAAA, x, p7 C U1
288 C This is useful for playing with various schedules.
289 C Expand as: one(0)one(1)one(2)one(3)
292 cmpult `$'eval(($1+3)%4), cy, cy C L0
293 mulq `$'$1, xAAAAAAAAAAAAAAAB, `$'eval(4+$1%2) C U1
294 ldq `$'eval(($1+1)%4), eval($1*8+16)(up) C L1
295 addq `$'eval(4+($1+1)%2), l0, x C U0
300 cmpult x5555555555555555, x, p6 C U0
302 cmpult xAAAAAAAAAAAAAAAA, x, p7 C U1
308 and cymask, x5555555555555555, l0 C U1
310 and t0, x5555555555555555, t0
312 and t1, x5555555555555555, t1
319 stq x, eval($1*8)(rp) C L1