1 dnl IA-64 mpn_divrem_2 -- Divide an n-limb number by a 2-limb number.
3 dnl Copyright 2004, 2005 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of the GNU Lesser General Public License as published
9 dnl by the Free Software Foundation; either version 3 of the License, or (at
10 dnl your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful, but
13 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15 dnl License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 include(`../config.m4')
28 C * Further optimize the loop. We could probably do some more trickery with
29 C arithmetic in the FPU, or perhaps use a non-zero addend of xma in more places.
31 C * Software pipeline for perhaps 5 saved cycles, around the end and start of the loop.
33 C * Schedule code outside of loop better.
34 C * Update the comments. They are now using the same name for the same logical quantity.
36 C * Handle conditional zeroing of r31 in loop more cleanly.
37 C * Inline mpn_invert_limb and schedule its insns across the entire init code.
38 C * Ultimately, use 2-limb, or perhaps 3-limb or 4-limb inverse.
C Register aliases used by mpn_divrem_2.
C fminus1 (f10): FP register loaded from r9 with setf.sig during setup and
C used as the multiplier in the xma.l instructions whose comments read
C f8 = -f7 and f8 = -f7-1.  Presumably holds -1; the instruction that sets
C r9 is not visible in this part of the file, so confirm before relying on it.
47 define(`fminus1',`f10')
C R1, R0 (r38, r37): loaded with ld8 during setup with the dividend limbs that
C the code comments call n1 and n0.  R1 is compared against d1 at the top of
C the main loop.
55 define(`R1',`r38') define(`R0',`r37')
C P1, P0 (r28, r27): receive, through getf.sig, the high (xma.hu) and low
C (xma.l) 64 bits of the product fd0 * f8 computed in the main loop.
56 define(`P1',`r28') define(`P0',`r27')
60 C HP's assembler requires these declarations for importing mpn_invert_limb
61 .global mpn_invert_limb
62 .type mpn_invert_limb,@function
64 PROLOGUE(mpn_divrem_2)
70 ` addp4 qp = 0, qp C M I
71 addp4 np = 0, np C M I
72 addp4 dp = 0, dp C M I
78 alloc r42 = ar.pfs, 5,8,1,0 C M2
79 ld8 d0 = [dp], 8 C M0M1 d0
81 shladd np = nn, 3, np C M I
83 ld8 d1 = [dp] C M0M1 d1
85 add r15 = -8, np C M I
86 add np = -16, np C M I
89 ld8 R1 = [r15] C M0M1 n1
90 ld8 R0 = [r34], -8 C M0M1 n0
92 cmp.ltu p6, p0 = d1, R1 C M I
93 cmp.eq p8, p0 = d1, R1 C M I
95 (p8) cmp.leu p6, p0 = d0, R0
96 cmp.ltu p8, p9 = R0, d0
97 (p6) br.cond.dpnt .L_high_limb_1 C FIXME: inline!
101 br.call.sptk.many b0 = mpn_invert_limb C FIXME: inline+schedule
103 setf.sig fd1 = d1 C d1
104 setf.sig fd0 = d0 C d0
105 add r14 = r33, r35 C nn + qxn
107 setf.sig fdinv = r8 C dinv
111 setf.sig fminus1 = r9
112 cmp.gt p6, p0 = r0, r35
113 shladd qp = r35, 3, qp
116 (p6) br.cond.dpnt .Ldone
119 C *** MAIN LOOP START ***
121 mov r15 = R0 C nadj = n10
122 cmp.le p14, p15 = 0, R0 C check high bit of R0
123 cmp.le p8, p0 = r33, r35 C dividend limbs remaining?
125 .pred.rel "mutex", p14, p15
126 (p8) ld8 r31 = [r34], -8 C n0
127 (p15) add r15 = d1, R0 C nadj = n10 + d1
128 (p15) add r14 = 1, R1 C nh + (nl:63)
129 (p14) mov r14 = R1 C nh
130 cmp.eq p6, p0 = d1, R1 C nh == d1
131 (p6) br.cond.spnt .L_R1_eq_d1
133 setf.sig f8 = r14 C n2 + (nl:63)
134 setf.sig f15 = r15 C nadj
135 sub r23 = -1, R1 C r23 = ~nh
140 xma.hu f7 = fdinv, f8, f15 C xh = HI(dinv*(nh-nmask)+nadj)
142 xma.l f7 = f7, fminus1, fnh C nh + xh
145 xma.hu f9 = f7, fd1, fnl C xh = HI(q1*d1+nl)
146 xma.l f33 = f7, fd1, fnl C xh = LO(q1*d1+nl)
153 cmp.eq p6, p7 = r16, r24
155 .pred.rel "mutex", p6, p7
156 (p6) xma.l f8 = f7, fminus1, f0 C f8 = -f7
157 (p7) xma.l f8 = f7,fminus1,fminus1 C f8 = -f7-1
159 .pred.rel "mutex", p6, p7
160 (p6) sub r18 = 0, r14 C q = -q1
161 (p7) sub r18 = -1, r14 C q = -q1-1
162 (p6) add r14 = 0, r17 C n1 = xl
163 (p7) add r14 = d1, r17 C n1 = xl + d1
165 xma.hu f9 = fd0, f8, f0 C d0*(-f7-1) = -d0*f7-d0
166 xma.l f35 = fd0, f8, f0
168 getf.sig P1 = f9 C P1
170 getf.sig P0 = f35 C P0
173 cmp.ltu p8, p0 = r31, P0 C p8 = cy from low limb
174 cmp.ltu p6, p0 = r14, P1 C p6 = prel cy from high limb
178 (p8) cmp.eq.or p6, p0 = 0, R1 C p6 = final cy from high limb
180 cmp.ne p10, p0 = r0, r0 C clear p10 FIXME: use unc below!
181 cmp.ne p13, p0 = r0, r0 C clear p13 FIXME: use unc below!
185 (p6) add r18 = -1, r18 C q--
187 (p6) cmp.ltu p10, p0 = R0, d0
188 (p6) cmp.ltu p0, p13 = R1, d1
190 (p10) cmp.ne.and p0, p13 = -1, R1 C p13 = !cy
192 (p13) br.cond.spnt .L_two_too_big C jump if not cy
196 mov r31 = 0 C n0, next iteration
198 C *** MAIN LOOP END ***
213 .pred.rel "mutex", p8, p9
215 (p8) sub R1 = R1, d1, 1
225 add r18 = -1, r18 C q--
226 cmp.ltu p10, p0 = R0, d0
231 mov r31 = 0 C n0, next iteration
236 add r14 = R0, d1 C r = R0 + d1
237 mov r18 = -1 C q = -1
239 cmp.leu p6, p0 = R0, r14
240 (p6) br.cond.spnt .L20 C jump unless cy
245 cmp.ltu p8, p9 = R0, r31
247 .pred.rel "mutex", p8, p9
249 (p8) add R1 = r0, P1, 1 C R1 = n1 - P1 - cy
250 (p9) add R1 = r0, P1 C R1 = n1 - P1
252 mov r31 = 0 C n0, next iteration
256 .L20: cmp.ne p6, p7 = 0, d0
258 .pred.rel "mutex", p6, p7