1 dnl IA-64 mpn_add_n_sub_n -- mpn parallel addition and subtraction.
3 dnl Contributed to the GNU project by Torbjorn Granlund.
5 dnl Copyright 2010 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
12 dnl * the GNU Lesser General Public License as published by the Free
13 dnl Software Foundation; either version 3 of the License, or (at your
14 dnl option) any later version.
18 dnl * the GNU General Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any
22 dnl or both in parallel, as here.
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
33 include(`../config.m4')
46 C Some useful aliases for registers we use
C Four register sets, one register per limb slot 0..3 of the unrolled code:
C   u0-u3 = r16-r19   source limbs loaded through up0/up1
C   v0-v3 = r20-r23   source limbs loaded through vp0/vp1
C   s0-s3 = r24-r27   sums u+v, stored through sp
C   d0-d3 = r28-r31   differences u-v, stored through dp
C NOTE(review): the defines for sp, dp, up, vp, n, up0, up1, vp0 and vp1 are
C not in the visible lines.
47 define(`u0',`r16') define(`u1',`r17') define(`u2',`r18') define(`u3',`r19')
48 define(`v0',`r20') define(`v1',`r21') define(`v2',`r22') define(`v3',`r23')
49 define(`s0',`r24') define(`s1',`r25') define(`s2',`r26') define(`s3',`r27')
50 define(`d0',`r28') define(`d1',`r29') define(`d2',`r30') define(`d3',`r31')
C Shorthands for the two compare forms used in carry and borrow handling:
C   cmpltu   unsigned less-than, used to detect carry out of an add or
C            borrow out of a sub
C   cmpeqor  equality compare OR-ed into the target predicate, used to
C            propagate an incoming carry or borrow through a limb
56 define(`cmpltu', `cmp.ltu')
57 define(`cmpeqor', `cmp.eq.or')
C mpn_add_n_sub_n: compute the limb-wise sum and the limb-wise difference of
C two operands in a single pass.
C As used below: sums are stored through sp, differences through dp, source
C limbs are read through up and vp, and n is compared against the limb count.
C NOTE(review): sp is presumably an m4 alias for an argument register rather
C than the hardware stack pointer; the define is not visible -- confirm.
C The trailing M I / M01 / M23 / I0 / B column on each line appears to record
C the issue unit type(s) for that instruction.
60 PROLOGUE(mpn_add_n_sub_n)
C Convert the four pointer arguments with addp4 (32-bit pointer to 64-bit
C address form).
C NOTE(review): presumably inside an ABI32-only conditional that is not in the
C visible lines -- confirm.
65 addp4 sp = 0, sp C M I
66 addp4 dp = 0, dp C M I
68 addp4 up = 0, up C M I
69 addp4 vp = 0, vp C M I
C up1/vp1 start one limb (8 bytes) past up0/vp0, so that each pointer of a
C pair can step by 16 bytes and fetch alternating limbs.
76 add up1 = 8, up0 C M I
77 add vp1 = 8, vp0 C M I
C r10 (and r11 below) start 256 bytes past up and vp; they are the addresses
C used by the lfetch prefetches in the main loop.
79 add r10 = 256, up C M I
C Entry-path selection on r9: p10 for r9 = 0, p11 for r9 = 2, p12 for r9 = 3.
C NOTE(review): r9 is presumably n mod 4 and the predicates presumably guard
C branches to L(b0), L(b2) and L(b3); neither the r9 computation nor those
C branches are in the visible lines.
82 cmp.eq p10, p0 = 0, r9 C M I
83 cmp.eq p11, p0 = 2, r9 C M I
84 cmp.eq p12, p0 = 3, r9 C M I
85 add r11 = 256, vp C M I
C Entry path that peels a single limb into slot 3 (presumably taken when
C n mod 4 = 1 -- the dispatch branches are not visible).
92 L(b1): ld8 u3 = [up0], 8 C M01
C up0 moved on by one limb above, so move up1 by one limb as well to keep it
C 8 bytes ahead.
93 add up1 = 8, up1 C M I
C p14 = (4 < n) unsigned, p15 = its complement.
94 cmpltu p14, p15 = 4, n C M I
95 ld8 v3 = [vp0], 8 C M01
96 add vp1 = 8, vp1 C M I
C NOTE(review): the add and sub that produce s3 and d3 are not in the visible
C lines.
C A sum that wrapped around is smaller than either addend, so s3 < v3 signals
C a carry out; u3 < v3 signals that the subtraction borrowed.
102 cmpltu p9, p0 = s3, v3 C carry from add3 M I
103 cmpltu p13, p0 = u3, v3 C borrow from sub3 M I
C Store the peeled sum and difference limbs and advance both destinations.
105 st8 [sp] = s3, 8 C M23
106 st8 [dp] = d3, 8 C M23
C Entry path with no peeled limb. cmp.ne r0, r0 is always false, so these two
C compares clear p9 and p13: no carry and no borrow come into slot 0.
109 L(b0): cmp.ne p9, p0 = r0, r0 C M I
110 cmp.ne p13, p0 = r0, r0 C M I
C Preload a full group of four limbs from each operand. Every pointer steps
C 16 bytes, so up0/vp0 fetch slots 0 and 2 while up1/vp1 fetch slots 1 and 3.
111 L(c0): ld8 u0 = [up0], 16 C M01
112 ld8 u1 = [up1], 16 C M01
114 ld8 v0 = [vp0], 16 C M01
115 ld8 v1 = [vp1], 16 C M01
117 ld8 u2 = [up0], 16 C M01
118 ld8 u3 = [up1], 16 C M01
120 ld8 v2 = [vp0], 16 C M01
121 ld8 v3 = [vp1], 16 C M01
C Raw sums and differences for slots 0 and 1. The incoming carry (p9) and
C borrow (p13) are applied later, in the loop body.
123 add s0 = u0, v0 C M I
124 add s1 = u1, v1 C M I
125 sub d0 = u0, v0 C M I
126 sub d1 = u1, v1 C M I
C Carry and borrow out of the raw results: p6/p7 for the adds, p10/p11 for
C the subs.
128 cmpltu p6, p0 = s0, v0 C carry from add0 M I
129 cmpltu p7, p0 = s1, v1 C carry from add1 M I
130 cmpltu p10, p0 = u0, v0 C borrow from sub0 M I
131 cmpltu p11, p0 = u1, v1 C borrow from sub1 M I
C Counted-loop branch on ar.lc into the main loop.
C NOTE(review): the fall-through path and the L(top) label are not in the
C visible lines.
134 br.cloop.dptk L(top) C B
C Entry path that peels a single limb into slot 1 (presumably reached through
C p12, i.e. r9 = 3 -- the dispatch branch is not visible).
137 L(b3): ld8 u1 = [up0], 8 C M01
C Keep up1/vp1 one limb ahead of up0/vp0 after the 8-byte step.
138 add up1 = 8, up1 C M I
139 ld8 v1 = [vp0], 8 C M01
141 add vp1 = 8, vp1 C M I
C Sum and difference of the peeled limb.
142 add s1 = u1, v1 C M I
143 sub d1 = u1, v1 C M I
C p7 = carry out (sum wrapped below v1), p11 = borrow out (u1 below v1).
145 cmpltu p7, p0 = s1, v1 C carry from add1 M I
146 cmpltu p11, p0 = u1, v1 C borrow from sub1 M I
148 st8 [sp] = s1, 8 C M23
149 st8 [dp] = d1, 8 C M23
C NOTE(review): as shown, execution would continue into L(b2), which clears
C p7 and p11 again; presumably a branch to L(c2) follows these stores but is
C not in the visible lines -- confirm against the full file.
C Entry path starting at slot 2 with nothing peeled (presumably reached
C through p11, i.e. r9 = 2). cmp.ne r0, r0 is always false, so p7 and p11 are
C cleared: no carry and no borrow come into slot 2.
153 L(b2): cmp.ne p7, p0 = r0, r0 C M I
154 cmp.ne p11, p0 = r0, r0 C M I
C Load slots 2 and 3 of both operands.
156 L(c2): ld8 u2 = [up0], 16 C M01
157 ld8 u3 = [up1], 16 C M01
C p14 = (4 < n) unsigned.
158 cmpltu p14, p0 = 4, n C M I
160 ld8 v2 = [vp0], 16 C M01
161 ld8 v3 = [vp1], 16 C M01
C NOTE(review): presumably p14 guards a branch to L(gt4) here and the code
C below is the short-operand case; that branch is not in the visible lines.
C Raw sums and differences for slots 2 and 3.
164 add s2 = u2, v2 C M I
165 add s3 = u3, v3 C M I
166 sub d2 = u2, v2 C M I
167 sub d3 = u3, v3 C M I
C p8/p9 = carry out of the raw sums, p12/p13 = borrow out of the raw
C differences.
169 cmpltu p8, p0 = s2, v2 C carry from add2 M I
170 cmpltu p9, p0 = s3, v3 C carry from add3 M I
171 cmpltu p12, p0 = u2, v2 C borrow from sub2 M I
172 cmpltu p13, p0 = u3, v3 C borrow from sub3 M I
C Also preload slots 0 and 1 for the following group.
C NOTE(review): the visible text repeats the slot 2/3 sum, difference and
C compare sequence from L(c2); presumably only one copy runs on any given
C path -- confirm against the full file.
175 L(gt4): ld8 u0 = [up0], 16 C M01
176 ld8 u1 = [up1], 16 C M01
178 ld8 v0 = [vp0], 16 C M01
179 ld8 v1 = [vp1], 16 C M01
181 add s2 = u2, v2 C M I
182 add s3 = u3, v3 C M I
183 sub d2 = u2, v2 C M I
184 sub d3 = u3, v3 C M I
186 cmpltu p8, p0 = s2, v2 C carry from add2 M I
187 cmpltu p9, p0 = s3, v3 C carry from add3 M I
188 cmpltu p12, p0 = u2, v2 C borrow from sub2 M I
189 cmpltu p13, p0 = u3, v3 C borrow from sub3 M I
C Counted-loop branch on ar.lc into the second half of the main loop.
C NOTE(review): the L(mid) label is not in the visible lines.
190 br.cloop.dptk L(mid) C B
C Main loop body, four limbs per iteration, in two halves. Each half applies
C the pending carry and borrow to two limbs whose raw sum and difference are
C already in registers, stores them, computes the raw results for the other
C two slots, and loads source limbs for later use.
C NOTE(review): the L(top) and L(mid) labels targeted by the br.cloop
C instructions are not in the visible lines; presumably L(top) is here and
C L(mid) is at the second group of u loads below.
C
C Carry idiom: with a carry coming in, adding 1 wraps exactly when the raw
C sum is all ones (-1), so that case is OR-ed into the carry-out predicate
C before the increment. Borrow idiom: with a borrow coming in, subtracting 1
C wraps exactly when the raw difference is 0.
194 ld8 u0 = [up0], 16 C M01
195 ld8 u1 = [up1], 16 C M01
C Slot 0: apply carry p9 and borrow p13 from slot 3, updating p6 and p10.
196 (p9) cmpeqor p6, p0 = -1, s0 C M I
197 (p9) add s0 = 1, s0 C M I
198 (p13) cmpeqor p10, p0 = 0, d0 C M I
199 (p13) add d0 = -1, d0 C M I
201 ld8 v0 = [vp0], 16 C M01
202 ld8 v1 = [vp1], 16 C M01
C Slot 1: apply carry p6 and borrow p10 from slot 0, updating p7 and p11.
203 (p6) cmpeqor p7, p0 = -1, s1 C M I
204 (p6) add s1 = 1, s1 C M I
205 (p10) cmpeqor p11, p0 = 0, d1 C M I
206 (p10) add d1 = -1, d1 C M I
C Store finished slot 0, and form the raw results for slots 2 and 3.
208 st8 [sp] = s0, 8 C M23
209 st8 [dp] = d0, 8 C M23
210 add s2 = u2, v2 C M I
211 add s3 = u3, v3 C M I
212 sub d2 = u2, v2 C M I
213 sub d3 = u3, v3 C M I
C Store finished slot 1, and take carry/borrow out of the raw slot 2/3
C results.
215 st8 [sp] = s1, 8 C M23
216 st8 [dp] = d1, 8 C M23
217 cmpltu p8, p0 = s2, v2 C carry from add2 M I
218 cmpltu p9, p0 = s3, v3 C carry from add3 M I
219 cmpltu p12, p0 = u2, v2 C borrow from sub2 M I
220 cmpltu p13, p0 = u3, v3 C borrow from sub3 M I
C Second half: reload the u2/u3 registers now that s2/s3 and d2/d3 have been
C formed from their previous contents.
223 ld8 u2 = [up0], 16 C M01
224 ld8 u3 = [up1], 16 C M01
C Slot 2: apply carry p7 and borrow p11 from slot 1, updating p8 and p12.
225 (p7) cmpeqor p8, p0 = -1, s2 C M I
226 (p7) add s2 = 1, s2 C M I
227 (p11) cmpeqor p12, p0 = 0, d2 C M I
228 (p11) add d2 = -1, d2 C M I
230 ld8 v2 = [vp0], 16 C M01
231 ld8 v3 = [vp1], 16 C M01
C Slot 3: apply carry p8 and borrow p12 from slot 2, updating p9 and p13,
C which feed slot 0 of the next iteration.
232 (p8) cmpeqor p9, p0 = -1, s3 C M I
233 (p8) add s3 = 1, s3 C M I
234 (p12) cmpeqor p13, p0 = 0, d3 C M I
235 (p12) add d3 = -1, d3 C M I
C Store finished slot 2, and form the raw results for slots 0 and 1 from the
C limbs loaded in the first half.
237 st8 [sp] = s2, 8 C M23
238 st8 [dp] = d2, 8 C M23
239 add s0 = u0, v0 C M I
240 add s1 = u1, v1 C M I
241 sub d0 = u0, v0 C M I
242 sub d1 = u1, v1 C M I
C Store finished slot 3, and take carry/borrow out of the raw slot 0/1
C results.
244 st8 [sp] = s3, 8 C M23
245 st8 [dp] = d3, 8 C M23
246 cmpltu p6, p0 = s0, v0 C carry from add0 M I
247 cmpltu p7, p0 = s1, v1 C carry from add1 M I
248 cmpltu p10, p0 = u0, v0 C borrow from sub0 M I
249 cmpltu p11, p0 = u1, v1 C borrow from sub1 M I
C Prefetch ahead on both source operands. r10/r11 advance 32 bytes per
C iteration, matching the four 8-byte limbs consumed from each operand.
251 lfetch [r10], 32 C M?
252 lfetch [r11], 32 C M?
C Counted-loop branch on ar.lc back to the top of the loop.
253 br.cloop.dptk L(top) C B
C Loop wind-down: the same carry/borrow propagation and stores as the loop
C body, but with no further loads, draining the limbs already in registers.
C NOTE(review): any labels belonging to this part are not in the visible
C lines.
C Slot 0: apply carry p9 and borrow p13, updating p6 and p10.
259 (p9) cmpeqor p6, p0 = -1, s0 C M I
260 (p9) add s0 = 1, s0 C M I
261 (p13) cmpeqor p10, p0 = 0, d0 C M I
262 (p13) add d0 = -1, d0 C M I
C Slot 1: apply carry p6 and borrow p10, updating p7 and p11.
266 (p6) cmpeqor p7, p0 = -1, s1 C M I
267 (p6) add s1 = 1, s1 C M I
268 (p10) cmpeqor p11, p0 = 0, d1 C M I
269 (p10) add d1 = -1, d1 C M I
C Store slot 0 and form the raw results for the last two limbs.
271 st8 [sp] = s0, 8 C M23
272 st8 [dp] = d0, 8 C M23
273 add s2 = u2, v2 C M I
274 add s3 = u3, v3 C M I
275 sub d2 = u2, v2 C M I
276 sub d3 = u3, v3 C M I
278 st8 [sp] = s1, 8 C M23
279 st8 [dp] = d1, 8 C M23
280 cmpltu p8, p0 = s2, v2 C carry from add2 M I
281 cmpltu p9, p0 = s3, v3 C carry from add3 M I
282 cmpltu p12, p0 = u2, v2 C borrow from sub2 M I
283 cmpltu p13, p0 = u3, v3 C borrow from sub3 M I
C Slot 2: apply carry p7 and borrow p11, updating p8 and p12.
286 (p7) cmpeqor p8, p0 = -1, s2 C M I
287 (p7) add s2 = 1, s2 C M I
288 (p11) cmpeqor p12, p0 = 0, d2 C M I
289 (p11) add d2 = -1, d2 C M I
293 st8 [sp] = s2, 8 C M23
294 st8 [dp] = d2, 8 C M23
C Slot 3: apply carry p8 and borrow p12. p9 and p13 then hold the final
C carry and borrow out of the most significant limb.
C NOTE(review): no store of s3/d3 is visible after this adjustment;
C presumably it is in lines missing from this view.
295 (p8) cmpeqor p9, p0 = -1, s3 C M I
296 (p8) add s3 = 1, s3 C M I
297 (p12) cmpeqor p13, p0 = 0, d3 C M I
298 (p12) add d3 = -1, d3 C M I
C Build the result in r8: 2 when the final carry p9 is set, plus 1 when the
C final borrow p13 is set.
C NOTE(review): the instruction that gives r8 its value when p9 is clear is
C not in the visible lines; the result is presumably 2*carry + borrow.
301 (p9) mov r8 = 2 C M I
C Restore the loop count register from r2 (the matching save into r2 is not
C in the visible lines).
303 mov.i ar.lc = r2 C I0
304 (p13) add r8 = 1, r8 C M I
307 br.ret.sptk.many b0 C B