1 dnl IA-64 mpn_rsh1add_n/mpn_rsh1sub_n -- rp[] = (up[] +- vp[]) >> 1.
3 dnl Copyright 2003, 2004, 2005 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of the GNU Lesser General Public License as published
9 dnl by the Free Software Foundation; either version 3 of the License, or (at
10 dnl your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful, but
13 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15 dnl License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 include(`../config.m4')
27 C * Rewrite function entry code using aorslsh1_n.asm style.
28 C * Micro-optimize feed-in and wind-down code.
36 ifdef(`OPERATION_rsh1add_n',`
41 define(func, mpn_rsh1add_n)
43 ifdef(`OPERATION_rsh1sub_n',`
48 define(func, mpn_rsh1sub_n)
51 C Some useful aliases for registers we use
C   u0-u3  limbs loaded from up (ld8 uN = [up], 8)
C   v0-v3  limbs loaded from vp (ld8 vN = [vp], 8)
C   w0-w3  per-limb ADDSUB results, bumped by INCR when a carry comes in
C   x0-x3  shrp/shr.u outputs, i.e. the limbs stored through rp
C Note: u0 is r14, which the function entry also uses as scratch for n mod 4.
52 define(`u0',`r14') define(`u1',`r15') define(`u2',`r16') define(`u3',`r17')
53 define(`v0',`r18') define(`v1',`r19') define(`v2',`r20') define(`v3',`r21')
54 define(`w0',`r22') define(`w1',`r23') define(`w2',`r24') define(`w3',`r25')
55 define(`x0',`r26') define(`x1',`r9') define(`x2',`r30') define(`x3',`r31')
57 MULFUNC_PROLOGUE(mpn_rsh1add_n mpn_rsh1sub_n)
C Function entry: widen the pointer arguments, preload the first limb of each
C source operand, and dispatch on n mod 4 to one of four feed-in sequences.
C NOTE(review): the PROLOGUE line, whatever conditional guards the addp4
C group, and the instruction-group stops are not visible in this excerpt
C (the embedded original line numbers skip), so they are not described here.
C addp4 forms a 64-bit address from a 32-bit pointer.
65 addp4 rp = 0, rp C M I
66 addp4 up = 0, up C M I
67 addp4 vp = 0, vp C M I
C r11 = vp[0] and r10 = up[0]; each pointer post-increments by one 8-byte limb.
71 {.mmi; ld8 r11 = [vp], 8 C M01
72 ld8 r10 = [up], 8 C M01
C r14 = n mod 4 selects the feed-in path; p15 = (n > 4).
74 }{.mmi; and r14 = 3, n C M I
75 cmp.lt p15, p0 = 4, n C M I
C p6, p7 and p8 flag a residue of 1, 2 and 3 respectively.
78 }{.mmi; cmp.eq p6, p0 = 1, r14 C M I
79 cmp.eq p7, p0 = 2, r14 C M I
80 cmp.eq p8, p0 = 3, r14 C M I
C Residue 0 takes none of these branches and falls through to .Lb00.
82 (p6) br.dptk .Lb01 C B
83 (p7) br.dptk .Lb10 C B
84 (p8) br.dptk .Lb11 C B
C Feed-in for n mod 4 == 0.  Loads the next three limb pairs and forms the
C first raw result w3 from the limbs preloaded at entry (r10, r11).
C ADDSUB, PRED, LIM and INCR are macros whose definitions are not visible in
C this excerpt; presumably they select add or subtract, the matching
C carry/borrow test, and the value/adjustment used to ripple a carry or
C borrow into the next limb -- TODO confirm against the ifdef blocks.
87 .Lb00: ld8 v0 = [vp], 8 C M01
88 ld8 u0 = [up], 8 C M01
91 ld8 v1 = [vp], 8 C M01
92 ld8 u1 = [up], 8 C M01
93 ADDSUB w3 = r10, r11 C M I
95 ld8 v2 = [vp], 8 C M01
96 ld8 u2 = [up], 8 C M01
C p15 was set at entry when n > 4: take the path that keeps loading.
97 (p15) br.dpnt .grt4 C B
C Short path (n <= 4, so n == 4 for nonzero n).  Carry pattern used here:
C cmp.PRED sets a predicate from a raw result and its u operand; when the
C previous predicate is set, cmp.eq.or also sets it if the raw result equals
C LIM, and the predicated add of INCR applies the incoming carry.
100 cmp.PRED p7, p0 = w3, r10 C M I
102 ADDSUB w0 = u0, v0 C M I
104 cmp.PRED p8, p0 = w0, u0 C M I
105 ADDSUB w1 = u1, v1 C M I
107 cmp.PRED p9, p0 = w1, u1 C M I
108 (p7) cmp.eq.or p8, p0 = LIM, w0 C M I
109 (p7) add w0 = INCR, w0 C M I
C x3 = low 64 bits of (w0:w3) >> 1, so bit 0 of w0 lands in bit 63 of x3.
111 shrp x3 = w0, w3, 1 C I0
112 ADDSUB w2 = u2, v2 C M I
113 (p8) cmp.eq.or p9, p0 = LIM, w1 C M I
114 (p8) add w1 = INCR, w1 C M I
C NOTE(review): no exit from the short path is visible before .grt4; the
C embedded numbering skips here, so a branch into the wind-down code is
C presumably missing from this excerpt.
C Long path (n > 4): the same computations as the short path, interleaved
C with loads of the following limbs.
117 .grt4: ld8 v3 = [vp], 8 C M01
118 cmp.PRED p7, p0 = w3, r10 C M I
119 ld8 u3 = [up], 8 C M01
122 ADDSUB w0 = u0, v0 C M I
123 ld8 v0 = [vp], 8 C M01
126 cmp.PRED p8, p0 = w0, u0 C M I
127 ld8 u0 = [up], 8 C M01
128 ADDSUB w1 = u1, v1 C M I
130 ld8 v1 = [vp], 8 C M01
132 cmp.PRED p9, p0 = w1, u1 C M I
133 ld8 u1 = [up], 8 C M01
134 (p7) cmp.eq.or p8, p0 = LIM, w0 C M I
135 (p7) add w0 = INCR, w0 C M I
137 ADDSUB w2 = u2, v2 C M I
138 ld8 v2 = [vp], 8 C M01
139 shrp x3 = w0, w3, 1 C I0
140 (p8) cmp.eq.or p9, p0 = LIM, w1 C M I
141 (p8) add w1 = INCR, w1 C M I
C NOTE(review): the transfer into the main loop is not visible in this excerpt.
C Feed-in for n mod 4 == 1.  w2 is the raw result for the limb pair
C preloaded at entry (r10, r11).  ADDSUB, PRED, LIM and INCR are macros
C defined outside this excerpt (presumably add/sub, the carry/borrow test,
C and the carry-ripple constants -- TODO confirm).
145 .Lb01: ADDSUB w2 = r10, r11 C M I
C p15 was set at entry when n > 4.
147 (p15) br.dpnt .grt1 C B
C Short path (n <= 4, so n == 1 for nonzero n): shift the single raw result
C right by one and, when p6 is set by the PRED test, deposit a 1 into bit 63.
150 cmp.PRED p6, p7 = w2, r10 C M I
151 shr.u x2 = w2, 1 C I0
154 (p6) dep x2 = -1, x2, 63, 1 C I0
C NOTE(review): no exit from the short path is visible before .grt1; a jump
C to the final store is presumably missing from this excerpt.
C n > 4: load four more limb pairs.
157 .grt1: ld8 v3 = [vp], 8 C M01
158 ld8 u3 = [up], 8 C M01
160 ld8 v0 = [vp], 8 C M01
161 ld8 u0 = [up], 8 C M01
C ar.lc is the counter consumed by br.cloop.  Any scaling of n into an
C iteration count is not visible in this excerpt.
162 mov.i ar.lc = n C FIXME swap with next I0
164 ld8 v1 = [vp], 8 C M01
165 ld8 u1 = [up], 8 C M01
167 ld8 v2 = [vp], 8 C M01
168 ld8 u2 = [up], 8 C M01
169 cmp.PRED p6, p0 = w2, r10 C M I
171 ADDSUB w3 = u3, v3 C M I
C br.cloop decrements ar.lc and branches while it is nonzero.  The
C fall-through path below issues no further loads.
172 br.cloop.dptk .grt5 C B
C Ripple the carry: p6 (from w2) into w3, then p7 (from w3) into w0.
175 cmp.PRED p7, p0 = w3, u3 C M I
177 ADDSUB w0 = u0, v0 C M I
178 (p6) cmp.eq.or p7, p0 = LIM, w3 C M I
179 (p6) add w3 = INCR, w3 C M I
181 cmp.PRED p8, p0 = w0, u0 C M I
C x2 = low 64 bits of (w3:w2) >> 1.
182 shrp x2 = w3, w2, 1 C I0
183 ADDSUB w1 = u1, v1 C M I
185 cmp.PRED p9, p0 = w1, u1 C M I
186 (p7) cmp.eq.or p8, p0 = LIM, w0 C M I
187 (p7) add w0 = INCR, w0 C M I
C NOTE(review): the exit of this path is not visible in this excerpt.
C ar.lc was nonzero: same computations as above, interleaved with loads of
C the following limbs.
190 .grt5: ld8 v3 = [vp], 8 C M01
191 cmp.PRED p7, p0 = w3, u3 C M I
192 ld8 u3 = [up], 8 C M01
194 ADDSUB w0 = u0, v0 C M I
195 ld8 v0 = [vp], 8 C M01
196 (p6) cmp.eq.or p7, p0 = LIM, w3 C M I
197 (p6) add w3 = INCR, w3 C M I
199 cmp.PRED p8, p0 = w0, u0 C M I
200 shrp x2 = w3, w2, 1 C I0
201 ld8 u0 = [up], 8 C M01
202 ADDSUB w1 = u1, v1 C M I
204 ld8 v1 = [vp], 8 C M01
205 cmp.PRED p9, p0 = w1, u1 C M I
206 ld8 u1 = [up], 8 C M01
207 (p7) cmp.eq.or p8, p0 = LIM, w0 C M I
208 (p7) add w0 = INCR, w0 C M I
C NOTE(review): the transfer into the main loop is not visible in this excerpt.
C Feed-in for n mod 4 == 2.  Loads one more limb pair and forms the raw
C result w1 from the limbs preloaded at entry (r10, r11).  ADDSUB, PRED, LIM
C and INCR are macros defined outside this excerpt (presumably add/sub, the
C carry/borrow test, and the carry-ripple constants -- TODO confirm).
212 .Lb10: ld8 v2 = [vp], 8 C M01
213 ld8 u2 = [up], 8 C M01
215 ADDSUB w1 = r10, r11 C M I
C p15 was set at entry when n > 4.
216 (p15) br.dpnt .grt2 C B
C Short path (n <= 4, so n == 2): p9 is the carry test on w1; it is rippled
C into w2, whose own carry test ends up in p6.
219 cmp.PRED p9, p0 = w1, r10 C M I
221 ADDSUB w2 = u2, v2 C M I
223 cmp.PRED p6, p0 = w2, u2 C M I
225 (p9) cmp.eq.or p6, p0 = LIM, w2 C M I
226 (p9) add w2 = INCR, w2 C M I
C x1 = low 64 bits of (w2:w1) >> 1; x2 = w2 >> 1 with a zero top bit.
228 shrp x1 = w2, w1, 1 C I0
229 shr.u x2 = w2, 1 C I0
C NOTE(review): no exit from the short path is visible before .grt2; a jump
C into the wind-down code is presumably missing from this excerpt.
C n > 4: keep loading while the first results are formed.
232 .grt2: ld8 v3 = [vp], 8 C M01
233 ld8 u3 = [up], 8 C M01
235 ld8 v0 = [vp], 8 C M01
236 ld8 u0 = [up], 8 C M01
239 ld8 v1 = [vp], 8 C M01
240 cmp.PRED p9, p0 = w1, r10 C M I
241 ld8 u1 = [up], 8 C M01
244 ADDSUB w2 = u2, v2 C M I
245 ld8 v2 = [vp], 8 C M01
247 cmp.PRED p6, p0 = w2, u2 C M I
248 ld8 u2 = [up], 8 C M01
249 ADDSUB w3 = u3, v3 C M I
C br.cloop decrements ar.lc and branches while it is nonzero.  The write of
C ar.lc for this path is not visible in this excerpt.  The fall-through
C path below issues no further loads.
250 br.cloop.dptk .grt6 C B
C Ripple the carry: p9 (from w1) into w2, then p6 (from w2) into w3.
253 cmp.PRED p7, p0 = w3, u3 C M I
254 (p9) cmp.eq.or p6, p0 = LIM, w2 C M I
255 (p9) add w2 = INCR, w2 C M I
257 shrp x1 = w2, w1, 1 C I0
258 ADDSUB w0 = u0, v0 C M I
259 (p6) cmp.eq.or p7, p0 = LIM, w3 C M I
260 (p6) add w3 = INCR, w3 C M I
C NOTE(review): the exit of this path is not visible in this excerpt.
C ar.lc was nonzero: same computations as above, interleaved with loads of
C the following limbs.
263 .grt6: ld8 v3 = [vp], 8 C M01
264 cmp.PRED p7, p0 = w3, u3 C M I
265 ld8 u3 = [up], 8 C M01
266 (p9) cmp.eq.or p6, p0 = LIM, w2 C M I
267 (p9) add w2 = INCR, w2 C M I
269 shrp x1 = w2, w1, 1 C I0
270 ADDSUB w0 = u0, v0 C M I
271 ld8 v0 = [vp], 8 C M01
272 (p6) cmp.eq.or p7, p0 = LIM, w3 C M I
273 (p6) add w3 = INCR, w3 C M I
C NOTE(review): the transfer into the main loop is not visible in this excerpt.
C Feed-in for n mod 4 == 3.  Loads two more limb pairs and forms the raw
C result w0 from the limbs preloaded at entry (r10, r11).  ADDSUB, PRED, LIM
C and INCR are macros defined outside this excerpt (presumably add/sub, the
C carry/borrow test, and the carry-ripple constants -- TODO confirm).
277 .Lb11: ld8 v1 = [vp], 8 C M01
278 ld8 u1 = [up], 8 C M01
281 ld8 v2 = [vp], 8 C M01
282 ld8 u2 = [up], 8 C M01
283 ADDSUB w0 = r10, r11 C M I
C p15 was set at entry when n > 4.
284 (p15) br.dpnt .grt3 C B
C Short path (n <= 4, so n == 3): the carry ripples p8 (from w0) into w1,
C then p9 (from w1) into w2; the carry test on w2 ends up in p6.
287 cmp.PRED p8, p0 = w0, r10 C M I
288 ADDSUB w1 = u1, v1 C M I
291 cmp.PRED p9, p0 = w1, u1 C M I
293 ADDSUB w2 = u2, v2 C M I
294 (p8) cmp.eq.or p9, p0 = LIM, w1 C M I
295 (p8) add w1 = INCR, w1 C M I
297 cmp.PRED p6, p0 = w2, u2 C M I
C x0 = low 64 bits of (w1:w0) >> 1.
298 shrp x0 = w1, w0, 1 C I0
300 (p9) cmp.eq.or p6, p0 = LIM, w2 C M I
301 (p9) add w2 = INCR, w2 C M I
C NOTE(review): no exit from the short path is visible before .grt3; a jump
C into the wind-down code is presumably missing from this excerpt.
C n > 4: keep loading while the first results are formed.
304 .grt3: ld8 v3 = [vp], 8 C M01
305 ld8 u3 = [up], 8 C M01
307 ld8 v0 = [vp], 8 C M01
309 cmp.PRED p8, p0 = w0, r10 C M I
310 ld8 u0 = [up], 8 C M01
311 ADDSUB w1 = u1, v1 C M I
314 ld8 v1 = [vp], 8 C M01
315 cmp.PRED p9, p0 = w1, u1 C M I
316 ld8 u1 = [up], 8 C M01
318 ADDSUB w2 = u2, v2 C M I
319 ld8 v2 = [vp], 8 C M01
320 (p8) cmp.eq.or p9, p0 = LIM, w1 C M I
321 (p8) add w1 = INCR, w1 C M I
323 cmp.PRED p6, p0 = w2, u2 C M I
324 shrp x0 = w1, w0, 1 C I0
325 ld8 u2 = [up], 8 C M01
326 ADDSUB w3 = u3, v3 C M I
C br.cloop decrements ar.lc and branches while it is nonzero.  The write of
C ar.lc for this path is not visible in this excerpt.  The fall-through
C path below issues no further loads.
327 br.cloop.dptk .grt7 C B
330 cmp.PRED p7, p0 = w3, u3 C M I
331 (p9) cmp.eq.or p6, p0 = LIM, w2 C M I
332 (p9) add w2 = INCR, w2 C M I
C NOTE(review): the exit of this path is not visible in this excerpt.
C ar.lc was nonzero: same computations as above plus the next limb loads.
335 .grt7: ld8 v3 = [vp], 8 C M01
336 cmp.PRED p7, p0 = w3, u3 C M I
337 ld8 u3 = [up], 8 C M01
338 (p9) cmp.eq.or p6, p0 = LIM, w2 C M I
339 (p9) add w2 = INCR, w2 C M I
C NOTE(review): the transfer into the main loop is not visible in this excerpt.
343 C *** MAIN LOOP START ***
C Four limbs per iteration: four stores through rp, four u/v load pairs,
C four ADDSUB results and four shrp shifts.  A stored x value was produced
C by a shrp issued earlier, and the loads fetch limbs consumed by later
C ADDSUB steps, so the steps of neighbouring limbs overlap.
C Carry handling per limb: cmp.PRED sets a predicate from the raw result and
C its u operand; if the previous predicate is set, cmp.eq.or also sets it
C when the raw result equals LIM, and the predicated add applies INCR.
C Predicate/result pairing: p6-w2, p7-w3, p8-w0, p9-w1.
C ADDSUB, PRED, LIM and INCR are macros defined outside this excerpt.
C .LL11, .LL10, .LL01 and .LL00 are mid-iteration labels; the branches that
C target them are not visible here (presumably they are in the feed-in code).
345 .Loop: st8 [rp] = x3, 8 C M23
346 ld8 v3 = [vp], 8 C M01
347 cmp.PRED p7, p0 = w3, u3 C M I
348 ld8 u3 = [up], 8 C M01
349 (p9) cmp.eq.or p6, p0 = LIM, w2 C M I
350 (p9) add w2 = INCR, w2 C M I
352 .LL11: st8 [rp] = x0, 8 C M23
C x1 = low 64 bits of (w2:w1) >> 1; the other shrp lines follow the same form.
353 shrp x1 = w2, w1, 1 C I0
354 ADDSUB w0 = u0, v0 C M I
355 ld8 v0 = [vp], 8 C M01
356 (p6) cmp.eq.or p7, p0 = LIM, w3 C M I
357 (p6) add w3 = INCR, w3 C M I
359 .LL10: cmp.PRED p8, p0 = w0, u0 C M I
360 shrp x2 = w3, w2, 1 C I0
362 ld8 u0 = [up], 8 C M01
363 ADDSUB w1 = u1, v1 C M I
366 st8 [rp] = x1, 8 C M23
367 ld8 v1 = [vp], 8 C M01
368 cmp.PRED p9, p0 = w1, u1 C M I
369 ld8 u1 = [up], 8 C M01
370 (p7) cmp.eq.or p8, p0 = LIM, w0 C M I
371 (p7) add w0 = INCR, w0 C M I
373 .LL01: st8 [rp] = x2, 8 C M23
374 shrp x3 = w0, w3, 1 C I0
375 ADDSUB w2 = u2, v2 C M I
376 ld8 v2 = [vp], 8 C M01
377 (p8) cmp.eq.or p9, p0 = LIM, w1 C M I
378 (p8) add w1 = INCR, w1 C M I
380 .LL00: cmp.PRED p6, p0 = w2, u2 C M I
381 shrp x0 = w1, w0, 1 C I0
383 ld8 u2 = [up], 8 C M01
384 ADDSUB w3 = u3, v3 C M I
C Decrement ar.lc and repeat while it is nonzero; otherwise fall through.
385 br.cloop.dptk .Loop C B
387 C *** MAIN LOOP END ***
C Wind-down: finish the limbs already loaded, with no further loads.  The
C steps mirror the main loop minus its ld8 instructions.  From label .LcjN
C to the return there are exactly N stores (8 from .Lskip), so each label is
C an entry point for N limbs still to be written.  The branches that target
C the .Lcj labels are not visible in this excerpt (presumably they come from
C the feed-in short paths).
C ADDSUB, PRED, LIM and INCR are macros defined outside this excerpt.
389 .Lskip: st8 [rp] = x3, 8 C M23
390 cmp.PRED p7, p0 = w3, u3 C M I
391 (p9) cmp.eq.or p6, p0 = LIM, w2 C M I
392 (p9) add w2 = INCR, w2 C M I
394 .Lcj7: st8 [rp] = x0, 8 C M23
395 shrp x1 = w2, w1, 1 C I0
396 ADDSUB w0 = u0, v0 C M I
397 (p6) cmp.eq.or p7, p0 = LIM, w3 C M I
398 (p6) add w3 = INCR, w3 C M I
400 .Lcj6: cmp.PRED p8, p0 = w0, u0 C M I
401 shrp x2 = w3, w2, 1 C I0
402 ADDSUB w1 = u1, v1 C M I
404 st8 [rp] = x1, 8 C M23
405 cmp.PRED p9, p0 = w1, u1 C M I
406 (p7) cmp.eq.or p8, p0 = LIM, w0 C M I
407 (p7) add w0 = INCR, w0 C M I
409 .Lcj5: st8 [rp] = x2, 8 C M23
410 shrp x3 = w0, w3, 1 C I0
411 ADDSUB w2 = u2, v2 C M I
412 (p8) cmp.eq.or p9, p0 = LIM, w1 C M I
413 (p8) add w1 = INCR, w1 C M I
415 .Lcj4: cmp.PRED p6, p0 = w2, u2 C M I
416 shrp x0 = w1, w0, 1 C I0
418 st8 [rp] = x3, 8 C M23
419 (p9) cmp.eq.or p6, p0 = LIM, w2 C M I
420 (p9) add w2 = INCR, w2 C M I
422 .Lcj3: st8 [rp] = x0, 8 C M23
C x1 = low 64 bits of (w2:w1) >> 1; x2 = w2 >> 1 with a zero top bit for now.
423 shrp x1 = w2, w1, 1 C I0
424 shr.u x2 = w2, 1 C I0
426 .Lcj2: st8 [rp] = x1, 8 C M23
C If p6 (the carry test for w2) is set, deposit a 1 into bit 63 of the last
C limb.
427 (p6) dep x2 = -1, x2, 63, 1 C I0
C Final store; no post-increment since rp is not used again.
429 .Lcj1: st8 [rp] = x2 C M23
C Write ar.lc from r2.  Presumably r2 holds the value of ar.lc saved at
C entry; that save is not visible in this excerpt -- TODO confirm.
430 mov.i ar.lc = r2 C I0
431 br.ret.sptk.many b0 C B