1 dnl IA-64 mpn_addlsh1_n/mpn_sublsh1_n -- rp[] = up[] +- (vp[] << 1).
3 dnl Copyright 2003, 2004, 2005 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of the GNU Lesser General Public License as published
9 dnl by the Free Software Foundation; either version 3 of the License, or (at
10 dnl your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful, but
13 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15 dnl License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 include(`../config.m4')
27 C * Use shladd in feed-in code (for mpn_addlsh1_n).
C Select the entry-point name for the operation being built: func becomes
C mpn_addlsh1_n or mpn_sublsh1_n depending on which OPERATION_ symbol is
C defined.
C NOTE(review): ADDSUB, PRED, INCR and LIM are used by the code below, but
C their definitions and the closing quotes of these ifdef bodies are not
C among the visible lines -- confirm them against the complete file.
35 ifdef(`OPERATION_addlsh1_n',`
40 define(func, mpn_addlsh1_n)
42 ifdef(`OPERATION_sublsh1_n',`
47 define(func, mpn_sublsh1_n)
50 C Some useful aliases for registers we use
C   u0-u3  limbs loaded through up
C   v0-v3  limbs loaded through vp
C   w0-w3  ADDSUB results, stored through rp
C   x0-x3  vp limbs after the one-bit left shift (built with add or shrp)
C   s0-s3  defined here but not referenced in the visible code
C x2 and x3 name the same registers as x0 and x1 (r30, r31), so at most two
C shifted values can be held at any one time.
51 define(`u0',`r14') define(`u1',`r15') define(`u2',`r16') define(`u3',`r17')
52 define(`v0',`r18') define(`v1',`r19') define(`v2',`r20') define(`v3',`r21')
53 define(`w0',`r22') define(`w1',`r23') define(`w2',`r24') define(`w3',`r25')
54 define(`s0',`r26') define(`s1',`r27') define(`s2',`r28') define(`s3',`r29')
55 define(`x0',`r30') define(`x1',`r31') define(`x2',`r30') define(`x3',`r31')
C Names the two entry points this file can provide.  MULFUNC_PROLOGUE is
C defined outside this file; presumably it is consumed by the build
C machinery -- TODO confirm.
57 MULFUNC_PROLOGUE(mpn_addlsh1_n mpn_sublsh1_n)
C Function entry: fetch the first limb of each source operand and dispatch
C on n mod 4 to one of four feed-in sequences.
C rp, up, vp and n are names defined outside the visible lines.  Going by the
C file header (rp[] = up[] +- (vp[] << 1)), rp is the destination and up, vp
C are the sources -- TODO confirm the register assignment.
C
C addp4 (IA-64 add pointer) forms a 64-bit address from a 32-bit pointer.
C NOTE(review): these three presumably belong under a 32-bit-ABI conditional
C that is not among the visible lines.
65 addp4 rp = 0, rp C M I
66 addp4 up = 0, up C M I
67 addp4 vp = 0, vp C M I
C r11 = first vp limb, r10 = first up limb; each pointer post-increments by
C 8 bytes.
71 {.mmi; ld8 r11 = [vp], 8 C M01
72 ld8 r10 = [up], 8 C M01
C r14 = n mod 4 picks the feed-in path; p15 = (n > 4) picks the longer
C variant of that path (the .grtN labels).
74 }{.mmi; and r14 = 3, n C M I
75 cmp.lt p15, p0 = 4, n C M I
C p6, p7, p8 flag n mod 4 == 1, 2, 3 respectively.
78 }{.mmi; cmp.eq p6, p0 = 1, r14 C M I
79 cmp.eq p7, p0 = 2, r14 C M I
80 cmp.eq p8, p0 = 3, r14 C M I
C When none of the three predicates is set (n mod 4 == 0) execution
C continues past these branches into .Lb00.
82 (p6) br.dptk .Lb01 C B
83 (p7) br.dptk .Lb10 C B
84 (p8) br.dptk .Lb11 C B
C Feed-in for n mod 4 == 0.
C Loads three more limb pairs, forms the first raw result
C w3 = r10 ADDSUB (r11 << 1), then takes .grt4 when p15 is set (n > 4).
C
C shrp xK = hi, lo, 63 yields (hi << 1) | (lo >> 63): the next vp limb
C shifted left by one, with the bit shifted out of the previous limb brought
C in at the bottom.
C cmp.PRED compares each raw result with its up operand; the resulting
C predicate is used later as the carry/borrow out of that limb.  PRED and
C ADDSUB are defined outside the visible lines -- TODO confirm.
87 .Lb00: ld8 v0 = [vp], 8 C M01
88 ld8 u0 = [up], 8 C M01
91 ld8 v1 = [vp], 8 C M01
92 ld8 u1 = [up], 8 C M01
C x3 = r11 << 1, computed as r11 + r11 (lowest limb, nothing shifts in).
93 add x3 = r11, r11 C M I
95 ld8 v2 = [vp], 8 C M01
96 ld8 u2 = [up], 8 C M01
97 ADDSUB w3 = r10, x3 C M I
98 (p15) br.dpnt .grt4 C B
C p15 clear (n not above 4): all limbs are already loaded.
100 shrp x0 = v0, r11, 63 C I0
101 cmp.PRED p8, p0 = w3, r10 C M I
103 shrp x1 = v1, v0, 63 C I0
104 ADDSUB w0 = u0, x0 C M I
106 cmp.PRED p6, p0 = w0, u0 C M I
107 ADDSUB w1 = u1, x1 C M I
C NOTE(review): the transfer from here to the wind-down code is not among
C the visible lines; as shown, execution would run on into .grt4.
C
C p15 set (n > 4): the same arithmetic, interleaved with loads of the limbs
C needed by the next stage.
110 .grt4: ld8 v3 = [vp], 8 C M01
111 shrp x0 = v0, r11, 63 C I0
112 cmp.PRED p8, p0 = w3, r10 C M I
115 ld8 u3 = [up], 8 C M01
117 shrp x1 = v1, v0, 63 C I0
C v0 and u0 are reloaded for a later limb once their old values have been
C consumed by the shrp above and by the ADDSUB/cmp pair below.
118 ld8 v0 = [vp], 8 C M01
119 ADDSUB w0 = u0, x0 C M I
121 cmp.PRED p6, p0 = w0, u0 C M I
122 ld8 u0 = [up], 8 C M01
123 ADDSUB w1 = u1, x1 C M I
C Feed-in for n mod 4 == 1.
C x2 = r11 << 1, computed as r11 + r11.
126 .Lb01: add x2 = r11, r11 C M I
128 (p15) br.dpnt .grt1 C B
C p15 clear: the single limb is finished inline and the function returns.
C r8 (marked retval) starts as the bit shifted out of r11 and is bumped by
C one when p6 reports a carry/borrow out of the ADDSUB.
130 ADDSUB w2 = r10, x2 C M I
131 shr.u r8 = r11, 63 C retval I0
133 cmp.PRED p6, p0 = w2, r10 C M I
135 st8 [rp] = w2, 8 C M23
136 (p6) add r8 = 1, r8 C M I
137 br.ret.sptk.many b0 C B
C p15 set (n > 4): load the next limb pairs and set up the loop counter.
139 .grt1: ld8 v3 = [vp], 8 C M01
140 ld8 u3 = [up], 8 C M01
C ar.lc drives the br.cloop branches below and in the main loop.
C NOTE(review): any adjustment of n before this point is not among the
C visible lines.
141 mov.i ar.lc = n C FIXME swap with next I0
143 ld8 v0 = [vp], 8 C M01
144 ld8 u0 = [up], 8 C M01
147 ld8 v1 = [vp], 8 C M01
148 ld8 u1 = [up], 8 C M01
149 shrp x3 = v3, r11, 63 C I0
151 ld8 v2 = [vp], 8 C M01
152 ld8 u2 = [up], 8 C M01
C NOTE(review): w2 is tested here, but the instruction producing it on the
C .grt1 path is not among the visible lines.
153 cmp.PRED p6, p0 = w2, r10 C M I
154 ADDSUB w3 = u3, x3 C M I
C br.cloop branches (and decrements ar.lc) while ar.lc is nonzero.
155 br.cloop.dptk .grt5 C B
C Loop counter exhausted: prepare the next limb without further loads.
157 shrp x0 = v0, v3, 63 C I0
158 cmp.PRED p8, p0 = w3, u3 C M I
C NOTE(review): the continuation after each of the two sequences here is
C not among the visible lines.
161 .grt5: shrp x0 = v0, v3, 63 C I0
162 ld8 v3 = [vp], 8 C M01
163 cmp.PRED p8, p0 = w3, u3 C M I
C Feed-in for n mod 4 == 2.
C Loads the second limb pair and forms x1 = r11 << 1 (as r11 + r11).
166 .Lb10: ld8 v2 = [vp], 8 C M01
167 ld8 u2 = [up], 8 C M01
169 add x1 = r11, r11 C M I
170 (p15) br.dpnt .grt2 C B
C p15 clear: both limbs are in registers.  w1 and w2 are the raw results,
C p8 and p6 their carry/borrow predicates, and r8 (marked retval) is the bit
C shifted out of the top vp limb v2.
172 ADDSUB w1 = r10, x1 C M I
173 shrp x2 = v2, r11, 63 C I0
175 cmp.PRED p8, p0 = w1, r10 C M I
176 ADDSUB w2 = u2, x2 C M I
177 shr.u r8 = v2, 63 C retval I0
179 cmp.PRED p6, p0 = w2, u2 C M I
C NOTE(review): the transfer from here to the wind-down code is not among
C the visible lines; as shown, execution would run on into .grt2.
C
C p15 set (n > 4): the same first two limbs, interleaved with loads of the
C following limbs, then a counted branch into the main loop.
182 .grt2: ld8 v3 = [vp], 8 C M01
183 ld8 u3 = [up], 8 C M01
186 ld8 v0 = [vp], 8 C M01
187 ld8 u0 = [up], 8 C M01
188 ADDSUB w1 = r10, x1 C M I
190 ld8 v1 = [vp], 8 C M01
191 shrp x2 = v2, r11, 63 C I0
192 cmp.PRED p8, p0 = w1, r10 C M I
194 ld8 u1 = [up], 8 C M01
195 shrp x3 = v3, v2, 63 C I0
C v2 and u2 are reloaded only after their old values have been consumed.
196 ld8 v2 = [vp], 8 C M01
197 ADDSUB w2 = u2, x2 C M I
199 cmp.PRED p6, p0 = w2, u2 C M I
200 ld8 u2 = [up], 8 C M01
201 ADDSUB w3 = u3, x3 C M I
C NOTE(review): br.cloop depends on ar.lc, whose setup on this path is not
C among the visible lines.
202 br.cloop.dpnt .Loop C B
C Feed-in for n mod 4 == 3.
C Loads two more limb pairs and forms x0 = r11 << 1 (as r11 + r11).
205 .Lb11: ld8 v1 = [vp], 8 C M01
206 ld8 u1 = [up], 8 C M01
208 add x0 = r11, r11 C M I
210 ld8 v2 = [vp], 8 C M01
211 ld8 u2 = [up], 8 C M01
212 (p15) br.dpnt .grt3 C B
C p15 clear: all three limbs are in registers.  Produces raw results w0, w1
C with carry/borrow predicates p6, p8, and the shifted third limb x2.
215 shrp x1 = v1, r11, 63 C I0
216 ADDSUB w0 = r10, x0 C M I
218 cmp.PRED p6, p0 = w0, r10 C M I
219 ADDSUB w1 = u1, x1 C M I
221 shrp x2 = v2, v1, 63 C I0
222 cmp.PRED p8, p0 = w1, u1 C M I
C NOTE(review): the transfer from here to the wind-down code is not among
C the visible lines; as shown, execution would run on into .grt3.
C
C p15 set (n > 4): the same arithmetic, interleaved with loads of the limbs
C needed by the next stage.
225 .grt3: ld8 v3 = [vp], 8 C M01
226 ld8 u3 = [up], 8 C M01
228 shrp x1 = v1, r11, 63 C I0
229 ADDSUB w0 = r10, x0 C M I
231 ld8 v0 = [vp], 8 C M01
232 cmp.PRED p6, p0 = w0, r10 C M I
233 ld8 u0 = [up], 8 C M01
234 ADDSUB w1 = u1, x1 C M I
236 shrp x2 = v2, v1, 63 C I0
C v1 is reloaded only after the shrp above has consumed its old value.
237 ld8 v1 = [vp], 8 C M01
238 cmp.PRED p8, p0 = w1, u1 C M I
C NOTE(review): the continuation after this point is not among the visible
C lines.
242 C *** MAIN LOOP START ***
C Each pass handles four limbs.  For every limb:
C   shrp           builds the vp limb shifted left by one bit
C   ADDSUB         forms the raw result w = u ADDSUB x
C   cmp.PRED       records the carry/borrow of that raw result in p6 or p8
C                  (the two predicates alternate from limb to limb)
C   (pN) cmp.eq.or when the previous limb carried and this raw result equals
C                  LIM, the carry is passed on to the next predicate
C   (pN) add       when the previous limb carried, adjust this result by INCR
C   st8            stores the finished limb through rp
C Loads for later limbs are interleaved with this work.
C ADDSUB, PRED, INCR and LIM are defined outside the visible lines.
C .LL01, .LL00 and .LL11 are labels inside the body; no branch to them
C appears in the visible lines.
244 .Loop: st8 [rp] = w1, 8 C M23
245 shrp x0 = v0, v3, 63 C I0
246 (p8) cmp.eq.or p6, p0 = LIM, w2 C M I
247 (p8) add w2 = INCR, w2 C M I
248 ld8 v3 = [vp], 8 C M01
249 cmp.PRED p8, p0 = w3, u3 C M I
251 .LL01: ld8 u3 = [up], 8 C M01
252 shrp x1 = v1, v0, 63 C I0
253 (p6) cmp.eq.or p8, p0 = LIM, w3 C M I
254 (p6) add w3 = INCR, w3 C M I
255 ld8 v0 = [vp], 8 C M01
256 ADDSUB w0 = u0, x0 C M I
258 st8 [rp] = w2, 8 C M23
259 cmp.PRED p6, p0 = w0, u0 C M I
260 ld8 u0 = [up], 8 C M01
261 ADDSUB w1 = u1, x1 C M I
263 .LL00: st8 [rp] = w3, 8 C M23
264 shrp x2 = v2, v1, 63 C I0
265 (p8) cmp.eq.or p6, p0 = LIM, w0 C M I
266 (p8) add w0 = INCR, w0 C M I
267 ld8 v1 = [vp], 8 C M01
268 cmp.PRED p8, p0 = w1, u1 C M I
270 .LL11: ld8 u1 = [up], 8 C M01
271 shrp x3 = v3, v2, 63 C I0
272 (p6) cmp.eq.or p8, p0 = LIM, w1 C M I
273 (p6) add w1 = INCR, w1 C M I
274 ld8 v2 = [vp], 8 C M01
275 ADDSUB w2 = u2, x2 C M I
277 st8 [rp] = w0, 8 C M23
278 cmp.PRED p6, p0 = w2, u2 C M I
279 ld8 u2 = [up], 8 C M01
280 ADDSUB w3 = u3, x3 C M I
C br.cloop branches back, decrementing ar.lc, while ar.lc is nonzero;
C otherwise execution continues into the wind-down code.
281 br.cloop.dptk .Loop C B
283 C *** MAIN LOOP END ***
C Wind-down: finishes the limbs still in flight when the loop exits.  The
C steps mirror the loop body, without the loads.
C .Lcj5 through .Lcj1 label successive stages.  NOTE(review): the branches
C that enter at those labels are not among the visible lines; presumably
C the short feed-in paths jump here -- TODO confirm.
285 .Lskip: st8 [rp] = w1, 8 C M23
286 shrp x0 = v0, v3, 63 C I0
287 (p8) cmp.eq.or p6, p0 = LIM, w2 C M I
288 (p8) add w2 = INCR, w2 C M I
289 cmp.PRED p8, p0 = w3, u3 C M I
291 .Lcj5: shrp x1 = v1, v0, 63 C I0
292 (p6) cmp.eq.or p8, p0 = LIM, w3 C M I
293 (p6) add w3 = INCR, w3 C M I
294 ADDSUB w0 = u0, x0 C M I
296 st8 [rp] = w2, 8 C M23
297 cmp.PRED p6, p0 = w0, u0 C M I
298 ADDSUB w1 = u1, x1 C M I
300 .Lcj4: st8 [rp] = w3, 8 C M23
301 shrp x2 = v2, v1, 63 C I0
302 (p8) cmp.eq.or p6, p0 = LIM, w0 C M I
303 (p8) add w0 = INCR, w0 C M I
304 cmp.PRED p8, p0 = w1, u1 C M I
C r8 = top bit of v2, the highest vp limb fed to shrp above; this is the bit
C shifted out of the whole operand and the starting value of the result.
306 .Lcj3: shr.u r8 = v2, 63 C I0
307 (p6) cmp.eq.or p8, p0 = LIM, w1 C M I
308 (p6) add w1 = INCR, w1 C M I
309 ADDSUB w2 = u2, x2 C M I
311 st8 [rp] = w0, 8 C M23
312 cmp.PRED p6, p0 = w2, u2 C M I
314 .Lcj2: st8 [rp] = w1, 8 C M23
315 (p8) cmp.eq.or p6, p0 = LIM, w2 C M I
316 (p8) add w2 = INCR, w2 C M I
C Store the last limb, put r2 back into ar.lc, add the final carry/borrow
C (p6) to r8 and return through b0.
C NOTE(review): r2 presumably holds the ar.lc value saved at entry; that
C save is not among the visible lines.
318 .Lcj1: st8 [rp] = w2, 8 C M23
319 mov.i ar.lc = r2 C I0
320 (p6) add r8 = 1, r8 C M I
321 br.ret.sptk.many b0 C B