Tizen 2.1 base
[external/gmp.git] / mpn / ia64 / lorrshift.asm
1 dnl  IA-64 mpn_lshift/mpn_rshift.
2
3 dnl  Copyright 2000, 2001, 2002, 2003, 2004, 2005 Free Software Foundation,
4 dnl  Inc.
5
6 dnl  This file is part of the GNU MP Library.
7
8 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
9 dnl  it under the terms of the GNU Lesser General Public License as published
10 dnl  by the Free Software Foundation; either version 3 of the License, or (at
11 dnl  your option) any later version.
12
13 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
14 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
15 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
16 dnl  License for more details.
17
18 dnl  You should have received a copy of the GNU Lesser General Public License
19 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
20
21 include(`../config.m4')
22
23 C           cycles/limb
24 C Itanium:      2.0
25 C Itanium 2:    1.0
26
27 C This code is scheduled deeply since the plain shift instructions shr and shl
28 C have a latency of 4 (on Itanium) or 3 (on Itanium 2).  Poor scheduling of
29 C these instructions causes a 10 cycle replay trap on Itanium.
30
31 C TODO
32 C  * Optimize function entry and feed-in code.
33
34 C INPUT PARAMETERS
C Symbolic names for the argument registers r32-r35:
C   rp   address the result limbs are stored to (target of every st8)
C   up   address the source limbs are loaded from (source of every ld8)
C   n    number of 8-byte limbs to process
C   cnt  shift count in bits
35 define(`rp',`r32')
36 define(`up',`r33')
37 define(`n',`r34')
38 define(`cnt',`r35')
39
C tnc is a scratch register; the entry code sets it to 64 - cnt, the
C complementary shift count used by every BSH.
40 define(`tnc',`r9')
41
C Direction-dependent macros, selected by whichever OPERATION_ symbol is
C defined when this file is preprocessed:
C   FSH   shift instruction applied to a limb with count cnt
C   BSH   opposite shift applied to the neighbouring limb with count tnc
C   UPD   byte step used to post-increment up and rp after each access
C   POFF  byte offset added to up to initialise the lfetch pointer r11
C   PUPD  byte step applied to r11 by each lfetch in the main loop
C   func  symbol name handed to PROLOGUE and EPILOGUE
C The lshift variant uses negative steps (the operand is walked downwards from
C its last limb); the rshift variant uses positive steps.
42 ifdef(`OPERATION_lshift',`
43         define(`FSH',`shl')
44         define(`BSH',`shr.u')
45         define(`UPD',`-8')
46         define(`POFF',`-512')
47         define(`PUPD',`-32')
48         define(`func',`mpn_lshift')
49 ')
50 ifdef(`OPERATION_rshift',`
51         define(`FSH',`shr.u')
52         define(`BSH',`shl')
53         define(`UPD',`8')
54         define(`POFF',`512')
55         define(`PUPD',`32')
56         define(`func',`mpn_rshift')
57 ')
58
C ----------------------------------------------------------------------------
C func  (expands to mpn_lshift or mpn_rshift, see the OPERATION_ macros)
C
C Shifts the n-limb operand at up by cnt bits and stores n limbs at rp.
C Every stored limb except the final one is
C       FSH(limb, cnt) | BSH(next limb, tnc)      with tnc = 64 - cnt
C and the final stored limb is FSH(last limb loaded, cnt) alone.
C
C In:    rp (r32), up (r33), n (r34), cnt (r35)
C Out:   r8 = BSH(first limb loaded, tnc), the bits shifted out of the operand
C Uses:  r2 (copy of ar.lc), r9 (tnc), r10, r11, r14-r27, p6-p8, p14, p15,
C        ar.lc (restored from r2 before returning)
C
C Structure: the entry code dispatches on n mod 4 to one of four feed-in
C sequences (.Lb00, .Lb01, .Lb10, .Lb11).  Each feed-in sequence either
C finishes a short operand through the shared tail (.Lr1 to .Lr7) or primes
C the software pipeline and joins the four-limbs-per-trip main loop.
C
C NOTE(review): presumably callers guarantee n >= 1 and 0 < cnt < 64; verify
C against the callers.  With n = 0 the code would fall into .Lb00 and process
C four limbs.
C ----------------------------------------------------------------------------
59 MULFUNC_PROLOGUE(mpn_lshift mpn_rshift)
60
61 ASM_START()
62 PROLOGUE(func)
C Unwind annotation: ar.lc is kept in r2 (copied by the mov.i in the first
C bundle below and written back just before the return).
63         .prologue
64         .save           ar.lc, r2
65         .body
C 32-bit ABI only: addp4 is applied to both pointers, n is sign-extended and
C cnt is zero-extended from 32 bits.
66 ifdef(`HAVE_ABI_32',
67 `       addp4           rp = 0, rp              C                       M I
68         addp4           up = 0, up              C                       M I
69         sxt4            n = n                   C                       M I
70         zxt4            cnt = cnt               C                       I
71         ;;
72 ')
73
C Entry: classify n, set up the pointers and the loop counter.
C   p14 = (n > 4), p15 = (n <= 4)
C   r14 = n mod 4, tested below to set p6 (== 1), p7 (== 2), p8 (== 3)
C   r2  = saved ar.lc
C   r15 = n - 1, tnc = 64 - cnt, r16 = n - 5
C   n   = (n - 5) >> 2 as an unsigned shift; this value is then moved to ar.lc
C   lshift only: up and rp are advanced by 8 * (n - 1) so that they address the
C   last limb, matching the negative UPD step.
C   r11 = up + POFF is the lfetch pointer; r10 = first limb, up stepped by UPD.
C If none of p6, p7, p8 is set (n mod 4 == 0) execution falls into .Lb00.
74  {.mmi; cmp.lt          p14, p15 = 4, n         C                       M I
75         and             r14 = 3, n              C                       M I
76         mov.i           r2 = ar.lc              C                       I0
77 }{.mmi; add             r15 = -1, n             C                       M I
78         sub             tnc = 64, cnt           C                       M I
79         add             r16 = -5, n
80         ;;
81 }{.mmi; cmp.eq          p6, p0 = 1, r14         C                       M I
82         cmp.eq          p7, p0 = 2, r14         C                       M I
83         shr.u           n = r16, 2              C                       I0
84 }{.mmi; cmp.eq          p8, p0 = 3, r14         C                       M I
85 ifdef(`OPERATION_lshift',
86 `       shladd          up = r15, 3, up         C                       M I
87         shladd          rp = r15, 3, rp')       C                       M I
88         ;;
89 }{.mmi; add             r11 = POFF, up          C                       M I
90         ld8             r10 = [up], UPD         C                       M01
91         mov.i           ar.lc = n               C                       I0
92 }{.bbb;
93    (p6) br.dptk         .Lb01
94    (p7) br.dptk         .Lb10
95    (p8) br.dptk         .Lb11
96         ;;
97 }
98
C n mod 4 == 0.  Three more limbs are loaded (four in total with r10).  With
C p14 (n > 4) control moves to .grt4; otherwise the four limbs are combined
C here and the stores are done by the tail starting at .Lr4.
99 .Lb00:  ld8             r19 = [up], UPD
100         ;;
101         ld8             r16 = [up], UPD
102         ;;
103         ld8             r17 = [up], UPD
104         BSH             r8 = r10, tnc           C function return value
105   (p14) br.cond.dptk    .grt4
106
107         FSH             r24 = r10, cnt
108         BSH             r25 = r19, tnc
109         ;;
110         FSH             r26 = r19, cnt
111         BSH             r27 = r16, tnc
112         ;;
113         FSH             r20 = r16, cnt
114         BSH             r21 = r17, tnc
115         ;;
116         or              r14 = r25, r24
117         FSH             r22 = r17, cnt
C NOTE(review): r23 is not read anywhere between here and the return via .Lr4.
118         BSH             r23 = r10, tnc
119         br              .Lr4
120
C n mod 4 == 0 and n > 4: prime the pipeline while loading four further limbs,
C then enter the main loop at .Ltop if ar.lc is nonzero, else go to .Lbot.
121 .grt4:  FSH             r24 = r10, cnt
122         BSH             r25 = r19, tnc
123         ;;
124         ld8             r18 = [up], UPD
125         FSH             r26 = r19, cnt
126         BSH             r27 = r16, tnc
127         ;;
128         ld8             r19 = [up], UPD
129         FSH             r20 = r16, cnt
130         BSH             r21 = r17, tnc
131         ;;
132         ld8             r16 = [up], UPD
133         FSH             r22 = r17, cnt
134         BSH             r23 = r18, tnc
135         ;;
136         or              r14 = r25, r24
137         ld8             r17 = [up], UPD
138         br.cloop.dpnt   .Ltop
139         br              .Lbot
140
C n mod 4 == 1.  With p15 (n <= 4) only the return value and the single result
C limb r22 are computed, and .Lr1 stores it and returns.
141 .Lb01:
142   (p15) BSH             r8 = r10, tnc           C function return value I
143   (p15) FSH             r22 = r10, cnt          C                       I
144   (p15) br.cond.dptk    .Lr1                    C return                B
145
C n mod 4 == 1 and n > 4: load four more limbs.  If ar.lc is nonzero continue
C at .grt5 (which joins the loop at .LL01); otherwise finish via .Lr5.
146 .grt1:  ld8             r18 = [up], UPD
147         ;;
148         ld8             r19 = [up], UPD
149         BSH             r8 = r10, tnc           C function return value
150         ;;
151         ld8             r16 = [up], UPD
152         FSH             r22 = r10, cnt
153         BSH             r23 = r18, tnc
154         ;;
155         ld8             r17 = [up], UPD
156         br.cloop.dpnt   .grt5
157         ;;
158
159         FSH             r24 = r18, cnt
160         BSH             r25 = r19, tnc
161         ;;
162         or              r15 = r23, r22
163         FSH             r26 = r19, cnt
164         BSH             r27 = r16, tnc
165         ;;
166         FSH             r20 = r16, cnt
167         BSH             r21 = r17, tnc
168         br              .Lr5
169
170 .grt5:  FSH             r24 = r18, cnt
171         BSH             r25 = r19, tnc
172         ;;
173         ld8             r18 = [up], UPD
174         FSH             r26 = r19, cnt
175         BSH             r27 = r16, tnc
176         ;;
177         ld8             r19 = [up], UPD
178         FSH             r20 = r16, cnt
179         BSH             r21 = r17, tnc
180         ;;
181         or              r15 = r23, r22
182         ld8             r16 = [up], UPD
183         br              .LL01
184
185
C n mod 4 == 2.  One more limb is loaded.  With p14 (n > 4) control moves to
C .grt2; otherwise the two limbs are combined here and stored via .Lr2.
186 .Lb10:  ld8             r17 = [up], UPD
187   (p14) br.cond.dptk    .grt2
188
189         BSH             r8 = r10, tnc           C function return value
190         ;;
191         FSH             r20 = r10, cnt
192         BSH             r21 = r17, tnc
193         ;;
194         or              r14 = r21, r20
195         FSH             r22 = r17, cnt
196         br              .Lr2                    C return
197
C n mod 4 == 2 and n > 4: load four more limbs.  If ar.lc is nonzero continue
C at .grt6 (which joins the loop at .LL10); otherwise finish via .Lr6.
198 .grt2:  ld8             r18 = [up], UPD
199         BSH             r8 = r10, tnc           C function return value
200         ;;
201         ld8             r19 = [up], UPD
202         FSH             r20 = r10, cnt
203         BSH             r21 = r17, tnc
204         ;;
205         ld8             r16 = [up], UPD
206         FSH             r22 = r17, cnt
207         BSH             r23 = r18, tnc
208         ;;
209         ld8             r17 = [up], UPD
210         br.cloop.dpnt   .grt6
211         ;;
212
213         or              r14 = r21, r20
214         FSH             r24 = r18, cnt
215         BSH             r25 = r19, tnc
216         ;;
217         FSH             r26 = r19, cnt
218         BSH             r27 = r16, tnc
219         br              .Lr6
220
221 .grt6:  or              r14 = r21, r20
222         FSH             r24 = r18, cnt
223         BSH             r25 = r19, tnc
224         ;;
225         ld8             r18 = [up], UPD
226         FSH             r26 = r19, cnt
227         BSH             r27 = r16, tnc
228         ;;
229         ld8             r19 = [up], UPD
230         br              .LL10
231
232
C n mod 4 == 3.  Two more limbs are loaded.  With p14 (n > 4) control moves to
C .grt3; otherwise the three limbs are combined here and stored via .Lr3.
233 .Lb11:  ld8             r16 = [up], UPD
234         ;;
235         ld8             r17 = [up], UPD
236         BSH             r8 = r10, tnc           C function return value
237   (p14) br.cond.dptk    .grt3
238         ;;
239
240         FSH             r26 = r10, cnt
241         BSH             r27 = r16, tnc
242         ;;
243         FSH             r20 = r16, cnt
244         BSH             r21 = r17, tnc
245         ;;
246         or              r15 = r27, r26
247         FSH             r22 = r17, cnt
248         br              .Lr3                    C return
249
C n mod 4 == 3 and n > 4: load four more limbs.  If ar.lc is nonzero continue
C at .grt7 (which joins the loop at .LL11); otherwise finish via .Lr7.
250 .grt3:  ld8             r18 = [up], UPD
251         FSH             r26 = r10, cnt
252         BSH             r27 = r16, tnc
253         ;;
254         ld8             r19 = [up], UPD
255         FSH             r20 = r16, cnt
256         BSH             r21 = r17, tnc
257         ;;
258         ld8             r16 = [up], UPD
259         FSH             r22 = r17, cnt
260         BSH             r23 = r18, tnc
261         ;;
262         ld8             r17 = [up], UPD
263         br.cloop.dpnt   .grt7
264
265         or              r15 = r27, r26
266         FSH             r24 = r18, cnt
267         BSH             r25 = r19, tnc
268         br              .Lr7
269
270 .grt7:  or              r15 = r27, r26
271         FSH             r24 = r18, cnt
272         BSH             r25 = r19, tnc
273         ld8             r18 = [up], UPD
274         br              .LL11
275
C Main loop: each trip stores four result limbs, loads four source limbs and
C issues one lfetch through r11 (stepped by PUPD).  The four bundle pairs are
C identical in shape and rotate through the registers r14/r15 (values to
C store), r16-r19 (loaded limbs) and r20-r27 (shifted halves).  .LL11, .LL10
C and .LL01 are mid-loop entry points used by the feed-in code above.
C br.cloop repeats the loop while ar.lc is nonzero.
276 C *** MAIN LOOP START ***
277         ALIGN(32)
278 .Ltop:
279  {.mmi; st8             [rp] = r14, UPD         C M2
280         or              r15 = r27, r26          C M3
281         FSH             r24 = r18, cnt          C I0
282 }{.mmi; ld8             r18 = [up], UPD         C M1
283         lfetch          [r11], PUPD
284         BSH             r25 = r19, tnc          C I1
285         ;; }
286 .LL11:
287  {.mmi; st8             [rp] = r15, UPD
288         or              r14 = r21, r20
289         FSH             r26 = r19, cnt
290 }{.mmi; ld8             r19 = [up], UPD
291         nop.m           0
292         BSH             r27 = r16, tnc
293         ;; }
294 .LL10:
295  {.mmi; st8             [rp] = r14, UPD
296         or              r15 = r23, r22
297         FSH             r20 = r16, cnt
298 }{.mmi; ld8             r16 = [up], UPD
299         nop.m           0
300         BSH             r21 = r17, tnc
301         ;; }
302 .LL01:
303  {.mmi; st8             [rp] = r15, UPD
304         or              r14 = r25, r24
305         FSH             r22 = r17, cnt
306 }{.mib; ld8             r17 = [up], UPD
307         BSH             r23 = r18, tnc
308         br.cloop.dptk   .Ltop
309         ;; }
310
311 C *** MAIN LOOP END ***
312
C Tail: drains the pipeline without issuing further loads.  .Lbot stores
C eight result limbs in total; a label .LrK (K = 7 .. 1) is entered with K
C result limbs still to be stored.  The final store at .Lr1 writes r22, the
C FSH of the last limb loaded.  ar.lc is then restored from r2 and the
C function returns with the result in r8.
313 .Lbot:  or              r15 = r27, r26
314         FSH             r24 = r18, cnt
315         BSH             r25 = r19, tnc
316         st8             [rp] = r14, UPD
317         ;;
318 .Lr7:   or              r14 = r21, r20
319         FSH             r26 = r19, cnt
320         BSH             r27 = r16, tnc
321         st8             [rp] = r15, UPD
322         ;;
323 .Lr6:   or              r15 = r23, r22
324         FSH             r20 = r16, cnt
325         BSH             r21 = r17, tnc
326         st8             [rp] = r14, UPD
327         ;;
328 .Lr5:   st8             [rp] = r15, UPD
329         or              r14 = r25, r24
330         FSH             r22 = r17, cnt
331         ;;
332 .Lr4:   or              r15 = r27, r26
333         st8             [rp] = r14, UPD
334         ;;
335 .Lr3:   or              r14 = r21, r20
336         st8             [rp] = r15, UPD
337         ;;
338 .Lr2:   st8             [rp] = r14, UPD
339         ;;
340 .Lr1:   st8             [rp] = r22, UPD         C                       M23
341         mov             ar.lc = r2              C                       I0
342         br.ret.sptk.many b0                     C                       B
343 EPILOGUE(func)
344 ASM_END()