Upload Tizen:Base source
[external/gmp.git] / mpn / ia64 / hamdist.asm
1 dnl  IA-64 mpn_hamdist -- mpn hamming distance.
2
3 dnl  Copyright 2003, 2004, 2005 Free Software Foundation, Inc.
4 dnl
5 dnl  This file is part of the GNU MP Library.
6
7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl  it under the terms of the GNU Lesser General Public License as published
9 dnl  by the Free Software Foundation; either version 3 of the License, or (at
10 dnl  your option) any later version.
11
12 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
13 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
15 dnl  License for more details.
16
17 dnl  You should have received a copy of the GNU Lesser General Public License
18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20 include(`../config.m4')
21
22 C           cycles/limb
23 C Itanium:       2
24 C Itanium 2:     1
25
26 C INPUT PARAMETERS
C up and vp are the addresses of the two limb vectors being compared; both
C are read 8 bytes at a time with post-increment.  n is the limb count.
27 define(`up', `r32')
28 define(`vp', `r33')
29 define(`n', `r34')
30
C Working registers, four of each so that four limb pairs can be in flight:
C   u0-u3, v0-v3  limbs loaded from up and from vp
C   x0-x3         XOR of a loaded limb pair
C   c0-c3         popcnt of the matching x register
C   s             running total of the c values; this is r8, the register
C                 that holds the result when the routine returns
31 define(`u0',`r16') define(`u1',`r17') define(`u2',`r18') define(`u3',`r19')
32 define(`v0',`r20') define(`v1',`r21') define(`v2',`r22') define(`v3',`r23')
33 define(`x0',`r24') define(`x1',`r25') define(`x2',`r26') define(`x3',`r27')
34 define(`c0',`r28') define(`c1',`r29') define(`c2',`r30') define(`c3',`r31')
35 define(`s',`r8')
36
37
C mpn_hamdist: count the bit positions in which the n-limb vectors at up and
C vp differ.  Each limb pair is XORed, popcnt is taken of each XOR, and the
C counts are summed into s (r8).
C
C Entry code: load the first limb pair, save ar.lc in r2, and dispatch on
C n mod 4 to one of four feed-in sequences that prime the 4-way unrolled loop.
C The first pair is loaded unconditionally, so this presumably relies on
C callers passing n >= 1 -- TODO confirm.
C NOTE(review): .prologue is not followed by .save ar.lc or .body here;
C confirm whether unwind information for the r2 save is wanted.
38 ASM_START()
39 PROLOGUE(mpn_hamdist)
40         .prologue
C With the 32-bit ABI, widen the two pointer arguments with addp4 and
C zero-extend the count before they are used.
41 ifdef(`HAVE_ABI_32',
42 `       addp4           up = 0, up              C                       M I
43         addp4           vp = 0, vp              C                       M I
44         zxt4            n = n                   C                       I
45         ;;
46 ')
47
C r10/r11 = first limb pair, r2 = ar.lc as found on entry.
C r14 = n mod 4 selects the feed-in path.  p15 = (n > 4) tells that path
C whether the operands are longer than four limbs.  n is biased by -5 so
C that the shr.u by 2 done in the feed-in code yields the value for ar.lc.
C Remainders 1, 2 and 3 branch away; remainder 0 falls through into .Lb00.
48  {.mmi; ld8             r10 = [up], 8           C load first ulimb      M01
49         ld8             r11 = [vp], 8           C load first vlimb      M01
50         mov.i           r2 = ar.lc              C save ar.lc            I0
51 }{.mmi; and             r14 = 3, n              C                       M I
52         cmp.lt          p15, p0 = 4, n          C small count?          M I
53         add             n = -5, n               C                       M I
54         ;;
55 }{.mmi; cmp.eq          p6, p0 = 1, r14         C                       M I
56         cmp.eq          p7, p0 = 2, r14         C                       M I
57         cmp.eq          p8, p0 = 3, r14         C                       M I
58 }{.bbb
59   (p6)  br.dptk         .Lb01                   C                       B
60   (p7)  br.dptk         .Lb10                   C                       B
61   (p8)  br.dptk         .Lb11                   C                       B
62 }
63
64
C Feed-in for n mod 4 == 0.  The first limb pair is already in r10/r11.
C Three more pairs are loaded, so x0..x3 end up covering limbs 1..4.
C ar.lc is set to the biased count shifted right by 2, and s is cleared.
65 .Lb00:  ld8             u1 = [up], 8            C                       M01
66         ld8             v1 = [vp], 8            C                       M01
67         shr.u           n = n, 2                C                       I0
68         xor             x0 = r10, r11           C                       M I
69         ;;
70         ld8             u2 = [up], 8            C                       M01
71         ld8             v2 = [vp], 8            C                       M01
72         mov.i           ar.lc = n               C                       I0
73         xor             x1 = u1, v1             C                       M I
74         ;;
75         ld8             u3 = [up], 8            C                       M01
76         ld8             v3 = [vp], 8            C                       M01
77         xor             x2 = u2, v2             C                       M I
78         mov             s = 0                   C                       M I
79   (p15) br.cond.dptk    .grt4                   C                       B
80         ;;
C p15 clear (n = 4): no further loads.  Take popcnt of x0..x2 here and join
C the wind-down at .Lcj4, which counts x3, sums c0..c3 and restores ar.lc.
81         popcnt          c0 = x0                 C                       I0
82         xor             x3 = u3, v3             C                       M I
83         ;;
84         popcnt          c1 = x1                 C                       I0
85         ;;
86         popcnt          c2 = x2                 C                       I0
87         br              .Lcj4                   C                       B
88
C More than four limbs: load the next four pairs while the first four XORs
C are counted.  The xor into x1 and x2 below recompute the values already
C formed above, since u1/v1 and u2/v2 are not reloaded until a later group.
89 .grt4:  ld8             u0 = [up], 8            C                       M01
90         ld8             v0 = [vp], 8            C                       M01
91         xor             x1 = u1, v1             C                       M I
92         ;;
93         ld8             u1 = [up], 8            C                       M01
94         ld8             v1 = [vp], 8            C                       M01
95         xor             x2 = u2, v2             C                       M I
96         ;;
97         ld8             u2 = [up], 8            C                       M01
98         ld8             v2 = [vp], 8            C                       M01
99         popcnt          c0 = x0                 C                       I0
100         xor             x3 = u3, v3             C                       M I
101         ;;
102         ld8             u3 = [up], 8            C                       M01
103         ld8             v3 = [vp], 8            C                       M01
104         popcnt          c1 = x1                 C                       I0
105         xor             x0 = u0, v0             C                       M I
106         br.cloop.dpnt   .grt8                   C                       B
107
C ar.lc was zero (n = 8): nothing more to load, join the wind-down at .Lcj8.
108         popcnt          c2 = x2                 C                       I0
109         xor             x1 = u1, v1             C                       M I
110         br              .Lcj8                   C                       B
111
C ar.lc was nonzero (n >= 12): issue the work of the first loop group here
C and enter the loop at its second group, .LL00.
112 .grt8:  ld8             u0 = [up], 8            C                       M01
113         ld8             v0 = [vp], 8            C                       M01
114         popcnt          c2 = x2                 C                       I0
115         xor             x1 = u1, v1             C                       M I
116         br              .LL00                   C                       B
117
118
C Feed-in for n mod 4 == 1.  x3 gets the XOR of the first limb pair.
C With p15 clear (n = 1) the popcnt of that single XOR is the result.  ar.lc
C has not been written on this path, so the return needs no restore.
119 .Lb01:  xor             x3 = r10, r11           C                       M I
120         shr.u           n = n, 2                C                       I0
121   (p15) br.cond.dptk    .grt1                   C                       B
122         ;;
123         popcnt          r8 = x3                 C                       I0
124         br.ret.sptk.many b0                     C                       B
125
C n >= 5: set ar.lc from the biased and shifted count, clear s, and load
C limb pairs 2..5 into u0..u3 / v0..v3.
126 .grt1:  ld8             u0 = [up], 8            C                       M01
127         ld8             v0 = [vp], 8            C                       M01
128         mov.i           ar.lc = n               C                       I0
129         ;;
130         ld8             u1 = [up], 8            C                       M01
131         ld8             v1 = [vp], 8            C                       M01
132         mov             s = 0                   C                       M I
133         ;;
134         ld8             u2 = [up], 8            C                       M01
135         ld8             v2 = [vp], 8            C                       M01
136         ;;
137         ld8             u3 = [up], 8            C                       M01
138         ld8             v3 = [vp], 8            C                       M01
139         xor             x0 = u0, v0             C                       M I
140         br.cloop.dpnt   .grt5                   C                       B
141
C ar.lc was zero (n = 5): no more loads.  Form the remaining XORs, count
C the first three, and join the wind-down at .Lcj5.
142         xor             x1 = u1, v1             C                       M I
143         ;;
144         popcnt          c3 = x3                 C                       I0
145         xor             x2 = u2, v2             C                       M I
146         ;;
147         popcnt          c0 = x0                 C                       I0
148         xor             x3 = u3, v3             C                       M I
149         ;;
150         popcnt          c1 = x1                 C                       I0
151         br              .Lcj5                   C                       B
152
C n >= 9: load four more pairs while counting the earlier XORs.  If ar.lc
C is still nonzero enter the loop at its top, otherwise go to the wind-down
C at .Lend.
153 .grt5:  ld8             u0 = [up], 8            C                       M01
154         ld8             v0 = [vp], 8            C                       M01
155         xor             x1 = u1, v1             C                       M I
156         ;;
157         ld8             u1 = [up], 8            C                       M01
158         ld8             v1 = [vp], 8            C                       M01
159         popcnt          c3 = x3                 C                       I0
160         xor             x2 = u2, v2             C                       M I
161         ;;
162         ld8             u2 = [up], 8            C                       M01
163         ld8             v2 = [vp], 8            C                       M01
164         popcnt          c0 = x0                 C                       I0
165         xor             x3 = u3, v3             C                       M I
166         ;;
167         ld8             u3 = [up], 8            C                       M01
168         ld8             v3 = [vp], 8            C                       M01
169         popcnt          c1 = x1                 C                       I0
170         xor             x0 = u0, v0             C                       M I
171         br.cloop.dpnt   .Loop                   C                       B
172         br              .Lend                   C                       B
173
174
C Feed-in for n mod 4 == 2.  x2 gets the XOR of the first limb pair and the
C second pair is loaded into u3/v3.
C With p15 clear (n = 2) the two popcnt values are added and returned.  ar.lc
C has not been written on this path, so the return needs no restore.
175 .Lb10:  ld8             u3 = [up], 8            C                       M01
176         ld8             v3 = [vp], 8            C                       M01
177         xor             x2 = r10, r11           C                       M I
178   (p15) br.cond.dptk    .grt2                   C                       B
179         ;;
180         xor             x3 = u3, v3             C                       M I
181         ;;
182         popcnt          c2 = x2                 C                       I0
183         ;;
184         popcnt          c3 = x3                 C                       I0
185         ;;
186         add             s = c2, c3              C                       M I
187         br.ret.sptk.many b0                     C                       B
188
C n >= 6: set ar.lc from the biased and shifted count, clear s, and load
C limb pairs 3..6 into u0..u3 / v0..v3.
189 .grt2:  ld8             u0 = [up], 8            C                       M01
190         ld8             v0 = [vp], 8            C                       M01
191         shr.u           n = n, 2                C                       I0
192         ;;
193         ld8             u1 = [up], 8            C                       M01
194         ld8             v1 = [vp], 8            C                       M01
195         mov.i           ar.lc = n               C                       I0
196         mov             s = 0                   C                       M I
197         ;;
198         ld8             u2 = [up], 8            C                       M01
199         ld8             v2 = [vp], 8            C                       M01
200         xor             x3 = u3, v3             C                       M I
201         ;;
202         ld8             u3 = [up], 8            C                       M01
203         ld8             v3 = [vp], 8            C                       M01
204         xor             x0 = u0, v0             C                       M I
205         br.cloop.dptk   .grt6                   C                       B
206
C ar.lc was zero (n = 6): no more loads, join the wind-down at .Lcj6.
207         popcnt          c2 = x2                 C                       I0
208         xor             x1 = u1, v1             C                       M I
209         ;;
210         popcnt          c3 = x3                 C                       I0
211         xor             x2 = u2, v2             C                       M I
212         ;;
213         popcnt          c0 = x0                 C                       I0
214         xor             x3 = u3, v3             C                       M I
215         br              .Lcj6                   C                       B
216
C ar.lc was nonzero (n >= 10): load three more pairs while counting, then
C enter the loop at its last group, .LL10.
217 .grt6:  ld8             u0 = [up], 8            C                       M01
218         ld8             v0 = [vp], 8            C                       M01
219         popcnt          c2 = x2                 C                       I0
220         xor             x1 = u1, v1             C                       M I
221         ;;
222         ld8             u1 = [up], 8            C                       M01
223         ld8             v1 = [vp], 8            C                       M01
224         popcnt          c3 = x3                 C                       I0
225         xor             x2 = u2, v2             C                       M I
226         ;;
227         ld8             u2 = [up], 8            C                       M01
228         ld8             v2 = [vp], 8            C                       M01
229         popcnt          c0 = x0                 C                       I0
230         xor             x3 = u3, v3             C                       M I
231         br              .LL10                   C                       B
232
233
C Feed-in for n mod 4 == 3.  x1 gets the XOR of the first limb pair; the
C second and third pairs are loaded into u2/v2 and u3/v3.
C With p15 clear (n = 3) the three popcnt values are added and returned.
C ar.lc has not been written on this path, so the return needs no restore.
234 .Lb11:  ld8             u2 = [up], 8            C                       M01
235         ld8             v2 = [vp], 8            C                       M01
236         shr.u           n = n, 2                C                       I0
237         xor             x1 = r10, r11           C                       M I
238         ;;
239         ld8             u3 = [up], 8            C                       M01
240         ld8             v3 = [vp], 8            C                       M01
241         xor             x2 = u2, v2             C                       M I
242   (p15) br.cond.dptk    .grt3                   C                       B
243         ;;
244         xor             x3 = u3, v3             C                       M I
245         ;;
246         popcnt          c1 = x1                 C                       I0
247         ;;
248         popcnt          c2 = x2                 C                       I0
249         ;;
250         popcnt          c3 = x3                 C                       I0
251         ;;
252         add             s = c1, c2              C                       M I
253         ;;
254         add             s = s, c3               C                       M I
255         br.ret.sptk.many b0                     C                       B
256
C n >= 7: set ar.lc from the biased and shifted count, clear s, and load
C limb pairs 4..7 into u0..u3 / v0..v3.
257 .grt3:  ld8             u0 = [up], 8            C                       M01
258         ld8             v0 = [vp], 8            C                       M01
259         mov.i           ar.lc = n               C                       I0
260         ;;
261         ld8             u1 = [up], 8            C                       M01
262         ld8             v1 = [vp], 8            C                       M01
263         mov             s = 0                   C                       M I
264         ;;
265         ld8             u2 = [up], 8            C                       M01
266         ld8             v2 = [vp], 8            C                       M01
267         xor             x3 = u3, v3             C                       M I
268         ;;
269         ld8             u3 = [up], 8            C                       M01
270         ld8             v3 = [vp], 8            C                       M01
271         popcnt          c1 = x1                 C                       I0
272         xor             x0 = u0, v0             C                       M I
273         br.cloop.dptk   .grt7                   C                       B
C ar.lc was zero (n = 7): no more loads, join the wind-down at .Lcj7.
274         popcnt          c2 = x2                 C                       I0
275         xor             x1 = u1, v1             C                       M I
276         ;;
277         popcnt          c3 = x3                 C                       I0
278         xor             x2 = u2, v2             C                       M I
279         br              .Lcj7                   C                       B
280
C ar.lc was nonzero (n >= 11): load two more pairs while counting, then
C enter the loop at its third group, .LL11.
281 .grt7:  ld8             u0 = [up], 8            C                       M01
282         ld8             v0 = [vp], 8            C                       M01
283         popcnt          c2 = x2                 C                       I0
284         xor             x1 = u1, v1             C                       M I
285         ;;
286         ld8             u1 = [up], 8            C                       M01
287         ld8             v1 = [vp], 8            C                       M01
288         popcnt          c3 = x3                 C                       I0
289         xor             x2 = u2, v2             C                       M I
290         br              .LL11                   C                       B
291
292
C Main loop, four limb pairs per iteration.  Each of the four groups
C   - loads one new limb pair,
C   - takes popcnt of an XOR formed in an earlier group,
C   - adds a count produced in an earlier group into s,
C   - forms the XOR of a pair loaded in an earlier group.
C .LL00, .LL11 and .LL10 are the mid-loop entry points used by the feed-in
C code above.  br.cloop repeats the loop while ar.lc is nonzero.
C The nop.b slots are presumably bundle padding -- TODO confirm.
293         ALIGN(32)
294 .Loop:  ld8             u0 = [up], 8            C                       M01
295         ld8             v0 = [vp], 8            C                       M01
296         popcnt          c2 = x2                 C                       I0
297         add             s = s, c3               C                       M I
298         xor             x1 = u1, v1             C                       M I
299         nop.b           1                       C                       -
300         ;;
301 .LL00:  ld8             u1 = [up], 8            C                       M01
302         ld8             v1 = [vp], 8            C                       M01
303         popcnt          c3 = x3                 C                       I0
304         add             s = s, c0               C                       M I
305         xor             x2 = u2, v2             C                       M I
306         nop.b           1                       C                       -
307         ;;
308 .LL11:  ld8             u2 = [up], 8            C                       M01
309         ld8             v2 = [vp], 8            C                       M01
310         popcnt          c0 = x0                 C                       I0
311         add             s = s, c1               C                       M I
312         xor             x3 = u3, v3             C                       M I
313         nop.b           1                       C                       -
314         ;;
315 .LL10:  ld8             u3 = [up], 8            C                       M01
316         ld8             v3 = [vp], 8            C                       M01
317         popcnt          c1 = x1                 C                       I0
318         add             s = s, c2               C                       M I
319         xor             x0 = u0, v0             C                       M I
320         br.cloop.dptk   .Loop                   C                       B
321         ;;
322
C Wind-down: no more loads.  The remaining XORs are formed, counted and
C added into s.  .Lend is reached from the loop; .Lcj8 down to .Lcj4 are
C joined directly by the feed-in code when the total limb count is 8, 7, 6,
C 5 or 4 respectively, each entry skipping the stages that count has no
C data for.
323 .Lend:  popcnt          c2 = x2                 C                       I0
324         add             s = s, c3               C                       M I
325         xor             x1 = u1, v1             C                       M I
326         ;;
327 .Lcj8:  popcnt          c3 = x3                 C                       I0
328         add             s = s, c0               C                       M I
329         xor             x2 = u2, v2             C                       M I
330         ;;
331 .Lcj7:  popcnt          c0 = x0                 C                       I0
332         add             s = s, c1               C                       M I
333         xor             x3 = u3, v3             C                       M I
334         ;;
335 .Lcj6:  popcnt          c1 = x1                 C                       I0
336         add             s = s, c2               C                       M I
337         ;;
338 .Lcj5:  popcnt          c2 = x2                 C                       I0
339         add             s = s, c3               C                       M I
340         ;;
341 .Lcj4:  popcnt          c3 = x3                 C                       I0
342         add             s = s, c0               C                       M I
343         ;;
344         add             s = s, c1               C                       M I
345         ;;
346         add             s = s, c2               C                       M I
347         ;;
C Final add completes the total in s (r8); ar.lc is put back from the copy
C saved in r2 at entry before returning.
348         add             s = s, c3               C                       M I
349         mov.i           ar.lc = r2              C                       I0
350         br.ret.sptk.many b0                     C                       B
351 EPILOGUE()
352 ASM_END()