1 dnl IA-64 mpn_hamdist -- mpn hamming distance.
3 dnl Copyright 2003, 2004, 2005 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of the GNU Lesser General Public License as published
9 dnl by the Free Software Foundation; either version 3 of the License, or (at
10 dnl your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful, but
13 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15 dnl License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 include(`../config.m4')
C Register aliases used by the code below:
C   u0-u3, v0-v3   destinations of the ld8 loads made through up and vp
C   x0-x3          xor of a loaded u limb with the matching v limb
C   c0-c3          destinations of popcnt applied to x0-x3
31 define(`u0',`r16') define(`u1',`r17') define(`u2',`r18') define(`u3',`r19')
32 define(`v0',`r20') define(`v1',`r21') define(`v2',`r22') define(`v3',`r23')
33 define(`x0',`r24') define(`x1',`r25') define(`x2',`r26') define(`x3',`r27')
34 define(`c0',`r28') define(`c1',`r29') define(`c2',`r30') define(`c3',`r31')
C Function entry.  The two addp4 instructions rewrite up and vp in place.
C NOTE review: the leading m4 quote on the first addp4 line indicates that
C these two lines belong to a quoted m4 argument whose opening is outside
C this excerpt, presumably a 32-bit ABI conditional; confirm in the full file.
42 ` addp4 up = 0, up C M I
43 addp4 vp = 0, vp C M I
C Load the first limb of each operand into r10 and r11, post-incrementing
C both pointers by 8, and keep the incoming ar.lc in r2 so that it can be
C written back before the final return.
48 {.mmi; ld8 r10 = [up], 8 C load first ulimb M01
49 ld8 r11 = [vp], 8 C load first vlimb M01
50 mov.i r2 = ar.lc C save ar.lc I0
C r14 = n & 3 selects the feed-in entry point.  p15 is set when 4 < n; each
C entry point tests p15 to decide whether to branch to its .grtN
C continuation, which loads further limbs.
51 }{.mmi; and r14 = 3, n C M I
52 cmp.lt p15, p0 = 4, n C small count? M I
C p6, p7 and p8 flag r14 equal to 1, 2 and 3 respectively.
55 }{.mmi; cmp.eq p6, p0 = 1, r14 C M I
56 cmp.eq p7, p0 = 2, r14 C M I
57 cmp.eq p8, p0 = 3, r14 C M I
C Dispatch on n & 3.  When r14 is 0 none of p6-p8 is set and none of these
C branches is taken.
59 (p6) br.dptk .Lb01 C B
60 (p7) br.dptk .Lb10 C B
61 (p8) br.dptk .Lb11 C B
C Feed-in path .Lb00.  No visible branch targets this label, so it appears
C to be reached by falling through the n & 3 dispatch, i.e. when n & 3 is 0.
C The first limb pair is already in r10 and r11; three more pairs are loaded
C into u1-u3 and v1-v3, each load post-incrementing its pointer by 8.
65 .Lb00: ld8 u1 = [up], 8 C M01
66 ld8 v1 = [vp], 8 C M01
68 xor x0 = r10, r11 C M I
70 ld8 u2 = [up], 8 C M01
71 ld8 v2 = [vp], 8 C M01
75 ld8 u3 = [up], 8 C M01
76 ld8 v3 = [vp], 8 C M01
C p15 set means 4 < n: continue at .grt4, which loads further limbs.
C NOTE review: the code for the not-taken case is not visible in this
C excerpt; confirm in the full file that it does not fall into .grt4.
79 (p15) br.cond.dptk .grt4 C B
C .grt4: load four further limb pairs.  The xor into x3 consumes the u3 and
C v3 values loaded above before those registers are reloaded.
89 .grt4: ld8 u0 = [up], 8 C M01
90 ld8 v0 = [vp], 8 C M01
93 ld8 u1 = [up], 8 C M01
94 ld8 v1 = [vp], 8 C M01
97 ld8 u2 = [up], 8 C M01
98 ld8 v2 = [vp], 8 C M01
100 xor x3 = u3, v3 C M I
102 ld8 u3 = [up], 8 C M01
103 ld8 v3 = [vp], 8 C M01
105 xor x0 = u0, v0 C M I
C br.cloop is the counted-loop branch controlled by ar.lc.  The write that
C sets ar.lc for it is not visible in this excerpt.
106 br.cloop.dpnt .grt8 C B
C Not taken: no further loads follow here, only the xor of the pair already
C in u1 and v1.
109 xor x1 = u1, v1 C M I
C .grt8: taken case, load the next limb pair into u0 and v0.
C NOTE review: how control leaves this sequence is not visible here; confirm
C in the full file that it does not fall through into .Lb01.
112 .grt8: ld8 u0 = [up], 8 C M01
113 ld8 v0 = [vp], 8 C M01
115 xor x1 = u1, v1 C M I
C Feed-in path .Lb01, entered from the dispatch when n & 3 is 1.
C x3 receives the xor of the first limb pair held in r10 and r11.
119 .Lb01: xor x3 = r10, r11 C M I
C p15 set means 4 < n: continue at .grt1.  Otherwise return to the caller
C through b0.  The instruction that produces the return value for this short
C path is not visible in this excerpt.
121 (p15) br.cond.dptk .grt1 C B
124 br.ret.sptk.many b0 C B
C .grt1: load four further limb pairs into u0-u3 and v0-v3, each load
C post-incrementing its pointer by 8.
126 .grt1: ld8 u0 = [up], 8 C M01
127 ld8 v0 = [vp], 8 C M01
130 ld8 u1 = [up], 8 C M01
131 ld8 v1 = [vp], 8 C M01
134 ld8 u2 = [up], 8 C M01
135 ld8 v2 = [vp], 8 C M01
137 ld8 u3 = [up], 8 C M01
138 ld8 v3 = [vp], 8 C M01
139 xor x0 = u0, v0 C M I
C br.cloop is the counted-loop branch controlled by ar.lc.  The write that
C sets ar.lc for it is not visible in this excerpt.
140 br.cloop.dpnt .grt5 C B
C Not taken: no further loads, only the xors of the pairs already loaded.
C NOTE review: the transfer that follows these xors is not visible here.
142 xor x1 = u1, v1 C M I
145 xor x2 = u2, v2 C M I
148 xor x3 = u3, v3 C M I
C .grt5: taken case.  Each register pair is xored into its x register just
C before that pair is reloaded with the next limbs.
153 .grt5: ld8 u0 = [up], 8 C M01
154 ld8 v0 = [vp], 8 C M01
155 xor x1 = u1, v1 C M I
157 ld8 u1 = [up], 8 C M01
158 ld8 v1 = [vp], 8 C M01
160 xor x2 = u2, v2 C M I
162 ld8 u2 = [up], 8 C M01
163 ld8 v2 = [vp], 8 C M01
165 xor x3 = u3, v3 C M I
167 ld8 u3 = [up], 8 C M01
168 ld8 v3 = [vp], 8 C M01
170 xor x0 = u0, v0 C M I
C Enter the main loop while the ar.lc count allows it.
C NOTE review: the not-taken continuation is not visible in this excerpt;
C confirm in the full file that it does not fall through into .Lb10.
171 br.cloop.dpnt .Loop C B
C Feed-in path .Lb10, entered from the dispatch when n & 3 is 2.
C The second limb pair is loaded into u3 and v3; x2 receives the xor of the
C first pair held in r10 and r11.
175 .Lb10: ld8 u3 = [up], 8 C M01
176 ld8 v3 = [vp], 8 C M01
177 xor x2 = r10, r11 C M I
C p15 set means 4 < n: continue at .grt2.  Otherwise finish the two-limb
C case and return through b0.  The instructions that produce the return
C value for this short path are not visible in this excerpt.
178 (p15) br.cond.dptk .grt2 C B
180 xor x3 = u3, v3 C M I
187 br.ret.sptk.many b0 C B
C .grt2: load four further limb pairs.  The xor into x3 consumes the u3 and
C v3 values loaded at .Lb10 before those registers are reloaded.
189 .grt2: ld8 u0 = [up], 8 C M01
190 ld8 v0 = [vp], 8 C M01
193 ld8 u1 = [up], 8 C M01
194 ld8 v1 = [vp], 8 C M01
198 ld8 u2 = [up], 8 C M01
199 ld8 v2 = [vp], 8 C M01
200 xor x3 = u3, v3 C M I
202 ld8 u3 = [up], 8 C M01
203 ld8 v3 = [vp], 8 C M01
204 xor x0 = u0, v0 C M I
C br.cloop is the counted-loop branch controlled by ar.lc.  The write that
C sets ar.lc for it is not visible in this excerpt.
205 br.cloop.dptk .grt6 C B
C Not taken: no further loads, only the xors of the pairs already loaded.
C NOTE review: the transfer that follows these xors is not visible here.
208 xor x1 = u1, v1 C M I
211 xor x2 = u2, v2 C M I
214 xor x3 = u3, v3 C M I
C .grt6: taken case.  Each register pair is xored into its x register just
C before that pair is reloaded with the next limbs.
C NOTE review: how control leaves this sequence is not visible here; confirm
C in the full file that it does not fall through into .Lb11.
217 .grt6: ld8 u0 = [up], 8 C M01
218 ld8 v0 = [vp], 8 C M01
220 xor x1 = u1, v1 C M I
222 ld8 u1 = [up], 8 C M01
223 ld8 v1 = [vp], 8 C M01
225 xor x2 = u2, v2 C M I
227 ld8 u2 = [up], 8 C M01
228 ld8 v2 = [vp], 8 C M01
230 xor x3 = u3, v3 C M I
C Feed-in path .Lb11, entered from the dispatch when n & 3 is 3.
C The second and third limb pairs are loaded into u2, v2 and u3, v3; x1
C receives the xor of the first pair held in r10 and r11.
234 .Lb11: ld8 u2 = [up], 8 C M01
235 ld8 v2 = [vp], 8 C M01
237 xor x1 = r10, r11 C M I
239 ld8 u3 = [up], 8 C M01
240 ld8 v3 = [vp], 8 C M01
241 xor x2 = u2, v2 C M I
C p15 set means 4 < n: continue at .grt3.  Otherwise finish the three-limb
C case and return through b0.  The instructions that produce the return
C value for this short path are not visible in this excerpt.
242 (p15) br.cond.dptk .grt3 C B
244 xor x3 = u3, v3 C M I
255 br.ret.sptk.many b0 C B
C .grt3: load four further limb pairs.  The xor into x3 consumes the u3 and
C v3 values loaded at .Lb11 before those registers are reloaded.
257 .grt3: ld8 u0 = [up], 8 C M01
258 ld8 v0 = [vp], 8 C M01
261 ld8 u1 = [up], 8 C M01
262 ld8 v1 = [vp], 8 C M01
265 ld8 u2 = [up], 8 C M01
266 ld8 v2 = [vp], 8 C M01
267 xor x3 = u3, v3 C M I
269 ld8 u3 = [up], 8 C M01
270 ld8 v3 = [vp], 8 C M01
272 xor x0 = u0, v0 C M I
C br.cloop is the counted-loop branch controlled by ar.lc.  The write that
C sets ar.lc for it is not visible in this excerpt.
273 br.cloop.dptk .grt7 C B
C Not taken: no further loads, only the xors of the pairs already loaded.
C NOTE review: the transfer that follows these xors is not visible here.
275 xor x1 = u1, v1 C M I
278 xor x2 = u2, v2 C M I
C .grt7: taken case.  Each register pair is xored into its x register just
C before that pair is reloaded with the next limbs.
C NOTE review: how control leaves this sequence is not visible here; confirm
C in the full file that it does not simply fall through into .Loop.
281 .grt7: ld8 u0 = [up], 8 C M01
282 ld8 v0 = [vp], 8 C M01
284 xor x1 = u1, v1 C M I
286 ld8 u1 = [up], 8 C M01
287 ld8 v1 = [vp], 8 C M01
289 xor x2 = u2, v2 C M I
C Main loop.  Each pass loads four limb pairs through up and vp with a post
C increment of 8 per load.  Every load pair is followed by the xor of the
C next register pair in rotation, so each xor consumes values loaded
C earlier, before those registers are reloaded.
C The labels .LL00, .LL11 and .LL10 mark mid-loop positions.  No visible
C branch targets them; presumably the feed-in code enters the loop there
C through branches outside this excerpt.
294 .Loop: ld8 u0 = [up], 8 C M01
295 ld8 v0 = [vp], 8 C M01
298 xor x1 = u1, v1 C M I
301 .LL00: ld8 u1 = [up], 8 C M01
302 ld8 v1 = [vp], 8 C M01
305 xor x2 = u2, v2 C M I
308 .LL11: ld8 u2 = [up], 8 C M01
309 ld8 v2 = [vp], 8 C M01
312 xor x3 = u3, v3 C M I
315 .LL10: ld8 u3 = [up], 8 C M01
316 ld8 v3 = [vp], 8 C M01
319 xor x0 = u0, v0 C M I
C Counted-loop branch: repeat while the ar.lc count allows it.
320 br.cloop.dptk .Loop C B
C Wind-down.  No more loads are issued.  The remaining register pairs are
C xored and every pending x value is run through popcnt into c0-c3.
C The labels .Lcj8 down to .Lcj4 mark later entry points into this sequence.
C No visible branch targets them; presumably the feed-in code jumps here
C through branches outside this excerpt.
C The instructions that sum the c0-c3 counts into the return value are not
C visible in this excerpt.
323 .Lend: popcnt c2 = x2 C I0
325 xor x1 = u1, v1 C M I
327 .Lcj8: popcnt c3 = x3 C I0
329 xor x2 = u2, v2 C M I
331 .Lcj7: popcnt c0 = x0 C I0
333 xor x3 = u3, v3 C M I
335 .Lcj6: popcnt c1 = x1 C I0
338 .Lcj5: popcnt c2 = x2 C I0
341 .Lcj4: popcnt c3 = x3 C I0
C Write back the ar.lc value that was saved in r2 at function entry, then
C return to the caller through b0.
349 mov.i ar.lc = r2 C I0
350 br.ret.sptk.many b0 C B