dnl  X86-32 and X86-64 mpn_popcount using SSE2.

dnl  Copyright 2006, 2007 Free Software Foundation, Inc.

dnl  This file is part of the GNU MP Library.

dnl  The GNU MP Library is free software; you can redistribute it and/or modify
dnl  it under the terms of the GNU Lesser General Public License as published
dnl  by the Free Software Foundation; either version 3 of the License, or (at
dnl  your option) any later version.

dnl  The GNU MP Library is distributed in the hope that it will be useful, but
dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
dnl  License for more details.

dnl  You should have received a copy of the GNU Lesser General Public License
dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C 32-bit			 popcount	 hamdist
C				cycles/limb	cycles/limb
C P6 model 0-8,10-12		    -
C P6 model 9  (Banias)		    ?
C P6 model 13 (Dothan)		    4
C P4 model 0  (Willamette)	    ?
C P4 model 2  (Northwood)	    3.9
C P4 model 3  (Prescott)	    ?
C P4 model 4  (Nocona)		    ?

C 64-bit			 popcount	 hamdist
C				cycles/limb	cycles/limb
C P4 model 4  (Nocona)		    8
C * Make an mpn_hamdist based on this.  Alignment could either be handled by
C   using movdqu for one operand and movdqa for the other, or by painfully
C   shifting as we go.  Unfortunately, there seems to be no usable shift
C   instruction, except for one that takes an immediate count.  (See the
C   sketch after this list.)
C * It would probably be possible to cut a few cycles/limb using software
C   pipelining.
C * There are 35 decode slots unused by the SSE2 instructions.  Loop control
C   needs just 2 or 3 slots, leaving around 32 slots.  This allows a parallel
C   integer-based popcount.  Such a combined loop would handle 6 limbs in
C   about 30 cycles on K8.
C * We could save a byte or two by using 32-bit operations on areg.
C * Check if using movdqa to a temp and then a register-based pand is faster.
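C A minimal sketch of the movdqu/movdqa hamdist idea, assuming a second
C source pointer `vp' (hypothetical; hamdist is the popcount of the xor
C of the two operands):
C	movdqa	(up), %xmm0
C	movdqu	(vp), %xmm1
C	pxor	%xmm1, %xmm0	C combine operands, then count bits as below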
ifelse(GMP_LIMB_BITS,`32',
`	define(`up',  `%edx')
	define(`n',   `%ecx')
	define(`areg',`%eax')
	define(`breg',`%ebx')
	define(`zero',`%xmm4')
	define(`LIMB32',` $1')
	define(`LIMB64',`dnl')
',`
	define(`up',  `%rdi')
	define(`n',   `%rsi')
	define(`areg',`%rax')
	define(`breg',`%rbx')
	define(`zero',`%xmm8')
	define(`LIMB32',`dnl')
	define(`LIMB64',` $1')
')
define(`mm01010101',`%xmm6')
define(`mm00110011',`%xmm7')
define(`mm00001111',`%xmm2')

define(`GMP_LIMB_BYTES', eval(GMP_LIMB_BITS/8))
define(`LIMBS_PER_XMM',  eval(16/GMP_LIMB_BYTES))
define(`LIMBS_PER_2XMM', eval(32/GMP_LIMB_BYTES))
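C With 32-bit limbs this gives LIMBS_PER_XMM = 4 and LIMBS_PER_2XMM = 8;
C with 64-bit limbs, 2 and 4 respectively.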
undefine(`psadbw')			C override inherited m4 version

C Make cnsts global to work around Apple relocation bug.
define(`cnsts', MPN(popccnsts))
PROLOGUE(mpn_popcount)

LIMB32(`	mov	4(%esp), up	')
LIMB32(`	mov	8(%esp), n	')

	pxor	%xmm3, %xmm3		C zero grand total count
LIMB64(`	pxor	zero, zero	')

LIMB32(`	mov	$cnsts, breg	')
LIMB64(`	movabs	$cnsts, breg	')
	movdqa	-48(breg), mm01010101
	movdqa	-32(breg), mm00110011
	movdqa	-16(breg), mm00001111
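C The cnsts label sits between the two halves of the constant table, so
C the three bit masks above are reached at negative offsets while the
C edge masks further down are reached at non-negative ones.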
	and	$-16, up		C round `up' down to 128-bit boundary
	and	$12, areg		C 32:areg = 0, 4, 8, 12
	pand	64(breg,areg,4), %xmm0
	shr	$m4_log2(GMP_LIMB_BYTES), %eax
	add	areg, n			C compensate n for rounded down `up'
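C The pand used the low-end masks (at breg+64) to clear the limbs below
C `up' from the first aligned read; the shr converts areg from a byte
C offset into the limb count by which n must grow.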
	sub	$LIMBS_PER_XMM, n

	sub	$LIMBS_PER_XMM, n
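C The loop below implements the classic SWAR bit-count reduction; per
C 128-bit word, roughly, in C notation:
C	x = (x & 0x55..55) + ((x >> 1) & 0x55..55)	C 2-bit field sums
C	x = (x & 0x33..33) + ((x >> 2) & 0x33..33)	C 4-bit field sums
C	x = (x & 0x0f..0f) + ((x >> 4) & 0x0f..0f)	C 8-bit field sums
C after which psadbw against zero folds the sixteen byte sums into two
C 64-bit words that are accumulated into %xmm3.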
L(top):	movdqa	(up), %xmm0
L(ent):	movdqa	16(up), %xmm4

	pand	mm01010101, %xmm0
	pand	mm01010101, %xmm4

	pand	mm00110011, %xmm0
	pand	mm00110011, %xmm4
	pand	mm00110011, %xmm1
	pand	mm00110011, %xmm5
LIMB32(`	pxor	zero, zero	')

	sub	$LIMBS_PER_2XMM, n

	pand	mm00001111, %xmm0
	pand	mm00001111, %xmm1

	paddq	%xmm1, %xmm3		C add to grand total

	add	$LIMBS_PER_2XMM, n

	sub	$LIMBS_PER_XMM, n

	shl	$m4_log2(GMP_LIMB_BYTES), n
	pand	(breg,n,4), %xmm0
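C The pand above selects a high-end mask (at breg) to clear the limbs
C read past the operand's last limb.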
	pand	mm01010101, %xmm0
	pand	mm01010101, %xmm4

	pand	mm00110011, %xmm0
	pand	mm00110011, %xmm4
	pand	mm00110011, %xmm1
	pand	mm00110011, %xmm5

LIMB32(`	pxor	zero, zero	')

	pand	mm00001111, %xmm0
	pand	mm00001111, %xmm1

	paddq	%xmm1, %xmm3		C add to grand total
C Add the two 64-bit halves of the grand total counter
L(rt):	movdqa	%xmm3, %xmm0
	movd	%xmm0, areg		C movq avoided due to gas bug
C Three magic constants used for masking out bits
	.byte	0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55
	.byte	0x55,0x55,0x55,0x55,0x55,0x55,0x55,0x55

	.byte	0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33
	.byte	0x33,0x33,0x33,0x33,0x33,0x33,0x33,0x33

	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
	.byte	0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f,0x0f
C Masks for high end of number
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00

	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00

	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00

C Masks for low end of number
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff

	.byte	0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
	.byte	0x00,0x00,0x00,0x00,0xff,0xff,0xff,0xff
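C Each edge mask keeps a whole number of 4-byte words; with 64-bit limbs
C only the 8-byte-aligned entries are used.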