1 dnl Intel Pentium 4 mpn_popcount, mpn_hamdist -- population count and
2 dnl hamming distance.
4 dnl Copyright 2000, 2001, 2002, 2007 Free Software Foundation, Inc.
6 dnl This file is part of the GNU MP Library.
8 dnl The GNU MP Library is free software; you can redistribute it and/or
9 dnl modify it under the terms of the GNU Lesser General Public License as
10 dnl published by the Free Software Foundation; either version 3 of the
11 dnl License, or (at your option) any later version.
13 dnl The GNU MP Library is distributed in the hope that it will be useful,
14 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
15 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 dnl Lesser General Public License for more details.
18 dnl You should have received a copy of the GNU Lesser General Public License
19 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
21 include(`../config.m4')
24 C cycles/limb: popcount hamdist
25 C P3 model 9 (Banias) ? ?
26 C P3 model 13 (Dothan) 6 6
27 C P4 model 0 (Willamette)
29 C P4 model 2 (Northwood) 8 9
30 C P4 model 3 (Prescott) 8 9
33 C unsigned long mpn_popcount (mp_srcptr src, mp_size_t size);
34 C unsigned long mpn_hamdist (mp_srcptr src, mp_srcptr src2, mp_size_t size);
36 C Loading with unaligned movq's costs an extra 1 c/l and hence is avoided.
37 C Two movd's and a punpckldq seem to be the same speed as an aligned movq,
38 C and using them saves fiddling about with alignment testing on entry.
40 C For popcount there are 13 mmx instructions in the loop, so perhaps 6.5 c/l
41 C might be possible, but 8 c/l relying on out-of-order execution is already
42 C quite reasonable.
44 ifdef(`OPERATION_popcount',,
45 `ifdef(`OPERATION_hamdist',,
46 `m4_error(`Need OPERATION_popcount or OPERATION_hamdist defined
51 `ifdef(`OPERATION_hamdist',`$1')')
55 `ifdef(`OPERATION_popcount',`$1')')
58 defframe(PARAM_SIZE, 12)
59 defframe(PARAM_SRC2, 8)
60 defframe(PARAM_SRC, 4)
61 define(M4_function,mpn_hamdist)
64 defframe(PARAM_SIZE, 8)
65 defframe(PARAM_SRC, 4)
66 define(M4_function,mpn_popcount)
69 MULFUNC_PROLOGUE(mpn_popcount mpn_hamdist)
76 L(rodata_AAAAAAAAAAAAAAAA):
79 L(rodata_3333333333333333):
82 L(rodata_0F0F0F0F0F0F0F0F):
97 movl $0xAAAAAAAA, %edx
101 movl $0x33333333, %edx
105 movl $0x0F0F0F0F, %edx
109 HAM(` movl PARAM_SRC2, %edx')
113 HAM(` movl PARAM_SRC2, %edx')
114 movq L(rodata_AAAAAAAAAAAAAAAA), %mm7
115 movq L(rodata_3333333333333333), %mm6
116 movq L(rodata_0F0F0F0F0F0F0F0F), %mm5
119 pxor %mm4, %mm4 C zero
120 pxor %mm0, %mm0 C total
126 movd (%eax,%ecx,4), %mm1 C src high limb
127 HAM(` movd (%edx,%ecx,4), %mm2
136 C ecx counter, size-1 to 2 or 1, inclusive
139 C mm0 total (low dword)
143 C mm4 0x0000000000000000
144 C mm5 0x0F0F0F0F0F0F0F0F
145 C mm6 0x3333333333333333
146 C mm7 0xAAAAAAAAAAAAAAAA
153 HAM(` movd (%edx), %mm2
164 psubd %mm2, %mm1 C bit pairs
170 paddd %mm2, %mm1 C nibbles
176 paddd %mm2, %mm1 C bytes
179 paddd %mm1, %mm0 C to total
184 C ecx is 0 or -1, meaning respectively that 1 or 0 further limbs remain