1 dnl Intel P5 mpn_hamdist -- mpn hamming distance.
3 dnl Copyright 2001, 2002 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or
8 dnl modify it under the terms of the GNU Lesser General Public License as
9 dnl published by the Free Software Foundation; either version 3 of the
10 dnl License, or (at your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful,
13 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 dnl Lesser General Public License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 include(`../config.m4')
23 C P5: 14.0 cycles/limb
26 C unsigned long mpn_hamdist (mp_srcptr src1, mp_srcptr src2, mp_size_t size);
28 C It might be possible to shave 1 cycle from the loop, and hence 2
29 C cycles/limb. The xorb is taking 2 cycles, but a separate load and xor
30 C would be 1, if the right schedule could be found (not found so far).
31 C Wanting to avoid potential cache bank clashes makes it tricky.
33 C The slightly strange quoting here helps the renaming done by tune/many.pl.
35 m4_assert_defined(`GSYM_PREFIX')
36 GSYM_PREFIX`'mpn_popcount``'_table')
C Stack parameter slots for the three arguments named in the prototype
C comment above: src1, src2 and size get the values 4, 8 and 12.
C NOTE(review): defframe is defined outside this file; presumably the value
C is a byte offset from the stack pointer at entry, corrected by the FRAME
C bookkeeping -- confirm against the defframe definition.
38 defframe(PARAM_SIZE,12)
39 defframe(PARAM_SRC2, 8)
40 defframe(PARAM_SRC1, 4)
C Register setup followed by the start of the byte-pair loop.
C NOTE(review): this excerpt does not show the function entry, the loads of
C the PARAM_ slots into registers, the loop label, the loop branch or the
C exit code, so the comments below only describe the visible instructions.
C
C Save esi and edi on the stack. They are used further down as the base
C registers of the two byte streams.
49 pushl %esi FRAME_pushl()
C Shift ecx left by one bit, doubling it. The loop addresses memory with
C ecx scaled by 2, so ecx counts 2-byte units.
51 shll %ecx C size in byte pairs
52 pushl %edi FRAME_pushl()
C Setup variant that reaches the table through the global offset table.
C NOTE(review): the conditional selecting between this variant and the
C direct-address variant below is not visible here -- presumably a PIC
C test; confirm.
C ebx and ebp are saved because both get overwritten below.
55 pushl %ebx FRAME_pushl()
56 pushl %ebp FRAME_pushl()
C call pushes the address of the following instruction and popl takes it
C straight back off the stack, leaving the current code address in ebp.
58 call L(here) FRAME_pushl()
61 popl %ebp FRAME_popl()
C Add the displacement from that code address to the global offset table,
C so that ebp holds the address of the table.
64 addl $_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ebp
C Clear all 32 bits of ebx and edx. Only their low bytes bl and dl are
C written by the visible loop code, so each register as a whole stays a
C zero-extended byte and can be used directly as a table index.
66 xorl %ebx, %ebx C byte
67 xorl %edx, %edx C byte
C Replace ebp with the address of TABLE_NAME fetched from its GOT entry.
69 movl TABLE_NAME@GOT(%ebp), %ebp
70 xorl %eax, %eax C total
C In this variant TABLE of a register expands to an access based on ebp and
C indexed by that register.
71 define(TABLE,`(%ebp,$1)')
C Setup variant that addresses the table directly by symbol: same clearing
C of eax, edx and ebx, but only ebx is saved since ebp is not used.
78 xorl %eax, %eax C total
79 pushl %ebx FRAME_pushl()
81 xorl %edx, %edx C byte
82 xorl %ebx, %ebx C byte
C In this variant TABLE of a register expands to the TABLE_NAME symbol
C indexed by that register.
84 define(TABLE,`TABLE_NAME($1)')
88 C The nop after the xorb seems necessary. Although a movb might be
89 C expected to go down the V pipe in the second cycle of the xorb, it
90 C doesn't and costs an extra 2 cycles.
94 C ecx counter, 2*size to 2
C Each visible access reads one byte at offset -1 or -2 from base plus
C 2*ecx, so one pass touches the two bytes just below that address in each
C of the esi and edi streams.
C NOTE(review): presumably esi is src1 and edi is src2; the instructions
C that load them are not in this excerpt.
101 movb -1(%esi,%ecx,2), %bl
104 movb -1(%edi,%ecx,2), %dl
C NOTE(review): dl was loaded on the previous line and is overwritten here
C with no visible use in between; an instruction combining dl into bl
C appears to be missing from this excerpt -- confirm against the full file.
107 movb -2(%esi,%ecx,2), %dl
C XOR the matching byte of the edi stream into dl, leaving a set bit at
C each position where the two bytes differ.
109 xorb -2(%edi,%ecx,2), %dl
C Replace each byte by the table entry it indexes.
C NOTE(review): given the table name, the entry is presumably the count of
C set bits in the index byte -- confirm against the table definition.
112 movb TABLE(%ebx), %bl
115 movb TABLE(%edx), %dl