1 dnl Intel Pentium-4 mpn_divexact_1 -- mpn by limb exact division.
3 dnl Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or
8 dnl modify it under the terms of the GNU Lesser General Public License as
9 dnl published by the Free Software Foundation; either version 3 of the
10 dnl License, or (at your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful,
13 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 dnl Lesser General Public License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 include(`../config.m4')
23 C P4: 19.0 cycles/limb
26 C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
29 C Pairs of movd's are used to avoid unaligned loads. Despite the loads not
30 C being on the dependent chain and there being plenty of cycles available,
31 C using an unaligned movq on every second iteration measured about 23 c/l.
33 C Using divl for size==1 seems a touch quicker than mul-by-inverse. The mul
34 C will be about 9+2*4+2*2+10*4+19+12 = 92 cycles latency, though some of
35 C that might be hidden by out-of-order execution, whereas divl is around 60.
36 C At size==2 an extra 19 for the mul versus 60 for the divl will see the mul
39 defframe(PARAM_DIVISOR,16)
40 defframe(PARAM_SIZE, 12)
41 defframe(PARAM_SRC, 8)
42 defframe(PARAM_DST, 4)
47 PROLOGUE(mpn_divexact_1)
54 movl PARAM_DIVISOR, %ecx
75 bsfl %ecx, %ecx C trailing twos
77 shrl %cl, %eax C d = divisor without twos
79 movd %ecx, %mm7 C shift
83 andl $127, %eax C d/2, 7 bits
86 LEA( binvert_limb_table, %ecx)
87 movzbl (%eax,%ecx), %eax C inv 8 bits
89 movzbl binvert_limb_table(%eax), %eax C inv 8 bits
98 pmuludq %mm5, %mm5 C inv*inv
102 pmuludq %mm6, %mm5 C inv*inv*d
103 paddd %mm0, %mm0 C 2*inv
107 psubd %mm5, %mm0 C inv = 2*inv - inv*inv*d
111 pmuludq %mm0, %mm0 C inv*inv
114 psrlq $32, %mm4 C 0x00000000FFFFFFFF
118 pmuludq %mm6, %mm0 C inv*inv*d
119 paddd %mm5, %mm5 C 2*inv
123 pxor %mm1, %mm1 C initial carry limb
127 psubd %mm0, %mm5 C inv = 2*inv - inv*inv*d
129 ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS
130 pushl %eax FRAME_pushl()
135 popl %eax FRAME_popl()')
137 pxor %mm0, %mm0 C initial carry bit
140 C The dependent chain here is as follows.
143 C psubq s = (src-cbit) - climb 2
144 C pmuludq q = s*inverse 8
145 C pmuludq prod = q*divisor 8
146 C psrlq climb = high(prod) 2
150 C Yet the loop measures 19.0 c/l, so obviously there's something gained
151 C there over a straight reading of the chip documentation.
154 C eax src, incrementing
156 C ecx dst, incrementing
157 C edx counter, size-1 iterations
161 C mm4 0x00000000FFFFFFFF
172 pand %mm4, %mm2 C src
173 psubq %mm0, %mm2 C src - cbit
175 psubq %mm1, %mm2 C src - cbit - climb
177 psrlq $63, %mm0 C new cbit
179 pmuludq %mm5, %mm2 C s*inverse
180 movd %mm2, (%ecx) C q
184 pmuludq %mm2, %mm1 C q*divisor
185 psrlq $32, %mm1 C new climb
193 psrlq %mm7, %mm2 C src
194 psubq %mm0, %mm2 C src - cbit
196 psubq %mm1, %mm2 C src - cbit - climb
198 pmuludq %mm5, %mm2 C s*inverse
199 movd %mm2, (%ecx) C q