1 dnl Intel Pentium mpn_divexact_1 -- mpn by limb exact division.
3 dnl Copyright 2001, 2002 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or
8 dnl modify it under the terms of the GNU Lesser General Public License as
9 dnl published by the Free Software Foundation; either version 3 of the
10 dnl License, or (at your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful,
13 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 dnl Lesser General Public License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 include(`../config.m4')
25 C P54: 24.5 30.5 cycles/limb
29 C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
32 C Plain divl is used for small sizes, since the inverse takes a while to
33 C setup. Multiplying works out faster for size>=3 when the divisor is odd,
34 C or size>=4 when the divisor is even. Actually on P55 size==2 for odd or
35 C size==3 for even are about the same speed for both divl or mul, but the
36 C former is used since it will use up less code cache.
38 C The P55 speeds noted above, 23 cycles odd or 28 cycles even, are as
39 C expected. On P54 in the even case the shrdl pairing nonsense (see
40 C mpn/x86/pentium/README) costs 1 cycle, but it's not clear why there's a
41 C further 1.5 slowdown for both odd and even.
43 defframe(PARAM_DIVISOR,16)
44 defframe(PARAM_SIZE, 12)
45 defframe(PARAM_SRC, 8)
46 defframe(PARAM_DST, 4)
48 dnl re-use parameter space
49 define(VAR_INVERSE,`PARAM_DST')
54 PROLOGUE(mpn_divexact_1)
57 movl PARAM_DIVISOR, %eax
60 pushl %esi FRAME_pushl()
61 push %edi FRAME_pushl()
67 addl %ecx, %eax C size if even, size+1 if odd
75 movl -4(%esi,%ecx,4), %eax
79 movl %eax, -4(%edi,%ecx,4)
92 movl PARAM_DIVISOR, %eax
96 ASSERT(nz, `orl %eax, %eax')
98 incl %ecx C shift count
102 leal 1(%eax,%eax), %edx C d
103 andl $127, %eax C d/2, 7 bits
105 pushl %ebx FRAME_pushl()
106 pushl %ebp FRAME_pushl()
113 addl $_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ebp
115 movl binvert_limb_table@GOT(%ebp), %ebp
117 movzbl (%eax,%ebp), %eax
121 movzbl binvert_limb_table(%eax), %eax C inv 8 bits
124 movl %eax, %ebp C inv
125 addl %eax, %eax C 2*inv
127 imull %ebp, %ebp C inv*inv
129 imull %edx, %ebp C inv*inv*d
131 subl %ebp, %eax C inv = 2*inv - inv*inv*d
132 movl PARAM_SIZE, %ebx
135 addl %eax, %eax C 2*inv
137 imull %ebp, %ebp C inv*inv
139 imull %edx, %ebp C inv*inv*d
141 subl %ebp, %eax C inv = 2*inv - inv*inv*d
142 movl %edx, PARAM_DIVISOR C d without twos
144 leal (%esi,%ebx,4), %esi C src end
145 leal (%edi,%ebx,4), %edi C dst end
149 ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS
150 pushl %eax FRAME_pushl()
151 imull PARAM_DIVISOR, %eax
153 popl %eax FRAME_popl()')
155 movl %eax, VAR_INVERSE
156 xorl %ebp, %ebp C initial carry bit
158 movl (%esi,%ebx,4), %eax C src low limb
159 orl %ecx, %ecx C shift
161 movl 4(%esi,%ebx,4), %edx C src second limb (for even)
164 shrdl( %cl, %edx, %eax)
173 C ebx counter, limbs, negative
178 C ebp carry bit, 0 or -1
182 movl (%esi,%ebx,4), %eax
190 imull VAR_INVERSE, %eax
192 movl %eax, (%edi,%ebx,4)
209 C ebx counter, limbs, negative
214 C ebp carry bit, 0 or -1
218 subl %ebp, %edx C carry bit
219 movl -4(%esi,%ebx,4), %eax C src limb
221 movl (%esi,%ebx,4), %ebp C and one above it
223 shrdl( %cl, %ebp, %eax)
225 subl %edx, %eax C carry limb
230 imull VAR_INVERSE, %eax
232 movl %eax, -4(%edi,%ebx,4)
241 movl -4(%esi), %eax C src high limb
246 subl %edx, %eax C no carry if division is exact
248 imull VAR_INVERSE, %eax
250 movl %eax, -4(%edi) C dst high limb
251 nop C protect against cache bank clash