1 dnl AMD K6 mpn_divexact_1 -- mpn by limb exact division.
3 dnl Copyright 2000-2002, 2007 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
20 dnl or both in parallel, as here.
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
36 C K6: 10.0 (odd divisor)  12.0 (even divisor)  cycles/limb
40 C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor);
43 C A simple divl is used for size==1. This is about 10 cycles faster for an
44 C odd divisor or 20 cycles for an even divisor.
46 C The loops are quite sensitive to code alignment, speeds should be
47 C rechecked (odd and even divisor, pic and non-pic) if contemplating changes.
C Stack-frame offsets (via defframe) for the four cdecl parameters of
C the prototype commented above: dst is pushed last by the caller, so
C it sits nearest the return address at offset 4, divisor farthest at
C offset 16.
50 defframe(PARAM_DIVISOR,16)
51 defframe(PARAM_SIZE, 12)
52 defframe(PARAM_SRC, 8)
53 defframe(PARAM_DST, 4)
C The computed 32-bit inverse overwrites the PARAM_DST stack slot.
C This is safe only once dst has been copied to a register -- dst
C appears to live in %edi by the time VAR_INVERSE is written, though
C the load itself is not visible in this excerpt (TODO confirm).
55 dnl re-use parameter space
56 define(VAR_INVERSE,`PARAM_DST')
61 PROLOGUE(mpn_divexact_1)
C NOTE(review): this excerpt omits a number of original lines (the
C ifdef(PIC) selection, the divisor bit-scan/shift setup, loop labels
C and branches, and the EPILOGUE).  Comments below describe only the
C instructions actually visible; anything that depends on the missing
C lines is hedged.
C Fetch the divisor and preserve the cdecl callee-saved registers.
83	movl	PARAM_DIVISOR, %eax
84	pushl	%ebx	FRAME_pushl()
87	pushl	%ebp	FRAME_pushl()
C %edx presumably holds the trailing-zero count of the divisor from a
C bit-scan that is not visible here -- TODO confirm against full file.
91	incl	%edx C will get shift+1
94	pushl	%esi FRAME_pushl()
C With %eax = odd_part(d)/2 at this point, 2*%eax+1 rebuilds the odd
C part of the divisor in %esi; the low 7 bits of %eax then index the
C 8-bit inverse table.
96	leal	1(%eax,%eax), %esi	C d without twos
97	andl	$127, %eax		C d/2, 7 bits
C 8-bit inverse lookup.  Both the PIC form (LEA + Zdisp) and the
C non-PIC direct form are visible below; the ifdef that selects
C between them is not part of this excerpt.
100	LEA(	binvert_limb_table, %ebp)
101	Zdisp(	movzbl,	0,(%eax,%ebp), %eax)
103	movzbl	binvert_limb_table(%eax), %eax C inv 8 bits
105	pushl	%edi FRAME_pushl()
C Newton iteration inv = 2*inv - inv*inv*d doubles the number of
C correct low bits of the inverse: 8 -> 16 here.
107	leal	(%eax,%eax), %ebp	C 2*inv
109	imull	%eax, %eax		C inv*inv
113	imull	%esi, %eax		C inv*inv*d
115	subl	%eax, %ebp		C inv = 2*inv - inv*inv*d
C Second Newton iteration (16 -> 32 correct bits), interleaved with
C forming the src/dst end pointers (%ecx is the limb count; the loads
C of src into %ebx and dst into %edi are not visible in this excerpt).
116	leal	(%ebp,%ebp), %eax	C 2*inv
118	imull	%ebp, %ebp		C inv*inv
120	movl	%esi, PARAM_DIVISOR	C d without twos
121	leal	(%ebx,%ecx,4), %ebx	C src end
123	imull	%esi, %ebp		C inv*inv*d
125	leal	(%edi,%ecx,4), %edi	C dst end
128	subl	%ebp, %eax		C inv = 2*inv - inv*inv*d
129	subl	$1, %edx		C shift amount, and clear carry
C Debug-build sanity check that d * inv == 1 mod 2^32.
131	ASSERT(e,`	C expect d*inv == 1 mod 2^GMP_LIMB_BITS
132	pushl	%eax	FRAME_pushl()
133	imull	PARAM_DIVISOR, %eax
135	popl	%eax	FRAME_popl()')
C The finished inverse goes to the re-used PARAM_DST slot.
137	movl	%eax, VAR_INVERSE
140	movl	(%ebx,%ecx,4), %esi	C src low limb
C Register roles in the odd-divisor loop (original register table,
C only partially visible here):
149	C ecx	counter, limbs, negative
151	C esi	next limb, adjusted for carry
153	C ebp	carry bit, 0 or -1
C Odd-divisor loop body: the quotient limb in %esi is stored, then
C mull by d yields the carry (high) limb to subtract from the next
C source limb, with the borrow bit kept in %ebp via sbbl.  The imull
C by the inverse that turns the adjusted limb back into a quotient is
C among the lines not visible in this excerpt.
157	movl	PARAM_DIVISOR, %eax
158	movl	%esi, -4(%edi,%ecx,4)
160	mull	%esi C carry limb in edx
162	subl	%ebp, %edx		C apply carry bit
163	movl	(%ebx,%ecx,4), %esi
166	subl	%edx, %esi		C apply carry limb
167	movl	VAR_INVERSE, %edx
169	sbbl	%ebp, %ebp		C 0 or -1
177	movl	%esi, -4(%edi,%ecx,4)
C Even-divisor path: source limbs are fetched two at a time into %mm0
C so they can be right-shifted across limb boundaries; the psrlq/movd
C shifting itself is not visible in this excerpt.
198	Zdisp( movq, 0,(%ebx,%ecx,4), %mm0) C src[0,1]
201	movl	VAR_INVERSE, %edx
207	jz	L(even_two) C if only two limbs
210	C Out-of-order execution is good enough to hide the load/rshift/movd
211	C latency. Having imul at the top of the loop gives 11.5 c/l instead of 12,
212	C on K6-2. In fact there's only 11 of decode, but nothing running at 11 has
213	C been found. Maybe the fact every second movq is unaligned costs the extra
C Register roles in the even-divisor loop:
219	C ecx	counter, limbs, negative
221	C esi	next limb, adjusted for carry
223	C ebp	carry bit, 0 or -1
225	C mm0	scratch, source limbs
C Even-divisor loop body: same quotient/carry recurrence as the odd
C loop, with the next two source limbs streamed through %mm0.
230	movl	%esi, -8(%edi,%ecx,4)
231	movl	PARAM_DIVISOR, %eax
233	mull	%esi C carry limb in edx
235	movq	-4(%ebx,%ecx,4), %mm0
239	subl	%ebp, %edx		C apply carry bit
241	subl	%edx, %esi		C apply carry limb
242	movl	VAR_INVERSE, %edx
244	sbbl	%ebp, %ebp		C 0 or -1
C Final limb(s): fetch the high source limb and run the recurrence one
C last time.  The closing stores, emms, and the EPILOGUE lie past the
C end of this excerpt.
251	movd	-4(%ebx), %mm0 C src high limb
257	movl	PARAM_DIVISOR, %eax
259	mull	%esi C carry limb in edx
262	subl	%ebp, %edx		C apply carry bit
264	movl	VAR_INVERSE, %eax
265	subl	%edx, %esi		C apply carry limb