1 dnl AMD K6 mpn_divexact_1 -- mpn by limb exact division.
3 dnl Copyright 2000, 2001, 2002, 2007 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or
8 dnl modify it under the terms of the GNU Lesser General Public License as
9 dnl published by the Free Software Foundation; either version 3 of the
10 dnl License, or (at your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful,
13 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 dnl Lesser General Public License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 include(`../config.m4')
25 C K6: 10.0 cycles/limb (odd divisor), 12.0 cycles/limb (even divisor)
29 C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor);
32 C A simple divl is used for size==1. This is about 10 cycles faster for an
33 C odd divisor or 20 cycles for an even divisor.
35 C The loops are quite sensitive to code alignment; speeds should be
36 C rechecked (odd and even divisor, pic and non-pic) if contemplating any change.
dnl Names for the four stack arguments. The number given to defframe is the
dnl byte offset of each argument (presumably relative to %esp at function
dnl entry, tracked through the FRAME_pushl() annotations -- confirm against
dnl the defframe definition in the m4 support files).
39 defframe(PARAM_DIVISOR,16)
40 defframe(PARAM_SIZE, 12)
41 defframe(PARAM_SRC, 8)
42 defframe(PARAM_DST, 4)
44 dnl Re-use parameter space: VAR_INVERSE names the same stack slot as
dnl PARAM_DST, so storing the computed inverse overwrites the dst argument.
45 define(VAR_INVERSE,`PARAM_DST')
50 PROLOGUE(mpn_divexact_1)
72 movl PARAM_DIVISOR, %eax
73 pushl %ebx FRAME_pushl()
76 pushl %ebp FRAME_pushl()
80 incl %edx C will get shift+1
83 pushl %esi FRAME_pushl()
85 leal 1(%eax,%eax), %esi C d without twos
86 andl $127, %eax C d/2, 7 bits
89 LEA( binvert_limb_table, %ebp)
90 Zdisp( movzbl, 0,(%eax,%ebp), %eax)
92 movzbl binvert_limb_table(%eax), %eax C inv 8 bits
94 pushl %edi FRAME_pushl()
96 leal (%eax,%eax), %ebp C 2*inv
98 imull %eax, %eax C inv*inv
102 imull %esi, %eax C inv*inv*d
104 subl %eax, %ebp C inv = 2*inv - inv*inv*d
105 leal (%ebp,%ebp), %eax C 2*inv
107 imull %ebp, %ebp C inv*inv
109 movl %esi, PARAM_DIVISOR C d without twos
110 leal (%ebx,%ecx,4), %ebx C src end
112 imull %esi, %ebp C inv*inv*d
114 leal (%edi,%ecx,4), %edi C dst end
117 subl %ebp, %eax C inv = 2*inv - inv*inv*d
118 subl $1, %edx C shift amount, and clear carry
120 ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS
121 pushl %eax FRAME_pushl()
122 imull PARAM_DIVISOR, %eax
124 popl %eax FRAME_popl()')
126 movl %eax, VAR_INVERSE
129 movl (%ebx,%ecx,4), %esi C src low limb
138 C ecx counter, limbs, negative
140 C esi next limb, adjusted for carry
142 C ebp carry bit, 0 or -1
146 movl PARAM_DIVISOR, %eax
147 movl %esi, -4(%edi,%ecx,4)
149 mull %esi C carry limb in edx
151 subl %ebp, %edx C apply carry bit
152 movl (%ebx,%ecx,4), %esi
155 subl %edx, %esi C apply carry limb
156 movl VAR_INVERSE, %edx
158 sbbl %ebp, %ebp C 0 or -1
166 movl %esi, -4(%edi,%ecx,4)
187 Zdisp( movq, 0,(%ebx,%ecx,4), %mm0) C src[0,1]
190 movl VAR_INVERSE, %edx
196 jz L(even_two) C if only two limbs
199 C Out-of-order execution is good enough to hide the load/rshift/movd
200 C latency. Having imul at the top of the loop gives 11.5 c/l instead of 12,
201 C on K6-2. In fact there's only 11 cycles of decode, but nothing running at
202 C 11 has been found. Maybe every second movq being unaligned costs the extra 0.5.
208 C ecx counter, limbs, negative
210 C esi next limb, adjusted for carry
212 C ebp carry bit, 0 or -1
214 C mm0 scratch, source limbs
219 movl %esi, -8(%edi,%ecx,4)
220 movl PARAM_DIVISOR, %eax
222 mull %esi C carry limb in edx
224 movq -4(%ebx,%ecx,4), %mm0
228 subl %ebp, %edx C apply carry bit
230 subl %edx, %esi C apply carry limb
231 movl VAR_INVERSE, %edx
233 sbbl %ebp, %ebp C 0 or -1
240 movd -4(%ebx), %mm0 C src high limb
246 movl PARAM_DIVISOR, %eax
248 mull %esi C carry limb in edx
251 subl %ebp, %edx C apply carry bit
253 movl VAR_INVERSE, %eax
254 subl %edx, %esi C apply carry limb