1 dnl AMD K7 mpn_divexact_1 -- mpn by limb exact division.
3 dnl Copyright 2001, 2002, 2004, 2007 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or
8 dnl modify it under the terms of the GNU Lesser General Public License as
9 dnl published by the Free Software Foundation; either version 3 of the
10 dnl License, or (at your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful,
13 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 dnl Lesser General Public License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 include(`../config.m4')
28 C void mpn_divexact_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor);
31 C The dependent chain is mul+imul+sub for 11 cycles and that speed is
32 C achieved with no special effort. The load and shrld latencies are hidden
33 C by out of order execution.
35 C It's a touch faster on size==1 to use the mul-by-inverse than divl.
C Stack frame layout for mpn_divexact_1, given as offsets to defframe.
C
C Positive offsets 4..16 name the four incoming arguments (dst, src, size,
C divisor).  PARAM_DIVISOR is reused as scratch: the code stores the divisor
C with its factors of two removed back into this slot and multiplies by it.
37 defframe(PARAM_DIVISOR,16)
38 defframe(PARAM_SIZE, 12)
39 defframe(PARAM_SRC, 8)
40 defframe(PARAM_DST, 4)
C Negative offsets name slots in the local area.  The SAVE_ slots are
C presumably where ebx, esi, edi and ebp are preserved; the stores are not
C visible in this part of the file -- TODO confirm.
42 defframe(SAVE_EBX, -4)
43 defframe(SAVE_ESI, -8)
44 defframe(SAVE_EDI, -12)
45 defframe(SAVE_EBP, -16)
C VAR_INVERSE receives the computed inverse of the odd divisor, and
C VAR_DST_END receives the dst end pointer so edi can be used as scratch.
46 defframe(VAR_INVERSE, -20)
47 defframe(VAR_DST_END, -24)
C Amount subtracted from esp on entry and added back before returning;
C 24 matches the lowest slot offset above (-24).
49 deflit(STACK_SPACE, 24)
54 PROLOGUE(mpn_divexact_1)
57 movl PARAM_DIVISOR, %eax
58 subl $STACK_SPACE, %esp deflit(`FRAME',STACK_SPACE)
59 movl $-1, %ecx C shift count
67 C If there's usually only one or two trailing zero bits then this
68 C should be faster than bsfl.
75 leal 1(%eax,%eax), %ebx C d without twos
76 andl $127, %eax C d/2, 7 bits
79 LEA( binvert_limb_table, %edx)
80 movzbl (%eax,%edx), %eax C inv 8 bits
82 movzbl binvert_limb_table(%eax), %eax C inv 8 bits
85 leal (%eax,%eax), %edx C 2*inv
86 movl %ebx, PARAM_DIVISOR C d without twos
88 imull %eax, %eax C inv*inv
93 imull %ebx, %eax C inv*inv*d
95 subl %eax, %edx C inv = 2*inv - inv*inv*d
96 leal (%edx,%edx), %eax C 2*inv
98 imull %edx, %edx C inv*inv
100 leal (%esi,%ebp,4), %esi C src end
101 leal (%edi,%ebp,4), %edi C dst end
104 imull %ebx, %edx C inv*inv*d
106 subl %edx, %eax C inv = 2*inv - inv*inv*d
108 ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS
109 pushl %eax FRAME_pushl()
110 imull PARAM_DIVISOR, %eax
112 popl %eax FRAME_popl()')
114 movl %eax, VAR_INVERSE
115 movl (%esi,%ebp,4), %eax C src[0]
120 movl (%esi,%ebp,4), %edx C src[1]
122 shrdl( %cl, %edx, %eax)
124 movl %edi, VAR_DST_END
131 C ebx carry bit, 0 or 1
136 C ebp counter, limbs, negative
138 mull PARAM_DIVISOR C carry limb in edx
140 movl -4(%esi,%ebp,4), %eax
141 movl (%esi,%ebp,4), %edi
143 shrdl( %cl, %edi, %eax)
145 subl %ebx, %eax C apply carry bit
147 movl VAR_DST_END, %edi
149 subl %edx, %eax C apply carry limb
153 imull VAR_INVERSE, %eax
155 movl %eax, -4(%edi,%ebp,4)
160 mull PARAM_DIVISOR C carry limb in edx
162 movl -4(%esi), %eax C src high limb
166 subl %ebx, %eax C apply carry bit
170 subl %edx, %eax C apply carry limb
172 imull VAR_INVERSE, %eax
176 addl $STACK_SPACE, %esp
186 imull VAR_INVERSE, %eax
192 addl $STACK_SPACE, %esp