1 dnl Intel Pentium mpn_modexact_1_odd -- exact division style remainder.
3 dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or
8 dnl modify it under the terms of the GNU Lesser General Public License as
9 dnl published by the Free Software Foundation; either version 3 of the
10 dnl License, or (at your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful,
13 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 dnl Lesser General Public License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 include(`../config.m4')
23 C P5: 23.0 cycles/limb
26 C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
28 C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
29 C mp_limb_t divisor, mp_limb_t carry);
31 C There seems no way to pair up the two lone instructions in the main loop.
33 C The special case for size==1 saves about 20 cycles (non-PIC), making it
34 C the same as mpn_mod_1, and in fact making modexact faster than mod_1 at
39 C Using mmx for the multiplies might be possible, with pmullw and pmulhw
40 C having just 3 cycle latencies, but carry bit handling would probably be
43 defframe(PARAM_CARRY, 16)
44 defframe(PARAM_DIVISOR,12)
45 defframe(PARAM_SIZE, 8)
46 defframe(PARAM_SRC, 4)
48 dnl re-using parameter space
49 define(VAR_INVERSE,`PARAM_SIZE')
54 PROLOGUE(mpn_modexact_1c_odd)
57 movl PARAM_DIVISOR, %eax
58 movl PARAM_CARRY, %edx
65 PROLOGUE(mpn_modexact_1_odd)
68 movl PARAM_DIVISOR, %eax
69 xorl %edx, %edx C carry
74 call L(here) FRAME_pushl()
78 movl (%esp), %ecx C eip
80 addl $_GLOBAL_OFFSET_TABLE_+[.-L(here)], %ecx
81 movl %ebx, (%esp) C push ebx
86 movl binvert_limb_table@GOT(%ecx), %ecx
89 movb (%eax,%ecx), %cl C inv 8 bits
95 pushl %ebx FRAME_pushl()
103 movb binvert_limb_table(%eax), %cl C inv 8 bits
107 addl %ecx, %ecx C 2*inv
109 imull %eax, %eax C inv*inv
111 imull PARAM_DIVISOR, %eax C inv*inv*d
113 subl %eax, %ecx C inv = 2*inv - inv*inv*d
116 addl %ecx, %ecx C 2*inv
118 imull %eax, %eax C inv*inv
120 imull PARAM_DIVISOR, %eax C inv*inv*d
122 subl %eax, %ecx C inv = 2*inv - inv*inv*d
123 pushl %esi FRAME_pushl()
125 ASSERT(e,` C d*inv == 1 mod 2^GMP_LIMB_BITS
127 imull PARAM_DIVISOR, %eax
131 movl %ecx, VAR_INVERSE
133 movl (%esi), %eax C src[0]
134 leal 4(%esi,%ebx,4), %esi C &src[size-1]
136 xorl $-1, %ebx C -(size-1)
141 C The use of VAR_INVERSE means only a store is needed for that value, rather
142 C than a push and pop of say %edi.
146 C eax scratch, low product
147 C ebx counter, limbs, negative
149 C edx scratch, high product
154 mull PARAM_DIVISOR C h:dummy = q*d
156 movl (%esi,%ebx,4), %eax C src[i]
157 subl %ecx, %edx C h -= -c
160 subl %edx, %eax C s = src[i] - h
162 sbbl %ecx, %ecx C new -c (0 or -1)
164 imull VAR_INVERSE, %eax C q = s*i
172 movl (%esi), %eax C src high
173 subl %ecx, %edx C h -= -c
175 cmpl PARAM_DIVISOR, %eax
178 deflit(FRAME_LAST,FRAME)
181 subl %edx, %eax C s = src[i] - h
182 popl %esi FRAME_popl()
184 sbbl %ecx, %ecx C c (0 or -1)
185 popl %ebx FRAME_popl()
187 imull VAR_INVERSE, %eax C q = s*i
189 mull PARAM_DIVISOR C h:dummy = q*d
198 C When high<divisor can skip last step.
201 deflit(`FRAME',FRAME_LAST)
208 subl %eax, %edx C r-s
209 popl %esi FRAME_popl()
211 sbbl %eax, %eax C -1 if underflow
212 movl PARAM_DIVISOR, %ebx
214 andl %ebx, %eax C divisor if underflow
215 popl %ebx FRAME_popl()
217 addl %edx, %eax C addback if underflow
222 C Special case for size==1 using a division for r = c-a mod d.
223 C Could look for a-c<d and save a division sometimes, but that doesn't seem
224 C worth bothering about.
239 movl PARAM_DIVISOR, %ecx
240 popl %ebx FRAME_popl()
242 subl (%edx), %eax C c-a
247 andl %ecx, %edx C b*d+c-a if c<a, or c-a if c>=a