1 dnl Intel P6 mpn_modexact_1_odd -- exact division style remainder.
3 dnl Copyright 2000, 2001, 2002, 2007 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or
8 dnl modify it under the terms of the GNU Lesser General Public License as
9 dnl published by the Free Software Foundation; either version 3 of the
10 dnl License, or (at your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful,
13 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 dnl Lesser General Public License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 include(`../config.m4')
23 C P6: 10.0 cycles/limb
26 C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
27 C mp_limb_t divisor);
28 C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
29 C mp_limb_t divisor, mp_limb_t carry);
31 C It's not worth skipping a step at the end when high<divisor since the main
32 C loop is only 10 cycles.
34 defframe(PARAM_CARRY, 16)
35 defframe(PARAM_DIVISOR,12)
36 defframe(PARAM_SIZE, 8)
37 defframe(PARAM_SRC, 4)
39 dnl Not enough room under modexact_1 to make these re-use the parameter
40 dnl space, unfortunately.
41 defframe(SAVE_EBX, -4)
42 defframe(SAVE_ESI, -8)
43 defframe(SAVE_EDI, -12)
44 deflit(STACK_SPACE, 12)
49 PROLOGUE(mpn_modexact_1c_odd)
52 movl PARAM_CARRY, %ecx
58 PROLOGUE(mpn_modexact_1_odd)
63 movl PARAM_DIVISOR, %eax
65 subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE)
76 LEA( binvert_limb_table, %edi)
77 movzbl (%eax,%edi), %edi C inv 8 bits
79 movzbl binvert_limb_table(%eax), %edi C inv 8 bits
82 xorl %edx, %edx C initial extra carry
83 leal (%edi,%edi), %eax C 2*inv
85 imull %edi, %edi C inv*inv
90 imull PARAM_DIVISOR, %edi C inv*inv*d
92 subl %edi, %eax C inv = 2*inv - inv*inv*d
93 leal (%eax,%eax), %edi C 2*inv
95 imull %eax, %eax C inv*inv
97 imull PARAM_DIVISOR, %eax C inv*inv*d
99 leal (%esi,%ebx,4), %esi C src end
102 subl %eax, %edi C inv = 2*inv - inv*inv*d
104 ASSERT(e,` C d*inv == 1 mod 2^GMP_LIMB_BITS
105 movl PARAM_DIVISOR, %eax
110 C The dependent chain here is
111 C
112 C subl %edx, %eax       1
113 C imull %edi, %eax      4
114 C mull PARAM_DIVISOR    5
115 C                    ----
116 C total                10
117 C
118 C and this is the measured speed. No special scheduling is necessary, out
119 C of order execution hides the load latency.
122 C eax scratch (src limb)
123 C ebx counter, limbs, negative
124 C ecx carry bit, 0 or 1
125 C edx carry limb, high of last product
126 C esi src end (one past the last limb)
127 C edi inverse of the divisor mod 2^GMP_LIMB_BITS
130 movl (%esi,%ebx,4), %eax
149 leal (%ecx,%edx), %eax
154 addl $STACK_SPACE, %esp