1 dnl Intel Pentium-4 mpn_modexact_1_odd -- mpn by limb exact remainder.
3 dnl Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or
8 dnl modify it under the terms of the GNU Lesser General Public License as
9 dnl published by the Free Software Foundation; either version 3 of the
10 dnl License, or (at your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful,
13 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 dnl Lesser General Public License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 include(`../config.m4')
23 C P4: 19.0 cycles/limb
26 C mp_limb_t mpn_modexact_1_odd (mp_srcptr src, mp_size_t size,
28 C mp_limb_t mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size,
29 C mp_limb_t divisor, mp_limb_t carry);
32 defframe(PARAM_CARRY, 16)
33 defframe(PARAM_DIVISOR,12)
34 defframe(PARAM_SIZE, 8)
35 defframe(PARAM_SRC, 4)
40 PROLOGUE(mpn_modexact_1c_odd)
43 movd PARAM_CARRY, %mm1
50 PROLOGUE(mpn_modexact_1_odd)
53 pxor %mm1, %mm1 C carry limb
55 movl PARAM_DIVISOR, %eax
57 movd PARAM_DIVISOR, %mm7
61 andl $127, %eax C d/2, 7 bits
64 LEA( binvert_limb_table, %edx)
65 movzbl (%eax,%edx), %eax C inv 8 bits
67 movzbl binvert_limb_table(%eax), %eax C inv 8 bits
76 pmuludq %mm6, %mm6 C inv*inv
80 pmuludq %mm7, %mm6 C inv*inv*d
81 paddd %mm0, %mm0 C 2*inv
85 psubd %mm6, %mm0 C inv = 2*inv - inv*inv*d
89 pmuludq %mm0, %mm0 C inv*inv
93 pmuludq %mm7, %mm0 C inv*inv*d
94 paddd %mm6, %mm6 C 2*inv
102 psubd %mm0, %mm6 C inv = 2*inv - inv*inv*d
104 ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS
105 pushl %eax FRAME_pushl()
107 imul PARAM_DIVISOR, %eax
109 popl %eax FRAME_popl()')
111 pxor %mm0, %mm0 C carry bit
114 C The dependent chain here is as follows.
117 C psubq s = (src-cbit) - climb 2
118 C pmuludq q = s*inverse 8
119 C pmuludq prod = q*divisor 8
120 C psrlq climb = high(prod) 2
124 C Yet the loop measures 19.0 c/l, so obviously there's something gained
125 C there over a straight reading of the chip documentation.
128 C eax src, incrementing
141 psubq %mm0, %mm2 C src - cbit
143 psubq %mm1, %mm2 C src - cbit - climb
145 psrlq $63, %mm0 C new cbit
147 pmuludq %mm6, %mm2 C s*inverse
150 pmuludq %mm2, %mm1 C q*divisor
151 psrlq $32, %mm1 C new climb