1 dnl AMD64 mpn_redc_1 -- Montgomery reduction with a one-limb modular inverse.
3 dnl Copyright 2004, 2008 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or
8 dnl modify it under the terms of the GNU Lesser General Public License as
9 dnl published by the Free Software Foundation; either version 3 of the
10 dnl License, or (at your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful,
13 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 dnl Lesser General Public License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 include(`../config.m4')
32 C * Handle certain sizes, e.g., 1, 2, 3, 4, 8, with single-loop code.
33 C The code for 1, 2, 3, 4 should perhaps be completely register based.
34 C * Perhaps align outer loops.
35 C * The sub_n at the end leaks side-channel data. How do we fix that?
36 C * Write mpn_add_n_sub_n computing R = A + B - C. It should run at 2 c/l.
37 C * We could software pipeline the IMUL stuff, by putting it before the
38 C outer loops and before the end of the outer loops. The last outer
39 C loop iteration would then compute an unneeded product, but it is at
40 C least not a stray read from up[], since it is at up[n].
41 C * Can we fold the final add_n and sub_n into the loops themselves, somehow?
46 define(`param_mp',`%rdx') C 3rd SysV argument (modulus pointer); presumably copied to another reg since mul clobbers rdx -- confirm against full source
52 define(`nneg', `%r12') C callee-saved scratch; the "up -= n" uses below indicate it holds -n
64 sub $8, %rsp C maintain ABI required rsp alignment
66 lea (param_mp,n,8), mp C mp += n
67 lea (up,n,8), up C up += n (pointers now past the arrays; indexed back via nneg = -n)
82 mov 16(up,nneg,8), %rbp C up[0] -- NOTE(review): +16 bias suggests a path-specific offset; confirm against full source
101 L(lo1): add %r10, (up,i,8) C accumulate %r10 into up[i]
105 L(mi1): xor %r10d, %r10d C clear r10 (32-bit xor zero-extends and breaks the dependency)
127 L(ed1): add %r10, (up) C loop-exit fixup: add remaining %r10 into the limb at up
133 L(n1): mov %r14, 16(up,nneg,8) C up[0] -- store reduced limb %r14 back
141 L(b0): C lea (mp), mp -- no-op bias, kept for symmetry with b3/b2 which bias mp by -8/-16
144 mov 16(up,nneg,8), %rbp C up[0]
155 L(lo0): add %r10, (up,i,8) C accumulate %r10 into up[i]
164 L(mi0): mov 8(mp,i,8), %rax C rax = mp[i+1]; presumably the next mul operand -- confirm
181 L(ed0): add %r10, (up) C loop-exit fixup: add remaining %r10 into the limb at up
187 mov %r14, 16(up,nneg,8) C up[0] -- store reduced limb %r14 back
196 L(b3): lea -8(mp), mp C bias mp by -8 so the shared loop indexing lines up for this n mod 4 residue (presumably; cf. b0/b2 biases)
199 mov 24(up,nneg,8), %rbp C up[0]
209 L(lo3): add %r10, (up,i,8) C accumulate %r10 into up[i]
223 L(mi3): mov 16(mp,i,8), %rax C rax = mp[i+2] under this path's bias; presumably the next mul operand -- confirm
235 L(ed3): add %r10, 8(up) C loop-exit fixup, offset +8 matching this path's bias
241 mov %r14, 24(up,nneg,8) C up[0] -- store reduced limb %r14 back
249 L(b2): lea -16(mp), mp C bias mp by -16 for this n mod 4 residue (presumably; cf. b0/b3 biases)
252 mov 32(up,nneg,8), %rbp C up[0]
264 L(lo2): add %r10, (up,i,8) C accumulate %r10 into up[i]
286 L(mi2): xor %ebx, %ebx C clear rbx (32-bit xor zero-extends and breaks the dependency)
290 L(ed2): add %r10, 16(up) C loop-exit fixup, offset +16 matching this path's bias
296 mov %r14, 32(up,nneg,8) C up[0] -- store reduced limb %r14 back
305 lea (mp,nneg,8), mp C restore entry mp (undo the "mp += n" done at entry)
307 C cy = mpn_add_n (rp, up, up - n, n);
309 lea (up,nneg,8), up C up -= n
310 lea (up,nneg,8), %rdx C rdx = up - n [up entry value]
311 mov rp, nneg C preserve rp over first call (nneg is callee-saved and no longer needed as -n)
312 mov 8(%rsp), %rcx C pass entry n
315 test R32(%rax), R32(%rax) C set flags from mpn_add_n's carry return; presumably gates the conditional subtraction -- branch not shown here
318 C mpn_sub_n (rp, rp, mp, n);
323 mov 8(%rsp), %rcx C pass entry n
328 pop n C just increment rsp (value discarded; undoes the entry "sub $8, %rsp")