1 dnl x86-64 mpn_divrem_1 -- mpn by limb division.
3 dnl Copyright 2004, 2005, 2007, 2008, 2009 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of the GNU Lesser General Public License as published
9 dnl by the Free Software Foundation; either version 3 of the License, or (at
10 dnl your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful, but
13 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15 dnl License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 include(`../config.m4')
26 C P6 core2 25 24.5 19.3
27 C P6 corei7 21.5 20.7 18
31 C * Compute the inverse without relying on the div instruction, for example
32 C with Newton's method and mulq, or perhaps with the faster fdiv.
34 C * Optimize for Core 2.
36 C The code for unnormalized divisors works also for normalized divisors, but
37 C for some reason it runs really slowly (on K8) for that case. Use special
38 C code until we can address this. The Intel Atom is also affected, but
39 C understandably (shld slowness).
C Defining this symbol selects the ifdef-guarded code inside mpn_divrem_1
C further down.  That ifdef only tests whether the symbol is defined, so the
C value 1 given here is not examined by it.
40 define(`SPECIAL_CODE_FOR_NORMALIZED_DIVISOR',1)
43 C mpn_divrem_1 (mp_ptr qp, mp_size_t fn,
44 C mp_srcptr np, mp_size_t nn, mp_limb_t d)
47 C mpn_preinv_divrem_1 (mp_ptr qp, mp_size_t fn,
48 C mp_srcptr np, mp_size_t nn, mp_limb_t d,
49 C mp_limb_t dinv, int cnt)
C Names for the incoming argument registers.  Going by the prototype comments
C above, fn, np and nn are the 2nd, 3rd and 4th arguments and dinv is the 6th,
C which matches the System V AMD64 integer argument order (rdi, rsi, rdx, rcx,
C r8, r9).  NOTE(review): confirm that no other calling convention is targeted
C by this file.
53 define(`fn_param', `%rsi')
54 define(`up_param', `%rdx')
55 define(`un_param', `%rcx')
57 define(`dinv', `%r9') C only for mpn_preinv_divrem_1
58 C shift passed on stack C only for mpn_preinv_divrem_1
66 C rax rbx rcx rdx rsi rdi rbp r8 r9 r10 r11 r12 r13 r14 r15
C Entry point for mpn_preinv_divrem_1.  Per the prototype comment earlier in
C this file it takes qp, fn, np, nn and d like mpn_divrem_1, plus dinv and cnt.
72 PROLOGUE(mpn_preinv_divrem_1)
81 add fn_param, un_param C un_param += fn_param (AT&T order, destination last)
84 lea -8(qp,un_param,8), qp C qp = qp + 8*un_param - 8; presumably the last of un_param 8-byte limbs -- confirm
94 PROLOGUE(mpn_divrem_1)
103 add fn_param, un_param
107 lea -8(qp,un_param,8), qp
108 xor R32(%rbp), R32(%rbp)
111 ifdef(`SPECIAL_CODE_FOR_NORMALIZED_DIVISOR',`
118 mov -8(up,un,8), %rbp
131 div d C FREE rax rdx rcx r9 r10 r11
137 L(nloop): C cycK8 cycP6 cycP4
138 mov (up,un,8), %r10 C
140 mul dinv C 0,13 0,19 0,45
141 add %r10, %rax C 4 8 12
142 adc %rbp, %rdx C 5 9 13
143 mov %rax, %rbp C 5 9 13
144 mov %rdx, %r13 C 6 11 23
145 imul d, %rdx C 6 11 23
146 sub %rdx, %r10 C 10 16 33
148 add %r10, %rax C 11 17 34
149 cmp %rbp, %r10 C 11 17 34
150 cmovb %r10, %rax C 12 18 35
154 L(nok): mov %r13, (qp) C
170 mov -8(up,un,8), %rax
186 div d C FREE rax rdx r9 r10 r11
192 mov -8(up,un,8), %rbp
218 L(uok): mov %r13, (qp)
240 L(69): mov %r13, (qp)
257 L(floop): C cycK8 cycP6 cycP4
267 cmovb %rdx, %rax C 11