1 dnl x86-64 mpn_divrem_2 -- Divide an mpn number by a normalized 2-limb number.
3 dnl Copyright 2007, 2008 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of the GNU Lesser General Public License as published
9 dnl by the Free Software Foundation; either version 3 of the License, or (at
10 dnl your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful, but
13 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15 dnl License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 include(`../config.m4')
30 C * Perhaps compute the inverse without relying on divq? Could either use
31 C Newton's method and mulq, or perhaps the faster fdiv.
32 C * The loop has not been carefully tuned, nor analysed for critical path
33 C length. It seems that 20 c/l is a bit long, compared to the 13 c/l for mpn_divrem_1.
35 C * Clean up. This code is really crude.
41 define(`up_param', `%rdx')
42 define(`un_param', `%rcx')
48 C rax rbx rcx rdx rsi rdi rbp r8 r9 r10 r11 r12 r13 r14 r15
54 PROLOGUE(mpn_divrem_2)
57 lea (%rdx,%rcx,8), %rax
70 xor R32(%r15), R32(%r15)
79 lea -3(%rcx,%r13), %rbx C un + fn - 3
105 C rax rbx rcx rdx rsi rdi rbp r8 r9 r10 r11 r12 r13 r14 r15
106 C n2 un n1 dinv qp d0 d1 up fn msl
107 C n2 un -d1 n1 dinv XX XX
110 lea (%rbp,%rbx,8), %rbp
119 mov %r9, %rax C di ncp
122 mov %rax, %r10 C q0 5
127 lea (%rdx, %r14), %rbx C n1 -= ... 7
129 xor R32(%r14), R32(%r14) C
134 L(19): sub %r8, %r14 C ncp
139 xor R32(%rdx), R32(%rdx) C
141 mov %r8, %rax C d0 ncp
142 adc $-1, %rdx C mask 14
143 add %rdx, %rdi C q-- 15
144 and %rdx, %rax C d0 or 0 15
145 and %r11, %rdx C d1 or 0 15
150 L(bck): mov %rdi, (%rbp) C
158 lea (%rbp,%rbx,8), %rbp
164 mov %rax, %r14 C 0, 19
168 mov %rax, %rbx C q0 5
170 lea 1(%rdx), %r10 C 6
174 xor R32(%r9), R32(%r9) C
180 L(13): sub %r8, %r9 C ncp
195 L(bck): mov %r10, (%rbp) C