1 dnl AMD64 mpn_bdiv_q_1, mpn_pi1_bdiv_q_1 -- schoolbook Hensel division by
2 dnl 1-limb divisor, returning quotient only.
4 dnl Copyright 2001, 2002, 2004, 2005, 2006, 2009 Free Software Foundation,
7 dnl This file is part of the GNU MP Library.
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of the GNU Lesser General Public License as published
11 dnl by the Free Software Foundation; either version 3 of the License, or (at
12 dnl your option) any later version.
14 dnl The GNU MP Library is distributed in the hope that it will be useful, but
15 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
17 dnl License for more details.
19 dnl You should have received a copy of the GNU Lesser General Public License
20 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
22 include(`../config.m4')
39 C di r8 just mpn_pi1_bdiv_q_1
40 C shift r9 just mpn_pi1_bdiv_q_1
C mpn_bdiv_q_1: the visible code derives a shift count in rcx and a 64-bit
C inverse of the divisor in r8 (the register the header comments list as
C `di' for mpn_pi1_bdiv_q_1).
C
C NOTE(review): only part of this routine is visible in this excerpt.  No
C argument moves, register save or exit path are shown, so the notes below
C describe the visible instructions only -- confirm against the full file.
C
C Registers as used by the visible code:
C   rax  divisor on entry to L(odd) and L(evn), then scratch for the inverse
C   rbx  copy of the divisor, multiplied in at every refinement step
C   rcx  shift count: cleared first, written by bsf at L(evn)
C   rdx  table address, then scratch for the inverse
C   r8   finished 64-bit inverse
C   r11  second copy of the divisor ("d without twos")
C NOTE(review): rbx is callee-saved in the common x86-64 ABIs; its save and
C restore are not visible here -- confirm they exist.
46 PROLOGUE(mpn_bdiv_q_1)
50 xor R32(%rcx), R32(%rcx) C shift count
C Branch is taken when CF is clear.  The instruction that sets CF from the
C divisor is not visible in this excerpt.
54 jnc L(evn) C skip bsfq unless divisor is even
56 L(odd): mov %rax, %rbx
C Keep 7 bits to use as the table index.  The comment says d/2; the halving
C shift is not visible in this excerpt.
58 and $127, R32(%rax) C d/2, 7 bits
C NOTE(review): the next two lines both load the address of
C binvert_limb_table into rdx (GOT-relative, then absolute).  Presumably they
C are alternatives selected by a conditional (e.g. PIC vs. non-PIC) whose
C guard lines are not shown -- confirm.
61 mov binvert_limb_table@GOTPCREL(%rip), %rdx
63 movabs $binvert_limb_table, %rdx
C One byte from the table, zero-extended, is the 8-bit starting inverse.
66 movzbl (%rdx,%rax), R32(%rax) C inv 8 bits
68 mov %rbx, %r11 C d without twos
C Three refinement steps, each computing inv = 2*inv - inv*inv*d.  The inline
C comments mark the inverse widening 8 -> 16 -> 32 -> 64 bits.  The result
C alternates registers: edx after step 1, eax after step 2, r8 after step 3.
C Steps 1 and 2 use 32-bit operations; step 3 uses full 64-bit registers.
70 lea (%rax,%rax), R32(%rdx) C 2*inv
71 imul R32(%rax), R32(%rax) C inv*inv
72 imul R32(%rbx), R32(%rax) C inv*inv*d
73 sub R32(%rax), R32(%rdx) C inv = 2*inv - inv*inv*d, 16 bits
75 lea (%rdx,%rdx), R32(%rax) C 2*inv
76 imul R32(%rdx), R32(%rdx) C inv*inv
77 imul R32(%rbx), R32(%rdx) C inv*inv*d
78 sub R32(%rdx), R32(%rax) C inv = 2*inv - inv*inv*d, 32 bits
80 lea (%rax,%rax), %r8 C 2*inv
81 imul %rax, %rax C inv*inv
82 imul %rbx, %rax C inv*inv*d
83 sub %rax, %r8 C inv = 2*inv - inv*inv*d, 64 bits
C Even-divisor path: bsf writes the index of the lowest set bit of the
C divisor into rcx, i.e. its count of trailing zero bits.
C NOTE(review): the shift that strips those bits and the transfer back to
C L(odd) are presumably on lines not shown here -- confirm.
87 L(evn): bsf %rax, %rcx
C mpn_pi1_bdiv_q_1: the visible code reads limbs from up (rsi), shifts them
C right by the shift count, subtracts the running carry, multiplies by the
C inverse in r8 and stores each quotient limb to rp (rdi).
C
C NOTE(review): only part of this routine is visible in this excerpt.  No
C entry argument moves, loop label, loop control, carry update or exit code
C are shown, so the notes below describe the visible instructions only --
C confirm against the full file.
C
C Registers as used by the visible code:
C   rax  current limb, then the quotient limb after the imul at L(ent)
C   rbx  carry bit, 0 or 1 (cleared before the loop)
C   rcx  shift count; its low byte is the count for shrd/shr
C   rdx  next limb for shrd on entry; carry limb after each mul
C   rsi  up, advanced to "up end"
C   rdi  rp, advanced to "rp end"
C   r8   inverse (`di' in the header comments)
C   r9   scratch for the next source limb inside the loop
C   r10  limb index/counter
C   r11  factor for the carry-limb mul (presumably the divisor d -- confirm)
92 PROLOGUE(mpn_pi1_bdiv_q_1)
99 mov (%rsi), %rax C up[0]
104 mov 8(%rsi), %rdx C up[1]
C Advance both pointers by r10 limbs (8 bytes each) so the loop can index
C from the end.  The comment below documents r10 as negative inside the loop;
C the instruction that negates it is not visible in this excerpt.
105 lea (%rsi,%r10,8), %rsi C up end
106 lea (%rdi,%r10,8), %rdi C rp end
C Shift the first limb right by the shift count, filling the vacated high
C bits from up[1].
109 shrd R8(%rcx), %rdx, %rax
C Start with no carry bit.
111 xor R32(%rbx), R32(%rbx)
117 C rbx carry bit, 0 or 1
122 C r10 counter, limbs, negative
C rdx:rax = rax * r11.  Only the high half (rdx) is used; rax is reloaded on
C the next line.
124 mul %r11 C carry limb in rdx
C Fetch the limb at index r10 and the one above it, then shift right by the
C shift count as above.
125 mov (%rsi,%r10,8), %rax
126 mov 8(%rsi,%r10,8), %r9
127 shrd R8(%rcx), %r9, %rax
C NOTE(review): no instruction that recomputes rbx from the borrows of the
C two subtractions below is visible in this excerpt -- confirm that the full
C file carries the borrow forward.
129 sub %rbx, %rax C apply carry bit
131 sub %rdx, %rax C apply carry limb
C Quotient limb = low 64 bits of rax * inverse, stored at rp index r10.
133 L(ent): imul %r8, %rax
134 mov %rax, (%rdi,%r10,8)
C Final limb: same carry handling, but only (%rsi) is loaded and no higher
C limb is fetched to shift in.  The shift and the store for this limb are not
C visible in this excerpt.
138 mul %r11 C carry limb in rdx
139 mov (%rsi), %rax C up high limb
141 sub %rbx, %rax C apply carry bit
142 sub %rdx, %rax C apply carry limb
C L(one): shifts the limb in rax right by the shift count.  Presumably this
C is the single-limb path; the branch that reaches it and the code that
C follows are not shown -- confirm.
148 L(one): shr R8(%rcx), %rax