1 dnl Intel Pentium-4 mpn_divexact_1 -- mpn by limb exact division.
3 dnl Copyright 2001, 2002, 2007, 2011 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl Rearranged from mpn/x86/pentium4/sse2/dive_1.asm by Marco Bodrato.
9 dnl The GNU MP Library is free software; you can redistribute it and/or
10 dnl modify it under the terms of the GNU Lesser General Public License as
11 dnl published by the Free Software Foundation; either version 3 of the
12 dnl License, or (at your option) any later version.
14 dnl The GNU MP Library is distributed in the hope that it will be useful,
15 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
16 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 dnl Lesser General Public License for more details.
19 dnl You should have received a copy of the GNU Lesser General Public License
20 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
22 include(`../config.m4')
25 C P4: 19.0 cycles/limb
27 C Pairs of movd's are used to avoid unaligned loads. Despite the loads not
28 C being on the dependent chain and there being plenty of cycles available,
29 C using an unaligned movq on every second iteration measured about 23 c/l.
32 defframe(PARAM_SHIFT, 24)
33 defframe(PARAM_INVERSE,20)
34 defframe(PARAM_DIVISOR,16)
35 defframe(PARAM_SIZE, 12)
36 defframe(PARAM_SRC, 8)
37 defframe(PARAM_DST, 4)
42 C mpn_pi1_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size, mp_limb_t divisor,
43 C mp_limb_t inverse, int shift)
45 PROLOGUE(mpn_pi1_bdiv_q_1)
52 movl PARAM_DIVISOR, %ecx
55 movl PARAM_SHIFT, %ecx
57 movd %ecx, %mm7 C shift
61 movl PARAM_INVERSE, %ecx
65 pxor %mm1, %mm1 C initial carry limb
66 pxor %mm0, %mm0 C initial carry bit
72 psrlq $32, %mm4 C 0x00000000FFFFFFFF
74 C The dependent chain here is as follows.
77 C psubq s = (src-cbit) - climb 2
78 C pmuludq q = s*inverse 8
79 C pmuludq prod = q*divisor 8
80 C psrlq climb = high(prod) 2
84 C Yet the loop measures 19.0 c/l, so obviously there's something gained
85 C there over a straight reading of the chip documentation.
88 C eax src, incrementing
90 C ecx dst, incrementing
91 C edx counter, size-1 iterations
95 C mm4 0x00000000FFFFFFFF
106 pand %mm4, %mm2 C src
107 psubq %mm0, %mm2 C src - cbit
109 psubq %mm1, %mm2 C src - cbit - climb
111 psrlq $63, %mm0 C new cbit
113 pmuludq %mm5, %mm2 C s*inverse
114 movd %mm2, (%ecx) C q
118 pmuludq %mm2, %mm1 C q*divisor
119 psrlq $32, %mm1 C new climb
127 psrlq %mm7, %mm2 C src
128 psubq %mm0, %mm2 C src - cbit
130 psubq %mm1, %mm2 C src - cbit - climb
132 pmuludq %mm5, %mm2 C s*inverse
133 movd %mm2, (%ecx) C q
141 C mp_limb_t mpn_bdiv_q_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
142 C mp_limb_t divisor);
144 PROLOGUE(mpn_bdiv_q_1)
147 movl PARAM_SIZE, %edx
149 movl PARAM_DIVISOR, %ecx
157 bsfl %ecx, %ecx C trailing twos
159 shrl %cl, %eax C d = divisor without twos
161 movd %ecx, %mm7 C shift
165 andl $127, %eax C d/2, 7 bits
168 LEA( binvert_limb_table, %ecx)
169 movzbl (%eax,%ecx), %eax C inv 8 bits
171 movzbl binvert_limb_table(%eax), %eax C inv 8 bits
176 movd %eax, %mm5 C inv
178 movd %eax, %mm0 C inv
180 pmuludq %mm5, %mm5 C inv*inv
184 pmuludq %mm6, %mm5 C inv*inv*d
185 paddd %mm0, %mm0 C 2*inv
189 psubd %mm5, %mm0 C inv = 2*inv - inv*inv*d
193 pmuludq %mm0, %mm0 C inv*inv
196 psrlq $32, %mm4 C 0x00000000FFFFFFFF
200 pmuludq %mm6, %mm0 C inv*inv*d
201 paddd %mm5, %mm5 C 2*inv
205 pxor %mm1, %mm1 C initial carry limb
209 psubd %mm0, %mm5 C inv = 2*inv - inv*inv*d
211 ASSERT(e,` C expect d*inv == 1 mod 2^GMP_LIMB_BITS
212 pushl %eax FRAME_pushl()
217 popl %eax FRAME_popl()')
219 pxor %mm0, %mm0 C initial carry bit