1 dnl Intel Pentium-4 mpn_divrem_1 -- mpn by limb division.
3 dnl Copyright 1999, 2000, 2001, 2002, 2003, 2004 Free Software Foundation,
6 dnl This file is part of the GNU MP Library.
8 dnl The GNU MP Library is free software; you can redistribute it and/or
9 dnl modify it under the terms of the GNU Lesser General Public License as
10 dnl published by the Free Software Foundation; either version 3 of the
11 dnl License, or (at your option) any later version.
13 dnl The GNU MP Library is distributed in the hope that it will be useful,
14 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
15 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 dnl Lesser General Public License for more details.
18 dnl You should have received a copy of the GNU Lesser General Public License
19 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
21 include(`../config.m4')
24 C P4: 32 cycles/limb integer part, 30 cycles/limb fraction part.
27 C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
28 C mp_srcptr src, mp_size_t size,
30 C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
31 C mp_srcptr src, mp_size_t size,
32 C mp_limb_t divisor, mp_limb_t carry);
33 C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize,
34 C mp_srcptr src, mp_size_t size,
35 C mp_limb_t divisor, mp_limb_t inverse,
40 C The method and nomenclature follow part 8 of "Division by Invariant
41 C Integers using Multiplication" by Granlund and Montgomery, reference in
44 C "m" is written for what is m' in the paper, and "d" for d_norm, which
45 C won't cause any confusion since it's only the normalized divisor that's of
46 C any use in the code. "b" is written for 2^N, the size of a limb, N being
49 C The step "sdword dr = n - 2^N*d + (2^N-1-q1) * d" is instead done as
50 C "n-d - q1*d". This rearrangement gives the same two-limb answer but lets
51 C us have just a psubq on the dependent chain.
53 C For reference, the way the k7 code uses "n-(q1+1)*d" would not suit here,
54 C detecting an overflow of q1+1 when q1=0xFFFFFFFF would cost too much.
58 C mpn_divrem_1 and mpn_preinv_divrem_1 avoid one division if the src high
59 C limb is less than the divisor. mpn_divrem_1c doesn't check for a zero
60 C carry, since in normal circumstances that will be a very rare event.
62 C The test for skipping a division is branch free (once size>=1 is tested).
63 C The store to the destination high limb is 0 when a divide is skipped, or
64 C if it's not skipped then a copy of the src high limb is stored. The
65 C latter is in case src==dst.
67 C There's a small bias towards expecting xsize==0, by having code for
68 C xsize==0 in a straight line and xsize!=0 under forward jumps.
72 C The loop measures 32 cycles, but the dependent chain would suggest it
73 C could be done with 30. Not sure where to start looking for the extras.
77 C If the divisor is normalized (high bit set) then a division step can
78 C always be skipped, since the high destination limb is always 0 or 1 in
79 C that case. It doesn't seem worth checking for this though, since it
80 C probably occurs infrequently.
83 dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by
84 dnl inverse method is used, rather than plain "divl"s. Minimum value 1.
86 dnl The inverse takes about 80-90 cycles to calculate, but after that the
87 dnl multiply is 32 c/l versus division at about 58 c/l.
89 dnl At 4 limbs the div is a touch faster than the mul (and of course
90 dnl simpler), so start the mul from 5 limbs.
92 deflit(MUL_THRESHOLD, 5)
95 defframe(PARAM_PREINV_SHIFT, 28) dnl mpn_preinv_divrem_1
96 defframe(PARAM_PREINV_INVERSE, 24) dnl mpn_preinv_divrem_1
97 defframe(PARAM_CARRY, 24) dnl mpn_divrem_1c
98 defframe(PARAM_DIVISOR,20)
99 defframe(PARAM_SIZE, 16)
100 defframe(PARAM_SRC, 12)
101 defframe(PARAM_XSIZE, 8)
102 defframe(PARAM_DST, 4)
104 dnl re-use parameter space
105 define(SAVE_ESI,`PARAM_SIZE')
106 define(SAVE_EBP,`PARAM_SRC')
107 define(SAVE_EDI,`PARAM_DIVISOR')
108 define(SAVE_EBX,`PARAM_DST')
113 PROLOGUE(mpn_preinv_divrem_1)
116 movl PARAM_SIZE, %ecx
117 xorl %edx, %edx C carry if can't skip a div
123 movl PARAM_DIVISOR, %ebp
128 movl -4(%esi,%ecx,4), %eax C src high limb
131 movl PARAM_XSIZE, %ebx
133 movd PARAM_PREINV_INVERSE, %mm4
135 movd PARAM_PREINV_SHIFT, %mm7 C l
136 cmpl %ebp, %eax C high cmp divisor
138 cmovc( %eax, %edx) C high is carry if high<divisor
139 movd %edx, %mm0 C carry
141 movd %edx, %mm1 C carry
145 cmovnc( %eax, %edx) C 0 if skip div, src high if not
146 C (the latter in case src==dst)
147 leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]
149 movl %edx, (%edi,%ecx,4) C dst high limb
150 sbbl $0, %ecx C skip one division if high<divisor
153 subl PARAM_PREINV_SHIFT, %eax
154 psllq %mm7, %mm5 C d normalized
155 leal (%edi,%ecx,4), %edi C &dst[xsize+size-1]
156 leal -4(%esi,%ecx,4), %esi C &src[size-1]
158 movd %eax, %mm6 C 32-l
165 PROLOGUE(mpn_divrem_1c)
168 movl PARAM_CARRY, %edx
170 movl PARAM_SIZE, %ecx
176 movl PARAM_DIVISOR, %ebp
182 movl PARAM_XSIZE, %ebx
184 leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]
191 PROLOGUE(mpn_divrem_1)
194 movl PARAM_SIZE, %ecx
195 xorl %edx, %edx C initial carry (if can't skip a div)
201 movl PARAM_DIVISOR, %ebp
207 movl PARAM_XSIZE, %ebx
208 leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]
210 orl %ecx, %ecx C size
211 jz L(no_skip_div) C if size==0
212 movl -4(%esi,%ecx,4), %eax C src high limb
214 cmpl %ebp, %eax C high cmp divisor
216 cmovnc( %eax, %edx) C 0 if skip div, src high if not
217 movl %edx, (%edi,%ecx,4) C dst high limb
220 cmovc( %eax, %edx) C high is carry if high<divisor
222 sbbl $0, %ecx C size-1 if high<divisor
235 leal (%ebx,%ecx), %eax C size+xsize
236 leal -4(%esi,%ecx,4), %esi C &src[size-1]
237 leal (%edi,%ecx,4), %edi C &dst[size+xsize-1]
239 cmpl $MUL_THRESHOLD, %eax
240 jae L(mul_by_inverse)
244 jz L(divide_no_integer) C if size==0
247 C eax scratch (quotient)
251 C esi src, decrementing
252 C edi dst, decrementing
264 jnz L(divide_integer)
267 L(divide_no_integer):
269 jnz L(divide_fraction) C if xsize!=0
281 C eax scratch (quotient)
286 C edi dst, decrementing
297 jnz L(divide_fraction)
303 C -----------------------------------------------------------------------------
311 C edi &dst[size+xsize-1]
314 bsrl %ebp, %eax C 31-l
315 movd %edx, %mm0 C carry
316 movd %edx, %mm1 C carry
317 movl %ecx, %edx C size
322 xorl %eax, %ecx C l = leading zeros on d
325 shll %cl, %ebp C d normalized
327 movl %edx, %ecx C size
329 movd %eax, %mm6 C 32-l
335 subl %ebp, %edx C (b-d)-1 so edx:eax = b*(b-d)-1
337 divl %ebp C floor (b*(b-d)-1 / d)
351 C edi &dst[size+xsize-1]
362 psllq %mm7, %mm0 C n2 = carry << l, for size==0
367 movd (%esi), %mm0 C src high limb
369 psrlq %mm6, %mm0 C n2 = high (carry:srchigh << l)
373 C The dependent chain here consists of
376 C 8 pmuludq m*(n1+n2)
377 C 2 paddq n2:nadj + m*(n1+n2)
381 C 2 psrlq high n-(q1+1)*d mask
383 C 2 paddd n2+d addback
387 C But it seems to run at 32 cycles, so presumably there's something else
394 C ecx counter, size-1 to 0
396 C esi src, decrementing
397 C edi dst, decrementing
410 movd -4(%esi), %mm1 C next src limbs
415 psrlq %mm6, %mm1 C n10
417 movq %mm1, %mm2 C n10
418 movq %mm1, %mm3 C n10
419 psrad $31, %mm1 C -n1
420 pand %mm5, %mm1 C -n1 & d
421 paddd %mm2, %mm1 C nadj = n10+(-n1&d), ignore overflow
424 paddd %mm0, %mm2 C n2+n1
425 punpckldq %mm0, %mm1 C n2:nadj
427 pmuludq %mm4, %mm2 C m*(n2+n1)
431 paddq %mm2, %mm1 C n2:nadj + m*(n2+n1)
432 pxor %mm2, %mm2 C break dependency, saves 4 cycles
433 pcmpeqd %mm2, %mm2 C FF...FF
436 psrlq $32, %mm1 C q1 = high(n2:nadj + m*(n2+n1))
438 paddd %mm1, %mm2 C q1+1
439 pmuludq %mm5, %mm1 C q1*d
441 punpckldq %mm0, %mm3 C n = n2:n10
444 psubq %mm5, %mm3 C n - d
448 psubq %mm1, %mm3 C n - (q1+1)*d
450 por %mm3, %mm0 C copy remainder -> new n2
451 psrlq $32, %mm3 C high n - (q1+1)*d, 0 or -1
459 pand %mm5, %mm3 C mask & d
461 paddd %mm3, %mm0 C addback if necessary
488 movd (%esi), %mm1 C src[0]
489 psllq %mm7, %mm1 C n10
491 movq %mm1, %mm2 C n10
492 movq %mm1, %mm3 C n10
493 psrad $31, %mm1 C -n1
494 pand %mm5, %mm1 C -n1 & d
495 paddd %mm2, %mm1 C nadj = n10+(-n1&d), ignore overflow
498 paddd %mm0, %mm2 C n2+n1
499 punpckldq %mm0, %mm1 C n2:nadj
501 pmuludq %mm4, %mm2 C m*(n2+n1)
505 paddq %mm2, %mm1 C n2:nadj + m*(n2+n1)
506 pcmpeqd %mm2, %mm2 C FF...FF
509 psrlq $32, %mm1 C q1 = high(n2:nadj + m*(n2+n1))
510 paddd %mm1, %mm2 C q1
512 pmuludq %mm5, %mm1 C q1*d
513 punpckldq %mm0, %mm3 C n
514 psubq %mm5, %mm3 C n - d
519 psubq %mm1, %mm3 C n - (q1+1)*d
521 por %mm3, %mm0 C remainder -> n2
522 psrlq $32, %mm3 C high n - (q1+1)*d, 0 or -1
530 pand %mm5, %mm3 C mask & d
532 paddd %mm3, %mm0 C addback if necessary
542 jnz L(fraction_some) C if xsize!=0
547 psrld %mm7, %mm0 C remainder
559 C -----------------------------------------------------------------------------
574 C ebx counter, xsize iterations
577 C esi src, decrementing
578 C edi dst, decrementing
592 pmuludq %mm4, %mm0 C m*n2
599 psrlq $32, %mm0 C high(m*n2)
601 paddd %mm1, %mm0 C q1 = high(n2:0 + m*n2)
603 paddd %mm0, %mm2 C q1+1
604 pmuludq %mm5, %mm0 C q1*d
606 psllq $32, %mm1 C n = n2:0
607 psubq %mm5, %mm1 C n - d
611 psubq %mm0, %mm1 C r = n - (q1+1)*d
614 por %mm1, %mm0 C r -> n2
615 psrlq $32, %mm1 C high n - (q1+1)*d, 0 or -1
623 pand %mm5, %mm1 C mask & d
625 paddd %mm1, %mm0 C addback if necessary