1 dnl Intel Pentium-II mpn_divrem_1 -- mpn by limb division.
3 dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or
8 dnl modify it under the terms of the GNU Lesser General Public License as
9 dnl published by the Free Software Foundation; either version 3 of the
10 dnl License, or (at your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful,
13 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 dnl Lesser General Public License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 include(`../config.m4')
23 C P6MMX: 25.0 cycles/limb integer part, 17.5 cycles/limb fraction part.
26 C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
27 C mp_srcptr src, mp_size_t size,
29 C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
30 C mp_srcptr src, mp_size_t size,
31 C mp_limb_t divisor, mp_limb_t carry);
32 C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize,
33 C mp_srcptr src, mp_size_t size,
34 C mp_limb_t divisor, mp_limb_t inverse,
37 C This code is a lightly reworked version of mpn/x86/k7/mmx/divrem_1.asm,
38 C see that file for some comments. It's possible what's here can be improved.
41 dnl MUL_THRESHOLD is the value of xsize+size at which the multiply by
42 dnl inverse method is used, rather than plain "divl"s. Minimum value 1.
44 dnl The different speeds of the integer and fraction parts means that using
45 dnl xsize+size isn't quite right. The threshold wants to be a bit higher
46 dnl for the integer part and a bit lower for the fraction part. (Or what's
47 dnl really wanted is to speed up the integer part!)
49 dnl The threshold is set to make the integer part right. At 4 limbs the
50 dnl div and mul are about the same there, but on the fractional part the
51 dnl mul is much faster.
53 deflit(MUL_THRESHOLD, 4)
dnl  Stack frame layout.  Positive defframe() offsets are incoming stack
dnl  arguments (relative to %esp at function entry); negative offsets are
dnl  locals within the STACK_SPACE area reserved by each prologue.
56 defframe(PARAM_PREINV_SHIFT, 28) dnl mpn_preinv_divrem_1
57 defframe(PARAM_PREINV_INVERSE, 24) dnl mpn_preinv_divrem_1
dnl  PARAM_CARRY shares offset 24 with PARAM_PREINV_INVERSE: the two
dnl  parameters belong to different entry points and are never both live.
58 defframe(PARAM_CARRY, 24) dnl mpn_divrem_1c
59 defframe(PARAM_DIVISOR,20)
60 defframe(PARAM_SIZE, 16)
61 defframe(PARAM_SRC, 12)
62 defframe(PARAM_XSIZE, 8)
63 defframe(PARAM_DST, 4)
dnl  Slots for the callee-saved registers.
65 defframe(SAVE_EBX, -4)
66 defframe(SAVE_ESI, -8)
67 defframe(SAVE_EDI, -12)
68 defframe(SAVE_EBP, -16)
dnl  Locals used by the multiply-by-inverse method.
70 defframe(VAR_NORM, -20)
71 defframe(VAR_INVERSE, -24)
72 defframe(VAR_SRC, -28)
73 defframe(VAR_DST, -32)
74 defframe(VAR_DST_STOP,-36)
dnl  Total bytes reserved by each prologue; matches the lowest negative
dnl  defframe offset above.
76 deflit(STACK_SPACE, 36)
C -----------------------------------------------------------------------------
C mpn_preinv_divrem_1 entry: the caller supplies a pre-computed inverse of
C the normalized divisor and the matching shift count, so the bsrl/divl
C normalization done in mpn_divrem_1 is skipped.
C NOTE(review): this excerpt elides some instructions (the initial loads of
C %esi, %ebx and %edx, and the zeroing of %ecx before the cmovnc, are not
C visible here) -- confirm register commentary against the complete file.
81 PROLOGUE(mpn_preinv_divrem_1)
83 movl PARAM_XSIZE, %ecx
84 subl $STACK_SPACE, %esp FRAME_subl_esp(STACK_SPACE)
93 movl PARAM_DIVISOR, %ebp
98 movl -4(%esi,%ebx,4), %eax C src high limb
99 xorl %edi, %edi C initial carry (if can't skip a div)
103 leal 8(%edx,%ecx,4), %edx C &dst[xsize+2]
106 movl %edx, VAR_DST_STOP C &dst[xsize+2]
C If the high source limb is below the divisor it serves directly as the
C initial remainder and the corresponding quotient limb is 0, saving one
C division step; the cmov pair selects both outcomes without branching.
107 cmpl %ebp, %eax C high cmp divisor
109 cmovc( %eax, %edi) C high is carry if high<divisor
111 cmovnc( %eax, %ecx) C 0 if skip div, src high if not
112 C (the latter in case src==dst)
114 movl %ecx, -12(%edx,%ebx,4) C dst high limb
C The sbbl consumes the carry still set by the cmpl above.
116 sbbl $0, %ebx C skip one division if high<divisor
117 movl PARAM_PREINV_SHIFT, %ecx
119 leal -8(%edx,%ebx,4), %edx C &dst[xsize+size]
122 movl %edx, VAR_DST C &dst[xsize+size]
124 shll %cl, %ebp C d normalized
128 movd %eax, %mm7 C rshift
129 movl PARAM_PREINV_INVERSE, %eax
C -----------------------------------------------------------------------------
C mpn_divrem_1c entry: same as mpn_divrem_1 but with an explicit initial
C carry (partial remainder) supplied by the caller in PARAM_CARRY, loaded
C here into %edx.
C NOTE(review): some instructions between the visible lines are elided in
C this excerpt (e.g. the loads of %esi and %edi).
138 PROLOGUE(mpn_divrem_1c)
140 movl PARAM_CARRY, %edx
142 movl PARAM_SIZE, %ecx
143 subl $STACK_SPACE, %esp
144 deflit(`FRAME',STACK_SPACE)
147 movl PARAM_XSIZE, %ebx
153 movl PARAM_DIVISOR, %ebp
C %edi = &dst[xsize-1], matching the same setup in mpn_divrem_1, before
C falling into the shared code.
158 leal -4(%edi,%ebx,4), %edi
164 C offset 0x31, close enough to aligned
C -----------------------------------------------------------------------------
C mpn_divrem_1 main entry.  Chooses between plain divl loops (small
C operands) and the multiply-by-inverse method once size+xsize reaches
C MUL_THRESHOLD.
165 PROLOGUE(mpn_divrem_1)
168 movl PARAM_SIZE, %ecx
169 movl $0, %edx C initial carry (if can't skip a div)
170 subl $STACK_SPACE, %esp
171 deflit(`FRAME',STACK_SPACE)
174 movl PARAM_DIVISOR, %ebp
177 movl PARAM_XSIZE, %ebx
181 orl %ecx, %ecx C size
186 leal -4(%edi,%ebx,4), %edi C &dst[xsize-1]
187 jz L(no_skip_div) C if size==0
C If src[size-1] < divisor it can serve as the initial remainder, the top
C quotient limb is 0, and one division is skipped (branch-free via cmov).
189 movl -4(%esi,%ecx,4), %eax C src high limb
191 cmpl %ebp, %eax C high cmp divisor
193 cmovc( %eax, %edx) C high is carry if high<divisor
195 cmovnc( %eax, %esi) C 0 if skip div, src high if not
196 C (the latter in case src==dst)
198 movl %esi, (%edi,%ecx,4) C dst high limb
C The sbbl consumes the carry still set by the cmpl above.
200 sbbl $0, %ecx C size-1 if high<divisor
201 movl PARAM_SRC, %esi C reload
C Method selection on the total number of quotient limbs to produce.
214 leal (%ebx,%ecx), %eax C size+xsize
215 cmpl $MUL_THRESHOLD, %eax
216 jae L(mul_by_inverse)
219 jz L(divide_no_integer)
C Plain divl loop over the integer-part limbs, working from the top down.
C NOTE(review): the divl itself and the counter update are elided in this
C excerpt.
222 C eax scratch (quotient)
225 C edx scratch (remainder)
230 movl -4(%esi,%ecx,4), %eax
234 movl %eax, (%edi,%ecx,4)
236 jnz L(divide_integer)
239 L(divide_no_integer):
242 jnz L(divide_fraction)
253 addl $STACK_SPACE, %esp
C Plain divl loop over the xsize fraction-part limbs (numerator limbs are
C implicitly zero there).
259 C eax scratch (quotient)
262 C edx scratch (remainder)
271 movl %eax, -4(%edi,%ebx,4)
273 jnz L(divide_fraction)
279 C -----------------------------------------------------------------------------
C mul_by_inverse: setup for the multiply-by-inverse division method.
C The divisor is left-shifted by l (its leading-zero count, from bsrl) so
C its high bit is set, and the 32-bit inverse floor((b*(b-d)-1)/d) with
C b=2^32 is computed with a single divl, as in the generic invert_limb.
C NOTE(review): several instructions between the visible lines are elided
C in this excerpt (e.g. the setup of edx:eax before the subl/divl pair and
C the size-based dispatch before L(start_two_or_more)).
290 leal 12(%edi), %ebx C &dst[xsize+2], loop dst stop
292 movl %ebx, VAR_DST_STOP
293 leal 4(%edi,%ecx,4), %edi C &dst[xsize+size]
296 movl %ecx, %ebx C size
298 bsrl %ebp, %ecx C 31-l
299 movl %edx, %edi C carry
301 leal 1(%ecx), %eax C 32-l
307 shll %cl, %ebp C d normalized
311 subl %ebp, %edx C (b-d)-1 giving edx:eax = b*(b-d)-1
313 divl %ebp C floor (b*(b-d)-1) / d
326 movl %eax, VAR_INVERSE
327 orl %ebx, %ebx C size
328 leal -12(%esi,%ebx,4), %eax C &src[size-3]
C Two-or-more source limbs: pre-shift the top two limbs into the working
C registers, n2 (high) in %edi and n10 (next two-limb window) in %esi.
333 movl 8(%eax), %esi C src high limb
337 L(start_two_or_more):
338 movl 4(%eax), %edx C src second highest limb
340 shldl( %cl, %esi, %edi) C n2 = carry,high << l
342 shldl( %cl, %edx, %esi) C n10 = high,second << l
345 je L(integer_two_left)
C Exactly one source limb: same pre-shift but with no second limb.
350 shldl( %cl, %esi, %edi) C n2 = carry,high << l
352 shll %cl, %esi C n10 = high << l
353 jmp L(integer_one_left)
357 C Can be here with xsize==0 if mpn_preinv_divrem_1 had size==1 and
358 C skipped a division.
360 shll %cl, %edi C n2 = carry << l
361 movl %edi, %eax C return value for zero_done
369 C -----------------------------------------------------------------------------
C Integer-part main loop: one quotient limb per iteration.  The quotient
C estimate is q1+1 = n2 + 1 + high(m*(n2+n1) + nadj) per Granlund and
C Montgomery's division-by-invariant-integers method, followed by a
C branch-free addback of d when q1+1 overshot by one.
C NOTE(review): this excerpt elides instructions between several visible
C lines (e.g. the mull computing q*d before the sbbl at "n - (q1+1)*d",
C and the quotient store) -- the remainder commentary below reflects only
C what is visible.
371 C This loop runs at about 25 cycles, which is probably sub-optimal, and
372 C certainly more than the dependent chain would suggest. A better loop, or
373 C a better rough analysis of what's possible, would be welcomed.
375 C In the current implementation, the following successively dependent
376 C micro-ops seem to exist.
388 C Lack of registers hinders explicit scheduling and it might be that the
389 C normal out of order execution isn't able to hide enough under the mul
392 C Using sarl/negl to pick out n1 for the n2+n1 stage is a touch faster than
393 C cmov (and takes one uop off the dependent chain). A sarl/andl/addl
394 C combination was tried for the addback (despite the fact it would lengthen
395 C the dependent chain) but found to be no faster.
401 C ebx scratch (nadj, q1)
402 C ecx scratch (src, dst)
408 C mm0 scratch (src qword)
409 C mm7 rshift for normalization
417 andl %eax, %ebx C -n1 & d
420 addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
421 addl %edi, %eax C n2+n1
422 movq (%ecx), %mm0 C next src limb and the one below it
424 mull VAR_INVERSE C m*(n2+n1)
434 addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
436 leal 1(%edi), %ebx C n2+1
438 adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
453 movl VAR_DST_STOP, %eax
455 sbbl %edx, %edi C n - (q1+1)*d
456 movl %esi, %edi C remainder -> n2
457 leal (%ebp,%esi), %edx
459 cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
472 L(integer_loop_done):
475 C -----------------------------------------------------------------------------
C integer_two_left: the last two numerator limbs of the integer part,
C handled outside the main loop.  Same quotient-estimate and addback
C scheme as the loop above.
C NOTE(review): the mull computing q*d and the quotient store are elided
C in this excerpt.
477 C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz
478 C q1_ff special case. This makes the code a bit smaller and simpler, and
479 C costs only 2 cycles (each).
483 C ebx scratch (nadj, q1)
484 C ecx scratch (src, dst)
499 andl %eax, %ebx C -n1 & d
502 addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
503 addl %edi, %eax C n2+n1
505 mull VAR_INVERSE C m*(n2+n1)
507 movd (%ecx), %mm0 C src low limb
509 movl VAR_DST_STOP, %ecx
515 addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
516 leal 1(%edi), %ebx C n2+1
519 adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
535 sbbl %edx, %edi C n - (q1+1)*d
536 movl %esi, %edi C remainder -> n2
537 leal (%ebp,%esi), %edx
539 cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
547 C -----------------------------------------------------------------------------
C integer_one_left: final numerator limb of the integer part.  After this
C limb, the sbbl $0 folds a q1+1 overflow back to q1 (in place of a jz to
C a q1_ff special case, as noted above integer_two_left).  The code then
C either falls into the fraction part (xsize != 0) or unwinds the frame.
C NOTE(review): the mull computing q*d, the quotient store, and the final
C remainder un-normalization/return are elided in this excerpt.
550 C ebx scratch (nadj, q1)
564 movl VAR_DST_STOP, %ecx
566 andl %eax, %ebx C -n1 & d
569 addl %esi, %ebx C nadj = n10 + (-n1 & d), ignoring overflow
570 addl %edi, %eax C n2+n1
572 mull VAR_INVERSE C m*(n2+n1)
580 addl %ebx, %eax C m*(n2+n1) + nadj, low giving carry flag
581 leal 1(%edi), %ebx C n2+1
586 adcl %edx, %ebx C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
588 sbbl $0, %ebx C q1 if q1+1 overflowed
601 movl PARAM_XSIZE, %eax
603 sbbl %edx, %edi C n - (q1+1)*d
604 movl %esi, %edi C remainder -> n2
605 leal (%ebp,%esi), %edx
607 cmovc( %edx, %edi) C n - q1*d if underflow from using q1+1
616 orl %eax, %eax C xsize
630 addl $STACK_SPACE, %esp
638 C -----------------------------------------------------------------------------
C Reached from the integer loop when the q1+1 estimate wrapped to 0,
C i.e. q1 was already the maximum limb value.
640 C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
641 C of q*d is simply -d and the remainder n-q*d = n10+d
653 movl VAR_DST_STOP, %edx
658 leal (%ebp,%esi), %edi C n-q*d remainder -> next n2
661 movd %mm0, %esi C next n10
C Rejoin the main loop's common tail.
666 jmp L(integer_loop_done)
670 C -----------------------------------------------------------------------------
C Fraction-part loop: produces the xsize quotient limbs below the radix
C point.  The numerator limbs here are implicitly zero, so the nadj term
C drops out and the estimate reduces to q1+1 = n2 + 1 + high(m*n2), which
C is why this loop is faster than the integer part (17.5 vs 25 c/l).
C NOTE(review): the mull computing q*d, the loop counter/branch, and the
C final return path are elided in this excerpt.
672 C In the current implementation, the following successively dependent
673 C micro-ops seem to exist.
684 C The loop in fact runs at about 17.5 cycles. Using a sarl/andl/addl for
685 C the addback was found to be a touch slower.
699 movl VAR_DST_STOP, %ecx C &dst[xsize+2]
702 subl $8, %ecx C &dst[xsize]
707 C eax n2, then scratch
708 C ebx scratch (nadj, q1)
709 C ecx dst, decrementing
715 mull VAR_INVERSE C m*n2
727 addl %edx, %ebx C 1 + high(n2<<32 + m*n2) = q1+1
C With the low numerator limb zero, n - (q1+1)*d needs only a negate of
C the product low limb and a borrow into the high limb.
739 negl %eax C low of n - (q1+1)*d
741 sbbl %edx, %edi C high of n - (q1+1)*d, caring only about carry
742 leal (%ebp,%eax), %edx
744 cmovc( %edx, %eax) C n - q1*d if underflow from using q1+1
747 movl %eax, %edi C remainder->n2
750 movl %ebx, (%ecx) C previous q