1 dnl AMD K7 mpn_divrem_1, mpn_divrem_1c, mpn_preinv_divrem_1 -- mpn by limb
4 dnl Copyright 1999, 2000, 2001, 2002, 2004 Free Software Foundation, Inc.
6 dnl This file is part of the GNU MP Library.
8 dnl The GNU MP Library is free software; you can redistribute it and/or
9 dnl modify it under the terms of the GNU Lesser General Public License as
10 dnl published by the Free Software Foundation; either version 3 of the
11 dnl License, or (at your option) any later version.
13 dnl The GNU MP Library is distributed in the hope that it will be useful,
14 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
15 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 dnl Lesser General Public License for more details.
18 dnl You should have received a copy of the GNU Lesser General Public License
19 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
21 include(`../config.m4')
24 C K7: 17.0 cycles/limb integer part, 15.0 cycles/limb fraction part.
27 C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
28 C mp_srcptr src, mp_size_t size,
30 C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
31 C mp_srcptr src, mp_size_t size,
32 C mp_limb_t divisor, mp_limb_t carry);
33 C mp_limb_t mpn_preinv_divrem_1 (mp_ptr dst, mp_size_t xsize,
34 C mp_srcptr src, mp_size_t size,
35 C mp_limb_t divisor, mp_limb_t inverse,
40 C The method and nomenclature follow part 8 of "Division by Invariant
41 C Integers using Multiplication" by Granlund and Montgomery, reference in
44 C The "and"s shown in the paper are done here with "cmov"s. "m" is written
45 C for m', and "d" for d_norm, which won't cause any confusion since it's
46 C only the normalized divisor that's of any use in the code. "b" is written
47 C for 2^N, the size of a limb, N being 32 here.
49 C The step "sdword dr = n - 2^N*d + (2^N-1-q1) * d" is instead done as
50 C "n-(q1+1)*d"; this rearrangement gives the same two-limb answer. If
51 C q1==0xFFFFFFFF, then q1+1 would overflow. We branch to a special case
52 C "q1_ff" if this occurs. Since the true quotient is either q1 or q1+1 then
53 C if q1==0xFFFFFFFF that must be the right value.
55 C For the last and second last steps q1==0xFFFFFFFF is instead handled by an
56 C sbbl to go back to 0xFFFFFFFF if an overflow occurs when adding 1. This
57 C then goes through as normal, and finds no addback required.  sbbl costs
58 C an extra cycle over what the main loop code does, but it keeps code size
59 C and complexity down.
63 C mpn_divrem_1 and mpn_preinv_divrem_1 avoid one division if the src high
64 C limb is less than the divisor. mpn_divrem_1c doesn't check for a zero
65 C carry, since in normal circumstances that will be a very rare event.
67 C The test for skipping a division is branch free (once size>=1 is tested).
68 C The store to the destination high limb is 0 when a divide is skipped, or
69 C if it's not skipped then a copy of the src high limb is used. The latter
70 C is in case src==dst.
72 C There's a small bias towards expecting xsize==0, by having code for
73 C xsize==0 in a straight line and xsize!=0 under forward jumps.
77 C If the divisor is normalized (high bit set) then a division step can
78 C always be skipped, since the high destination limb is always 0 or 1 in
79 C that case. It doesn't seem worth checking for this though, since it
80 C probably occurs infrequently, in particular note that big_base for a
81 C decimal mpn_get_str is not normalized in a 32-bit limb.
84 dnl  MUL_THRESHOLD is the value of xsize+size at which the multiply by
85 dnl  inverse method is used, rather than plain "divl"s.  Minimum value 1.
87 dnl  The inverse takes about 50 cycles to calculate, but after that the
88 dnl  multiply is 17 c/l versus division at 42 c/l.
90 dnl  At 3 limbs the mul is a touch faster than div on the integer part, and
91 dnl  even more so on the fractional part.
93 deflit(MUL_THRESHOLD, 3)

dnl  Incoming stack parameters (positive %esp-relative offsets once FRAME is
dnl  established).  PARAM_PREINV_INVERSE and PARAM_CARRY share offset 24:
dnl  they belong to different entry points (mpn_preinv_divrem_1 takes an
dnl  extra shift argument, mpn_divrem_1c a carry), so they never coexist.
96 defframe(PARAM_PREINV_SHIFT,   28)  dnl mpn_preinv_divrem_1
97 defframe(PARAM_PREINV_INVERSE, 24)  dnl mpn_preinv_divrem_1
98 defframe(PARAM_CARRY,  24)          dnl mpn_divrem_1c
99 defframe(PARAM_DIVISOR,20)
100 defframe(PARAM_SIZE,   16)
101 defframe(PARAM_SRC,    12)
102 defframe(PARAM_XSIZE,  8)
103 defframe(PARAM_DST,    4)

dnl  Locals at negative offsets, inside the STACK_SPACE area reserved by
dnl  "subl $STACK_SPACE, %esp": saved callee-used registers first, then
dnl  working variables for the multiply-by-inverse code.
105 defframe(SAVE_EBX,    -4)
106 defframe(SAVE_ESI,    -8)
107 defframe(SAVE_EDI,    -12)
108 defframe(SAVE_EBP,    -16)
110 defframe(VAR_NORM,    -20)
111 defframe(VAR_INVERSE, -24)
112 defframe(VAR_SRC,     -28)
113 defframe(VAR_DST,     -32)
114 defframe(VAR_DST_STOP,-36)

dnl  Total bytes of locals; must cover the most negative defframe above.
116 deflit(STACK_SPACE, 36)
C -----------------------------------------------------------------------------
C mpn_preinv_divrem_1 entry: the caller supplies a pre-computed inverse
C (PARAM_PREINV_INVERSE) and normalization shift (PARAM_PREINV_SHIFT), so
C the bsrl/divl inverse setup used by the other entry points is skipped.
C One division step is avoided when the src high limb is below the divisor.
C NOTE(review): the original line numbering in this excerpt is discontinuous;
C some interleaved instructions between those shown appear to be elided.
121 PROLOGUE(mpn_preinv_divrem_1)
123 	movl	PARAM_XSIZE, %ecx
125 	subl	$STACK_SPACE, %esp	FRAME_subl_esp(STACK_SPACE)
131 	movl	PARAM_SIZE, %ebx
133 	leal	8(%edx,%ecx,4), %edx	C &dst[xsize+2]
135 	movl	PARAM_DIVISOR, %ebp
137 	movl	%edx, VAR_DST_STOP	C &dst[xsize+2]
139 	xorl	%edi, %edi		C carry
141 	movl	-4(%esi,%ebx,4), %eax	C src high limb
C Branch-free test for skipping the first division: the compare sets carry
C iff high<divisor, and the cmovs plus sbbl below all key off that one flag.
148 	cmpl	%ebp, %eax		C high cmp divisor
150 	cmovc(	%eax, %edi)		C high is carry if high<divisor
151 	cmovnc(	%eax, %ecx)		C 0 if skip div, src high if not
152 					C (the latter in case src==dst)
154 	movl	%ecx, -12(%edx,%ebx,4)	C dst high limb
155 	sbbl	$0, %ebx		C skip one division if high<divisor
156 	movl	PARAM_PREINV_SHIFT, %ecx
158 	leal	-8(%edx,%ebx,4), %edx	C &dst[xsize+size]
161 	movl	%edx, VAR_DST		C &dst[xsize+size]
163 	shll	%cl, %ebp		C d normalized
C %mm7 holds the right-shift count for un-normalizing; MMX is used so the
C shift amount survives while %ecx is reused.
167 	movd	%eax, %mm7		C rshift
168 	movl	PARAM_PREINV_INVERSE, %eax
C -----------------------------------------------------------------------------
C mpn_divrem_1c entry: like mpn_divrem_1 but takes an initial carry in
C PARAM_CARRY, loaded into %edx here in place of the $0 that the plain
C entry point uses.  No check is made for a zero carry (see header notes).
C NOTE(review): lines appear elided in this excerpt between those shown.
176 PROLOGUE(mpn_divrem_1c)
178 	movl	PARAM_CARRY, %edx
179 	movl	PARAM_SIZE, %ecx
180 	subl	$STACK_SPACE, %esp
181 deflit(`FRAME',STACK_SPACE)
184 	movl	PARAM_XSIZE, %ebx
190 	movl	PARAM_DIVISOR, %ebp
195 	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]
201 	C offset 0xa1, close enough to aligned
C -----------------------------------------------------------------------------
C mpn_divrem_1 entry: initial carry is 0.  If size>=1 and the src high limb
C is below the divisor, that limb becomes the carry and one division is
C skipped (branch-free via cmov/sbb).  Then xsize+size is compared against
C MUL_THRESHOLD to choose between the simple divl loops and the
C multiply-by-inverse method.
202 PROLOGUE(mpn_divrem_1)
205 	movl	PARAM_SIZE, %ecx
206 	movl	$0, %edx		C initial carry (if can't skip a div)
207 	subl	$STACK_SPACE, %esp
208 deflit(`FRAME',STACK_SPACE)
214 	movl	PARAM_XSIZE, %ebx
217 	movl	PARAM_DIVISOR, %ebp
218 	orl	%ecx, %ecx		C size
222 	leal	-4(%edi,%ebx,4), %edi	C &dst[xsize-1]
224 	jz	L(no_skip_div)		C if size==0
225 	movl	-4(%esi,%ecx,4), %eax	C src high limb
C Same branch-free skip-a-division idiom as mpn_preinv_divrem_1: one cmpl
C sets carry, then cmovc/cmovnc/sbbl consume it.
228 	cmpl	%ebp, %eax		C high cmp divisor
230 	cmovc(	%eax, %edx)		C high is carry if high<divisor
231 	cmovnc(	%eax, %esi)		C 0 if skip div, src high if not
233 	movl	%esi, (%edi,%ecx,4)	C dst high limb
234 	sbbl	$0, %ecx		C size-1 if high<divisor
235 	movl	PARAM_SRC, %esi		C reload
248 	leal	(%ebx,%ecx), %eax	C size+xsize
249 	cmpl	$MUL_THRESHOLD, %eax
250 	jae	L(mul_by_inverse)
C -----------------------------------------------------------------------------
C Small-operand path: plain divl loops for the integer part (real src limbs)
C and the fraction part (implicit zero limbs).  With MUL_THRESHOLD==3 these
C loops run at most twice.
C NOTE(review): the loop bodies are only partially visible in this excerpt
C (discontinuous original line numbers); the divl instructions themselves
C appear to be among the elided lines.
253 C With MUL_THRESHOLD set to 3, the simple loops here only do 0 to 2 limbs.
254 C It'd be possible to write them out without the looping, but no speedup
257 C Using PARAM_DIVISOR instead of %ebp measures 1 cycle/loop faster on the
258 C integer part, but curiously not on the fractional part, where %ebp is a
259 C (fixed) couple of cycles faster.
262 	jz	L(divide_no_integer)
265 	C eax	scratch (quotient)
268 	C edx	scratch (remainder)
273 	movl	-4(%esi,%ecx,4), %eax
277 	movl	%eax, (%edi,%ecx,4)
279 	jnz	L(divide_integer)
282 L(divide_no_integer):
285 	jnz	L(divide_fraction)
294 	addl	$STACK_SPACE, %esp
300 	C eax	scratch (quotient)
303 	C edx	scratch (remainder)
312 	movl	%eax, -4(%edi,%ebx,4)
314 	jnz	L(divide_fraction)
320 C -----------------------------------------------------------------------------
C mul_by_inverse setup: find the normalization shift with bsrl, normalize
C the divisor, and compute the inverse m = floor((b*(b-d)-1)/d) with one
C divl.  Then pre-shift the top one or two src limbs into the n2/n10 form
C the loops expect, dispatching on how many integer-part limbs remain.
C NOTE(review): interleaved instructions are elided in this excerpt.
331 	bsrl	%ebp, %eax		C 31-l
333 	leal	12(%edi), %ebx		C &dst[xsize+2], loop dst stop
334 	leal	4(%edi,%ecx,4), %edi	C &dst[xsize+size]
337 	movl	%ebx, VAR_DST_STOP
339 	movl	%ecx, %ebx		C size
342 	movl	%edx, %edi		C carry
350 	shll	%cl, %ebp		C d normalized
356 	subl	%ebp, %edx		C (b-d)-1 giving edx:eax = b*(b-d)-1
358 	divl	%ebp			C floor (b*(b-d)-1) / d
371 	orl	%ebx, %ebx		C size
372 	movl	%eax, VAR_INVERSE
373 	leal	-12(%esi,%ebx,4), %eax	C &src[size-3]
379 	movl	8(%eax), %esi		C src high limb
382 L(start_two_or_more):
383 	movl	4(%eax), %edx		C src second highest limb
C shldl shifts the high limb left by l while pulling in bits from the limb
C below, building the normalized n2 and n10 values.
385 	shldl(	%cl, %esi, %edi)	C n2 = carry,high << l
387 	shldl(	%cl, %edx, %esi)	C n10 = high,second << l
390 	je	L(integer_two_left)
395 	shldl(	%cl, %esi, %edi)	C n2 = carry,high << l
397 	shll	%cl, %esi		C n10 = high << l
399 	jmp	L(integer_one_left)
403 C Can be here with xsize==0 if mpn_preinv_divrem_1 had size==1 and
404 C skipped a division.
406 	shll	%cl, %edi		C n2 = carry << l
407 	movl	%edi, %eax		C return value for zero_done
415 C -----------------------------------------------------------------------------
417 C The multiply by inverse loop is 17 cycles, and relies on some out-of-order
418 C execution.  The instruction scheduling is important, with various
419 C apparently equivalent forms running 1 to 5 cycles slower.
421 C A lower bound for the time would seem to be 16 cycles, based on the
422 C following successive dependencies.
434 C This chain is what the loop has already, but 16 cycles isn't achieved.
435 C K7 has enough decode, and probably enough execute (depending maybe on what
436 C a mul actually consumes), but nothing running under 17 has been found.
438 C In theory n2+n1 could be done in the sub and addback stages (by
439 C calculating both n2 and n2+n1 there), but lack of registers makes this an
440 C unlikely proposition.
442 C The jz in the loop keeps the q1+1 stage to 1 cycle.  Handling an overflow
443 C from q1+1 with an "sbbl $0, %ebx" would add a cycle to the dependent
444 C chain, and nothing better than 18 cycles has been found when using it.
445 C The jump is taken only when q1 is 0xFFFFFFFF, and on random data this will
446 C be an extremely rare event.
448 C Branch mispredictions will hit random occurrences of q1==0xFFFFFFFF, but
449 C if some special data is coming out with this always, the q1_ff special
450 C case actually runs at 15 c/l.  0x2FFF...FFFD divided by 3 is a good way to
451 C induce the q1_ff case, for speed measurements or testing.  Note that
452 C 0xFFF...FFF divided by 1 or 2 doesn't induce it.
454 C The instruction groupings and empty comments show the cycles for a naive
455 C in-order view of the code (conveniently ignoring the load latency on
456 C VAR_INVERSE).  This shows some of where the time is going, but is nonsense
457 C to the extent that out-of-order execution rearranges it.  In this case
458 C there's 19 cycles shown, but it executes at 17.
C Register roles during the loop (per the surrounding comments):
463 	C ebx	scratch (nadj, q1)
464 	C ecx	scratch (src, dst)
470 	C mm0	scratch (src qword)
471 	C mm7	rshift for normalization
C NOTE(review): loop label and several interleaved instructions are elided
C in this excerpt (original line numbering is discontinuous).
473 	cmpl	$0x80000000, %esi	C n1 as 0=c, 1=nc
477 	leal	(%ebp,%esi), %ebx
478 	cmovc(	%esi, %ebx)		C nadj = n10 + (-n1 & d), ignoring overflow
479 	sbbl	$-1, %eax		C n2+n1
481 	mull	VAR_INVERSE		C m*(n2+n1)
483 	movq	(%ecx), %mm0		C next limb and the one below it
490 	addl	%ebx, %eax		C m*(n2+n1) + nadj, low giving carry flag
491 	leal	1(%edi), %ebx		C n2+1
496 	adcl	%edx, %ebx		C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
509 	movl	VAR_DST_STOP, %eax
513 	sbbl	%edx, %edi		C n - (q1+1)*d
514 	movl	%esi, %edi		C remainder -> n2
515 	leal	(%ebp,%esi), %edx
519 	cmovc(	%edx, %edi)		C n - q1*d if underflow from using q1+1
528 L(integer_loop_done):
531 C -----------------------------------------------------------------------------
C Second-last integer-part step: same q1 = high(m*(n2+n1) + nadj + n2*b) + 1
C computation as the main loop, but q1+1 overflow is handled inline with an
C sbbl rather than the q1_ff branch.
533 C Here, and in integer_one_left below, an sbbl $0 is used rather than a jz
534 C q1_ff special case.  This makes the code a bit smaller and simpler, and
535 C costs only 1 cycle (each).
539 	C ebx	scratch (nadj, q1)
540 	C ecx	scratch (src, dst)
C NOTE(review): some interleaved instructions are elided in this excerpt.
548 	cmpl	$0x80000000, %esi	C n1 as 0=c, 1=nc
552 	leal	(%ebp,%esi), %ebx
553 	cmovc(	%esi, %ebx)		C nadj = n10 + (-n1 & d), ignoring overflow
554 	sbbl	$-1, %eax		C n2+n1
556 	mull	VAR_INVERSE		C m*(n2+n1)
558 	movd	(%ecx), %mm0		C src low limb
560 	movl	VAR_DST_STOP, %ecx
564 	addl	%ebx, %eax		C m*(n2+n1) + nadj, low giving carry flag
565 	leal	1(%edi), %ebx		C n2+1
568 	adcl	%edx, %ebx		C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
584 	sbbl	%edx, %edi		C n - (q1+1)*d
585 	movl	%esi, %edi		C remainder -> n2
586 	leal	(%ebp,%esi), %edx
590 	cmovc(	%edx, %edi)		C n - q1*d if underflow from using q1+1
596 C -----------------------------------------------------------------------------
C Last integer-part step (one limb left), again with sbbl handling a q1+1
C overflow, followed by the q1_ff special case used by the main loop.
599 	C ebx	scratch (nadj, q1)
C NOTE(review): some interleaved instructions are elided in this excerpt.
608 	movl	VAR_DST_STOP, %ecx
609 	cmpl	$0x80000000, %esi	C n1 as 0=c, 1=nc
612 	leal	(%ebp,%esi), %ebx
613 	cmovc(	%esi, %ebx)		C nadj = n10 + (-n1 & d), ignoring overflow
614 	sbbl	$-1, %eax		C n2+n1
616 	mull	VAR_INVERSE		C m*(n2+n1)
624 	addl	%ebx, %eax		C m*(n2+n1) + nadj, low giving carry flag
625 	leal	1(%edi), %ebx		C n2+1
630 	adcl	%edx, %ebx		C 1 + high(n2<<32 + m*(n2+n1) + nadj) = q1+1
632 	sbbl	$0, %ebx		C q1 if q1+1 overflowed
646 	sbbl	%edx, %edi		C n - (q1+1)*d
647 	movl	%esi, %edi		C remainder -> n2
648 	leal	(%ebp,%esi), %edx
650 	cmovc(	%edx, %edi)		C n - q1*d if underflow from using q1+1
672 	addl	$STACK_SPACE, %esp
680 C -----------------------------------------------------------------------------
682 C Special case for q1=0xFFFFFFFF, giving q=0xFFFFFFFF meaning the low dword
683 C of q*d is simply -d and the remainder n-q*d = n10+d
695 	movl	VAR_DST_STOP, %edx
699 	leal	(%ebp,%esi), %edi	C n-q*d remainder -> next n2
702 	movd	%mm0, %esi		C next n10
708 	jmp	L(integer_loop_done)
712 C -----------------------------------------------------------------------------
714 C Being the fractional part, the "source" limbs are all zero, meaning
715 C n10=0, n1=0, and hence nadj=0, leading to many instructions eliminated.
717 C The loop runs at 15 cycles.  The dependent chain is the same as the
718 C general case above, but without the n2+n1 stage (due to n1==0), so 15
719 C would seem to be the lower bound.
721 C A not entirely obvious simplification is that q1+1 never overflows a limb,
722 C and so there's no need for the sbbl $0 or jz q1_ff from the general case.
723 C q1 is the high word of m*n2+b*n2 and the following shows q1<=b-2 always.
724 C rnd() means rounding down to a multiple of d.
726 C	m*n2 + b*n2 <= m*(d-1) + b*(d-1)
727 C	            = m*d + b*d - m - b
728 C	            = floor((b(b-d)-1)/d)*d + b*d - m - b
729 C	            = rnd(b(b-d)-1) + b*d - m - b
730 C	            = rnd(b(b-d)-1 + b*d) - m - b
731 C	            = rnd(b*b-1) - m - b
734 C Unchanged from the general case is that the final quotient limb q can be
735 C either q1 or q1+1, and the q1+1 case occurs often.  This can be seen from
736 C equation 8.4 of the paper which simplifies as follows when n1==0 and
739 C	n-q1*d = (n2*k+q0*d)/b <= d + (d*d-2d)/b
741 C As before, the instruction groupings and empty comments show a naive
742 C in-order view of the code, which is made a nonsense by out of order
743 C execution.  There's 17 cycles shown, but it executes at 15.
745 C Rotating the store q and remainder->n2 instructions up to the top of the
746 C loop gets the run time down from 16 to 15.
C NOTE(review): loop label and several interleaved instructions are elided
C in this excerpt (original line numbering is discontinuous).
759 	movl	VAR_DST_STOP, %ecx	C &dst[xsize+2]
762 	subl	$8, %ecx		C &dst[xsize]
763 	jmp	L(fraction_entry)
768 	C eax	n2 carry, then scratch
769 	C ebx	scratch (nadj, q1)
770 	C ecx	dst, decrementing
776 	movl	%ebx, (%ecx)		C previous q
777 	movl	%eax, %edi		C remainder->n2
780 	mull	VAR_INVERSE		C m*n2
794 	addl	%edx, %ebx		C 1 + high(n2<<32 + m*n2) = q1+1
804 	negl	%eax			C low of n - (q1+1)*d
808 	sbbl	%edx, %edi		C high of n - (q1+1)*d, caring only about carry
809 	leal	(%ebp,%eax), %edx
811 	cmovc(	%edx, %eax)		C n - q1*d if underflow from using q1+1