1 dnl x86 mpn_divrem_1 -- mpn by limb division extending to fractional quotient.
3 dnl Copyright 1999, 2000, 2001, 2002, 2003, 2007 Free Software Foundation,
6 dnl This file is part of the GNU MP Library.
8 dnl The GNU MP Library is free software; you can redistribute it and/or
9 dnl modify it under the terms of the GNU Lesser General Public License as
10 dnl published by the Free Software Foundation; either version 3 of the
11 dnl License, or (at your option) any later version.
13 dnl The GNU MP Library is distributed in the hope that it will be useful,
14 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
15 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 dnl Lesser General Public License for more details.
18 dnl You should have received a copy of the GNU Lesser General Public License
19 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
21 include(`../config.m4')
34 C mp_limb_t mpn_divrem_1 (mp_ptr dst, mp_size_t xsize,
35 C mp_srcptr src, mp_size_t size, mp_limb_t divisor);
36 C mp_limb_t mpn_divrem_1c (mp_ptr dst, mp_size_t xsize,
37 C mp_srcptr src, mp_size_t size, mp_limb_t divisor,
40 C Divide src,size by divisor and store the quotient in dst+xsize,size.
41 C Extend the division to fractional quotient limbs in dst,xsize. Return the
42 C remainder. Either or both xsize and size can be 0.
44 C mpn_divrem_1c takes a carry parameter which is an initial high limb,
45 C effectively one extra limb at the top of src,size. Must have
49 C Essentially the code is the same as the division based part of
50 C mpn/generic/divrem_1.c, but has the advantage that we get the desired divl
51 C instruction even when gcc is not being used (when longlong.h only has the
52 C rather slow generic C udiv_qrnnd().
54 C A test is done to see if the high limb is less than the divisor, and if so
55 C one less div is done. A div is between 20 and 40 cycles on the various
56 C x86s, so assuming high<divisor about half the time, then this test saves
57 C half that amount. The branch misprediction penalty on each chip is less
63 C It might be thought that moving the load down to pair with the store would
64 C save 1 cycle, but that doesn't seem to happen in practice, and in any case
65 C would be a mere 2.2% saving, so it's hardly worth bothering about.
67 C A mul-by-inverse might be a possibility for P5, as done in
68 C mpn/x86/pentium/mod_1.asm. The number of auxiliary instructions required
69 C is a hinderance, but there could be a 10-15% speedup available.
74 C K6 has its own version of this code, using loop and paying attention to
75 C cache line boundary crossings. The target 20 c/l can be had with the
76 C decl+jnz of the present code by pairing up the load and store in the
77 C loops. But it's considered easier not to introduce complexity just for
78 C that, but instead let k6 have its own code.
81 defframe(PARAM_CARRY, 24)
82 defframe(PARAM_DIVISOR,20)
83 defframe(PARAM_SIZE, 16)
84 defframe(PARAM_SRC, 12)
85 defframe(PARAM_XSIZE, 8)
86 defframe(PARAM_DST, 4)
91 PROLOGUE(mpn_divrem_1c)
95 pushl %edi FRAME_pushl()
98 pushl %esi FRAME_pushl()
100 movl PARAM_DIVISOR, %esi
101 pushl %ebx FRAME_pushl()
104 pushl %ebp FRAME_pushl()
106 movl PARAM_XSIZE, %ebp
109 movl PARAM_CARRY, %edx
112 leal -4(%ebx,%ebp,4), %ebx C dst one limb below integer part
118 PROLOGUE(mpn_divrem_1)
121 movl PARAM_SIZE, %ecx
122 pushl %edi FRAME_pushl()
125 pushl %esi FRAME_pushl()
127 movl PARAM_DIVISOR, %esi
131 pushl %ebx FRAME_pushl()
133 movl -4(%edi,%ecx,4), %eax C src high limb
137 pushl %ebp FRAME_pushl()
139 movl PARAM_XSIZE, %ebp
142 leal -4(%ebx,%ebp,4), %ebx C dst one limb below integer part
146 C high<divisor, so high of dst is zero, and avoid one div
148 movl %edx, (%ebx,%ecx,4)
156 C eax scratch (quotient)
159 C edx scratch (remainder)
164 movl -4(%edi,%ecx,4), %eax
169 movl %eax, (%ebx,%ecx,4)
182 C eax scratch (quotient)
185 C edx scratch (remainder)
194 movl %eax, -4(%ebx,%ecx,4)
210 movl PARAM_XSIZE, %ecx
215 cld C better safe than sorry, see mpn/x86/README