1 dnl AMD K6 mpn_gcd_1 -- mpn by 1 gcd.
3 dnl Copyright 2000, 2001, 2002, 2004 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or
8 dnl modify it under the terms of the GNU Lesser General Public License as
9 dnl published by the Free Software Foundation; either version 3 of the
10 dnl License, or (at your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful,
13 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 dnl Lesser General Public License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 include(`../config.m4')
23 C K6: 9.5 cycles/bit (approx) 1x1 gcd
24 C 11.0 cycles/limb Nx1 reduction (modexact_1_odd)
27 C mp_limb_t mpn_gcd_1 (mp_srcptr src, mp_size_t size, mp_limb_t y);
29 C This code is nothing very special, but offers a speedup over what gcc 2.95
30 C can do with mpn/generic/gcd_1.c.
34 C Using a lookup table to count trailing zeros seems a touch quicker, but
35 C after a slightly longer startup. Might be worthwhile if an mpn_gcd_2 used
39 dnl If size==1 and x (the larger operand) is more than DIV_THRESHOLD bits
40 dnl bigger than y, then a division x%y is done to reduce it.
42 dnl A divl is 20 cycles and the loop runs at about 9.5 cycles/bitpair so
43 dnl there should be an advantage in the divl at about 4 or 5 bits, which is
46 deflit(DIV_THRESHOLD, 5)
49 defframe(PARAM_LIMB, 12)
50 defframe(PARAM_SIZE, 8)
51 defframe(PARAM_SRC, 4)
59 ASSERT(ne, `cmpl $0, PARAM_LIMB')
60 ASSERT(ae, `cmpl $1, PARAM_SIZE')
64 pushl %ebx FRAME_pushl()
69 movl (%eax), %ebx C src low limb
71 movl %ebx, %eax C src low limb
78 jnc L(common_twos) C 1/4 chance on random data
82 ja L(size_two_or_more)
85 ASSERT(nz, `orl %eax, %eax') C should have src limb != 0
90 C Swap if necessary to make x>=y. Measures a touch quicker as a
91 C jump than a branch free calculation.
109 C See if it's worth reducing x with a divl.
116 shrl $DIV_THRESHOLD, %ebx
154 leal 1(%edx,%edx), %edx
155 movl %ecx, %ebx C common twos
161 C Calculating a %cl shift based on the low bit 0 or 1 avoids doing a branch
162 C on a 50/50 chance of 0 or 1. The chance of the next bit also being 0 is
165 C A second computed %cl shift was tried, but that measured a touch slower
166 C than branching back.
168 C A branch-free abs(x-y) and min(x,y) calculation was tried, but that
169 C measured about 1 cycle/bit slower.
178 addl %eax, %edx C x-y+y = x
179 negl %eax C -(x-y) = y-x
182 shrl %eax C odd-odd = even, so always one to strip
189 andl $1, %ecx C (x^1)&1
191 shrl %cl, %eax C shift if x even
196 ASSERT(nz,`testl $1, %eax') C x, y odd
197 ASSERT(nz,`testl $1, %edx')
214 C -----------------------------------------------------------------------------
217 C x={src,size} is reduced modulo y using either a plain mod_1 style
218 C remainder, or a modexact_1 style exact division.
220 deflit(MODEXACT_THRESHOLD, ifdef(`PIC', 4, 4))
227 C edx y, without common twos
232 deflit(FRAME_TWO_OR_MORE, FRAME)
234 pushl %edi defframe_pushl(SAVE_EDI)
241 movl %ecx, %edi C common twos
242 movl PARAM_SIZE, %ecx
244 pushl %esi defframe_pushl(SAVE_ESI)
245 leal 1(%edx,%edx), %esi C y (odd)
247 movl -4(%ebx,%ecx,4), %eax C src high limb
249 cmpl %edx, %eax C carry if high<divisor
251 sbbl %edx, %edx C -1 if high<divisor
253 addl %edx, %ecx C skip one limb if high<divisor
256 cmpl $MODEXACT_THRESHOLD, %ecx
261 C eax scratch (quotient)
263 C ecx counter, size-1 to 1
264 C edx carry (remainder)
269 movl -4(%ebx,%ecx,4), %eax
275 movl %esi, %edx C y (odd)
277 movl %edi, %ebx C common twos
306 movl PARAM_SIZE, %eax
307 pushl %esi FRAME_pushl()
309 pushl %eax FRAME_pushl()
311 pushl %ebx FRAME_pushl()
317 addl $_GLOBAL_OFFSET_TABLE_, %ebx
318 call GSYM_PREFIX`'mpn_modexact_1_odd@PLT
320 call GSYM_PREFIX`'mpn_modexact_1_odd
323 movl %esi, %edx C y odd
326 movl %edi, %ebx C common twos
329 addl $eval(FRAME - FRAME_TWO_OR_MORE), %esp