1 dnl AMD K7 mpn_add_n/mpn_sub_n -- mpn add or subtract.
3 dnl Copyright 1999, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or
8 dnl modify it under the terms of the GNU Lesser General Public License as
9 dnl published by the Free Software Foundation; either version 3 of the
10 dnl License, or (at your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful,
13 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 dnl Lesser General Public License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 include(`../config.m4')
23 C K7: 1.64 cycles/limb (at 16 limbs/loop).
27 dnl K7: UNROLL_COUNT cycles/limb
32 dnl Maximum possible with the current code is 64.
dnl  UNROLL_COUNT is the number of limbs handled by one pass of the unrolled
dnl  loop; the loop body is generated from it by a forloop further down.
34 deflit(UNROLL_COUNT, 16)
dnl  Operation selection.  Exactly one of OPERATION_add_n or OPERATION_sub_n
dnl  is expected to be defined when this file is processed.  The chosen branch
dnl  sets M4_function_n and M4_function_nc to the two entry point names and
dnl  M4_description to a word used in the descriptive comments.  If neither
dnl  symbol is defined, m4_error is invoked with a diagnostic message.
dnl  NOTE(review): the code below also uses M4_inst, but its define is not
dnl  visible in this span -- confirm it is set alongside these names.
dnl  NOTE(review): the MULFUNC_PROLOGUE line that follows lists all four entry
dnl  point names; presumably it advertises them to the build machinery --
dnl  confirm against the macro definition.
37 ifdef(`OPERATION_add_n', `
39 define(M4_function_n, mpn_add_n)
40 define(M4_function_nc, mpn_add_nc)
41 define(M4_description, add)
42 ',`ifdef(`OPERATION_sub_n', `
44 define(M4_function_n, mpn_sub_n)
45 define(M4_function_nc, mpn_sub_nc)
46 define(M4_description, subtract)
47 ',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
50 MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
53 C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2, mp_size_t size);
55 C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
56 C mp_size_t size, mp_limb_t carry);
58 C Calculate src1,size M4_description src2,size, and store the result in
59 C dst,size. The return value is the carry bit from the top of the result (1 or 0).
62 C The _nc version accepts 1 or 0 for an initial carry into the low limb of
63 C the calculation. Note values other than 1 or 0 here will lead to garbage results.
66 C This code runs at 1.64 cycles/limb, which might be the best possible with
67 C plain integer operations. Each limb is 2 loads and 1 store, any 2 of
68 C which can be done each cycle, leading to 1.5 c/l.
70 dnl Must have UNROLL_THRESHOLD >= 2, since the unrolled loop can't handle 1.
dnl  UNROLL_THRESHOLD is the value the size in ecx is compared against right
dnl  after the stack space is reserved.
dnl  NOTE(review): the same deflit appears twice with the same value.  The
dnl  conditional that presumably chose between the two is not visible here --
dnl  confirm before collapsing them into one.
72 deflit(UNROLL_THRESHOLD, 8)
74 deflit(UNROLL_THRESHOLD, 8)
dnl  Frame layout.  The positive offsets name the five incoming arguments in
dnl  4-byte steps: dst, src1, src2, size, and the carry that only the _nc
dnl  entry point reads.
77 defframe(PARAM_CARRY,20)
78 defframe(PARAM_SIZE, 16)
79 defframe(PARAM_SRC2, 12)
80 defframe(PARAM_SRC1, 8)
81 defframe(PARAM_DST, 4)
dnl  The negative offsets name four 4-byte save slots, one each for ebp, esi,
dnl  ebx and edi.  STACK_SPACE (16 bytes) is exactly the size of those four
dnl  slots and is the amount subtracted from esp on entry.
dnl  NOTE(review): the instructions that store to and reload from these slots
dnl  are not visible in this span.
83 defframe(SAVE_EBP, -4)
84 defframe(SAVE_ESI, -8)
85 defframe(SAVE_EBX, -12)
86 defframe(SAVE_EDI, -16)
87 deflit(STACK_SPACE, 16)
C M4_function_nc entry point: eax starts as the caller-supplied carry.
C NOTE(review): how this entry joins the common code below is not visible
C here -- confirm it skips the zeroing of eax done by the plain entry point.
93 PROLOGUE(M4_function_nc)
94 movl PARAM_CARRY, %eax
C M4_function_n entry point: eax starts as a zero carry.
98 PROLOGUE(M4_function_n)
100 xorl %eax, %eax C carry
C ecx = size.  Reserve STACK_SPACE bytes, then record the adjustment in FRAME.
C NOTE(review): FRAME presumably lets the defframe names keep resolving to
C the right esp-relative offsets after the subl -- confirm against defframe.
102 movl PARAM_SIZE, %ecx
103 subl $STACK_SPACE, %esp
104 deflit(`FRAME',STACK_SPACE)
C Compare size against UNROLL_THRESHOLD.
C NOTE(review): the branch that consumes this comparison is not visible
C here; presumably larger sizes take the unrolled path -- confirm.
108 cmpl $UNROLL_THRESHOLD, %ecx
C edx = src2, ebx = src1.
110 movl PARAM_SRC2, %edx
111 movl PARAM_SRC1, %ebx
C Advance each pointer by size limbs of 4 bytes, so each now points just
C past the end of its operand.
C NOTE(review): the load of edi is not visible here; presumably it holds
C dst at this point -- confirm.
115 leal (%ebx,%ecx,4), %ebx
116 leal (%edx,%ecx,4), %edx
118 leal (%edi,%ecx,4), %edi
122 C This loop is in a single 16 byte code block already, so no
123 C alignment necessary.
C One limb step of the simple loop: load a limb through ebx (src1 based),
C combine it with the limb at the same index through edx (src2 based) using
C M4_inst, and store the result through edi at the same index.
C NOTE(review): the loop label, the update of the index in ecx and the loop
C branch are not visible here.  Since the pointers were moved past the end
C of the operands, ecx presumably counts up from minus size to zero -- confirm.
C NOTE(review): M4_inst is presumably the with-carry add or subtract for the
C selected operation; its define is not visible here -- confirm.
133 movl (%ebx,%ecx,4), %eax
134 M4_inst (%edx,%ecx,4), %eax
135 movl %eax, (%edi,%ecx,4)
C Release the STACK_SPACE bytes reserved at entry.
144 addl $STACK_SPACE, %esp
149 C -----------------------------------------------------------------------------
150 C This is at 0x55, close enough to aligned.
152 deflit(`FRAME',STACK_SPACE)
C Setup for the unrolled path.  The low bit of the size is split off: ecx
C keeps the size with that bit cleared, and the PARAM_SIZE stack slot is
C overwritten in place so that it holds only that bit.
154 andl $-2, %ecx C size low bit masked out
155 andl $1, PARAM_SIZE C size low bit kept
C ecx is scaled down by the unroll factor and edi is reduced to its low
C UNROLL_MASK bits.
C NOTE(review): the instructions that load edi and adjust ecx around these
C two lines are not visible here.  Presumably ecx becomes the pass count and
C edi the number of limbs in the first partial pass -- confirm.
161 shrl $UNROLL_LOG2, %ecx
165 andl $UNROLL_MASK, %edi
171 leal L(entry) (%edi,%edi,8), %esi C 9 bytes per
C Bias the three pointers by edi limbs, plus 128 bytes when UNROLL_BYTES is
C 256.  The displacements generated in the unrolled loop subtract the same
C 128 in that case.  The destination pointer is formed from ebp as base.
C NOTE(review): presumably the 128 byte bias keeps every displacement within
C a signed byte so that each generated limb step has the same code size,
C matching the 9 byte scale used for the computed entry address -- confirm.
176 leal ifelse(UNROLL_BYTES,256,128) (%ebx,%edi,4), %ebx
177 leal ifelse(UNROLL_BYTES,256,128) (%edx,%edi,4), %edx
178 leal ifelse(UNROLL_BYTES,256,128) (%ebp,%edi,4), %edi
185 C See mpn/x86/README about old gas bugs
186 leal (%edi,%edi,8), %esi
187 addl $L(entry)-L(here), %esi
193 C -----------------------------------------------------------------------------
200 C esi scratch (was computed jump)
C Unrolled loop body.
C
C The src2 pointer in edx is advanced by UNROLL_BYTES first, using leal.
C The forloop then generates UNROLL_COUNT/CHUNK_COUNT chunks of CHUNK_COUNT
C limbs each, which is 2 limbs per chunk.  For chunk i, disp0 is
C i*CHUNK_COUNT*4, less 128 when UNROLL_BYTES is 256, and disp1 is disp0+4.
C Each chunk loads two limbs through ebx into esi and ebp, applies M4_inst
C with the matching limbs through edx, and stores both results through edi.
C After the chunks, ebx and edi are advanced by UNROLL_BYTES, again with leal.
C
C NOTE(review): the pointer updates use leal, which leaves the flags alone;
C presumably that is what lets the carry flow from one M4_inst to the next
C and across passes -- confirm against the M4_inst define.
C NOTE(review): Zdisp is used only on the disp0 accesses; presumably it forces
C an explicit displacement byte when disp0 is zero so that every chunk has the
C same code size -- confirm against the macro definition.
C NOTE(review): the final M4_inst into ecx reads a limb at UNROLL_BYTES past
C edx; presumably it handles the odd limb recorded by the low size bit kept
C in PARAM_SIZE.  The surrounding load, store and branch are not visible
C here -- confirm.
C The last line releases the STACK_SPACE bytes reserved at entry.
204 leal UNROLL_BYTES(%edx), %edx
207 deflit(CHUNK_COUNT, 2)
208 forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
209 deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
210 deflit(`disp1', eval(disp0 + 4))
212 Zdisp( movl, disp0,(%ebx), %esi)
213 movl disp1(%ebx), %ebp
214 Zdisp( M4_inst,disp0,(%edx), %esi)
215 Zdisp( movl, %esi, disp0,(%edi))
216 M4_inst disp1(%edx), %ebp
217 movl %ebp, disp1(%edi)
221 leal UNROLL_BYTES(%ebx), %ebx
222 leal UNROLL_BYTES(%edi), %edi
234 M4_inst UNROLL_BYTES(%edx), %ecx
243 addl $STACK_SPACE, %esp