1 dnl AMD K6 mpn_add/sub_n -- mpn addition or subtraction.
3 dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or
8 dnl modify it under the terms of the GNU Lesser General Public License as
9 dnl published by the Free Software Foundation; either version 3 of the
10 dnl License, or (at your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful,
13 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 dnl Lesser General Public License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 include(`../config.m4')
23 C K6: normal 3.25 cycles/limb, in-place 2.75 cycles/limb.
C Operation selection, done at m4 time.  With OPERATION_add_n defined the
C entry points are named mpn_add_n and mpn_add_nc and M4_description is
C "add"; with OPERATION_sub_n they are mpn_sub_n and mpn_sub_nc and
C M4_description is "subtract".  If neither symbol is defined, m4_error is
C invoked with a message naming the two accepted symbols.  MULFUNC_PROLOGUE
C is then given all four function names.
C
C NOTE(review): M4_inst is used by the code further down, but no define for
C it appears in the lines visible here -- confirm it is set per operation.
26 ifdef(`OPERATION_add_n', `
28 define(M4_function_n, mpn_add_n)
29 define(M4_function_nc, mpn_add_nc)
30 define(M4_description, add)
31 ',`ifdef(`OPERATION_sub_n', `
33 define(M4_function_n, mpn_sub_n)
34 define(M4_function_nc, mpn_sub_nc)
35 define(M4_description, subtract)
36 ',`m4_error(`Need OPERATION_add_n or OPERATION_sub_n
39 MULFUNC_PROLOGUE(mpn_add_n mpn_add_nc mpn_sub_n mpn_sub_nc)
42 C mp_limb_t M4_function_n (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
C mp_size_t size);
44 C mp_limb_t M4_function_nc (mp_ptr dst, mp_srcptr src1, mp_srcptr src2,
45 C mp_size_t size, mp_limb_t carry);
47 C Calculate src1,size M4_description src2,size, and store the result in
48 C dst,size. The return value is the carry bit from the top of the result
C (1 or 0).
51 C The _nc version accepts 1 or 0 for an initial carry into the low limb of
52 C the calculation. Note values other than 1 or 0 here will lead to garbage
C results.
55 C Instruction decoding limits a normal dst=src1+src2 operation to 3 c/l, and
56 C an in-place dst+=src to 2.5 c/l. The unrolled loops have 1 cycle/loop of
57 C loop control, which with 4 limbs/loop means an extra 0.25 c/l.
C Stack locations of the arguments, in prototype order: dst, src1, src2,
C size and (for the _nc entry point) carry, at 4, 8, 12, 16 and 20 bytes
C above %esp, plus FRAME.
C NOTE(review): FRAME is not defined in the lines visible here; presumably
C it accounts for bytes pushed since entry -- confirm against the full file.
59 define(PARAM_CARRY, `FRAME+20(%esp)')
60 define(PARAM_SIZE, `FRAME+16(%esp)')
61 define(PARAM_SRC2, `FRAME+12(%esp)')
62 define(PARAM_SRC1, `FRAME+8(%esp)')
63 define(PARAM_DST, `FRAME+4(%esp)')
C %ecx is compared against this value (cmpl) in the M4_function_n entry code.
66 dnl minimum 5 because the unrolled code can't handle less
67 deflit(UNROLL_THRESHOLD, 5)
C Entry point with carry-in: fetch the carry argument into %eax.
C NOTE(review): the instructions joining this entry point to the shared
C code, and its EPILOGUE, are not in the lines visible here.
72 PROLOGUE(M4_function_nc)
73 movl PARAM_CARRY, %eax
C Entry point without carry-in.
78 PROLOGUE(M4_function_n)
C Compare %ecx with the unrolling threshold.
C NOTE(review): the load of %ecx and the branch that consumes this
C comparison are not in the lines visible here.
90 cmpl $UNROLL_THRESHOLD, %ecx
C Shift bit 0 of %eax out into the carry flag; presumably this is the
C carry consumed by the first M4_inst -- confirm.
96 shrl %eax C initial carry flag
98 C offset 0x21 here, close enough to aligned
108 C The store to (%edi) could be done with a stosl; it'd be smaller
109 C code, but there's no speed gain and a cld would have to be added
110 C (per mpn/x86/README).
133 C -----------------------------------------------------------------------------
C For the add operation only, branch to L(inplace_reverse) on equality.
C NOTE(review): the comparison that sets the flags for this je is not in
C the lines visible here; presumably it checks dst against src2 -- confirm.
148 ifdef(`OPERATION_add_n',`
151 je L(inplace_reverse)
C Advance all three pointers by %ecx limbs (4 bytes each).
159 leal (%ebx,%ecx,4), %ebx
160 leal (%edx,%ecx,4), %edx
161 leal (%edi,%ecx,4), %edi
C Unrolled loop body for the general (not in-place) case, four limbs per
C pass.  Each group of three instructions loads a limb via %ebx into %eax,
C applies M4_inst with the corresponding limb via %edx, and stores the
C result via %edi.  The movl instructions leave the flags alone, so the
C carry flag produced by one M4_inst is intact when the next one runs.
C
C NOTE(review): presumably %ebx is src1, %edx is src2, %edi is dst, and
C M4_inst is adcl or sbbl -- none of this is established by the lines
C visible here.  The first load has no -20 displacement while every later
C access does, which suggests %ecx is advanced by 5 limbs (20 bytes) in
C between by an instruction not shown; the loop control is not shown either.
C NOTE(review): the register comment below names %eax as the counter, yet
C the visible code indexes with %ecx and uses %eax for data -- confirm.
168 C eax counter, qwords, negative
176 movl (%ebx,%ecx,4), %eax
178 M4_inst -20(%edx,%ecx,4), %eax
179 movl %eax, -20(%edi,%ecx,4)
181 movl 4-20(%ebx,%ecx,4), %eax
182 M4_inst 4-20(%edx,%ecx,4), %eax
183 movl %eax, 4-20(%edi,%ecx,4)
185 movl 8-20(%ebx,%ecx,4), %eax
186 M4_inst 8-20(%edx,%ecx,4), %eax
187 movl %eax, 8-20(%edi,%ecx,4)
189 movl 12-20(%ebx,%ecx,4), %eax
190 M4_inst 12-20(%edx,%ecx,4), %eax
191 movl %eax, 12-20(%edi,%ecx,4)
C Tail of the general case, handling the limbs left over after the
C unrolled loop.
C NOTE(review): the instruction that sets the flags for this jz, and the
C loads and stores around the M4_inst at 4(%edx), are not in the lines
C visible here.
197 jz L(normal_finish_one)
200 C two or three more limbs
C Apply M4_inst to %eax with the limb at offset 4 from %edx.
207 M4_inst 4(%edx), %eax
214 L(normal_finish_one):
C One limb at index %ecx: load via %ebx, apply M4_inst with the limb via
C %edx, store via %edi.
215 movl (%ebx,%ecx,4), %eax
216 M4_inst (%edx,%ecx,4), %eax
217 movl %eax, (%edi,%ecx,4)
231 C -----------------------------------------------------------------------------
C In-place case: each limb is updated with a read-modify-write M4_inst
C directly on memory addressed through %edi, using a value previously
C loaded through %edx into %eax or %ebx.
C NOTE(review): the code between this ifdef and the loads below, the
C in-place labels, and the loop control are not in the lines visible here.
233 ifdef(`OPERATION_add_n',`
C Fetch the first source limb before %edx is advanced.
255 movl (%edx), %ebx C src low limb
C Advance the source and destination pointers by %ecx limbs (4 bytes each).
256 leal (%edx,%ecx,4), %edx
258 leal (%edi,%ecx,4), %edi
C Unrolled body: source limbs are loaded alternately into %eax and %ebx
C and then applied to the matching limb addressed through %edi.
C NOTE(review): the mix of displacements with and without -20 suggests
C %ecx is advanced by 5 limbs by an instruction not shown -- confirm.
274 M4_inst %ebx, (%edi,%ecx,4)
276 movl 4(%edx,%ecx,4), %eax
279 M4_inst %eax, 4-20(%edi,%ecx,4)
281 movl 8-20(%edx,%ecx,4), %eax
282 movl 12-20(%edx,%ecx,4), %ebx
284 M4_inst %eax, 8-20(%edi,%ecx,4)
285 M4_inst %ebx, 12-20(%edi,%ecx,4)
C Load one more limb into %ebx; presumably it feeds the M4_inst at the top
C of the next pass -- confirm.
287 movl 16-20(%edx,%ecx,4), %ebx
C Tail of the in-place case, handling the limbs left over after the
C unrolled loop.
C NOTE(review): the instruction that sets the flags for this jz, and the
C loads of %eax and %ebx used just below, are not in the lines visible here.
291 C now %esi is 0 to 3 representing respectively 1 to 4 limbs more
296 jz L(inplace_finish_one)
299 C two or three more limbs
C Apply the values held in %eax and %ebx to the limbs at offsets 4 and 8
C from %edi.
303 M4_inst %eax, 4(%edi)
304 M4_inst %ebx, 8(%edi)
311 L(inplace_finish_one):
C One limb: load it through %edx and apply it to the matching limb
C addressed through %edi.
312 movl 4(%edx,%ecx,4), %eax
313 M4_inst %eax, 4(%edi,%ecx,4)