1 dnl Intel P6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
3 dnl Copyright 1999, 2000, 2001, 2002, 2005 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or
8 dnl modify it under the terms of the GNU Lesser General Public License as
9 dnl published by the Free Software Foundation; either version 3 of the
10 dnl License, or (at your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful,
13 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 dnl Lesser General Public License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 include(`../config.m4')
25 C P6 model 0-8,10-12 6.44
27 C P6 model 13 (Dothan) 6.11
28 C P4 model 0 (Willamette)
30 C P4 model 2 (Northwood)
31 C P4 model 3 (Prescott)
38 dnl P6 UNROLL_COUNT cycles/limb
43 dnl Maximum possible with the current code is 64.
dnl UNROLL_COUNT is the number of limbs of code generated for the unrolled
dnl loop: the forloop there runs UNROLL_COUNT/CHUNK_COUNT times and each pass
dnl emits code for CHUNK_COUNT limbs.
45 deflit(UNROLL_COUNT, 16)
dnl Select the exported function names (M4_function_1, M4_function_1c) and
dnl the wording substituted into the comments below (M4_description,
dnl M4_desc_retval) according to which of OPERATION_addmul_1 or
dnl OPERATION_submul_1 is defined.  If neither is defined m4_error is called.
dnl MULFUNC_PROLOGUE is then given the names of all four possible entry points.
dnl NOTE(review): M4_inst is used by the code further down but no define for
dnl it is visible in this view - presumably it is set alongside these names.
dnl The role of MULFUNC_PROLOGUE is defined outside this file.  Confirm both.
48 ifdef(`OPERATION_addmul_1', `
50 define(M4_function_1, mpn_addmul_1)
51 define(M4_function_1c, mpn_addmul_1c)
52 define(M4_description, add it to)
53 define(M4_desc_retval, carry)
54 ',`ifdef(`OPERATION_submul_1', `
56 define(M4_function_1, mpn_submul_1)
57 define(M4_function_1c, mpn_submul_1c)
58 define(M4_description, subtract it from)
59 define(M4_desc_retval, borrow)
60 ',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
63 MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
66 C mp_limb_t M4_function_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
68 C mp_limb_t M4_function_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
69 C mp_limb_t mult, mp_limb_t carry);
71 C Calculate src,size multiplied by mult and M4_description dst,size.
72 C Return the M4_desc_retval limb from the top of the result.
74 C This code is pretty much the same as the K6 code. The unrolled loop is
75 C the same, but there are just a few scheduling tweaks in the setups and the
78 C A number of variations have been tried for the unrolled loop, with one or
79 C two carries, and with loads scheduled earlier, but nothing faster than 6
80 C cycles/limb has been found.
dnl UNROLL_THRESHOLD is the value the size argument is compared against at
dnl function entry (see the cmpl on PARAM_SIZE).
dnl NOTE(review): it is set to 5 twice here.  Presumably the two deflits sit
dnl in separate branches of a conditional that is not visible in this view -
dnl confirm before removing either one.
83 deflit(UNROLL_THRESHOLD, 5)
85 deflit(UNROLL_THRESHOLD, 5)
dnl Names for the argument slots.  The offsets step by 4 in the same order as
dnl the arguments of the C prototype above: dst, src, size, mult, carry.
dnl PARAM_CARRY is read at the M4_function_1c entry point; the plain
dnl M4_function_1 entry point clears the carry register instead.
dnl NOTE(review): defframe itself is defined outside this file - presumably
dnl the number is a byte offset from the stack pointer at entry; confirm.
88 defframe(PARAM_CARRY, 20)
89 defframe(PARAM_MULTIPLIER,16)
90 defframe(PARAM_SIZE, 12)
91 defframe(PARAM_SRC, 8)
92 defframe(PARAM_DST, 4)
C Entry points.  M4_function_1c loads the caller supplied carry into ebx,
C while M4_function_1 starts with ebx zero.  The size is then fetched and
C compared with UNROLL_THRESHOLD, and the multiplier is loaded into ebp.
C NOTE(review): the jump joining the _1c entry to the shared code, the branch
C taken on the size comparison and the simple loop around the final
C instruction are not visible in this view.
97 PROLOGUE(M4_function_1c)
100 movl PARAM_CARRY, %ebx C ebx = initial carry argument
104 PROLOGUE(M4_function_1)
107 xorl %ebx, %ebx C initial carry
110 movl PARAM_SIZE, %ecx C ecx = size
121 cmpl $UNROLL_THRESHOLD, %ecx C size against the unroll threshold
123 movl PARAM_MULTIPLIER, %ebp C ebp = mult
128 C this is offset 0x22, so close enough to aligned
146 M4_inst %eax, -4(%edi) C apply M4_inst (presumably addl or subl - TODO confirm) with eax to the limb at edi-4
167 C------------------------------------------------------------------------------
168 C VAR_JUMP holds the computed jump temporarily because there's not enough
169 C registers when doing the mul for the initial two carry limbs.
171 C The add/adc for the initial carry in %ebx is necessary only for the
172 C mpn_add/submul_1c entry points. Duplicating the startup code to
173 C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good
176 dnl overlapping with parameters already fetched
dnl VAR_COUNTER is an alias for the PARAM_SIZE slot and VAR_JUMP for the
dnl PARAM_DST slot.  VAR_COUNTER holds the unrolled loop counter and VAR_JUMP
dnl holds the computed jump temporarily.
177 define(VAR_COUNTER,`PARAM_SIZE')
178 define(VAR_JUMP, `PARAM_DST')
C Setup for the unrolled loop: derive the loop counter, bias the src and dst
C pointers, and arrange the initial carry limbs in ebx and ecx.
C NOTE(review): several instructions of this sequence are not visible in
C this view, so the comments below describe only what each shown line does.
180 C this is offset 0x43, so close enough to aligned
196 shrl $UNROLL_LOG2, %edx C edx becomes the loop counter stored below
197 andl $UNROLL_MASK, %ecx C ecx masked with UNROLL_MASK (presumably size modulo the unroll count - TODO confirm)
199 movl %edx, VAR_COUNTER C save counter in the reused size slot
202 C 15 code bytes per limb
210 leal L(entry) (%edx,%ecx,1), %edx C presumably the computed jump target kept in VAR_JUMP - TODO confirm
212 movl (%esi), %eax C src low limb
215 leal ifelse(UNROLL_BYTES,256,128+) 4(%esi,%ecx,4), %esi C adjust src; extra 128 when UNROLL_BYTES is 256, matching the -128 in the loop displacements
219 addl %ebx, %eax C initial carry (from _1c)
222 movl %edx, %ebx C high carry
223 leal ifelse(UNROLL_BYTES,256,128) (%edi,%ecx,4), %edi C adjust dst with the same 128 bias
227 movl %eax, %ecx C low carry
229 cmovnz( %ebx, %ecx) C high,low carry other way around
C NOTE(review): the next two instructions look like an alternative way of
C forming the entry address relative to L(here); the conditional selecting
C between the two forms is not visible in this view.
240 C See mpn/x86/README about old gas bugs
241 leal (%edx,%ecx,1), %edx
242 addl $L(entry)-L(here), %edx C add the distance from L(here) to L(entry)
250 C -----------------------------------------------------------
C Unrolled loop.  The forloop below emits UNROLL_COUNT/CHUNK_COUNT copies of
C its body, each copy handling CHUNK_COUNT limbs at displacements disp0 and
C disp1.  The displacements carry a -128 bias when UNROLL_BYTES is 256.
C ecx and ebx hold the carry limbs applied to dst through M4_inst.
C NOTE(review): the multiply and carry propagation instructions, the loop
C label and the loop branch are not visible in this view.
262 C VAR_COUNTER loop counter
264 C 15 code bytes per limb
266 addl $UNROLL_BYTES, %edi C advance dst pointer by UNROLL_BYTES
269 deflit(CHUNK_COUNT,2)
270 forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
271 deflit(`disp0', eval(i*4*CHUNK_COUNT ifelse(UNROLL_BYTES,256,-128)))
272 deflit(`disp1', eval(disp0 + 4))
274 Zdisp( movl, disp0,(%esi), %eax) C load src limb at disp0
276 Zdisp( M4_inst,%ecx, disp0,(%edi)) C apply ecx to dst limb at disp0
281 movl disp1(%esi), %eax C load src limb at disp1
283 M4_inst %ebx, disp1(%edi) C apply ebx to dst limb at disp1
290 leal UNROLL_BYTES(%esi), %esi C advance src pointer by UNROLL_BYTES; leal leaves the flags unchanged
295 deflit(`disp0', eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128)))
297 M4_inst %ecx, disp0(%edi) C apply ecx to dst at the redefined disp0