1 dnl AMD K6 mpn_addmul_1/mpn_submul_1 -- add or subtract mpn multiple.
3 dnl Copyright 1999, 2000, 2001, 2002, 2003, 2005 Free Software Foundation,
6 dnl This file is part of the GNU MP Library.
8 dnl The GNU MP Library is free software; you can redistribute it and/or
9 dnl modify it under the terms of the GNU Lesser General Public License as
10 dnl published by the Free Software Foundation; either version 3 of the
11 dnl License, or (at your option) any later version.
13 dnl The GNU MP Library is distributed in the hope that it will be useful,
14 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
15 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 dnl Lesser General Public License for more details.
18 dnl You should have received a copy of the GNU Lesser General Public License
19 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
21 include(`../config.m4')
26 C P6 model 0-8,10-12) 5.94
28 C P6 model 13 (Dothan) 5.57
29 C P4 model 0 (Willamette)
31 C P4 model 2 (Northwood)
32 C P4 model 3 (Prescott)
34 C K6: 7.65-8.5 (data dependent)
39 dnl K6: large multipliers small multipliers
40 dnl UNROLL_COUNT cycles/limb cycles/limb
46 dnl Maximum possible unrolling with the current code is 32.
48 dnl Unrolling to 16 limbs/loop makes the unrolled loop fit exactly in a 256
49 dnl byte block, which might explain the good speed at that unrolling.
51 deflit(UNROLL_COUNT, 16)
54 ifdef(`OPERATION_addmul_1', `
56 define(M4_function_1, mpn_addmul_1)
57 define(M4_function_1c, mpn_addmul_1c)
58 ',`ifdef(`OPERATION_submul_1', `
60 define(M4_function_1, mpn_submul_1)
61 define(M4_function_1c, mpn_submul_1c)
62 ',`m4_error(`Need OPERATION_addmul_1 or OPERATION_submul_1
65 MULFUNC_PROLOGUE(mpn_addmul_1 mpn_addmul_1c mpn_submul_1 mpn_submul_1c)
68 C mp_limb_t mpn_addmul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
70 C mp_limb_t mpn_addmul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
71 C mp_limb_t mult, mp_limb_t carry);
72 C mp_limb_t mpn_submul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
74 C mp_limb_t mpn_submul_1c (mp_ptr dst, mp_srcptr src, mp_size_t size,
75 C mp_limb_t mult, mp_limb_t carry);
77 C The jadcl0()s in the unrolled loop makes the speed data dependent. Small
78 C multipliers (most significant few bits clear) result in few carry bits and
79 C speeds up to 7.65 cycles/limb are attained. Large multipliers (most
80 C significant few bits set) make the carry bits 50/50 and lead to something
81 C more like 8.4 c/l. With adcl's both of these would be 9.3 c/l.
83 C It's important that the gains for jadcl0 on small multipliers don't come
84 C at the cost of slowing down other data. Tests on uniformly distributed
85 C random data, designed to confound branch prediction, show about a 7%
86 C speed-up using jadcl0 over adcl (8.93 versus 9.57 cycles/limb, with all
87 C overheads included).
89 C In the simple loop, jadcl0() measures slower than adcl (11.9-14.7 versus
90 C 11.0 cycles/limb), and hence isn't used.
92 C In the simple loop, note that running ecx from negative to zero and using
93 C it as an index in the two movs wouldn't help. It would save one
94 C instruction (2*addl+loop becoming incl+jnz), but there's nothing unpaired
95 C that would be collapsed by this.
97 C Attempts at a simpler main loop, with less unrolling, haven't yielded much
98 C success, generally running over 9 c/l.
104 C jadcl0() being faster than adcl $0 seems to be an artifact of two things,
105 C firstly the instruction decoding and secondly the fact that there's a
106 C carry bit for the jadcl0 only on average about 1/4 of the time.
108 C The code in the unrolled loop decodes something like the following.
112 C M4_inst %esi, disp(%edi) 1
114 C movl %edx, %esi \ 1
117 C 1: movl disp(%ebx), %eax /
121 C In a back-to-back style test this measures 7 with the jnc not taken, or 8
122 C with it taken (both when correctly predicted). This is opposite to the
123 C measurements showing small multipliers running faster than large ones.
124 C Don't really know why.
126 C It's not clear how much branch misprediction might be costing. The K6
127 C doco says it will be 1 to 4 cycles, but presumably it's near the low end
128 C of that range to get the measured results.
131 C In the code the two carries are more or less the preceding mul product and
132 C the calculation is roughly
136 C where b=2^32 is the size of a limb, x*y is the two carry limbs, and u and
137 C v are the two limbs it's added to (being the low of the next mul, and a
138 C limb from the destination).
140 C To get a carry requires x*y+u*b+v >= b^2, which is u*b+v >= b^2-x*y, and
141 C there are b^2-(b^2-x*y) = x*y many such values, giving a probability of
142 C x*y/b^2. If x, y, u and v are random and uniformly distributed between 0
143 C and b-1, then the total probability can be summed over x and y,
145 C 1 b-1 b-1 x*y 1 b*(b-1) b*(b-1)
146 C --- * sum sum --- = --- * ------- * ------- = 1/4
147 C b^2 x=0 y=1 b^2 b^4 2 2
149 C Actually it's a very tiny bit less than 1/4 of course. If y is fixed,
150 C then the probability is 1/2*y/b thus varying linearly between 0 and 1/2.
154 deflit(UNROLL_THRESHOLD, 9)
156 deflit(UNROLL_THRESHOLD, 6)
159 defframe(PARAM_CARRY, 20)
160 defframe(PARAM_MULTIPLIER,16)
161 defframe(PARAM_SIZE, 12)
162 defframe(PARAM_SRC, 8)
163 defframe(PARAM_DST, 4)
168 PROLOGUE(M4_function_1c)
171 movl PARAM_CARRY, %esi
175 PROLOGUE(M4_function_1)
178 xorl %esi, %esi C initial carry
181 movl PARAM_SIZE, %ecx
189 cmpl $UNROLL_THRESHOLD, %ecx
199 movl PARAM_MULTIPLIER, %ebp
220 M4_inst %eax, -4(%edi)
239 C -----------------------------------------------------------------------------
240 C The unrolled loop uses a "two carry limbs" scheme. At the top of the loop
241 C the carries are ecx=lo, esi=hi, then they swap for each limb processed.
242 C For the computed jump an odd size means they start one way around, an even
245 C VAR_JUMP holds the computed jump temporarily because there's not enough
246 C registers at the point of doing the mul for the initial two carry limbs.
248 C The add/adc for the initial carry in %esi is necessary only for the
249 C mpn_addmul/submul_1c entry points. Duplicating the startup code to
250 C eliminate this for the plain mpn_add/submul_1 doesn't seem like a good
253 dnl overlapping with parameters already fetched
254 define(VAR_COUNTER, `PARAM_SIZE')
255 define(VAR_JUMP, `PARAM_DST')
272 shrl $UNROLL_LOG2, %edx
273 andl $UNROLL_MASK, %ecx
275 movl %edx, VAR_COUNTER
281 C 15 code bytes per limb
286 leal L(entry) (%edx,%ecx,1), %edx
288 movl (%ebx), %eax C src low limb
290 movl PARAM_MULTIPLIER, %ebp
295 addl %esi, %eax C initial carry (from _1c)
299 leal 4(%ebx,%ecx,4), %ebx
300 movl %edx, %esi C high carry
303 leal (%edi,%ecx,4), %edi
306 movl %eax, %ecx C low carry
309 movl %esi, %ecx C high,low carry other way around
319 C See mpn/x86/README about old gas bugs
320 leal (%edx,%ecx,1), %edx
321 addl $L(entry)-L(here), %edx
327 C -----------------------------------------------------------
339 C 15 code bytes per limb
341 leal UNROLL_BYTES(%edi), %edi
344 forloop(`i', 0, UNROLL_COUNT/2-1, `
345 deflit(`disp0', eval(2*i*4))
346 deflit(`disp1', eval(disp0 + 4))
348 Zdisp( movl, disp0,(%ebx), %eax)
350 Zdisp( M4_inst,%ecx, disp0,(%edi))
355 movl disp1(%ebx), %eax
357 M4_inst %esi, disp1(%edi)
365 leal UNROLL_BYTES(%ebx), %ebx
370 M4_inst %ecx, UNROLL_BYTES(%edi)