1 dnl AMD64 mpn_mul_basecase.
3 dnl Contributed to the GNU project by Torbjorn Granlund and David Harvey.
5 dnl Copyright 2008 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of the GNU Lesser General Public License as published
11 dnl by the Free Software Foundation; either version 3 of the License, or (at
12 dnl your option) any later version.
14 dnl The GNU MP Library is distributed in the hope that it will be useful, but
15 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
17 dnl License for more details.
19 dnl You should have received a copy of the GNU Lesser General Public License
20 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
22 include(`../config.m4')
30 C The inner loops of this code are the result of running a code generation and
31 C optimization tool suite written by David Harvey and Torbjorn Granlund.
34 C * Use fewer registers. (how??? I can't see it -- david)
35 C * Avoid some "mov $0,r" and instead use "xor r,r".
36 C * Can the top of each L(addmul_outer_n) prologue be folded into the
37 C mul_1/mul_2 prologues, saving a LEA (%rip)? It would slow down the
38 C case where vn = 1 or 2; is it worth it?
43 define(`un_param',`%rdx')
56 define(`outer_addr', `%r14')
62 PROLOGUE(mpn_mul_basecase)
74 sub un_param, un C rdx used by mul
76 mov R32(un_param), R32(w0)
78 lea (rp,un_param,8), rp
79 lea (up,un_param,8), up
86 C ===========================================================
87 C mul_1 for vp[0] if vn is odd
91 jz L(mul_1_prologue_0)
93 jc L(mul_1_prologue_1)
94 jz L(mul_1_prologue_2)
95 jmp L(mul_1_prologue_3)
99 mov %rdx, w3 C note: already w0 == 0
100 lea L(addmul_outer_0)(%rip), outer_addr
110 lea L(addmul_outer_1)(%rip), outer_addr
119 lea L(addmul_outer_2)(%rip), outer_addr
129 lea L(addmul_outer_3)(%rip), outer_addr
135 C This loop runs at 10 cycles/loop = 2.5 cycles/limb (c/l) on K8, for all up/rp alignments.
159 xor R32(w2), R32(w2) C zero
176 add $-1, vn C vn -= 1
182 lea 8(vp), vp C vp += 1
183 lea 8(rp), rp C rp += 1
187 C ===========================================================
188 C mul_2 for vp[0], vp[1] if vn is even
195 jz L(mul_2_prologue_0)
197 jz L(mul_2_prologue_2)
198 jc L(mul_2_prologue_1)
201 lea L(addmul_outer_3)(%rip), outer_addr
203 mov %rax, -16(rp,n,8)
207 mov -16(up,n,8), %rax
216 mov -24(up,n,8), %rax
217 lea L(addmul_outer_0)(%rip), outer_addr
225 lea L(addmul_outer_1)(%rip), outer_addr
231 lea L(addmul_outer_2)(%rip), outer_addr
239 C This loop runs at 18 cycles/loop = 2.25 cycles/limb (c/l) on K8, for all up/rp alignments.
243 mov -32(up,n,8), %rax
247 mov -24(up,n,8), %rax
251 mov -24(up,n,8), %rax
259 mov -16(up,n,8), %rax
264 mov -16(up,n,8), %rax
278 adc R32(w1), R32(w0) C adc $0, w0
294 mov -32(up,n,8), %rax
301 add $-2, vn C vn -= 2
307 lea 16(vp), vp C vp += 2
308 lea 16(rp), rp C rp += 2
313 C ===========================================================
314 C addmul_2 for the remaining vp limbs
316 C In the following prologues, un is reused to store the adjusted value
317 C of n; that value is reloaded on each iteration.
321 lea 0(%rip), outer_addr
324 mov -24(up,un,8), %rax
327 mov -24(up,un,8), %rax
330 jmp L(addmul_entry_0)
340 jmp L(addmul_entry_1)
344 lea 0(%rip), outer_addr
347 mov -8(up,un,8), %rax
353 mov -8(up,un,8), %rax
354 jmp L(addmul_entry_2)
358 lea 0(%rip), outer_addr
361 mov -16(up,un,8), %rax
365 mov -16(up,un,8), %rax
367 jmp L(addmul_entry_3)
369 C This loop runs at 19 cycles/loop = 2.375 cycles/limb (c/l) on K8, for all up/rp alignments.
375 mov -24(up,n,8), %rax
380 mov -24(up,n,8), %rax
382 adc R32(w2), R32(w2) C adc $0, w2
388 mov -16(up,n,8), %rax
392 mov -16(up,n,8), %rax
407 adc R32(w1), R32(w0) C adc $0, w0
430 add $-2, vn C vn -= 2
433 lea 16(rp), rp C rp += 2
434 lea 16(vp), vp C vp += 2