1 dnl AMD64 mpn_sqr_basecase.
3 dnl Contributed to the GNU project by Torbjorn Granlund.
5 dnl Copyright 2008, 2009 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of the GNU Lesser General Public License as published
11 dnl by the Free Software Foundation; either version 3 of the License, or (at
12 dnl your option) any later version.
14 dnl The GNU MP Library is distributed in the hope that it will be useful, but
15 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
17 dnl License for more details.
19 dnl You should have received a copy of the GNU Lesser General Public License
20 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
22 include(`../config.m4')
24 C The inner loops of this code are the result of running a code generation and
25 C optimization tool suite written by David Harvey and Torbjorn Granlund.
28 C * This code only handles operands up to SQR_TOOM2_THRESHOLD_MAX. That
29 C means we can safely use 32-bit operations for all sizes, unlike in code that must handle arbitrarily large operands.
31 C * The jump table could probably be optimized, at least for non-pic.
32 C * The special code for n=1,2,3 was quickly written. It is probably too
33 C large and unnecessarily slow.
34 C * Consider combining small cases code so that the n=k-1 code jumps into
35 C the middle of the n=k code.
36 C * Avoid saving registers for small cases code.
39 C i r8 work left, initially n
40 C j r9 inner loop count
58 define(`n_param', `%rdx')
60 C We should really trim this stack allocation, for better spatial locality.
61 C Alternatively, we could grab the upper part of the stack area, leaving the
62 C lower part (instead of the upper part) unused.
63 deflit(SQR_TOOM2_THRESHOLD_MAX, 80)
64 define(`STACK_ALLOC', eval(8*2*SQR_TOOM2_THRESHOLD_MAX))
82 PROLOGUE(mpn_sqr_basecase)
90 mov R32(n_param), R32(n) C free original n register (rdx)
91 mov R32(n_param), R32(%rcx)
96 lea L(jmptab)(%rip), %rax
163 xor R32(%r10), R32(%r10)
170 xor R32(%r11), R32(%r11)
212 xor R32(%r10), R32(%r10)
217 xor R32(%r11), R32(%r11)
223 xor R32(%r12), R32(%r12)
234 xor R32(%rbp), R32(%rbp)
261 L(0m4): add $-STACK_ALLOC, %rsp
262 lea -24(%rsp,n,8), tp C point tp in middle of result operand
265 lea (up,n,8), up C point up at end of input operand
268 C Function mpn_mul_1_m3(tp, up - i, i, up[-i - 1])
298 L(L3): xor R32(w1), R32(w1)
314 lea eval(2*8)(tp), tp C tp += 2
319 L(1m4): add $-STACK_ALLOC, %rsp
320 lea (%rsp,n,8), tp C point tp in middle of result operand
323 lea 8(up,n,8), up C point up at end of input operand
326 C Function mpn_mul_2s_m0(tp, up - i, i, up - i - 1)
342 mov -24(up,j,8), %rax
346 mov -24(up,j,8), %rax
353 L(m0): mov -16(up,j,8), %rax C u2, u6 ...
358 mov -16(up,j,8), %rax
376 L(m2x): mov (up,j,8), %rax
382 mov -32(up,j,8), %rax
393 lea eval(3*8-24)(tp), tp C tp += 3
397 L(2m4): add $-STACK_ALLOC, %rsp
398 lea -24(%rsp,n,8), tp C point tp in middle of result operand
401 lea (up,n,8), up C point up at end of input operand
404 C Function mpn_mul_1_m1(tp, up - (i - 1), i - 1, up[-i])
420 L(L1): xor R32(w0), R32(w0)
449 lea eval(2*8)(tp), tp C tp += 2
454 L(3m4): add $-STACK_ALLOC, %rsp
455 lea (%rsp,n,8), tp C point tp in middle of result operand
458 lea 8(up,n,8), up C point up at end of input operand
461 C Function mpn_mul_2s_m2(tp, up - i + 1, i - 1, up - i)
478 mov -24(up,j,8), %rax
482 mov -24(up,j,8), %rax
489 mov -16(up,j,8), %rax
494 mov -16(up,j,8), %rax
512 L(m2): mov (up,j,8), %rax
518 mov -32(up,j,8), %rax
532 C Function mpn_addmul_2s_m2(tp, up - (i - 1), i - 1, up - i)
586 L(am2): mov 32(up,j,8), %rax
594 js L(addmul_2_m2_top)
602 lea eval(2*8)(tp), tp C tp += 2
604 add $-2, R32(i) C i -= 2
607 C Function mpn_addmul_2s_m0(tp, up - (i - 1), i - 1, up - i)
637 L(20): mov 16(up,j,8), %rax
668 js L(addmul_2_m0_top)
676 lea eval(2*8)(tp), tp C tp += 2
679 add $-2, R32(i) C i -= 2
682 C Function mpn_addmul_2s_2
704 C Function mpn_sqr_diag_addlsh1
718 L(evn): add %r11, %r11
719 sbb R32(%rbx), R32(%rbx) C save CF
724 L(odd): add %r11, %r11
725 sbb R32(%rbp), R32(%rbp) C save CF
732 L(top): mov (up,j,4), %rax
734 add R32(%rbp), R32(%rbp) C restore carry
738 L(d0): mov %r11, 8(rp,j,8)
744 sbb R32(%rbp), R32(%rbp) C save CF
747 add R32(%rbx), R32(%rbx) C restore carry
751 L(d1): mov %r11, 24(rp,j,8)
756 sbb R32(%rbx), R32(%rbx) C save CF
762 add R32(%rbp), R32(%rbp) C restore carry
769 sbb R32(%rbp), R32(%rbp) C save CF
773 add R32(%rbx), R32(%rbx) C restore carry
779 add $eval(8+STACK_ALLOC), %rsp