1 dnl AMD64 mpn_sqr_basecase optimised for AMD bobcat.
3 dnl Copyright 2003-2005, 2007, 2008, 2011, 2012 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any
20 dnl or both in parallel, as here.
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
45 C This sqr_basecase is based on mul_1 and addmul_1, since these both run at the
46 C multiply insn bandwidth, without any apparent loop branch exit pipeline
47 C replays experienced on K8. The structure is unusual: it falls into mul_1 in
48 C the same way for all n, then it splits into 4 different wind-down blocks and
49 C 4 separate addmul_1 loops.
51 C We have not tried using the same addmul_1 loops with a switch into feed-in
52 C code, as we do in other basecase implementations. Doing that could save
53 C substantial code volume, but would also probably add some overhead.
57 C * Perhaps implement a larger final corner (it is now 2 x 1).
58 C * Lots of space could be saved by replacing the "switch" code by gradual
59 C jumps out from mul_1 winddown code, perhaps with no added overhead.
60 C * Are the ALIGN(16) really necessary? They add about 25 bytes of padding.
C un_param is the m4 name for the register %rdx.  The visible code compares
C its 32-bit part against a small constant, uses it as an index scaled by 8
C when adjusting rp and up, and pushes it on the stack for later use.
C NOTE(review): presumably this is the incoming limb count argument; confirm
C against the full parameter list, which is not visible in this excerpt.
68 define(`un_param', `%rdx')
69 C Standard allocations
78 C Temp macro X(a,b) for allowing control over indexing: a is the operand
79 C form indexed by n or un, b is a fixed-displacement form.  Define to return $1 for more conservative ptr handling.
87 PROLOGUE(mpn_sqr_basecase)
117 mov %rdx, w0 C CAUTION: r8 realloc
130 L(g2): cmp $3, R32(un_param)
179 lea -24(rp,un_param,8), rp
180 lea -24(up,un_param,8), up
182 push un_param C for sqr_diag_addlsh1
192 L(top): mov w0, -16(rp,n,8)
209 L(L3): mov 16(up,n,8), %rax
234 L(r1): mov X((up,n,8),8(up)), %rax
238 mov w2, X(-8(rp,n,8),(rp))
241 mov X(8(up,n,8),16(up)), %rax
245 mov w0, X((rp,n,8),8(rp))
248 mov w2, X(8(rp,n,8),16(rp))
249 mov w3, X(16(rp,n,8),24(rp))
253 L(r2): mov X((up,n,8),(up)), %rax
257 mov w2, X(-8(rp,n,8),-8(rp))
260 mov X(8(up,n,8),8(up)), %rax
264 mov w0, X((rp,n,8),(rp))
267 mov X(16(up,n,8),16(up)), %rax
271 mov w2, X(8(rp,n,8),8(rp))
274 mov w0, X(16(rp,n,8),16(rp))
276 mov w1, X(24(rp,n,8),24(rp))
280 L(r3): mov w2, X(-8(rp,n,8),16(rp))
281 mov w3, X((rp,n,8),24(rp))
285 L(r0): mov X((up,n,8),16(up)), %rax
289 mov w2, X(-8(rp,n,8),8(rp))
292 mov w0, X((rp,n,8),16(rp))
293 mov w1, X(8(rp,n,8),24(rp))
296 C fall through into main loop
310 L(ta3): add w0, -16(rp,n,8)
327 L(al3): mov 16(up,n,8), %rax
341 add w0, X(-16(rp,n,8),8(rp))
344 add w2, X(-8(rp,n,8),16(rp))
346 mov w3, X((rp,n,8),24(rp))
362 L(ta2): add w0, -16(rp,n,8)
386 L(al2): mov 24(up,n,8), %rax
393 add w0, X(-16(rp,n,8),8(rp))
396 add w2, X(-8(rp,n,8),16(rp))
398 mov w3, X((rp,n,8),24(rp))
403 mov -8(up,un,8), %rax
411 L(ta1): add w0, -16(rp,n,8)
414 L(al1): mov (up,n,8), %rax
442 add w0, X(-16(rp,n,8),8(rp))
445 add w2, X(-8(rp,n,8),16(rp))
447 mov w3, X((rp,n,8),24(rp))
460 L(ta0): add w0, -16(rp,n,8)
470 L(al0): mov 8(up,n,8), %rax
491 add w0, X(-16(rp,n,8),8(rp))
494 add w2, X(-8(rp,n,8),16(rp))
496 mov w3, X((rp,n,8),24(rp))
500 L(end): mov X(8(up,un,8),(up)), v0
501 mov X(16(up,un,8),8(up)), %rax
505 mov X(24(up,un,8),16(up)), %rax
509 add w0, X(24(rp,un,8),16(rp))
512 add w2, X(32(rp,un,8),24(rp))
514 mov X(16(up,un,8),8(up)), v0
515 mov X(24(up,un,8),16(up)), %rax
518 mov w3, X(40(rp,un,8),32(rp))
520 mov %rdx, X(48(rp,un,8),40(rp))
544 L(lm): mov 16(rp,n,8), w1
556 mov w0, X(-8(rp,n,8),-8(rp))
557 mov w1, X((rp,n,8),(rp))
559 mov %rdx, X(8(rp,n,8),8(rp))