1 dnl AMD64 mpn_mul_basecase optimised for Intel Haswell.
3 dnl Contributed to the GNU project by Torbjörn Granlund.
5 dnl Copyright 2003-2005, 2007, 2008, 2011-2013 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of either:
12 dnl * the GNU Lesser General Public License as published by the Free
13 dnl Software Foundation; either version 3 of the License, or (at your
14 dnl option) any later version.
18 dnl * the GNU General Public License as published by the Free Software
19 dnl Foundation; either version 2 of the License, or (at your option) any later version.
22 dnl or both in parallel, as here.
24 dnl The GNU MP Library is distributed in the hope that it will be useful, but
25 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
26 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
29 dnl You should have received copies of the GNU General Public License and the
30 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
31 dnl see https://www.gnu.org/licenses/.
33 include(`../config.m4')
35 C cycles/limb mul_1 mul_2 mul_3 addmul_2
36 C AMD K8,K9 n/a n/a - n/a
37 C AMD K10 n/a n/a - n/a
38 C AMD bull n/a n/a - n/a
39 C AMD pile n/a n/a - n/a
41 C AMD bobcat n/a n/a - n/a
43 C Intel P4 n/a n/a - n/a
44 C Intel core n/a n/a - n/a
45 C Intel NHM n/a n/a - n/a
46 C Intel SBR n/a n/a - n/a
47 C Intel IBR n/a n/a - n/a
48 C Intel HWL 1.77 1.86 - 2.15
50 C Intel atom n/a n/a - n/a
51 C VIA nano n/a n/a - n/a
53 C The inner loops of this code are the result of running a code generation and
54 C optimisation tool suite written by David Harvey and Torbjörn Granlund.
58 C * Further micro-optimise.
62 define(`un_param',`%rdx')
C Entry point of mpn_mul_basecase.  The comments added below describe only
C what each commented instruction does by itself.  The symbolic names un, n,
C up, rp, vn, v0, v1, w0-w5, X0 and X1 are m4 macros whose register
C assignments are made elsewhere in the file.
81 PROLOGUE(mpn_mul_basecase)
C Emitted only inside IFDOS: load a 32-bit value from 56(%rsp) into r8d.
C NOTE(review): presumably the fifth argument, which the Windows x64 ABI
C passes on the stack - confirm against the argument defines.
83 IFDOS(` mov 56(%rsp), %r8d ')
C un_param is rdx, which the mulx instruction reads as an implicit source
C operand, so the value is copied out of rdx before any multiply is issued.
89 mov un_param, un C free up rdx
C n = un_param >> 2, using an arithmetic right shift.
92 mov un_param, n C FIXME: share
93 sar $2, n C FIXME: share
C NOTE(review): the m1 prefixed labels presumably belong to the mul_1 code
C listed in the cycle table at the top of the file - confirm.
C Test bit 1 of un through its R8 form.
107 L(m1x0):test $2, R8(un)
112 mulx( 8,(up), w1, w3)
118 mulx( 8,(up), w1, w5)
122 mulx( 16,(up), w0, w2)
C Same bit 1 test as at m1x0.
126 L(m1x1):test $2, R8(un)
134 mulx( 8,(up), w0, w2)
140 mulx( 8,(up), w0, w4)
141 mulx( 16,(up), w1, w5)
C Advance rp by 32 bytes.
151 L(m1tp):lea 32(rp), rp
C Store w3 at byte offset 8 from rp.
154 L(m1l2):mov w3, 8(rp)
158 mulx( 8,(up), w1, w3)
C Store w5 at byte offset 24 from rp.
159 L(m1l0):mov w5, 24(rp)
160 mulx( 16,(up), w0, w4)
162 mulx( 24,(up), w1, w5)
C Advance rp by 32 bytes.
168 L(m1ed):lea 32(rp), rp
170 L(cj2): mov w3, 8(rp)
172 L(cj1): mov w4, 16(rp)
C NOTE(review): the m2 prefixed labels presumably belong to the mul_2 code
C listed in the cycle table - confirm.  The three lea instructions below
C move rp back by 16, 24 and 8 bytes respectively.
207 L(m210):lea -16(rp), rp
217 L(m201):lea -24(rp), rp
221 L(m211):lea -8(rp), rp
226 L(m2tp):mulx( v1, %rax, w0)
244 L(m2l3):mov w0, 8(rp)
254 L(m2l2):mov w2, 16(rp)
265 L(m2l1):mov w0, 24(rp)
271 L(m2ed):mulx( v1, %rdx, %rax)
C Push vn onto the stack and then redefine the m4 name so that every later
C use of vn expands to the memory operand (%rsp), i.e. the slot just pushed.
C The slot is released by the pop at the end of the visible code.
287 push vn C save vn in new stack slot
288 define(`vn', `(%rsp)')
C Advance up by 16 bytes.
320 L(b00): lea 16(up), up
C Load the 8 bytes at offset 16 from rp into X0.
324 L(b10): mov 16(rp), X0
C Load the 8 bytes at offset 16 from rp into X1.
343 L(b01): mov 16(rp), X1
C Advance rp by 8 bytes.
348 L(b11): lea 8(rp), rp
C NOTE(review): top, lo0 and end each begin with a mulx that has v0 as its
C first operand, and end is followed by one that has v1.  This is
C presumably the addmul_2 loop from the cycle table - confirm.
353 L(top): mulx( v0, %rax, w3)
381 L(lo0): mulx( v0, %rax, w3)
410 L(end): mulx( v0, %rax, w3)
415 mulx( v1, %rdx, %rax)
C Pop the 8-byte slot created by the push of vn; the popped value lands in
C rax.
432 pop %rax C deallocate vn slot