1 dnl AMD64 mpn_lshiftc optimised for CPUs with fast SSE.
3 dnl Contributed to the GNU project by David Harvey and Torbjorn Granlund.
5 dnl Copyright 2010, 2011, 2012 Free Software Foundation, Inc.
7 dnl This file is part of the GNU MP Library.
9 dnl The GNU MP Library is free software; you can redistribute it and/or modify
10 dnl it under the terms of the GNU Lesser General Public License as published
11 dnl by the Free Software Foundation; either version 3 of the License, or (at
12 dnl your option) any later version.
14 dnl The GNU MP Library is distributed in the hope that it will be useful, but
15 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
16 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
17 dnl License for more details.
19 dnl You should have received a copy of the GNU Lesser General Public License
20 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
22 include(`../config.m4')
25 C               cycles/limb          cycles/limb          good
26 C               16-byte aligned      16-byte unaligned    for cpu?
28 C AMD K10       1.85 (1.635)         1.9  (1.67)          Y
29 C AMD bd1       1.82 (1.75)          1.82 (1.75)          Y
31 C Intel P4      3.6  (3.125)         3.6  (3.125)         Y
32 C Intel core2   2.05 (1.67)          2.55 (1.75)
33 C Intel NHM     2.05 (1.875)         2.6  (2.25)
34 C Intel SBR     1.55 (1.44)          2    (1.57)          Y
36 C VIA nano      2.5  (2.5)           2.5  (2.5)           Y
38 C We try to do as many 16-byte operations as possible. The top-most and
39 C bottom-most writes might need 8-byte operations. All other writes use
40 C 16-byte operations; we read with both 8-byte and 16-byte operations.
42 C There are two inner loops: one for when rp = ap (mod 16) and one for when it is
43 C not. Per iteration, the aligned loop reads 16+8 bytes and the unaligned loop
44 C reads 16+8+X bytes, where X is 8 or 16 depending on how punpcklqdq is implemented.
46 C This is not yet great code:
47 C (1) The unaligned case makes too many reads.
48 C (2) We should do some unrolling, at least 2-way.
49 C With 2-way unrolling but no scheduling we reach 1.5 c/l on K10 and 2 c/l on
64 sub R32(%rcx), R32(%rax)
71 pcmpeqb %xmm7, %xmm7 C set to 111...111
76 lea (rp,n,8), R32(%rcx)
80 C Do one initial limb in order to make rp aligned
81 movq -8(ap,n,8), %xmm0
82 movq -16(ap,n,8), %xmm1
87 movq %xmm0, -8(rp,n,8)
91 lea (ap,n,8), R32(%rcx)
95 C *****************************************************************************
97 C Handle the case when ap != rp (mod 16).
100 L(utop):movq (ap,n,8), %xmm1
101 punpcklqdq 8(ap,n,8), %xmm1
102 movdqa -8(ap,n,8), %xmm0
107 movdqa %xmm0, (rp,n,8)
115 punpcklqdq %xmm1, %xmm0
116 punpcklqdq 8(ap), %xmm1
123 C *****************************************************************************
125 C Handle the case when ap = rp (mod 16).
128 L(atop):movdqa (ap,n,8), %xmm0 C xmm0 = B*ap[n-1] + ap[n-2]
129 movq -8(ap,n,8), %xmm1 C xmm1 = ap[n-3]
130 punpcklqdq %xmm0, %xmm1 C xmm1 = B*ap[n-2] + ap[n-3]
135 movdqa %xmm0, (rp,n,8)
143 punpcklqdq %xmm0, %xmm1
150 C *****************************************************************************
163 L(end8):movq (ap), %xmm0