1 dnl Intel Pentium mpn_copyi -- copy limb vector, incrementing.
3 dnl Copyright 1996, 2001, 2002, 2006 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or
8 dnl modify it under the terms of the GNU Lesser General Public License as
9 dnl published by the Free Software Foundation; either version 3 of the
10 dnl License, or (at your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful,
13 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 dnl Lesser General Public License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 include(`../config.m4')
23 C P5: 1.25 cycles/limb
26 C void mpn_copyi (mp_ptr dst, mp_srcptr src, mp_size_t size);
28 C Destination prefetching is done to avoid repeated write-throughs on lines
C not already in the L1 cache.
31 C At least one of the src or dst pointers needs to be incremented rather than
32 C using indexing, so that there's somewhere to put the loop control without
33 C an AGI (address generation interlock). Incrementing one and not two lets
34 C us keep loop overhead to 2 cycles. Making the src pointer the incremented
35 C one avoids an AGI on the %ecx subtracts in the finishup code.
37 C The block of finishup code is almost as big as the main loop itself, which
38 C is unfortunate, but it's faster that way than with, say, rep movsl, by about
39 C 10 cycles for instance on P55.
41 C There's nothing to be gained from MMX on P55, since it can do only one
42 C movq load (or store) per cycle, so the throughput would be the same as the
43 C code here (and even then only if src and dst have the same alignment mod
C 8).
C Argument frame offsets handed to defframe, in the order of the prototype
C mpn_copyi (dst, src, size): dst at 4, src at 8, size at 12.
C NOTE(review): presumably byte offsets from the entry stack pointer --
C confirm against the defframe definition in the m4 support files.
46 defframe(PARAM_SIZE,12)
47 defframe(PARAM_SRC, 8)
48 defframe(PARAM_DST, 4)
C NOTE(review): the function entry, the argument loads, the loop labels and
C branches, and the loads into %ebx are not visible in this excerpt; the
C notes below describe only the instructions shown.
C
C Save %ebx and %esi before the copy code uses them (%esi as the src
C pointer, %ebx as store data).
58 pushl %ebx FRAME_pushl()
59 pushl %esi FRAME_pushl()
C Bias the dst pointer by the limb count so that dst can be addressed with
C the negative counter in %ecx. Assuming %edx=dst and %ecx=size at this
C point (TODO confirm, the loads are not shown), the result is &dst[size],
C which is what makes -28(%edx,%ecx,4) below land on dst[0] once %ecx is
C -size+7.
61 leal (%edx,%ecx,4), %edx C &dst[size], one past the last limb
62 xorl $-1, %ecx C -size-1
65 addl $8, %ecx C -size+7
C Load done for its cache-line fetch, as the comment says.
69 movl -28(%edx,%ecx,4), %eax C fetch destination cache line, dst[0]
75 C ecx counter, limbs, negative
77 C esi src, incrementing
C Load from dst purely to fetch its cache line; %eax is overwritten by the
C src load below.
81 movl (%edx,%ecx,4), %eax C fetch destination cache line
84 movl (%esi), %eax C read words pairwise
C Eight stores at displacements -60 to -32 in steps of 4, that is eight
C consecutive limbs, alternating %eax and %ebx as the data register.
86 movl %eax, -60(%edx,%ecx,4) C store words pairwise
87 movl %ebx, -56(%edx,%ecx,4)
91 movl %eax, -52(%edx,%ecx,4)
92 movl %ebx, -48(%edx,%ecx,4)
96 movl %eax, -44(%edx,%ecx,4)
97 movl %ebx, -40(%edx,%ecx,4)
101 movl %eax, -36(%edx,%ecx,4)
102 movl %ebx, -32(%edx,%ecx,4)
C Finishup code (see the header notes): handles the 0 to 7 limbs remaining,
C per the register notes below.
C NOTE(review): presumably each group of stores below is conditional on the
C remaining count -- the tests and branches are not shown in this excerpt.
109 C ecx 0 to 7, representing respectively 7 to 0 limbs remaining
111 C edx dst, next location to store
C Four limbs, displacements -12 to 0.
118 movl %eax, -12(%edx,%ecx,4)
119 movl %ebx, -8(%edx,%ecx,4)
123 movl %eax, -4(%edx,%ecx,4)
124 movl %ebx, (%edx,%ecx,4)
C Two limbs, displacements -4 and 0.
135 movl %eax, -4(%edx,%ecx,4)
136 movl %ebx, (%edx,%ecx,4)
C One limb.
145 movl %eax, -4(%edx,%ecx,4) C risk of cache bank clash here