1 dnl AMD K6-2 mpn_lshift -- mpn left shift.
3 dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of either:
10 dnl * the GNU Lesser General Public License as published by the Free
11 dnl Software Foundation; either version 3 of the License, or (at your
12 dnl option) any later version.
16 dnl * the GNU General Public License as published by the Free Software
17 dnl Foundation; either version 2 of the License, or (at your option) any later version.
20 dnl or both in parallel, as here.
22 dnl The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
27 dnl You should have received copies of the GNU General Public License and the
28 dnl GNU Lesser General Public License along with the GNU MP Library. If not,
29 dnl see https://www.gnu.org/licenses/.
31 include(`../config.m4')
34 C K6-2: 1.75 cycles/limb
37 C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size, unsigned shift);
41 defframe(PARAM_SHIFT,16)
42 defframe(PARAM_SIZE, 12)
43 defframe(PARAM_SRC, 8)
44 defframe(PARAM_DST, 4)
47 dnl used after src has been fetched
48 define(VAR_RETVAL,`PARAM_SRC')
50 dnl minimum 9, because unrolled loop can't handle less
51 deflit(UNROLL_THRESHOLD, 9)
59 C The 1 limb case can be done without the push %ebx, but it's then
60 C still the same speed. The push is left as a free helping hand for
61 C the two_or_more code.
64 pushl %ebx FRAME_pushl()
69 movl PARAM_SHIFT, %ecx
72 movl (%ebx), %edx C src limb
75 shldl( %cl, %edx, %eax) C return value
79 movl %edx, (%ebx) C dst limb
85 C -----------------------------------------------------------------------------
86 ALIGN(16) C avoid offset 0x1f
93 movl (%ebx,%eax,4), %edx C src high limb
96 movd PARAM_SHIFT, %mm6
97 addl $32, %ecx C 32-shift
100 cmpl $UNROLL_THRESHOLD-1, %eax
102 movl %edx, VAR_RETVAL
114 C ecx counter, size-1 to 1
121 movq -4(%ebx,%ecx,4), %mm0
125 Zdisp( movd, %mm0, 0,(%eax,%ecx,4))
141 C -----------------------------------------------------------------------------
147 C edx retval (but instead VAR_RETVAL is used)
155 subl $7, %eax C size-8
157 leal (%edx,%eax,4), %ecx C alignment of dst
159 movq 32-8(%ebx,%eax,4), %mm2 C src high qword
168 movd %mm2, 32(%edx,%eax,4) C dst high limb
169 movq 32-8(%ebx,%eax,4), %mm2 C new src high qword
172 movq 32-16(%ebx,%eax,4), %mm0 C src second highest qword
175 C This loop is the important bit, the rest is just support for it.
176 C Four src limbs are held at the start, and four more will be read.
177 C Four dst limbs will be written. This schedule seems necessary for full speed.
180 C The use of size-8 lets the loop stop when %eax goes negative and
181 C leaves -4 to -1 which can be tested with test $1 and $2.
184 C eax counter, size-8 step by -4 until <0
202 movq 24(%ebx,%eax,4), %mm0
205 movq %mm2, 40(%edx,%eax,4)
211 movq 16(%ebx,%eax,4), %mm0
213 movq %mm1, 32(%edx,%eax,4)
217 C Now have four limbs in mm2 (prev) and mm0 (next), plus eax mod 4.
219 C 8(%ebx) is the next source, and 24(%edx) is the next destination.
220 C %eax is between -4 and -1, representing respectively 0 to 3 extra
221 C limbs that must be read.
224 testl $2, %eax C testl to avoid bad cache line crossing
227 C Two more limbs: lshift mm2, OR it with rshifted mm0, mm0 becomes
228 C new mm2 and a new mm0 is loaded.
237 movq 16(%ebx,%eax,4), %mm0
239 movq %mm2, 32(%edx,%eax,4)
244 C lshift mm2, OR with rshifted mm0, mm1 becomes lshifted mm0
255 movq %mm2, 24(%edx,%eax,4)
259 C Size is odd, so mm1 and one extra limb to process.
261 movd (%ebx), %mm0 C src[0]
273 movq %mm1, 4(%edx) C dst[1,2]
274 movd %mm2, (%edx) C dst[0]
276 movl VAR_RETVAL, %eax
282 nop C avoid bad cache line crossing
285 C Size is even, so only mm1 left to process.
287 movq %mm1, (%edx) C dst[0,1]
288 movl VAR_RETVAL, %eax