1 dnl AMD K6-2 mpn_rshift -- mpn right shift.
3 dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or
8 dnl modify it under the terms of the GNU Lesser General Public License as
9 dnl published by the Free Software Foundation; either version 3 of the
10 dnl License, or (at your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful,
13 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 dnl Lesser General Public License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 include(`../config.m4')
23 C K6-2: 1.75 cycles/limb
26 C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
dnl Frame names for the four arguments of mpn_rshift. The offsets passed to
dnl defframe step by 4 in prototype order: dst 4, src 8, size 12, shift 16.
dnl NOTE(review): defframe is defined outside this file; presumably the
dnl offsets are relative to the stack pointer at function entry -- confirm.
30 defframe(PARAM_SHIFT,16)
31 defframe(PARAM_SIZE, 12)
32 defframe(PARAM_SRC, 8)
33 defframe(PARAM_DST, 4)
dnl UNROLL_THRESHOLD is used below as UNROLL_THRESHOLD-1 in a compare
dnl against %eax; presumably that compare selects the unrolled code.
36 dnl Minimum 9, because the unrolled loop can't handle less.
38 deflit(UNROLL_THRESHOLD, 9)
46 C The 1 limb case can be done without the push %ebx, but it's then
47 C still the same speed. The push is left as a free helping hand for
48 C the two_or_more code.
C %ebx is saved here because it serves below as the address register for
C both the src load and the dst store.
C NOTE(review): FRAME_pushl() is defined outside this file; presumably it
C records the push so that the PARAM_* names stay correct -- confirm.
51 pushl %ebx FRAME_pushl()
C %ecx = shift count, so that %cl can supply the count to the shift below.
56 movl PARAM_SHIFT, %ecx
59 movl (%ebx), %edx C src limb
C NOTE(review): shrdl( is a macro defined outside this file. Assuming it
C emits the x86 shrd instruction, %eax is shifted right by %cl and its
C vacated high bits are filled from the low bits of %edx, i.e. the bits
C that fall off the bottom of the src limb.
62 shrdl( %cl, %edx, %eax) C return value
66 movl %edx, (%ebx) C dst limb
72 C -----------------------------------------------------------------------------
73 ALIGN(16) C avoid offset 0x1f
80 movl (%ebx), %edx C src low limb
C %mm6 = shift count, loaded straight from the argument slot.
84 movd PARAM_SHIFT, %mm6
C UNROLL_THRESHOLD is 9, so this compares %eax with 8. The counter
C comments below imply %eax holds size-1 at this point.
87 cmpl $UNROLL_THRESHOLD-1, %eax
C Advance both pointers by 4*%eax bytes (the one in %ecx by 4 bytes less).
C %ebx is the load base and %ecx the store base in the loop below, which
C indexes them with the negative counter in %eax.
100 leal (%ebx,%eax,4), %ebx
102 leal -4(%ecx,%eax,4), %ecx
105 C This loop runs at about 3 cycles/limb, which is the amount of
106 C decoding, and this is despite every second access being unaligned.
109 C eax counter, -(size-1) to -1
C NOTE(review): Zdisp is defined outside this file; it is handed the
C mnemonic and operands with an explicit 0 displacement, presumably to
C force a particular instruction encoding -- confirm.
C Load 8 bytes (two 4-byte limbs) from the src side into %mm0.
117 Zdisp( movq, 0,(%ebx,%eax,4), %mm0)
C Store only the low 4 bytes of %mm0, i.e. one limb, to the dst side.
122 Zdisp( movd, %mm0, 0,(%ecx,%eax,4))
135 C -----------------------------------------------------------------------------
C Unrolled code. Going by the comments on the instructions below, %eax
C holds size-1 on arrival, %ebx is the src pointer and %ecx the dst pointer.
146 subl $7, %eax C size-8
151 movq (%ebx), %mm2 C src low qword
C Both pointers are moved 4*(size-8) bytes forward, which is 32 bytes
C (8 limbs) short of the end of each operand.
152 leal (%ebx,%eax,4), %ebx C src end - 32
155 leal (%ecx,%eax,4), %ecx C dst end - 32
C notl computes -x-1, turning size-8 into the negative counter -(size-7).
157 notl %eax C -(size-7)
163 Zdisp( movd, %mm2, 0,(%ecx,%eax,4)) C dst low limb
164 movq 4(%ebx,%eax,4), %mm2 C new src low qword
167 movq 12(%ebx,%eax,4), %mm0 C src second lowest qword
168 nop C avoid bad cache line crossing
171 C This loop is the important bit, the rest is just support for it.
172 C Four src limbs are held at the start, and four more will be read.
173 C Four dst limbs will be written. This schedule seems necessary for
176 C The use of -(size-7) lets the loop stop when %eax becomes >= 0 and
177 C leaves 0 to 3 which can be tested with test $1 and $2.
180 C eax counter, -(size-7) step by +4 until >=0
C The two qword stores below (from %mm2 and %mm1) are the four dst limbs
C written per pass; the two qword loads into %mm0 are the four src limbs read.
198 movq 4(%ebx,%eax,4), %mm0
201 movq %mm2, -12(%ecx,%eax,4)
207 movq 12(%ebx,%eax,4), %mm0
209 movq %mm1, -4(%ecx,%eax,4)
210 ja L(top) C jump if no carry and not zero
214 C Now have the four limbs in mm2 (low) and mm0 (high), and %eax is 0
215 C to 3 representing respectively 3 to 0 further limbs.
C Bit 1 of %eax is clear for %eax = 0 or 1, which by the mapping above
C means three or two further limbs remain.
217 testl $2, %eax C testl to avoid bad cache line crossings
220 C Two or three extra limbs: rshift mm2, OR it with lshifted mm0, mm0
221 C becomes new mm2 and a new mm0 is loaded.
230 movq 12(%ebx,%eax,4), %mm0
232 movq %mm2, -4(%ecx,%eax,4)
246 movq %mm2, 4(%ecx,%eax,4)
250 C one further extra limb to process
C %ebx is src end - 32, so 32-4(%ebx) is the last src limb; movd loads
C just that one limb.
252 movd 32-4(%ebx), %mm0 C src[size-1], most significant limb
C %ecx is dst end - 32, so these two stores fill the top three dst limbs.
261 movq %mm1, 32-12(%ecx) C dst[size-3,size-2]
262 movd %mm2, 32-4(%ecx) C dst[size-1]
C The return value is held in %edx; move it to %eax for the caller.
264 movl %edx, %eax C retval
270 nop C avoid bad cache line crossing
272 C no further extra limbs
C One qword store writes the top two dst limbs.
274 movq %mm1, 32-8(%ecx) C dst[size-2,size-1]
275 movl %edx, %eax C retval