1 dnl AMD K7 mpn_rshift -- mpn right shift.
3 dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or
8 dnl modify it under the terms of the GNU Lesser General Public License as
9 dnl published by the Free Software Foundation; either version 3 of the
10 dnl License, or (at your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful,
13 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 dnl Lesser General Public License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 include(`../config.m4')
23 C K7: 1.21 cycles/limb (at 16 limbs/loop).
27 dnl K7: UNROLL_COUNT cycles/limb
32 dnl Maximum possible with the current code is 64.
34 deflit(UNROLL_COUNT, 16) C limbs handled per pass of the unrolled loop; 16 is the setting the 1.21 c/l figure above refers to
37 C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size, unsigned shift);
40 C Shift src,size right by shift many bits and store the result in dst,size.
41 C Zeros are shifted in at the left. The bits shifted out at the right are the return value (in its most significant bits).
44 C This code uses 64-bit MMX operations, which makes it possible to handle
45 C two limbs at a time, for a theoretical 1.0 cycles/limb. Plain integer
46 C code, on the other hand, suffers from shrd being a vector path decode and
47 C running at 3 cycles back-to-back.
49 C Full speed depends on source and destination being aligned, and some hairy
50 C setups and finish-ups are done to arrange this for the loop.
53 deflit(UNROLL_THRESHOLD, 10) C used as UNROLL_THRESHOLD-1 in a compare against eax (size-1) at L(more_than_one_limb)
55 deflit(UNROLL_THRESHOLD, 10) C NOTE(review): second definition with the same value; presumably the two arms of a conditional whose other lines are not visible here - confirm
58 defframe(PARAM_SHIFT,16) C shift count; read into ecx and into mm6
59 defframe(PARAM_SIZE, 12) C limb count; the slot is written with size-1 on the unaligned-src path and read back before the unrolled loop
60 defframe(PARAM_SRC, 8) C source pointer - presumably loaded into edx by code not visible here
61 defframe(PARAM_DST, 4) C destination pointer; the slot is rewritten with the stepped dst on the unaligned-src path
63 defframe(SAVE_EDI, -4) C SAVE_* have negative offsets; presumably stack slots for callee-saved registers (the saves and restores are not visible here)
64 defframe(SAVE_ESI, -8)
65 defframe(SAVE_EBX, -12)
77 deflit(`FRAME',SAVE_SIZE) C SAVE_SIZE is defined outside this view; the visible epilogues release it with addl $SAVE_SIZE, %esp
79 movl PARAM_SHIFT, %ecx C ecx = shift count, so cl can drive the integer shift below
84 jnz L(more_than_one_limb) C NOTE(review): the flag-setting instruction is not visible here; the comment on the shrdl below implies a decrement of eax
86 movl (%edx), %edx C src limb
88 shrdl( %cl, %edx, %eax) C eax was decremented to zero, so it receives just the low cl bits of the src limb, placed in its top bits
92 movl %edx, (%edi) C dst limb
99 C -----------------------------------------------------------------------------
100 L(more_than_one_limb): C reached by the jnz above; eax holds size-1 here (see the address comments below)
109 movd PARAM_SHIFT, %mm6 C rshift
110 movd (%edx), %mm5 C src low limb
111 cmp $UNROLL_THRESHOLD-1, %eax C eax is size-1; NOTE(review): the branch consuming this compare is not visible here
114 leal (%edx,%eax,4), %edx C &src[size-1]
115 leal -4(%edi,%eax,4), %edi C &dst[size-2]
117 movd (%edx), %mm4 C src high limb
122 C eax loop counter, limbs, negative
135 movq (%edx,%eax,4), %mm0 C load 64 bits (two limbs) at the src position indexed by the negative counter
140 movd %mm0, (%edi,%eax,4) C store the low 32 bits of mm0 as one dst limb
148 movd %mm4, 4(%edi) C dst high limb
150 movd %mm5, %eax C return value
153 addl $SAVE_SIZE, %esp C release the SAVE_SIZE bytes of stack frame
159 C -----------------------------------------------------------------------------
178 jz L(start_src_aligned) C NOTE(review): the alignment test that sets the flags is not visible here
181 C src isn't aligned, process low limb separately (marked xxx) and
182 C step src and dst by one limb, making src aligned.
185 C --+-------+-------+-------+
187 C --+-------+-------+-------+
191 C --+-------+-------+
193 C --+-------+-------+
195 movq (%edx), %mm0 C src low two limbs
197 movl %eax, PARAM_SIZE C size-1
200 decl %eax C size-2 is new size-1
203 movl %edi, PARAM_DST C new dst
206 L(start_src_aligned):
209 movq (%edx), %mm1 C src low two limbs
210 decl %eax C size-2, two last limbs handled at end
214 jz L(start_dst_aligned) C NOTE(review): as above, the dst alignment test itself is not visible here
217 C dst isn't aligned, add 4 to make it so, and pretend the shift is
218 C 32 bits extra. Low limb of dst (marked xxx) handled here separately.
221 C --+-------+-------+
223 C --+-------+-------+
227 C --+-------+-------+-------+
229 C --+-------+-------+-------+
234 addl $32, %ecx C shift+32
238 addl $4, %edi C new dst
241 L(start_dst_aligned):
244 movq %mm1, %mm2 C copy of src low two limbs
246 andl $-2, %eax C round size down to even
252 andl $UNROLL_MASK, %eax C UNROLL_MASK is defined outside this view; presumably keeps the count modulo one unrolled pass - confirm
257 movd %ecx, %mm7 C lshift = 64-rshift
263 leal L(entry) (%eax,%eax,4), %esi C esi = L(entry) + 5*eax, the computed jump target
266 shrl $UNROLL_LOG2, %ebx C loop counter
268 leal ifelse(UNROLL_BYTES,256,128+) 8(%edx,%eax,2), %edx C the +128 bias is applied only when UNROLL_BYTES is 256, matching the -128 in the loop displacements
269 leal ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi C same conditional bias for dst
270 movl PARAM_SIZE, %eax C for use at end
277 C See mpn/x86/README about old gas bugs
278 leal (%eax,%eax,4), %esi C esi = 5*eax; NOTE(review): presumably an alternative (position-independent) form of the leal L(entry) above, on a path whose other lines are not visible here
279 addl $L(entry)-L(here), %esi C add the assemble-time distance from L(here) to L(entry)
287 C -----------------------------------------------------------------------------
290 C eax size, for use at end
294 C esi was computed jump
299 C mm1, mm2 carry (alternating between the two)
306 C The two chunks differ in whether mm1 or mm2 hold the carry.
307 C The computed jump puts the initial carry in both mm1 and mm2.
310 deflit(CHUNK_COUNT, 4) C limbs per generated chunk: two 8-byte movq load/store pairs at disp0 and disp0+8
311 forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
312 deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128))) C byte offset of chunk i, biased by -128 only when UNROLL_BYTES is 256
313 deflit(`disp1', eval(disp0 + 8)) C second qword of the chunk
315 Zdisp( movq, disp0,(%edx), %mm0) C load src qword at disp0; Zdisp is defined outside this view
322 Zdisp( movq, %mm0, disp0,(%edi)) C store dst qword at disp0
325 Zdisp( movq, disp1,(%edx), %mm0) C load src qword at disp1
332 Zdisp( movq, %mm0, disp1,(%edi)) C store dst qword at disp1
335 addl $UNROLL_BYTES, %edx C advance src by one unrolled pass
336 addl $UNROLL_BYTES, %edi C advance dst by one unrolled pass
342 deflit(`disp0', ifelse(UNROLL_BYTES,256,-128)) C same conditional bias as in the loop; expands to nothing unless UNROLL_BYTES is 256
343 deflit(`disp1', eval(disp0-0 + 8)) C the -0 keeps the expression valid when disp0 expands to nothing
346 psrlq %mm6, %mm2 C wanted rshifted in all cases below
349 movd %mm5, %eax C return value
355 C Size odd, destination was aligned.
359 C +-------+---------------+--
361 C +-------+---------------+--
364 C +-------+---------------+---------------+--
366 C +-------+---------------+---------------+--
369 C mm7 = ecx = 64-shift
372 C Size odd, destination was unaligned.
376 C +-------+---------------+--
378 C +-------+---------------+--
381 C +---------------+---------------+--
383 C +---------------+---------------+--
386 C mm7 = ecx = 64-(shift+32)
389 C In both cases there's one extra limb of src to fetch and combine
390 C with mm2 to make a qword to store, and in the aligned case there's
391 C a further extra limb of dst to be formed.
394 movd disp0(%edx), %mm0 C fetch the one extra src limb
403 movq %mm0, disp0(%edi) C store the combined qword
404 jz L(finish_odd_unaligned) C NOTE(review): the flag-setting test is not visible here; per the text above the unaligned case skips the further dst limb
406 movd %mm1, disp1(%edi) C aligned case only: the further extra dst limb
407 L(finish_odd_unaligned):
410 addl $SAVE_SIZE, %esp C release the SAVE_SIZE bytes of stack frame
418 C Size even, destination was aligned.
421 C +---------------+--
423 C +---------------+--
426 C +---------------+---------------+--
428 C +---------------+---------------+--
431 C mm7 = ecx = 64-shift
434 C Size even, destination was unaligned.
437 C +---------------+--
439 C +---------------+--
442 C +-------+---------------+--
444 C +-------+---------------+--
447 C mm7 = 64-(shift+32)
450 C The movd for the unaligned case is the same data as the movq for
451 C the aligned case, it's just a choice between whether one or two
452 C limbs should be written.
456 movd %mm2, disp0(%edi) C write one limb (sufficient for the unaligned case)
458 jz L(end_even_unaligned) C NOTE(review): the flag-setting test is not visible here
460 movq %mm2, disp0(%edi) C aligned case: write two limbs from the same register, covering the limb just stored
461 L(end_even_unaligned):
464 addl $SAVE_SIZE, %esp C release the SAVE_SIZE bytes of stack frame