1 dnl AMD K7 mpn_lshift -- mpn left shift.
3 dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or
8 dnl modify it under the terms of the GNU Lesser General Public License as
9 dnl published by the Free Software Foundation; either version 3 of the
10 dnl License, or (at your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful,
13 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 dnl Lesser General Public License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 include(`../config.m4')

C NOTE(review): this chunk appears to be a fragmentary extract of the GMP
C K7 mpn_lshift implementation.  Every line carries a fused decimal prefix
C that looks like an original-file line number, and the gaps in those
C numbers show that many lines are missing from this view (PROLOGUE and
C EPILOGUE, the flag-setting instructions before the conditional jumps,
C the psllq/psrlq shift cores, the loop branches, emms/ret).  Code lines
C below are left byte-for-byte untouched; only comments are added.
C Confirm anything stated here against the pristine file.

23 C K7: 1.21 cycles/limb (at 16 limbs/loop).

C Limbs processed per iteration of the unrolled loop (upper bound 64,
C per the note below).
27 dnl K7: UNROLL_COUNT cycles/limb
32 dnl Maximum possible with the current code is 64.
34 deflit(UNROLL_COUNT, 16)

C C-level contract (the prototype's continuation line carrying the
C shift parameter is not visible in this extract).
37 C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
40 C Shift src,size left by shift many bits and store the result in dst,size.
41 C Zeros are shifted in at the right. The bits shifted out at the left are
44 C The comments in mpn_rshift apply here too.

C Sizes below this threshold take the simple loop, larger sizes the
C unrolled loop.  The two identical deflits are presumably the PIC and
C non-PIC arms of an ifdef whose wrapper lines are missing from this
C extract -- TODO confirm against the full file.
47 deflit(UNROLL_THRESHOLD, 10)
49 deflit(UNROLL_THRESHOLD, 10)

C Stack-frame offsets of the four cdecl parameters (32-bit x86: return
C address at 0(%esp) on entry, arguments above it).
52 defframe(PARAM_SHIFT,16)
53 defframe(PARAM_SIZE, 12)
54 defframe(PARAM_SRC, 8)
55 defframe(PARAM_DST, 4)
C Negative offsets for the callee-saved registers pushed by the
C prologue (the PROLOGUE itself is not visible in this extract).
57 defframe(SAVE_EDI, -4)
58 defframe(SAVE_ESI, -8)
59 defframe(SAVE_EBX, -12)
71 deflit(`FRAME',SAVE_SIZE)

73 movl PARAM_SHIFT, %ecx
C Single-limb fast path falls through when this branch is not taken
C (the instruction establishing the flags is not visible here).
78 jnz L(more_than_one_limb)
C shldl shifts %eax left by %cl while filling from %edx; with %eax
C already zero this leaves exactly the bits shifted out of the one
C src limb, i.e. the function's return value.
82 shldl( %cl, %edx, %eax) C eax was decremented to zero

93 C -----------------------------------------------------------------------------
94 L(more_than_one_limb):
C mm6 carries the left-shift count for the MMX psllq work below.
103 movd PARAM_SHIFT, %mm6
104 movd (%edx,%eax,4), %mm5 C src high limb
C Selects simple loop vs unrolled loop (the conditional branch that
C consumes these flags is not visible in this extract).
105 cmp $UNROLL_THRESHOLD-1, %eax
109 movd (%edx), %mm4 C src low limb

C Simple loop: each iteration loads a qword straddling two adjacent
C src limbs and stores the high 32 bits of the shifted result (the
C psllq between this load and store is not visible here).
116 C eax loop counter, limbs
130 movq -4(%edx,%eax,4), %mm0
135 movd %mm0, 4(%edi,%eax,4)

C Simple-loop finish: store the shifted low limb, return the bits
C shifted out of the high limb, unwind the register-save area.
143 movd %mm4, (%edi) C dst low limb
145 movd %mm5, %eax C return value
148 addl $SAVE_SIZE, %esp

154 C -----------------------------------------------------------------------------
C Unrolled-loop setup.
165 C mm5 src high limb, for return value
170 leal -4(%edx,%eax,4), %edx C &src[size-2]
173 movq (%edx), %mm1 C src high qword
C Taken when the src high qword is already 8-byte aligned (the
C alignment test itself is not visible here).
175 jz L(start_src_aligned)

178 C src isn't aligned, process high limb (marked xxx) separately to
181 C source -4(edx,%eax,4)
183 C +-------+-------+-------+--
185 C +-------+-------+-------+--
188 C dest -4(edi,%eax,4)
190 C +-------+-------+--
192 C +-------+-------+--

196 movl %eax, PARAM_SIZE C size-1
199 decl %eax C size-2 is new size-1
C Store the separately-handled high limb (the shift producing the
C value in mm1 is not visible in this extract).
201 movd %mm1, 4(%edi,%eax,4)
202 movq (%edx), %mm1 C new src high qword
203 L(start_src_aligned):

206 leal -4(%edi,%eax,4), %edi C &dst[size-2]
C Moves the return-value bits into mm5's low word (the preceding
C shift applied to mm5 is not visible in this extract).
210 psrlq $32, %mm5 C return value
212 jz L(start_dst_aligned)

215 C dst isn't aligned, subtract 4 bytes to make it so, and pretend the
216 C shift is 32 bits extra. High limb of dst (marked xxx) handled
220 C +-------+-------+--
222 C +-------+-------+--
226 C +-------+-------+-------+--
228 C +-------+-------+-------+--

233 addl $32, %ecx C shift+32
241 movd %ecx, %mm6 C new lshift
242 L(start_dst_aligned):

244 decl %eax C size-2, two last limbs handled at end
245 movq %mm1, %mm2 C copy of src high qword
248 andl $-2, %eax C round size down to even

C Computed jump into the unrolled loop: esi = L(entry) + 5*eax,
C implying a fixed encoded size per skipped limb of entry code.
254 andl $UNROLL_MASK, %eax
259 movd %ecx, %mm7 C rshift = 64-lshift
265 leal L(entry) (%eax,%eax,4), %esi
267 shrl $UNROLL_LOG2, %ebx C loop counter
C When UNROLL_BYTES is 256 the pointers get a 128-byte bias,
C apparently so the loop displacements fit in signed bytes.
269 leal ifelse(UNROLL_BYTES,256,128) -8(%edx,%eax,2), %edx
270 leal ifelse(UNROLL_BYTES,256,128) (%edi,%eax,2), %edi
271 movl PARAM_SIZE, %eax C for use at end

C Alternate (presumably PIC) computation of the same entry address,
C split into two steps to dodge old gas relocation bugs.
277 C See mpn/x86/README about old gas bugs
278 leal (%eax,%eax,4), %esi
279 addl $L(entry)-L(here), %esi

286 C -----------------------------------------------------------------------------
C Unrolled loop.  Partial register-role table (rest not visible):
289 C eax size (for use at end)
298 C mm1 \ carry (alternating, mm2 first)
305 C The two chunks differ in whether mm1 or mm2 hold the carry.
306 C The computed jump puts the initial carry in both mm1 and mm2.

C The loop body is generated by m4: UNROLL_COUNT/CHUNK_COUNT copies
C of a CHUNK_COUNT-limb chunk.  Zdisp apparently forces an explicit
C zero-displacement byte so every chunk encodes to the same size,
C keeping the computed-jump arithmetic above valid -- see the mpn/x86
C support macros.
309 deflit(CHUNK_COUNT, 4)
310 forloop(i, 0, UNROLL_COUNT/CHUNK_COUNT-1, `
311 deflit(`disp0', eval(-i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
312 deflit(`disp1', eval(disp0 - 8))

314 Zdisp( movq, disp0,(%edx), %mm0)
321 Zdisp( movq, %mm0, disp0,(%edi))
324 Zdisp( movq, disp1,(%edx), %mm0)
331 Zdisp( movq, %mm0, disp1,(%edi))

C Step both pointers down one full unrolled block per iteration
C (the loop-closing branch is not visible in this extract).
334 subl $UNROLL_BYTES, %edx
335 subl $UNROLL_BYTES, %edi

C disp(n) applies the same conditional 128-byte bias as above, and
C expands to nothing when the result is zero.
342 define(`disp', `m4_empty_if_zero(eval($1 ifelse(UNROLL_BYTES,256,-128)))')

347 psllq %mm6, %mm2 C wanted left shifted in all cases below

C Finishing code: one variant per (size parity x dst alignment).
357 C Size odd, destination was aligned.
360 C --+---------------+-------+
362 C --+---------------+-------+
365 C --+---------------+---------------+-------+
367 C --+---------------+---------------+-------+
370 C mm7 = ecx = 64-shift

373 C Size odd, destination was unaligned.
376 C --+---------------+-------+
378 C --+---------------+-------+
381 C --+---------------+---------------+
383 C --+---------------+---------------+
386 C mm7 = ecx = 64-(shift+32)

389 C In both cases there's one extra limb of src to fetch and combine
390 C with mm2 to make a qword at (%edi), and in the aligned case
391 C there's an extra limb of dst to be formed from that extra src limb

394 movd disp(4) (%edx), %mm0
405 movq %mm0, disp(0) (%edi)
C Aligned case stores one extra low limb; unaligned case skips it.
406 jz L(end_odd_unaligned)
407 movd %mm1, disp(-4) (%edi)
408 L(end_odd_unaligned):

C Unwind the register-save area (the pops/emms/ret that should follow
C are not visible in this extract).
411 addl $SAVE_SIZE, %esp

419 C Size even, destination was aligned.
422 C --+---------------+
424 C --+---------------+
427 C --+---------------+---------------+
429 C --+---------------+---------------+
432 C mm7 = ecx = 64-shift

435 C Size even, destination was unaligned.
438 C --+---------------+
440 C --+---------------+
443 C --+---------------+-------+
445 C --+---------------+-------+
448 C mm7 = ecx = 64-(shift+32)

451 C The movq for the aligned case overwrites the movd for the
458 movd %mm2, disp(4) (%edi)
460 jz L(end_even_unaligned)
461 movq %mm0, disp(0) (%edi)
462 L(end_even_unaligned):

465 addl $SAVE_SIZE, %esp