1 dnl Intel P5 mpn_lshift -- mpn left shift.
3 dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or
8 dnl modify it under the terms of the GNU Lesser General Public License as
9 dnl published by the Free Software Foundation; either version 3 of the
10 dnl License, or (at your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful,
13 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 dnl Lesser General Public License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 include(`../config.m4')
23 C P5: 1.75 cycles/limb.
26 C mp_limb_t mpn_lshift (mp_ptr dst, mp_srcptr src, mp_size_t size, unsigned shift);
29 C Shift src,size left by shift many bits and store the result in dst,size.
30 C Zeros are shifted in at the right. Return the bits shifted out at the left.
33 C The comments in mpn_rshift apply here too.
C Names for the four arguments together with their frame offsets: dst,
C src, size and shift are at 4, 8, 12 and 16 respectively, 4 bytes apart.
C NOTE(review): defframe is not defined in this file, presumably it comes
C in via config.m4, so the base these offsets are relative to is not
C visible here. Confirm against its definition.
35 defframe(PARAM_SHIFT,16)
36 defframe(PARAM_SIZE, 12)
37 defframe(PARAM_SRC, 8)
38 defframe(PARAM_DST, 4)
41 dnl minimum 5, because the unrolled loop can't handle less
C This literal is used as the immediate operand of a cmp against eax
C further down. NOTE(review): presumably eax holds size at that point
C and smaller sizes avoid the unrolled loop; the branch itself is not
C visible here, confirm.
42 deflit(UNROLL_THRESHOLD, 5)
C ecx = shift parameter, so that cl can act as the count operand of the
C double shift below.
57 movl PARAM_SHIFT, %ecx
C Compare eax with the unroll threshold. The load of eax and the branch
C that consumes these flags are not visible in this excerpt.
C NOTE(review): presumably eax holds size here, confirm.
59 cmp $UNROLL_THRESHOLD, %eax
C edi = the 32-bit limb at index eax-1 relative to ebx.
62 movl -4(%ebx,%eax,4), %edi C src high limb
C With eax zero, the double shift leaves in eax only the bits pushed out
C of the top of edi. NOTE(review): presumably that is the return value;
C the instructions between here and the store are not visible.
67 shldl( %cl, %edi, %eax) C eax was decremented to zero
C Store edi to the limb addressed by edx.
71 movl %edi, (%edx) C dst low limb
72 popl %edi C risk of data cache bank clash
79 C -----------------------------------------------------------------------------
C mm5 = the 32-bit limb at (%ebx,%eax,4); movd clears the upper 32 bits
C of the MMX register.
90 movd (%ebx,%eax,4), %mm5 C src high limb
C mm6 = the value of ecx, available as an MMX shift count.
92 movd %ecx, %mm6 C lshift
C Move the upper dword of mm5 down into the low dword, clearing the top.
C NOTE(review): the shift that filled the upper dword is not visible in
C this excerpt.
99 psrlq $32, %mm5 C retval
103 C eax counter, limbs, negative
C 64-bit load covering the two limbs at indices eax-1 and eax relative
C to ebx.
115 movq -4(%ebx,%eax,4), %mm0
C Store the low dword of mm0 to the limb at 4(%edx,%eax,4).
122 movd %mm0, 4(%edx,%eax,4)
141 C -----------------------------------------------------------------------------
C mm5 = the limb at index eax-1 relative to ebx, upper 32 bits cleared.
153 movd -4(%ebx,%eax,4), %mm5 C src high limb
C edi = ebx + 4*eax. NOTE(review): presumably edi is then tested for
C 8-byte alignment to set the flags for the jz below; that test is not
C visible in this excerpt.
154 leal (%ebx,%eax,4), %edi
C mm6 = the value of ecx, available as an MMX shift count.
156 movd %ecx, %mm6 C lshift
160 jz L(start_src_aligned)
163 C src isn't aligned, process high limb separately (marked xxx) to make it so.
166 C source -8(ebx,%eax,4)
168 C +-------+-------+-------+--
170 C +-------+-------+-------+--
176 C +-------+-------+--
178 C +-------+-------+--
180 movq -8(%ebx,%eax,4), %mm0 C unaligned load
C Store the low dword of mm0 to the limb at (%edx,%eax,4).
189 movd %mm0, (%edx,%eax,4)
190 L(start_src_aligned):
192 movq -8(%ebx,%eax,4), %mm1 C src high qword
C edi = edx + 4*eax. NOTE(review): presumably tested for alignment to
C decide the jz to start_dst_aligned below; the test is not visible.
193 leal (%edx,%eax,4), %edi
C Move the upper dword of mm5 down into the low dword.
196 psrlq $32, %mm5 C return value
198 movq -16(%ebx,%eax,4), %mm3 C src second highest qword
199 jz L(start_dst_aligned)
201 C dst isn't aligned, subtract 4 to make it so, and pretend the shift
202 C is 32 bits extra. High limb of dst (marked xxx) handled here separately.
205 C source -8(ebx,%eax,4)
207 C +-------+-------+--
209 C +-------+-------+--
215 C +-------+-------+-------+--
217 C +-------+-------+-------+--
C ecx += 32, the extra 32 bits of shift described above.
221 addl $32, %ecx C new shift
228 C wasted cycle here waiting for %mm0
C Store the low dword of mm0 to the limb at -4(%edx,%eax,4).
230 movd %mm0, -4(%edx,%eax,4)
232 L(start_dst_aligned):
C NOTE(review): for the result to be 64-shift, ecx must hold the negated
C shift at this point; the negation is not visible in this excerpt.
238 addl $64, %ecx C 64-shift
C eax -= 8; the later accesses use positive displacements from eax.
242 subl $8, %eax C size-8
C Merge mm1 into mm3 with a bitwise OR.
246 por %mm1, %mm3 C mm3 ready to store
250 C The comments in mpn_rshift apply here too.
263 C mm2 src qword from 16(%ebx,%eax,4)
264 C mm3 dst qword ready to store to 24(%edx,%eax,4)
C The visible loads and stores walk down in address: src qwords are read
C at byte offsets 8 then 0, dst qwords are written at offsets 24 then 16.
C NOTE(review): the shift/combine instructions and the counter update
C between them are not visible in this excerpt.
270 movq 8(%ebx,%eax,4), %mm0
276 movq %mm3, 24(%edx,%eax,4) C prev
279 movq (%ebx,%eax,4), %mm3 C
282 movq %mm0, 16(%edx,%eax,4)
294 C eax -4 to -1 representing respectively 0 to 3 limbs remaining
C Same load/store pairing as above: fetch the src qword at offset 8 and
C write out the previously prepared qword in mm3.
300 movq 8(%ebx,%eax,4), %mm0
306 movq %mm3, 24(%edx,%eax,4) C prev
316 C eax -4 or -3 representing respectively 0 or 1 limbs remaining
318 C mm2 src prev qword, from 16(%ebx,%eax,4)
319 C mm3 dst qword, for 24(%edx,%eax,4)
C eax = low 32 bits of mm5.
322 movd %mm5, %eax C retval
328 C One extra src limb, destination was aligned.
331 C --+---------------+-------+
333 C --+---------------+-------+
335 C dest edx+12 edx+4 edx
336 C --+---------------+---------------+-------+
338 C --+---------------+---------------+-------+
341 C mm7 = ecx = 64-shift
344 C One extra src limb, destination was unaligned.
347 C --+---------------+-------+
349 C --+---------------+-------+
352 C --+---------------+---------------+
354 C --+---------------+---------------+
357 C mm7 = ecx = 64-(shift+32)
360 C In both cases there's one extra limb of src to fetch and combine
361 C with mm2 to make a qword at 4(%edx), and in the aligned case
362 C there's an extra limb of dst to be formed from that extra src limb
C NOTE(review): the test feeding this jz and the instructions it skips
C are not visible in this excerpt. Going by the label name the branch is
C presumably taken when the destination was unaligned, confirm.
384 jz L(finish_one_unaligned)
387 L(finish_one_unaligned):
396 C No extra src limbs, destination was aligned.
399 C --+---------------+
401 C --+---------------+
404 C --+---------------+---------------+
406 C --+---------------+---------------+
409 C mm7 = ecx = 64-shift
412 C No extra src limbs, destination was unaligned.
415 C --+---------------+
417 C --+---------------+
420 C --+---------------+-------+
422 C --+---------------+-------+
425 C mm7 = ecx = 64-(shift+32)
428 C The movd for the unaligned case writes the same data to 4(%edx)
429 C that the movq does for the aligned case.
C NOTE(review): the test feeding this jz and the instructions it skips
C are not visible in this excerpt. Going by the label name the branch is
C presumably taken when the destination was unaligned, confirm.
436 jz L(finish_zero_unaligned)
439 L(finish_zero_unaligned):
C eax = low 32 bits of mm5.
444 movd %mm5, %eax C retval