1 dnl Intel P5 mpn_rshift -- mpn right shift.
3 dnl Copyright 2000, 2002 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or
8 dnl modify it under the terms of the GNU Lesser General Public License as
9 dnl published by the Free Software Foundation; either version 3 of the
10 dnl License, or (at your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful,
13 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 dnl Lesser General Public License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 include(`../config.m4')
23 C P5: 1.75 cycles/limb.
26 C mp_limb_t mpn_rshift (mp_ptr dst, mp_srcptr src, mp_size_t size,
29 C Shift src,size right by shift many bits and store the result in dst,size.
30 C Zeros are shifted in at the left. Return the bits shifted out at the
33 C It takes 6 mmx instructions to process 2 limbs, making 1.5 cycles/limb,
34 C and with a 4 limb loop and 1 cycle of loop overhead the total is 1.75 c/l.
36 C Full speed depends on source and destination being aligned. Unaligned mmx
37 C loads and stores on P5 don't pair and have a 2 cycle penalty. Some hairy
38 C setups and finish-ups are done to ensure alignment for the loop.
40 C MMX shifts work out a bit faster even for the simple loop.
42 defframe(PARAM_SHIFT,16)
43 defframe(PARAM_SIZE, 12)
44 defframe(PARAM_SRC, 8)
45 defframe(PARAM_DST, 4)
48 dnl Minimum 5, because the unrolled loop can't handle less.
49 deflit(UNROLL_THRESHOLD, 5)
64 movl PARAM_SHIFT, %ecx
66 cmp $UNROLL_THRESHOLD, %eax
70 movl (%ebx), %edi C src low limb
74 shrdl( %cl, %edi, %eax) C eax was decremented to zero
78 movl %edi, (%edx) C dst low limb
79 popl %edi C risk of data cache bank clash
86 C -----------------------------------------------------------------------------
98 movd (%ebx), %mm5 C src[0]
99 leal (%ebx,%eax,4), %ebx C &src[size-1]
101 movd %ecx, %mm6 C rshift
102 leal -4(%edx,%eax,4), %edx C &dst[size-2]
108 C This loop is 5 or 8 cycles, with every second load unaligned and a wasted
109 C cycle waiting for the mm0 result to be ready. For comparison a shrdl is 4
110 C cycles and would be 8 in a simple loop. Using mmx helps the return value
111 C and last limb calculations too.
114 C eax counter, limbs, negative
123 movq (%ebx,%eax,4), %mm0
128 movd %mm0, (%edx,%eax,4)
133 psrlq %mm6, %mm5 C return value
148 C -----------------------------------------------------------------------------
160 movd (%ebx), %mm5 C src[0]
163 movd %ecx, %mm6 C rshift
167 jz L(start_src_aligned)
170 C src isn't aligned, process low limb separately (marked xxx) and
171 C step src and dst by one limb, making src aligned.
174 C --+-------+-------+-------+
176 C --+-------+-------+-------+
180 C --+-------+-------+
182 C --+-------+-------+
184 movq (%ebx), %mm0 C unaligned load
193 L(start_src_aligned):
199 psrlq %mm6, %mm5 C retval
200 jz L(start_dst_aligned)
202 C dst isn't aligned, add 4 to make it so, and pretend the shift is
203 C 32 bits extra. Low limb of dst (marked xxx) handled here
207 C --+-------+-------+
209 C --+-------+-------+
213 C --+-------+-------+-------+
215 C --+-------+-------+-------+
219 addl $32, %ecx C new shift
227 L(start_dst_aligned):
233 movq %mm3, %mm2 C mm2 src qword
239 leal -12(%ebx,%eax,4), %ebx
240 leal -20(%edx,%eax,4), %edx
243 subl $7, %eax C size-7
245 por %mm1, %mm3 C mm3 ready to store
246 negl %eax C -(size-7)
251 C This loop is the important bit, the rest is just support. Careful
252 C instruction scheduling achieves the claimed 1.75 c/l. The
253 C relevant parts of the pairing rules are:
255 C - mmx loads and stores execute only in the U pipe
256 C - only one mmx shift in a pair
257 C - wait one cycle before storing an mmx register result
258 C - the usual address generation interlock
260 C Two qword calculations are slightly interleaved. The instructions
261 C marked "C" belong to the second qword, and the "C prev" one is for
262 C the second qword from the previous iteration.
266 C eax counter, limbs, negative
275 C mm2 src qword from -8(%ebx,%eax,4)
276 C mm3 dst qword ready to store to -8(%edx,%eax,4)
282 movq (%ebx,%eax,4), %mm0
288 movq %mm3, -8(%edx,%eax,4) C prev
291 movq 8(%ebx,%eax,4), %mm3 C
294 movq %mm0, (%edx,%eax,4)
305 C eax 0 to 3 representing respectively 3 to 0 limbs remaining
311 movq (%ebx,%eax,4), %mm0
317 movq %mm3, -8(%edx,%eax,4) C prev
327 C eax 2 or 3 representing respectively 1 or 0 limbs remaining
329 C mm2 src prev qword, from -8(%ebx,%eax,4)
330 C mm3 dst qword, for -8(%edx,%eax,4)
335 movd %mm5, %eax C retval
339 C One extra limb, destination was aligned.
342 C +-------+---------------+--
344 C +-------+---------------+--
347 C +-------+---------------+---------------+--
349 C +-------+---------------+---------------+--
352 C mm7 = ecx = 64-shift
355 C One extra limb, destination was unaligned.
358 C +-------+---------------+--
360 C +-------+---------------+--
363 C +---------------+---------------+--
365 C +---------------+---------------+--
368 C mm7 = ecx = 64-(shift+32)
371 C In both cases there's one extra limb of src to fetch and combine
372 C with mm2 to make a qword at 8(%edx), and in the aligned case
373 C there's a further extra limb of dst to be formed.
389 jz L(finish_one_unaligned)
391 C dst was aligned, must store one extra limb
393 L(finish_one_unaligned):
404 C No extra limbs, destination was aligned.
407 C +---------------+--
409 C +---------------+--
412 C +---------------+---------------+--
414 C +---------------+---------------+--
417 C mm7 = ecx = 64-shift
420 C No extra limbs, destination was unaligned.
423 C +---------------+--
425 C +---------------+--
428 C +-------+---------------+--
430 C +-------+---------------+--
433 C mm7 = 64-(shift+32)
436 C The movd for the unaligned case is clearly the same data as the
437 C movq for the aligned case, it's just a choice between whether one
438 C or two limbs should be written.
448 jz L(finish_zero_unaligned)
451 L(finish_zero_unaligned):