1 dnl Intel Pentium MMX mpn_mul_1 -- mpn by limb multiplication.
3 dnl Copyright 2000, 2001, 2002 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or
8 dnl modify it under the terms of the GNU Lesser General Public License as
9 dnl published by the Free Software Foundation; either version 3 of the
10 dnl License, or (at your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful,
13 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 dnl Lesser General Public License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 include(`../config.m4')
24 C P5: 12.0 for 32-bit multiplier
25 C 7.0 for 16-bit multiplier
28 C mp_limb_t mpn_mul_1 (mp_ptr dst, mp_srcptr src, mp_size_t size,
29 C mp_limb_t multiplier);
31 C When the multiplier is 16 bits some special case MMX code is used. Small
32 C multipliers might arise reasonably often from mpz_mul_ui etc. If the size
33 C is odd there's roughly a 5 cycle penalty, so times for say size==7 and
34 C size==8 end up being quite close. If src isn't aligned to an 8 byte
35 C boundary then one limb is processed separately with roughly a 5 cycle
36 C penalty, so in that case it's say size==8 and size==9 which are close.
40 C MMX is not believed to be of any use for 32-bit multipliers, since for
41 C instance the current method would just have to be more or less duplicated
42 C for the high and low halves of the multiplier, and would probably
43 C therefore run at about 14 cycles, which is slower than the plain integer
46 C Adding the high and low MMX products using integer code seems best. An
47 C attempt at using paddd and carry bit propagation with pcmpgtd didn't give
48 C any joy. Perhaps something could be done keeping the values signed and
49 C thereby avoiding adjustments to make pcmpgtd into an unsigned compare, or
54 C An mpn_mul_1c entrypoint would need a double carry out of the low result
55 C limb in the 16-bit code, unless it could be assumed the carry fits in 16
56 C bits, possibly as carry<multiplier, this being true of a big calculation
57 C done piece by piece. But let's worry about that if/when mul_1c is
60 defframe(PARAM_MULTIPLIER,16)
61 defframe(PARAM_SIZE, 12)
62 defframe(PARAM_SRC, 8)
63 defframe(PARAM_DST, 4)
79 movl PARAM_MULTIPLIER, %eax
99 pushl %esi FRAME_pushl()
100 pushl %edi FRAME_pushl()
102 movl %edx, %esi C src
105 movl PARAM_MULTIPLIER, %eax
106 pushl %ebx FRAME_pushl()
108 leal (%esi,%ecx,4), %esi C src end
109 leal (%edi,%ecx,4), %edi C dst end
113 pushl %ebp FRAME_pushl()
120 xorl %ebx, %ebx C carry limb
123 jnc L(top) C with carry flag clear
126 C size was odd, process one limb separately
128 mull 4(%esi,%ecx,8) C m * src[0]
130 movl %eax, 4(%edi,%ecx,8)
133 orl %edx, %ebx C carry limb, and clear carry flag
139 C ecx counter, negative
143 C ebp (scratch carry)
146 movl (%esi,%ecx,8), %eax
148 mull PARAM_MULTIPLIER
154 movl 4(%esi,%ecx,8), %eax
156 mull PARAM_MULTIPLIER
158 movl %ebx, (%edi,%ecx,8)
161 movl %eax, 4(%edi,%ecx,8)
181 C Special case for 16-bit multiplier.
191 C size<3 not supported here. At size==3 we're already a couple of
192 C cycles faster, so there's no threshold as such, just use the MMX
193 C as soon as possible.
199 pxor %mm6, %mm6 C initial carry word
201 punpcklwd %mm7, %mm7 C m replicated 2 times
202 addl $2, %ecx C -size+2
204 punpckldq %mm7, %mm7 C m replicated 4 times
205 andl $4, %edx C test alignment, clear carry flag
211 C Source is unaligned, process one limb separately.
213 C Plain integer code is used here, since it's smaller and is about
214 C the same 13 cycles as an mmx block would be.
216 C An "addl $1,%ecx" doesn't clear the carry flag when size==3, hence
217 C the use of separate incl and orl.
219 mull -8(%esi,%ecx,4) C m * src[0]
221 movl %eax, -8(%edi,%ecx,4) C dst[0]
222 incl %ecx C one limb processed
224 movd %edx, %mm6 C initial carry
226 orl %eax, %eax C clear carry flag
230 C The scheduling here is quite tricky, since so many instructions have
231 C pairing restrictions. In particular the js won't pair with a movd, and
232 C can't be paired with an adc since it wants flags from the inc, so
233 C instructions are rotated to the top of the loop to find somewhere useful
236 C Trouble has been taken to avoid overlapping successive loop iterations,
237 C since that would greatly increase the size of the startup and finishup
238 C code. Actually there's probably not much advantage to be had from
239 C overlapping anyway, since the difficulties are mostly with pairing, not
240 C with latencies as such.
242 C In the comments x represents the src data and m the multiplier (16
243 C bits, but replicated 4 times).
245 C The m signs calculated in %mm3 are a loop invariant and could be held in
246 C say %mm5, but that would save only one instruction and hence be no faster.
249 C eax l.low, then l.high
251 C ecx counter, -size+2 to 0 or 1
257 C %mm0 (high products)
258 C %mm1 (low products)
259 C %mm2 (adjust for m using x signs)
260 C %mm3 (adjust for x using m signs)
263 C %mm6 h.low, then carry
264 C %mm7 m replicated 4 times
266 movd %mm6, %ebx C h.low
267 psrlq $32, %mm1 C l.high
269 movd %mm0, %edx C h.high
270 movq %mm0, %mm6 C new c
275 movd %mm1, %eax C l.high
279 movl %ebx, -16(%edi,%ecx,4)
281 movl %edx, -12(%edi,%ecx,4)
285 pmulhw -8(%esi,%ecx,4), %mm0 C h = (x*m).high
288 pmullw -8(%esi,%ecx,4), %mm1 C l = (x*m).low
291 movq -8(%esi,%ecx,4), %mm2 C x
292 psraw $15, %mm3 C m signs
294 pand -8(%esi,%ecx,4), %mm3 C x selected by m signs
295 psraw $15, %mm2 C x signs
297 paddw %mm3, %mm0 C add x to h if m neg
298 pand %mm7, %mm2 C m selected by x signs
300 paddw %mm2, %mm0 C add m to h if x neg
303 movd %mm1, %eax C l.low
304 punpcklwd %mm0, %mm6 C c + h.low << 16
306 psrlq $16, %mm0 C h.high
312 movd %mm6, %ebx C h.low
313 psrlq $32, %mm1 C l.high
316 popl %ebp FRAME_popl()
318 movd %mm0, %edx C h.high
319 psrlq $32, %mm0 C l.high
321 movd %mm1, %eax C l.high
324 movl %ebx, -12(%edi,%ecx,4)
329 movl %edx, -8(%edi,%ecx,4)
332 jnz L(small_done) C final %ecx==1 means even, ==0 odd
335 C Size odd, one extra limb to process.
336 C Plain integer code is used here, since it's smaller and is about
337 C the same speed as another mmx block would be.
340 movl PARAM_MULTIPLIER, %eax