1 dnl Intel P6 mpn_mul_basecase -- multiply two mpn numbers.
3 dnl Copyright 1999, 2000, 2001, 2002, 2003 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or
8 dnl modify it under the terms of the GNU Lesser General Public License as
9 dnl published by the Free Software Foundation; either version 3 of the
10 dnl License, or (at your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful,
13 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 dnl Lesser General Public License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 include(`../config.m4')
23 C P6: approx 6.5 cycles per cross product (16 limbs/loop unrolling).
26 dnl P6 UNROLL_COUNT cycles/product (approx)
30 dnl Maximum possible with the current code is 32.
32 deflit(UNROLL_COUNT, 16)
35 C void mpn_mul_basecase (mp_ptr wp,
36 C mp_srcptr xp, mp_size_t xsize,
37 C mp_srcptr yp, mp_size_t ysize);
39 C This routine is essentially the same as mpn/generic/mul_basecase.c, but
40 C it's faster because it does most of the mpn_addmul_1() startup
41 C calculations only once.
44 deflit(UNROLL_THRESHOLD, 5)
46 deflit(UNROLL_THRESHOLD, 5)
49 defframe(PARAM_YSIZE,20)
50 defframe(PARAM_YP, 16)
51 defframe(PARAM_XSIZE,12)
58 PROLOGUE(mpn_mul_basecase)
61 movl PARAM_XSIZE, %ecx
67 movl (%eax), %eax C yp[0]
69 ja L(xsize_more_than_two)
70 je L(two_by_something)
73 C one limb by one limb
83 C -----------------------------------------------------------------------------
87 dnl re-use parameter space
88 define(SAVE_EBX, `PARAM_XSIZE')
89 define(SAVE_ESI, `PARAM_YSIZE')
93 movl %eax, %ecx C yp[0]
95 movl %esi, SAVE_ESI C save esi
99 movl (%edx), %eax C xp[0]
103 C two limbs by one limb
115 movl %edx, %esi C carry
133 C -----------------------------------------------------------------------------
145 dnl more parameter space re-use
146 define(SAVE_EDI, `PARAM_WP')
148 mull %ecx C xp[0] * yp[0]
151 movl %edx, %edi C carry, for wp[1]
156 mull %ecx C xp[1] * yp[0]
162 movl 4(%ecx), %ecx C yp[1]
165 movl 4(%esi), %eax C xp[1]
166 movl %edx, %edi C carry, for wp[2]
168 mull %ecx C xp[1] * yp[1]
171 movl (%esi), %eax C xp[0]
174 movl %edx, %esi C carry, for wp[3]
176 mull %ecx C xp[0] * yp[1]
195 C -----------------------------------------------------------------------------
197 L(xsize_more_than_two):
199 C The first limb of yp is processed with a simple mpn_mul_1 loop running at
200 C about 6.2 c/l. Unrolling this doesn't seem worthwhile since it's only run
201 C once (whereas the addmul_1 below is run ysize-1 many times). A call to
202 C mpn_mul_1 would be slowed down by the parameter pushing and popping etc,
203 C and doesn't seem likely to be worthwhile on the typical sizes reaching
204 C here from the Karatsuba code.
214 defframe(`SAVE_EBX', -4)
215 defframe(`SAVE_ESI', -8)
216 defframe(`SAVE_EDI', -12)
217 defframe(`SAVE_EBP', -16)
218 defframe(VAR_COUNTER, -20) dnl for use in the unroll case
219 defframe(VAR_ADJUST, -24)
220 defframe(VAR_JMP, -28)
221 defframe(VAR_SWAP, -32)
222 defframe(VAR_XP_LOW, -36)
223 deflit(STACK_SPACE, 36)
225 subl $STACK_SPACE, %esp
226 deflit(`FRAME',STACK_SPACE)
238 leal (%edx,%ecx,4), %esi C xp end
240 leal (%edi,%ecx,4), %edi C wp end of mul1
247 C ecx counter, negative
253 movl (%esi,%ecx,4), %eax
258 movl %eax, (%edi,%ecx,4)
266 movl PARAM_YSIZE, %edx
268 movl %ebx, (%edi) C final carry
269 movl PARAM_XSIZE, %ecx
272 jz L(done) C if ysize==1
274 cmpl $UNROLL_THRESHOLD, %ecx
279 C -----------------------------------------------------------------------------
280 C simple addmul looping
290 leal 4(%eax,%edx,4), %ebp C yp end
294 movl %edx, PARAM_YSIZE C -(ysize-1)
295 movl (%esi,%ecx,4), %eax C xp low limb
298 movl %ecx, PARAM_XSIZE C -(xsize-1)
299 xorl %ebx, %ebx C initial carry
302 movl (%ebp,%edx,4), %ebp C yp second lowest limb - multiplier
303 jmp L(simple_outer_entry)
307 C ebp ysize counter, negative
311 movl PARAM_XSIZE, %ecx C -(xsize-1)
312 xorl %ebx, %ebx C carry
314 movl %ebp, PARAM_YSIZE
315 addl $4, %edi C next position in wp
317 movl (%edx,%ebp,4), %ebp C yp limb - multiplier
319 movl -4(%esi,%ecx,4), %eax C xp low limb
322 L(simple_outer_entry):
327 C ecx loop counter (negative)
338 addl %ebx, (%edi,%ecx,4)
339 movl (%esi,%ecx,4), %eax
344 jnz L(simple_inner_top)
347 C separate code for last limb so outer loop counter handling can be
352 movl PARAM_YSIZE, %ebp
363 jnz L(simple_outer_top)
380 C -----------------------------------------------------------------------------
382 C The unrolled loop is the same as in mpn_addmul_1, see that code for some
385 C VAR_ADJUST is the negative of how many limbs the leals in the inner loop
386 C increment xp and wp. This is used to adjust xp and wp, and is rshifted to
387 C give an initial VAR_COUNTER at the top of the outer loop.
389 C VAR_COUNTER is for the unrolled loop, running from VAR_ADJUST/UNROLL_COUNT
390 C up to -1, inclusive.
392 C VAR_JMP is the computed jump into the unrolled loop.
394 C VAR_SWAP is 0 if xsize odd or 0xFFFFFFFF if xsize even, used to swap the
395 C initial ebx and ecx on entry to the unrolling.
397 C VAR_XP_LOW is the least significant limb of xp, which is needed at the
398 C start of the unrolled loop.
400 C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1,
403 C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
404 C added to give the location of the next limb of yp, which is the multiplier
405 C in the unrolled loop.
407 C The trick with the VAR_ADJUST value means it's only necessary to do one
408 C fetch in the outer loop to take care of xp, wp and the inner loop counter.
422 movl 4(%eax), %ebp C multiplier (yp second limb)
423 leal 4(%eax,%edx,4), %eax C yp adjust for ysize indexing
429 movl %edx, PARAM_YSIZE
430 leal UNROLL_COUNT-2(%ecx), %ebx C (xsize-1)+UNROLL_COUNT-1
433 movl (%esi), %eax C xp low limb
434 andl $-UNROLL_MASK-1, %ebx
435 negl %ecx C -(xsize-1)
438 andl $UNROLL_MASK, %ecx
440 movl %ebx, VAR_ADJUST
444 movl %eax, VAR_XP_LOW
445 sarl $UNROLL_LOG2, %ebx
448 C 15 code bytes per limb
453 leal L(unroll_inner_entry) (%ecx,%edx,1), %ecx
460 sarl $31, %edx C 0 or -1 as xsize odd or even
461 leal 4(%edi,%ecx,4), %edi C wp and xp, adjust for unrolling,
462 leal 4(%esi,%ecx,4), %esi C and start at second limb
465 jmp L(unroll_outer_entry)
470 C See mpn/x86/README about old gas bugs
471 leal (%ecx,%edx,1), %ecx
472 addl $L(unroll_inner_entry)-L(unroll_here), %ecx
478 C --------------------------------------------------------------------------
487 C ebp ysize counter, negative
489 movl VAR_ADJUST, %ebx
492 movl VAR_XP_LOW, %eax
493 movl %ebp, PARAM_YSIZE C store incremented ysize counter
495 leal eval(UNROLL_BYTES + 4) (%edi,%ebx,4), %edi
496 leal (%esi,%ebx,4), %esi
497 sarl $UNROLL_LOG2, %ebx
499 movl (%edx,%ebp,4), %ebp C yp next multiplier
501 L(unroll_outer_entry):
504 movl %ebx, VAR_COUNTER
505 movl %edx, %ebx C carry high
506 movl %eax, %ecx C carry low
513 xorl %eax, %ebx C carries other way for odd index
519 C -----------------------------------------------------------------------------
528 C ebp yp multiplier limb
530 C VAR_COUNTER loop counter, negative
534 addl $UNROLL_BYTES, %edi
536 L(unroll_inner_entry):
538 deflit(CHUNK_COUNT,2)
539 forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
540 deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
541 deflit(`disp1', eval(disp0 + 4))
543 Zdisp( movl, disp0,(%esi), %eax)
545 Zdisp( addl, %ecx, disp0,(%edi))
546 adcl %eax, %ebx C new carry low
548 adcl $0, %ecx C new carry high
550 movl disp1(%esi), %eax
552 addl %ebx, disp1(%edi)
553 adcl %eax, %ecx C new carry low
555 adcl $0, %ebx C new carry high
560 leal UNROLL_BYTES(%esi), %esi
561 jnz L(unroll_inner_top)
569 C edi wp, pointing at second last limb
572 deflit(`disp0', eval(UNROLL_BYTES ifelse(UNROLL_BYTES,256,-128)))
573 deflit(`disp1', eval(disp0 + 4))
575 movl PARAM_YSIZE, %ebp
576 addl %ecx, disp0(%edi) C carry low
581 movl %ebx, disp1(%edi) C carry high
582 jnz L(unroll_outer_top)