1 dnl AMD K7 mpn_mul_basecase -- multiply two mpn numbers.
3 dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or
8 dnl modify it under the terms of the GNU Lesser General Public License as
9 dnl published by the Free Software Foundation; either version 3 of the
10 dnl License, or (at your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful,
13 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 dnl Lesser General Public License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 include(`../config.m4')
23 C K7: approx 4.42 cycles per cross product at around 20x20 limbs (16
24 C limbs/loop unrolling).
28 dnl K7 UNROLL_COUNT cycles/product (at around 20x20)
32 dnl Maximum possible with the current code is 32.
34 dnl At 32 the typical 13-26 limb sizes from the karatsuba code will get
35 dnl done with a straight run through a block of code, no inner loop. Using
36 dnl 32 gives 1k of code, but the k7 has a 64k L1 code cache.
38 deflit(UNROLL_COUNT, 32)
41 C void mpn_mul_basecase (mp_ptr wp,
42 C mp_srcptr xp, mp_size_t xsize,
43 C mp_srcptr yp, mp_size_t ysize);
45 C Calculate xp,xsize multiplied by yp,ysize, storing the result in wp,xsize+ysize.
48 C This routine is essentially the same as mpn/generic/mul_basecase.c, but
49 C it's faster because it does most of the mpn_addmul_1() startup
50 C calculations only once. The saving is 15-25% on typical sizes coming from
51 C the Karatsuba multiply code.
54 deflit(UNROLL_THRESHOLD, 5)
56 deflit(UNROLL_THRESHOLD, 5)
59 defframe(PARAM_YSIZE,20)
60 defframe(PARAM_YP, 16)
61 defframe(PARAM_XSIZE,12)
67 PROLOGUE(mpn_mul_basecase)
70 movl PARAM_XSIZE, %ecx
74 movl (%eax), %eax C yp low limb
77 ja L(xsize_more_than_two)
78 je L(two_by_something)
81 C one limb by one limb
91 C -----------------------------------------------------------------------------
95 pushl %ebx defframe_pushl(`SAVE_EBX')
96 movl %eax, %ecx C yp low limb
99 pushl %esi defframe_pushl(`SAVE_ESI')
102 movl (%edx), %eax C xp low limb
106 C two limbs by one limb
112 movl %edx, %esi C carry
131 C -----------------------------------------------------------------------------
132 C Could load yp earlier into another register.
144 dnl FRAME carries on from previous
146 mull %ecx C xp[0] * yp[0]
148 push %edi defframe_pushl(`SAVE_EDI')
149 movl %edx, %edi C carry, for wp[1]
154 mull %ecx C xp[1] * yp[0]
160 movl 4(%ecx), %ecx C yp[1]
163 movl 4(%esi), %eax C xp[1]
164 movl %edx, %edi C carry, for wp[2]
166 mull %ecx C xp[1] * yp[1]
171 movl (%esi), %eax C xp[0]
173 movl %edx, %esi C carry, for wp[3]
175 mull %ecx C xp[0] * yp[1]
192 C -----------------------------------------------------------------------------
194 L(xsize_more_than_two):
196 C The first limb of yp is processed with a simple mpn_mul_1 style loop
197 C inline. Unrolling this doesn't seem worthwhile since it's only run once
198 C (whereas the addmul below is run ysize-1 many times). A call to the
199 C actual mpn_mul_1 will be slowed down by the call and parameter pushing and
200 C popping, and doesn't seem likely to be worthwhile on the typical 13-26
201 C limb operations the Karatsuba code calls here with.
211 dnl FRAME doesn't carry on from previous, no pushes yet here
212 defframe(`SAVE_EBX',-4)
213 defframe(`SAVE_ESI',-8)
214 defframe(`SAVE_EDI',-12)
215 defframe(`SAVE_EBP',-16)
230 leal (%edx,%ecx,4), %esi C xp end
232 leal (%edi,%ecx,4), %edi C wp end of mul1
239 C ecx counter, negative
245 movl (%esi,%ecx,4), %eax
250 movl %eax, (%edi,%ecx,4)
258 movl PARAM_YSIZE, %edx
259 movl PARAM_XSIZE, %ecx
261 movl %ebx, (%edi) C final carry
264 jnz L(ysize_more_than_one)
277 L(ysize_more_than_one):
278 cmpl $UNROLL_THRESHOLD, %ecx
284 C -----------------------------------------------------------------------------
285 C simple addmul looping
295 leal 4(%eax,%edx,4), %ebp C yp end
299 movl (%esi,%ecx,4), %eax C xp low limb
300 movl %edx, PARAM_YSIZE C -(ysize-1)
303 xorl %ebx, %ebx C initial carry
304 movl %ecx, PARAM_XSIZE C -(xsize-1)
307 movl (%ebp,%edx,4), %ebp C yp second lowest limb - multiplier
308 jmp L(simple_outer_entry)
311 C this is offset 0x121 so close enough to aligned
313 C ebp ysize counter, negative
316 movl PARAM_XSIZE, %ecx C -(xsize-1)
317 xorl %ebx, %ebx C carry
319 movl %ebp, PARAM_YSIZE
320 addl $4, %edi C next position in wp
322 movl (%edx,%ebp,4), %ebp C yp limb - multiplier
323 movl -4(%esi,%ecx,4), %eax C xp low limb
326 L(simple_outer_entry):
331 C ecx loop counter (negative)
342 addl %ebx, (%edi,%ecx,4)
343 movl (%esi,%ecx,4), %eax
353 movl PARAM_YSIZE, %ebp
363 jnz L(simple_outer_top)
377 C -----------------------------------------------------------------------------
379 C The unrolled loop is the same as in mpn_addmul_1(); see that code for some comments.
382 C VAR_ADJUST is the negative of how many limbs the leals in the inner loop
383 C increment xp and wp. This is used to adjust back xp and wp, and rshifted
384 C to give an initial VAR_COUNTER at the top of the outer loop.
386 C VAR_COUNTER is for the unrolled loop, running from VAR_ADJUST/UNROLL_COUNT
387 C up to -1, inclusive.
389 C VAR_JMP is the computed jump into the unrolled loop.
391 C VAR_XP_LOW is the least significant limb of xp, which is needed at the
392 C start of the unrolled loop.
394 C PARAM_YSIZE is the outer loop counter, going from -(ysize-1) up to -1, inclusive.
397 C PARAM_YP is offset appropriately so that the PARAM_YSIZE counter can be
398 C added to give the location of the next limb of yp, which is the multiplier
399 C in the unrolled loop.
401 C The trick with VAR_ADJUST means it's only necessary to do one fetch in the
402 C outer loop to take care of xp, wp and the inner loop counter.
404 defframe(VAR_COUNTER, -20)
405 defframe(VAR_ADJUST, -24)
406 defframe(VAR_JMP, -28)
407 defframe(VAR_XP_LOW, -32)
408 deflit(VAR_EXTRA_SPACE, 16)
421 movl 4(%eax), %ebp C multiplier (yp second limb)
422 leal 4(%eax,%edx,4), %eax C yp adjust for ysize indexing
428 movl %edx, PARAM_YSIZE
429 leal UNROLL_COUNT-2(%ecx), %ebx C (xsize-1)+UNROLL_COUNT-1
432 movl (%esi), %eax C xp low limb
433 andl $-UNROLL_MASK-1, %ebx
436 subl $VAR_EXTRA_SPACE, %esp
437 deflit(`FRAME',16+VAR_EXTRA_SPACE)
439 andl $UNROLL_MASK, %ecx
441 movl %ebx, VAR_ADJUST
445 sarl $UNROLL_LOG2, %ebx
447 C 17 code bytes per limb
452 leal L(unroll_entry) (%ecx,%edx,1), %ecx
456 movl %eax, VAR_XP_LOW
458 leal 4(%edi,%edx,4), %edi C wp and xp, adjust for unrolling,
459 leal 4(%esi,%edx,4), %esi C and start at second limb
460 jmp L(unroll_outer_entry)
465 C See mpn/x86/README about old gas bugs
466 leal (%ecx,%edx,1), %ecx
467 addl $L(unroll_entry)-L(unroll_here), %ecx
473 C --------------------------------------------------------------------------
476 C ebp ysize counter, negative
478 movl VAR_ADJUST, %ebx
481 movl VAR_XP_LOW, %eax
482 movl %ebp, PARAM_YSIZE C store incremented ysize counter
484 leal 4(%edi,%ebx,4), %edi
485 leal (%esi,%ebx,4), %esi
486 sarl $UNROLL_LOG2, %ebx
488 movl (%edx,%ebp,4), %ebp C yp next multiplier
491 L(unroll_outer_entry):
494 testb $1, %cl C and clear carry bit
495 movl %ebx, VAR_COUNTER
499 cmovz( %eax, %ecx) C eax into low carry, zero into high carry limb
502 C Extra fetch of VAR_JMP is bad, but registers are tight
506 C -----------------------------------------------------------------------------
515 C ebp yp multiplier limb
517 C VAR_COUNTER loop counter, negative
523 deflit(CHUNK_COUNT,2)
524 forloop(`i', 0, UNROLL_COUNT/CHUNK_COUNT-1, `
525 deflit(`disp0', eval(i*CHUNK_COUNT*4 ifelse(UNROLL_BYTES,256,-128)))
526 deflit(`disp1', eval(disp0 + 4))
528 Zdisp( movl, disp0,(%esi), %eax)
533 Zdisp( addl, %ecx, disp0,(%edi))
539 movl disp1(%esi), %eax
544 addl %ebx, disp1(%edi)
552 leal UNROLL_BYTES(%esi), %esi
553 leal UNROLL_BYTES(%edi), %edi
563 C edi wp, pointing at second last limb
566 C carry flag to be added to high
568 deflit(`disp0', ifelse(UNROLL_BYTES,256,-128))
569 deflit(`disp1', eval(disp0-0 + 4))
571 movl PARAM_YSIZE, %ebp
573 addl %ecx, disp0(%edi)
578 movl %edx, disp1(%edi)
579 jnz L(unroll_outer_top)