dnl AMD K6 mpn_sqr_basecase -- square an mpn number.

dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.

dnl This file is part of the GNU MP Library.

dnl The GNU MP Library is free software; you can redistribute it and/or
dnl modify it under the terms of the GNU Lesser General Public License as
dnl published by the Free Software Foundation; either version 3 of the
dnl License, or (at your option) any later version.

dnl The GNU MP Library is distributed in the hope that it will be useful,
dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
dnl Lesser General Public License for more details.

dnl You should have received a copy of the GNU Lesser General Public License
dnl along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
include(`../config.m4')
C K6: approx 4.7 cycles per cross product, or 9.2 cycles per triangular
C     product (measured on the speed difference between 17 and 33 limbs,
C     which is roughly the Karatsuba recursing range).
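C
C     As a rough consistency check on those two figures: between 17 and
C     33 limbs a square gains 33*34/2 - 17*18/2 = 408 triangular products,
C     while the equivalent full multiply would gain 33^2 - 17^2 = 800
C     cross products, and 800/408 = 1.96 matches the ratio 9.2/4.7.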
dnl SQR_TOOM2_THRESHOLD_MAX is the maximum SQR_TOOM2_THRESHOLD this
dnl code supports.  This value is used only by the tune program to know
dnl what it can go up to.  (An attempt to compile with a bigger value will
dnl trigger some m4_assert()s in the code, making the build fail.)
dnl
dnl The value is determined by requiring the displacements in the unrolled
dnl addmul to fit in single bytes.  This means a maximum UNROLL_COUNT of
dnl 63, giving a maximum SQR_TOOM2_THRESHOLD of 66.

deflit(SQR_TOOM2_THRESHOLD_MAX, 66)
dnl Allow a value from the tune program to override config.m4.

ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE',
`define(`SQR_TOOM2_THRESHOLD',SQR_TOOM2_THRESHOLD_OVERRIDE)')
dnl UNROLL_COUNT is the number of code chunks in the unrolled addmul.  The
dnl number required is determined by SQR_TOOM2_THRESHOLD, since
dnl mpn_sqr_basecase only needs to handle sizes < SQR_TOOM2_THRESHOLD.
dnl
dnl The first addmul is the biggest, and this takes the second least
dnl significant limb and multiplies it by the third least significant and
dnl up.  Hence for a maximum operand size of SQR_TOOM2_THRESHOLD-1
dnl limbs, UNROLL_COUNT needs to be SQR_TOOM2_THRESHOLD-3.

m4_config_gmp_mparam(`SQR_TOOM2_THRESHOLD')
deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3))
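
dnl For example, at the maximum SQR_TOOM2_THRESHOLD of 66 the biggest
dnl operand is 66-1 = 65 limbs, whose first addmul multiplies src[1] by
dnl the 65-2 = 63 limbs src[2..64], hence the maximum UNROLL_COUNT of 63.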
C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
C
C The algorithm is essentially the same as mpn/generic/sqr_basecase.c, but a
C lot of function call overheads are avoided, especially when the given size
C is small.
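C
C For reference, here is a rough C rendering of those passes, simplified
C from mpn/generic/sqr_basecase.c.  mpn_mul_1, mpn_addmul_1 and mpn_lshift
C are the real mpn routines; add_diagonal_squares is a hypothetical name
C for the final pass, spelled out in the comments before the diagonal code
C further down.  Sizes 1 to 3 are special-cased below, so take size >= 2.
C
C	void
C	sqr_basecase_sketch (mp_ptr dst, mp_srcptr src, mp_size_t size)
C	{
C	  mp_size_t  i;
C
C	  /* cross products src[i]*src[j], i<j, accumulated at dst[i+j] */
C	  dst[size] = mpn_mul_1 (dst+1, src+1, size-1, src[0]);
C	  for (i = 1; i < size-1; i++)
C	    dst[size+i] = mpn_addmul_1 (dst+2*i+1, src+i+1, size-i-1, src[i]);
C
C	  /* double them; the bit shifted out becomes dst[2*size-1] */
C	  dst[2*size-1] = mpn_lshift (dst+1, dst+1, 2*size-2, 1);
C
C	  /* add src[i]^2 at dst[2*i] for each i */
C	  add_diagonal_squares (dst, src, size);
C	}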
C The code size might look a bit excessive, but not all of it is executed
C and so won't fill up the code cache.  The 1x1, 2x2 and 3x3 special cases
C clearly apply only to those sizes; mid sizes like 10x10 only need part of
C the unrolled addmul; and big sizes like 35x35 that do need all of it will
C at least be getting value for money, because 35x35 spends something like
C 5780 cycles here.
C
C Different values of UNROLL_COUNT give slightly different speeds, between
C 9.0 and 9.2 c/tri-prod measured on the difference between 17 and 33 limbs.
C This isn't a big difference, but it's presumably some alignment effect
C which, if understood, could give a simple speedup.
defframe(PARAM_SIZE,12)
defframe(PARAM_SRC, 8)
defframe(PARAM_DST, 4)
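
C These are the usual x86 cdecl stack offsets at function entry: the
C return address sits at 0(%esp), then dst, src and size at 4, 8 and 12.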
PROLOGUE(mpn_sqr_basecase)

C -----------------------------------------------------------------------------

C -----------------------------------------------------------------------------

	movl	%eax, %ebx	C src
	mull	%edx		C src[0]*src[1]

C -----------------------------------------------------------------------------

C -----------------------------------------------------------------------------
	movl	%eax, %ebx	C src
	movl	%edx, %ecx	C dst

	mull	%eax		C src[0] ^ 2
	mull	%eax		C src[1] ^ 2
	mull	%eax		C src[2] ^ 2

	mull	%edx		C src[0] * src[1]
	mull	%edx		C src[0] * src[2]
	mull	%edx		C src[1] * src[2]
C -----------------------------------------------------------------------------

defframe(SAVE_EBX,    -4)
defframe(SAVE_ESI,    -8)
defframe(SAVE_EDI,   -12)
defframe(SAVE_EBP,   -16)
defframe(VAR_COUNTER,-20)
defframe(VAR_JMP,    -24)
deflit(STACK_SPACE, 24)
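
C The negative defframe values are offsets relative to the frame size, so
C once the "subl $STACK_SPACE, %esp" below has set FRAME to 24, SAVE_EBX
C is 24-4 = 20(%esp) and VAR_JMP is 24-24 = 0(%esp).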
C First multiply src[0]*src[1..size-1] and store at dst[1..size].
C
C A test was done calling mpn_mul_1 here to get the benefit of its unrolled
C loop, but this was only a tiny speedup; at 35 limbs it took 24 cycles off
C a 5780 cycle operation, which is not surprising since the loop here is 8
C c/l and mpn_mul_1 is 6.25 c/l.
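C
C (The per-limb figures alone would predict roughly (8-6.25)*34 = 60
C cycles saved at 35 limbs, so call and setup overheads evidently give
C back more than half of that.)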
	subl	$STACK_SPACE, %esp	deflit(`FRAME',STACK_SPACE)

	movl	(%eax), %ebp		C multiplier
	leal	-1(%ecx), %ecx		C size-1, and pad to a 16 byte boundary
C Addmul src[n]*src[n+1..size-1] at dst[2*n+1...], for each n=1..size-2.
C
C The last two addmuls, which are the bottom right corner of the product
C triangle, are left to the end.  These are src[size-3]*src[size-2,size-1]
C and src[size-2]*src[size-1].  If size is 4 then it's only these corner
C cases that need to be done.
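C
C For example, at size=6 the unrolled code handles n=1 (src[1]*src[2..5])
C and n=2 (src[2]*src[3..5]), leaving src[3]*src[4..5] and src[4]*src[5]
C to the corner code.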
C The unrolled code is the same as in mpn_addmul_1(); see that routine for
C some comments.
C
C VAR_COUNTER is the outer loop counter, running from -(size-4) to -1,
C inclusive.
C
C VAR_JMP is the computed jump into the unrolled code, stepped by one code
C chunk each outer loop.
C
C K6 doesn't do any branch prediction on indirect jumps, which is good
C actually because it's a different target each time.  The unrolled addmul
C is about 3 cycles/limb faster than a simple loop, so the 6 cycle cost of
C the indirect jump is quickly recovered.
dnl This value is also implicitly encoded in a shift and add.

deflit(CODE_BYTES_PER_LIMB, 15)
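
dnl In GNU C terms the entry point arithmetic amounts to the following
dnl (a sketch using the computed-goto extension; `chunks', standing for
dnl the number of chunks to execute, is illustrative):
dnl
dnl	void *entry = &&unroll_inner_end - chunks * CODE_BYTES_PER_LIMB;
dnl	goto *entry;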
dnl With the unmodified &src[size] and &dst[size] pointers, the
dnl displacements in the unrolled code fit in a byte for UNROLL_COUNT
dnl values up to 31.  Above that an offset must be added to them.
deflit(OFFSET,
ifelse(eval(UNROLL_COUNT>31),1,
eval((UNROLL_COUNT-31)*4),
0))
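
dnl For instance UNROLL_COUNT 63 gives OFFSET (63-31)*4 = 128, so the
dnl furthest src displacement becomes -63*4+128 = -124 and the furthest
dnl dst displacement -128, both just within signed byte range.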
	movl	PARAM_SIZE, %ecx

ifelse(OFFSET,0,,
`	subl	$OFFSET, %ebx')

ifelse(OFFSET,0,,
`	subl	$OFFSET, %edi')
	leal	L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
C The calculated jump mustn't be before the start of the available
C code.  This is the limitation UNROLL_COUNT puts on the src operand
C size, but checked here using the jump address directly.

	movl_text_address( L(unroll_inner_start), %eax)
C -----------------------------------------------------------------------------

C ebx	&src[size], constant
C edx	VAR_COUNTER, limbs, negative
C esi	high limb to store
C edi	dst ptr, high of last addmul

	movl	-12+OFFSET(%ebx,%edx,4), %ebp	C multiplier
	movl	%edx, VAR_COUNTER
	movl	-8+OFFSET(%ebx,%edx,4), %eax	C first limb of multiplicand

	movl	%edx, %esi	C high carry
	movl	%ecx, %edx	C jump

	movl	%eax, %ecx	C low carry
	leal	CODE_BYTES_PER_LIMB(%edx), %edx
C A branch-free version of this using some xors was found to be a
C touch slower than just a conditional jump, despite the jump
C switching between taken and not taken on every loop.

ifelse(eval(UNROLL_COUNT%2),0,
	jz,jnz)	L(unroll_noswap)
	movl	%esi, %eax	C high,low carry other way around

C Must be on an even address here so the low bit of the jump address
C will indicate which way around ecx/esi should start.
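C
C (Each chunk is CODE_BYTES_PER_LIMB = 15 bytes, an odd count, so from an
C even starting address the successive entry points alternate between even
C and odd addresses, which is what lets the low bit encode the swap.)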
C An attempt was made at padding here to get the end of the unrolled
C code to come out on a good alignment, to save padding before
C L(corner).  This worked, but turned out to run slower than just an
C ALIGN(2).  The reason for this is not clear; it might be related
C to the different speeds on different UNROLL_COUNTs noted above.
L(unroll_inner_start):
	C 15 code bytes each limb
	C ecx/esi swapped on each chunk

forloop(`i', UNROLL_COUNT, 1, `
	deflit(`disp_src', eval(-i*4 + OFFSET))
	deflit(`disp_dst', eval(disp_src - 4))

	m4_assert(`disp_src>=-128 && disp_src<128')
	m4_assert(`disp_dst>=-128 && disp_dst<128')

Zdisp(	movl,	disp_src,(%ebx), %eax)
Zdisp(	addl,	%esi, disp_dst,(%edi))

	dnl  this one comes out last
Zdisp(	movl,	disp_src,(%ebx), %eax)
Zdisp(	addl,	%ecx, disp_dst,(%edi))
	addl	%esi, -4+OFFSET(%edi)
	movl	VAR_COUNTER, %edx

	movl	%ecx, m4_empty_if_zero(OFFSET)(%edi)

	jnz	L(unroll_outer_top)
C -----------------------------------------------------------------------------

C -----------------------------------------------------------------------------
C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1].
C The loop measures about 6 cycles/iteration, though it looks like it should
C decode in 5.
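C
C In mpn terms this pass is just the following (a sketch using the real
C mpn_lshift, which returns the bits shifted out):
C
C	dst[2*size-1] = mpn_lshift (dst+1, dst+1, 2*size-2, 1);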
	movl	PARAM_SIZE, %ecx
	subl	$1, %ecx		C size-1 and clear carry
	xorl	%eax, %eax		C ready for adcl

C ebx	src (for later use)
C ecx	counter, decrementing
C edx	size-1 (for later use)
C edi	dst, incrementing

	movl	%eax, 4(%edi)		C dst most significant limb
	movl	(%ebx), %eax		C src[0]
	leal	4(%ebx,%edx,4), %ebx	C &src[size]
	subl	%edx, %ecx		C -(size-1)
C -----------------------------------------------------------------------------
C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
C src[size-1]^2.  dst[0] hasn't been set at all yet, and just gets the
C low limb of src[0]^2.
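C
C In C terms this pass is roughly the following (the body of the
C hypothetical add_diagonal_squares from the sketch near the top of this
C file; umul_ppmm is GMP's double-limb multiply macro from longlong.h, and
C the explicit carry juggling is illustrative, not how the asm does it):
C
C	void
C	add_diagonal_squares (mp_ptr dst, mp_srcptr src, mp_size_t size)
C	{
C	  mp_limb_t  cy = 0, hi, lo;
C	  mp_size_t  i;
C
C	  dst[0] = 0;	/* the asm instead stores the low limb of
C			   src[0]^2 straight into dst[0] */
C	  for (i = 0; i < size; i++)
C	    {
C	      umul_ppmm (hi, lo, src[i], src[i]);	/* hi:lo = src[i]^2 */
C	      lo += cy;          hi += (lo < cy);
C	      lo += dst[2*i];    hi += (lo < dst[2*i]);	/* hi can't wrap */
C	      hi += dst[2*i+1];  cy = (hi < dst[2*i+1]);
C	      dst[2*i] = lo;
C	      dst[2*i+1] = hi;
C	    }
C	  /* cy ends up 0, since the full square fits in 2*size limbs */
C	}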
	movl	%eax, (%edi,%ecx,8)	C dst[0]

C ecx	counter, negative

	movl	(%ebx,%ecx,4), %eax
	addl	%esi, 4(%edi,%ecx,8)
	adcl	%eax, 8(%edi,%ecx,8)

	addl	%edx, 4(%edi)		C dst most significant limb
C -----------------------------------------------------------------------------

C See mpn/x86/README about old gas bugs.

	addl	$L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx