1 dnl Intel P6 mpn_sqr_basecase -- square an mpn number.
3 dnl Copyright 1999, 2000, 2002 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or
8 dnl modify it under the terms of the GNU Lesser General Public License as
9 dnl published by the Free Software Foundation; either version 3 of the
10 dnl License, or (at your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful,
13 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 dnl Lesser General Public License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 include(`../config.m4')
23 C P6: approx 4.0 cycles per cross product, or 7.75 cycles per triangular
24 C product (measured on the speed difference between 20 and 40 limbs,
25 C which is the Karatsuba recursing range).
28 dnl These are the same as in mpn/x86/k6/sqr_basecase.asm, see that file for
29 dnl a description. The only difference here is that UNROLL_COUNT can go up
30 dnl to 64 (not 63) making SQR_TOOM2_THRESHOLD_MAX 67.
32 deflit(SQR_TOOM2_THRESHOLD_MAX, 67)
34 ifdef(`SQR_TOOM2_THRESHOLD_OVERRIDE',
35 `define(`SQR_TOOM2_THRESHOLD',SQR_TOOM2_THRESHOLD_OVERRIDE)')
37 m4_config_gmp_mparam(`SQR_TOOM2_THRESHOLD')
38 deflit(UNROLL_COUNT, eval(SQR_TOOM2_THRESHOLD-3))
41 C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
43 C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
44 C lot of function call overheads are avoided, especially when the given size is small.
47 C The code size might look a bit excessive, but not all of it is executed so
48 C it won't all get into the code cache. The 1x1, 2x2 and 3x3 special cases
49 C clearly apply only to those sizes; mid sizes like 10x10 only need part of
50 C the unrolled addmul; and big sizes like 40x40 that do use the full
51 C unrolling will at least be making good use of it, because 40x40 will take
52 C something like 7000 cycles.
54 defframe(PARAM_SIZE,12)
55 defframe(PARAM_SRC, 8)
56 defframe(PARAM_DST, 4)
60 PROLOGUE(mpn_sqr_basecase)
75 C -----------------------------------------------------------------------------
90 C -----------------------------------------------------------------------------
97 defframe(SAVE_ESI, -4)
98 defframe(SAVE_EBX, -8)
99 defframe(SAVE_EDI, -12)
100 defframe(SAVE_EBP, -16)
101 deflit(`STACK_SPACE',16)
103 subl $STACK_SPACE, %esp
104 deflit(`FRAME',STACK_SPACE)
112 movl %eax, (%ecx) C dst[0]
116 movl %edx, %ebx C dst[1]
121 movl %eax, %edi C dst[2]
125 movl %edx, %ebp C dst[3]
127 mull 4(%esi) C src[0]*src[1]
154 C -----------------------------------------------------------------------------
162 pushl %esi defframe_pushl(`SAVE_ESI')
169 C -----------------------------------------------------------------------------
180 pushl %ebp defframe_pushl(`SAVE_EBP')
181 pushl %edi defframe_pushl(`SAVE_EDI')
183 mull %eax C src[0] ^ 2
191 mull %eax C src[1] ^ 2
197 pushl %ebx defframe_pushl(`SAVE_EBX')
199 mull %eax C src[2] ^ 2
206 mull 4(%esi) C src[0] * src[1]
213 mull 8(%esi) C src[0] * src[2]
221 mull 8(%esi) C src[1] * src[2]
230 C esi zero, will be dst[5]
268 adcl %esi, %eax C no carry out of this
278 C -----------------------------------------------------------------------------
279 defframe(VAR_COUNTER,-20)
280 defframe(VAR_JMP, -24)
281 deflit(`STACK_SPACE',24)
291 deflit(`FRAME',4) dnl %esi already pushed
293 C First multiply src[0]*src[1..size-1] and store at dst[1..size].
295 subl $STACK_SPACE-FRAME, %esp
296 deflit(`FRAME',STACK_SPACE)
303 subl %edx, %ecx C -(size-1)
306 movl $0, %ebx C initial carry
308 leal (%esi,%edx,4), %esi C &src[size]
309 movl %eax, %ebp C multiplier
311 leal -4(%edi,%edx,4), %edi C &dst[size-1]
314 C This loop runs at just over 6 c/l.
319 C ecx counter, limbs, negative, -(size-1) to -1
333 movl %eax, 4(%edi,%ecx,4)
342 C Addmul src[n]*src[n+1..size-1] at dst[2*n-1...], for each n=1..size-2.
344 C The last two addmuls, which are the bottom right corner of the product
345 C triangle, are left to the end. These are src[size-3]*src[size-2,size-1]
346 C and src[size-2]*src[size-1]. If size is 4 then it's only these corner
347 C cases that need to be done.
349 C The unrolled code is the same as mpn_addmul_1(), see that routine for some
352 C VAR_COUNTER is the outer loop, running from -(size-4) to -1, inclusive.
354 C VAR_JMP is the computed jump into the unrolled code, stepped by one code
355 C chunk each outer loop.
357 dnl This is also hard-coded in the address calculation below.
358 deflit(CODE_BYTES_PER_LIMB, 15)
360 dnl With &src[size] and &dst[size-1] pointers, the displacements in the
361 dnl unrolled code fit in a byte for UNROLL_COUNT values up to 32, but above
362 dnl that an offset must be added to them.
364 ifelse(eval(UNROLL_COUNT>32),1,
365 eval((UNROLL_COUNT-32)*4),
376 movl PARAM_SIZE, %ecx
385 ifelse(OFFSET,0,,`subl $OFFSET, %esi')
391 leal L(unroll_inner_end)-eval(2*CODE_BYTES_PER_LIMB)(%ecx,%edx), %ecx
395 ifelse(OFFSET,0,,`subl $OFFSET, %edi')
397 C The calculated jump mustn't be before the start of the available
398 C code. This is the limit that UNROLL_COUNT puts on the src operand
399 C size, but checked here using the jump address directly.
402 `movl_text_address( L(unroll_inner_start), %eax)
406 C -----------------------------------------------------------------------------
410 C ebx high limb to store
412 C edx VAR_COUNTER, limbs, negative
413 C esi &src[size], constant
414 C edi dst ptr, second highest limb of last addmul
417 movl -12+OFFSET(%esi,%edx,4), %ebp C multiplier
418 movl %edx, VAR_COUNTER
420 movl -8+OFFSET(%esi,%edx,4), %eax C first limb of multiplicand
424 define(cmovX,`ifelse(eval(UNROLL_COUNT%2),1,`cmovz($@)',`cmovnz($@)')')
428 movl %edx, %ebx C high carry
431 movl %ecx, %edx C jump
433 movl %eax, %ecx C low carry
434 leal CODE_BYTES_PER_LIMB(%edx), %edx
436 cmovX( %ebx, %ecx) C high carry reverse
437 cmovX( %eax, %ebx) C low carry reverse
442 C Must be on an even address here so the low bit of the jump address
443 C will indicate which way around ecx/ebx should start.
447 L(unroll_inner_start):
456 C 15 code bytes each limb
457 C ecx/ebx reversed on each chunk
459 forloop(`i', UNROLL_COUNT, 1, `
460 deflit(`disp_src', eval(-i*4 + OFFSET))
461 deflit(`disp_dst', eval(disp_src))
463 m4_assert(`disp_src>=-128 && disp_src<128')
464 m4_assert(`disp_dst>=-128 && disp_dst<128')
467 Zdisp( movl, disp_src,(%esi), %eax)
469 Zdisp( addl, %ebx, disp_dst,(%edi))
474 dnl this one comes out last
475 Zdisp( movl, disp_src,(%esi), %eax)
477 Zdisp( addl, %ecx, disp_dst,(%edi))
485 addl %ebx, m4_empty_if_zero(OFFSET)(%edi)
487 movl VAR_COUNTER, %edx
490 movl %ecx, m4_empty_if_zero(OFFSET+4)(%edi)
494 jnz L(unroll_outer_top)
503 C -----------------------------------------------------------------------------
538 movl PARAM_SIZE, %ecx
549 C Left shift of dst[1..2*size-2], the bit shifted out becomes dst[2*size-1].
551 subl $1, %ecx C size-1
552 xorl %eax, %eax C ready for final adcl, and clear carry
561 C ecx counter, size-1 to 1
562 C edx size-1 (for later use)
563 C esi src (for later use)
564 C edi dst, incrementing
577 movl %eax, 4(%edi) C dst most significant limb
578 movl (%esi), %eax C src[0]
580 leal 4(%esi,%edx,4), %esi C &src[size]
581 subl %edx, %ecx C -(size-1)
584 C Now add in the squares on the diagonal, src[0]^2, src[1]^2, ...,
585 C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the
586 C low limb of src[0]^2.
591 movl %eax, (%edi,%ecx,8) C dst[0]
597 C ecx counter, negative
603 movl (%esi,%ecx,4), %eax
608 addl %ebx, 4(%edi,%ecx,8)
609 adcl %eax, 8(%edi,%ecx,8)
619 addl %edx, 4(%edi) C dst most significant limb
628 C -----------------------------------------------------------------------------
632 addl $L(unroll_inner_end)-L(here)-eval(2*CODE_BYTES_PER_LIMB), %ecx