1 dnl Intel P5 mpn_sqr_basecase -- square an mpn number.
3 dnl Copyright 1999, 2000, 2001, 2002 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or
8 dnl modify it under the terms of the GNU Lesser General Public License as
9 dnl published by the Free Software Foundation; either version 3 of the
10 dnl License, or (at your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful,
13 dnl but WITHOUT ANY WARRANTY; without even the implied warranty of
14 dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 dnl Lesser General Public License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 include(`../config.m4')
23 C P5: approx 8 cycles per crossproduct, or 15.5 cycles per triangular
24 C product at around 20x20 limbs.
27 C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
29 C Calculate src,size squared, storing the result in dst,2*size.
31 C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
32 C lot of function call overheads are avoided, especially when the size is
35 defframe(PARAM_SIZE,12)
36 defframe(PARAM_SRC, 8)
37 defframe(PARAM_DST, 4)
41 PROLOGUE(mpn_sqr_basecase)
55 C -----------------------------------------------------------------------------
69 C -----------------------------------------------------------------------------
88 movl %eax, (%ecx) C dst[0]
89 movl %edx, %esi C dst[1]
95 movl %eax, %edi C dst[2]
96 movl %edx, %ebp C dst[3]
100 mull 4(%ebx) C src[0]*src[1]
125 C -----------------------------------------------------------------------------
141 C -----------------------------------------------------------------------------
151 mull %eax C src[0] ^ 2
159 mull %eax C src[1] ^ 2
165 pushl %esi C risk of cache bank clash
167 mull %eax C src[2] ^ 2
174 mull 4(%ebx) C src[0] * src[1]
181 mull 8(%ebx) C src[0] * src[2]
189 mull 8(%ebx) C src[1] * src[2]
195 C ebx zero, will be dst[5]
236 adcl %ebx, %eax C no carry out of this
244 C -----------------------------------------------------------------------------
255 C First multiply src[0]*src[1..size-1] and store at dst[1..size].
266 leal (%ecx,%edx,4), %edi C dst end of this mul1
268 leal (%ebx,%edx,4), %esi C src end
269 movl %ebx, %ebp C src
272 xorl %ebx, %ebx C clear carry limb and carry flag
274 leal 1(%edx), %ecx C -(size-1)
279 C ecx counter, negative
286 movl (%esi,%ecx,4), %eax
292 movl %ebx, (%edi,%ecx,4)
299 C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for
302 C The last two products, which are the end corner of the product
303 C triangle, are handled separately to save looping overhead. These
304 C are src[size-3]*src[size-2,size-1] and src[size-2]*src[size-1].
305 C If size is 4 then it's only these that need to be done.
307 C In the outer loop %esi is a constant, and %edi just advances by 1
308 C limb each time. The size of the operation decreases by 1 limb
312 C ebx carry (needing carry flag added)
320 movl PARAM_SIZE, %edx
330 C ebx previous carry limb to store
331 C edx outer loop counter (negative)
333 C edi dst, pointing at stored carry limb of previous loop
335 pushl %edx C new outer loop counter
342 xorl %ebx, %ebx C initial carry limb, clear carry flag
346 C ebx carry (needing carry flag added)
347 C ecx counter, negative
350 C edi dst end of this addmul
354 movl (%esi,%ecx,4), %eax
359 movl (%edi,%ecx,4), %ebx
364 movl %ebx, (%edi,%ecx,4)
372 popl %edx C outer loop counter
385 movl -4(%edi), %ebx C risk of data cache bank clash here
387 mull -12(%esi) C src[size-2]*src[size-3]
395 mull -12(%esi) C src[size-1]*src[size-3]
409 mull -8(%esi) C src[size-1]*src[size-2]
415 movl PARAM_SIZE, %eax
420 addl $1, %eax C -(size-1) and clear carry
424 C -----------------------------------------------------------------------------
425 C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
428 C eax counter, negative
436 movl 12(%edi,%eax,8), %ebx
439 movl 16(%edi,%eax,8), %ecx
442 movl %ebx, 12(%edi,%eax,8)
444 movl %ecx, 16(%edi,%eax,8)
450 adcl %eax, %eax C high bit out
453 movl PARAM_SIZE, %ecx C risk of cache bank clash
454 movl %eax, 12(%edi) C dst most significant limb
457 C -----------------------------------------------------------------------------
458 C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ...,
459 C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the
460 C low limb of src[0]^2.
462 movl (%esi), %eax C src[0]
463 leal (%esi,%ecx,4), %esi C src end
469 movl %eax, 16(%edi,%ecx,8) C dst[0]
472 addl $1, %ecx C size-1 and clear carry
475 C eax scratch (low product)
477 C ecx counter, negative
478 C edx scratch (high product)
481 C ebp scratch (fetched dst limbs)
483 movl (%esi,%ecx,4), %eax
488 movl 16-4(%edi,%ecx,8), %ebp
491 movl 16(%edi,%ecx,8), %ebp
494 movl %ebx, 16-4(%edi,%ecx,8)
496 movl %ebp, 16(%edi,%ecx,8)
504 movl 16-4(%edi), %eax C dst most significant limb
509 movl %edx, 16-4(%edi)
510 popl %esi C risk of cache bank clash