1 dnl x86 generic mpn_sqr_basecase -- square an mpn number.
3 dnl Copyright 1999, 2000, 2002, 2003 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of the GNU Lesser General Public License as published
9 dnl by the Free Software Foundation; either version 3 of the License, or (at
10 dnl your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful, but
13 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15 dnl License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
21 include(`../config.m4')
24 C cycles/crossproduct cycles/triangleproduct
32 C void mpn_sqr_basecase (mp_ptr dst, mp_srcptr src, mp_size_t size);
34 C The algorithm is basically the same as mpn/generic/sqr_basecase.c, but a
35 C lot of function call overheads are avoided, especially when the size is
38 C The mul1 loop is not unrolled like mul_1.asm, it doesn't seem worth the
39 C code size to do so here.
43 C The addmul loop here is also not unrolled like aorsmul_1.asm and
44 C mul_basecase.asm are. Perhaps it should be done. It'd add to the
45 C complexity, but if it's worth doing in the other places then it should be
48 C A fully-unrolled style like other sqr_basecase.asm versions (k6, k7, p6)
49 C might be worth considering. That'd add quite a bit to the code size, but
50 C only as much as is used would be dragged into L1 cache.
52 defframe(PARAM_SIZE,12)
53 defframe(PARAM_SRC, 8)
54 defframe(PARAM_DST, 4)
58 PROLOGUE(mpn_sqr_basecase)
72 C -----------------------------------------------------------------------------
86 C -----------------------------------------------------------------------------
105 movl %edx, %esi C dst[1]
106 movl %eax, (%ecx) C dst[0]
111 movl %eax, %edi C dst[2]
112 movl %edx, %ebp C dst[3]
115 mull 4(%ebx) C src[0]*src[1]
141 C -----------------------------------------------------------------------------
150 pushl %ebx FRAME_pushl()
151 pushl %edi FRAME_pushl()
153 pushl %esi FRAME_pushl()
154 pushl %ebp FRAME_pushl()
156 leal (%ecx,%edx,4), %edi C &dst[size], end of this mul1
157 leal (%eax,%edx,4), %esi C &src[size]
159 C First multiply src[0]*src[1..size-1] and store at dst[1..size].
161 movl (%eax), %ebp C src[0], multiplier
165 xorl %ebx, %ebx C clear carry limb
167 incl %ecx C -(size-1)
172 C ecx counter, limbs, negative
178 movl (%esi,%ecx,4), %eax
182 movl %ebx, (%edi,%ecx,4)
190 C Add products src[n]*src[n+1..size-1] at dst[2*n-1...], for
193 C The last products src[size-2]*src[size-1], which is the end corner
194 C of the product triangle, is handled separately at the end to save
195 C looping overhead. If size is 3 then it's only this that needs to
198 C In the outer loop %esi is a constant, and %edi just advances by 1
199 C limb each time. The size of the operation decreases by 1 limb
203 C ebx carry (needing carry flag added)
210 movl PARAM_SIZE, %ecx
216 dnl re-use parameter space
217 define(VAR_OUTER,`PARAM_DST')
223 C edx outer loop counter, -(size-3) to -1
225 C edi dst, pointing at stored carry limb of previous loop
229 addl $4, %edi C advance dst end
231 movl -8(%esi,%ecx,4), %ebp C next multiplier
234 xorl %ebx, %ebx C initial carry limb
238 C ebx carry (needing carry flag added)
239 C ecx counter, -n-1 to -1
242 C edi dst end of this addmul
245 movl (%esi,%ecx,4), %eax
249 addl %eax, (%edi,%ecx,4)
267 mull -8(%esi) C src[size-1]*src[size-2]
270 movl %edx, 4(%edi) C dst high limb
273 C -----------------------------------------------------------------------------
274 C Left shift of dst[1..2*size-2], high bit shifted out becomes dst[2*size-1].
276 movl PARAM_SIZE, %eax
278 addl $1, %eax C -(size-1) and clear carry
281 C eax counter, negative
295 adcl %eax, %eax C high bit out
296 movl %eax, 8(%edi) C dst most significant limb
299 C Now add in the squares on the diagonal, namely src[0]^2, src[1]^2, ...,
300 C src[size-1]^2. dst[0] hasn't yet been set at all yet, and just gets the
301 C low limb of src[0]^2.
304 movl (%esi), %eax C src[0]
307 movl PARAM_SIZE, %ecx
308 leal (%esi,%ecx,4), %esi C src end
311 movl %edx, %ebx C initial carry
313 movl %eax, 12(%edi,%ecx,8) C dst[0]
314 incl %ecx C -(size-1)
317 C eax scratch (low product)
319 C ecx counter, -(size-1) to -1
320 C edx scratch (high product)
323 C ebp scratch (fetched dst limbs)
325 movl (%esi,%ecx,4), %eax
328 addl %ebx, 8(%edi,%ecx,8)
331 adcl %eax, 12(%edi,%ecx,8)
338 addl %ebx, 8(%edi) C dst most significant limb