Tizen 2.1 base
[external/gmp.git] / mpn / x86 / pentium4 / sse2 / sqr_basecase.asm
1 dnl  mpn_sqr_basecase for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
2
3 dnl  Copyright 2001, 2002, 2007 Free Software Foundation, Inc.
4 dnl
5 dnl  This file is part of the GNU MP Library.
6 dnl
7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl  it under the terms of the GNU Lesser General Public License as published
9 dnl  by the Free Software Foundation; either version 3 of the License, or (at
10 dnl  your option) any later version.
11 dnl
12 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
13 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
15 dnl  License for more details.
16 dnl
17 dnl  You should have received a copy of the GNU Lesser General Public License
18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20 include(`../config.m4')
21
22 C TODO:
23 C  * Improve ad-hoc outer loop code and register handling.  Some feed-in
24 C    scheduling could improve things by several cycles per outer iteration.
25 C  * In the Lam3...Lam1 code, keep accumulation operands in registers, without
26 C    storing intermediates to rp.
27 C  * We might want to keep 32 in a free mm register, since the register form is
28 C    3 bytes and the immediate form is 4 bytes.  About 80 bytes to save.
29 C  * Look into different loop alignment; we now expand the code by about 50
30 C    bytes with possibly needless alignment.
31 C  * Use OSP, should solve feed-in latency problems.
32 C  * Address relative slowness for un<=3 for Pentium M.  The old code is
33 C    considerably faster there.  (1:20/14, 2:34/32, 3:66/57)
34
35 C INPUT PARAMETERS
36 C rp            sp + 4
37 C up            sp + 8
38 C un            sp + 12
39
40         TEXT
41         ALIGN(16)
42 PROLOGUE(mpn_sqr_basecase)
43         mov     4(%esp), %edx           C rp
44         mov     8(%esp), %eax           C up
45         mov     12(%esp), %ecx          C un
46
47         cmp     $2, %ecx
48         jc      L(un1)
49         jz      L(un2)
50         cmp     $4, %ecx
51         jc      L(un3)
52         jz      L(un4)
53         jmp     L(big)
54
55 L(un1): mov     (%eax), %eax
56         mov     %edx, %ecx
57         mul     %eax
58         mov     %eax, (%ecx)
59         mov     %edx, 4(%ecx)
60         ret
61 L(un2): movd    (%eax), %mm0            C                               un=2
62         movd    (%eax), %mm2            C                               un=2
63         movd    4(%eax), %mm1           C                               un=2
64         pmuludq %mm0, %mm0              C 64b weight 0                  un=2
65         pmuludq %mm1, %mm2              C 64b weight 32                 un=2
66         pmuludq %mm1, %mm1              C 64b weight 64                 un=2
67         movd    %mm0, (%edx)            C                               un=2
68         psrlq   $32, %mm0               C 32b weight 32                 un=2
69         pcmpeqd %mm7, %mm7              C                               un=2
70         psrlq   $33, %mm7               C 0x000000007FFFFFFF            un=2
71         pand    %mm2, %mm7              C 31b weight 32                 un=2
72         psrlq   $31, %mm2               C 33b weight 65                 un=2
73         psllq   $1, %mm7                C 31b weight 33                 un=2
74         paddq   %mm7, %mm0              C                               un=2
75         movd    %mm0, 4(%edx)           C                               un=2
76         psrlq   $32, %mm0               C                               un=2
77         paddq   %mm2, %mm1              C                               un=2
78         paddq   %mm0, %mm1              C                               un=2
79         movd    %mm1, 8(%edx)           C                               un=2
80         psrlq   $32, %mm1               C                               un=2
81         movd    %mm1, 12(%edx)          C                               un=2
82         emms
83         ret
84 L(un3): movd    (%eax), %mm7            C                               un=3
85         movd    4(%eax), %mm6           C                               un=3
86         pmuludq %mm7, %mm6              C                               un=3
87         movd    8(%eax), %mm2           C                               un=3
88         pmuludq %mm7, %mm2              C                               un=3
89         movd    %mm6, 4(%edx)           C                               un=3
90         psrlq   $32, %mm6               C                               un=3
91         paddq   %mm2, %mm6              C                               un=3
92         movd    %mm6, 8(%edx)           C                               un=3
93         psrlq   $32, %mm6               C                               un=3
94         movd    %mm6, 12(%edx)          C                               un=3
95         lea     4(%edx), %edx           C                               un=3
96         lea     4(%eax), %eax           C                               un=3
97         jmp     L(am1)
98 L(un4): movd    (%eax), %mm7            C                               un=4
99         movd    4(%eax), %mm6           C                               un=4
100         pmuludq %mm7, %mm6              C                               un=4
101         movd    8(%eax), %mm0           C                               un=4
102         pmuludq %mm7, %mm0              C                               un=4
103         movd    12(%eax), %mm1          C                               un=4
104         pmuludq %mm7, %mm1              C                               un=4
105         movd    %mm6, 4(%edx)           C                               un=4
106         psrlq   $32, %mm6               C                               un=4
107         paddq   %mm0, %mm6              C                               un=4
108         movd    %mm6, 8(%edx)           C                               un=4
109         psrlq   $32, %mm6               C                               un=4
110         paddq   %mm1, %mm6              C                               un=4
111         movd    %mm6, 12(%edx)          C                               un=4
112         psrlq   $32, %mm6               C                               un=4
113         movd    %mm6, 16(%edx)          C                               un=4
114         lea     4(%edx), %edx           C                               un=4
115         lea     4(%eax), %eax           C                               un=4
116         jmp     L(am2)
117
118 L(big): push    %esi
119         push    %ebx
120         push    %edi
121         pxor    %mm6, %mm6
122         movd    (%eax), %mm7            C
123         lea     4(%eax), %esi           C init up, up++
124         lea     4(%eax), %eax           C up2++  FIXME: should fix offsets
125         lea     4(%edx), %edi           C init rp, rp++
126         lea     4(%edx), %edx           C rp2++
127         lea     -4(%ecx), %ebx          C loop count
128         and     $3, %ecx
129         jz      L(3m)
130         cmp     $2, %ecx
131         ja      L(2m)
132         jb      L(0m)
133
134 L(1m):
135         movd    (%eax), %mm4            C                               m 1
136         lea     (%ebx), %ecx            C inner loop count              m 1
137         pmuludq %mm7, %mm4              C                               m 1
138         movd    4(%eax), %mm3           C                               m 1
139         pmuludq %mm7, %mm3              C                               m 1
140         movd    8(%eax), %mm0           C                               m 1
141         jmp     L(m01)                  C                               m 1
142         ALIGN(16)                       C                               m 1
143 L(lpm1):
144         pmuludq %mm7, %mm4              C                               m 1
145         paddq   %mm0, %mm6              C                               m 1
146         movd    4(%eax), %mm3           C                               m 1
147         movd    %mm6, -8(%edx)          C                               m 1
148         psrlq   $32, %mm6               C                               m 1
149         pmuludq %mm7, %mm3              C                               m 1
150         paddq   %mm1, %mm6              C                               m 1
151         movd    8(%eax), %mm0           C                               m 1
152         movd    %mm6, -4(%edx)          C                               m 1
153         psrlq   $32, %mm6               C                               m 1
154 L(m01): pmuludq %mm7, %mm0              C                               m 1
155         paddq   %mm4, %mm6              C                               m 1
156         movd    12(%eax), %mm1          C                               m 1
157         movd    %mm6, (%edx)            C                               m 1
158         psrlq   $32, %mm6               C                               m 1
159         pmuludq %mm7, %mm1              C                               m 1
160         paddq   %mm3, %mm6              C                               m 1
161         movd    16(%eax), %mm4          C                               m 1
162         movd    %mm6, 4(%edx)           C                               m 1
163         psrlq   $32, %mm6               C                               m 1
164         lea     16(%eax), %eax          C                               m 1
165         lea     16(%edx), %edx          C                               m 1
166         sub     $4, %ecx                C                               m 1
167         ja      L(lpm1)                 C                               m 1
168         pmuludq %mm7, %mm4              C                               m 1
169         paddq   %mm0, %mm6              C                               m 1
170         movd    %mm6, -8(%edx)          C                               m 1
171         psrlq   $32, %mm6               C                               m 1
172         paddq   %mm1, %mm6              C                               m 1
173         jmp     L(0)
174
175 L(2m):
176         movd    (%eax), %mm1            C                               m 2
177         lea     (%ebx), %ecx            C inner loop count              m 2
178         pmuludq %mm7, %mm1              C                               m 2
179         movd    4(%eax), %mm4           C                               m 2
180         pmuludq %mm7, %mm4              C                               m 2
181         movd    8(%eax), %mm3           C                               m 2
182         jmp     L(m10)                  C                               m 2
183         ALIGN(16)                       C                               m 2
184 L(lpm2):
185         pmuludq %mm7, %mm4              C                               m 2
186         paddq   %mm0, %mm6              C                               m 2
187         movd    8(%eax), %mm3           C                               m 2
188         movd    %mm6, -4(%edx)          C                               m 2
189         psrlq   $32, %mm6               C                               m 2
190 L(m10): pmuludq %mm7, %mm3              C                               m 2
191         paddq   %mm1, %mm6              C                               m 2
192         movd    12(%eax), %mm0          C                               m 2
193         movd    %mm6, (%edx)            C                               m 2
194         psrlq   $32, %mm6               C                               m 2
195         pmuludq %mm7, %mm0              C                               m 2
196         paddq   %mm4, %mm6              C                               m 2
197         movd    16(%eax), %mm1          C                               m 2
198         movd    %mm6, 4(%edx)           C                               m 2
199         psrlq   $32, %mm6               C                               m 2
200         pmuludq %mm7, %mm1              C                               m 2
201         paddq   %mm3, %mm6              C                               m 2
202         movd    20(%eax), %mm4          C                               m 2
203         movd    %mm6, 8(%edx)           C                               m 2
204         psrlq   $32, %mm6               C                               m 2
205         lea     16(%eax), %eax          C                               m 2
206         lea     16(%edx), %edx          C                               m 2
207         sub     $4, %ecx                C                               m 2
208         ja      L(lpm2)                 C                               m 2
209         pmuludq %mm7, %mm4              C                               m 2
210         paddq   %mm0, %mm6              C                               m 2
211         movd    %mm6, -4(%edx)          C                               m 2
212         psrlq   $32, %mm6               C                               m 2
213         paddq   %mm1, %mm6              C                               m 2
214         jmp     L(1)
215
216 L(3m):
217         movd    (%eax), %mm0            C                               m 3
218         lea     (%ebx), %ecx            C inner loop count              m 3
219         pmuludq %mm7, %mm0              C                               m 3
220         movd    4(%eax), %mm1           C                               m 3
221         pmuludq %mm7, %mm1              C                               m 3
222         movd    8(%eax), %mm4           C                               m 3
223         jmp     L(lpm3)                 C                               m 3
224         ALIGN(16)                       C                               m 3
225 L(lpm3):
226         pmuludq %mm7, %mm4              C                               m 3
227         paddq   %mm0, %mm6              C                               m 3
228         movd    12(%eax), %mm3          C                               m 3
229         movd    %mm6, (%edx)            C                               m 3
230         psrlq   $32, %mm6               C                               m 3
231         pmuludq %mm7, %mm3              C                               m 3
232         paddq   %mm1, %mm6              C                               m 3
233         movd    16(%eax), %mm0          C                               m 3
234         movd    %mm6, 4(%edx)           C                               m 3
235         psrlq   $32, %mm6               C                               m 3
236         pmuludq %mm7, %mm0              C                               m 3
237         paddq   %mm4, %mm6              C                               m 3
238         movd    20(%eax), %mm1          C                               m 3
239         movd    %mm6, 8(%edx)           C                               m 3
240         psrlq   $32, %mm6               C                               m 3
241         pmuludq %mm7, %mm1              C                               m 3
242         paddq   %mm3, %mm6              C                               m 3
243         movd    24(%eax), %mm4          C                               m 3
244         movd    %mm6, 12(%edx)          C                               m 3
245         psrlq   $32, %mm6               C                               m 3
246         lea     16(%eax), %eax          C                               m 3
247         lea     16(%edx), %edx          C                               m 3
248         sub     $4, %ecx                C                               m 3
249         ja      L(lpm3)                 C                               m 3
250         pmuludq %mm7, %mm4              C                               m 3
251         paddq   %mm0, %mm6              C                               m 3
252         movd    %mm6, (%edx)            C                               m 3
253         psrlq   $32, %mm6               C                               m 3
254         paddq   %mm1, %mm6              C                               m 3
255         jmp     L(2)
256
257 L(0m):
258         movd    (%eax), %mm3            C                               m 0
259         lea     (%ebx), %ecx            C inner loop count              m 0
260         pmuludq %mm7, %mm3              C                               m 0
261         movd    4(%eax), %mm0           C                               m 0
262         pmuludq %mm7, %mm0              C                               m 0
263         movd    8(%eax), %mm1           C                               m 0
264         jmp     L(m00)                  C                               m 0
265         ALIGN(16)                       C                               m 0
266 L(lpm0):
267         pmuludq %mm7, %mm4              C                               m 0
268         paddq   %mm0, %mm6              C                               m 0
269         movd    (%eax), %mm3            C                               m 0
270         movd    %mm6, -12(%edx)         C                               m 0
271         psrlq   $32, %mm6               C                               m 0
272         pmuludq %mm7, %mm3              C                               m 0
273         paddq   %mm1, %mm6              C                               m 0
274         movd    4(%eax), %mm0           C                               m 0
275         movd    %mm6, -8(%edx)          C                               m 0
276         psrlq   $32, %mm6               C                               m 0
277         pmuludq %mm7, %mm0              C                               m 0
278         paddq   %mm4, %mm6              C                               m 0
279         movd    8(%eax), %mm1           C                               m 0
280         movd    %mm6, -4(%edx)          C                               m 0
281         psrlq   $32, %mm6               C                               m 0
282 L(m00): pmuludq %mm7, %mm1              C                               m 0
283         paddq   %mm3, %mm6              C                               m 0
284         movd    12(%eax), %mm4          C                               m 0
285         movd    %mm6, (%edx)            C                               m 0
286         psrlq   $32, %mm6               C                               m 0
287         lea     16(%eax), %eax          C                               m 0
288         lea     16(%edx), %edx          C                               m 0
289         sub     $4, %ecx                C                               m 0
290         ja      L(lpm0)                 C                               m 0
291         pmuludq %mm7, %mm4              C                               m 0
292         paddq   %mm0, %mm6              C                               m 0
293         movd    %mm6, -12(%edx)         C                               m 0
294         psrlq   $32, %mm6               C                               m 0
295         paddq   %mm1, %mm6              C                               m 0
296         jmp     L(3)
297
298 L(outer):
299         lea     8(%edi), %edi           C rp += 2
300         movd    (%esi), %mm7            C                               am 3
301         mov     %edi, %edx              C rp2 = rp                      am 3
302         lea     4(%esi), %esi           C up++                          am 3
303         lea     (%esi), %eax            C up2 = up                      am 3
304         movd    (%eax), %mm0            C                               am 3
305         lea     (%ebx), %ecx            C inner loop count              am 3
306         pxor    %mm6, %mm6              C                               am 3
307         pmuludq %mm7, %mm0              C                               am 3
308         movd    4(%eax), %mm1           C                               am 3
309         movd    (%edx), %mm4            C                               am 3
310         pmuludq %mm7, %mm1              C                               am 3
311         movd    8(%eax), %mm2           C                               am 3
312         paddq   %mm0, %mm4              C                               am 3
313         movd    4(%edx), %mm5           C                               am 3
314         jmp     L(lam3)                 C                               am 3
315         ALIGN(16)                       C                               am 3
316 L(lam3):
317         pmuludq %mm7, %mm2              C                               am 3
318         paddq   %mm4, %mm6              C                               am 3
319         movd    12(%eax), %mm3          C                               am 3
320         paddq   %mm1, %mm5              C                               am 3
321         movd    8(%edx), %mm4           C                               am 3
322         movd    %mm6, (%edx)            C                               am 3
323         psrlq   $32, %mm6               C                               am 3
324         pmuludq %mm7, %mm3              C                               am 3
325         paddq   %mm5, %mm6              C                               am 3
326         movd    16(%eax), %mm0          C                               am 3
327         paddq   %mm2, %mm4              C                               am 3
328         movd    12(%edx), %mm5          C                               am 3
329         movd    %mm6, 4(%edx)           C                               am 3
330         psrlq   $32, %mm6               C                               am 3
331         pmuludq %mm7, %mm0              C                               am 3
332         paddq   %mm4, %mm6              C                               am 3
333         movd    20(%eax), %mm1          C                               am 3
334         paddq   %mm3, %mm5              C                               am 3
335         movd    16(%edx), %mm4          C                               am 3
336         movd    %mm6, 8(%edx)           C                               am 3
337         psrlq   $32, %mm6               C                               am 3
338         pmuludq %mm7, %mm1              C                               am 3
339         paddq   %mm5, %mm6              C                               am 3
340         movd    24(%eax), %mm2          C                               am 3
341         paddq   %mm0, %mm4              C                               am 3
342         movd    20(%edx), %mm5          C                               am 3
343         movd    %mm6, 12(%edx)          C                               am 3
344         psrlq   $32, %mm6               C                               am 3
345         lea     16(%eax), %eax          C                               am 3
346         lea     16(%edx), %edx          C                               am 3
347         sub     $4, %ecx                C                               am 3
348         ja      L(lam3)                 C                               am 3
349         pmuludq %mm7, %mm2              C                               am 3
350         paddq   %mm4, %mm6              C                               am 3
351         paddq   %mm1, %mm5              C                               am 3
352         movd    8(%edx), %mm4           C                               am 3
353         movd    %mm6, (%edx)            C                               am 3
354         psrlq   $32, %mm6               C                               am 3
355         paddq   %mm5, %mm6              C                               am 3
356         paddq   %mm2, %mm4              C                               am 3
357 L(2):   movd    %mm6, 4(%edx)           C                               am 3
358         psrlq   $32, %mm6               C                               am 3
359         paddq   %mm4, %mm6              C                               am 3
360         movd    %mm6, 8(%edx)           C                               am 3
361         psrlq   $32, %mm6               C                               am 3
362         movd    %mm6, 12(%edx)          C                               am 3
363
364         lea     8(%edi), %edi           C rp += 2
365         movd    (%esi), %mm7            C                               am 2
366         mov     %edi, %edx              C rp2 = rp                      am 2
367         lea     4(%esi), %esi           C up++                          am 2
368         lea     (%esi), %eax            C up2 = up                      am 2
369         movd    (%eax), %mm1            C                               am 2
370         lea     (%ebx), %ecx            C inner loop count              am 2
371         pxor    %mm6, %mm6              C                               am 2
372         pmuludq %mm7, %mm1              C                               am 2
373         movd    4(%eax), %mm2           C                               am 2
374         movd    (%edx), %mm5            C                               am 2
375         pmuludq %mm7, %mm2              C                               am 2
376         movd    8(%eax), %mm3           C                               am 2
377         paddq   %mm1, %mm5              C                               am 2
378         movd    4(%edx), %mm4           C                               am 2
379         jmp     L(am10)                 C                               am 2
380         ALIGN(16)                       C                               am 2
381 L(lam2):
382         pmuludq %mm7, %mm2              C                               am 2
383         paddq   %mm4, %mm6              C                               am 2
384         movd    8(%eax), %mm3           C                               am 2
385         paddq   %mm1, %mm5              C                               am 2
386         movd    4(%edx), %mm4           C                               am 2
387         movd    %mm6, -4(%edx)          C                               am 2
388         psrlq   $32, %mm6               C                               am 2
389 L(am10):
390         pmuludq %mm7, %mm3              C                               am 2
391         paddq   %mm5, %mm6              C                               am 2
392         movd    12(%eax), %mm0          C                               am 2
393         paddq   %mm2, %mm4              C                               am 2
394         movd    8(%edx), %mm5           C                               am 2
395         movd    %mm6, (%edx)            C                               am 2
396         psrlq   $32, %mm6               C                               am 2
397         pmuludq %mm7, %mm0              C                               am 2
398         paddq   %mm4, %mm6              C                               am 2
399         movd    16(%eax), %mm1          C                               am 2
400         paddq   %mm3, %mm5              C                               am 2
401         movd    12(%edx), %mm4          C                               am 2
402         movd    %mm6, 4(%edx)           C                               am 2
403         psrlq   $32, %mm6               C                               am 2
404         pmuludq %mm7, %mm1              C                               am 2
405         paddq   %mm5, %mm6              C                               am 2
406         movd    20(%eax), %mm2          C                               am 2
407         paddq   %mm0, %mm4              C                               am 2
408         movd    16(%edx), %mm5          C                               am 2
409         movd    %mm6, 8(%edx)           C                               am 2
410         psrlq   $32, %mm6               C                               am 2
411         lea     16(%eax), %eax          C                               am 2
412         lea     16(%edx), %edx          C                               am 2
413         sub     $4, %ecx                C                               am 2
414         ja      L(lam2)                 C                               am 2
415         pmuludq %mm7, %mm2              C                               am 2
416         paddq   %mm4, %mm6              C                               am 2
417         paddq   %mm1, %mm5              C                               am 2
418         movd    4(%edx), %mm4           C                               am 2
419         movd    %mm6, -4(%edx)          C                               am 2
420         psrlq   $32, %mm6               C                               am 2
421         paddq   %mm5, %mm6              C                               am 2
422         paddq   %mm2, %mm4              C                               am 2
423 L(1):   movd    %mm6, (%edx)            C                               am 2
424         psrlq   $32, %mm6               C                               am 2
425         paddq   %mm4, %mm6              C                               am 2
426         movd    %mm6, 4(%edx)           C                               am 2
427         psrlq   $32, %mm6               C                               am 2
428         movd    %mm6, 8(%edx)           C                               am 2
429
430         lea     8(%edi), %edi           C rp += 2
431         movd    (%esi), %mm7            C                               am 1
432         mov     %edi, %edx              C rp2 = rp                      am 1
433         lea     4(%esi), %esi           C up++                          am 1
434         lea     (%esi), %eax            C up2 = up                      am 1
435         movd    (%eax), %mm2            C                               am 1
436         lea     (%ebx), %ecx            C inner loop count              am 1
437         pxor    %mm6, %mm6              C                               am 1
438         pmuludq %mm7, %mm2              C                               am 1
439         movd    4(%eax), %mm3           C                               am 1
440         movd    (%edx), %mm4            C                               am 1
441         pmuludq %mm7, %mm3              C                               am 1
442         movd    8(%eax), %mm0           C                               am 1
443         paddq   %mm2, %mm4              C                               am 1
444         movd    4(%edx), %mm5           C                               am 1
445         jmp     L(am01)                 C                               am 1
446         ALIGN(16)                       C                               am 1
447 L(lam1):
448         pmuludq %mm7, %mm2              C                               am 1
449         paddq   %mm4, %mm6              C                               am 1
450         movd    4(%eax), %mm3           C                               am 1
451         paddq   %mm1, %mm5              C                               am 1
452         movd    (%edx), %mm4            C                               am 1
453         movd    %mm6, -8(%edx)          C                               am 1
454         psrlq   $32, %mm6               C                               am 1
455         pmuludq %mm7, %mm3              C                               am 1
456         paddq   %mm5, %mm6              C                               am 1
457         movd    8(%eax), %mm0           C                               am 1
458         paddq   %mm2, %mm4              C                               am 1
459         movd    4(%edx), %mm5           C                               am 1
460         movd    %mm6, -4(%edx)          C                               am 1
461         psrlq   $32, %mm6               C                               am 1
462 L(am01):
463         pmuludq %mm7, %mm0              C                               am 1
464         paddq   %mm4, %mm6              C                               am 1
465         movd    12(%eax), %mm1          C                               am 1
466         paddq   %mm3, %mm5              C                               am 1
467         movd    8(%edx), %mm4           C                               am 1
468         movd    %mm6, (%edx)            C                               am 1
469         psrlq   $32, %mm6               C                               am 1
470         pmuludq %mm7, %mm1              C                               am 1
471         paddq   %mm5, %mm6              C                               am 1
472         movd    16(%eax), %mm2          C                               am 1
473         paddq   %mm0, %mm4              C                               am 1
474         movd    12(%edx), %mm5          C                               am 1
475         movd    %mm6, 4(%edx)           C                               am 1
476         psrlq   $32, %mm6               C                               am 1
477         lea     16(%eax), %eax          C                               am 1
478         lea     16(%edx), %edx          C                               am 1
479         sub     $4, %ecx                C                               am 1
480         ja      L(lam1)                 C                               am 1
481         pmuludq %mm7, %mm2              C                               am 1
482         paddq   %mm4, %mm6              C                               am 1
483         paddq   %mm1, %mm5              C                               am 1
484         movd    (%edx), %mm4            C                               am 1
485         movd    %mm6, -8(%edx)          C                               am 1
486         psrlq   $32, %mm6               C                               am 1
487         paddq   %mm5, %mm6              C                               am 1
488         paddq   %mm2, %mm4              C                               am 1
489 L(0):   movd    %mm6, -4(%edx)          C                               am 1
490         psrlq   $32, %mm6               C                               am 1
491         paddq   %mm4, %mm6              C                               am 1
492         movd    %mm6, (%edx)            C                               am 1
493         psrlq   $32, %mm6               C                               am 1
494         movd    %mm6, 4(%edx)           C                               am 1
495
    C Pass tagged am 0.  Multiplies a run of limbs read through %eax by the
    C single limb held in %mm7 and adds the products into the limbs addressed
    C through %edx, four limbs per trip round L(lam0).  The high part left
    C after the last addition is written to (%edx) without reading the old
    C value there.
    C Register use, as the code below shows:
    C   %mm7        multiplier limb, loaded from (%esi) before %esi advances
    C   %mm0-%mm3   64-bit products from pmuludq
    C   %mm4,%mm5   destination limb plus product, waiting to be summed
    C   %mm6        running sum; low 32 bits are stored, the rest is carry
    C   %ecx        inner count, copied from %ebx
    C   %ebx        reduced by 4 at the end, then tested for another pass
    C NOTE(review): L(3) has no reference in this part of the file; it is
    C presumably an entry point used from earlier code, confirm.
496         lea     8(%edi), %edi           C rp += 2
497         movd    (%esi), %mm7            C mm7 = multiplier limb         am 0
498         mov     %edi, %edx              C rp2 = rp                      am 0
499         lea     4(%esi), %esi           C up++                          am 0
500         lea     (%esi), %eax            C up2 = up                      am 0
501         movd    (%eax), %mm3            C first multiplicand limb       am 0
502         lea     (%ebx), %ecx            C inner loop count              am 0
503         pxor    %mm6, %mm6              C running sum/carry = 0         am 0
504         pmuludq %mm7, %mm3              C mm3 = limb * mm7 (64 bits)    am 0
505         movd    4(%eax), %mm0           C                               am 0
506         movd    (%edx), %mm5            C first destination limb        am 0
507         pmuludq %mm7, %mm0              C                               am 0
508         movd    8(%eax), %mm1           C                               am 0
509         paddq   %mm3, %mm5              C dest limb + product           am 0
510         movd    4(%edx), %mm4           C                               am 0
511         jmp     L(am00)                 C enter loop part way through   am 0
512         ALIGN(16)                       C                               am 0
513 L(lam0):
514         pmuludq %mm7, %mm2              C                               am 0
515         paddq   %mm4, %mm6              C add next limb into sum        am 0
516         movd    (%eax), %mm3            C                               am 0
517         paddq   %mm1, %mm5              C                               am 0
518         movd    -4(%edx), %mm4          C                               am 0
519         movd    %mm6, -12(%edx)         C store low 32 bits             am 0
520         psrlq   $32, %mm6               C keep high part as carry       am 0
521         pmuludq %mm7, %mm3              C                               am 0
522         paddq   %mm5, %mm6              C                               am 0
523         movd    4(%eax), %mm0           C                               am 0
524         paddq   %mm2, %mm4              C                               am 0
525         movd    (%edx), %mm5            C                               am 0
526         movd    %mm6, -8(%edx)          C                               am 0
527         psrlq   $32, %mm6               C                               am 0
528         pmuludq %mm7, %mm0              C                               am 0
529         paddq   %mm4, %mm6              C                               am 0
530         movd    8(%eax), %mm1           C                               am 0
531         paddq   %mm3, %mm5              C                               am 0
532         movd    4(%edx), %mm4           C                               am 0
533         movd    %mm6, -4(%edx)          C                               am 0
534         psrlq   $32, %mm6               C                               am 0
535 L(am00):
536         pmuludq %mm7, %mm1              C                               am 0
537         paddq   %mm5, %mm6              C                               am 0
538         movd    12(%eax), %mm2          C                               am 0
539         paddq   %mm0, %mm4              C                               am 0
540         movd    8(%edx), %mm5           C                               am 0
541         movd    %mm6, (%edx)            C                               am 0
542         psrlq   $32, %mm6               C                               am 0
543         lea     16(%eax), %eax          C up2 += 4                      am 0
544         lea     16(%edx), %edx          C rp2 += 4                      am 0
545         sub     $4, %ecx                C count -= 4                    am 0
546         ja      L(lam0)                 C loop if result > 0, no borrow am 0
547         pmuludq %mm7, %mm2              C last product of this pass     am 0
548         paddq   %mm4, %mm6              C                               am 0
549         paddq   %mm1, %mm5              C                               am 0
550         movd    -4(%edx), %mm4          C                               am 0
551         movd    %mm6, -12(%edx)         C                               am 0
552         psrlq   $32, %mm6               C                               am 0
553         paddq   %mm5, %mm6              C                               am 0
554         paddq   %mm2, %mm4              C                               am 0
555 L(3):   movd    %mm6, -8(%edx)          C                               am 0
556         psrlq   $32, %mm6               C                               am 0
557         paddq   %mm4, %mm6              C                               am 0
558         movd    %mm6, -4(%edx)          C                               am 0
559         psrlq   $32, %mm6               C                               am 0
560         movd    %mm6, (%edx)            C store remaining high part     am 0
561         sub     $4, %ebx                C ebx -= 4                      am 0
562         ja      L(outer)                        C next pass if > 0, no borrow   am 0
563
564         mov     %edi, %edx              C rp2 = rp, as used by the code at L(am3)
565         mov     %esi, %eax              C up2 = up, as used by the code at L(am3)
566         pop     %edi                    C restore registers; NOTE(review): presumably
567         pop     %ebx                    C these match pushes made near function
568         pop     %esi                    C entry, in the reverse order - confirm
569
570 L(am3): C up[un-1..un-3] x up[un-4]
    C Multiplies the limb at (%eax) by the three limbs that follow it and
    C adds the products into the three limbs at (%edx), 4(%edx) and 8(%edx)
    C (addresses taken after the rp2 += 2 step below).  The carry out is
    C written to 12(%edx) without reading its old value.  Leaves %eax one
    C limb further on and falls through into L(am2).
571         lea     8(%edx), %edx           C rp2 += 2
572         movd    (%eax), %mm7            C multiplier limb
573         movd    4(%eax), %mm1           C three multiplicand limbs
574         movd    8(%eax), %mm2
575         movd    12(%eax), %mm3
576         movd    (%edx), %mm4            C three destination limbs
577         pmuludq %mm7, %mm1              C 64-bit products
578         movd    4(%edx), %mm5
579         pmuludq %mm7, %mm2
580         movd    8(%edx), %mm6
581         pmuludq %mm7, %mm3
582         paddq   %mm1, %mm4              C destination limb + product
583         paddq   %mm2, %mm5
584         paddq   %mm3, %mm6
585         movd    %mm4, (%edx)            C store low 32 bits
586         psrlq   $32, %mm4               C high part becomes the carry
587         paddq   %mm5, %mm4              C carry + next column
588         movd    %mm4, 4(%edx)
589         psrlq   $32, %mm4
590         paddq   %mm6, %mm4
591         movd    %mm4, 8(%edx)
592         psrlq   $32, %mm4
593         movd    %mm4, 12(%edx)          C carry limb; FIXME feed through! (L(am2) reloads it as 4(%edx))
594         lea     4(%eax), %eax           C up2++
595
596 L(am2): C up[un-1..un-2] x up[un-3]
    C Multiplies the limb at (%eax) by the two limbs that follow it and adds
    C the products into the two limbs at (%edx) and 4(%edx) (addresses taken
    C after the rp2 += 2 step below).  The carry out is written to 8(%edx)
    C without reading its old value.  Leaves %eax one limb further on and
    C falls through into L(am1).
597         lea     8(%edx), %edx           C rp2 += 2
598         movd    (%eax), %mm7            C multiplier limb
599         movd    4(%eax), %mm1           C two multiplicand limbs
600         movd    8(%eax), %mm2
601         movd    (%edx), %mm4            C two destination limbs
602         movd    4(%edx), %mm5
603         pmuludq %mm7, %mm1              C 64-bit products
604         pmuludq %mm7, %mm2
605         paddq   %mm1, %mm4              C destination limb + product
606         paddq   %mm2, %mm5
607         movd    %mm4, (%edx)            C store low 32 bits
608         psrlq   $32, %mm4               C high part becomes the carry
609         paddq   %mm5, %mm4              C carry + next column
610         movd    %mm4, 4(%edx)
611         psrlq   $32, %mm4
612         movd    %mm4, 8(%edx)           C carry limb; FIXME feed through! (L(am1) reloads it as (%edx))
613         lea     4(%eax), %eax           C up2++
614
615 L(am1): C up[un-1] x up[un-2]
    C Multiplies the limb at (%eax) by the limb after it and adds the
    C product into the limb at (%edx) (address taken after the rp2 += 2 step
    C below).  The high part of the sum is written to 4(%edx) without
    C reading its old value.  Falls through into the diagonal code.
616         lea     8(%edx), %edx           C rp2 += 2
617         movd    (%eax), %mm7            C multiplier limb
618         movd    4(%eax), %mm2           C multiplicand limb
619         movd    (%edx), %mm4            C destination limb
620         pmuludq %mm7, %mm2              C 64-bit product
621         paddq   %mm2, %mm4              C destination limb + product
622         movd    %mm4, (%edx)            C store low 32 bits
623         psrlq   $32, %mm4
624         movd    %mm4, 4(%edx)           C store high part
625
626 C *** diag stuff, use elementary code for now
    C Final pass over the whole result, restarting from the stack arguments.
    C   dst[0]       = low half of src[0]^2
    C   dst[1]       = 2*dst[1] + high half of src[0]^2
    C   dst[2k]      = 2*dst[2k]   + low half of src[k]^2  + carry
    C   dst[2k+1]    = 2*dst[2k+1] + high half of src[k]^2 + carry
    C   dst[2*un-1]  = high half of src[un-1]^2 + carry (old value not read)
    C L(diag) handles k = 1 .. un-2; the code after it handles k = un-1.
    C %mm2 carries the running sum from limb to limb, and %mm7 is a mask
    C selecting the low 32 bits of a 64-bit square.
    C NOTE(review): the dst limbs doubled here are presumably the
    C off-diagonal products summed by the code above - confirm.
    C NOTE(review): the L(diag) count is un-2 and is only tested after the
    C first trip, so this path needs un >= 3; presumably smaller un never
    C gets here - confirm against the code before this point.
627
628         mov     4(%esp), %edx           C rp
629         mov     8(%esp), %eax           C up
630         mov     12(%esp), %ecx          C un
631
632         movd    (%eax), %mm2            C src[0]
633         pmuludq %mm2, %mm2              C src[0]^2
634
635         pcmpeqd %mm7, %mm7              C all bits set
636         psrlq   $32, %mm7               C mask for the low 32 bits
637
638         movd    4(%edx), %mm3           C dst[1]
639
640         movd    %mm2, (%edx)            C dst[0] = low half of src[0]^2
641         psrlq   $32, %mm2               C high half of src[0]^2
642
643         psllq   $1, %mm3                C 2*dst[1]
644         paddq   %mm3, %mm2              C high half + 2*dst[1]
645         movd    %mm2, 4(%edx)           C new dst[1]
646         psrlq   $32, %mm2               C carry
647
648         sub     $2, %ecx                C loop count = un-2
649
650 L(diag):
651         movd    4(%eax), %mm0           C src[k]
652         add     $4, %eax                C up++
653         pmuludq %mm0, %mm0              C src[k]^2
654         movq    %mm7, %mm1              C copy of the low-half mask
655         pand    %mm0, %mm1              C diagonal low
656         psrlq   $32, %mm0               C diagonal high
657
658         movd    8(%edx), %mm3           C dst[2k]
659         psllq   $1, %mm3                C 2*dst[2k]
660         paddq   %mm3, %mm1              C + diagonal low
661         paddq   %mm1, %mm2              C + carry
662         movd    %mm2, 8(%edx)           C new dst[2k]
663         psrlq   $32, %mm2               C carry
664
665         movd    12(%edx), %mm3          C dst[2k+1]
666         psllq   $1, %mm3                C 2*dst[2k+1]
667         paddq   %mm3, %mm0              C + diagonal high
668         paddq   %mm0, %mm2              C + carry
669         movd    %mm2, 12(%edx)          C new dst[2k+1]
670         add     $8, %edx                C rp += 2
671         psrlq   $32, %mm2               C carry
672
673         sub     $1, %ecx
674         jnz     L(diag)                 C repeat until the count reaches zero
675
676         movd    4(%eax), %mm0           C src[un-1]
677         pmuludq %mm0, %mm0              C src[un-1]^2
678         pand    %mm0, %mm7              C diagonal low (mask no longer needed)
679         psrlq   $32, %mm0               C diagonal high
680
681         movd    8(%edx), %mm3           C dst[2*un-2]
682         psllq   $1, %mm3                C 2*dst[2*un-2]
683         paddq   %mm3, %mm7              C + diagonal low
684         paddq   %mm7, %mm2              C + carry
685         movd    %mm2, 8(%edx)           C new dst[2*un-2]
686         psrlq   $32, %mm2               C carry
687
688         paddq   %mm0, %mm2              C carry + diagonal high
689         movd    %mm2, 12(%edx)          C dst[2*un-1]
690
691         emms                            C empty the MMX state before returning
692         ret
693
694 EPILOGUE()