Imported Upstream version 6.0.0
[platform/upstream/gmp.git] / mpn / x86 / pentium4 / sse2 / mul_basecase.asm
1 dnl  mpn_mul_basecase for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
2
3 dnl  Copyright 2001, 2002, 2005, 2007 Free Software Foundation, Inc.
4
5 dnl  This file is part of the GNU MP Library.
6 dnl
7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl  it under the terms of either:
9 dnl
10 dnl    * the GNU Lesser General Public License as published by the Free
11 dnl      Software Foundation; either version 3 of the License, or (at your
12 dnl      option) any later version.
13 dnl
14 dnl  or
15 dnl
16 dnl    * the GNU General Public License as published by the Free Software
17 dnl      Foundation; either version 2 of the License, or (at your option) any
18 dnl      later version.
19 dnl
20 dnl  or both in parallel, as here.
21 dnl
22 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
23 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
24 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
25 dnl  for more details.
26 dnl
27 dnl  You should have received copies of the GNU General Public License and the
28 dnl  GNU Lesser General Public License along with the GNU MP Library.  If not,
29 dnl  see https://www.gnu.org/licenses/.
30
31 include(`../config.m4')
32
33 C TODO:
34 C  * Improve ad-hoc outer loop code and register handling.  Some feed-in
35 C    scheduling could improve things by several cycles per outer iteration.
36 C  * In code for un <= 3, try keeping accumulation operands in registers,
37 C    without storing intermediates to rp.
38 C  * We might want to keep 32 in a free mm register, since the register form is
39 C    3 bytes and the immediate form is 4 bytes.  About 70 bytes to save.
40 C  * Look into different loop alignment; the code currently grows by about 50
41 C    bytes because of possibly needless alignment.
42 C  * Perhaps rewrap loops 00,01,02 (6 loops) to allow fall-through entry.
43 C  * Use OSP; that should solve the feed-in latency problems.
44 C  * Save a few tens of bytes by doing cross-jumping for L(oel0), etc.
45 C  * Save around 120 bytes by remapping the "m 0", "m 1", "m 2" and "m 3"
46 C    registers so that they can share feed-in code, and changing the branch
47 C    targets from L(<n>) to L(m<nn>).
48
49 C                           cycles/limb
50 C P6 model 9   (Banias)         ?
51 C P6 model 13  (Dothan)         5.24
52 C P6 model 14  (Yonah)          ?
53 C P4 model 0-1 (Willamette):    5
54 C P4 model 2   (Northwood):     4.60 at 32 limbs
55 C P4 model 3-4 (Prescott):      4.94 at 32 limbs
56
57 C INPUT PARAMETERS
58 C rp            sp + 4
59 C up            sp + 8
60 C un            sp + 12
61 C vp            sp + 16
62 C vn            sp + 20
63
64         TEXT
65         ALIGN(16)
66 PROLOGUE(mpn_mul_basecase)
67         push    %esi
68         push    %ebx
69         mov     12(%esp), %edx          C rp
70         mov     16(%esp), %eax          C up
71         mov     20(%esp), %ecx          C un
72         mov     24(%esp), %esi          C vp
73         mov     28(%esp), %ebx          C vn
74         movd    (%esi), %mm7            C
75 L(ent): cmp     $3, %ecx
76         ja      L(big)
77         movd    (%eax), %mm6
78         pmuludq %mm7, %mm6
79         jz      L(un3)
80         cmp     $2, %ecx
81         jz      L(un2)
82
83 L(un1): movd    %mm6, (%edx)            C                               un=1
84         psrlq   $32, %mm6               C                               un=1
85         movd    %mm6, 4(%edx)           C                               un=1
86         jmp     L(rtr)                  C                               un=1
87
88 L(un2): movd    4(%eax), %mm1           C                               un=2
89         pmuludq %mm7, %mm1              C                               un=2
90         movd    %mm6, (%edx)            C                               un=2
91         psrlq   $32, %mm6               C                               un=2
92         paddq   %mm1, %mm6              C                               un=2
93         movd    %mm6, 4(%edx)           C                               un=2
94         psrlq   $32, %mm6               C                               un=2
95         movd    %mm6, 8(%edx)           C                               un=2
96       dec       %ebx                    C                               un=2
97       jz        L(rtr)                  C                               un=2
98         movd    4(%esi), %mm7           C                               un=2
99         movd    (%eax), %mm6            C                               un=2
100         pmuludq %mm7, %mm6              C                               un=2
101         movd    4(%eax), %mm1           C                               un=2
102         movd    4(%edx), %mm4           C                               un=2
103         pmuludq %mm7, %mm1              C                               un=2
104         movd    8(%edx), %mm5           C                               un=2
105         paddq   %mm4, %mm6              C                               un=2
106         paddq   %mm1, %mm5              C                               un=2
107         movd    %mm6, 4(%edx)           C                               un=2
108         psrlq   $32, %mm6               C                               un=2
109         paddq   %mm5, %mm6              C                               un=2
110         movd    %mm6, 8(%edx)           C                               un=2
111         psrlq   $32, %mm6               C                               un=2
112         movd    %mm6, 12(%edx)          C                               un=2
113 L(rtr): emms
114         pop     %ebx
115         pop     %esi
116         ret
117
118 L(un3): movd    4(%eax), %mm1           C                               un=3
119         pmuludq %mm7, %mm1              C                               un=3
120         movd    8(%eax), %mm2           C                               un=3
121         pmuludq %mm7, %mm2              C                               un=3
122         movd    %mm6, (%edx)            C                               un=3
123         psrlq   $32, %mm6               C                               un=3
124         paddq   %mm1, %mm6              C                               un=3
125         movd    %mm6, 4(%edx)           C                               un=3
126         psrlq   $32, %mm6               C                               un=3
127         paddq   %mm2, %mm6              C                               un=3
128         movd    %mm6, 8(%edx)           C                               un=3
129         psrlq   $32, %mm6               C                               un=3
130         movd    %mm6, 12(%edx)          C                               un=3
131       dec       %ebx                    C                               un=3
132       jz        L(rtr)                  C                               un=3
133         movd    4(%esi), %mm7           C                               un=3
134         movd    (%eax), %mm6            C                               un=3
135         pmuludq %mm7, %mm6              C                               un=3
136         movd    4(%eax), %mm1           C                               un=3
137         movd    4(%edx), %mm4           C                               un=3
138         pmuludq %mm7, %mm1              C                               un=3
139         movd    8(%eax), %mm2           C                               un=3
140         movd    8(%edx), %mm5           C                               un=3
141         pmuludq %mm7, %mm2              C                               un=3
142         paddq   %mm4, %mm6              C                               un=3
143         paddq   %mm1, %mm5              C                               un=3
144         movd    12(%edx), %mm4          C                               un=3
145         movd    %mm6, 4(%edx)           C                               un=3
146         psrlq   $32, %mm6               C                               un=3
147         paddq   %mm5, %mm6              C                               un=3
148         paddq   %mm2, %mm4              C                               un=3
149         movd    %mm6, 8(%edx)           C                               un=3
150         psrlq   $32, %mm6               C                               un=3
151         paddq   %mm4, %mm6              C                               un=3
152         movd    %mm6, 12(%edx)          C                               un=3
153         psrlq   $32, %mm6               C                               un=3
154         movd    %mm6, 16(%edx)          C                               un=3
155       dec       %ebx                    C                               un=3
156       jz        L(rtr)                  C                               un=3
157         movd    8(%esi), %mm7           C                               un=3
158         movd    (%eax), %mm6            C                               un=3
159         pmuludq %mm7, %mm6              C                               un=3
160         movd    4(%eax), %mm1           C                               un=3
161         movd    8(%edx), %mm4           C                               un=3
162         pmuludq %mm7, %mm1              C                               un=3
163         movd    8(%eax), %mm2           C                               un=3
164         movd    12(%edx), %mm5          C                               un=3
165         pmuludq %mm7, %mm2              C                               un=3
166         paddq   %mm4, %mm6              C                               un=3
167         paddq   %mm1, %mm5              C                               un=3
168         movd    16(%edx), %mm4          C                               un=3
169         movd    %mm6, 8(%edx)           C                               un=3
170         psrlq   $32, %mm6               C                               un=3
171         paddq   %mm5, %mm6              C                               un=3
172         paddq   %mm2, %mm4              C                               un=3
173         movd    %mm6, 12(%edx)          C                               un=3
174         psrlq   $32, %mm6               C                               un=3
175         paddq   %mm4, %mm6              C                               un=3
176         movd    %mm6, 16(%edx)          C                               un=3
177         psrlq   $32, %mm6               C                               un=3
178         movd    %mm6, 20(%edx)          C                               un=3
179         jmp     L(rtr)
180
181
182 L(big): push    %edi
183         pxor    %mm6, %mm6
184         lea     4(%esi), %esi
185         and     $3, %ecx
186         jz      L(0)
187         cmp     $2, %ecx
188         jc      L(1)
189         jz      L(2)
190         jmp     L(3)                    C FIXME: one case should fall through
191
192
193 L(0):   movd    (%eax), %mm3            C                               m 0
194         sub     24(%esp), %ecx          C inner loop count              m 0
195         mov     %ecx, 24(%esp)          C update loop count for later   m 0
196         pmuludq %mm7, %mm3              C                               m 0
197         movd    4(%eax), %mm0           C                               m 0
198         pmuludq %mm7, %mm0              C                               m 0
199         movd    8(%eax), %mm1           C                               m 0
200         jmp     L(m00)                  C                               m 0
201         ALIGN(16)                       C                               m 0
202 L(lpm0):
203         pmuludq %mm7, %mm4              C                               m 0
204         paddq   %mm0, %mm6              C                               m 0
205         movd    (%eax), %mm3            C                               m 0
206         movd    %mm6, -12(%edx)         C                               m 0
207         psrlq   $32, %mm6               C                               m 0
208         pmuludq %mm7, %mm3              C                               m 0
209         paddq   %mm1, %mm6              C                               m 0
210         movd    4(%eax), %mm0           C                               m 0
211         movd    %mm6, -8(%edx)          C                               m 0
212         psrlq   $32, %mm6               C                               m 0
213         pmuludq %mm7, %mm0              C                               m 0
214         paddq   %mm4, %mm6              C                               m 0
215         movd    8(%eax), %mm1           C                               m 0
216         movd    %mm6, -4(%edx)          C                               m 0
217         psrlq   $32, %mm6               C                               m 0
218 L(m00): pmuludq %mm7, %mm1              C                               m 0
219         paddq   %mm3, %mm6              C                               m 0
220         movd    12(%eax), %mm4          C                               m 0
221         movd    %mm6, (%edx)            C                               m 0
222         psrlq   $32, %mm6               C                               m 0
223         lea     16(%eax), %eax          C                               m 0
224         lea     16(%edx), %edx          C                               m 0
225         add     $4, %ecx                C                               m 0
226         ja      L(lpm0)                 C                               m 0
227         pmuludq %mm7, %mm4              C                               m 0
228         paddq   %mm0, %mm6              C                               m 0
229         movd    %mm6, -12(%edx)         C                               m 0
230         psrlq   $32, %mm6               C                               m 0
231         paddq   %mm1, %mm6              C                               m 0
232         mov     16(%esp), %edi          C rp                              0
233         jmp     L(x0)
234
235 L(olp0):
236         lea     4(%edi), %edi           C                               am 0
237         movd    (%esi), %mm7            C                               am 0
238         lea     4(%esi), %esi           C                               am 0
239         mov     %edi, %edx              C rp                            am 0
240         mov     20(%esp), %eax          C up                            am 0
241         movd    (%eax), %mm3            C                               am 0
242         mov     24(%esp), %ecx          C inner loop count              am 0
243         pxor    %mm6, %mm6              C                               am 0
244         pmuludq %mm7, %mm3              C                               am 0
245         movd    4(%eax), %mm0           C                               am 0
246         movd    (%edx), %mm5            C                               am 0
247         pmuludq %mm7, %mm0              C                               am 0
248         movd    8(%eax), %mm1           C                               am 0
249         paddq   %mm3, %mm5              C                               am 0
250         movd    4(%edx), %mm4           C                               am 0
251         jmp     L(am00)                 C                               am 0
252         ALIGN(16)                       C                               mm 0
253 L(lam0):
254         pmuludq %mm7, %mm2              C                               am 0
255         paddq   %mm4, %mm6              C                               am 0
256         movd    (%eax), %mm3            C                               am 0
257         paddq   %mm1, %mm5              C                               am 0
258         movd    -4(%edx), %mm4          C                               am 0
259         movd    %mm6, -12(%edx)         C                               am 0
260         psrlq   $32, %mm6               C                               am 0
261         pmuludq %mm7, %mm3              C                               am 0
262         paddq   %mm5, %mm6              C                               am 0
263         movd    4(%eax), %mm0           C                               am 0
264         paddq   %mm2, %mm4              C                               am 0
265         movd    (%edx), %mm5            C                               am 0
266         movd    %mm6, -8(%edx)          C                               am 0
267         psrlq   $32, %mm6               C                               am 0
268         pmuludq %mm7, %mm0              C                               am 0
269         paddq   %mm4, %mm6              C                               am 0
270         movd    8(%eax), %mm1           C                               am 0
271         paddq   %mm3, %mm5              C                               am 0
272         movd    4(%edx), %mm4           C                               am 0
273         movd    %mm6, -4(%edx)          C                               am 0
274         psrlq   $32, %mm6               C                               am 0
275 L(am00):
276         pmuludq %mm7, %mm1              C                               am 0
277         paddq   %mm5, %mm6              C                               am 0
278         movd    12(%eax), %mm2          C                               am 0
279         paddq   %mm0, %mm4              C                               am 0
280         movd    8(%edx), %mm5           C                               am 0
281         movd    %mm6, (%edx)            C                               am 0
282         psrlq   $32, %mm6               C                               am 0
283         lea     16(%eax), %eax          C                               am 0
284         lea     16(%edx), %edx          C                               am 0
285         add     $4, %ecx                C                               am 0
286         jnz     L(lam0)                 C                               am 0
287         pmuludq %mm7, %mm2              C                               am 0
288         paddq   %mm4, %mm6              C                               am 0
289         paddq   %mm1, %mm5              C                               am 0
290         movd    -4(%edx), %mm4          C                               am 0
291         movd    %mm6, -12(%edx)         C                               am 0
292         psrlq   $32, %mm6               C                               am 0
293         paddq   %mm5, %mm6              C                               am 0
294         paddq   %mm2, %mm4              C                               am 0
295 L(x0):  movd    %mm6, -8(%edx)          C                               am 0
296         psrlq   $32, %mm6               C                               am 0
297         paddq   %mm4, %mm6              C                               am 0
298         movd    %mm6, -4(%edx)          C                               am 0
299         psrlq   $32, %mm6               C                               am 0
300         movd    %mm6, (%edx)            C                               am 0
301         dec     %ebx                    C                               am 0
302         jnz     L(olp0)                 C                               am 0
303 L(oel0):
304         emms                            C                                  0
305         pop     %edi                    C                                  0
306         pop     %ebx                    C                                  0
307         pop     %esi                    C                                  0
308         ret                             C                                  0
309
310
311 L(1):   movd    (%eax), %mm4            C                               m 1
312         sub     24(%esp), %ecx          C                               m 1
313         mov     %ecx, 24(%esp)          C update loop count for later   m 1
314         pmuludq %mm7, %mm4              C                               m 1
315         movd    4(%eax), %mm3           C                               m 1
316         pmuludq %mm7, %mm3              C                               m 1
317         movd    8(%eax), %mm0           C                               m 1
318         jmp     L(m01)                  C                               m 1
319         ALIGN(16)                       C                               m 1
320 L(lpm1):
321         pmuludq %mm7, %mm4              C                               m 1
322         paddq   %mm0, %mm6              C                               m 1
323         movd    4(%eax), %mm3           C                               m 1
324         movd    %mm6, -8(%edx)          C                               m 1
325         psrlq   $32, %mm6               C                               m 1
326         pmuludq %mm7, %mm3              C                               m 1
327         paddq   %mm1, %mm6              C                               m 1
328         movd    8(%eax), %mm0           C                               m 1
329         movd    %mm6, -4(%edx)          C                               m 1
330         psrlq   $32, %mm6               C                               m 1
331 L(m01): pmuludq %mm7, %mm0              C                               m 1
332         paddq   %mm4, %mm6              C                               m 1
333         movd    12(%eax), %mm1          C                               m 1
334         movd    %mm6, (%edx)            C                               m 1
335         psrlq   $32, %mm6               C                               m 1
336         pmuludq %mm7, %mm1              C                               m 1
337         paddq   %mm3, %mm6              C                               m 1
338         movd    16(%eax), %mm4          C                               m 1
339         movd    %mm6, 4(%edx)           C                               m 1
340         psrlq   $32, %mm6               C                               m 1
341         lea     16(%eax), %eax          C                               m 1
342         lea     16(%edx), %edx          C                               m 1
343         add     $4, %ecx                C                               m 1
344         ja      L(lpm1)                 C                               m 1
345         pmuludq %mm7, %mm4              C                               m 1
346         paddq   %mm0, %mm6              C                               m 1
347         movd    %mm6, -8(%edx)          C                               m 1
348         psrlq   $32, %mm6               C                               m 1
349         paddq   %mm1, %mm6              C                               m 1
350         mov     16(%esp), %edi          C rp                              1
351         jmp     L(x1)
352
353 L(olp1):
354         lea     4(%edi), %edi           C                               am 1
355         movd    (%esi), %mm7            C                               am 1
356         lea     4(%esi), %esi           C                               am 1
357         mov     %edi, %edx              C rp                            am 1
358         mov     20(%esp), %eax          C up                            am 1
359         movd    (%eax), %mm2            C                               am 1
360         mov     24(%esp), %ecx          C inner loop count              am 1
361         pxor    %mm6, %mm6              C                               am 1
362         pmuludq %mm7, %mm2              C                               am 1
363         movd    4(%eax), %mm3           C                               am 1
364         movd    (%edx), %mm4            C                               am 1
365         pmuludq %mm7, %mm3              C                               am 1
366         movd    8(%eax), %mm0           C                               am 1
367         paddq   %mm2, %mm4              C                               am 1
368         movd    4(%edx), %mm5           C                               am 1
369         jmp     L(am01)                 C                               am 1
370         ALIGN(16)                       C                               am 1
371 L(lam1):
372         pmuludq %mm7, %mm2              C                               am 1
373         paddq   %mm4, %mm6              C                               am 1
374         movd    4(%eax), %mm3           C                               am 1
375         paddq   %mm1, %mm5              C                               am 1
376         movd    (%edx), %mm4            C                               am 1
377         movd    %mm6, -8(%edx)          C                               am 1
378         psrlq   $32, %mm6               C                               am 1
379         pmuludq %mm7, %mm3              C                               am 1
380         paddq   %mm5, %mm6              C                               am 1
381         movd    8(%eax), %mm0           C                               am 1
382         paddq   %mm2, %mm4              C                               am 1
383         movd    4(%edx), %mm5           C                               am 1
384         movd    %mm6, -4(%edx)          C                               am 1
385         psrlq   $32, %mm6               C                               am 1
386 L(am01):
387         pmuludq %mm7, %mm0              C                               am 1
388         paddq   %mm4, %mm6              C                               am 1
389         movd    12(%eax), %mm1          C                               am 1
390         paddq   %mm3, %mm5              C                               am 1
391         movd    8(%edx), %mm4           C                               am 1
392         movd    %mm6, (%edx)            C                               am 1
393         psrlq   $32, %mm6               C                               am 1
394         pmuludq %mm7, %mm1              C                               am 1
395         paddq   %mm5, %mm6              C                               am 1
396         movd    16(%eax), %mm2          C                               am 1
397         paddq   %mm0, %mm4              C                               am 1
398         movd    12(%edx), %mm5          C                               am 1
399         movd    %mm6, 4(%edx)           C                               am 1
400         psrlq   $32, %mm6               C                               am 1
401         lea     16(%eax), %eax          C                               am 1
402         lea     16(%edx), %edx          C                               am 1
403         add     $4, %ecx                C                               am 1
404         jnz     L(lam1)                 C                               am 1
405         pmuludq %mm7, %mm2              C                               am 1
406         paddq   %mm4, %mm6              C                               am 1
407         paddq   %mm1, %mm5              C                               am 1
408         movd    (%edx), %mm4            C                               am 1
409         movd    %mm6, -8(%edx)          C                               am 1
410         psrlq   $32, %mm6               C                               am 1
411         paddq   %mm5, %mm6              C                               am 1
412         paddq   %mm2, %mm4              C                               am 1
413 L(x1):  movd    %mm6, -4(%edx)          C                               am 1
414         psrlq   $32, %mm6               C                               am 1
415         paddq   %mm4, %mm6              C                               am 1
416         movd    %mm6, (%edx)            C                               am 1
417         psrlq   $32, %mm6               C                               am 1
418         movd    %mm6, 4(%edx)           C                               am 1
419         dec     %ebx                    C                               am 1
420         jnz     L(olp1)                 C                               am 1
421 L(oel1):
422         emms                            C                                  1
423         pop     %edi                    C                                  1
424         pop     %ebx                    C                                  1
425         pop     %esi                    C                                  1
426         ret                             C                                  1
427
428
429 L(2):   movd    (%eax), %mm1            C                               m 2
430         sub     24(%esp), %ecx          C                               m 2
431         mov     %ecx, 24(%esp)          C update loop count for later   m 2
432         pmuludq %mm7, %mm1              C                               m 2
433         movd    4(%eax), %mm4           C                               m 2
434         pmuludq %mm7, %mm4              C                               m 2
435         movd    8(%eax), %mm3           C                               m 2
436         jmp     L(m10)                  C                               m 2
437         ALIGN(16)                       C                               m 2
438 L(lpm2):
439         pmuludq %mm7, %mm4              C                               m 2
440         paddq   %mm0, %mm6              C                               m 2
441         movd    8(%eax), %mm3           C                               m 2
442         movd    %mm6, -4(%edx)          C                               m 2
443         psrlq   $32, %mm6               C                               m 2
444 L(m10): pmuludq %mm7, %mm3              C                               m 2
445         paddq   %mm1, %mm6              C                               m 2
446         movd    12(%eax), %mm0          C                               m 2
447         movd    %mm6, (%edx)            C                               m 2
448         psrlq   $32, %mm6               C                               m 2
449         pmuludq %mm7, %mm0              C                               m 2
450         paddq   %mm4, %mm6              C                               m 2
451         movd    16(%eax), %mm1          C                               m 2
452         movd    %mm6, 4(%edx)           C                               m 2
453         psrlq   $32, %mm6               C                               m 2
454         pmuludq %mm7, %mm1              C                               m 2
455         paddq   %mm3, %mm6              C                               m 2
456         movd    20(%eax), %mm4          C                               m 2
457         movd    %mm6, 8(%edx)           C                               m 2
458         psrlq   $32, %mm6               C                               m 2
459         lea     16(%eax), %eax          C                               m 2
460         lea     16(%edx), %edx          C                               m 2
461         add     $4, %ecx                C                               m 2
462         ja      L(lpm2)                 C                               m 2
463         pmuludq %mm7, %mm4              C                               m 2
464         paddq   %mm0, %mm6              C                               m 2
465         movd    %mm6, -4(%edx)          C                               m 2
466         psrlq   $32, %mm6               C                               m 2
467         paddq   %mm1, %mm6              C                               m 2
468         mov     16(%esp), %edi          C rp                              2
469         jmp     L(x2)
470
C Accumulating outer loop for case 2 (lines tagged am 2).  Each pass loads one
C 32-bit multiplier from (%esi) into mm7, multiplies the limbs at up by it,
C adds each product to the limb already stored at the destination, and writes
C the low 32 bits of every sum back.
C   edi      destination base for the pass, moved up 4 bytes per pass
C   esi      multiplier pointer (presumably vp, set up outside this view)
C   ebx      pass counter, the loop ends when it reaches zero
C   eax/edx  working source/destination pointers within a pass
C   mm6      running carry: low half is stored, high half carried on
C up[i] and rp[i] in the line comments are limb offsets from eax and edx.
C The inner loop does 4 limbs per iteration and is first entered at am10,
C past its first group.  The tail at x2 is also the jump target of the m 2
C pass that precedes this loop.
471 L(olp2):
472         lea     4(%edi), %edi           C advance rp by one limb        am 2
473         movd    (%esi), %mm7            C mm7 = next multiplier limb    am 2
474         lea     4(%esi), %esi           C step multiplier pointer       am 2
475         mov     %edi, %edx              C rp                            am 2
476         mov     20(%esp), %eax          C up                            am 2
477         movd    (%eax), %mm1            C mm1 = up[0]                   am 2
478         mov     24(%esp), %ecx          C inner loop count              am 2
479         pxor    %mm6, %mm6              C clear carry accumulator       am 2
480         pmuludq %mm7, %mm1              C mm1 = up[0] * mm7             am 2
481         movd    4(%eax), %mm2           C mm2 = up[1]                   am 2
482         movd    (%edx), %mm5            C mm5 = rp[0]                   am 2
483         pmuludq %mm7, %mm2              C mm2 = up[1] * mm7             am 2
484         movd    8(%eax), %mm3           C mm3 = up[2]                   am 2
485         paddq   %mm1, %mm5              C mm5 = rp[0] + up[0]*mm7       am 2
486         movd    4(%edx), %mm4           C mm4 = rp[1]                   am 2
487         jmp     L(am10)                 C enter loop mid-iteration      am 2
488         ALIGN(16)                       C                               am 2
489 L(lam2):
490         pmuludq %mm7, %mm2              C                               am 2
491         paddq   %mm4, %mm6              C                               am 2
492         movd    8(%eax), %mm3           C                               am 2
493         paddq   %mm1, %mm5              C                               am 2
494         movd    4(%edx), %mm4           C                               am 2
495         movd    %mm6, -4(%edx)          C                               am 2
496         psrlq   $32, %mm6               C                               am 2
497 L(am10):
498         pmuludq %mm7, %mm3              C                               am 2
499         paddq   %mm5, %mm6              C carry += rp limb + product    am 2
500         movd    12(%eax), %mm0          C                               am 2
501         paddq   %mm2, %mm4              C                               am 2
502         movd    8(%edx), %mm5           C                               am 2
503         movd    %mm6, (%edx)            C store low 32 bits             am 2
504         psrlq   $32, %mm6               C keep high part as carry       am 2
505         pmuludq %mm7, %mm0              C                               am 2
506         paddq   %mm4, %mm6              C                               am 2
507         movd    16(%eax), %mm1          C                               am 2
508         paddq   %mm3, %mm5              C                               am 2
509         movd    12(%edx), %mm4          C                               am 2
510         movd    %mm6, 4(%edx)           C                               am 2
511         psrlq   $32, %mm6               C                               am 2
512         pmuludq %mm7, %mm1              C                               am 2
513         paddq   %mm5, %mm6              C                               am 2
514         movd    20(%eax), %mm2          C                               am 2
515         paddq   %mm0, %mm4              C                               am 2
516         movd    16(%edx), %mm5          C                               am 2
517         movd    %mm6, 8(%edx)           C                               am 2
518         psrlq   $32, %mm6               C                               am 2
519         lea     16(%eax), %eax          C up += 4 limbs                 am 2
520         lea     16(%edx), %edx          C rp += 4 limbs                 am 2
521         add     $4, %ecx                C count steps toward zero       am 2
522         jnz     L(lam2)                 C loop until count is zero      am 2
523         pmuludq %mm7, %mm2              C final product of this pass    am 2
524         paddq   %mm4, %mm6              C                               am 2
525         paddq   %mm1, %mm5              C                               am 2
526         movd    4(%edx), %mm4           C                               am 2
527         movd    %mm6, -4(%edx)          C                               am 2
528         psrlq   $32, %mm6               C                               am 2
529         paddq   %mm5, %mm6              C                               am 2
530         paddq   %mm2, %mm4              C                               am 2
531 L(x2):  movd    %mm6, (%edx)            C tail, shared with m 2 pass    am 2
532         psrlq   $32, %mm6               C                               am 2
533         paddq   %mm4, %mm6              C                               am 2
534         movd    %mm6, 4(%edx)           C                               am 2
535         psrlq   $32, %mm6               C                               am 2
536         movd    %mm6, 8(%edx)           C final carry limb              am 2
537         dec     %ebx                    C one pass done                 am 2
538         jnz     L(olp2)                 C repeat while ebx != 0         am 2
539 L(oel2):
540         emms                            C empty the MMX state             2
541         pop     %edi                    C                                  2
542         pop     %ebx                    C                                  2
543         pop     %esi                    C                                  2
544         ret                             C                                  2
545
546
C First pass for case 3 (lines tagged m 3): multiplies the limbs at (%eax) by
C mm7 and stores each product plus carry to (%edx), without reading the
C destination.  eax, edx, ecx, mm6 and mm7 are set up by code outside this
C view (presumably up, rp, a size-derived value, a zero carry and the first
C multiplier limb; confirm against the entry code).
C ecx becomes the inner count, ecx minus the value at 24(%esp), and is saved
C back to 24(%esp) for the accumulating passes that follow.
C The loop does 4 limbs per iteration; the leftover limbs are finished in the
C tail at x3, which lives in the am 3 code.
547 L(3):   movd    (%eax), %mm0            C mm0 = first limb at eax       m 3
548         sub     24(%esp), %ecx          C ecx -= value at 24(%esp)      m 3
549         mov     %ecx, 24(%esp)          C update loop count for later   m 3
550         pmuludq %mm7, %mm0              C mm0 *= mm7 (64-bit product)   m 3
551         movd    4(%eax), %mm1           C                               m 3
552         pmuludq %mm7, %mm1              C                               m 3
553         movd    8(%eax), %mm4           C                               m 3
554         jmp     L(lpm3)                 C skip alignment padding        m 3
555         ALIGN(16)                       C                               m 3
556 L(lpm3):
557         pmuludq %mm7, %mm4              C                               m 3
558         paddq   %mm0, %mm6              C carry += product              m 3
559         movd    12(%eax), %mm3          C                               m 3
560         movd    %mm6, (%edx)            C store low 32 bits             m 3
561         psrlq   $32, %mm6               C keep high part as carry       m 3
562         pmuludq %mm7, %mm3              C                               m 3
563         paddq   %mm1, %mm6              C                               m 3
564         movd    16(%eax), %mm0          C                               m 3
565         movd    %mm6, 4(%edx)           C                               m 3
566         psrlq   $32, %mm6               C                               m 3
567         pmuludq %mm7, %mm0              C                               m 3
568         paddq   %mm4, %mm6              C                               m 3
569         movd    20(%eax), %mm1          C                               m 3
570         movd    %mm6, 8(%edx)           C                               m 3
571         psrlq   $32, %mm6               C                               m 3
572         pmuludq %mm7, %mm1              C                               m 3
573         paddq   %mm3, %mm6              C                               m 3
574         movd    24(%eax), %mm4          C                               m 3
575         movd    %mm6, 12(%edx)          C                               m 3
576         psrlq   $32, %mm6               C                               m 3
577         lea     16(%eax), %eax          C source += 4 limbs             m 3
578         lea     16(%edx), %edx          C dest += 4 limbs               m 3
579         add     $4, %ecx                C count steps toward zero       m 3
580         ja      L(lpm3)                 C until count wraps or hits 0   m 3
581         pmuludq %mm7, %mm4              C final product of this pass    m 3
582         paddq   %mm0, %mm6              C                               m 3
583         movd    %mm6, (%edx)            C                               m 3
584         psrlq   $32, %mm6               C                               m 3
585         paddq   %mm1, %mm6              C                               m 3
586         mov     16(%esp), %edi          C rp                              3
587         jmp     L(x3)                   C finish in the am 3 tail
588
C Accumulating outer loop for case 3 (lines tagged am 3).  Each pass loads one
C 32-bit multiplier from (%esi) into mm7, multiplies the limbs at up by it,
C adds each product to the limb already stored at the destination, and writes
C the low 32 bits of every sum back.
C   edi      destination base for the pass, moved up 4 bytes per pass
C   esi      multiplier pointer (presumably vp, set up outside this view)
C   ebx      pass counter, the loop ends when it reaches zero
C   eax/edx  working source/destination pointers within a pass
C   mm6      running carry: low half is stored, high half carried on
C up[i] and rp[i] in the line comments are limb offsets from eax and edx.
C The inner loop does 4 limbs per iteration and is entered at its top.  The
C tail at x3 is also the jump target of the m 3 pass that precedes this loop.
589 L(olp3):
590         lea     4(%edi), %edi           C advance rp by one limb        am 3
591         movd    (%esi), %mm7            C mm7 = next multiplier limb    am 3
592         lea     4(%esi), %esi           C step multiplier pointer       am 3
593         mov     %edi, %edx              C rp                            am 3
594         mov     20(%esp), %eax          C up                            am 3
595         movd    (%eax), %mm0            C mm0 = up[0]                   am 3
596         mov     24(%esp), %ecx          C inner loop count              am 3
597         pxor    %mm6, %mm6              C clear carry accumulator       am 3
598         pmuludq %mm7, %mm0              C mm0 = up[0] * mm7             am 3
599         movd    4(%eax), %mm1           C mm1 = up[1]                   am 3
600         movd    (%edx), %mm4            C mm4 = rp[0]                   am 3
601         pmuludq %mm7, %mm1              C mm1 = up[1] * mm7             am 3
602         movd    8(%eax), %mm2           C mm2 = up[2]                   am 3
603         paddq   %mm0, %mm4              C mm4 = rp[0] + up[0]*mm7       am 3
604         movd    4(%edx), %mm5           C mm5 = rp[1]                   am 3
605         jmp     L(lam3)                 C skip alignment padding        am 3
606         ALIGN(16)                       C                               am 3
607 L(lam3):
608         pmuludq %mm7, %mm2              C                               am 3
609         paddq   %mm4, %mm6              C carry += rp limb + product    am 3
610         movd    12(%eax), %mm3          C                               am 3
611         paddq   %mm1, %mm5              C                               am 3
612         movd    8(%edx), %mm4           C                               am 3
613         movd    %mm6, (%edx)            C store low 32 bits             am 3
614         psrlq   $32, %mm6               C keep high part as carry       am 3
615         pmuludq %mm7, %mm3              C                               am 3
616         paddq   %mm5, %mm6              C                               am 3
617         movd    16(%eax), %mm0          C                               am 3
618         paddq   %mm2, %mm4              C                               am 3
619         movd    12(%edx), %mm5          C                               am 3
620         movd    %mm6, 4(%edx)           C                               am 3
621         psrlq   $32, %mm6               C                               am 3
622         pmuludq %mm7, %mm0              C                               am 3
623         paddq   %mm4, %mm6              C                               am 3
624         movd    20(%eax), %mm1          C                               am 3
625         paddq   %mm3, %mm5              C                               am 3
626         movd    16(%edx), %mm4          C                               am 3
627         movd    %mm6, 8(%edx)           C                               am 3
628         psrlq   $32, %mm6               C                               am 3
629         pmuludq %mm7, %mm1              C                               am 3
630         paddq   %mm5, %mm6              C                               am 3
631         movd    24(%eax), %mm2          C                               am 3
632         paddq   %mm0, %mm4              C                               am 3
633         movd    20(%edx), %mm5          C                               am 3
634         movd    %mm6, 12(%edx)          C                               am 3
635         psrlq   $32, %mm6               C                               am 3
636         lea     16(%eax), %eax          C up += 4 limbs                 am 3
637         lea     16(%edx), %edx          C rp += 4 limbs                 am 3
638         add     $4, %ecx                C count steps toward zero       am 3
639         jnz     L(lam3)                 C loop until count is zero      am 3
640         pmuludq %mm7, %mm2              C final product of this pass    am 3
641         paddq   %mm4, %mm6              C                               am 3
642         paddq   %mm1, %mm5              C                               am 3
643         movd    8(%edx), %mm4           C                               am 3
644         movd    %mm6, (%edx)            C                               am 3
645         psrlq   $32, %mm6               C                               am 3
646         paddq   %mm5, %mm6              C                               am 3
647         paddq   %mm2, %mm4              C                               am 3
648 L(x3):  movd    %mm6, 4(%edx)           C tail, shared with m 3 pass    am 3
649         psrlq   $32, %mm6               C                               am 3
650         paddq   %mm4, %mm6              C                               am 3
651         movd    %mm6, 8(%edx)           C                               am 3
652         psrlq   $32, %mm6               C                               am 3
653         movd    %mm6, 12(%edx)          C final carry limb              am 3
654         dec     %ebx                    C one pass done                 am 3
655         jnz     L(olp3)                 C repeat while ebx != 0         am 3
656 L(oel3):
657         emms                            C empty the MMX state             3
658         pop     %edi                    C                                  3
659         pop     %ebx                    C                                  3
660         pop     %esi                    C                                  3
661         ret                             C                                  3
662 EPILOGUE()