Upload Tizen:Base source
[external/gmp.git] / mpn / x86 / pentium4 / sse2 / mul_basecase.asm
1 dnl  mpn_mul_basecase for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
2
3 dnl  Copyright 2001, 2002, 2005, 2007 Free Software Foundation, Inc.
4 dnl
5 dnl  This file is part of the GNU MP Library.
6 dnl
7 dnl  The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl  it under the terms of the GNU Lesser General Public License as published
9 dnl  by the Free Software Foundation; either version 3 of the License, or (at
10 dnl  your option) any later version.
11 dnl
12 dnl  The GNU MP Library is distributed in the hope that it will be useful, but
13 dnl  WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 dnl  or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
15 dnl  License for more details.
16 dnl
17 dnl  You should have received a copy of the GNU Lesser General Public License
18 dnl  along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.
19
20 include(`../config.m4')
21
22 C TODO:
23 C  * Improve ad-hoc outer loop code and register handling.  Some feed-in
24 C    scheduling could improve things by several cycles per outer iteration.
25 C  * In code for un <= 3, try keeping accumulation operands in registers,
26 C    without storing intermediates to rp.
27 C  * We might want to keep 32 in a free mm register, since the register form is
28 C    3 bytes and the immediate form is 4 bytes.  About 70 bytes to save.
29 C  * Look into different loop alignment; we now expand the code by about 50
30 C    bytes with possibly needless alignment.
31 C  * Perhaps rewrap loops 00,01,02 (6 loops) to allow fall-through entry.
32 C  * Use OSP; this should solve the feed-in latency problems.
33 C  * Save a few tens of bytes by doing cross-jumping for Loel0, etc.
34 C  * Save around 120 bytes by remapping "m 0", "m 1", "m 2" and "m 3" registers
35 C    so that they can share feed-in code, and changing the branch targets from
36 C    L<n> to Lm<nn>.
37
38 C                           cycles/limb
39 C P6 model 9   (Banias)         ?
40 C P6 model 13  (Dothan)         5.24
41 C P6 model 14  (Yonah)          ?
42 C P4 model 0-1 (Willamette):    5
43 C P4 model 2   (Northwood):     4.60 at 32 limbs
44 C P4 model 3-4 (Prescott):      4.94 at 32 limbs
45
46 C INPUT PARAMETERS
47 C rp            sp + 4
48 C up            sp + 8
49 C un            sp + 12
50 C vp            sp + 16
51 C vn            sp + 20
52
53         TEXT
54         ALIGN(16)
55 PROLOGUE(mpn_mul_basecase)
56         push    %esi
57         push    %ebx
58         mov     12(%esp), %edx          C rp
59         mov     16(%esp), %eax          C up
60         mov     20(%esp), %ecx          C un
61         mov     24(%esp), %esi          C vp
62         mov     28(%esp), %ebx          C vn
63         movd    (%esi), %mm7            C
64 L(ent): cmp     $3, %ecx
65         ja      L(big)
66         movd    (%eax), %mm6
67         pmuludq %mm7, %mm6
68         jz      L(un3)
69         cmp     $2, %ecx
70         jz      L(un2)
71
72 L(un1): movd    %mm6, (%edx)            C                               un=1
73         psrlq   $32, %mm6               C                               un=1
74         movd    %mm6, 4(%edx)           C                               un=1
75         jmp     L(rtr)                  C                               un=1
76
77 L(un2): movd    4(%eax), %mm1           C                               un=2
78         pmuludq %mm7, %mm1              C                               un=2
79         movd    %mm6, (%edx)            C                               un=2
80         psrlq   $32, %mm6               C                               un=2
81         paddq   %mm1, %mm6              C                               un=2
82         movd    %mm6, 4(%edx)           C                               un=2
83         psrlq   $32, %mm6               C                               un=2
84         movd    %mm6, 8(%edx)           C                               un=2
85       dec       %ebx                    C                               un=2
86       jz        L(rtr)                  C                               un=2
87         movd    4(%esi), %mm7           C                               un=2
88         movd    (%eax), %mm6            C                               un=2
89         pmuludq %mm7, %mm6              C                               un=2
90         movd    4(%eax), %mm1           C                               un=2
91         movd    4(%edx), %mm4           C                               un=2
92         pmuludq %mm7, %mm1              C                               un=2
93         movd    8(%edx), %mm5           C                               un=2
94         paddq   %mm4, %mm6              C                               un=2
95         paddq   %mm1, %mm5              C                               un=2
96         movd    %mm6, 4(%edx)           C                               un=2
97         psrlq   $32, %mm6               C                               un=2
98         paddq   %mm5, %mm6              C                               un=2
99         movd    %mm6, 8(%edx)           C                               un=2
100         psrlq   $32, %mm6               C                               un=2
101         movd    %mm6, 12(%edx)          C                               un=2
102 L(rtr): emms
103         pop     %ebx
104         pop     %esi
105         ret
106
107 L(un3): movd    4(%eax), %mm1           C                               un=3
108         pmuludq %mm7, %mm1              C                               un=3
109         movd    8(%eax), %mm2           C                               un=3
110         pmuludq %mm7, %mm2              C                               un=3
111         movd    %mm6, (%edx)            C                               un=3
112         psrlq   $32, %mm6               C                               un=3
113         paddq   %mm1, %mm6              C                               un=3
114         movd    %mm6, 4(%edx)           C                               un=3
115         psrlq   $32, %mm6               C                               un=3
116         paddq   %mm2, %mm6              C                               un=3
117         movd    %mm6, 8(%edx)           C                               un=3
118         psrlq   $32, %mm6               C                               un=3
119         movd    %mm6, 12(%edx)          C                               un=3
120       dec       %ebx                    C                               un=3
121       jz        L(rtr)                  C                               un=3
122         movd    4(%esi), %mm7           C                               un=3
123         movd    (%eax), %mm6            C                               un=3
124         pmuludq %mm7, %mm6              C                               un=3
125         movd    4(%eax), %mm1           C                               un=3
126         movd    4(%edx), %mm4           C                               un=3
127         pmuludq %mm7, %mm1              C                               un=3
128         movd    8(%eax), %mm2           C                               un=3
129         movd    8(%edx), %mm5           C                               un=3
130         pmuludq %mm7, %mm2              C                               un=3
131         paddq   %mm4, %mm6              C                               un=3
132         paddq   %mm1, %mm5              C                               un=3
133         movd    12(%edx), %mm4          C                               un=3
134         movd    %mm6, 4(%edx)           C                               un=3
135         psrlq   $32, %mm6               C                               un=3
136         paddq   %mm5, %mm6              C                               un=3
137         paddq   %mm2, %mm4              C                               un=3
138         movd    %mm6, 8(%edx)           C                               un=3
139         psrlq   $32, %mm6               C                               un=3
140         paddq   %mm4, %mm6              C                               un=3
141         movd    %mm6, 12(%edx)          C                               un=3
142         psrlq   $32, %mm6               C                               un=3
143         movd    %mm6, 16(%edx)          C                               un=3
144       dec       %ebx                    C                               un=3
145       jz        L(rtr)                  C                               un=3
146         movd    8(%esi), %mm7           C                               un=3
147         movd    (%eax), %mm6            C                               un=3
148         pmuludq %mm7, %mm6              C                               un=3
149         movd    4(%eax), %mm1           C                               un=3
150         movd    8(%edx), %mm4           C                               un=3
151         pmuludq %mm7, %mm1              C                               un=3
152         movd    8(%eax), %mm2           C                               un=3
153         movd    12(%edx), %mm5          C                               un=3
154         pmuludq %mm7, %mm2              C                               un=3
155         paddq   %mm4, %mm6              C                               un=3
156         paddq   %mm1, %mm5              C                               un=3
157         movd    16(%edx), %mm4          C                               un=3
158         movd    %mm6, 8(%edx)           C                               un=3
159         psrlq   $32, %mm6               C                               un=3
160         paddq   %mm5, %mm6              C                               un=3
161         paddq   %mm2, %mm4              C                               un=3
162         movd    %mm6, 12(%edx)          C                               un=3
163         psrlq   $32, %mm6               C                               un=3
164         paddq   %mm4, %mm6              C                               un=3
165         movd    %mm6, 16(%edx)          C                               un=3
166         psrlq   $32, %mm6               C                               un=3
167         movd    %mm6, 20(%edx)          C                               un=3
168         jmp     L(rtr)
169
170
171 L(big): push    %edi
172         pxor    %mm6, %mm6
173         lea     4(%esi), %esi
174         and     $3, %ecx
175         jz      L(0)
176         cmp     $2, %ecx
177         jc      L(1)
178         jz      L(2)
179         jmp     L(3)                    C FIXME: one case should fall through
180
181
182 L(0):   movd    (%eax), %mm3            C                               m 0
183         sub     24(%esp), %ecx          C inner loop count              m 0
184         mov     %ecx, 24(%esp)          C update loop count for later   m 0
185         pmuludq %mm7, %mm3              C                               m 0
186         movd    4(%eax), %mm0           C                               m 0
187         pmuludq %mm7, %mm0              C                               m 0
188         movd    8(%eax), %mm1           C                               m 0
189         jmp     L(m00)                  C                               m 0
190         ALIGN(16)                       C                               m 0
191 L(lpm0):
192         pmuludq %mm7, %mm4              C                               m 0
193         paddq   %mm0, %mm6              C                               m 0
194         movd    (%eax), %mm3            C                               m 0
195         movd    %mm6, -12(%edx)         C                               m 0
196         psrlq   $32, %mm6               C                               m 0
197         pmuludq %mm7, %mm3              C                               m 0
198         paddq   %mm1, %mm6              C                               m 0
199         movd    4(%eax), %mm0           C                               m 0
200         movd    %mm6, -8(%edx)          C                               m 0
201         psrlq   $32, %mm6               C                               m 0
202         pmuludq %mm7, %mm0              C                               m 0
203         paddq   %mm4, %mm6              C                               m 0
204         movd    8(%eax), %mm1           C                               m 0
205         movd    %mm6, -4(%edx)          C                               m 0
206         psrlq   $32, %mm6               C                               m 0
207 L(m00): pmuludq %mm7, %mm1              C                               m 0
208         paddq   %mm3, %mm6              C                               m 0
209         movd    12(%eax), %mm4          C                               m 0
210         movd    %mm6, (%edx)            C                               m 0
211         psrlq   $32, %mm6               C                               m 0
212         lea     16(%eax), %eax          C                               m 0
213         lea     16(%edx), %edx          C                               m 0
214         add     $4, %ecx                C                               m 0
215         ja      L(lpm0)                 C                               m 0
216         pmuludq %mm7, %mm4              C                               m 0
217         paddq   %mm0, %mm6              C                               m 0
218         movd    %mm6, -12(%edx)         C                               m 0
219         psrlq   $32, %mm6               C                               m 0
220         paddq   %mm1, %mm6              C                               m 0
221         mov     16(%esp), %edi          C rp                              0
222         jmp     L(x0)
223
224 L(olp0):
225         lea     4(%edi), %edi           C                               am 0
226         movd    (%esi), %mm7            C                               am 0
227         lea     4(%esi), %esi           C                               am 0
228         mov     %edi, %edx              C rp                            am 0
229         mov     20(%esp), %eax          C up                            am 0
230         movd    (%eax), %mm3            C                               am 0
231         mov     24(%esp), %ecx          C inner loop count              am 0
232         pxor    %mm6, %mm6              C                               am 0
233         pmuludq %mm7, %mm3              C                               am 0
234         movd    4(%eax), %mm0           C                               am 0
235         movd    (%edx), %mm5            C                               am 0
236         pmuludq %mm7, %mm0              C                               am 0
237         movd    8(%eax), %mm1           C                               am 0
238         paddq   %mm3, %mm5              C                               am 0
239         movd    4(%edx), %mm4           C                               am 0
240         jmp     L(am00)                 C                               am 0
241         ALIGN(16)                       C                               mm 0
242 L(lam0):
243         pmuludq %mm7, %mm2              C                               am 0
244         paddq   %mm4, %mm6              C                               am 0
245         movd    (%eax), %mm3            C                               am 0
246         paddq   %mm1, %mm5              C                               am 0
247         movd    -4(%edx), %mm4          C                               am 0
248         movd    %mm6, -12(%edx)         C                               am 0
249         psrlq   $32, %mm6               C                               am 0
250         pmuludq %mm7, %mm3              C                               am 0
251         paddq   %mm5, %mm6              C                               am 0
252         movd    4(%eax), %mm0           C                               am 0
253         paddq   %mm2, %mm4              C                               am 0
254         movd    (%edx), %mm5            C                               am 0
255         movd    %mm6, -8(%edx)          C                               am 0
256         psrlq   $32, %mm6               C                               am 0
257         pmuludq %mm7, %mm0              C                               am 0
258         paddq   %mm4, %mm6              C                               am 0
259         movd    8(%eax), %mm1           C                               am 0
260         paddq   %mm3, %mm5              C                               am 0
261         movd    4(%edx), %mm4           C                               am 0
262         movd    %mm6, -4(%edx)          C                               am 0
263         psrlq   $32, %mm6               C                               am 0
264 L(am00):
265         pmuludq %mm7, %mm1              C                               am 0
266         paddq   %mm5, %mm6              C                               am 0
267         movd    12(%eax), %mm2          C                               am 0
268         paddq   %mm0, %mm4              C                               am 0
269         movd    8(%edx), %mm5           C                               am 0
270         movd    %mm6, (%edx)            C                               am 0
271         psrlq   $32, %mm6               C                               am 0
272         lea     16(%eax), %eax          C                               am 0
273         lea     16(%edx), %edx          C                               am 0
274         add     $4, %ecx                C                               am 0
275         jnz     L(lam0)                 C                               am 0
276         pmuludq %mm7, %mm2              C                               am 0
277         paddq   %mm4, %mm6              C                               am 0
278         paddq   %mm1, %mm5              C                               am 0
279         movd    -4(%edx), %mm4          C                               am 0
280         movd    %mm6, -12(%edx)         C                               am 0
281         psrlq   $32, %mm6               C                               am 0
282         paddq   %mm5, %mm6              C                               am 0
283         paddq   %mm2, %mm4              C                               am 0
284 L(x0):  movd    %mm6, -8(%edx)          C                               am 0
285         psrlq   $32, %mm6               C                               am 0
286         paddq   %mm4, %mm6              C                               am 0
287         movd    %mm6, -4(%edx)          C                               am 0
288         psrlq   $32, %mm6               C                               am 0
289         movd    %mm6, (%edx)            C                               am 0
290         dec     %ebx                    C                               am 0
291         jnz     L(olp0)                 C                               am 0
292 L(oel0):
293         emms                            C                                  0
294         pop     %edi                    C                                  0
295         pop     %ebx                    C                                  0
296         pop     %esi                    C                                  0
297         ret                             C                                  0
298
299
300 L(1):   movd    (%eax), %mm4            C                               m 1
301         sub     24(%esp), %ecx          C                               m 1
302         mov     %ecx, 24(%esp)          C update loop count for later   m 1
303         pmuludq %mm7, %mm4              C                               m 1
304         movd    4(%eax), %mm3           C                               m 1
305         pmuludq %mm7, %mm3              C                               m 1
306         movd    8(%eax), %mm0           C                               m 1
307         jmp     L(m01)                  C                               m 1
308         ALIGN(16)                       C                               m 1
309 L(lpm1):
310         pmuludq %mm7, %mm4              C                               m 1
311         paddq   %mm0, %mm6              C                               m 1
312         movd    4(%eax), %mm3           C                               m 1
313         movd    %mm6, -8(%edx)          C                               m 1
314         psrlq   $32, %mm6               C                               m 1
315         pmuludq %mm7, %mm3              C                               m 1
316         paddq   %mm1, %mm6              C                               m 1
317         movd    8(%eax), %mm0           C                               m 1
318         movd    %mm6, -4(%edx)          C                               m 1
319         psrlq   $32, %mm6               C                               m 1
320 L(m01): pmuludq %mm7, %mm0              C                               m 1
321         paddq   %mm4, %mm6              C                               m 1
322         movd    12(%eax), %mm1          C                               m 1
323         movd    %mm6, (%edx)            C                               m 1
324         psrlq   $32, %mm6               C                               m 1
325         pmuludq %mm7, %mm1              C                               m 1
326         paddq   %mm3, %mm6              C                               m 1
327         movd    16(%eax), %mm4          C                               m 1
328         movd    %mm6, 4(%edx)           C                               m 1
329         psrlq   $32, %mm6               C                               m 1
330         lea     16(%eax), %eax          C                               m 1
331         lea     16(%edx), %edx          C                               m 1
332         add     $4, %ecx                C                               m 1
333         ja      L(lpm1)                 C                               m 1
334         pmuludq %mm7, %mm4              C                               m 1
335         paddq   %mm0, %mm6              C                               m 1
336         movd    %mm6, -8(%edx)          C                               m 1
337         psrlq   $32, %mm6               C                               m 1
338         paddq   %mm1, %mm6              C                               m 1
339         mov     16(%esp), %edi          C rp                              1
340         jmp     L(x1)
341
342 L(olp1):
343         lea     4(%edi), %edi           C                               am 1
344         movd    (%esi), %mm7            C                               am 1
345         lea     4(%esi), %esi           C                               am 1
346         mov     %edi, %edx              C rp                            am 1
347         mov     20(%esp), %eax          C up                            am 1
348         movd    (%eax), %mm2            C                               am 1
349         mov     24(%esp), %ecx          C inner loop count              am 1
350         pxor    %mm6, %mm6              C                               am 1
351         pmuludq %mm7, %mm2              C                               am 1
352         movd    4(%eax), %mm3           C                               am 1
353         movd    (%edx), %mm4            C                               am 1
354         pmuludq %mm7, %mm3              C                               am 1
355         movd    8(%eax), %mm0           C                               am 1
356         paddq   %mm2, %mm4              C                               am 1
357         movd    4(%edx), %mm5           C                               am 1
358         jmp     L(am01)                 C                               am 1
359         ALIGN(16)                       C                               am 1
360 L(lam1):
361         pmuludq %mm7, %mm2              C                               am 1
362         paddq   %mm4, %mm6              C                               am 1
363         movd    4(%eax), %mm3           C                               am 1
364         paddq   %mm1, %mm5              C                               am 1
365         movd    (%edx), %mm4            C                               am 1
366         movd    %mm6, -8(%edx)          C                               am 1
367         psrlq   $32, %mm6               C                               am 1
368         pmuludq %mm7, %mm3              C                               am 1
369         paddq   %mm5, %mm6              C                               am 1
370         movd    8(%eax), %mm0           C                               am 1
371         paddq   %mm2, %mm4              C                               am 1
372         movd    4(%edx), %mm5           C                               am 1
373         movd    %mm6, -4(%edx)          C                               am 1
374         psrlq   $32, %mm6               C                               am 1
375 L(am01):
376         pmuludq %mm7, %mm0              C                               am 1
377         paddq   %mm4, %mm6              C                               am 1
378         movd    12(%eax), %mm1          C                               am 1
379         paddq   %mm3, %mm5              C                               am 1
380         movd    8(%edx), %mm4           C                               am 1
381         movd    %mm6, (%edx)            C                               am 1
382         psrlq   $32, %mm6               C                               am 1
383         pmuludq %mm7, %mm1              C                               am 1
384         paddq   %mm5, %mm6              C                               am 1
385         movd    16(%eax), %mm2          C                               am 1
386         paddq   %mm0, %mm4              C                               am 1
387         movd    12(%edx), %mm5          C                               am 1
388         movd    %mm6, 4(%edx)           C                               am 1
389         psrlq   $32, %mm6               C                               am 1
390         lea     16(%eax), %eax          C                               am 1
391         lea     16(%edx), %edx          C                               am 1
392         add     $4, %ecx                C                               am 1
393         jnz     L(lam1)                 C                               am 1
394         pmuludq %mm7, %mm2              C                               am 1
395         paddq   %mm4, %mm6              C                               am 1
396         paddq   %mm1, %mm5              C                               am 1
397         movd    (%edx), %mm4            C                               am 1
398         movd    %mm6, -8(%edx)          C                               am 1
399         psrlq   $32, %mm6               C                               am 1
400         paddq   %mm5, %mm6              C                               am 1
401         paddq   %mm2, %mm4              C                               am 1
402 L(x1):  movd    %mm6, -4(%edx)          C                               am 1
403         psrlq   $32, %mm6               C                               am 1
404         paddq   %mm4, %mm6              C                               am 1
405         movd    %mm6, (%edx)            C                               am 1
406         psrlq   $32, %mm6               C                               am 1
407         movd    %mm6, 4(%edx)           C                               am 1
408         dec     %ebx                    C                               am 1
409         jnz     L(olp1)                 C                               am 1
410 L(oel1):
411         emms                            C                                  1
412         pop     %edi                    C                                  1
413         pop     %ebx                    C                                  1
414         pop     %esi                    C                                  1
415         ret                             C                                  1
416
417
418 L(2):   movd    (%eax), %mm1            C                               m 2
419         sub     24(%esp), %ecx          C                               m 2
420         mov     %ecx, 24(%esp)          C update loop count for later   m 2
421         pmuludq %mm7, %mm1              C                               m 2
422         movd    4(%eax), %mm4           C                               m 2
423         pmuludq %mm7, %mm4              C                               m 2
424         movd    8(%eax), %mm3           C                               m 2
425         jmp     L(m10)                  C                               m 2
426         ALIGN(16)                       C                               m 2
427 L(lpm2):
428         pmuludq %mm7, %mm4              C                               m 2
429         paddq   %mm0, %mm6              C                               m 2
430         movd    8(%eax), %mm3           C                               m 2
431         movd    %mm6, -4(%edx)          C                               m 2
432         psrlq   $32, %mm6               C                               m 2
433 L(m10): pmuludq %mm7, %mm3              C                               m 2
434         paddq   %mm1, %mm6              C                               m 2
435         movd    12(%eax), %mm0          C                               m 2
436         movd    %mm6, (%edx)            C                               m 2
437         psrlq   $32, %mm6               C                               m 2
438         pmuludq %mm7, %mm0              C                               m 2
439         paddq   %mm4, %mm6              C                               m 2
440         movd    16(%eax), %mm1          C                               m 2
441         movd    %mm6, 4(%edx)           C                               m 2
442         psrlq   $32, %mm6               C                               m 2
443         pmuludq %mm7, %mm1              C                               m 2
444         paddq   %mm3, %mm6              C                               m 2
445         movd    20(%eax), %mm4          C                               m 2
446         movd    %mm6, 8(%edx)           C                               m 2
447         psrlq   $32, %mm6               C                               m 2
448         lea     16(%eax), %eax          C                               m 2
449         lea     16(%edx), %edx          C                               m 2
450         add     $4, %ecx                C                               m 2
451         ja      L(lpm2)                 C                               m 2
452         pmuludq %mm7, %mm4              C                               m 2
453         paddq   %mm0, %mm6              C                               m 2
454         movd    %mm6, -4(%edx)          C                               m 2
455         psrlq   $32, %mm6               C                               m 2
456         paddq   %mm1, %mm6              C                               m 2
457         mov     16(%esp), %edi          C rp                              2
458         jmp     L(x2)
459
C Outer loop for the lines tagged "am 2".  Each pass loads the next
C multiplier limb from (%esi) into %mm7, multiplies the 32-bit limbs at up by
C it, and adds the products into the result limbs starting at %edi.  Both
C %edi and %esi move up by one limb (4 bytes) per pass; %ebx counts the
C passes still to run.
C
C Register roles within a pass:
C   %eax       source pointer, reloaded from 20(%esp) (up)
C   %edx       destination pointer, copied from %edi
C   %ecx       inner loop count reloaded from 24(%esp); 4 is added per trip
C              and the inner loop ends when it reaches zero
C   %mm7       multiplier limb
C   %mm6       running sum: the low 32 bits are stored as a result limb, then
C              psrlq $32 shifts the upper part down as carry for the next limb
C   %mm0-%mm3  products of source limbs with %mm7
C   %mm4,%mm5  limbs read from the destination; a product is added to each
C              before it is folded into %mm6
C
C A 32x32 bit product plus a 32-bit destination limb plus a 32-bit carry is
C at most 2^64-1, so the 64-bit paddq additions cannot wrap.
460 L(olp2):
461         lea     4(%edi), %edi           C                               am 2
462         movd    (%esi), %mm7            C                               am 2
463         lea     4(%esi), %esi           C                               am 2
464         mov     %edi, %edx              C rp                            am 2
465         mov     20(%esp), %eax          C up                            am 2
466         movd    (%eax), %mm1            C                               am 2
467         mov     24(%esp), %ecx          C inner loop count              am 2
468         pxor    %mm6, %mm6              C                               am 2
469         pmuludq %mm7, %mm1              C                               am 2
470         movd    4(%eax), %mm2           C                               am 2
471         movd    (%edx), %mm5            C                               am 2
472         pmuludq %mm7, %mm2              C                               am 2
473         movd    8(%eax), %mm3           C                               am 2
474         paddq   %mm1, %mm5              C                               am 2
475         movd    4(%edx), %mm4           C                               am 2
476         jmp     L(am10)                 C                               am 2
477         ALIGN(16)                       C                               am 2
C Inner loop, unrolled to handle 4 limbs per trip (pointers advance by 16
C bytes).  The feed-in code above has already started the first products and
C enters the loop part way through, at L(am10).
478 L(lam2):
479         pmuludq %mm7, %mm2              C                               am 2
480         paddq   %mm4, %mm6              C                               am 2
481         movd    8(%eax), %mm3           C                               am 2
482         paddq   %mm1, %mm5              C                               am 2
483         movd    4(%edx), %mm4           C                               am 2
484         movd    %mm6, -4(%edx)          C                               am 2
485         psrlq   $32, %mm6               C                               am 2
486 L(am10):
487         pmuludq %mm7, %mm3              C                               am 2
488         paddq   %mm5, %mm6              C                               am 2
489         movd    12(%eax), %mm0          C                               am 2
490         paddq   %mm2, %mm4              C                               am 2
491         movd    8(%edx), %mm5           C                               am 2
492         movd    %mm6, (%edx)            C                               am 2
493         psrlq   $32, %mm6               C                               am 2
494         pmuludq %mm7, %mm0              C                               am 2
495         paddq   %mm4, %mm6              C                               am 2
496         movd    16(%eax), %mm1          C                               am 2
497         paddq   %mm3, %mm5              C                               am 2
498         movd    12(%edx), %mm4          C                               am 2
499         movd    %mm6, 4(%edx)           C                               am 2
500         psrlq   $32, %mm6               C                               am 2
501         pmuludq %mm7, %mm1              C                               am 2
502         paddq   %mm5, %mm6              C                               am 2
503         movd    20(%eax), %mm2          C                               am 2
504         paddq   %mm0, %mm4              C                               am 2
505         movd    16(%edx), %mm5          C                               am 2
506         movd    %mm6, 8(%edx)           C                               am 2
507         psrlq   $32, %mm6               C                               am 2
508         lea     16(%eax), %eax          C                               am 2
509         lea     16(%edx), %edx          C                               am 2
510         add     $4, %ecx                C                               am 2
511         jnz     L(lam2)                 C                               am 2
C Inner loop done: finish the products and sums that are still in flight.
512         pmuludq %mm7, %mm2              C                               am 2
513         paddq   %mm4, %mm6              C                               am 2
514         paddq   %mm1, %mm5              C                               am 2
515         movd    4(%edx), %mm4           C                               am 2
516         movd    %mm6, -4(%edx)          C                               am 2
517         psrlq   $32, %mm6               C                               am 2
518         paddq   %mm5, %mm6              C                               am 2
519         paddq   %mm2, %mm4              C                               am 2
C Shared tail, also reached by the jmp L(x2) that ends the "m 2" code.
C Stores the last two sum limbs and then the final carry limb at 8(%edx),
C with %mm4 holding the last pending term.
520 L(x2):  movd    %mm6, (%edx)            C                               am 2
521         psrlq   $32, %mm6               C                               am 2
522         paddq   %mm4, %mm6              C                               am 2
523         movd    %mm6, 4(%edx)           C                               am 2
524         psrlq   $32, %mm6               C                               am 2
525         movd    %mm6, 8(%edx)           C                               am 2
526         dec     %ebx                    C                               am 2
527         jnz     L(olp2)                 C                               am 2
C Exit: emms clears the MMX state so later x87 code works, then %edi, %ebx
C and %esi are popped (presumably pushed by the function entry code, which
C is outside this section) and the function returns.
528 L(oel2):
529         emms                            C                                  2
530         pop     %edi                    C                                  2
531         pop     %ebx                    C                                  2
532         pop     %esi                    C                                  2
533         ret                             C                                  2
534
535
C Entry for the lines tagged "m 3": the first pass, which multiplies the
C 32-bit limbs at (%eax) by the limb in %mm7 and stores the results at (%edx)
C without reading the destination.  %mm6 is the running sum: its low 32 bits
C are stored as a result limb and psrlq $32 then shifts the upper part down
C as carry for the next limb.
C NOTE(review): %mm7, %mm6, %eax, %edx and the incoming %ecx are set up
C before L(3), outside this section -- confirm, in particular that %mm6
C starts at zero.
C
C %ecx minus the stack word at 24(%esp) becomes the loop count, and it is
C written back to 24(%esp) so the later "am 3" passes can reload it.
536 L(3):   movd    (%eax), %mm0            C                               m 3
537         sub     24(%esp), %ecx          C                               m 3
538         mov     %ecx, 24(%esp)          C update loop count for later   m 3
539         pmuludq %mm7, %mm0              C                               m 3
540         movd    4(%eax), %mm1           C                               m 3
541         pmuludq %mm7, %mm1              C                               m 3
542         movd    8(%eax), %mm4           C                               m 3
543         jmp     L(lpm3)                 C                               m 3
544         ALIGN(16)                       C                               m 3
C Inner loop, unrolled to handle 4 limbs per trip (pointers advance by 16
C bytes).  The jmp above only skips whatever padding ALIGN(16) emits.  Source
C limbs for the next trip are loaded before the pointers are advanced.  The
C loop repeats while add $4 gives a nonzero result with no carry out (ja).
545 L(lpm3):
546         pmuludq %mm7, %mm4              C                               m 3
547         paddq   %mm0, %mm6              C                               m 3
548         movd    12(%eax), %mm3          C                               m 3
549         movd    %mm6, (%edx)            C                               m 3
550         psrlq   $32, %mm6               C                               m 3
551         pmuludq %mm7, %mm3              C                               m 3
552         paddq   %mm1, %mm6              C                               m 3
553         movd    16(%eax), %mm0          C                               m 3
554         movd    %mm6, 4(%edx)           C                               m 3
555         psrlq   $32, %mm6               C                               m 3
556         pmuludq %mm7, %mm0              C                               m 3
557         paddq   %mm4, %mm6              C                               m 3
558         movd    20(%eax), %mm1          C                               m 3
559         movd    %mm6, 8(%edx)           C                               m 3
560         psrlq   $32, %mm6               C                               m 3
561         pmuludq %mm7, %mm1              C                               m 3
562         paddq   %mm3, %mm6              C                               m 3
563         movd    24(%eax), %mm4          C                               m 3
564         movd    %mm6, 12(%edx)          C                               m 3
565         psrlq   $32, %mm6               C                               m 3
566         lea     16(%eax), %eax          C                               m 3
567         lea     16(%edx), %edx          C                               m 3
568         add     $4, %ecx                C                               m 3
569         ja      L(lpm3)                 C                               m 3
C Loop done: finish the pending product and sum, load rp from 16(%esp) into
C %edi for the passes that follow, and join the shared tail at L(x3), which
C stores the remaining limbs.
570         pmuludq %mm7, %mm4              C                               m 3
571         paddq   %mm0, %mm6              C                               m 3
572         movd    %mm6, (%edx)            C                               m 3
573         psrlq   $32, %mm6               C                               m 3
574         paddq   %mm1, %mm6              C                               m 3
575         mov     16(%esp), %edi          C rp                              3
576         jmp     L(x3)
577
C Outer loop for the lines tagged "am 3".  Each pass loads the next
C multiplier limb from (%esi) into %mm7, multiplies the 32-bit limbs at up by
C it, and adds the products into the result limbs starting at %edi.  Both
C %edi and %esi move up by one limb (4 bytes) per pass; %ebx counts the
C passes still to run.
C
C Register roles within a pass:
C   %eax       source pointer, reloaded from 20(%esp) (up)
C   %edx       destination pointer, copied from %edi
C   %ecx       inner loop count reloaded from 24(%esp); 4 is added per trip
C              and the inner loop ends when it reaches zero
C   %mm7       multiplier limb
C   %mm6       running sum: the low 32 bits are stored as a result limb, then
C              psrlq $32 shifts the upper part down as carry for the next limb
C   %mm0-%mm3  products of source limbs with %mm7
C   %mm4,%mm5  limbs read from the destination; a product is added to each
C              before it is folded into %mm6
C
C A 32x32 bit product plus a 32-bit destination limb plus a 32-bit carry is
C at most 2^64-1, so the 64-bit paddq additions cannot wrap.
578 L(olp3):
579         lea     4(%edi), %edi           C                               am 3
580         movd    (%esi), %mm7            C                               am 3
581         lea     4(%esi), %esi           C                               am 3
582         mov     %edi, %edx              C rp                            am 3
583         mov     20(%esp), %eax          C up                            am 3
584         movd    (%eax), %mm0            C                               am 3
585         mov     24(%esp), %ecx          C inner loop count              am 3
586         pxor    %mm6, %mm6              C                               am 3
587         pmuludq %mm7, %mm0              C                               am 3
588         movd    4(%eax), %mm1           C                               am 3
589         movd    (%edx), %mm4            C                               am 3
590         pmuludq %mm7, %mm1              C                               am 3
591         movd    8(%eax), %mm2           C                               am 3
592         paddq   %mm0, %mm4              C                               am 3
593         movd    4(%edx), %mm5           C                               am 3
594         jmp     L(lam3)                 C                               am 3
595         ALIGN(16)                       C                               am 3
C Inner loop, unrolled to handle 4 limbs per trip (pointers advance by 16
C bytes).  The jmp above only skips whatever padding ALIGN(16) emits; the
C loop is entered at its top with the first products already started.
596 L(lam3):
597         pmuludq %mm7, %mm2              C                               am 3
598         paddq   %mm4, %mm6              C                               am 3
599         movd    12(%eax), %mm3          C                               am 3
600         paddq   %mm1, %mm5              C                               am 3
601         movd    8(%edx), %mm4           C                               am 3
602         movd    %mm6, (%edx)            C                               am 3
603         psrlq   $32, %mm6               C                               am 3
604         pmuludq %mm7, %mm3              C                               am 3
605         paddq   %mm5, %mm6              C                               am 3
606         movd    16(%eax), %mm0          C                               am 3
607         paddq   %mm2, %mm4              C                               am 3
608         movd    12(%edx), %mm5          C                               am 3
609         movd    %mm6, 4(%edx)           C                               am 3
610         psrlq   $32, %mm6               C                               am 3
611         pmuludq %mm7, %mm0              C                               am 3
612         paddq   %mm4, %mm6              C                               am 3
613         movd    20(%eax), %mm1          C                               am 3
614         paddq   %mm3, %mm5              C                               am 3
615         movd    16(%edx), %mm4          C                               am 3
616         movd    %mm6, 8(%edx)           C                               am 3
617         psrlq   $32, %mm6               C                               am 3
618         pmuludq %mm7, %mm1              C                               am 3
619         paddq   %mm5, %mm6              C                               am 3
620         movd    24(%eax), %mm2          C                               am 3
621         paddq   %mm0, %mm4              C                               am 3
622         movd    20(%edx), %mm5          C                               am 3
623         movd    %mm6, 12(%edx)          C                               am 3
624         psrlq   $32, %mm6               C                               am 3
625         lea     16(%eax), %eax          C                               am 3
626         lea     16(%edx), %edx          C                               am 3
627         add     $4, %ecx                C                               am 3
628         jnz     L(lam3)                 C                               am 3
C Inner loop done: finish the products and sums that are still in flight.
629         pmuludq %mm7, %mm2              C                               am 3
630         paddq   %mm4, %mm6              C                               am 3
631         paddq   %mm1, %mm5              C                               am 3
632         movd    8(%edx), %mm4           C                               am 3
633         movd    %mm6, (%edx)            C                               am 3
634         psrlq   $32, %mm6               C                               am 3
635         paddq   %mm5, %mm6              C                               am 3
636         paddq   %mm2, %mm4              C                               am 3
C Shared tail, also reached by the jmp L(x3) that ends the "m 3" code.
C Stores the last two sum limbs at 4(%edx) and 8(%edx) and then the final
C carry limb at 12(%edx), with %mm4 holding the last pending term.
637 L(x3):  movd    %mm6, 4(%edx)           C                               am 3
638         psrlq   $32, %mm6               C                               am 3
639         paddq   %mm4, %mm6              C                               am 3
640         movd    %mm6, 8(%edx)           C                               am 3
641         psrlq   $32, %mm6               C                               am 3
642         movd    %mm6, 12(%edx)          C                               am 3
643         dec     %ebx                    C                               am 3
644         jnz     L(olp3)                 C                               am 3
C Exit: emms clears the MMX state so later x87 code works, then %edi, %ebx
C and %esi are popped (presumably pushed by the function entry code, which
C is outside this section) and the function returns.
645 L(oel3):
646         emms                            C                                  3
647         pop     %edi                    C                                  3
648         pop     %ebx                    C                                  3
649         pop     %esi                    C                                  3
650         ret                             C                                  3
651 EPILOGUE()