1 dnl mpn_mul_basecase for Pentium 4 and P6 models with SSE2 (i.e., 9,D,E,F).
3 dnl Copyright 2001, 2002, 2005, 2007 Free Software Foundation, Inc.
5 dnl This file is part of the GNU MP Library.
7 dnl The GNU MP Library is free software; you can redistribute it and/or modify
8 dnl it under the terms of the GNU Lesser General Public License as published
9 dnl by the Free Software Foundation; either version 3 of the License, or (at
10 dnl your option) any later version.
12 dnl The GNU MP Library is distributed in the hope that it will be useful, but
13 dnl WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
14 dnl or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15 dnl License for more details.
17 dnl You should have received a copy of the GNU Lesser General Public License
18 dnl along with the GNU MP Library. If not, see http://www.gnu.org/licenses/.
20 include(`../config.m4')
23 C * Improve ad-hoc outer loop code and register handling. Some feed-in
24 C scheduling could improve things by several cycles per outer iteration.
25 C * In code for un <= 3, try keeping accumulation operands in registers,
26 C without storing intermediates to rp.
27 C * We might want to keep the psrlq shift count 32 in a free mm register, since
28 C the register form is 3 bytes and the immediate form is 4 bytes. This would save about 70 bytes.
29 C * Look into different loop alignment; we now expand the code by about 50 bytes
30 C with possibly needless alignment.
31 C * Perhaps rewrap loops 00,01,02 (6 loops) to allow fall-through entry.
32 C * Use OSP (overlapping software pipelining); that should solve the feed-in latency problems.
33 C * Save a few tens of bytes by doing cross-jumping for Loel0, etc.
34 C * Save around 120 bytes by remapping "m 0", "m 1", "m 2" and "m 3" registers
35 C so that they can share feed-in code, and changing the branch targets from
36 C L<n> to Lm<nn>.
38 C                             cycles/limb
39 C P6 model 9   (Banias)       ?
40 C P6 model 13  (Dothan)       5.24
41 C P6 model 14  (Yonah)        ?
42 C P4 model 0-1 (Willamette)   5
43 C P4 model 2   (Northwood)    4.60 at 32 limbs
44 C P4 model 3-4 (Prescott)     4.94 at 32 limbs
55 PROLOGUE(mpn_mul_basecase)
58 mov 12(%esp), %edx C rp
59 mov 16(%esp), %eax C up
60 mov 20(%esp), %ecx C un
61 mov 24(%esp), %esi C vp
62 mov 28(%esp), %ebx C vn
72 L(un1): movd %mm6, (%edx) C un=1
73 psrlq $32, %mm6 C un=1
74 movd %mm6, 4(%edx) C un=1
77 L(un2): movd 4(%eax), %mm1 C un=2
78 pmuludq %mm7, %mm1 C un=2
79 movd %mm6, (%edx) C un=2
80 psrlq $32, %mm6 C un=2
81 paddq %mm1, %mm6 C un=2
82 movd %mm6, 4(%edx) C un=2
83 psrlq $32, %mm6 C un=2
84 movd %mm6, 8(%edx) C un=2
87 movd 4(%esi), %mm7 C un=2
88 movd (%eax), %mm6 C un=2
89 pmuludq %mm7, %mm6 C un=2
90 movd 4(%eax), %mm1 C un=2
91 movd 4(%edx), %mm4 C un=2
92 pmuludq %mm7, %mm1 C un=2
93 movd 8(%edx), %mm5 C un=2
94 paddq %mm4, %mm6 C un=2
95 paddq %mm1, %mm5 C un=2
96 movd %mm6, 4(%edx) C un=2
97 psrlq $32, %mm6 C un=2
98 paddq %mm5, %mm6 C un=2
99 movd %mm6, 8(%edx) C un=2
100 psrlq $32, %mm6 C un=2
101 movd %mm6, 12(%edx) C un=2
107 L(un3): movd 4(%eax), %mm1 C un=3
108 pmuludq %mm7, %mm1 C un=3
109 movd 8(%eax), %mm2 C un=3
110 pmuludq %mm7, %mm2 C un=3
111 movd %mm6, (%edx) C un=3
112 psrlq $32, %mm6 C un=3
113 paddq %mm1, %mm6 C un=3
114 movd %mm6, 4(%edx) C un=3
115 psrlq $32, %mm6 C un=3
116 paddq %mm2, %mm6 C un=3
117 movd %mm6, 8(%edx) C un=3
118 psrlq $32, %mm6 C un=3
119 movd %mm6, 12(%edx) C un=3
122 movd 4(%esi), %mm7 C un=3
123 movd (%eax), %mm6 C un=3
124 pmuludq %mm7, %mm6 C un=3
125 movd 4(%eax), %mm1 C un=3
126 movd 4(%edx), %mm4 C un=3
127 pmuludq %mm7, %mm1 C un=3
128 movd 8(%eax), %mm2 C un=3
129 movd 8(%edx), %mm5 C un=3
130 pmuludq %mm7, %mm2 C un=3
131 paddq %mm4, %mm6 C un=3
132 paddq %mm1, %mm5 C un=3
133 movd 12(%edx), %mm4 C un=3
134 movd %mm6, 4(%edx) C un=3
135 psrlq $32, %mm6 C un=3
136 paddq %mm5, %mm6 C un=3
137 paddq %mm2, %mm4 C un=3
138 movd %mm6, 8(%edx) C un=3
139 psrlq $32, %mm6 C un=3
140 paddq %mm4, %mm6 C un=3
141 movd %mm6, 12(%edx) C un=3
142 psrlq $32, %mm6 C un=3
143 movd %mm6, 16(%edx) C un=3
146 movd 8(%esi), %mm7 C un=3
147 movd (%eax), %mm6 C un=3
148 pmuludq %mm7, %mm6 C un=3
149 movd 4(%eax), %mm1 C un=3
150 movd 8(%edx), %mm4 C un=3
151 pmuludq %mm7, %mm1 C un=3
152 movd 8(%eax), %mm2 C un=3
153 movd 12(%edx), %mm5 C un=3
154 pmuludq %mm7, %mm2 C un=3
155 paddq %mm4, %mm6 C un=3
156 paddq %mm1, %mm5 C un=3
157 movd 16(%edx), %mm4 C un=3
158 movd %mm6, 8(%edx) C un=3
159 psrlq $32, %mm6 C un=3
160 paddq %mm5, %mm6 C un=3
161 paddq %mm2, %mm4 C un=3
162 movd %mm6, 12(%edx) C un=3
163 psrlq $32, %mm6 C un=3
164 paddq %mm4, %mm6 C un=3
165 movd %mm6, 16(%edx) C un=3
166 psrlq $32, %mm6 C un=3
167 movd %mm6, 20(%edx) C un=3
179 jmp L(3) C FIXME: one case should fall through
182 L(0): movd (%eax), %mm3 C m 0
183 sub 24(%esp), %ecx C inner loop count m 0
184 mov %ecx, 24(%esp) C update loop count for later m 0
185 pmuludq %mm7, %mm3 C m 0
186 movd 4(%eax), %mm0 C m 0
187 pmuludq %mm7, %mm0 C m 0
188 movd 8(%eax), %mm1 C m 0
192 pmuludq %mm7, %mm4 C m 0
193 paddq %mm0, %mm6 C m 0
194 movd (%eax), %mm3 C m 0
195 movd %mm6, -12(%edx) C m 0
196 psrlq $32, %mm6 C m 0
197 pmuludq %mm7, %mm3 C m 0
198 paddq %mm1, %mm6 C m 0
199 movd 4(%eax), %mm0 C m 0
200 movd %mm6, -8(%edx) C m 0
201 psrlq $32, %mm6 C m 0
202 pmuludq %mm7, %mm0 C m 0
203 paddq %mm4, %mm6 C m 0
204 movd 8(%eax), %mm1 C m 0
205 movd %mm6, -4(%edx) C m 0
206 psrlq $32, %mm6 C m 0
207 L(m00): pmuludq %mm7, %mm1 C m 0
208 paddq %mm3, %mm6 C m 0
209 movd 12(%eax), %mm4 C m 0
210 movd %mm6, (%edx) C m 0
211 psrlq $32, %mm6 C m 0
212 lea 16(%eax), %eax C m 0
213 lea 16(%edx), %edx C m 0
216 pmuludq %mm7, %mm4 C m 0
217 paddq %mm0, %mm6 C m 0
218 movd %mm6, -12(%edx) C m 0
219 psrlq $32, %mm6 C m 0
220 paddq %mm1, %mm6 C m 0
221 mov 16(%esp), %edi C rp 0
225 lea 4(%edi), %edi C am 0
226 movd (%esi), %mm7 C am 0
227 lea 4(%esi), %esi C am 0
228 mov %edi, %edx C rp am 0
229 mov 20(%esp), %eax C up am 0
230 movd (%eax), %mm3 C am 0
231 mov 24(%esp), %ecx C inner loop count am 0
232 pxor %mm6, %mm6 C am 0
233 pmuludq %mm7, %mm3 C am 0
234 movd 4(%eax), %mm0 C am 0
235 movd (%edx), %mm5 C am 0
236 pmuludq %mm7, %mm0 C am 0
237 movd 8(%eax), %mm1 C am 0
238 paddq %mm3, %mm5 C am 0
239 movd 4(%edx), %mm4 C am 0
243 pmuludq %mm7, %mm2 C am 0
244 paddq %mm4, %mm6 C am 0
245 movd (%eax), %mm3 C am 0
246 paddq %mm1, %mm5 C am 0
247 movd -4(%edx), %mm4 C am 0
248 movd %mm6, -12(%edx) C am 0
249 psrlq $32, %mm6 C am 0
250 pmuludq %mm7, %mm3 C am 0
251 paddq %mm5, %mm6 C am 0
252 movd 4(%eax), %mm0 C am 0
253 paddq %mm2, %mm4 C am 0
254 movd (%edx), %mm5 C am 0
255 movd %mm6, -8(%edx) C am 0
256 psrlq $32, %mm6 C am 0
257 pmuludq %mm7, %mm0 C am 0
258 paddq %mm4, %mm6 C am 0
259 movd 8(%eax), %mm1 C am 0
260 paddq %mm3, %mm5 C am 0
261 movd 4(%edx), %mm4 C am 0
262 movd %mm6, -4(%edx) C am 0
263 psrlq $32, %mm6 C am 0
265 pmuludq %mm7, %mm1 C am 0
266 paddq %mm5, %mm6 C am 0
267 movd 12(%eax), %mm2 C am 0
268 paddq %mm0, %mm4 C am 0
269 movd 8(%edx), %mm5 C am 0
270 movd %mm6, (%edx) C am 0
271 psrlq $32, %mm6 C am 0
272 lea 16(%eax), %eax C am 0
273 lea 16(%edx), %edx C am 0
276 pmuludq %mm7, %mm2 C am 0
277 paddq %mm4, %mm6 C am 0
278 paddq %mm1, %mm5 C am 0
279 movd -4(%edx), %mm4 C am 0
280 movd %mm6, -12(%edx) C am 0
281 psrlq $32, %mm6 C am 0
282 paddq %mm5, %mm6 C am 0
283 paddq %mm2, %mm4 C am 0
284 L(x0): movd %mm6, -8(%edx) C am 0
285 psrlq $32, %mm6 C am 0
286 paddq %mm4, %mm6 C am 0
287 movd %mm6, -4(%edx) C am 0
288 psrlq $32, %mm6 C am 0
289 movd %mm6, (%edx) C am 0
300 L(1): movd (%eax), %mm4 C m 1
301 sub 24(%esp), %ecx C m 1
302 mov %ecx, 24(%esp) C update loop count for later m 1
303 pmuludq %mm7, %mm4 C m 1
304 movd 4(%eax), %mm3 C m 1
305 pmuludq %mm7, %mm3 C m 1
306 movd 8(%eax), %mm0 C m 1
310 pmuludq %mm7, %mm4 C m 1
311 paddq %mm0, %mm6 C m 1
312 movd 4(%eax), %mm3 C m 1
313 movd %mm6, -8(%edx) C m 1
314 psrlq $32, %mm6 C m 1
315 pmuludq %mm7, %mm3 C m 1
316 paddq %mm1, %mm6 C m 1
317 movd 8(%eax), %mm0 C m 1
318 movd %mm6, -4(%edx) C m 1
319 psrlq $32, %mm6 C m 1
320 L(m01): pmuludq %mm7, %mm0 C m 1
321 paddq %mm4, %mm6 C m 1
322 movd 12(%eax), %mm1 C m 1
323 movd %mm6, (%edx) C m 1
324 psrlq $32, %mm6 C m 1
325 pmuludq %mm7, %mm1 C m 1
326 paddq %mm3, %mm6 C m 1
327 movd 16(%eax), %mm4 C m 1
328 movd %mm6, 4(%edx) C m 1
329 psrlq $32, %mm6 C m 1
330 lea 16(%eax), %eax C m 1
331 lea 16(%edx), %edx C m 1
334 pmuludq %mm7, %mm4 C m 1
335 paddq %mm0, %mm6 C m 1
336 movd %mm6, -8(%edx) C m 1
337 psrlq $32, %mm6 C m 1
338 paddq %mm1, %mm6 C m 1
339 mov 16(%esp), %edi C rp 1
343 lea 4(%edi), %edi C am 1
344 movd (%esi), %mm7 C am 1
345 lea 4(%esi), %esi C am 1
346 mov %edi, %edx C rp am 1
347 mov 20(%esp), %eax C up am 1
348 movd (%eax), %mm2 C am 1
349 mov 24(%esp), %ecx C inner loop count am 1
350 pxor %mm6, %mm6 C am 1
351 pmuludq %mm7, %mm2 C am 1
352 movd 4(%eax), %mm3 C am 1
353 movd (%edx), %mm4 C am 1
354 pmuludq %mm7, %mm3 C am 1
355 movd 8(%eax), %mm0 C am 1
356 paddq %mm2, %mm4 C am 1
357 movd 4(%edx), %mm5 C am 1
361 pmuludq %mm7, %mm2 C am 1
362 paddq %mm4, %mm6 C am 1
363 movd 4(%eax), %mm3 C am 1
364 paddq %mm1, %mm5 C am 1
365 movd (%edx), %mm4 C am 1
366 movd %mm6, -8(%edx) C am 1
367 psrlq $32, %mm6 C am 1
368 pmuludq %mm7, %mm3 C am 1
369 paddq %mm5, %mm6 C am 1
370 movd 8(%eax), %mm0 C am 1
371 paddq %mm2, %mm4 C am 1
372 movd 4(%edx), %mm5 C am 1
373 movd %mm6, -4(%edx) C am 1
374 psrlq $32, %mm6 C am 1
376 pmuludq %mm7, %mm0 C am 1
377 paddq %mm4, %mm6 C am 1
378 movd 12(%eax), %mm1 C am 1
379 paddq %mm3, %mm5 C am 1
380 movd 8(%edx), %mm4 C am 1
381 movd %mm6, (%edx) C am 1
382 psrlq $32, %mm6 C am 1
383 pmuludq %mm7, %mm1 C am 1
384 paddq %mm5, %mm6 C am 1
385 movd 16(%eax), %mm2 C am 1
386 paddq %mm0, %mm4 C am 1
387 movd 12(%edx), %mm5 C am 1
388 movd %mm6, 4(%edx) C am 1
389 psrlq $32, %mm6 C am 1
390 lea 16(%eax), %eax C am 1
391 lea 16(%edx), %edx C am 1
394 pmuludq %mm7, %mm2 C am 1
395 paddq %mm4, %mm6 C am 1
396 paddq %mm1, %mm5 C am 1
397 movd (%edx), %mm4 C am 1
398 movd %mm6, -8(%edx) C am 1
399 psrlq $32, %mm6 C am 1
400 paddq %mm5, %mm6 C am 1
401 paddq %mm2, %mm4 C am 1
402 L(x1): movd %mm6, -4(%edx) C am 1
403 psrlq $32, %mm6 C am 1
404 paddq %mm4, %mm6 C am 1
405 movd %mm6, (%edx) C am 1
406 psrlq $32, %mm6 C am 1
407 movd %mm6, 4(%edx) C am 1
418 L(2): movd (%eax), %mm1 C m 2
419 sub 24(%esp), %ecx C m 2
420 mov %ecx, 24(%esp) C update loop count for later m 2
421 pmuludq %mm7, %mm1 C m 2
422 movd 4(%eax), %mm4 C m 2
423 pmuludq %mm7, %mm4 C m 2
424 movd 8(%eax), %mm3 C m 2
428 pmuludq %mm7, %mm4 C m 2
429 paddq %mm0, %mm6 C m 2
430 movd 8(%eax), %mm3 C m 2
431 movd %mm6, -4(%edx) C m 2
432 psrlq $32, %mm6 C m 2
433 L(m10): pmuludq %mm7, %mm3 C m 2
434 paddq %mm1, %mm6 C m 2
435 movd 12(%eax), %mm0 C m 2
436 movd %mm6, (%edx) C m 2
437 psrlq $32, %mm6 C m 2
438 pmuludq %mm7, %mm0 C m 2
439 paddq %mm4, %mm6 C m 2
440 movd 16(%eax), %mm1 C m 2
441 movd %mm6, 4(%edx) C m 2
442 psrlq $32, %mm6 C m 2
443 pmuludq %mm7, %mm1 C m 2
444 paddq %mm3, %mm6 C m 2
445 movd 20(%eax), %mm4 C m 2
446 movd %mm6, 8(%edx) C m 2
447 psrlq $32, %mm6 C m 2
448 lea 16(%eax), %eax C m 2
449 lea 16(%edx), %edx C m 2
452 pmuludq %mm7, %mm4 C m 2
453 paddq %mm0, %mm6 C m 2
454 movd %mm6, -4(%edx) C m 2
455 psrlq $32, %mm6 C m 2
456 paddq %mm1, %mm6 C m 2
457 mov 16(%esp), %edi C rp 2
461 lea 4(%edi), %edi C am 2
462 movd (%esi), %mm7 C am 2
463 lea 4(%esi), %esi C am 2
464 mov %edi, %edx C rp am 2
465 mov 20(%esp), %eax C up am 2
466 movd (%eax), %mm1 C am 2
467 mov 24(%esp), %ecx C inner loop count am 2
468 pxor %mm6, %mm6 C am 2
469 pmuludq %mm7, %mm1 C am 2
470 movd 4(%eax), %mm2 C am 2
471 movd (%edx), %mm5 C am 2
472 pmuludq %mm7, %mm2 C am 2
473 movd 8(%eax), %mm3 C am 2
474 paddq %mm1, %mm5 C am 2
475 movd 4(%edx), %mm4 C am 2
479 pmuludq %mm7, %mm2 C am 2
480 paddq %mm4, %mm6 C am 2
481 movd 8(%eax), %mm3 C am 2
482 paddq %mm1, %mm5 C am 2
483 movd 4(%edx), %mm4 C am 2
484 movd %mm6, -4(%edx) C am 2
485 psrlq $32, %mm6 C am 2
487 pmuludq %mm7, %mm3 C am 2
488 paddq %mm5, %mm6 C am 2
489 movd 12(%eax), %mm0 C am 2
490 paddq %mm2, %mm4 C am 2
491 movd 8(%edx), %mm5 C am 2
492 movd %mm6, (%edx) C am 2
493 psrlq $32, %mm6 C am 2
494 pmuludq %mm7, %mm0 C am 2
495 paddq %mm4, %mm6 C am 2
496 movd 16(%eax), %mm1 C am 2
497 paddq %mm3, %mm5 C am 2
498 movd 12(%edx), %mm4 C am 2
499 movd %mm6, 4(%edx) C am 2
500 psrlq $32, %mm6 C am 2
501 pmuludq %mm7, %mm1 C am 2
502 paddq %mm5, %mm6 C am 2
503 movd 20(%eax), %mm2 C am 2
504 paddq %mm0, %mm4 C am 2
505 movd 16(%edx), %mm5 C am 2
506 movd %mm6, 8(%edx) C am 2
507 psrlq $32, %mm6 C am 2
508 lea 16(%eax), %eax C am 2
509 lea 16(%edx), %edx C am 2
512 pmuludq %mm7, %mm2 C am 2
513 paddq %mm4, %mm6 C am 2
514 paddq %mm1, %mm5 C am 2
515 movd 4(%edx), %mm4 C am 2
516 movd %mm6, -4(%edx) C am 2
517 psrlq $32, %mm6 C am 2
518 paddq %mm5, %mm6 C am 2
519 paddq %mm2, %mm4 C am 2
520 L(x2): movd %mm6, (%edx) C am 2
521 psrlq $32, %mm6 C am 2
522 paddq %mm4, %mm6 C am 2
523 movd %mm6, 4(%edx) C am 2
524 psrlq $32, %mm6 C am 2
525 movd %mm6, 8(%edx) C am 2
536 L(3): movd (%eax), %mm0 C m 3
537 sub 24(%esp), %ecx C m 3
538 mov %ecx, 24(%esp) C update loop count for later m 3
539 pmuludq %mm7, %mm0 C m 3
540 movd 4(%eax), %mm1 C m 3
541 pmuludq %mm7, %mm1 C m 3
542 movd 8(%eax), %mm4 C m 3
546 pmuludq %mm7, %mm4 C m 3
547 paddq %mm0, %mm6 C m 3
548 movd 12(%eax), %mm3 C m 3
549 movd %mm6, (%edx) C m 3
550 psrlq $32, %mm6 C m 3
551 pmuludq %mm7, %mm3 C m 3
552 paddq %mm1, %mm6 C m 3
553 movd 16(%eax), %mm0 C m 3
554 movd %mm6, 4(%edx) C m 3
555 psrlq $32, %mm6 C m 3
556 pmuludq %mm7, %mm0 C m 3
557 paddq %mm4, %mm6 C m 3
558 movd 20(%eax), %mm1 C m 3
559 movd %mm6, 8(%edx) C m 3
560 psrlq $32, %mm6 C m 3
561 pmuludq %mm7, %mm1 C m 3
562 paddq %mm3, %mm6 C m 3
563 movd 24(%eax), %mm4 C m 3
564 movd %mm6, 12(%edx) C m 3
565 psrlq $32, %mm6 C m 3
566 lea 16(%eax), %eax C m 3
567 lea 16(%edx), %edx C m 3
570 pmuludq %mm7, %mm4 C m 3
571 paddq %mm0, %mm6 C m 3
572 movd %mm6, (%edx) C m 3
573 psrlq $32, %mm6 C m 3
574 paddq %mm1, %mm6 C m 3
575 mov 16(%esp), %edi C rp 3
579 lea 4(%edi), %edi C am 3
580 movd (%esi), %mm7 C am 3
581 lea 4(%esi), %esi C am 3
582 mov %edi, %edx C rp am 3
583 mov 20(%esp), %eax C up am 3
584 movd (%eax), %mm0 C am 3
585 mov 24(%esp), %ecx C inner loop count am 3
586 pxor %mm6, %mm6 C am 3
587 pmuludq %mm7, %mm0 C am 3
588 movd 4(%eax), %mm1 C am 3
589 movd (%edx), %mm4 C am 3
590 pmuludq %mm7, %mm1 C am 3
591 movd 8(%eax), %mm2 C am 3
592 paddq %mm0, %mm4 C am 3
593 movd 4(%edx), %mm5 C am 3
597 pmuludq %mm7, %mm2 C am 3
598 paddq %mm4, %mm6 C am 3
599 movd 12(%eax), %mm3 C am 3
600 paddq %mm1, %mm5 C am 3
601 movd 8(%edx), %mm4 C am 3
602 movd %mm6, (%edx) C am 3
603 psrlq $32, %mm6 C am 3
604 pmuludq %mm7, %mm3 C am 3
605 paddq %mm5, %mm6 C am 3
606 movd 16(%eax), %mm0 C am 3
607 paddq %mm2, %mm4 C am 3
608 movd 12(%edx), %mm5 C am 3
609 movd %mm6, 4(%edx) C am 3
610 psrlq $32, %mm6 C am 3
611 pmuludq %mm7, %mm0 C am 3
612 paddq %mm4, %mm6 C am 3
613 movd 20(%eax), %mm1 C am 3
614 paddq %mm3, %mm5 C am 3
615 movd 16(%edx), %mm4 C am 3
616 movd %mm6, 8(%edx) C am 3
617 psrlq $32, %mm6 C am 3
618 pmuludq %mm7, %mm1 C am 3
619 paddq %mm5, %mm6 C am 3
620 movd 24(%eax), %mm2 C am 3
621 paddq %mm0, %mm4 C am 3
622 movd 20(%edx), %mm5 C am 3
623 movd %mm6, 12(%edx) C am 3
624 psrlq $32, %mm6 C am 3
625 lea 16(%eax), %eax C am 3
626 lea 16(%edx), %edx C am 3
629 pmuludq %mm7, %mm2 C am 3
630 paddq %mm4, %mm6 C am 3
631 paddq %mm1, %mm5 C am 3
632 movd 8(%edx), %mm4 C am 3
633 movd %mm6, (%edx) C am 3
634 psrlq $32, %mm6 C am 3
635 paddq %mm5, %mm6 C am 3
636 paddq %mm2, %mm4 C am 3
637 L(x3): movd %mm6, 4(%edx) C am 3
638 psrlq $32, %mm6 C am 3
639 paddq %mm4, %mm6 C am 3
640 movd %mm6, 8(%edx) C am 3
641 psrlq $32, %mm6 C am 3
642 movd %mm6, 12(%edx) C am 3