1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin. */
3 /* All rights reserved. */
5 /* Redistribution and use in source and binary forms, with or */
6 /* without modification, are permitted provided that the following */
7 /* conditions are met: */
9 /* 1. Redistributions of source code must retain the above */
10 /* copyright notice, this list of conditions and the following */
13 /* 2. Redistributions in binary form must reproduce the above */
14 /* copyright notice, this list of conditions and the following */
15 /* disclaimer in the documentation and/or other materials */
16 /* provided with the distribution. */
18 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31 /* POSSIBILITY OF SUCH DAMAGE. */
33 /* The views and conclusions contained in the software and */
34 /* documentation are those of the authors and should not be */
35 /* interpreted as representing official policies, either expressed */
36 /* or implied, of The University of Texas at Austin. */
37 /*********************************************************************/
/* Incoming arguments, addressed through %esi, which holds the caller's
   original stack pointer (saved in the prologue before the frame is
   realigned).  STACK and ARGS are provided elsewhere in this file. */
45 #define OLD_M 4 + STACK + ARGS(%esi)
46 #define OLD_N 8 + STACK + ARGS(%esi)
47 #define OLD_K 12 + STACK + ARGS(%esi)
48 #define OLD_ALPHA 16 + STACK + ARGS(%esi)
/* NOTE(review): the 16 -> 24 offset gap implies ALPHA occupies 8 bytes
   (a double passed by value) -- confirm against the caller's layout. */
49 #define OLD_A 24 + STACK + ARGS(%esi)
50 #define OLD_B 28 + STACK + ARGS(%esi)
51 #define OLD_C 32 + STACK + ARGS(%esi)
52 #define OLD_LDC 36 + STACK + ARGS(%esi)
53 #define OLD_OFFT 40 + STACK + ARGS(%esi)
/* Local variables inside the realigned frame, addressed from %esp.
   BUFFER is the start of the expanded copy of the B panel. */
61 #define OLD_STACK 40(%esp)
62 #define OFFSET 44(%esp)
65 #define AORIG 56(%esp)
66 #define BORIG 60(%esp)
67 #define BUFFER 128(%esp)
/* Frame realignment parameters: the prologue subtracts the frame size
   plus STACK_OFFSET, masks %esp down to a STACK_ALIGN boundary, then
   adds STACK_OFFSET back (see the sub/and/add sequence in the body). */
69 #define STACK_ALIGN 4096
70 #define STACK_OFFSET 1024
72 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
73 #define PREFETCH prefetch
74 #define PREFETCHSIZE (8 * 10 + 4)
83 #define KERNEL1(address) \
86 PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \
87 movapd 2 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
90 movapd 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
92 mulpd 6 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
94 movapd 16 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
96 movapd 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0
98 #define KERNEL2(address) \
100 addpd %xmm3, %xmm4; \
101 movapd 10 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
102 mulpd %xmm0, %xmm3; \
103 addpd %xmm3, %xmm5; \
104 movapd 12 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
105 mulpd %xmm0, %xmm3; \
106 mulpd 14 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
107 addpd %xmm3, %xmm6; \
108 movapd 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
109 addpd %xmm0, %xmm7; \
110 movapd 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0
/* KERNEL3(address): one stage of the 8-stage software-pipelined inner
   loop.  Multiplies the A operand pair held in %xmm0 against four B
   pairs from the expanded buffer (offsets 18/20 loaded into %xmm2,
   offset 22 used as a memory operand; the incoming %xmm2 was loaded
   by an earlier stage) and accumulates into %xmm4..%xmm7.  The loads
   from 32*SIZE(BB) and 6*SIZE(AA) pre-fill %xmm2/%xmm0 for later
   stages of the pipeline. */
112 #define KERNEL3(address) \
113 	mulpd %xmm0, %xmm2; \
114 	addpd %xmm2, %xmm4; \
115 	movapd 18 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
116 	mulpd %xmm0, %xmm2; \
117 	addpd %xmm2, %xmm5; \
118 	movapd 20 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
119 	mulpd %xmm0, %xmm2; \
120 	mulpd 22 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
121 	addpd %xmm2, %xmm6; \
122 	movapd 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
123 	addpd %xmm0, %xmm7; \
124 	movapd 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0
/* KERNEL4(address): pipeline stage using %xmm3 as the B stream.
   Multiplies the A pair in %xmm0 by B offsets 24..30 (26/28 loaded
   into %xmm3, 30 as a memory operand) and accumulates into
   %xmm4..%xmm7.  Pre-loads 40*SIZE(BB) into %xmm3 and advances %xmm0
   to 16*SIZE(AA) for the second half of the unrolled iteration. */
126 #define KERNEL4(address) \
127 	mulpd %xmm0, %xmm3; \
128 	addpd %xmm3, %xmm4; \
129 	movapd 26 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
130 	mulpd %xmm0, %xmm3; \
131 	addpd %xmm3, %xmm5; \
132 	movapd 28 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
133 	mulpd %xmm0, %xmm3; \
134 	mulpd 30 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
135 	addpd %xmm3, %xmm6; \
136 	movapd 40 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
137 	addpd %xmm0, %xmm7; \
138 	movapd 16 * SIZE + (address) * 1 * SIZE(AA), %xmm0
/* KERNEL5(address): second half of the unrolled iteration; switches
   the A operand to %xmm1.  Issues a PREFETCH 8 doubles further into
   the A panel, multiplies %xmm1 by B offsets 34..38 and accumulates
   into %xmm4..%xmm7.  Pre-loads 48*SIZE(BB) into %xmm2 and
   10*SIZE(AA) into %xmm1 for the following stages. */
140 #define KERNEL5(address) \
141 	PREFETCH (PREFETCHSIZE + 8) * SIZE + (address) * 1 * SIZE(AA); \
142 	mulpd %xmm1, %xmm2; \
143 	addpd %xmm2, %xmm4; \
144 	movapd 34 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
145 	mulpd %xmm1, %xmm2; \
146 	addpd %xmm2, %xmm5; \
147 	movapd 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
148 	mulpd %xmm1, %xmm2; \
149 	mulpd 38 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
150 	addpd %xmm2, %xmm6; \
151 	movapd 48 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
152 	addpd %xmm1, %xmm7; \
153 	movapd 10 * SIZE + (address) * 1 * SIZE(AA), %xmm1
/* KERNEL6(address): pipeline stage pairing %xmm1 with the %xmm3 B
   stream.  Multiplies %xmm1 by B offsets 42..46 and accumulates into
   %xmm4..%xmm7; pre-loads 56*SIZE(BB) into %xmm3 and advances %xmm1
   to 12*SIZE(AA). */
155 #define KERNEL6(address) \
156 	mulpd %xmm1, %xmm3; \
157 	addpd %xmm3, %xmm4; \
158 	movapd 42 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
159 	mulpd %xmm1, %xmm3; \
160 	addpd %xmm3, %xmm5; \
161 	movapd 44 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
162 	mulpd %xmm1, %xmm3; \
163 	mulpd 46 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
164 	addpd %xmm3, %xmm6; \
165 	movapd 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
166 	addpd %xmm1, %xmm7; \
167 	movapd 12 * SIZE + (address) * 1 * SIZE(AA), %xmm1
/* KERNEL7(address): pipeline stage pairing %xmm1 with the %xmm2 B
   stream.  Multiplies %xmm1 by B offsets 50..54 and accumulates into
   %xmm4..%xmm7; pre-loads 64*SIZE(BB) into %xmm2 and advances %xmm1
   to 14*SIZE(AA). */
169 #define KERNEL7(address) \
170 	mulpd %xmm1, %xmm2; \
171 	addpd %xmm2, %xmm4; \
172 	movapd 50 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
173 	mulpd %xmm1, %xmm2; \
174 	addpd %xmm2, %xmm5; \
175 	movapd 52 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
176 	mulpd %xmm1, %xmm2; \
177 	mulpd 54 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
178 	addpd %xmm2, %xmm6; \
179 	movapd 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
180 	addpd %xmm1, %xmm7; \
181 	movapd 14 * SIZE + (address) * 1 * SIZE(AA), %xmm1
/* KERNEL8(address): final stage of the unrolled iteration.  Multiplies
   %xmm1 by B offsets 58..62 and accumulates into %xmm4..%xmm7; the
   tail loads (72*SIZE(BB) into %xmm3, 24*SIZE(AA) into %xmm1) prime
   the registers for the next 8-stage iteration, matching the pointer
   advances (AA += 16*SIZE, BB += 64*SIZE per iteration) done by the
   loop driver. */
183 #define KERNEL8(address) \
184 	mulpd %xmm1, %xmm3; \
185 	addpd %xmm3, %xmm4; \
186 	movapd 58 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
187 	mulpd %xmm1, %xmm3; \
188 	addpd %xmm3, %xmm5; \
189 	movapd 60 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
190 	mulpd %xmm1, %xmm3; \
191 	mulpd 62 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
192 	addpd %xmm3, %xmm6; \
193 	movapd 72 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
194 	addpd %xmm1, %xmm7; \
195 	movapd 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1
208 movl %esp, %esi # save old stack
210 subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp
211 andl $-STACK_ALIGN, %esp
212 addl $STACK_OFFSET, %esp
237 leal (, LDC, SIZE), LDC
241 leal (, %eax, SIZE), %eax
249 leal (, %eax, SIZE), %eax
284 sall $2 + BASE_SHIFT, %eax
288 #if defined(LN) || defined(RT)
291 leal (, %eax, SIZE), %eax
293 leal (BB, %eax, 8), BB
301 #if defined(LT) || defined(RN)
312 #define COPYPREFETCH 40
314 prefetchnta (COPYPREFETCH) * SIZE(B)
316 movq 0 * SIZE(B), %mm0
317 movq 1 * SIZE(B), %mm1
318 movq 2 * SIZE(B), %mm2
319 movq 3 * SIZE(B), %mm3
320 movq 4 * SIZE(B), %mm4
321 movq 5 * SIZE(B), %mm5
322 movq 6 * SIZE(B), %mm6
323 movq 7 * SIZE(B), %mm7
325 movq %mm0, 0 * SIZE(BB)
326 movq %mm0, 1 * SIZE(BB)
327 movq %mm1, 2 * SIZE(BB)
328 movq %mm1, 3 * SIZE(BB)
329 movq %mm2, 4 * SIZE(BB)
330 movq %mm2, 5 * SIZE(BB)
331 movq %mm3, 6 * SIZE(BB)
332 movq %mm3, 7 * SIZE(BB)
334 movq %mm4, 8 * SIZE(BB)
335 movq %mm4, 9 * SIZE(BB)
336 movq %mm5, 10 * SIZE(BB)
337 movq %mm5, 11 * SIZE(BB)
338 movq %mm6, 12 * SIZE(BB)
339 movq %mm6, 13 * SIZE(BB)
340 movq %mm7, 14 * SIZE(BB)
341 movq %mm7, 15 * SIZE(BB)
350 #if defined(LT) || defined(RN)
360 movq 0 * SIZE(B), %mm0
361 movq 1 * SIZE(B), %mm1
362 movq 2 * SIZE(B), %mm2
363 movq 3 * SIZE(B), %mm3
365 movq %mm0, 0 * SIZE(BB)
366 movq %mm0, 1 * SIZE(BB)
367 movq %mm1, 2 * SIZE(BB)
368 movq %mm1, 3 * SIZE(BB)
369 movq %mm2, 4 * SIZE(BB)
370 movq %mm2, 5 * SIZE(BB)
371 movq %mm3, 6 * SIZE(BB)
372 movq %mm3, 7 * SIZE(BB)
378 #if defined(LT) || defined(RN)
385 leal (, LDC, 4), %eax
396 testl $1, %ebx # i = (m >> 2)
401 sall $BASE_SHIFT, %eax
405 #if defined(LN) || defined(RT)
408 leal (AA, %eax, SIZE), AA
413 #if defined(LN) || defined(RT)
415 sall $3 + BASE_SHIFT, %eax
424 movlpd 0 * SIZE(AA), %xmm0
425 movlpd 4 * SIZE(AA), %xmm1
426 movlpd 0 * SIZE(BB), %xmm2
427 movlpd 8 * SIZE(BB), %xmm3
429 #if defined(LT) || defined(RN)
442 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
443 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
445 movlpd 2 * SIZE(BB), %xmm2
448 movlpd 4 * SIZE(BB), %xmm2
450 mulsd 6 * SIZE(BB), %xmm0
452 movlpd 16 * SIZE(BB), %xmm2
454 movlpd 1 * SIZE(AA), %xmm0
457 movlpd 10 * SIZE(BB), %xmm3
460 movlpd 12 * SIZE(BB), %xmm3
462 mulsd 14 * SIZE(BB), %xmm0
464 movlpd 24 * SIZE(BB), %xmm3
466 movlpd 2 * SIZE(AA), %xmm0
469 movlpd 18 * SIZE(BB), %xmm2
472 movlpd 20 * SIZE(BB), %xmm2
474 mulsd 22 * SIZE(BB), %xmm0
476 movlpd 32 * SIZE(BB), %xmm2
478 movlpd 3 * SIZE(AA), %xmm0
481 movlpd 26 * SIZE(BB), %xmm3
484 movlpd 28 * SIZE(BB), %xmm3
486 mulsd 30 * SIZE(BB), %xmm0
488 movlpd 40 * SIZE(BB), %xmm3
490 movlpd 8 * SIZE(AA), %xmm0
491 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
492 PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
496 movlpd 34 * SIZE(BB), %xmm2
499 movlpd 36 * SIZE(BB), %xmm2
501 mulsd 38 * SIZE(BB), %xmm1
503 movlpd 48 * SIZE(BB), %xmm2
505 movlpd 5 * SIZE(AA), %xmm1
508 movlpd 42 * SIZE(BB), %xmm3
511 movlpd 44 * SIZE(BB), %xmm3
513 mulsd 46 * SIZE(BB), %xmm1
515 movlpd 56 * SIZE(BB), %xmm3
517 movlpd 6 * SIZE(AA), %xmm1
520 movlpd 50 * SIZE(BB), %xmm2
523 movlpd 52 * SIZE(BB), %xmm2
525 mulsd 54 * SIZE(BB), %xmm1
527 movlpd 64 * SIZE(BB), %xmm2
529 movlpd 7 * SIZE(AA), %xmm1
532 movlpd 58 * SIZE(BB), %xmm3
535 movlpd 60 * SIZE(BB), %xmm3
537 mulsd 62 * SIZE(BB), %xmm1
539 movlpd 72 * SIZE(BB), %xmm3
542 movlpd 12 * SIZE(AA), %xmm1
549 #if defined(LT) || defined(RN)
555 andl $7, %eax # if (k & 1)
562 movlpd 2 * SIZE(BB), %xmm2
565 movlpd 4 * SIZE(BB), %xmm2
567 mulsd 6 * SIZE(BB), %xmm0
569 movlpd 8 * SIZE(BB), %xmm2
571 movlpd 1 * SIZE(AA), %xmm0
580 #if defined(LN) || defined(RT)
592 leal (, %eax, SIZE), %eax
595 leal (BB, %eax, 8), BB
598 #if defined(LN) || defined(LT)
599 unpcklpd %xmm5, %xmm4
600 unpcklpd %xmm7, %xmm6
602 movapd 0 * SIZE(B), %xmm2
603 movapd 2 * SIZE(B), %xmm5
608 movlpd 0 * SIZE(AA), %xmm0
609 movlpd 1 * SIZE(AA), %xmm1
610 movlpd 2 * SIZE(AA), %xmm2
611 movlpd 3 * SIZE(AA), %xmm3
620 movlpd 0 * SIZE(AA), %xmm4
621 movhpd 0 * SIZE(AA), %xmm4
627 movlpd 0 * SIZE(AA), %xmm4
628 movhpd 0 * SIZE(AA), %xmm4
634 movlpd 0 * SIZE(B), %xmm4
636 movlpd 1 * SIZE(B), %xmm4
639 movlpd 2 * SIZE(B), %xmm4
642 movlpd 3 * SIZE(B), %xmm4
646 movlpd 5 * SIZE(B), %xmm4
648 movlpd 6 * SIZE(B), %xmm4
651 movlpd 7 * SIZE(B), %xmm4
655 movlpd 10 * SIZE(B), %xmm4
657 movlpd 11 * SIZE(B), %xmm4
661 movlpd 15 * SIZE(B), %xmm4
666 movlpd 15 * SIZE(B), %xmm4
668 movlpd 14 * SIZE(B), %xmm4
671 movlpd 13 * SIZE(B), %xmm4
674 movlpd 12 * SIZE(B), %xmm4
678 movlpd 10 * SIZE(B), %xmm4
680 movlpd 9 * SIZE(B), %xmm4
683 movlpd 8 * SIZE(B), %xmm4
687 movlpd 5 * SIZE(B), %xmm4
689 movlpd 4 * SIZE(B), %xmm4
693 movlpd 0 * SIZE(B), %xmm4
697 #if defined(LN) || defined(LT)
698 movapd %xmm2, 0 * SIZE(B)
699 movapd %xmm5, 2 * SIZE(B)
701 movlpd %xmm2, 0 * SIZE(BB)
702 movlpd %xmm2, 1 * SIZE(BB)
703 movhpd %xmm2, 2 * SIZE(BB)
704 movhpd %xmm2, 3 * SIZE(BB)
705 movlpd %xmm5, 4 * SIZE(BB)
706 movlpd %xmm5, 5 * SIZE(BB)
707 movhpd %xmm5, 6 * SIZE(BB)
708 movhpd %xmm5, 7 * SIZE(BB)
710 movlpd %xmm0, 0 * SIZE(AA)
711 movlpd %xmm1, 1 * SIZE(AA)
712 movlpd %xmm2, 2 * SIZE(AA)
713 movlpd %xmm3, 3 * SIZE(AA)
720 leal (LDC, LDC, 2), %eax
722 #if defined(LN) || defined(LT)
723 movlpd %xmm2, 0 * SIZE(CO1)
724 movhpd %xmm2, 0 * SIZE(CO1, LDC, 1)
725 movlpd %xmm5, 0 * SIZE(CO1, LDC, 2)
726 movhpd %xmm5, 0 * SIZE(CO1, %eax, 1)
728 movlpd %xmm0, 0 * SIZE(CO1)
729 movlpd %xmm1, 0 * SIZE(CO1, LDC, 1)
730 movlpd %xmm2, 0 * SIZE(CO1, LDC, 2)
731 movlpd %xmm3, 0 * SIZE(CO1, %eax, 1)
738 #if defined(LT) || defined(RN)
741 leal (AA,%eax, SIZE), AA
759 sall $BASE_SHIFT, %eax
766 sarl $1, %ebx # i = (m >> 2)
773 sall $1 + BASE_SHIFT, %eax
777 #if defined(LN) || defined(RT)
780 leal (, %eax, SIZE), %eax
781 leal (AA, %eax, 2), AA
786 #if defined(LN) || defined(RT)
788 sall $3 + BASE_SHIFT, %eax
797 movapd 0 * SIZE(AA), %xmm0
798 movapd 8 * SIZE(AA), %xmm1
799 movapd 0 * SIZE(BB), %xmm2
800 movapd 8 * SIZE(BB), %xmm3
802 leal (LDC, LDC, 2), %eax
805 prefetchw -2 * SIZE(CO1)
806 prefetchw -2 * SIZE(CO1, LDC)
807 prefetchw -2 * SIZE(CO1, LDC, 2)
808 prefetchw -2 * SIZE(CO1, %eax)
810 prefetchw 1 * SIZE(CO1)
811 prefetchw 1 * SIZE(CO1, LDC)
812 prefetchw 1 * SIZE(CO1, LDC, 2)
813 prefetchw 1 * SIZE(CO1, %eax)
816 #if defined(LT) || defined(RN)
907 addl $128 * 4 * SIZE, BB
908 addl $128 * 1 * SIZE, AA
914 leal (AA, %eax, 1), AA
915 leal (BB, %eax, 4), BB
941 #if defined(LT) || defined(RN)
947 andl $7, %eax # if (k & 1)
955 movapd 2 * SIZE(BB), %xmm2
958 movapd 4 * SIZE(BB), %xmm2
960 mulpd 6 * SIZE(BB), %xmm0
962 movapd 8 * SIZE(BB), %xmm2
964 movapd 2 * SIZE(AA), %xmm0
973 #if defined(LN) || defined(RT)
985 leal (, %eax, SIZE), %eax
986 leal (AA, %eax, 2), AA
988 leal (BB, %eax, 8), BB
991 #if defined(LN) || defined(LT)
993 unpcklpd %xmm5, %xmm4
994 unpckhpd %xmm5, %xmm0
997 unpcklpd %xmm7, %xmm6
998 unpckhpd %xmm7, %xmm1
1000 movapd 0 * SIZE(B), %xmm2
1001 movapd 2 * SIZE(B), %xmm5
1002 movapd 4 * SIZE(B), %xmm3
1003 movapd 6 * SIZE(B), %xmm7
1010 movapd 0 * SIZE(AA), %xmm0
1011 movapd 2 * SIZE(AA), %xmm1
1012 movapd 4 * SIZE(AA), %xmm2
1013 movapd 6 * SIZE(AA), %xmm3
1022 movlpd 3 * SIZE(AA), %xmm4
1023 movhpd 3 * SIZE(AA), %xmm4
1027 movlpd 2 * SIZE(AA), %xmm4
1028 movhpd 2 * SIZE(AA), %xmm4
1035 movlpd 0 * SIZE(AA), %xmm4
1036 movhpd 0 * SIZE(AA), %xmm4
1043 movlpd 0 * SIZE(AA), %xmm4
1044 movhpd 0 * SIZE(AA), %xmm4
1048 movlpd 1 * SIZE(AA), %xmm4
1049 movhpd 1 * SIZE(AA), %xmm4
1056 movlpd 3 * SIZE(AA), %xmm4
1057 movhpd 3 * SIZE(AA), %xmm4
1063 movlpd 0 * SIZE(B), %xmm4
1064 movhpd 0 * SIZE(B), %xmm4
1066 movlpd 1 * SIZE(B), %xmm4
1067 movhpd 1 * SIZE(B), %xmm4
1070 movlpd 2 * SIZE(B), %xmm4
1071 movhpd 2 * SIZE(B), %xmm4
1074 movlpd 3 * SIZE(B), %xmm4
1075 movhpd 3 * SIZE(B), %xmm4
1079 movlpd 5 * SIZE(B), %xmm4
1080 movhpd 5 * SIZE(B), %xmm4
1082 movlpd 6 * SIZE(B), %xmm4
1083 movhpd 6 * SIZE(B), %xmm4
1086 movlpd 7 * SIZE(B), %xmm4
1087 movhpd 7 * SIZE(B), %xmm4
1091 movlpd 10 * SIZE(B), %xmm4
1092 movhpd 10 * SIZE(B), %xmm4
1094 movlpd 11 * SIZE(B), %xmm4
1095 movhpd 11 * SIZE(B), %xmm4
1099 movlpd 15 * SIZE(B), %xmm4
1100 movhpd 15 * SIZE(B), %xmm4
1105 movlpd 15 * SIZE(B), %xmm4
1106 movhpd 15 * SIZE(B), %xmm4
1108 movlpd 14 * SIZE(B), %xmm4
1109 movhpd 14 * SIZE(B), %xmm4
1112 movlpd 13 * SIZE(B), %xmm4
1113 movhpd 13 * SIZE(B), %xmm4
1116 movlpd 12 * SIZE(B), %xmm4
1117 movhpd 12 * SIZE(B), %xmm4
1121 movlpd 10 * SIZE(B), %xmm4
1122 movhpd 10 * SIZE(B), %xmm4
1124 movlpd 9 * SIZE(B), %xmm4
1125 movhpd 9 * SIZE(B), %xmm4
1128 movlpd 8 * SIZE(B), %xmm4
1129 movhpd 8 * SIZE(B), %xmm4
1133 movlpd 5 * SIZE(B), %xmm4
1134 movhpd 5 * SIZE(B), %xmm4
1136 movlpd 4 * SIZE(B), %xmm4
1137 movhpd 4 * SIZE(B), %xmm4
1141 movlpd 0 * SIZE(B), %xmm4
1142 movhpd 0 * SIZE(B), %xmm4
1146 #if defined(LN) || defined(LT)
1147 movapd %xmm2, 0 * SIZE(B)
1148 movapd %xmm5, 2 * SIZE(B)
1149 movapd %xmm3, 4 * SIZE(B)
1150 movapd %xmm7, 6 * SIZE(B)
1152 movlpd %xmm2, 0 * SIZE(BB)
1153 movlpd %xmm2, 1 * SIZE(BB)
1154 movhpd %xmm2, 2 * SIZE(BB)
1155 movhpd %xmm2, 3 * SIZE(BB)
1156 movlpd %xmm5, 4 * SIZE(BB)
1157 movlpd %xmm5, 5 * SIZE(BB)
1158 movhpd %xmm5, 6 * SIZE(BB)
1159 movhpd %xmm5, 7 * SIZE(BB)
1160 movlpd %xmm3, 8 * SIZE(BB)
1161 movlpd %xmm3, 9 * SIZE(BB)
1162 movhpd %xmm3, 10 * SIZE(BB)
1163 movhpd %xmm3, 11 * SIZE(BB)
1164 movlpd %xmm7, 12 * SIZE(BB)
1165 movlpd %xmm7, 13 * SIZE(BB)
1166 movhpd %xmm7, 14 * SIZE(BB)
1167 movhpd %xmm7, 15 * SIZE(BB)
1169 movapd %xmm0, 0 * SIZE(AA)
1170 movapd %xmm1, 2 * SIZE(AA)
1171 movapd %xmm2, 4 * SIZE(AA)
1172 movapd %xmm3, 6 * SIZE(AA)
1179 leal (LDC, LDC, 2), %eax
1181 #if defined(LN) || defined(LT)
1182 movlpd %xmm2, 0 * SIZE(CO1)
1183 movlpd %xmm3, 1 * SIZE(CO1)
1184 movhpd %xmm2, 0 * SIZE(CO1, LDC, 1)
1185 movhpd %xmm3, 1 * SIZE(CO1, LDC, 1)
1186 movlpd %xmm5, 0 * SIZE(CO1, LDC, 2)
1187 movlpd %xmm7, 1 * SIZE(CO1, LDC, 2)
1188 movhpd %xmm5, 0 * SIZE(CO1, %eax, 1)
1189 movhpd %xmm7, 1 * SIZE(CO1, %eax, 1)
1191 movlpd %xmm0, 0 * SIZE(CO1)
1192 movhpd %xmm0, 1 * SIZE(CO1)
1193 movlpd %xmm1, 0 * SIZE(CO1, LDC, 1)
1194 movhpd %xmm1, 1 * SIZE(CO1, LDC, 1)
1195 movlpd %xmm2, 0 * SIZE(CO1, LDC, 2)
1196 movhpd %xmm2, 1 * SIZE(CO1, LDC, 2)
1197 movlpd %xmm3, 0 * SIZE(CO1, %eax, 1)
1198 movhpd %xmm3, 1 * SIZE(CO1, %eax, 1)
1205 #if defined(LT) || defined(RN)
1208 leal (,%eax, SIZE), %eax
1209 leal (AA, %eax, 2), AA
1227 sall $1 + BASE_SHIFT, %eax
1238 leal (, %eax, SIZE), %eax
1239 leal (B, %eax, 4), B
1242 #if defined(LT) || defined(RN)
1245 leal (,%eax, SIZE), %eax
1246 leal (B, %eax, 4), B
1275 sall $1 + BASE_SHIFT, %eax
1279 #if defined(LN) || defined(RT)
1282 leal (, %eax, SIZE), %eax
1283 leal (B, %eax, 2), B
1284 leal (BB, %eax, 4), BB
1292 #if defined(LT) || defined(RN)
1303 #define COPYPREFETCH 40
1305 prefetchnta (COPYPREFETCH) * SIZE(B)
1307 movq 0 * SIZE(B), %mm0
1308 movq 1 * SIZE(B), %mm1
1309 movq 2 * SIZE(B), %mm2
1310 movq 3 * SIZE(B), %mm3
1311 movq 4 * SIZE(B), %mm4
1312 movq 5 * SIZE(B), %mm5
1313 movq 6 * SIZE(B), %mm6
1314 movq 7 * SIZE(B), %mm7
1316 movq %mm0, 0 * SIZE(BB)
1317 movq %mm0, 1 * SIZE(BB)
1318 movq %mm1, 2 * SIZE(BB)
1319 movq %mm1, 3 * SIZE(BB)
1320 movq %mm2, 4 * SIZE(BB)
1321 movq %mm2, 5 * SIZE(BB)
1322 movq %mm3, 6 * SIZE(BB)
1323 movq %mm3, 7 * SIZE(BB)
1325 movq %mm4, 8 * SIZE(BB)
1326 movq %mm4, 9 * SIZE(BB)
1327 movq %mm5, 10 * SIZE(BB)
1328 movq %mm5, 11 * SIZE(BB)
1329 movq %mm6, 12 * SIZE(BB)
1330 movq %mm6, 13 * SIZE(BB)
1331 movq %mm7, 14 * SIZE(BB)
1332 movq %mm7, 15 * SIZE(BB)
1341 #if defined(LT) || defined(RN)
1353 movq 0 * SIZE(B), %mm0
1354 movq 1 * SIZE(B), %mm1
1356 movq %mm0, 0 * SIZE(BB)
1357 movq %mm0, 1 * SIZE(BB)
1358 movq %mm1, 2 * SIZE(BB)
1359 movq %mm1, 3 * SIZE(BB)
1368 #if defined(LT) || defined(RN)
1375 leal (, LDC, 2), %eax
1386 testl $1, %ebx # i = (m >> 2)
1391 sall $BASE_SHIFT, %eax
1395 #if defined(LN) || defined(RT)
1398 leal (AA, %eax, SIZE), AA
1403 #if defined(LN) || defined(RT)
1405 sall $2 + BASE_SHIFT, %eax
1414 movlpd 0 * SIZE(AA), %xmm0
1415 movlpd 4 * SIZE(AA), %xmm1
1416 movlpd 0 * SIZE(BB), %xmm2
1417 movlpd 8 * SIZE(BB), %xmm3
1419 #if defined(LT) || defined(RN)
1431 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
1432 mulsd 2 * SIZE(BB), %xmm0
1434 movlpd 4 * SIZE(BB), %xmm2
1436 movlpd 1 * SIZE(AA), %xmm0
1439 mulsd 6 * SIZE(BB), %xmm0
1441 movlpd 16 * SIZE(BB), %xmm2
1443 movlpd 2 * SIZE(AA), %xmm0
1446 mulsd 10 * SIZE(BB), %xmm0
1448 movlpd 12 * SIZE(BB), %xmm3
1450 movlpd 3 * SIZE(AA), %xmm0
1453 mulsd 14 * SIZE(BB), %xmm0
1455 movlpd 24 * SIZE(BB), %xmm3
1457 movlpd 8 * SIZE(AA), %xmm0
1460 mulsd 18 * SIZE(BB), %xmm1
1462 movlpd 20 * SIZE(BB), %xmm2
1464 movlpd 5 * SIZE(AA), %xmm1
1467 mulsd 22 * SIZE(BB), %xmm1
1469 movlpd 32 * SIZE(BB), %xmm2
1471 movlpd 6 * SIZE(AA), %xmm1
1474 mulsd 26 * SIZE(BB), %xmm1
1476 movlpd 28 * SIZE(BB), %xmm3
1478 movlpd 7 * SIZE(AA), %xmm1
1481 mulsd 30 * SIZE(BB), %xmm1
1483 movlpd 40 * SIZE(BB), %xmm3
1485 movlpd 12 * SIZE(AA), %xmm1
1494 #if defined(LT) || defined(RN)
1500 andl $7, %eax # if (k & 1)
1506 mulsd 2 * SIZE(BB), %xmm0
1508 movlpd 4 * SIZE(BB), %xmm2
1510 movlpd 1 * SIZE(AA), %xmm0
1522 #if defined(LN) || defined(RT)
1534 leal (, %eax, SIZE), %eax
1536 leal (B, %eax, 2), B
1537 leal (BB, %eax, 4), BB
1540 #if defined(LN) || defined(LT)
1541 unpcklpd %xmm5, %xmm4
1543 movapd 0 * SIZE(B), %xmm2
1547 movlpd 0 * SIZE(AA), %xmm0
1548 movlpd 1 * SIZE(AA), %xmm1
1555 movlpd 0 * SIZE(AA), %xmm4
1556 movhpd 0 * SIZE(AA), %xmm4
1561 movlpd 0 * SIZE(AA), %xmm4
1562 movhpd 0 * SIZE(AA), %xmm4
1567 movlpd 0 * SIZE(B), %xmm4
1569 movlpd 1 * SIZE(B), %xmm4
1573 movlpd 3 * SIZE(B), %xmm4
1578 movlpd 3 * SIZE(B), %xmm4
1580 movlpd 2 * SIZE(B), %xmm4
1584 movlpd 0 * SIZE(B), %xmm4
1588 #if defined(LN) || defined(LT)
1589 movapd %xmm2, 0 * SIZE(B)
1591 movlpd %xmm2, 0 * SIZE(BB)
1592 movlpd %xmm2, 1 * SIZE(BB)
1593 movhpd %xmm2, 2 * SIZE(BB)
1594 movhpd %xmm2, 3 * SIZE(BB)
1596 movlpd %xmm0, 0 * SIZE(AA)
1597 movlpd %xmm1, 1 * SIZE(AA)
1604 #if defined(LN) || defined(LT)
1605 movlpd %xmm2, 0 * SIZE(CO1)
1606 movhpd %xmm2, 0 * SIZE(CO1, LDC, 1)
1608 movlpd %xmm0, 0 * SIZE(CO1)
1609 movlpd %xmm1, 0 * SIZE(CO1, LDC, 1)
1616 #if defined(LT) || defined(RN)
1619 leal (AA,%eax, SIZE), AA
1637 sall $BASE_SHIFT, %eax
1644 sarl $1, %ebx # i = (m >> 2)
1651 sall $1 + BASE_SHIFT, %eax
1655 #if defined(LN) || defined(RT)
1658 leal (, %eax, SIZE), %eax
1659 leal (AA, %eax, 2), AA
1664 #if defined(LN) || defined(RT)
1666 sall $2 + BASE_SHIFT, %eax
1675 movapd 0 * SIZE(AA), %xmm0
1676 movapd 8 * SIZE(AA), %xmm1
1677 movapd 0 * SIZE(BB), %xmm2
1678 movapd 8 * SIZE(BB), %xmm3
1681 prefetchw -2 * SIZE(CO1)
1682 prefetchw -2 * SIZE(CO1, LDC)
1684 prefetchw 1 * SIZE(CO1)
1685 prefetchw 1 * SIZE(CO1, LDC)
1688 #if defined(LT) || defined(RN)
1700 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
1701 prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
1703 mulpd 2 * SIZE(BB), %xmm0
1705 movapd 4 * SIZE(BB), %xmm2
1707 movapd 2 * SIZE(AA), %xmm0
1710 mulpd 6 * SIZE(BB), %xmm0
1712 movapd 16 * SIZE(BB), %xmm2
1714 movapd 4 * SIZE(AA), %xmm0
1717 mulpd 10 * SIZE(BB), %xmm0
1719 movapd 12 * SIZE(BB), %xmm3
1721 movapd 6 * SIZE(AA), %xmm0
1724 mulpd 14 * SIZE(BB), %xmm0
1726 movapd 24 * SIZE(BB), %xmm3
1728 movapd 16 * SIZE(AA), %xmm0
1730 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
1731 prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
1734 mulpd 18 * SIZE(BB), %xmm1
1736 movapd 20 * SIZE(BB), %xmm2
1738 movapd 10 * SIZE(AA), %xmm1
1741 mulpd 22 * SIZE(BB), %xmm1
1743 movapd 32 * SIZE(BB), %xmm2
1745 movapd 12 * SIZE(AA), %xmm1
1748 mulpd 26 * SIZE(BB), %xmm1
1750 movapd 28 * SIZE(BB), %xmm3
1752 movapd 14 * SIZE(AA), %xmm1
1755 mulpd 30 * SIZE(BB), %xmm1
1757 movapd 40 * SIZE(BB), %xmm3
1759 movapd 24 * SIZE(AA), %xmm1
1768 #if defined(LT) || defined(RN)
1774 andl $7, %eax # if (k & 1)
1781 mulpd 2 * SIZE(BB), %xmm0
1783 movapd 4 * SIZE(BB), %xmm2
1785 movapd 2 * SIZE(AA), %xmm0
1797 #if defined(LN) || defined(RT)
1809 leal (, %eax, SIZE), %eax
1810 leal (AA, %eax, 2), AA
1811 leal (B, %eax, 2), B
1812 leal (BB, %eax, 4), BB
1815 #if defined(LN) || defined(LT)
1817 unpcklpd %xmm5, %xmm4
1818 unpckhpd %xmm5, %xmm0
1820 movapd 0 * SIZE(B), %xmm2
1821 movapd 2 * SIZE(B), %xmm3
1826 movapd 0 * SIZE(AA), %xmm0
1827 movapd 2 * SIZE(AA), %xmm1
1834 movlpd 3 * SIZE(AA), %xmm4
1835 movhpd 3 * SIZE(AA), %xmm4
1838 movlpd 2 * SIZE(AA), %xmm4
1839 movhpd 2 * SIZE(AA), %xmm4
1843 movlpd 0 * SIZE(AA), %xmm4
1844 movhpd 0 * SIZE(AA), %xmm4
1850 movlpd 0 * SIZE(AA), %xmm4
1851 movhpd 0 * SIZE(AA), %xmm4
1854 movlpd 1 * SIZE(AA), %xmm4
1855 movhpd 1 * SIZE(AA), %xmm4
1859 movlpd 3 * SIZE(AA), %xmm4
1860 movhpd 3 * SIZE(AA), %xmm4
1865 movlpd 0 * SIZE(B), %xmm4
1866 movhpd 0 * SIZE(B), %xmm4
1868 movlpd 1 * SIZE(B), %xmm4
1869 movhpd 1 * SIZE(B), %xmm4
1873 movlpd 3 * SIZE(B), %xmm4
1874 movhpd 3 * SIZE(B), %xmm4
1879 movlpd 3 * SIZE(B), %xmm4
1880 movhpd 3 * SIZE(B), %xmm4
1882 movlpd 2 * SIZE(B), %xmm4
1883 movhpd 2 * SIZE(B), %xmm4
1887 movlpd 0 * SIZE(B), %xmm4
1888 movhpd 0 * SIZE(B), %xmm4
1892 #if defined(LN) || defined(LT)
1893 movapd %xmm2, 0 * SIZE(B)
1894 movapd %xmm3, 2 * SIZE(B)
1896 movlpd %xmm2, 0 * SIZE(BB)
1897 movlpd %xmm2, 1 * SIZE(BB)
1898 movhpd %xmm2, 2 * SIZE(BB)
1899 movhpd %xmm2, 3 * SIZE(BB)
1900 movlpd %xmm3, 4 * SIZE(BB)
1901 movlpd %xmm3, 5 * SIZE(BB)
1902 movhpd %xmm3, 6 * SIZE(BB)
1903 movhpd %xmm3, 7 * SIZE(BB)
1905 movapd %xmm0, 0 * SIZE(AA)
1906 movapd %xmm1, 2 * SIZE(AA)
1913 #if defined(LN) || defined(LT)
1914 movlpd %xmm2, 0 * SIZE(CO1)
1915 movlpd %xmm3, 1 * SIZE(CO1)
1916 movhpd %xmm2, 0 * SIZE(CO1, LDC, 1)
1917 movhpd %xmm3, 1 * SIZE(CO1, LDC, 1)
1919 movlpd %xmm0, 0 * SIZE(CO1)
1920 movhpd %xmm0, 1 * SIZE(CO1)
1921 movlpd %xmm1, 0 * SIZE(CO1, LDC, 1)
1922 movhpd %xmm1, 1 * SIZE(CO1, LDC, 1)
1929 #if defined(LT) || defined(RN)
1932 leal (,%eax, SIZE), %eax
1933 leal (AA, %eax, 2), AA
1951 sall $1 + BASE_SHIFT, %eax
1962 leal (, %eax, SIZE), %eax
1963 leal (B, %eax, 2), B
1966 #if defined(LT) || defined(RN)
1969 leal (,%eax, SIZE), %eax
1970 leal (B, %eax, 2), B
1996 sall $BASE_SHIFT, %eax
2000 #if defined(LN) || defined(RT)
2003 leal (, %eax, SIZE), %eax
2004 leal (B, %eax, 1), B
2005 leal (BB, %eax, 2), BB
2013 #if defined(LT) || defined(RN)
2024 #define COPYPREFETCH 40
2026 prefetchnta (COPYPREFETCH) * SIZE(B)
2028 movq 0 * SIZE(B), %mm0
2029 movq 1 * SIZE(B), %mm1
2030 movq 2 * SIZE(B), %mm2
2031 movq 3 * SIZE(B), %mm3
2032 movq 4 * SIZE(B), %mm4
2033 movq 5 * SIZE(B), %mm5
2034 movq 6 * SIZE(B), %mm6
2035 movq 7 * SIZE(B), %mm7
2037 movq %mm0, 0 * SIZE(BB)
2038 movq %mm0, 1 * SIZE(BB)
2039 movq %mm1, 2 * SIZE(BB)
2040 movq %mm1, 3 * SIZE(BB)
2041 movq %mm2, 4 * SIZE(BB)
2042 movq %mm2, 5 * SIZE(BB)
2043 movq %mm3, 6 * SIZE(BB)
2044 movq %mm3, 7 * SIZE(BB)
2046 movq %mm4, 8 * SIZE(BB)
2047 movq %mm4, 9 * SIZE(BB)
2048 movq %mm5, 10 * SIZE(BB)
2049 movq %mm5, 11 * SIZE(BB)
2050 movq %mm6, 12 * SIZE(BB)
2051 movq %mm6, 13 * SIZE(BB)
2052 movq %mm7, 14 * SIZE(BB)
2053 movq %mm7, 15 * SIZE(BB)
2062 #if defined(LT) || defined(RN)
2074 movq 0 * SIZE(B), %mm0
2076 movq %mm0, 0 * SIZE(BB)
2077 movq %mm0, 1 * SIZE(BB)
2086 #if defined(LT) || defined(RN)
2102 testl $1, %ebx # i = (m >> 2)
2107 sall $BASE_SHIFT, %eax
2111 #if defined(LN) || defined(RT)
2114 leal (AA, %eax, SIZE), AA
2119 #if defined(LN) || defined(RT)
2121 sall $1 + BASE_SHIFT, %eax
2130 movlpd 0 * SIZE(AA), %xmm0
2131 movlpd 4 * SIZE(AA), %xmm1
2132 movlpd 0 * SIZE(BB), %xmm2
2133 movlpd 8 * SIZE(BB), %xmm3
2135 #if defined(LT) || defined(RN)
2147 prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
2148 movlpd 1 * SIZE(AA), %xmm0
2149 mulsd 2 * SIZE(BB), %xmm0
2151 movlpd 16 * SIZE(BB), %xmm2
2153 movlpd 2 * SIZE(AA), %xmm0
2154 mulsd 4 * SIZE(BB), %xmm0
2156 movlpd 3 * SIZE(AA), %xmm0
2157 mulsd 6 * SIZE(BB), %xmm0
2159 movlpd 8 * SIZE(AA), %xmm0
2161 movlpd 5 * SIZE(AA), %xmm1
2162 mulsd 10 * SIZE(BB), %xmm1
2164 movlpd 24 * SIZE(BB), %xmm3
2166 movlpd 6 * SIZE(AA), %xmm1
2167 mulsd 12 * SIZE(BB), %xmm1
2169 movlpd 7 * SIZE(AA), %xmm1
2170 mulsd 14 * SIZE(BB), %xmm1
2172 movlpd 12 * SIZE(AA), %xmm1
2181 #if defined(LT) || defined(RN)
2187 andl $7, %eax # if (k & 1)
2194 movlpd 2 * SIZE(BB), %xmm2
2195 movlpd 1 * SIZE(AA), %xmm0
2208 #if defined(LN) || defined(RT)
2220 leal (, %eax, SIZE), %eax
2223 leal (BB, %eax, 2), BB
2226 #if defined(LN) || defined(LT)
2227 movlpd 0 * SIZE(B), %xmm2
2230 movlpd 0 * SIZE(AA), %xmm0
2235 movlpd 0 * SIZE(AA), %xmm4
2240 movlpd 0 * SIZE(AA), %xmm4
2245 movlpd 0 * SIZE(B), %xmm4
2250 movlpd 0 * SIZE(B), %xmm4
2254 #if defined(LN) || defined(LT)
2255 movlpd %xmm2, 0 * SIZE(B)
2257 movlpd %xmm2, 0 * SIZE(BB)
2258 movlpd %xmm2, 1 * SIZE(BB)
2260 movlpd %xmm0, 0 * SIZE(AA)
2267 #if defined(LN) || defined(LT)
2268 movlpd %xmm2, 0 * SIZE(CO1)
2270 movlpd %xmm0, 0 * SIZE(CO1)
2277 #if defined(LT) || defined(RN)
2280 leal (AA,%eax, SIZE), AA
2298 sall $BASE_SHIFT, %eax
2305 sarl $1, %ebx # i = (m >> 2)
2312 sall $1 + BASE_SHIFT, %eax
2316 #if defined(LN) || defined(RT)
2319 leal (, %eax, SIZE), %eax
2320 leal (AA, %eax, 2), AA
2325 #if defined(LN) || defined(RT)
2327 sall $1 + BASE_SHIFT, %eax
2336 movapd 0 * SIZE(AA), %xmm0
2337 movapd 8 * SIZE(AA), %xmm1
2338 movapd 0 * SIZE(BB), %xmm2
2339 movapd 8 * SIZE(BB), %xmm3
2342 prefetchw -2 * SIZE(CO1)
2344 prefetchw 1 * SIZE(CO1)
2347 #if defined(LT) || defined(RN)
2360 prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
2361 movapd 16 * SIZE(BB), %xmm2
2363 movapd 2 * SIZE(AA), %xmm0
2364 mulpd 2 * SIZE(BB), %xmm0
2366 movapd 4 * SIZE(AA), %xmm0
2367 mulpd 4 * SIZE(BB), %xmm0
2369 movapd 6 * SIZE(AA), %xmm0
2370 mulpd 6 * SIZE(BB), %xmm0
2373 movapd 16 * SIZE(AA), %xmm0
2374 prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
2377 movapd 24 * SIZE(BB), %xmm3
2379 movapd 10 * SIZE(AA), %xmm1
2380 mulpd 10 * SIZE(BB), %xmm1
2382 movapd 12 * SIZE(AA), %xmm1
2383 mulpd 12 * SIZE(BB), %xmm1
2385 movapd 14 * SIZE(AA), %xmm1
2386 mulpd 14 * SIZE(BB), %xmm1
2388 movapd 24 * SIZE(AA), %xmm1
2397 #if defined(LT) || defined(RN)
2403 andl $7, %eax # if (k & 1)
2411 movapd 2 * SIZE(AA), %xmm0
2412 movapd 2 * SIZE(BB), %xmm2
2421 #if defined(LN) || defined(RT)
2433 leal (, %eax, SIZE), %eax
2434 leal (AA, %eax, 2), AA
2435 leal (B, %eax, 1), B
2436 leal (BB, %eax, 2), BB
2439 #if defined(LN) || defined(LT)
2440 movapd 0 * SIZE(B), %xmm2
2444 movapd 0 * SIZE(AA), %xmm0
2451 unpckhpd %xmm3, %xmm3
2453 movlpd 3 * SIZE(AA), %xmm4
2456 movlpd 2 * SIZE(AA), %xmm4
2460 movlpd 0 * SIZE(AA), %xmm4
2463 unpcklpd %xmm3, %xmm2
2468 unpckhpd %xmm3, %xmm3
2470 movlpd 0 * SIZE(AA), %xmm4
2473 movlpd 1 * SIZE(AA), %xmm4
2477 movlpd 3 * SIZE(AA), %xmm4
2480 unpcklpd %xmm3, %xmm2
2484 movlpd 0 * SIZE(B), %xmm4
2485 movhpd 0 * SIZE(B), %xmm4
2490 movlpd 0 * SIZE(B), %xmm4
2491 movhpd 0 * SIZE(B), %xmm4
2495 #if defined(LN) || defined(LT)
2496 movapd %xmm2, 0 * SIZE(B)
2498 movlpd %xmm2, 0 * SIZE(BB)
2499 movlpd %xmm2, 1 * SIZE(BB)
2500 movhpd %xmm2, 2 * SIZE(BB)
2501 movhpd %xmm2, 3 * SIZE(BB)
2503 movapd %xmm0, 0 * SIZE(AA)
2510 #if defined(LN) || defined(LT)
2511 movlpd %xmm2, 0 * SIZE(CO1)
2512 movhpd %xmm2, 1 * SIZE(CO1)
2514 movlpd %xmm0, 0 * SIZE(CO1)
2515 movhpd %xmm0, 1 * SIZE(CO1)
2522 #if defined(LT) || defined(RN)
2525 leal (,%eax, SIZE), %eax
2526 leal (AA, %eax, 2), AA
2544 sall $1 + BASE_SHIFT, %eax
2555 leal (B, %eax, SIZE), B
2558 #if defined(LT) || defined(RN)
2561 leal (B,%eax, SIZE), B
2574 movl OLD_STACK, %esp