1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin. */
3 /* All rights reserved. */
5 /* Redistribution and use in source and binary forms, with or */
6 /* without modification, are permitted provided that the following */
7 /* conditions are met: */
9 /* 1. Redistributions of source code must retain the above */
10 /* copyright notice, this list of conditions and the following */
13 /* 2. Redistributions in binary form must reproduce the above */
14 /* copyright notice, this list of conditions and the following */
15 /* disclaimer in the documentation and/or other materials */
16 /* provided with the distribution. */
18 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31 /* POSSIBILITY OF SUCH DAMAGE. */
33 /* The views and conclusions contained in the software and */
34 /* documentation are those of the authors and should not be */
35 /* interpreted as representing official policies, either expressed */
36 /* or implied, of The University of Texas at Austin. */
37 /*********************************************************************/
/* Incoming argument offsets, addressed off %esi, which holds the
   caller's original %esp saved by the prologue (see "movl %esp, %esi"
   later in this file).  STACK and ARGS are defined elsewhere (not
   visible in this chunk) -- presumably in common.h; TODO confirm. */
45 #define OLD_M 4 + STACK + ARGS(%esi)
46 #define OLD_N 8 + STACK + ARGS(%esi)
47 #define OLD_K 12 + STACK + ARGS(%esi)
48 #define OLD_ALPHA 16 + STACK + ARGS(%esi)
/* NOTE(review): offset jumps 16 -> 24 here, i.e. ALPHA appears to
   occupy 8 bytes (a double) -- verify against the caller's ABI. */
49 #define OLD_A 24 + STACK + ARGS(%esi)
50 #define OLD_B 28 + STACK + ARGS(%esi)
51 #define OLD_C 32 + STACK + ARGS(%esi)
52 #define OLD_LDC 36 + STACK + ARGS(%esi)
53 #define OLD_OFFT 40 + STACK + ARGS(%esi)
/* Local scratch slots on the new (re-aligned) stack frame, addressed
   off %esp after the prologue below subtracts and aligns the stack. */
61 #define OLD_STACK 40(%esp)
62 #define OFFSET 44(%esp)
65 #define AORIG 56(%esp)
66 #define BORIG 60(%esp)
/* BUFFER is the start of the page-aligned area where panels of B are
   copied with each element duplicated (see the movq copy loops). */
67 #define BUFFER 128(%esp)
/* Stack re-alignment: round %esp down to a 4096-byte boundary, then
   step back up by 1024 so spills don't all collide in one cache set. */
69 #define STACK_ALIGN 4096
70 #define STACK_OFFSET 1024
/* Prefetch tuning for AMD cores: use the AMD `prefetch` opcode and
   look (8*10 + 4) = 84 elements ahead of the current A pointer.
   NOTE(review): the matching #else/#endif (non-AMD fallback) is not
   visible in this chunk of the file. */
72 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
73 #define PREFETCH prefetch
74 #define PREFETCHSIZE (8 * 10 + 4)
/* KERNEL1..KERNEL8: one 8x software-pipelined unroll of the 2x4 inner
   product.  Each step multiplies a pair of A values (in %xmm0 or
   %xmm1) against four consecutive pairs from the duplicated-B buffer
   BB and accumulates into %xmm4..%xmm7, while preloading the B pair
   and A pair needed two steps later.
   NOTE(review): several interior lines of KERNEL1 (the mulpd/addpd
   between these loads, original lines 84-85/88-89/91/93/95) are
   missing from this listing -- do not treat this macro as complete. */
83 #define KERNEL1(address) \
86 PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \
87 movapd 2 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
90 movapd 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
92 mulpd 6 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
94 movapd 16 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
96 movapd 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0
/* Unroll step 2: A pair in %xmm0 times B pairs 8..15 of the doubled
   BB buffer, accumulating into %xmm4..%xmm7; preloads BB pair 24 into
   %xmm3 and the next A pair (elements 4..5) into %xmm0.
   NOTE(review): the first multiply of this macro (original line 99,
   presumably `mulpd %xmm0, %xmm3`) is elided from this listing. */
98 #define KERNEL2(address) \
100 addpd %xmm3, %xmm4; \
101 movapd 10 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
102 mulpd %xmm0, %xmm3; \
103 addpd %xmm3, %xmm5; \
104 movapd 12 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
105 mulpd %xmm0, %xmm3; \
106 mulpd 14 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
107 addpd %xmm3, %xmm6; \
108 movapd 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
109 addpd %xmm0, %xmm7; \
110 movapd 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0
/* Unroll step 3: A pair in %xmm0 (elements 4..5, loaded by KERNEL2)
   times B pairs 16..23; accumulators %xmm4..%xmm7.  Preloads BB pair
   32 into %xmm2 (consumed by KERNEL5) and A elements 6..7 into %xmm0
   (consumed by KERNEL4).  The load/multiply interleave is deliberate
   software pipelining -- keep instruction order as is. */
112 #define KERNEL3(address) \
113 mulpd %xmm0, %xmm2; \
114 addpd %xmm2, %xmm4; \
115 movapd 18 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
116 mulpd %xmm0, %xmm2; \
117 addpd %xmm2, %xmm5; \
118 movapd 20 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
119 mulpd %xmm0, %xmm2; \
120 mulpd 22 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
121 addpd %xmm2, %xmm6; \
122 movapd 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
123 addpd %xmm0, %xmm7; \
124 movapd 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0
/* Unroll step 4: A pair in %xmm0 (elements 6..7) times B pairs
   24..31; accumulators %xmm4..%xmm7.  Preloads BB pair 40 into %xmm3
   (for KERNEL6) and A elements 16..17 into %xmm0 -- the jump from 6
   to 16 hands the next half-iteration over to the %xmm1 stream
   (elements 8..15) driven by KERNEL5..KERNEL8. */
126 #define KERNEL4(address) \
127 mulpd %xmm0, %xmm3; \
128 addpd %xmm3, %xmm4; \
129 movapd 26 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
130 mulpd %xmm0, %xmm3; \
131 addpd %xmm3, %xmm5; \
132 movapd 28 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
133 mulpd %xmm0, %xmm3; \
134 mulpd 30 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
135 addpd %xmm3, %xmm6; \
136 movapd 40 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
137 addpd %xmm0, %xmm7; \
138 movapd 16 * SIZE + (address) * 1 * SIZE(AA), %xmm0
/* Unroll step 5: switches to the second A stream in %xmm1 (elements
   8..9) times B pairs 32..39; issues the second prefetch of the
   iteration (8 elements further into A).  Preloads BB pair 48 into
   %xmm2 (for KERNEL7) and A elements 10..11 into %xmm1. */
140 #define KERNEL5(address) \
141 PREFETCH (PREFETCHSIZE + 8) * SIZE + (address) * 1 * SIZE(AA); \
142 mulpd %xmm1, %xmm2; \
143 addpd %xmm2, %xmm4; \
144 movapd 34 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
145 mulpd %xmm1, %xmm2; \
146 addpd %xmm2, %xmm5; \
147 movapd 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
148 mulpd %xmm1, %xmm2; \
149 mulpd 38 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
150 addpd %xmm2, %xmm6; \
151 movapd 48 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
152 addpd %xmm1, %xmm7; \
153 movapd 10 * SIZE + (address) * 1 * SIZE(AA), %xmm1
/* Unroll step 6: A pair in %xmm1 (elements 10..11) times B pairs
   40..47; accumulators %xmm4..%xmm7.  Preloads BB pair 56 into %xmm3
   (for KERNEL8) and A elements 12..13 into %xmm1. */
155 #define KERNEL6(address) \
156 mulpd %xmm1, %xmm3; \
157 addpd %xmm3, %xmm4; \
158 movapd 42 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
159 mulpd %xmm1, %xmm3; \
160 addpd %xmm3, %xmm5; \
161 movapd 44 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
162 mulpd %xmm1, %xmm3; \
163 mulpd 46 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
164 addpd %xmm3, %xmm6; \
165 movapd 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
166 addpd %xmm1, %xmm7; \
167 movapd 12 * SIZE + (address) * 1 * SIZE(AA), %xmm1
/* Unroll step 7: A pair in %xmm1 (elements 12..13) times B pairs
   48..55.  Preloads BB pair 64 into %xmm2 -- the first B pair of the
   NEXT macro-iteration -- and A elements 14..15 into %xmm1. */
169 #define KERNEL7(address) \
170 mulpd %xmm1, %xmm2; \
171 addpd %xmm2, %xmm4; \
172 movapd 50 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
173 mulpd %xmm1, %xmm2; \
174 addpd %xmm2, %xmm5; \
175 movapd 52 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
176 mulpd %xmm1, %xmm2; \
177 mulpd 54 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
178 addpd %xmm2, %xmm6; \
179 movapd 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
180 addpd %xmm1, %xmm7; \
181 movapd 14 * SIZE + (address) * 1 * SIZE(AA), %xmm1
/* Unroll step 8 (last): A pair in %xmm1 (elements 14..15) times B
   pairs 56..63.  Preloads BB pair 72 into %xmm3 and A elements 24..25
   into %xmm1, priming both pipelined streams for the next iteration
   (which advances AA by 16 and BB by 64 elements). */
183 #define KERNEL8(address) \
184 mulpd %xmm1, %xmm3; \
185 addpd %xmm3, %xmm4; \
186 movapd 58 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
187 mulpd %xmm1, %xmm3; \
188 addpd %xmm3, %xmm5; \
189 movapd 60 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
190 mulpd %xmm1, %xmm3; \
191 mulpd 62 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
192 addpd %xmm3, %xmm6; \
193 movapd 72 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
194 addpd %xmm1, %xmm7; \
195 movapd 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1
208 movl %esp, %esi # save old stack
210 subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp
211 andl $-STACK_ALIGN, %esp
212 addl $STACK_OFFSET, %esp
237 leal (, LDC, SIZE), LDC
241 leal (, %eax, SIZE), %eax
249 leal (, %eax, SIZE), %eax
280 sall $BASE_SHIFT, %eax
284 #if defined(LN) || defined(RT)
287 leal (, %eax, SIZE), %eax
289 leal (BB, %eax, 2), BB
297 #if defined(LT) || defined(RN)
308 #define COPYPREFETCH 40
310 prefetchnta (COPYPREFETCH) * SIZE(B)
312 movq 0 * SIZE(B), %mm0
313 movq 1 * SIZE(B), %mm1
314 movq 2 * SIZE(B), %mm2
315 movq 3 * SIZE(B), %mm3
316 movq 4 * SIZE(B), %mm4
317 movq 5 * SIZE(B), %mm5
318 movq 6 * SIZE(B), %mm6
319 movq 7 * SIZE(B), %mm7
321 movq %mm0, 0 * SIZE(BB)
322 movq %mm0, 1 * SIZE(BB)
323 movq %mm1, 2 * SIZE(BB)
324 movq %mm1, 3 * SIZE(BB)
325 movq %mm2, 4 * SIZE(BB)
326 movq %mm2, 5 * SIZE(BB)
327 movq %mm3, 6 * SIZE(BB)
328 movq %mm3, 7 * SIZE(BB)
330 movq %mm4, 8 * SIZE(BB)
331 movq %mm4, 9 * SIZE(BB)
332 movq %mm5, 10 * SIZE(BB)
333 movq %mm5, 11 * SIZE(BB)
334 movq %mm6, 12 * SIZE(BB)
335 movq %mm6, 13 * SIZE(BB)
336 movq %mm7, 14 * SIZE(BB)
337 movq %mm7, 15 * SIZE(BB)
346 #if defined(LT) || defined(RN)
358 movq 0 * SIZE(B), %mm0
360 movq %mm0, 0 * SIZE(BB)
361 movq %mm0, 1 * SIZE(BB)
370 #if defined(LT) || defined(RN)
386 sarl $1, %ebx # i = (m >> 2)
393 sall $1 + BASE_SHIFT, %eax
397 #if defined(LN) || defined(RT)
400 leal (, %eax, SIZE), %eax
401 leal (AA, %eax, 2), AA
406 #if defined(LN) || defined(RT)
408 sall $1 + BASE_SHIFT, %eax
417 movapd 0 * SIZE(AA), %xmm0
418 movapd 8 * SIZE(AA), %xmm1
419 movapd 0 * SIZE(BB), %xmm2
420 movapd 8 * SIZE(BB), %xmm3
423 prefetchw -2 * SIZE(CO1)
425 prefetchw 1 * SIZE(CO1)
428 #if defined(LT) || defined(RN)
441 prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
442 movapd 16 * SIZE(BB), %xmm2
444 movapd 2 * SIZE(AA), %xmm0
445 mulpd 2 * SIZE(BB), %xmm0
447 movapd 4 * SIZE(AA), %xmm0
448 mulpd 4 * SIZE(BB), %xmm0
450 movapd 6 * SIZE(AA), %xmm0
451 mulpd 6 * SIZE(BB), %xmm0
454 movapd 16 * SIZE(AA), %xmm0
455 prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
458 movapd 24 * SIZE(BB), %xmm3
460 movapd 10 * SIZE(AA), %xmm1
461 mulpd 10 * SIZE(BB), %xmm1
463 movapd 12 * SIZE(AA), %xmm1
464 mulpd 12 * SIZE(BB), %xmm1
466 movapd 14 * SIZE(AA), %xmm1
467 mulpd 14 * SIZE(BB), %xmm1
469 movapd 24 * SIZE(AA), %xmm1
478 #if defined(LT) || defined(RN)
484 andl $7, %eax # if (k & 1)
492 movapd 2 * SIZE(AA), %xmm0
493 movapd 2 * SIZE(BB), %xmm2
502 #if defined(LN) || defined(RT)
514 leal (, %eax, SIZE), %eax
515 leal (AA, %eax, 2), AA
517 leal (BB, %eax, 2), BB
520 #if defined(LN) || defined(LT)
521 movapd 0 * SIZE(B), %xmm2
525 movapd 0 * SIZE(AA), %xmm0
532 unpckhpd %xmm3, %xmm3
534 movlpd 3 * SIZE(AA), %xmm4
537 movlpd 2 * SIZE(AA), %xmm4
541 movlpd 0 * SIZE(AA), %xmm4
544 unpcklpd %xmm3, %xmm2
549 unpckhpd %xmm3, %xmm3
551 movlpd 0 * SIZE(AA), %xmm4
554 movlpd 1 * SIZE(AA), %xmm4
558 movlpd 3 * SIZE(AA), %xmm4
561 unpcklpd %xmm3, %xmm2
565 movlpd 0 * SIZE(B), %xmm4
566 movhpd 0 * SIZE(B), %xmm4
571 movlpd 0 * SIZE(B), %xmm4
572 movhpd 0 * SIZE(B), %xmm4
576 #if defined(LN) || defined(LT)
577 movapd %xmm2, 0 * SIZE(B)
579 movlpd %xmm2, 0 * SIZE(BB)
580 movlpd %xmm2, 1 * SIZE(BB)
581 movhpd %xmm2, 2 * SIZE(BB)
582 movhpd %xmm2, 3 * SIZE(BB)
584 movapd %xmm0, 0 * SIZE(AA)
591 #if defined(LN) || defined(LT)
592 movlpd %xmm2, 0 * SIZE(CO1)
593 movhpd %xmm2, 1 * SIZE(CO1)
595 movlpd %xmm0, 0 * SIZE(CO1)
596 movhpd %xmm0, 1 * SIZE(CO1)
603 #if defined(LT) || defined(RN)
606 leal (,%eax, SIZE), %eax
607 leal (AA, %eax, 2), AA
625 sall $1 + BASE_SHIFT, %eax
635 testl $1, %ebx # i = (m >> 2)
640 sall $BASE_SHIFT, %eax
644 #if defined(LN) || defined(RT)
647 leal (AA, %eax, SIZE), AA
652 #if defined(LN) || defined(RT)
654 sall $1 + BASE_SHIFT, %eax
663 movlpd 0 * SIZE(AA), %xmm0
664 movlpd 4 * SIZE(AA), %xmm1
665 movlpd 0 * SIZE(BB), %xmm2
666 movlpd 8 * SIZE(BB), %xmm3
668 #if defined(LT) || defined(RN)
680 prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
681 movlpd 1 * SIZE(AA), %xmm0
682 mulsd 2 * SIZE(BB), %xmm0
684 movlpd 16 * SIZE(BB), %xmm2
686 movlpd 2 * SIZE(AA), %xmm0
687 mulsd 4 * SIZE(BB), %xmm0
689 movlpd 3 * SIZE(AA), %xmm0
690 mulsd 6 * SIZE(BB), %xmm0
692 movlpd 8 * SIZE(AA), %xmm0
694 movlpd 5 * SIZE(AA), %xmm1
695 mulsd 10 * SIZE(BB), %xmm1
697 movlpd 24 * SIZE(BB), %xmm3
699 movlpd 6 * SIZE(AA), %xmm1
700 mulsd 12 * SIZE(BB), %xmm1
702 movlpd 7 * SIZE(AA), %xmm1
703 mulsd 14 * SIZE(BB), %xmm1
705 movlpd 12 * SIZE(AA), %xmm1
714 #if defined(LT) || defined(RN)
720 andl $7, %eax # if (k & 1)
727 movlpd 2 * SIZE(BB), %xmm2
728 movlpd 1 * SIZE(AA), %xmm0
741 #if defined(LN) || defined(RT)
753 leal (, %eax, SIZE), %eax
756 leal (BB, %eax, 2), BB
759 #if defined(LN) || defined(LT)
760 movlpd 0 * SIZE(B), %xmm2
763 movlpd 0 * SIZE(AA), %xmm0
768 movlpd 0 * SIZE(AA), %xmm4
773 movlpd 0 * SIZE(AA), %xmm4
778 movlpd 0 * SIZE(B), %xmm4
783 movlpd 0 * SIZE(B), %xmm4
787 #if defined(LN) || defined(LT)
788 movlpd %xmm2, 0 * SIZE(B)
790 movlpd %xmm2, 0 * SIZE(BB)
791 movlpd %xmm2, 1 * SIZE(BB)
793 movlpd %xmm0, 0 * SIZE(AA)
800 #if defined(LN) || defined(LT)
801 movlpd %xmm2, 0 * SIZE(CO1)
803 movlpd %xmm0, 0 * SIZE(CO1)
810 #if defined(LT) || defined(RN)
813 leal (AA,%eax, SIZE), AA
831 sall $BASE_SHIFT, %eax
839 leal (B, %eax, SIZE), B
842 #if defined(LT) || defined(RN)
845 leal (B,%eax, SIZE), B
872 sall $1 + BASE_SHIFT, %eax
876 #if defined(LN) || defined(RT)
879 leal (, %eax, SIZE), %eax
881 leal (BB, %eax, 4), BB
889 #if defined(LT) || defined(RN)
900 #define COPYPREFETCH 40
902 prefetchnta (COPYPREFETCH) * SIZE(B)
904 movq 0 * SIZE(B), %mm0
905 movq 1 * SIZE(B), %mm1
906 movq 2 * SIZE(B), %mm2
907 movq 3 * SIZE(B), %mm3
908 movq 4 * SIZE(B), %mm4
909 movq 5 * SIZE(B), %mm5
910 movq 6 * SIZE(B), %mm6
911 movq 7 * SIZE(B), %mm7
913 movq %mm0, 0 * SIZE(BB)
914 movq %mm0, 1 * SIZE(BB)
915 movq %mm1, 2 * SIZE(BB)
916 movq %mm1, 3 * SIZE(BB)
917 movq %mm2, 4 * SIZE(BB)
918 movq %mm2, 5 * SIZE(BB)
919 movq %mm3, 6 * SIZE(BB)
920 movq %mm3, 7 * SIZE(BB)
922 movq %mm4, 8 * SIZE(BB)
923 movq %mm4, 9 * SIZE(BB)
924 movq %mm5, 10 * SIZE(BB)
925 movq %mm5, 11 * SIZE(BB)
926 movq %mm6, 12 * SIZE(BB)
927 movq %mm6, 13 * SIZE(BB)
928 movq %mm7, 14 * SIZE(BB)
929 movq %mm7, 15 * SIZE(BB)
938 #if defined(LT) || defined(RN)
950 movq 0 * SIZE(B), %mm0
951 movq 1 * SIZE(B), %mm1
953 movq %mm0, 0 * SIZE(BB)
954 movq %mm0, 1 * SIZE(BB)
955 movq %mm1, 2 * SIZE(BB)
956 movq %mm1, 3 * SIZE(BB)
965 #if defined(LT) || defined(RN)
972 leal (, LDC, 2), %eax
983 sarl $1, %ebx # i = (m >> 2)
990 sall $1 + BASE_SHIFT, %eax
994 #if defined(LN) || defined(RT)
997 leal (, %eax, SIZE), %eax
998 leal (AA, %eax, 2), AA
1003 #if defined(LN) || defined(RT)
1005 sall $2 + BASE_SHIFT, %eax
1014 movapd 0 * SIZE(AA), %xmm0
1015 movapd 8 * SIZE(AA), %xmm1
1016 movapd 0 * SIZE(BB), %xmm2
1017 movapd 8 * SIZE(BB), %xmm3
1020 prefetchw -2 * SIZE(CO1)
1021 prefetchw -2 * SIZE(CO1, LDC)
1023 prefetchw 1 * SIZE(CO1)
1024 prefetchw 1 * SIZE(CO1, LDC)
1027 #if defined(LT) || defined(RN)
1039 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
1040 prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
1042 mulpd 2 * SIZE(BB), %xmm0
1044 movapd 4 * SIZE(BB), %xmm2
1046 movapd 2 * SIZE(AA), %xmm0
1049 mulpd 6 * SIZE(BB), %xmm0
1051 movapd 16 * SIZE(BB), %xmm2
1053 movapd 4 * SIZE(AA), %xmm0
1056 mulpd 10 * SIZE(BB), %xmm0
1058 movapd 12 * SIZE(BB), %xmm3
1060 movapd 6 * SIZE(AA), %xmm0
1063 mulpd 14 * SIZE(BB), %xmm0
1065 movapd 24 * SIZE(BB), %xmm3
1067 movapd 16 * SIZE(AA), %xmm0
1069 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
1070 prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
1073 mulpd 18 * SIZE(BB), %xmm1
1075 movapd 20 * SIZE(BB), %xmm2
1077 movapd 10 * SIZE(AA), %xmm1
1080 mulpd 22 * SIZE(BB), %xmm1
1082 movapd 32 * SIZE(BB), %xmm2
1084 movapd 12 * SIZE(AA), %xmm1
1087 mulpd 26 * SIZE(BB), %xmm1
1089 movapd 28 * SIZE(BB), %xmm3
1091 movapd 14 * SIZE(AA), %xmm1
1094 mulpd 30 * SIZE(BB), %xmm1
1096 movapd 40 * SIZE(BB), %xmm3
1098 movapd 24 * SIZE(AA), %xmm1
1107 #if defined(LT) || defined(RN)
1113 andl $7, %eax # if (k & 1)
1120 mulpd 2 * SIZE(BB), %xmm0
1122 movapd 4 * SIZE(BB), %xmm2
1124 movapd 2 * SIZE(AA), %xmm0
1136 #if defined(LN) || defined(RT)
1148 leal (, %eax, SIZE), %eax
1149 leal (AA, %eax, 2), AA
1150 leal (B, %eax, 2), B
1151 leal (BB, %eax, 4), BB
1154 #if defined(LN) || defined(LT)
1156 unpcklpd %xmm5, %xmm4
1157 unpckhpd %xmm5, %xmm0
1159 movapd 0 * SIZE(B), %xmm2
1160 movapd 2 * SIZE(B), %xmm3
1165 movapd 0 * SIZE(AA), %xmm0
1166 movapd 2 * SIZE(AA), %xmm1
1173 movlpd 3 * SIZE(AA), %xmm4
1174 movhpd 3 * SIZE(AA), %xmm4
1177 movlpd 2 * SIZE(AA), %xmm4
1178 movhpd 2 * SIZE(AA), %xmm4
1182 movlpd 0 * SIZE(AA), %xmm4
1183 movhpd 0 * SIZE(AA), %xmm4
1189 movlpd 0 * SIZE(AA), %xmm4
1190 movhpd 0 * SIZE(AA), %xmm4
1193 movlpd 1 * SIZE(AA), %xmm4
1194 movhpd 1 * SIZE(AA), %xmm4
1198 movlpd 3 * SIZE(AA), %xmm4
1199 movhpd 3 * SIZE(AA), %xmm4
1204 movlpd 0 * SIZE(B), %xmm4
1205 movhpd 0 * SIZE(B), %xmm4
1207 movlpd 1 * SIZE(B), %xmm4
1208 movhpd 1 * SIZE(B), %xmm4
1212 movlpd 3 * SIZE(B), %xmm4
1213 movhpd 3 * SIZE(B), %xmm4
1218 movlpd 3 * SIZE(B), %xmm4
1219 movhpd 3 * SIZE(B), %xmm4
1221 movlpd 2 * SIZE(B), %xmm4
1222 movhpd 2 * SIZE(B), %xmm4
1226 movlpd 0 * SIZE(B), %xmm4
1227 movhpd 0 * SIZE(B), %xmm4
1231 #if defined(LN) || defined(LT)
1232 movapd %xmm2, 0 * SIZE(B)
1233 movapd %xmm3, 2 * SIZE(B)
1235 movlpd %xmm2, 0 * SIZE(BB)
1236 movlpd %xmm2, 1 * SIZE(BB)
1237 movhpd %xmm2, 2 * SIZE(BB)
1238 movhpd %xmm2, 3 * SIZE(BB)
1239 movlpd %xmm3, 4 * SIZE(BB)
1240 movlpd %xmm3, 5 * SIZE(BB)
1241 movhpd %xmm3, 6 * SIZE(BB)
1242 movhpd %xmm3, 7 * SIZE(BB)
1244 movapd %xmm0, 0 * SIZE(AA)
1245 movapd %xmm1, 2 * SIZE(AA)
1252 #if defined(LN) || defined(LT)
1253 movlpd %xmm2, 0 * SIZE(CO1)
1254 movlpd %xmm3, 1 * SIZE(CO1)
1255 movhpd %xmm2, 0 * SIZE(CO1, LDC, 1)
1256 movhpd %xmm3, 1 * SIZE(CO1, LDC, 1)
1258 movlpd %xmm0, 0 * SIZE(CO1)
1259 movhpd %xmm0, 1 * SIZE(CO1)
1260 movlpd %xmm1, 0 * SIZE(CO1, LDC, 1)
1261 movhpd %xmm1, 1 * SIZE(CO1, LDC, 1)
1268 #if defined(LT) || defined(RN)
1271 leal (,%eax, SIZE), %eax
1272 leal (AA, %eax, 2), AA
1290 sall $1 + BASE_SHIFT, %eax
1300 testl $1, %ebx # i = (m >> 2)
1305 sall $BASE_SHIFT, %eax
1309 #if defined(LN) || defined(RT)
1312 leal (AA, %eax, SIZE), AA
1317 #if defined(LN) || defined(RT)
1319 sall $2 + BASE_SHIFT, %eax
1328 movlpd 0 * SIZE(AA), %xmm0
1329 movlpd 4 * SIZE(AA), %xmm1
1330 movlpd 0 * SIZE(BB), %xmm2
1331 movlpd 8 * SIZE(BB), %xmm3
1333 #if defined(LT) || defined(RN)
1345 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
1346 mulsd 2 * SIZE(BB), %xmm0
1348 movlpd 4 * SIZE(BB), %xmm2
1350 movlpd 1 * SIZE(AA), %xmm0
1353 mulsd 6 * SIZE(BB), %xmm0
1355 movlpd 16 * SIZE(BB), %xmm2
1357 movlpd 2 * SIZE(AA), %xmm0
1360 mulsd 10 * SIZE(BB), %xmm0
1362 movlpd 12 * SIZE(BB), %xmm3
1364 movlpd 3 * SIZE(AA), %xmm0
1367 mulsd 14 * SIZE(BB), %xmm0
1369 movlpd 24 * SIZE(BB), %xmm3
1371 movlpd 8 * SIZE(AA), %xmm0
1374 mulsd 18 * SIZE(BB), %xmm1
1376 movlpd 20 * SIZE(BB), %xmm2
1378 movlpd 5 * SIZE(AA), %xmm1
1381 mulsd 22 * SIZE(BB), %xmm1
1383 movlpd 32 * SIZE(BB), %xmm2
1385 movlpd 6 * SIZE(AA), %xmm1
1388 mulsd 26 * SIZE(BB), %xmm1
1390 movlpd 28 * SIZE(BB), %xmm3
1392 movlpd 7 * SIZE(AA), %xmm1
1395 mulsd 30 * SIZE(BB), %xmm1
1397 movlpd 40 * SIZE(BB), %xmm3
1399 movlpd 12 * SIZE(AA), %xmm1
1408 #if defined(LT) || defined(RN)
1414 andl $7, %eax # if (k & 1)
1420 mulsd 2 * SIZE(BB), %xmm0
1422 movlpd 4 * SIZE(BB), %xmm2
1424 movlpd 1 * SIZE(AA), %xmm0
1436 #if defined(LN) || defined(RT)
1448 leal (, %eax, SIZE), %eax
1450 leal (B, %eax, 2), B
1451 leal (BB, %eax, 4), BB
1454 #if defined(LN) || defined(LT)
1455 unpcklpd %xmm5, %xmm4
1457 movapd 0 * SIZE(B), %xmm2
1461 movlpd 0 * SIZE(AA), %xmm0
1462 movlpd 1 * SIZE(AA), %xmm1
1469 movlpd 0 * SIZE(AA), %xmm4
1470 movhpd 0 * SIZE(AA), %xmm4
1475 movlpd 0 * SIZE(AA), %xmm4
1476 movhpd 0 * SIZE(AA), %xmm4
1481 movlpd 0 * SIZE(B), %xmm4
1483 movlpd 1 * SIZE(B), %xmm4
1487 movlpd 3 * SIZE(B), %xmm4
1492 movlpd 3 * SIZE(B), %xmm4
1494 movlpd 2 * SIZE(B), %xmm4
1498 movlpd 0 * SIZE(B), %xmm4
1502 #if defined(LN) || defined(LT)
1503 movapd %xmm2, 0 * SIZE(B)
1505 movlpd %xmm2, 0 * SIZE(BB)
1506 movlpd %xmm2, 1 * SIZE(BB)
1507 movhpd %xmm2, 2 * SIZE(BB)
1508 movhpd %xmm2, 3 * SIZE(BB)
1510 movlpd %xmm0, 0 * SIZE(AA)
1511 movlpd %xmm1, 1 * SIZE(AA)
1518 #if defined(LN) || defined(LT)
1519 movlpd %xmm2, 0 * SIZE(CO1)
1520 movhpd %xmm2, 0 * SIZE(CO1, LDC, 1)
1522 movlpd %xmm0, 0 * SIZE(CO1)
1523 movlpd %xmm1, 0 * SIZE(CO1, LDC, 1)
1530 #if defined(LT) || defined(RN)
1533 leal (AA,%eax, SIZE), AA
1551 sall $BASE_SHIFT, %eax
1559 leal (, %eax, SIZE), %eax
1560 leal (B, %eax, 2), B
1563 #if defined(LT) || defined(RN)
1566 leal (,%eax, SIZE), %eax
1567 leal (B, %eax, 2), B
1597 sall $2 + BASE_SHIFT, %eax
1601 #if defined(LN) || defined(RT)
1604 leal (, %eax, SIZE), %eax
1605 leal (B, %eax, 4), B
1606 leal (BB, %eax, 8), BB
1614 #if defined(LT) || defined(RN)
1625 #define COPYPREFETCH 40
1627 prefetchnta (COPYPREFETCH) * SIZE(B)
1629 movq 0 * SIZE(B), %mm0
1630 movq 1 * SIZE(B), %mm1
1631 movq 2 * SIZE(B), %mm2
1632 movq 3 * SIZE(B), %mm3
1633 movq 4 * SIZE(B), %mm4
1634 movq 5 * SIZE(B), %mm5
1635 movq 6 * SIZE(B), %mm6
1636 movq 7 * SIZE(B), %mm7
1638 movq %mm0, 0 * SIZE(BB)
1639 movq %mm0, 1 * SIZE(BB)
1640 movq %mm1, 2 * SIZE(BB)
1641 movq %mm1, 3 * SIZE(BB)
1642 movq %mm2, 4 * SIZE(BB)
1643 movq %mm2, 5 * SIZE(BB)
1644 movq %mm3, 6 * SIZE(BB)
1645 movq %mm3, 7 * SIZE(BB)
1647 movq %mm4, 8 * SIZE(BB)
1648 movq %mm4, 9 * SIZE(BB)
1649 movq %mm5, 10 * SIZE(BB)
1650 movq %mm5, 11 * SIZE(BB)
1651 movq %mm6, 12 * SIZE(BB)
1652 movq %mm6, 13 * SIZE(BB)
1653 movq %mm7, 14 * SIZE(BB)
1654 movq %mm7, 15 * SIZE(BB)
1663 #if defined(LT) || defined(RN)
1673 movq 0 * SIZE(B), %mm0
1674 movq 1 * SIZE(B), %mm1
1675 movq 2 * SIZE(B), %mm2
1676 movq 3 * SIZE(B), %mm3
1678 movq %mm0, 0 * SIZE(BB)
1679 movq %mm0, 1 * SIZE(BB)
1680 movq %mm1, 2 * SIZE(BB)
1681 movq %mm1, 3 * SIZE(BB)
1682 movq %mm2, 4 * SIZE(BB)
1683 movq %mm2, 5 * SIZE(BB)
1684 movq %mm3, 6 * SIZE(BB)
1685 movq %mm3, 7 * SIZE(BB)
1691 #if defined(LT) || defined(RN)
1698 leal (, LDC, 4), %eax
1709 sarl $1, %ebx # i = (m >> 2)
1716 sall $1 + BASE_SHIFT, %eax
1720 #if defined(LN) || defined(RT)
1723 leal (, %eax, SIZE), %eax
1724 leal (AA, %eax, 2), AA
1729 #if defined(LN) || defined(RT)
1731 sall $3 + BASE_SHIFT, %eax
1740 movapd 0 * SIZE(AA), %xmm0
1741 movapd 8 * SIZE(AA), %xmm1
1742 movapd 0 * SIZE(BB), %xmm2
1743 movapd 8 * SIZE(BB), %xmm3
1745 leal (LDC, LDC, 2), %eax
1748 prefetchw -2 * SIZE(CO1)
1749 prefetchw -2 * SIZE(CO1, LDC)
1750 prefetchw -2 * SIZE(CO1, LDC, 2)
1751 prefetchw -2 * SIZE(CO1, %eax)
1753 prefetchw 1 * SIZE(CO1)
1754 prefetchw 1 * SIZE(CO1, LDC)
1755 prefetchw 1 * SIZE(CO1, LDC, 2)
1756 prefetchw 1 * SIZE(CO1, %eax)
1759 #if defined(LT) || defined(RN)
1850 addl $128 * 4 * SIZE, BB
1851 addl $128 * 1 * SIZE, AA
1857 leal (AA, %eax, 1), AA
1858 leal (BB, %eax, 4), BB
1885 #if defined(LT) || defined(RN)
1891 andl $7, %eax # if (k & 1)
1899 movapd 2 * SIZE(BB), %xmm2
1902 movapd 4 * SIZE(BB), %xmm2
1904 mulpd 6 * SIZE(BB), %xmm0
1906 movapd 8 * SIZE(BB), %xmm2
1908 movapd 2 * SIZE(AA), %xmm0
1917 #if defined(LN) || defined(RT)
1929 leal (, %eax, SIZE), %eax
1930 leal (AA, %eax, 2), AA
1931 leal (B, %eax, 4), B
1932 leal (BB, %eax, 8), BB
1935 #if defined(LN) || defined(LT)
1937 unpcklpd %xmm5, %xmm4
1938 unpckhpd %xmm5, %xmm0
1941 unpcklpd %xmm7, %xmm6
1942 unpckhpd %xmm7, %xmm1
1944 movapd 0 * SIZE(B), %xmm2
1945 movapd 2 * SIZE(B), %xmm5
1946 movapd 4 * SIZE(B), %xmm3
1947 movapd 6 * SIZE(B), %xmm7
1954 movapd 0 * SIZE(AA), %xmm0
1955 movapd 2 * SIZE(AA), %xmm1
1956 movapd 4 * SIZE(AA), %xmm2
1957 movapd 6 * SIZE(AA), %xmm3
1966 movlpd 3 * SIZE(AA), %xmm4
1967 movhpd 3 * SIZE(AA), %xmm4
1971 movlpd 2 * SIZE(AA), %xmm4
1972 movhpd 2 * SIZE(AA), %xmm4
1979 movlpd 0 * SIZE(AA), %xmm4
1980 movhpd 0 * SIZE(AA), %xmm4
1987 movlpd 0 * SIZE(AA), %xmm4
1988 movhpd 0 * SIZE(AA), %xmm4
1992 movlpd 1 * SIZE(AA), %xmm4
1993 movhpd 1 * SIZE(AA), %xmm4
2000 movlpd 3 * SIZE(AA), %xmm4
2001 movhpd 3 * SIZE(AA), %xmm4
2007 movlpd 0 * SIZE(B), %xmm4
2008 movhpd 0 * SIZE(B), %xmm4
2010 movlpd 1 * SIZE(B), %xmm4
2011 movhpd 1 * SIZE(B), %xmm4
2014 movlpd 2 * SIZE(B), %xmm4
2015 movhpd 2 * SIZE(B), %xmm4
2018 movlpd 3 * SIZE(B), %xmm4
2019 movhpd 3 * SIZE(B), %xmm4
2023 movlpd 5 * SIZE(B), %xmm4
2024 movhpd 5 * SIZE(B), %xmm4
2026 movlpd 6 * SIZE(B), %xmm4
2027 movhpd 6 * SIZE(B), %xmm4
2030 movlpd 7 * SIZE(B), %xmm4
2031 movhpd 7 * SIZE(B), %xmm4
2035 movlpd 10 * SIZE(B), %xmm4
2036 movhpd 10 * SIZE(B), %xmm4
2038 movlpd 11 * SIZE(B), %xmm4
2039 movhpd 11 * SIZE(B), %xmm4
2043 movlpd 15 * SIZE(B), %xmm4
2044 movhpd 15 * SIZE(B), %xmm4
2049 movlpd 15 * SIZE(B), %xmm4
2050 movhpd 15 * SIZE(B), %xmm4
2052 movlpd 14 * SIZE(B), %xmm4
2053 movhpd 14 * SIZE(B), %xmm4
2056 movlpd 13 * SIZE(B), %xmm4
2057 movhpd 13 * SIZE(B), %xmm4
2060 movlpd 12 * SIZE(B), %xmm4
2061 movhpd 12 * SIZE(B), %xmm4
2065 movlpd 10 * SIZE(B), %xmm4
2066 movhpd 10 * SIZE(B), %xmm4
2068 movlpd 9 * SIZE(B), %xmm4
2069 movhpd 9 * SIZE(B), %xmm4
2072 movlpd 8 * SIZE(B), %xmm4
2073 movhpd 8 * SIZE(B), %xmm4
2077 movlpd 5 * SIZE(B), %xmm4
2078 movhpd 5 * SIZE(B), %xmm4
2080 movlpd 4 * SIZE(B), %xmm4
2081 movhpd 4 * SIZE(B), %xmm4
2085 movlpd 0 * SIZE(B), %xmm4
2086 movhpd 0 * SIZE(B), %xmm4
2090 #if defined(LN) || defined(LT)
2091 movapd %xmm2, 0 * SIZE(B)
2092 movapd %xmm5, 2 * SIZE(B)
2093 movapd %xmm3, 4 * SIZE(B)
2094 movapd %xmm7, 6 * SIZE(B)
2096 movlpd %xmm2, 0 * SIZE(BB)
2097 movlpd %xmm2, 1 * SIZE(BB)
2098 movhpd %xmm2, 2 * SIZE(BB)
2099 movhpd %xmm2, 3 * SIZE(BB)
2100 movlpd %xmm5, 4 * SIZE(BB)
2101 movlpd %xmm5, 5 * SIZE(BB)
2102 movhpd %xmm5, 6 * SIZE(BB)
2103 movhpd %xmm5, 7 * SIZE(BB)
2104 movlpd %xmm3, 8 * SIZE(BB)
2105 movlpd %xmm3, 9 * SIZE(BB)
2106 movhpd %xmm3, 10 * SIZE(BB)
2107 movhpd %xmm3, 11 * SIZE(BB)
2108 movlpd %xmm7, 12 * SIZE(BB)
2109 movlpd %xmm7, 13 * SIZE(BB)
2110 movhpd %xmm7, 14 * SIZE(BB)
2111 movhpd %xmm7, 15 * SIZE(BB)
2113 movapd %xmm0, 0 * SIZE(AA)
2114 movapd %xmm1, 2 * SIZE(AA)
2115 movapd %xmm2, 4 * SIZE(AA)
2116 movapd %xmm3, 6 * SIZE(AA)
2123 leal (LDC, LDC, 2), %eax
2125 #if defined(LN) || defined(LT)
2126 movlpd %xmm2, 0 * SIZE(CO1)
2127 movlpd %xmm3, 1 * SIZE(CO1)
2128 movhpd %xmm2, 0 * SIZE(CO1, LDC, 1)
2129 movhpd %xmm3, 1 * SIZE(CO1, LDC, 1)
2130 movlpd %xmm5, 0 * SIZE(CO1, LDC, 2)
2131 movlpd %xmm7, 1 * SIZE(CO1, LDC, 2)
2132 movhpd %xmm5, 0 * SIZE(CO1, %eax, 1)
2133 movhpd %xmm7, 1 * SIZE(CO1, %eax, 1)
2135 movlpd %xmm0, 0 * SIZE(CO1)
2136 movhpd %xmm0, 1 * SIZE(CO1)
2137 movlpd %xmm1, 0 * SIZE(CO1, LDC, 1)
2138 movhpd %xmm1, 1 * SIZE(CO1, LDC, 1)
2139 movlpd %xmm2, 0 * SIZE(CO1, LDC, 2)
2140 movhpd %xmm2, 1 * SIZE(CO1, LDC, 2)
2141 movlpd %xmm3, 0 * SIZE(CO1, %eax, 1)
2142 movhpd %xmm3, 1 * SIZE(CO1, %eax, 1)
2149 #if defined(LT) || defined(RN)
2152 leal (,%eax, SIZE), %eax
2153 leal (AA, %eax, 2), AA
2171 sall $1 + BASE_SHIFT, %eax
2181 testl $1, %ebx # i = (m >> 2)
2186 sall $BASE_SHIFT, %eax
2190 #if defined(LN) || defined(RT)
2193 leal (AA, %eax, SIZE), AA
2198 #if defined(LN) || defined(RT)
2200 sall $3 + BASE_SHIFT, %eax
2209 movlpd 0 * SIZE(AA), %xmm0
2210 movlpd 4 * SIZE(AA), %xmm1
2211 movlpd 0 * SIZE(BB), %xmm2
2212 movlpd 8 * SIZE(BB), %xmm3
2214 #if defined(LT) || defined(RN)
2227 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
2228 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
2230 movlpd 2 * SIZE(BB), %xmm2
2233 movlpd 4 * SIZE(BB), %xmm2
2235 mulsd 6 * SIZE(BB), %xmm0
2237 movlpd 16 * SIZE(BB), %xmm2
2239 movlpd 1 * SIZE(AA), %xmm0
2242 movlpd 10 * SIZE(BB), %xmm3
2245 movlpd 12 * SIZE(BB), %xmm3
2247 mulsd 14 * SIZE(BB), %xmm0
2249 movlpd 24 * SIZE(BB), %xmm3
2251 movlpd 2 * SIZE(AA), %xmm0
2254 movlpd 18 * SIZE(BB), %xmm2
2257 movlpd 20 * SIZE(BB), %xmm2
2259 mulsd 22 * SIZE(BB), %xmm0
2261 movlpd 32 * SIZE(BB), %xmm2
2263 movlpd 3 * SIZE(AA), %xmm0
2266 movlpd 26 * SIZE(BB), %xmm3
2269 movlpd 28 * SIZE(BB), %xmm3
2271 mulsd 30 * SIZE(BB), %xmm0
2273 movlpd 40 * SIZE(BB), %xmm3
2275 movlpd 8 * SIZE(AA), %xmm0
2276 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
2277 PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
2281 movlpd 34 * SIZE(BB), %xmm2
2284 movlpd 36 * SIZE(BB), %xmm2
2286 mulsd 38 * SIZE(BB), %xmm1
2288 movlpd 48 * SIZE(BB), %xmm2
2290 movlpd 5 * SIZE(AA), %xmm1
2293 movlpd 42 * SIZE(BB), %xmm3
2296 movlpd 44 * SIZE(BB), %xmm3
2298 mulsd 46 * SIZE(BB), %xmm1
2300 movlpd 56 * SIZE(BB), %xmm3
2302 movlpd 6 * SIZE(AA), %xmm1
2305 movlpd 50 * SIZE(BB), %xmm2
2308 movlpd 52 * SIZE(BB), %xmm2
2310 mulsd 54 * SIZE(BB), %xmm1
2312 movlpd 64 * SIZE(BB), %xmm2
2314 movlpd 7 * SIZE(AA), %xmm1
2317 movlpd 58 * SIZE(BB), %xmm3
2320 movlpd 60 * SIZE(BB), %xmm3
2322 mulsd 62 * SIZE(BB), %xmm1
2324 movlpd 72 * SIZE(BB), %xmm3
2327 movlpd 12 * SIZE(AA), %xmm1
2334 #if defined(LT) || defined(RN)
2340 andl $7, %eax # if (k & 1)
2347 movlpd 2 * SIZE(BB), %xmm2
2350 movlpd 4 * SIZE(BB), %xmm2
2352 mulsd 6 * SIZE(BB), %xmm0
2354 movlpd 8 * SIZE(BB), %xmm2
2356 movlpd 1 * SIZE(AA), %xmm0
2365 #if defined(LN) || defined(RT)
2377 leal (, %eax, SIZE), %eax
2379 leal (B, %eax, 4), B
2380 leal (BB, %eax, 8), BB
2383 #if defined(LN) || defined(LT)
2384 unpcklpd %xmm5, %xmm4
2385 unpcklpd %xmm7, %xmm6
2387 movapd 0 * SIZE(B), %xmm2
2388 movapd 2 * SIZE(B), %xmm5
2393 movlpd 0 * SIZE(AA), %xmm0
2394 movlpd 1 * SIZE(AA), %xmm1
2395 movlpd 2 * SIZE(AA), %xmm2
2396 movlpd 3 * SIZE(AA), %xmm3
2405 movlpd 0 * SIZE(AA), %xmm4
2406 movhpd 0 * SIZE(AA), %xmm4
2412 movlpd 0 * SIZE(AA), %xmm4
2413 movhpd 0 * SIZE(AA), %xmm4
2419 movlpd 0 * SIZE(B), %xmm4
2421 movlpd 1 * SIZE(B), %xmm4
2424 movlpd 2 * SIZE(B), %xmm4
2427 movlpd 3 * SIZE(B), %xmm4
2431 movlpd 5 * SIZE(B), %xmm4
2433 movlpd 6 * SIZE(B), %xmm4
2436 movlpd 7 * SIZE(B), %xmm4
2440 movlpd 10 * SIZE(B), %xmm4
2442 movlpd 11 * SIZE(B), %xmm4
2446 movlpd 15 * SIZE(B), %xmm4
2451 movlpd 15 * SIZE(B), %xmm4
2453 movlpd 14 * SIZE(B), %xmm4
2456 movlpd 13 * SIZE(B), %xmm4
2459 movlpd 12 * SIZE(B), %xmm4
2463 movlpd 10 * SIZE(B), %xmm4
2465 movlpd 9 * SIZE(B), %xmm4
2468 movlpd 8 * SIZE(B), %xmm4
2472 movlpd 5 * SIZE(B), %xmm4
2474 movlpd 4 * SIZE(B), %xmm4
2478 movlpd 0 * SIZE(B), %xmm4
2482 #if defined(LN) || defined(LT)
2483 movapd %xmm2, 0 * SIZE(B)
2484 movapd %xmm5, 2 * SIZE(B)
2486 movlpd %xmm2, 0 * SIZE(BB)
2487 movlpd %xmm2, 1 * SIZE(BB)
2488 movhpd %xmm2, 2 * SIZE(BB)
2489 movhpd %xmm2, 3 * SIZE(BB)
2490 movlpd %xmm5, 4 * SIZE(BB)
2491 movlpd %xmm5, 5 * SIZE(BB)
2492 movhpd %xmm5, 6 * SIZE(BB)
2493 movhpd %xmm5, 7 * SIZE(BB)
2495 movlpd %xmm0, 0 * SIZE(AA)
2496 movlpd %xmm1, 1 * SIZE(AA)
2497 movlpd %xmm2, 2 * SIZE(AA)
2498 movlpd %xmm3, 3 * SIZE(AA)
2505 leal (LDC, LDC, 2), %eax
2507 #if defined(LN) || defined(LT)
2508 movlpd %xmm2, 0 * SIZE(CO1)
2509 movhpd %xmm2, 0 * SIZE(CO1, LDC, 1)
2510 movlpd %xmm5, 0 * SIZE(CO1, LDC, 2)
2511 movhpd %xmm5, 0 * SIZE(CO1, %eax, 1)
2513 movlpd %xmm0, 0 * SIZE(CO1)
2514 movlpd %xmm1, 0 * SIZE(CO1, LDC, 1)
2515 movlpd %xmm2, 0 * SIZE(CO1, LDC, 2)
2516 movlpd %xmm3, 0 * SIZE(CO1, %eax, 1)
2523 #if defined(LT) || defined(RN)
2526 leal (AA,%eax, SIZE), AA
2544 sall $BASE_SHIFT, %eax
2552 leal (, %eax, SIZE), %eax
2553 leal (B, %eax, 4), B
2556 #if defined(LT) || defined(RN)
2559 leal (,%eax, SIZE), %eax
2560 leal (B, %eax, 4), B
2577 movl OLD_STACK, %esp