1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin. */
3 /* All rights reserved. */
5 /* Redistribution and use in source and binary forms, with or */
6 /* without modification, are permitted provided that the following */
7 /* conditions are met: */
9 /* 1. Redistributions of source code must retain the above */
10 /* copyright notice, this list of conditions and the following */
11 /* disclaimer. */
13 /* 2. Redistributions in binary form must reproduce the above */
14 /* copyright notice, this list of conditions and the following */
15 /* disclaimer in the documentation and/or other materials */
16 /* provided with the distribution. */
18 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31 /* POSSIBILITY OF SUCH DAMAGE. */
33 /* The views and conclusions contained in the software and */
34 /* documentation are those of the authors and should not be */
35 /* interpreted as representing official policies, either expressed */
36 /* or implied, of The University of Texas at Austin. */
37 /*********************************************************************/
44 #define OLD_M 4 + STACK(%esi)
45 #define OLD_N 8 + STACK(%esi)
46 #define OLD_K 12 + STACK(%esi)
47 #define OLD_A 20 + STACK(%esi)
48 #define OLD_B 24 + STACK(%esi)
49 #define OLD_C 28 + STACK(%esi)
50 #define OLD_LDC 32 + STACK(%esi)
51 #define STACK_OFFT 36 + STACK(%esi)
59 #define OLD_STACK 40(%esp)
60 #define OFFSET 44(%esp)
63 #define AORIG 56(%esp)
64 #define BORIG 60(%esp)
65 #define BUFFER 128(%esp)
67 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
68 #define PREFETCH prefetch
69 #define PREFETCHW prefetchw
70 #define PREFETCHSIZE (16 * 10 + 8)
73 #if defined(PENTIUM4) || defined(PENTIUMM)
74 #define PREFETCH prefetcht0
75 #define PREFETCHW prefetcht0
76 #define PREFETCHSIZE 96
79 #if defined(PENRYN) || defined(DUNNINGTON)
80 #define PREFETCH prefetcht0
81 #define PREFETCHW prefetcht0
82 #define PREFETCHSIZE 96
91 #if defined(OPTERON) || !defined(HAVE_SSE2)
99 #define KERNEL1(address) \
100 mulps %xmm0, %xmm2; \
101 addps %xmm2, %xmm4; \
102 movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
103 mulps %xmm0, %xmm2; \
104 PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \
105 addps %xmm2, %xmm5; \
106 movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
107 mulps %xmm0, %xmm2; \
108 mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
109 addps %xmm2, %xmm6; \
110 movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
111 addps %xmm0, %xmm7; \
112 movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0
114 #define KERNEL2(address) \
115 mulps %xmm0, %xmm3; \
116 addps %xmm3, %xmm4; \
117 movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
118 mulps %xmm0, %xmm3; \
119 addps %xmm3, %xmm5; \
120 movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
121 mulps %xmm0, %xmm3; \
122 mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
123 addps %xmm3, %xmm6; \
124 movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
125 addps %xmm0, %xmm7; \
126 movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0
128 #define KERNEL3(address) \
129 mulps %xmm0, %xmm2; \
130 addps %xmm2, %xmm4; \
131 movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
132 mulps %xmm0, %xmm2; \
133 addps %xmm2, %xmm5; \
134 movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
135 mulps %xmm0, %xmm2; \
136 mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
137 addps %xmm2, %xmm6; \
138 movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
139 addps %xmm0, %xmm7; \
140 movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0
142 #define KERNEL4(address) \
143 mulps %xmm0, %xmm3; \
144 addps %xmm3, %xmm4; \
145 movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
146 mulps %xmm0, %xmm3; \
147 addps %xmm3, %xmm5; \
148 movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
149 mulps %xmm0, %xmm3; \
150 mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
151 addps %xmm3, %xmm6; \
152 movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
153 addps %xmm0, %xmm7; \
154 movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0
156 #define KERNEL5(address) \
157 mulps %xmm1, %xmm2; \
158 addps %xmm2, %xmm4; \
159 movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
160 mulps %xmm1, %xmm2; \
161 addps %xmm2, %xmm5; \
162 movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
163 mulps %xmm1, %xmm2; \
164 mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
165 addps %xmm2, %xmm6; \
166 movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
167 addps %xmm1, %xmm7; \
168 movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1
170 #define KERNEL6(address) \
171 mulps %xmm1, %xmm3; \
172 addps %xmm3, %xmm4; \
173 movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
174 mulps %xmm1, %xmm3; \
175 addps %xmm3, %xmm5; \
176 movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
177 mulps %xmm1, %xmm3; \
178 mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
179 addps %xmm3, %xmm6; \
180 movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
181 addps %xmm1, %xmm7; \
182 movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1
184 #define KERNEL7(address) \
185 mulps %xmm1, %xmm2; \
186 addps %xmm2, %xmm4; \
187 movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
188 mulps %xmm1, %xmm2; \
189 addps %xmm2, %xmm5; \
190 movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
191 mulps %xmm1, %xmm2; \
192 mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
193 addps %xmm2, %xmm6; \
194 movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
195 addps %xmm1, %xmm7; \
196 movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1
198 #define KERNEL8(address) \
199 mulps %xmm1, %xmm3; \
200 addps %xmm3, %xmm4; \
201 movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
202 mulps %xmm1, %xmm3; \
203 addps %xmm3, %xmm5; \
204 movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
205 mulps %xmm1, %xmm3; \
206 mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
207 addps %xmm3, %xmm6; \
208 movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
209 addps %xmm1, %xmm7; \
210 movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1;
223 subl $128 + LOCAL_BUFFER_SIZE, %esp
238 movss STACK_OFFT, %xmm4
249 leal (, LDC, SIZE), LDC
253 leal (, %eax, SIZE), %eax
261 leal (, %eax, SIZE), %eax
292 sall $BASE_SHIFT, %eax
296 #if defined(LN) || defined(RT)
299 sall $BASE_SHIFT, %eax
301 leal (BB, %eax, 4), BB
309 #if defined(LT) || defined(RN)
320 movsd 0 * SIZE(B), %xmm3
321 movhps 2 * SIZE(B), %xmm3
322 movsd 4 * SIZE(B), %xmm7
323 movhps 6 * SIZE(B), %xmm7
325 pshufd $0x00, %xmm3, %xmm0
326 pshufd $0x55, %xmm3, %xmm1
327 pshufd $0xaa, %xmm3, %xmm2
328 pshufd $0xff, %xmm3, %xmm3
330 pshufd $0x00, %xmm7, %xmm4
331 pshufd $0x55, %xmm7, %xmm5
332 pshufd $0xaa, %xmm7, %xmm6
333 pshufd $0xff, %xmm7, %xmm7
335 movaps %xmm0, 0 * SIZE(BB)
336 movaps %xmm1, 4 * SIZE(BB)
337 movaps %xmm2, 8 * SIZE(BB)
338 movaps %xmm3, 12 * SIZE(BB)
339 movaps %xmm4, 16 * SIZE(BB)
340 movaps %xmm5, 20 * SIZE(BB)
341 movaps %xmm6, 24 * SIZE(BB)
342 movaps %xmm7, 28 * SIZE(BB)
351 #if defined(LT) || defined(RN)
363 movss 0 * SIZE(B), %xmm3
365 pshufd $0x00, %xmm3, %xmm0
367 movaps %xmm0, 0 * SIZE(BB)
376 #if defined(LT) || defined(RN)
392 sarl $2, %ebx # i = (m >> 2)
399 sall $2 + BASE_SHIFT, %eax
403 #if defined(LN) || defined(RT)
406 leal (, %eax, SIZE), %eax
407 leal (AA, %eax, 4), AA
412 #if defined(LN) || defined(RT)
414 sall $BASE_SHIFT, %eax
415 leal (BB, %eax, 4), BB
423 movaps 0 * SIZE(AA), %xmm0
424 movaps 16 * SIZE(AA), %xmm1
425 movaps 0 * SIZE(BB), %xmm2
426 movaps 16 * SIZE(BB), %xmm3
428 PREFETCHW 3 * SIZE(CO1)
430 #if defined(LT) || defined(RN)
442 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
443 prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
445 movaps 4 * SIZE(AA), %xmm0
447 movaps 32 * SIZE(BB), %xmm2
448 mulps 4 * SIZE(BB), %xmm0
450 movaps 8 * SIZE(AA), %xmm0
451 mulps 8 * SIZE(BB), %xmm0
453 movaps 12 * SIZE(AA), %xmm0
454 mulps 12 * SIZE(BB), %xmm0
456 movaps 32 * SIZE(AA), %xmm0
457 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
458 prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
461 movaps 20 * SIZE(AA), %xmm1
463 movaps 48 * SIZE(BB), %xmm3
464 mulps 20 * SIZE(BB), %xmm1
466 movaps 24 * SIZE(AA), %xmm1
467 mulps 24 * SIZE(BB), %xmm1
469 movaps 28 * SIZE(AA), %xmm1
470 mulps 28 * SIZE(BB), %xmm1
472 movaps 48 * SIZE(AA), %xmm1
481 #if defined(LT) || defined(RN)
487 andl $7, %eax # if (k & 1)
495 movaps 4 * SIZE(AA), %xmm0
496 movaps 4 * SIZE(BB), %xmm2
509 #if defined(LN) || defined(RT)
521 sall $ BASE_SHIFT, %eax
522 leal (AA, %eax, 4), AA
524 leal (BB, %eax, 4), BB
527 #if defined(LN) || defined(LT)
529 unpcklps %xmm6, %xmm4
530 unpckhps %xmm6, %xmm0
533 unpcklps %xmm7, %xmm5
534 unpckhps %xmm7, %xmm1
537 unpcklps %xmm5, %xmm4
538 unpckhps %xmm5, %xmm6
541 unpcklps %xmm1, %xmm0
542 unpckhps %xmm1, %xmm2
544 movss 0 * SIZE(B), %xmm1
545 movss 1 * SIZE(B), %xmm3
546 movss 2 * SIZE(B), %xmm5
547 movss 3 * SIZE(B), %xmm7
554 movaps 0 * SIZE(AA), %xmm0
560 movaps 12 * SIZE(AA), %xmm4
561 pshufd $0xff, %xmm4, %xmm6
563 pshufd $0xaa, %xmm4, %xmm6
566 pshufd $0x55, %xmm4, %xmm6
569 pshufd $0x00, %xmm4, %xmm6
573 movaps 8 * SIZE(AA), %xmm4
574 pshufd $0xaa, %xmm4, %xmm6
576 pshufd $0x55, %xmm4, %xmm6
579 pshufd $0x00, %xmm4, %xmm6
583 movaps 4 * SIZE(AA), %xmm4
584 pshufd $0x55, %xmm4, %xmm6
586 pshufd $0x00, %xmm4, %xmm6
590 movaps 0 * SIZE(AA), %xmm4
591 pshufd $0x00, %xmm4, %xmm6
596 movaps 0 * SIZE(AA), %xmm4
597 pshufd $0x00, %xmm4, %xmm6
600 pshufd $0x55, %xmm4, %xmm6
603 pshufd $0xaa, %xmm4, %xmm6
606 pshufd $0xff, %xmm4, %xmm6
610 movaps 4 * SIZE(AA), %xmm4
611 pshufd $0x55, %xmm4, %xmm6
613 pshufd $0xaa, %xmm4, %xmm6
616 pshufd $0xff, %xmm4, %xmm6
620 movaps 8 * SIZE(AA), %xmm4
621 pshufd $0xaa, %xmm4, %xmm6
623 pshufd $0xff, %xmm4, %xmm6
627 movaps 12 * SIZE(AA), %xmm4
628 pshufd $0xff, %xmm4, %xmm6
632 #if defined(RN) || defined(RT)
633 movss 0 * SIZE(B), %xmm6
634 pshufd $0x00, %xmm6, %xmm7
638 #if defined(LN) || defined(LT)
639 movss %xmm1, 0 * SIZE(B)
640 movss %xmm3, 1 * SIZE(B)
641 movss %xmm5, 2 * SIZE(B)
642 movss %xmm7, 3 * SIZE(B)
644 pshufd $0x00, %xmm1, %xmm0
645 movaps %xmm0, 0 * SIZE(BB)
646 pshufd $0x00, %xmm3, %xmm0
647 movaps %xmm0, 4 * SIZE(BB)
649 pshufd $0x00, %xmm5, %xmm0
650 movaps %xmm0, 8 * SIZE(BB)
651 pshufd $0x00, %xmm7, %xmm0
652 movaps %xmm0, 12 * SIZE(BB)
654 movaps %xmm0, 0 * SIZE(AA)
661 #if defined(LN) || defined(LT)
662 unpcklps %xmm5, %xmm1
663 unpcklps %xmm7, %xmm3
665 unpcklps %xmm3, %xmm1
667 movlps %xmm1, 0 * SIZE(CO1)
668 movhps %xmm1, 2 * SIZE(CO1)
670 movlps %xmm0, 0 * SIZE(CO1)
671 movhps %xmm0, 2 * SIZE(CO1)
678 #if defined(LT) || defined(RN)
681 leal (,%eax, SIZE), %eax
682 leal (AA, %eax, 4), AA
700 sall $2 + BASE_SHIFT, %eax
714 sall $1 + BASE_SHIFT, %eax
718 #if defined(LN) || defined(RT)
721 leal (, %eax, SIZE), %eax
722 leal (AA, %eax, 2), AA
727 #if defined(LN) || defined(RT)
729 sall $BASE_SHIFT, %eax
730 leal (BB, %eax, 4), BB
741 movsd 0 * SIZE(AA), %xmm0
745 movsd 8 * SIZE(AA), %xmm1
746 movaps 0 * SIZE(BB), %xmm2
747 movaps 16 * SIZE(BB), %xmm3
749 #if defined(LT) || defined(RN)
761 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
762 prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
764 movsd 2 * SIZE(AA), %xmm0
766 movaps 4 * SIZE(BB), %xmm2
768 movsd 4 * SIZE(AA), %xmm0
770 movaps 8 * SIZE(BB), %xmm2
772 movsd 6 * SIZE(AA), %xmm0
774 movaps 12 * SIZE(BB), %xmm2
776 movsd 16 * SIZE(AA), %xmm0
778 movaps 32 * SIZE(BB), %xmm2
780 movsd 10 * SIZE(AA), %xmm1
782 movaps 20 * SIZE(BB), %xmm3
784 movsd 12 * SIZE(AA), %xmm1
786 movaps 24 * SIZE(BB), %xmm3
788 movsd 14 * SIZE(AA), %xmm1
790 movaps 28 * SIZE(BB), %xmm3
792 movsd 24 * SIZE(AA), %xmm1
794 movaps 48 * SIZE(BB), %xmm3
803 #if defined(LT) || defined(RN)
809 andl $7, %eax # if (k & 1)
817 movsd 2 * SIZE(AA), %xmm0
818 movaps 4 * SIZE(BB), %xmm2
831 #if defined(LN) || defined(RT)
843 sall $ BASE_SHIFT, %eax
844 leal (AA, %eax, 2), AA
846 leal (BB, %eax, 4), BB
849 #if defined(LN) || defined(LT)
850 pshufd $1, %xmm4, %xmm6
852 movss 0 * SIZE(B), %xmm1
853 movss 1 * SIZE(B), %xmm3
861 movsd 0 * SIZE(AA), %xmm0
867 movaps 0 * SIZE(AA), %xmm4
868 pshufd $0xff, %xmm4, %xmm6
870 pshufd $0xaa, %xmm4, %xmm6
874 pshufd $0x00, %xmm4, %xmm6
879 movaps 0 * SIZE(AA), %xmm4
880 pshufd $0x00, %xmm4, %xmm6
882 pshufd $0x55, %xmm4, %xmm6
886 pshufd $0xff, %xmm4, %xmm6
890 #if defined(RN) || defined(RT)
891 movss 0 * SIZE(B), %xmm6
892 pshufd $0x00, %xmm6, %xmm7
896 #if defined(LN) || defined(LT)
897 movss %xmm1, 0 * SIZE(B)
898 movss %xmm3, 1 * SIZE(B)
900 pshufd $0x00, %xmm1, %xmm0
901 movaps %xmm0, 0 * SIZE(BB)
902 pshufd $0x00, %xmm3, %xmm0
903 movaps %xmm0, 4 * SIZE(BB)
905 movlps %xmm0, 0 * SIZE(AA)
912 #if defined(LN) || defined(LT)
913 movss %xmm1, 0 * SIZE(CO1)
914 movss %xmm3, 1 * SIZE(CO1)
916 movlps %xmm0, 0 * SIZE(CO1)
923 #if defined(LT) || defined(RN)
926 leal (,%eax, SIZE), %eax
927 leal (AA, %eax, 2), AA
945 sall $1 + BASE_SHIFT, %eax
956 sall $BASE_SHIFT, %eax
960 #if defined(LN) || defined(RT)
963 leal (AA, %eax, SIZE), AA
968 #if defined(LN) || defined(RT)
970 sall $BASE_SHIFT, %eax
971 leal (BB, %eax, 4), BB
979 movss 0 * SIZE(AA), %xmm0
980 movss 4 * SIZE(AA), %xmm1
981 movss 0 * SIZE(BB), %xmm2
982 movss 16 * SIZE(BB), %xmm3
984 #if defined(LT) || defined(RN)
996 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
997 prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
999 movss 1 * SIZE(AA), %xmm0
1001 movss 32 * SIZE(BB), %xmm2
1002 mulss 4 * SIZE(BB), %xmm0
1004 movss 2 * SIZE(AA), %xmm0
1005 mulss 8 * SIZE(BB), %xmm0
1007 movss 3 * SIZE(AA), %xmm0
1008 mulss 12 * SIZE(BB), %xmm0
1010 movss 8 * SIZE(AA), %xmm0
1012 movss 5 * SIZE(AA), %xmm1
1014 movss 48 * SIZE(BB), %xmm3
1015 mulss 20 * SIZE(BB), %xmm1
1017 movss 6 * SIZE(AA), %xmm1
1018 mulss 24 * SIZE(BB), %xmm1
1020 movss 7 * SIZE(AA), %xmm1
1021 mulss 28 * SIZE(BB), %xmm1
1023 movss 12 * SIZE(AA), %xmm1
1032 #if defined(LT) || defined(RN)
1038 andl $7, %eax # if (k & 1)
1045 movss 1 * SIZE(AA), %xmm0
1047 movss 4 * SIZE(BB), %xmm2
1060 #if defined(LN) || defined(RT)
1068 sall $ BASE_SHIFT, %eax
1069 leal (AA, %eax, 1), AA
1070 leal (B, %eax, 1), B
1071 leal (BB, %eax, 4), BB
1074 #if defined(LN) || defined(LT)
1075 movss 0 * SIZE(B), %xmm1
1078 movss 0 * SIZE(AA), %xmm0
1082 #if defined(LN) || defined(LT)
1083 mulss 0 * SIZE(AA), %xmm1
1086 #if defined(RN) || defined(RT)
1087 mulss 0 * SIZE(B), %xmm0
1090 #if defined(LN) || defined(LT)
1091 movss %xmm1, 0 * SIZE(B)
1093 pshufd $0x00, %xmm1, %xmm0
1094 movaps %xmm0, 0 * SIZE(BB)
1096 movss %xmm0, 0 * SIZE(AA)
1103 #if defined(LN) || defined(LT)
1104 movss %xmm1, 0 * SIZE(CO1)
1106 movss %xmm0, 0 * SIZE(CO1)
1113 #if defined(LT) || defined(RN)
1116 leal (AA, %eax, SIZE), AA
1134 sall $BASE_SHIFT, %eax
1142 leal (B, %eax, SIZE), B
1145 #if defined(LT) || defined(RN)
1148 leal (B, %eax, SIZE), B
1174 sall $1 + BASE_SHIFT, %eax
1178 #if defined(LN) || defined(RT)
1181 sall $1 + BASE_SHIFT, %eax
1182 leal (B, %eax, 1), B
1183 leal (BB, %eax, 4), BB
1191 #if defined(LT) || defined(RN)
1202 movaps 0 * SIZE(B), %xmm3
1203 movaps 4 * SIZE(B), %xmm7
1205 pshufd $0x00, %xmm3, %xmm0
1206 pshufd $0x55, %xmm3, %xmm1
1207 pshufd $0xaa, %xmm3, %xmm2
1208 pshufd $0xff, %xmm3, %xmm3
1210 pshufd $0x00, %xmm7, %xmm4
1211 pshufd $0x55, %xmm7, %xmm5
1212 pshufd $0xaa, %xmm7, %xmm6
1213 pshufd $0xff, %xmm7, %xmm7
1215 movaps %xmm0, 0 * SIZE(BB)
1216 movaps %xmm1, 4 * SIZE(BB)
1217 movaps %xmm2, 8 * SIZE(BB)
1218 movaps %xmm3, 12 * SIZE(BB)
1219 movaps %xmm4, 16 * SIZE(BB)
1220 movaps %xmm5, 20 * SIZE(BB)
1221 movaps %xmm6, 24 * SIZE(BB)
1222 movaps %xmm7, 28 * SIZE(BB)
1225 addl $32 * SIZE, %ecx
1231 #if defined(LT) || defined(RN)
1243 movsd 0 * SIZE(B), %xmm3
1245 pshufd $0x00, %xmm3, %xmm0
1246 pshufd $0x55, %xmm3, %xmm1
1248 movaps %xmm0, 0 * SIZE(BB)
1249 movaps %xmm1, 4 * SIZE(BB)
1252 addl $8 * SIZE, %ecx
1258 #if defined(LT) || defined(RN)
1265 leal (, LDC, 2), %eax
1276 sarl $2, %ebx # i = (m >> 2)
1283 sall $2 + BASE_SHIFT, %eax
1287 #if defined(LN) || defined(RT)
1290 leal (, %eax, SIZE), %eax
1291 leal (AA, %eax, 4), AA
1296 #if defined(LN) || defined(RT)
1298 sall $1 + BASE_SHIFT, %eax
1299 leal (BB, %eax, 4), BB
1307 movaps 0 * SIZE(AA), %xmm0
1308 movaps 16 * SIZE(AA), %xmm1
1309 movaps 0 * SIZE(BB), %xmm2
1310 movaps 16 * SIZE(BB), %xmm3
1312 PREFETCHW 3 * SIZE(CO1)
1313 PREFETCHW 3 * SIZE(CO1, LDC)
1315 #if defined(LT) || defined(RN)
1327 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
1328 prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
1330 mulps 4 * SIZE(BB), %xmm0
1332 movaps 8 * SIZE(BB), %xmm2
1334 movaps 4 * SIZE(AA), %xmm0
1337 mulps 12 * SIZE(BB), %xmm0
1339 movaps 32 * SIZE(BB), %xmm2
1341 movaps 8 * SIZE(AA), %xmm0
1344 mulps 20 * SIZE(BB), %xmm0
1346 movaps 24 * SIZE(BB), %xmm3
1348 movaps 12 * SIZE(AA), %xmm0
1351 mulps 28 * SIZE(BB), %xmm0
1353 movaps 48 * SIZE(BB), %xmm3
1355 movaps 32 * SIZE(AA), %xmm0
1357 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
1358 prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
1361 mulps 36 * SIZE(BB), %xmm1
1363 movaps 40 * SIZE(BB), %xmm2
1365 movaps 20 * SIZE(AA), %xmm1
1368 mulps 44 * SIZE(BB), %xmm1
1370 movaps 64 * SIZE(BB), %xmm2
1372 movaps 24 * SIZE(AA), %xmm1
1375 mulps 52 * SIZE(BB), %xmm1
1377 movaps 56 * SIZE(BB), %xmm3
1379 movaps 28 * SIZE(AA), %xmm1
1382 mulps 60 * SIZE(BB), %xmm1
1384 movaps 80 * SIZE(BB), %xmm3
1386 movaps 48 * SIZE(AA), %xmm1
1395 #if defined(LT) || defined(RN)
1401 andl $7, %eax # if (k & 1)
1408 mulps 4 * SIZE(BB), %xmm0
1410 movaps 8 * SIZE(BB), %xmm2
1412 movaps 4 * SIZE(AA), %xmm0
1421 #if defined(LN) || defined(RT)
1433 sall $1 + BASE_SHIFT, %eax
1434 leal (AA, %eax, 2), AA
1435 leal (B, %eax, 1), B
1436 leal (BB, %eax, 4), BB
1439 #if defined(LN) || defined(LT)
1441 unpcklps %xmm6, %xmm4
1442 unpckhps %xmm6, %xmm0
1445 unpcklps %xmm7, %xmm5
1446 unpckhps %xmm7, %xmm1
1449 unpcklps %xmm5, %xmm4
1450 unpckhps %xmm5, %xmm6
1453 unpcklps %xmm1, %xmm0
1454 unpckhps %xmm1, %xmm2
1459 movsd 0 * SIZE(B), %xmm1
1463 movsd 2 * SIZE(B), %xmm3
1467 movsd 4 * SIZE(B), %xmm5
1471 movsd 6 * SIZE(B), %xmm7
1478 movaps 0 * SIZE(AA), %xmm0
1479 movaps 4 * SIZE(AA), %xmm1
1486 movaps 12 * SIZE(AA), %xmm4
1487 pshufd $0xff, %xmm4, %xmm6
1489 pshufd $0xaa, %xmm4, %xmm6
1492 pshufd $0x55, %xmm4, %xmm6
1495 pshufd $0x00, %xmm4, %xmm6
1499 movaps 8 * SIZE(AA), %xmm4
1500 pshufd $0xaa, %xmm4, %xmm6
1502 pshufd $0x55, %xmm4, %xmm6
1505 pshufd $0x00, %xmm4, %xmm6
1509 movaps 4 * SIZE(AA), %xmm4
1510 pshufd $0x55, %xmm4, %xmm6
1512 pshufd $0x00, %xmm4, %xmm6
1516 movaps 0 * SIZE(AA), %xmm4
1517 pshufd $0x00, %xmm4, %xmm6
1522 movaps 0 * SIZE(AA), %xmm4
1523 pshufd $0x00, %xmm4, %xmm6
1526 pshufd $0x55, %xmm4, %xmm6
1529 pshufd $0xaa, %xmm4, %xmm6
1532 pshufd $0xff, %xmm4, %xmm6
1536 movaps 4 * SIZE(AA), %xmm4
1537 pshufd $0x55, %xmm4, %xmm6
1539 pshufd $0xaa, %xmm4, %xmm6
1542 pshufd $0xff, %xmm4, %xmm6
1546 movaps 8 * SIZE(AA), %xmm4
1547 pshufd $0xaa, %xmm4, %xmm6
1549 pshufd $0xff, %xmm4, %xmm6
1553 movaps 12 * SIZE(AA), %xmm4
1554 pshufd $0xff, %xmm4, %xmm6
1559 movaps 0 * SIZE(B), %xmm6
1560 pshufd $0x00, %xmm6, %xmm7
1562 pshufd $0x55, %xmm6, %xmm7
1566 pshufd $0xff, %xmm6, %xmm7
1571 movaps 0 * SIZE(B), %xmm6
1572 pshufd $0xff, %xmm6, %xmm7
1574 pshufd $0xaa, %xmm6, %xmm7
1578 pshufd $0x00, %xmm6, %xmm7
1582 #if defined(LN) || defined(LT)
1583 movlps %xmm1, 0 * SIZE(B)
1584 movlps %xmm3, 2 * SIZE(B)
1585 movlps %xmm5, 4 * SIZE(B)
1586 movlps %xmm7, 6 * SIZE(B)
1588 pshufd $0x00, %xmm1, %xmm0
1589 pshufd $0x55, %xmm1, %xmm2
1590 movaps %xmm0, 0 * SIZE(BB)
1591 movaps %xmm2, 4 * SIZE(BB)
1593 pshufd $0x00, %xmm3, %xmm0
1594 pshufd $0x55, %xmm3, %xmm2
1595 movaps %xmm0, 8 * SIZE(BB)
1596 movaps %xmm2, 12 * SIZE(BB)
1598 pshufd $0x00, %xmm5, %xmm0
1599 pshufd $0x55, %xmm5, %xmm2
1600 movaps %xmm0, 16 * SIZE(BB)
1601 movaps %xmm2, 20 * SIZE(BB)
1603 pshufd $0x00, %xmm7, %xmm0
1604 pshufd $0x55, %xmm7, %xmm2
1605 movaps %xmm0, 24 * SIZE(BB)
1606 movaps %xmm2, 28 * SIZE(BB)
1608 movaps %xmm0, 0 * SIZE(AA)
1609 movaps %xmm1, 4 * SIZE(AA)
1616 #if defined(LN) || defined(LT)
1617 unpcklps %xmm5, %xmm1
1618 unpcklps %xmm7, %xmm3
1621 unpcklps %xmm3, %xmm1
1622 unpckhps %xmm3, %xmm2
1624 movlps %xmm1, 0 * SIZE(CO1)
1625 movhps %xmm1, 2 * SIZE(CO1)
1626 movlps %xmm2, 0 * SIZE(CO1, LDC, 1)
1627 movhps %xmm2, 2 * SIZE(CO1, LDC, 1)
1629 movlps %xmm0, 0 * SIZE(CO1)
1630 movhps %xmm0, 2 * SIZE(CO1)
1631 movlps %xmm1, 0 * SIZE(CO1, LDC, 1)
1632 movhps %xmm1, 2 * SIZE(CO1, LDC, 1)
1639 #if defined(LT) || defined(RN)
1642 leal (,%eax, SIZE), %eax
1643 leal (AA, %eax, 4), AA
1661 sall $2 + BASE_SHIFT, %eax
1675 sall $1 + BASE_SHIFT, %eax
1679 #if defined(LN) || defined(RT)
1682 leal (, %eax, SIZE), %eax
1683 leal (AA, %eax, 2), AA
1688 #if defined(LN) || defined(RT)
1690 sall $1 + BASE_SHIFT, %eax
1691 leal (BB, %eax, 4), BB
1702 movsd 0 * SIZE(AA), %xmm0
1706 movsd 8 * SIZE(AA), %xmm1
1707 movaps 0 * SIZE(BB), %xmm2
1708 movaps 16 * SIZE(BB), %xmm3
1710 #if defined(LT) || defined(RN)
1721 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
1722 prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
1727 movaps 4 * SIZE(BB), %xmm2
1729 movsd 2 * SIZE(AA), %xmm0
1731 movaps 8 * SIZE(BB), %xmm2
1735 movaps 12 * SIZE(BB), %xmm2
1737 movsd 4 * SIZE(AA), %xmm0
1739 movaps 32 * SIZE(BB), %xmm2
1743 movaps 20 * SIZE(BB), %xmm3
1745 movsd 6 * SIZE(AA), %xmm0
1747 movaps 24 * SIZE(BB), %xmm3
1751 movaps 28 * SIZE(BB), %xmm3
1753 movsd 16 * SIZE(AA), %xmm0
1755 movaps 48 * SIZE(BB), %xmm3
1759 movaps 36 * SIZE(BB), %xmm2
1761 movsd 10 * SIZE(AA), %xmm1
1763 movaps 40 * SIZE(BB), %xmm2
1767 movaps 44 * SIZE(BB), %xmm2
1769 movsd 12 * SIZE(AA), %xmm1
1771 movaps 64 * SIZE(BB), %xmm2
1775 movaps 52 * SIZE(BB), %xmm3
1777 movsd 14 * SIZE(AA), %xmm1
1779 movaps 56 * SIZE(BB), %xmm3
1783 movaps 60 * SIZE(BB), %xmm3
1785 movsd 24 * SIZE(AA), %xmm1
1787 movaps 80 * SIZE(BB), %xmm3
1796 #if defined(LT) || defined(RN)
1802 andl $7, %eax # if (k & 1)
1810 movaps 4 * SIZE(BB), %xmm2
1812 movsd 2 * SIZE(AA), %xmm0
1814 movaps 8 * SIZE(BB), %xmm2
1826 #if defined(LN) || defined(RT)
1838 sall $BASE_SHIFT, %eax
1839 leal (AA, %eax, 2), AA
1840 leal (B, %eax, 2), B
1841 leal (BB, %eax, 8), BB
1844 #if defined(LN) || defined(LT)
1845 unpcklps %xmm6, %xmm4
1846 unpcklps %xmm7, %xmm5
1849 unpcklps %xmm5, %xmm4
1850 unpckhps %xmm5, %xmm6
1855 movsd 0 * SIZE(B), %xmm1
1859 movsd 2 * SIZE(B), %xmm3
1867 movsd 0 * SIZE(AA), %xmm0
1871 movsd 2 * SIZE(AA), %xmm1
1878 movaps 0 * SIZE(AA), %xmm4
1879 pshufd $0xff, %xmm4, %xmm6
1881 pshufd $0xaa, %xmm4, %xmm6
1885 pshufd $0x00, %xmm4, %xmm6
1890 movaps 0 * SIZE(AA), %xmm4
1891 pshufd $0x00, %xmm4, %xmm6
1893 pshufd $0x55, %xmm4, %xmm6
1897 pshufd $0xff, %xmm4, %xmm6
1902 movaps 0 * SIZE(B), %xmm6
1903 pshufd $0x00, %xmm6, %xmm7
1905 pshufd $0x55, %xmm6, %xmm7
1909 pshufd $0xff, %xmm6, %xmm7
1914 movaps 0 * SIZE(B), %xmm6
1915 pshufd $0xff, %xmm6, %xmm7
1917 pshufd $0xaa, %xmm6, %xmm7
1921 pshufd $0x00, %xmm6, %xmm7
1925 #if defined(LN) || defined(LT)
1926 movlps %xmm1, 0 * SIZE(B)
1927 movlps %xmm3, 2 * SIZE(B)
1929 pshufd $0x00, %xmm1, %xmm0
1930 pshufd $0x55, %xmm1, %xmm2
1931 movaps %xmm0, 0 * SIZE(BB)
1932 movaps %xmm2, 4 * SIZE(BB)
1934 pshufd $0x00, %xmm3, %xmm0
1935 pshufd $0x55, %xmm3, %xmm2
1936 movaps %xmm0, 8 * SIZE(BB)
1937 movaps %xmm2, 12 * SIZE(BB)
1939 movlps %xmm0, 0 * SIZE(AA)
1940 movlps %xmm1, 2 * SIZE(AA)
1947 #if defined(LN) || defined(LT)
1948 unpcklps %xmm3, %xmm1
1950 movlps %xmm1, 0 * SIZE(CO1)
1951 movhps %xmm1, 0 * SIZE(CO1, LDC)
1953 movlps %xmm0, 0 * SIZE(CO1)
1954 movlps %xmm1, 0 * SIZE(CO1, LDC)
1961 #if defined(LT) || defined(RN)
1964 leal (,%eax, SIZE), %eax
1965 leal (AA, %eax, 2), AA
1983 sall $1 + BASE_SHIFT, %eax
1994 sall $BASE_SHIFT, %eax
1998 #if defined(LN) || defined(RT)
2001 leal (AA, %eax, SIZE), AA
2006 #if defined(LN) || defined(RT)
2008 sall $1 + BASE_SHIFT, %eax
2009 leal (BB, %eax, 4), BB
2017 movss 0 * SIZE(AA), %xmm0
2018 movss 4 * SIZE(AA), %xmm1
2019 movss 0 * SIZE(BB), %xmm2
2020 movss 16 * SIZE(BB), %xmm3
2022 #if defined(LT) || defined(RN)
2034 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
2035 prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
2037 mulss 4 * SIZE(BB), %xmm0
2039 movss 8 * SIZE(BB), %xmm2
2041 movss 1 * SIZE(AA), %xmm0
2043 mulss 12 * SIZE(BB), %xmm0
2045 movss 32 * SIZE(BB), %xmm2
2047 movss 2 * SIZE(AA), %xmm0
2049 mulss 20 * SIZE(BB), %xmm0
2051 movss 24 * SIZE(BB), %xmm3
2053 movss 3 * SIZE(AA), %xmm0
2055 mulss 28 * SIZE(BB), %xmm0
2057 movss 48 * SIZE(BB), %xmm3
2059 movss 8 * SIZE(AA), %xmm0
2061 mulss 36 * SIZE(BB), %xmm1
2063 movss 40 * SIZE(BB), %xmm2
2065 movss 5 * SIZE(AA), %xmm1
2067 mulss 44 * SIZE(BB), %xmm1
2069 movss 64 * SIZE(BB), %xmm2
2071 movss 6 * SIZE(AA), %xmm1
2073 mulss 52 * SIZE(BB), %xmm1
2075 movss 56 * SIZE(BB), %xmm3
2077 movss 7 * SIZE(AA), %xmm1
2079 mulss 60 * SIZE(BB), %xmm1
2081 movss 80 * SIZE(BB), %xmm3
2083 movss 12 * SIZE(AA), %xmm1
2092 #if defined(LT) || defined(RN)
2098 andl $7, %eax # if (k & 1)
2105 mulss 4 * SIZE(BB), %xmm0
2107 movss 8 * SIZE(BB), %xmm2
2109 movss 1 * SIZE(AA), %xmm0
2121 #if defined(LN) || defined(RT)
2133 sall $BASE_SHIFT, %eax
2134 leal (AA, %eax, 1), AA
2135 leal (B, %eax, 2), B
2136 leal (BB, %eax, 8), BB
2139 #if defined(LN) || defined(LT)
2140 unpcklps %xmm5, %xmm4
2145 movsd 0 * SIZE(B), %xmm1
2149 movss 0 * SIZE(AA), %xmm0
2150 movss 1 * SIZE(AA), %xmm1
2156 #if defined(LN) || defined(LT)
2157 movss 0 * SIZE(AA), %xmm4
2158 pshufd $0x00, %xmm4, %xmm6
2163 movaps 0 * SIZE(B), %xmm6
2164 pshufd $0x00, %xmm6, %xmm7
2166 pshufd $0x55, %xmm6, %xmm7
2170 pshufd $0xff, %xmm6, %xmm7
2175 movaps 0 * SIZE(B), %xmm6
2176 pshufd $0xff, %xmm6, %xmm7
2178 pshufd $0xaa, %xmm6, %xmm7
2182 pshufd $0x00, %xmm6, %xmm7
2186 #if defined(LN) || defined(LT)
2187 movlps %xmm1, 0 * SIZE(B)
2189 pshufd $0x00, %xmm1, %xmm0
2190 pshufd $0x55, %xmm1, %xmm2
2191 movaps %xmm0, 0 * SIZE(BB)
2192 movaps %xmm2, 4 * SIZE(BB)
2194 movss %xmm0, 0 * SIZE(AA)
2195 movss %xmm1, 1 * SIZE(AA)
2202 #if defined(LN) || defined(LT)
2203 pshufd $1, %xmm1, %xmm3
2205 movss %xmm1, 0 * SIZE(CO1)
2206 movss %xmm3, 0 * SIZE(CO1, LDC)
2208 movss %xmm0, 0 * SIZE(CO1)
2209 movss %xmm1, 0 * SIZE(CO1, LDC)
2216 #if defined(LT) || defined(RN)
2219 leal (AA, %eax, SIZE), AA
2237 sall $BASE_SHIFT, %eax
2245 leal (, %eax, SIZE), %eax
2246 leal (B, %eax, 2), B
2249 #if defined(LT) || defined(RN)
2252 leal (,%eax, SIZE), %eax
2253 leal (B, %eax, 2), B
2282 sall $2 + BASE_SHIFT, %eax
2286 #if defined(LN) || defined(RT)
2289 sall $2 + BASE_SHIFT, %eax
2290 leal (B, %eax, 1), B
2291 leal (BB, %eax, 4), BB
2299 #if defined(LT) || defined(RN)
2310 movaps 0 * SIZE(B), %xmm3
2311 movaps 4 * SIZE(B), %xmm7
2313 pshufd $0x00, %xmm3, %xmm0
2314 pshufd $0x55, %xmm3, %xmm1
2315 pshufd $0xaa, %xmm3, %xmm2
2316 pshufd $0xff, %xmm3, %xmm3
2318 pshufd $0x00, %xmm7, %xmm4
2319 pshufd $0x55, %xmm7, %xmm5
2320 pshufd $0xaa, %xmm7, %xmm6
2321 pshufd $0xff, %xmm7, %xmm7
2323 movaps %xmm0, 0 * SIZE(BB)
2324 movaps %xmm1, 4 * SIZE(BB)
2325 movaps %xmm2, 8 * SIZE(BB)
2326 movaps %xmm3, 12 * SIZE(BB)
2327 movaps %xmm4, 16 * SIZE(BB)
2328 movaps %xmm5, 20 * SIZE(BB)
2329 movaps %xmm6, 24 * SIZE(BB)
2330 movaps %xmm7, 28 * SIZE(BB)
2333 addl $32 * SIZE, %ecx
2339 #if defined(LT) || defined(RN)
2349 movaps 0 * SIZE(B), %xmm3
2351 pshufd $0x00, %xmm3, %xmm0
2352 pshufd $0x55, %xmm3, %xmm1
2353 pshufd $0xaa, %xmm3, %xmm2
2354 pshufd $0xff, %xmm3, %xmm3
2356 movaps %xmm0, 0 * SIZE(BB)
2357 movaps %xmm1, 4 * SIZE(BB)
2358 movaps %xmm2, 8 * SIZE(BB)
2359 movaps %xmm3, 12 * SIZE(BB)
2365 #if defined(LT) || defined(RN)
2372 leal (, LDC, 4), %eax
2383 sarl $2, %ebx # i = (m >> 2)
2390 sall $2 + BASE_SHIFT, %eax
2394 #if defined(LN) || defined(RT)
2397 leal (, %eax, SIZE), %eax
2398 leal (AA, %eax, 4), AA
2403 #if defined(LN) || defined(RT)
2405 sall $2 + BASE_SHIFT, %eax
2406 leal (BB, %eax, 4), BB
2409 movaps 0 * SIZE(AA), %xmm0
2411 movaps 16 * SIZE(AA), %xmm1
2413 movaps 0 * SIZE(BB), %xmm2
2415 movaps 16 * SIZE(BB), %xmm3
2418 leal (LDC, LDC, 2), %eax
2420 PREFETCHW 3 * SIZE(CO1)
2421 PREFETCHW 3 * SIZE(CO1, LDC)
2422 PREFETCHW 3 * SIZE(CO1, LDC, 2)
2423 PREFETCHW 3 * SIZE(CO1, %eax)
2425 #if defined(LT) || defined(RN)
2445 addl $128 * SIZE, BB
2452 #if defined(LT) || defined(RN)
2458 andl $7, %eax # if (k & 1)
2466 movaps 4 * SIZE(BB), %xmm2
2469 movaps 8 * SIZE(BB), %xmm2
2471 mulps 12 * SIZE(BB), %xmm0
2473 movaps 16 * SIZE(BB), %xmm2
2475 movaps 4 * SIZE(AA), %xmm0
2484 #if defined(LN) || defined(RT)
2496 sall $2 + BASE_SHIFT, %eax
2497 leal (AA, %eax, 1), AA
2498 leal (B, %eax, 1), B
2499 leal (BB, %eax, 4), BB
2502 #if defined(LN) || defined(LT)
2504 unpcklps %xmm6, %xmm4
2505 unpckhps %xmm6, %xmm0
2508 unpcklps %xmm7, %xmm5
2509 unpckhps %xmm7, %xmm1
2512 unpcklps %xmm5, %xmm4
2513 unpckhps %xmm5, %xmm6
2516 unpcklps %xmm1, %xmm0
/* NOTE(review): this file is an elided excerpt of a larger IA-32 SSE
   TRSM-style kernel (GAS/AT&T syntax).  The arithmetic (mulps/subps)
   between the visible loads and broadcasts is not present in this
   excerpt, and the leading numeral on each line appears to be an
   extraction artifact (the original file's line numbers); both are
   left untouched.  Comments describe only what the visible
   instructions themselves do. */
/* Finish an interleave of accumulator lanes (high halves of xmm1 into xmm2). */
2517 unpckhps %xmm1, %xmm2
/* Load a 4x4 float panel (16 floats, 16-byte aligned) from B. */
2519 movaps 0 * SIZE(B), %xmm1
2520 movaps 4 * SIZE(B), %xmm3
2521 movaps 8 * SIZE(B), %xmm5
2522 movaps 12 * SIZE(B), %xmm7
/* Alternate load of a 4x4 panel from AA (the #if/#else markers
   selecting between the B and AA paths are elided here). */
2529 movaps 0 * SIZE(AA), %xmm0
2530 movaps 4 * SIZE(AA), %xmm1
2531 movaps 8 * SIZE(AA), %xmm2
2532 movaps 12 * SIZE(AA), %xmm3
/* Broadcast single lanes of successive AA rows, walking row 3 down to
   row 0 with lane masks 0xff -> 0x00: the broadcast pattern of a
   backward triangular substitution over a packed 4x4 factor
   (the mulps/subps consuming xmm6 are elided) — TODO confirm against
   the full file. */
2541 movaps 12 * SIZE(AA), %xmm4
2542 pshufd $0xff, %xmm4, %xmm6
2544 pshufd $0xaa, %xmm4, %xmm6
2547 pshufd $0x55, %xmm4, %xmm6
2550 pshufd $0x00, %xmm4, %xmm6
2554 movaps 8 * SIZE(AA), %xmm4
2555 pshufd $0xaa, %xmm4, %xmm6
2557 pshufd $0x55, %xmm4, %xmm6
2560 pshufd $0x00, %xmm4, %xmm6
2564 movaps 4 * SIZE(AA), %xmm4
2565 pshufd $0x55, %xmm4, %xmm6
2567 pshufd $0x00, %xmm4, %xmm6
2571 movaps 0 * SIZE(AA), %xmm4
2572 pshufd $0x00, %xmm4, %xmm6
/* Mirror pattern: rows 0 up to 3, masks 0x00 -> 0xff — the
   forward-substitution order over AA. */
2577 movaps 0 * SIZE(AA), %xmm4
2578 pshufd $0x00, %xmm4, %xmm6
2581 pshufd $0x55, %xmm4, %xmm6
2584 pshufd $0xaa, %xmm4, %xmm6
2587 pshufd $0xff, %xmm4, %xmm6
2591 movaps 4 * SIZE(AA), %xmm4
2592 pshufd $0x55, %xmm4, %xmm6
2594 pshufd $0xaa, %xmm4, %xmm6
2597 pshufd $0xff, %xmm4, %xmm6
2601 movaps 8 * SIZE(AA), %xmm4
2602 pshufd $0xaa, %xmm4, %xmm6
2604 pshufd $0xff, %xmm4, %xmm6
2608 movaps 12 * SIZE(AA), %xmm4
2609 pshufd $0xff, %xmm4, %xmm6
/* Same forward lane-walk (0x00 -> 0xff) but reading the factor from B
   instead of AA — presumably the right-side (RN-type) solve variant. */
2614 movaps 0 * SIZE(B), %xmm6
2615 pshufd $0x00, %xmm6, %xmm7
2617 pshufd $0x55, %xmm6, %xmm7
2620 pshufd $0xaa, %xmm6, %xmm7
2623 pshufd $0xff, %xmm6, %xmm7
2627 movaps 4 * SIZE(B), %xmm6
2628 pshufd $0x55, %xmm6, %xmm7
2630 pshufd $0xaa, %xmm6, %xmm7
2633 pshufd $0xff, %xmm6, %xmm7
2637 movaps 8 * SIZE(B), %xmm6
2638 pshufd $0xaa, %xmm6, %xmm7
2640 pshufd $0xff, %xmm6, %xmm7
2644 movaps 12 * SIZE(B), %xmm6
2645 pshufd $0xff, %xmm6, %xmm7
/* Backward lane-walk (0xff -> 0x00) over B — the reverse-order solve
   variant (presumably RT). */
2650 movaps 12 * SIZE(B), %xmm6
2651 pshufd $0xff, %xmm6, %xmm7
2653 pshufd $0xaa, %xmm6, %xmm7
2656 pshufd $0x55, %xmm6, %xmm7
2659 pshufd $0x00, %xmm6, %xmm7
2663 movaps 8 * SIZE(B), %xmm6
2664 pshufd $0xaa, %xmm6, %xmm7
2666 pshufd $0x55, %xmm6, %xmm7
2669 pshufd $0x00, %xmm6, %xmm7
2673 movaps 4 * SIZE(B), %xmm6
2674 pshufd $0x55, %xmm6, %xmm7
2676 pshufd $0x00, %xmm6, %xmm7
2680 movaps 0 * SIZE(B), %xmm6
2681 pshufd $0x00, %xmm6, %xmm7
/* Store the solved 4x4 panel back and rebuild the broadcast-expanded
   copy.  (Excerpt is elided: the matching #else/#endif of this #if are
   not visible.) */
2685 #if defined(LN) || defined(LT)
/* Write the four solved rows back into the packed B buffer. */
2686 movaps %xmm1, 0 * SIZE(B)
2687 movaps %xmm3, 4 * SIZE(B)
2688 movaps %xmm5, 8 * SIZE(B)
2689 movaps %xmm7, 12 * SIZE(B)
/* Expand row xmm1: broadcast each of its 4 lanes to a full vector and
   store 4 vectors to BB — BB holds the lane-broadcast copy of B that
   the inner product loops below consume with movaps. */
2691 pshufd $0x00, %xmm1, %xmm0
2692 pshufd $0x55, %xmm1, %xmm2
2693 pshufd $0xaa, %xmm1, %xmm4
2694 pshufd $0xff, %xmm1, %xmm6
2695 movaps %xmm0, 0 * SIZE(BB)
2696 movaps %xmm2, 4 * SIZE(BB)
2697 movaps %xmm4, 8 * SIZE(BB)
2698 movaps %xmm6, 12 * SIZE(BB)
/* Same expansion for row xmm3 -> BB[16..28]. */
2700 pshufd $0x00, %xmm3, %xmm0
2701 pshufd $0x55, %xmm3, %xmm2
2702 pshufd $0xaa, %xmm3, %xmm4
2703 pshufd $0xff, %xmm3, %xmm6
2704 movaps %xmm0, 16 * SIZE(BB)
2705 movaps %xmm2, 20 * SIZE(BB)
2706 movaps %xmm4, 24 * SIZE(BB)
2707 movaps %xmm6, 28 * SIZE(BB)
/* Row xmm5 -> BB[32..44]. */
2709 pshufd $0x00, %xmm5, %xmm0
2710 pshufd $0x55, %xmm5, %xmm2
2711 pshufd $0xaa, %xmm5, %xmm4
2712 pshufd $0xff, %xmm5, %xmm6
2713 movaps %xmm0, 32 * SIZE(BB)
2714 movaps %xmm2, 36 * SIZE(BB)
2715 movaps %xmm4, 40 * SIZE(BB)
2716 movaps %xmm6, 44 * SIZE(BB)
/* Row xmm7 -> BB[48..60]. */
2718 pshufd $0x00, %xmm7, %xmm0
2719 pshufd $0x55, %xmm7, %xmm2
2720 pshufd $0xaa, %xmm7, %xmm4
2721 pshufd $0xff, %xmm7, %xmm6
2722 movaps %xmm0, 48 * SIZE(BB)
2723 movaps %xmm2, 52 * SIZE(BB)
2724 movaps %xmm4, 56 * SIZE(BB)
2725 movaps %xmm6, 60 * SIZE(BB)
/* Alternate (presumed #else, marker elided) path: solved panel goes
   back into AA instead of B. */
2727 movaps %xmm0, 0 * SIZE(AA)
2728 movaps %xmm1, 4 * SIZE(AA)
2729 movaps %xmm2, 8 * SIZE(AA)
2730 movaps %xmm3, 12 * SIZE(AA)
/* Write the 4x4 result tile to the output matrix C, then advance the
   packed-buffer pointers.  %eax = 3*LDC so the four columns addressed
   below are CO1, CO1+LDC, CO1+2*LDC, CO1+3*LDC. */
2737 leal (LDC, LDC, 2), %eax
2739 #if defined(LN) || defined(LT)
/* 4x4 in-register transpose via unpack lo/hi pairs before storing
   (the result was solved in the transposed layout). */
2741 unpcklps %xmm5, %xmm1
2742 unpckhps %xmm5, %xmm0
2745 unpcklps %xmm7, %xmm3
2746 unpckhps %xmm7, %xmm4
2749 unpcklps %xmm3, %xmm1
2750 unpckhps %xmm3, %xmm2
2753 unpcklps %xmm4, %xmm0
2754 unpckhps %xmm4, %xmm6
/* Unaligned 4-float stores: low half then high half of each vector. */
2756 movlps %xmm1, 0 * SIZE(CO1)
2757 movhps %xmm1, 2 * SIZE(CO1)
2758 movlps %xmm2, 0 * SIZE(CO1, LDC, 1)
2759 movhps %xmm2, 2 * SIZE(CO1, LDC, 1)
2760 movlps %xmm0, 0 * SIZE(CO1, LDC, 2)
2761 movhps %xmm0, 2 * SIZE(CO1, LDC, 2)
2762 movlps %xmm6, 0 * SIZE(CO1, %eax, 1)
2763 movhps %xmm6, 2 * SIZE(CO1, %eax, 1)
/* Presumed #else path (marker elided): registers already in row
   layout, stored without the transpose. */
2765 movlps %xmm0, 0 * SIZE(CO1)
2766 movhps %xmm0, 2 * SIZE(CO1)
2767 movlps %xmm1, 0 * SIZE(CO1, LDC, 1)
2768 movhps %xmm1, 2 * SIZE(CO1, LDC, 1)
2769 movlps %xmm2, 0 * SIZE(CO1, LDC, 2)
2770 movhps %xmm2, 2 * SIZE(CO1, LDC, 2)
2771 movlps %xmm3, 0 * SIZE(CO1, %eax, 1)
2772 movhps %xmm3, 2 * SIZE(CO1, %eax, 1)
/* Pointer bookkeeping for the next tile; the loads that seed %eax
   (e.g. from K/KK) are elided.  Scaling differs per variant:
   AA advances by 4 (this M=4 tile) or 2 rows, BB by 4-column panels. */
2779 #if defined(LT) || defined(RN)
2782 leal (,%eax, SIZE), %eax
2783 leal (AA, %eax, 4), AA
2801 sall $2 + BASE_SHIFT, %eax
2815 sall $1 + BASE_SHIFT, %eax
2819 #if defined(LN) || defined(RT)
2822 leal (, %eax, SIZE), %eax
2823 leal (AA, %eax, 2), AA
2828 #if defined(LN) || defined(RT)
2830 sall $2 + BASE_SHIFT, %eax
2831 leal (BB, %eax, 4), BB
/* M=2 micro-kernel: A is consumed two floats at a time (movsd = 64-bit
   load into the low half), B from the lane-broadcast BB buffer four
   floats at a time.  All mulps/addps between the visible loads are
   elided in this excerpt. */
2837 movsd 0 * SIZE(AA), %xmm0
2842 movsd 8 * SIZE(AA), %xmm1
2844 movaps 0 * SIZE(BB), %xmm2
2846 movaps 16 * SIZE(BB), %xmm3
2849 #if defined(LT) || defined(RN)
/* Main loop body, unrolled by 8 k-iterations: per pass it consumes
   16 floats of AA and 128 floats of BB (see the adds at 2975/2976). */
2862 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
/* Software prefetch of upcoming A data on AMD cores only. */
2863 prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
2865 movaps 4 * SIZE(BB), %xmm2
2868 movaps 8 * SIZE(BB), %xmm2
2871 movaps 12 * SIZE(BB), %xmm2
2873 movsd 2 * SIZE(AA), %xmm0
2875 movaps 32 * SIZE(BB), %xmm2
2879 movaps 20 * SIZE(BB), %xmm3
2882 movaps 24 * SIZE(BB), %xmm3
2885 movaps 28 * SIZE(BB), %xmm3
2887 movsd 4 * SIZE(AA), %xmm0
2889 movaps 48 * SIZE(BB), %xmm3
2893 movaps 36 * SIZE(BB), %xmm2
2896 movaps 40 * SIZE(BB), %xmm2
2899 movaps 44 * SIZE(BB), %xmm2
2901 movsd 6 * SIZE(AA), %xmm0
2903 movaps 64 * SIZE(BB), %xmm2
2907 movaps 52 * SIZE(BB), %xmm3
2910 movaps 56 * SIZE(BB), %xmm3
2913 movaps 60 * SIZE(BB), %xmm3
/* Loads ahead of the next unrolled pass (xmm0 at 16 = next block). */
2915 movsd 16 * SIZE(AA), %xmm0
2917 movaps 80 * SIZE(BB), %xmm3
2921 movaps 68 * SIZE(BB), %xmm2
2924 movaps 72 * SIZE(BB), %xmm2
2927 movaps 76 * SIZE(BB), %xmm2
2929 movsd 10 * SIZE(AA), %xmm1
2931 movaps 96 * SIZE(BB), %xmm2
2935 movaps 84 * SIZE(BB), %xmm3
2938 movaps 88 * SIZE(BB), %xmm3
2941 movaps 92 * SIZE(BB), %xmm3
2943 movsd 12 * SIZE(AA), %xmm1
2945 movaps 112 * SIZE(BB), %xmm3
2949 movaps 100 * SIZE(BB), %xmm2
2952 movaps 104 * SIZE(BB), %xmm2
2955 movaps 108 * SIZE(BB), %xmm2
2957 movsd 14 * SIZE(AA), %xmm1
2959 movaps 128 * SIZE(BB), %xmm2
2963 movaps 116 * SIZE(BB), %xmm3
2966 movaps 120 * SIZE(BB), %xmm3
2969 movaps 124 * SIZE(BB), %xmm3
2971 movsd 24 * SIZE(AA), %xmm1
2973 movaps 144 * SIZE(BB), %xmm3
/* Advance A by 16 floats and BB by 128 floats per unrolled pass. */
2975 addl $ 16 * SIZE, AA
2976 addl $128 * SIZE, BB
/* Loop tail: handle the k mod 8 leftover iterations. */
2982 #if defined(LT) || defined(RN)
2988 andl $7, %eax # if (k & 1)
2996 movaps 4 * SIZE(BB), %xmm2
2999 movaps 8 * SIZE(BB), %xmm2
3002 movaps 12 * SIZE(BB), %xmm2
3004 movsd 2 * SIZE(AA), %xmm0
3006 movaps 16 * SIZE(BB), %xmm2
/* Post-loop pointer adjustment for this 2-row tile (shift by
   1+BASE_SHIFT = element size * 2). */
3015 #if defined(LN) || defined(RT)
3027 sall $1 + BASE_SHIFT, %eax
3028 leal (AA, %eax, 1), AA
3029 leal (B, %eax, 2), B
3030 leal (BB, %eax, 8), BB
/* M=2 tile: transpose accumulators, triangular solve, store back.
   Structure parallels the 4x4 tile above but with 2-row data. */
3033 #if defined(LN) || defined(LT)
/* Interleave the four accumulators into 2x4 transposed layout. */
3034 unpcklps %xmm6, %xmm4
3035 unpcklps %xmm7, %xmm5
3038 unpcklps %xmm5, %xmm4
3039 unpckhps %xmm5, %xmm6
/* Reload the 2x4 right-hand-side panel from B. */
3041 movaps 0 * SIZE(B), %xmm1
3042 movaps 4 * SIZE(B), %xmm3
/* Alternate path: reload 2-float pairs from AA (markers elided). */
3050 movsd 0 * SIZE(AA), %xmm0
3054 movsd 2 * SIZE(AA), %xmm1
3058 movsd 4 * SIZE(AA), %xmm2
3062 movsd 6 * SIZE(AA), %xmm3
/* 2x2 factor from AA: backward lane-walk (0xff, 0xaa, 0x00)... */
3071 movaps 0 * SIZE(AA), %xmm4
3072 pshufd $0xff, %xmm4, %xmm6
3074 pshufd $0xaa, %xmm4, %xmm6
3078 pshufd $0x00, %xmm4, %xmm6
/* ...and forward lane-walk (0x00, 0x55, 0xff) for the other variant. */
3083 movaps 0 * SIZE(AA), %xmm4
3084 pshufd $0x00, %xmm4, %xmm6
3087 pshufd $0x55, %xmm4, %xmm6
3091 pshufd $0xff, %xmm4, %xmm6
/* Right-side solve against the 4x4 factor held in B: forward order. */
3096 movaps 0 * SIZE(B), %xmm6
3097 pshufd $0x00, %xmm6, %xmm7
3099 pshufd $0x55, %xmm6, %xmm7
3102 pshufd $0xaa, %xmm6, %xmm7
3105 pshufd $0xff, %xmm6, %xmm7
3109 movaps 4 * SIZE(B), %xmm6
3110 pshufd $0x55, %xmm6, %xmm7
3112 pshufd $0xaa, %xmm6, %xmm7
3115 pshufd $0xff, %xmm6, %xmm7
3119 movaps 8 * SIZE(B), %xmm6
3120 pshufd $0xaa, %xmm6, %xmm7
3122 pshufd $0xff, %xmm6, %xmm7
3126 movaps 12 * SIZE(B), %xmm6
3127 pshufd $0xff, %xmm6, %xmm7
/* Reverse-order variant of the same right-side solve. */
3132 movaps 12 * SIZE(B), %xmm6
3133 pshufd $0xff, %xmm6, %xmm7
3135 pshufd $0xaa, %xmm6, %xmm7
3138 pshufd $0x55, %xmm6, %xmm7
3141 pshufd $0x00, %xmm6, %xmm7
3145 movaps 8 * SIZE(B), %xmm6
3146 pshufd $0xaa, %xmm6, %xmm7
3148 pshufd $0x55, %xmm6, %xmm7
3151 pshufd $0x00, %xmm6, %xmm7
3155 movaps 4 * SIZE(B), %xmm6
3156 pshufd $0x55, %xmm6, %xmm7
3158 pshufd $0x00, %xmm6, %xmm7
3162 movaps 0 * SIZE(B), %xmm6
3163 pshufd $0x00, %xmm6, %xmm7
/* Store solved panel back to B and rebuild the broadcast copy in BB. */
3167 #if defined(LN) || defined(LT)
3168 movaps %xmm1, 0 * SIZE(B)
3169 movaps %xmm3, 4 * SIZE(B)
3171 pshufd $0x00, %xmm1, %xmm0
3172 pshufd $0x55, %xmm1, %xmm2
3173 pshufd $0xaa, %xmm1, %xmm4
3174 pshufd $0xff, %xmm1, %xmm6
3175 movaps %xmm0, 0 * SIZE(BB)
3176 movaps %xmm2, 4 * SIZE(BB)
3177 movaps %xmm4, 8 * SIZE(BB)
3178 movaps %xmm6, 12 * SIZE(BB)
3180 pshufd $0x00, %xmm3, %xmm0
3181 pshufd $0x55, %xmm3, %xmm2
3182 pshufd $0xaa, %xmm3, %xmm4
3183 pshufd $0xff, %xmm3, %xmm6
3184 movaps %xmm0, 16 * SIZE(BB)
3185 movaps %xmm2, 20 * SIZE(BB)
3186 movaps %xmm4, 24 * SIZE(BB)
3187 movaps %xmm6, 28 * SIZE(BB)
/* Presumed #else path: two-float pairs back into AA. */
3189 movlps %xmm0, 0 * SIZE(AA)
3190 movlps %xmm1, 2 * SIZE(AA)
3191 movlps %xmm2, 4 * SIZE(AA)
3192 movlps %xmm3, 6 * SIZE(AA)
/* Write the 2x4 result tile to C (two floats per column via movlps),
   then adjust the packed-buffer pointers.  %eax = 3*LDC. */
3199 leal (LDC, LDC, 2), %eax
3201 #if defined(LN) || defined(LT)
/* Transpose before storing, mirroring the 4x4 path above. */
3203 unpcklps %xmm5, %xmm1
3204 unpckhps %xmm5, %xmm0
3207 unpcklps %xmm7, %xmm3
3208 unpckhps %xmm7, %xmm4
3211 unpcklps %xmm3, %xmm1
3212 unpckhps %xmm3, %xmm2
3215 unpcklps %xmm4, %xmm0
3216 unpckhps %xmm4, %xmm6
/* Columns CO1, CO1+LDC, CO1+2*LDC, CO1+3*LDC; 2 floats each. */
3218 movlps %xmm1, 0 * SIZE(CO1)
3219 movlps %xmm2, 0 * SIZE(CO1, LDC, 1)
3220 movlps %xmm0, 0 * SIZE(CO1, LDC, 2)
3221 movlps %xmm6, 0 * SIZE(CO1, %eax, 1)
/* Presumed #else path (marker elided): untransposed stores. */
3223 movlps %xmm0, 0 * SIZE(CO1)
3224 movlps %xmm1, 0 * SIZE(CO1, LDC, 1)
3225 movlps %xmm2, 0 * SIZE(CO1, LDC, 2)
3226 movlps %xmm3, 0 * SIZE(CO1, %eax, 1)
/* Pointer updates for the 2-row tile (AA by 2 elements per k). */
3233 #if defined(LT) || defined(RN)
3236 leal (,%eax, SIZE), %eax
3237 leal (AA, %eax, 2), AA
3255 sall $1 + BASE_SHIFT, %eax
3266 sall $BASE_SHIFT, %eax
3270 #if defined(LN) || defined(RT)
3273 leal (AA, %eax, SIZE), AA
3278 #if defined(LN) || defined(RT)
3280 sall $2 + BASE_SHIFT, %eax
3281 leal (BB, %eax, 4), BB
/* M=1 micro-kernel: scalar SSE (movss/mulss), one A element per k
   against the broadcast BB panel.  The addss accumulations between the
   visible lines are elided in this excerpt. */
3284 movss 0 * SIZE(AA), %xmm0
3286 movss 4 * SIZE(AA), %xmm1
3288 movss 0 * SIZE(BB), %xmm2
3290 movss 16 * SIZE(BB), %xmm3
3293 #if defined(LT) || defined(RN)
/* Main loop body, unrolled by 8: consumes 8 floats of AA and 128
   floats of BB per pass (see the add at 3412). */
3306 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
/* AMD-only prefetch of upcoming A data. */
3307 prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
3309 movss 4 * SIZE(BB), %xmm2
3312 movss 8 * SIZE(BB), %xmm2
3314 mulss 12 * SIZE(BB), %xmm0
3316 movss 32 * SIZE(BB), %xmm2
3318 movss 1 * SIZE(AA), %xmm0
3322 movss 20 * SIZE(BB), %xmm3
3325 movss 24 * SIZE(BB), %xmm3
3327 mulss 28 * SIZE(BB), %xmm0
3329 movss 48 * SIZE(BB), %xmm3
3331 movss 2 * SIZE(AA), %xmm0
3335 movss 36 * SIZE(BB), %xmm2
3338 movss 40 * SIZE(BB), %xmm2
3340 mulss 44 * SIZE(BB), %xmm0
3342 movss 64 * SIZE(BB), %xmm2
3344 movss 3 * SIZE(AA), %xmm0
3348 movss 52 * SIZE(BB), %xmm3
3351 movss 56 * SIZE(BB), %xmm3
3353 mulss 60 * SIZE(BB), %xmm0
3355 movss 80 * SIZE(BB), %xmm3
/* xmm0 reloaded at offset 8 = first element of the next unrolled pass. */
3357 movss 8 * SIZE(AA), %xmm0
3361 movss 68 * SIZE(BB), %xmm2
3364 movss 72 * SIZE(BB), %xmm2
3366 mulss 76 * SIZE(BB), %xmm1
3368 movss 96 * SIZE(BB), %xmm2
3370 movss 5 * SIZE(AA), %xmm1
3374 movss 84 * SIZE(BB), %xmm3
3377 movss 88 * SIZE(BB), %xmm3
3379 mulss 92 * SIZE(BB), %xmm1
3381 movss 112 * SIZE(BB), %xmm3
3383 movss 6 * SIZE(AA), %xmm1
3387 movss 100 * SIZE(BB), %xmm2
3390 movss 104 * SIZE(BB), %xmm2
3392 mulss 108 * SIZE(BB), %xmm1
3394 movss 128 * SIZE(BB), %xmm2
3396 movss 7 * SIZE(AA), %xmm1
3400 movss 116 * SIZE(BB), %xmm3
3403 movss 120 * SIZE(BB), %xmm3
3405 mulss 124 * SIZE(BB), %xmm1
3407 movss 144 * SIZE(BB), %xmm3
3409 movss 12 * SIZE(AA), %xmm1
/* Advance BB by 128 floats per unrolled pass (AA bump elided). */
3412 addl $128 * SIZE, BB
/* Loop tail: k mod 8 leftover iterations. */
3418 #if defined(LT) || defined(RN)
3424 andl $7, %eax # if (k & 1)
3432 movss 4 * SIZE(BB), %xmm2
3435 movss 8 * SIZE(BB), %xmm2
3437 mulss 12 * SIZE(BB), %xmm0
3439 movss 16 * SIZE(BB), %xmm2
3441 movss 1 * SIZE(AA), %xmm0
/* Post-loop pointer adjustment for the 1-row tile. */
3450 #if defined(LN) || defined(RT)
3462 leal (AA, %eax, SIZE), AA
3464 sall $2 + BASE_SHIFT, %eax
3465 leal (B, %eax, 1), B
3466 leal (BB, %eax, 4), BB
/* M=1 tile: gather the four scalar accumulators, solve, store back. */
3469 #if defined(LN) || defined(LT)
/* Pack four scalar accumulators into one vector (lane interleave). */
3470 unpcklps %xmm6, %xmm4
3471 unpcklps %xmm7, %xmm5
3472 unpcklps %xmm5, %xmm4
/* Reload the 1x4 right-hand side from B. */
3474 movaps 0 * SIZE(B), %xmm1
/* Alternate path (markers elided): four scalars from AA. */
3478 movss 0 * SIZE(AA), %xmm0
3479 movss 1 * SIZE(AA), %xmm1
3480 movss 2 * SIZE(AA), %xmm2
3481 movss 3 * SIZE(AA), %xmm3
/* 1x1 factor from AA: single broadcast (trivial solve step). */
3489 #if defined(LN) || defined(LT)
3490 movss 0 * SIZE(AA), %xmm4
3491 pshufd $0x00, %xmm4, %xmm6
/* Right-side solve against the 4x4 factor in B: forward lane order. */
3496 movaps 0 * SIZE(B), %xmm6
3497 pshufd $0x00, %xmm6, %xmm7
3499 pshufd $0x55, %xmm6, %xmm7
3502 pshufd $0xaa, %xmm6, %xmm7
3505 pshufd $0xff, %xmm6, %xmm7
3509 movaps 4 * SIZE(B), %xmm6
3510 pshufd $0x55, %xmm6, %xmm7
3512 pshufd $0xaa, %xmm6, %xmm7
3515 pshufd $0xff, %xmm6, %xmm7
3519 movaps 8 * SIZE(B), %xmm6
3520 pshufd $0xaa, %xmm6, %xmm7
3522 pshufd $0xff, %xmm6, %xmm7
3526 movaps 12 * SIZE(B), %xmm6
3527 pshufd $0xff, %xmm6, %xmm7
/* Reverse-order variant of the right-side solve. */
3532 movaps 12 * SIZE(B), %xmm6
3533 pshufd $0xff, %xmm6, %xmm7
3535 pshufd $0xaa, %xmm6, %xmm7
3538 pshufd $0x55, %xmm6, %xmm7
3541 pshufd $0x00, %xmm6, %xmm7
3545 movaps 8 * SIZE(B), %xmm6
3546 pshufd $0xaa, %xmm6, %xmm7
3548 pshufd $0x55, %xmm6, %xmm7
3551 pshufd $0x00, %xmm6, %xmm7
3555 movaps 4 * SIZE(B), %xmm6
3556 pshufd $0x55, %xmm6, %xmm7
3558 pshufd $0x00, %xmm6, %xmm7
3562 movaps 0 * SIZE(B), %xmm6
3563 pshufd $0x00, %xmm6, %xmm7
/* Store solved 1x4 row back to B and rebuild its broadcast copy in BB. */
3567 #if defined(LN) || defined(LT)
3568 movaps %xmm1, 0 * SIZE(B)
3570 pshufd $0x00, %xmm1, %xmm0
3571 pshufd $0x55, %xmm1, %xmm2
3572 pshufd $0xaa, %xmm1, %xmm4
3573 pshufd $0xff, %xmm1, %xmm6
3574 movaps %xmm0, 0 * SIZE(BB)
3575 movaps %xmm2, 4 * SIZE(BB)
3576 movaps %xmm4, 8 * SIZE(BB)
3577 movaps %xmm6, 12 * SIZE(BB)
/* Presumed #else path: four scalars back into AA. */
3579 movss %xmm0, 0 * SIZE(AA)
3580 movss %xmm1, 1 * SIZE(AA)
3581 movss %xmm2, 2 * SIZE(AA)
3582 movss %xmm3, 3 * SIZE(AA)
/* Write the 1x4 result to C (one float per column), final pointer
   updates, and the visible tail of the procedure epilogue. */
3589 leal (LDC, LDC, 2), %eax
3591 #if defined(LN) || defined(LT)
/* Lane shuffle before the scalar stores (parallels the wider tiles;
   several unpack results here feed elided intermediate lines). */
3593 unpcklps %xmm5, %xmm1
3594 unpckhps %xmm5, %xmm0
3597 unpcklps %xmm7, %xmm3
3598 unpckhps %xmm7, %xmm4
3601 unpcklps %xmm3, %xmm1
3602 unpckhps %xmm3, %xmm2
3605 unpcklps %xmm4, %xmm0
3606 unpckhps %xmm4, %xmm6
/* One float to each of the four C columns (%eax = 3*LDC). */
3608 movss %xmm1, 0 * SIZE(CO1)
3609 movss %xmm2, 0 * SIZE(CO1, LDC, 1)
3610 movss %xmm0, 0 * SIZE(CO1, LDC, 2)
3611 movss %xmm6, 0 * SIZE(CO1, %eax, 1)
/* Presumed #else path (marker elided). */
3613 movss %xmm0, 0 * SIZE(CO1)
3614 movss %xmm1, 0 * SIZE(CO1, LDC, 1)
3615 movss %xmm2, 0 * SIZE(CO1, LDC, 2)
3616 movss %xmm3, 0 * SIZE(CO1, %eax, 1)
/* Final pointer updates: AA by 1 element per k, B by 4-column panels. */
3623 #if defined(LT) || defined(RN)
3626 leal (AA, %eax, SIZE), AA
3644 sall $BASE_SHIFT, %eax
3652 leal (, %eax, SIZE), %eax
3653 leal (B, %eax, 4), B
3656 #if defined(LT) || defined(RN)
3659 leal (,%eax, SIZE), %eax
3660 leal (B, %eax, 4), B
/* Epilogue (tail visible here): restore the caller's stack pointer
   saved in OLD_STACK during the prologue (prologue not in view). */
3676 movl OLD_STACK, %esp