1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin. */
3 /* All rights reserved. */
5 /* Redistribution and use in source and binary forms, with or */
6 /* without modification, are permitted provided that the following */
7 /* conditions are met: */
9 /* 1. Redistributions of source code must retain the above */
10 /* copyright notice, this list of conditions and the following */
13 /* 2. Redistributions in binary form must reproduce the above */
14 /* copyright notice, this list of conditions and the following */
15 /* disclaimer in the documentation and/or other materials */
16 /* provided with the distribution. */
18 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31 /* POSSIBILITY OF SUCH DAMAGE. */
33 /* The views and conclusions contained in the software and */
34 /* documentation are those of the authors and should not be */
35 /* interpreted as representing official policies, either expressed */
36 /* or implied, of The University of Texas at Austin. */
37 /*********************************************************************/
/* Operand aliases addressed relative to %esi, each displaced by STACK.  */
/* STACK itself is defined outside this excerpt.                          */
/* NOTE(review): the OLD_ prefix suggests these are the incoming          */
/* arguments as laid out by the caller -- confirm against the prologue.   */
/* Offset 16 has no alias among the visible lines.                        */
44 #define OLD_M 4 + STACK(%esi)
45 #define OLD_N 8 + STACK(%esi)
46 #define OLD_K 12 + STACK(%esi)
47 #define OLD_A 20 + STACK(%esi)
48 #define OLD_B 24 + STACK(%esi)
49 #define OLD_C 28 + STACK(%esi)
50 #define OLD_LDC 32 + STACK(%esi)
/* STACK_OFFT is read with movss into %xmm4 by the routine body.          */
51 #define STACK_OFFT 36 + STACK(%esi)
/* Slots addressed relative to %esp.                                      */
/* NOTE(review): presumably the local frame created by the                */
/* `subl $128 + LOCAL_BUFFER_SIZE, %esp` in the routine body, with BUFFER */
/* starting right after the first 128 bytes -- confirm.                   */
59 #define OLD_STACK 40(%esp)
60 #define OFFSET 44(%esp)
63 #define AORIG 56(%esp)
64 #define BORIG 60(%esp)
65 #define BUFFER 128(%esp)
/* Per-CPU prefetch configuration.                                        */
/*   PREFETCH      mnemonic used by KERNEL1 to prefetch from AA           */
/*   PREFETCHW     mnemonic used by the routine body on the CO1 rows      */
/*   PREFETCHSIZE  look-ahead distance; it is multiplied by SIZE where    */
/*                 it is used, so it is expressed in elements, not bytes  */
/* NOTE(review): the closing #endif of each conditional is not part of    */
/* this excerpt.                                                          */
/* OPTERON / BARCELONA / BOBCAT / BULLDOZER: prefetch and prefetchw.      */
67 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
68 #define PREFETCH prefetch
69 #define PREFETCHW prefetchw
70 #define PREFETCHSIZE (16 * 10 + 8)
/* PENTIUM4 / PENTIUMM: prefetcht0 for both roles, distance 96.           */
73 #if defined(PENTIUM4) || defined(PENTIUMM)
74 #define PREFETCH prefetcht0
75 #define PREFETCHW prefetcht0
76 #define PREFETCHSIZE 96
/* PENRYN / DUNNINGTON: same settings as the PENTIUM4 / PENTIUMM case.    */
79 #if defined(PENRYN) || defined(DUNNINGTON)
80 #define PREFETCH prefetcht0
81 #define PREFETCHW prefetcht0
82 #define PREFETCHSIZE 96
/* Conditional for OPTERON or builds without HAVE_SSE2; the lines it      */
/* guards are not part of this excerpt.                                   */
91 #if defined(OPTERON) || !defined(HAVE_SSE2)
/* KERNEL1(address): multiply-accumulate stage 1 of 8 (packed single).    */
/*   address  offset expression, scaled by 1 * SIZE when applied to AA    */
/*            and by 4 * SIZE when applied to BB                          */
/* Entry: %xmm0 holds the A operand and %xmm2 the first B operand; the    */
/*        first product uses %xmm2 without loading it here.               */
/* Work:  %xmm4 += %xmm0 * %xmm2(entry)                                   */
/*        %xmm5 += %xmm0 * BB[ 4 * SIZE]                                  */
/*        %xmm6 += %xmm0 * BB[ 8 * SIZE]                                  */
/*        %xmm7 += %xmm0 * BB[12 * SIZE]                                  */
/*        It is the only stage that issues PREFETCH, on AA at             */
/*        PREFETCHSIZE elements past the scaled address.                  */
/* Exit:  %xmm2 = BB[32 * SIZE] (not consumed by this stage),             */
/*        %xmm0 = AA[ 4 * SIZE].                                          */
/* Overwrites %xmm0 and %xmm2 in addition to the four accumulators.       */
99 #define KERNEL1(address) \
100 mulps %xmm0, %xmm2; \
101 addps %xmm2, %xmm4; \
102 movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
103 mulps %xmm0, %xmm2; \
104 PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \
105 addps %xmm2, %xmm5; \
106 movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
107 mulps %xmm0, %xmm2; \
108 mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
109 addps %xmm2, %xmm6; \
110 movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
111 addps %xmm0, %xmm7; \
112 movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0
/* KERNEL2(address): multiply-accumulate stage 2 of 8 (packed single).    */
/*   address  offset expression, scaled by 1 * SIZE when applied to AA    */
/*            and by 4 * SIZE when applied to BB                          */
/* Entry: %xmm0 holds the A operand and %xmm3 the first B operand; the    */
/*        first product uses %xmm3 without loading it here.               */
/* Work:  %xmm4 += %xmm0 * %xmm3(entry)                                   */
/*        %xmm5 += %xmm0 * BB[20 * SIZE]                                  */
/*        %xmm6 += %xmm0 * BB[24 * SIZE]                                  */
/*        %xmm7 += %xmm0 * BB[28 * SIZE]                                  */
/* Exit:  %xmm3 = BB[48 * SIZE] (not consumed by this stage),             */
/*        %xmm0 = AA[ 8 * SIZE].                                          */
/* Overwrites %xmm0 and %xmm3 in addition to the four accumulators.       */
114 #define KERNEL2(address) \
115 mulps %xmm0, %xmm3; \
116 addps %xmm3, %xmm4; \
117 movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
118 mulps %xmm0, %xmm3; \
119 addps %xmm3, %xmm5; \
120 movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
121 mulps %xmm0, %xmm3; \
122 mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
123 addps %xmm3, %xmm6; \
124 movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
125 addps %xmm0, %xmm7; \
126 movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0
/* KERNEL3(address): multiply-accumulate stage 3 of 8 (packed single).    */
/*   address  offset expression, scaled by 1 * SIZE when applied to AA    */
/*            and by 4 * SIZE when applied to BB                          */
/* Entry: %xmm0 holds the A operand and %xmm2 the first B operand; the    */
/*        first product uses %xmm2 without loading it here.               */
/* Work:  %xmm4 += %xmm0 * %xmm2(entry)                                   */
/*        %xmm5 += %xmm0 * BB[36 * SIZE]                                  */
/*        %xmm6 += %xmm0 * BB[40 * SIZE]                                  */
/*        %xmm7 += %xmm0 * BB[44 * SIZE]                                  */
/* Exit:  %xmm2 = BB[64 * SIZE] (not consumed by this stage),             */
/*        %xmm0 = AA[12 * SIZE].                                          */
/* Overwrites %xmm0 and %xmm2 in addition to the four accumulators.       */
128 #define KERNEL3(address) \
129 mulps %xmm0, %xmm2; \
130 addps %xmm2, %xmm4; \
131 movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
132 mulps %xmm0, %xmm2; \
133 addps %xmm2, %xmm5; \
134 movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
135 mulps %xmm0, %xmm2; \
136 mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
137 addps %xmm2, %xmm6; \
138 movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
139 addps %xmm0, %xmm7; \
140 movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0
/* KERNEL4(address): multiply-accumulate stage 4 of 8 (packed single).    */
/*   address  offset expression, scaled by 1 * SIZE when applied to AA    */
/*            and by 4 * SIZE when applied to BB                          */
/* Entry: %xmm0 holds the A operand and %xmm3 the first B operand; the    */
/*        first product uses %xmm3 without loading it here.               */
/* Work:  %xmm4 += %xmm0 * %xmm3(entry)                                   */
/*        %xmm5 += %xmm0 * BB[52 * SIZE]                                  */
/*        %xmm6 += %xmm0 * BB[56 * SIZE]                                  */
/*        %xmm7 += %xmm0 * BB[60 * SIZE]                                  */
/* Exit:  %xmm3 = BB[80 * SIZE] (not consumed by this stage),             */
/*        %xmm0 = AA[32 * SIZE]; this is the last stage that uses %xmm0   */
/*        as the A operand, stages 5 to 8 use %xmm1.                      */
/* Overwrites %xmm0 and %xmm3 in addition to the four accumulators.       */
142 #define KERNEL4(address) \
143 mulps %xmm0, %xmm3; \
144 addps %xmm3, %xmm4; \
145 movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
146 mulps %xmm0, %xmm3; \
147 addps %xmm3, %xmm5; \
148 movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
149 mulps %xmm0, %xmm3; \
150 mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
151 addps %xmm3, %xmm6; \
152 movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
153 addps %xmm0, %xmm7; \
154 movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0
/* KERNEL5(address): multiply-accumulate stage 5 of 8 (packed single).    */
/*   address  offset expression, scaled by 1 * SIZE when applied to AA    */
/*            and by 4 * SIZE when applied to BB                          */
/* Entry: %xmm1 holds the A operand and %xmm2 the first B operand; the    */
/*        first product uses %xmm2 without loading it here.               */
/* Work:  %xmm4 += %xmm1 * %xmm2(entry)                                   */
/*        %xmm5 += %xmm1 * BB[68 * SIZE]                                  */
/*        %xmm6 += %xmm1 * BB[72 * SIZE]                                  */
/*        %xmm7 += %xmm1 * BB[76 * SIZE]                                  */
/* Exit:  %xmm2 = BB[96 * SIZE] (not consumed by this stage),             */
/*        %xmm1 = AA[20 * SIZE].                                          */
/* Overwrites %xmm1 and %xmm2 in addition to the four accumulators.       */
156 #define KERNEL5(address) \
157 mulps %xmm1, %xmm2; \
158 addps %xmm2, %xmm4; \
159 movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
160 mulps %xmm1, %xmm2; \
161 addps %xmm2, %xmm5; \
162 movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
163 mulps %xmm1, %xmm2; \
164 mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
165 addps %xmm2, %xmm6; \
166 movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
167 addps %xmm1, %xmm7; \
168 movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1
/* KERNEL6(address): multiply-accumulate stage 6 of 8 (packed single).    */
/*   address  offset expression, scaled by 1 * SIZE when applied to AA    */
/*            and by 4 * SIZE when applied to BB                          */
/* Entry: %xmm1 holds the A operand and %xmm3 the first B operand; the    */
/*        first product uses %xmm3 without loading it here.               */
/* Work:  %xmm4 += %xmm1 * %xmm3(entry)                                   */
/*        %xmm5 += %xmm1 * BB[84 * SIZE]                                  */
/*        %xmm6 += %xmm1 * BB[88 * SIZE]                                  */
/*        %xmm7 += %xmm1 * BB[92 * SIZE]                                  */
/* Exit:  %xmm3 = BB[112 * SIZE] (not consumed by this stage),            */
/*        %xmm1 = AA[ 24 * SIZE].                                         */
/* Overwrites %xmm1 and %xmm3 in addition to the four accumulators.       */
170 #define KERNEL6(address) \
171 mulps %xmm1, %xmm3; \
172 addps %xmm3, %xmm4; \
173 movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
174 mulps %xmm1, %xmm3; \
175 addps %xmm3, %xmm5; \
176 movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
177 mulps %xmm1, %xmm3; \
178 mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
179 addps %xmm3, %xmm6; \
180 movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
181 addps %xmm1, %xmm7; \
182 movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1
/* KERNEL7(address): multiply-accumulate stage 7 of 8 (packed single).    */
/*   address  offset expression, scaled by 1 * SIZE when applied to AA    */
/*            and by 4 * SIZE when applied to BB                          */
/* Entry: %xmm1 holds the A operand and %xmm2 the first B operand; the    */
/*        first product uses %xmm2 without loading it here.               */
/* Work:  %xmm4 += %xmm1 * %xmm2(entry)                                   */
/*        %xmm5 += %xmm1 * BB[100 * SIZE]                                 */
/*        %xmm6 += %xmm1 * BB[104 * SIZE]                                 */
/*        %xmm7 += %xmm1 * BB[108 * SIZE]                                 */
/* Exit:  %xmm2 = BB[128 * SIZE] (not consumed by this stage),            */
/*        %xmm1 = AA[ 28 * SIZE].                                         */
/* Overwrites %xmm1 and %xmm2 in addition to the four accumulators.       */
184 #define KERNEL7(address) \
185 mulps %xmm1, %xmm2; \
186 addps %xmm2, %xmm4; \
187 movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
188 mulps %xmm1, %xmm2; \
189 addps %xmm2, %xmm5; \
190 movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
191 mulps %xmm1, %xmm2; \
192 mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
193 addps %xmm2, %xmm6; \
194 movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
195 addps %xmm1, %xmm7; \
196 movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1
/* KERNEL8(address): multiply-accumulate stage 8 of 8 (packed single).    */
/*   address  offset expression, scaled by 1 * SIZE when applied to AA    */
/*            and by 4 * SIZE when applied to BB                          */
/* Entry: %xmm1 holds the A operand and %xmm3 the first B operand; the    */
/*        first product uses %xmm3 without loading it here.               */
/* Work:  %xmm4 += %xmm1 * %xmm3(entry)                                   */
/*        %xmm5 += %xmm1 * BB[116 * SIZE]                                 */
/*        %xmm6 += %xmm1 * BB[120 * SIZE]                                 */
/*        %xmm7 += %xmm1 * BB[124 * SIZE]                                 */
/* Exit:  %xmm3 = BB[144 * SIZE] (not consumed by this stage),            */
/*        %xmm1 = AA[ 48 * SIZE].                                         */
/* Overwrites %xmm1 and %xmm3 in addition to the four accumulators.       */
/* NOTE(review): unlike KERNEL1 to KERNEL7 the expansion ends with a      */
/* trailing `;`; left untouched because the call sites are not visible.   */
198 #define KERNEL8(address) \
199 mulps %xmm1, %xmm3; \
200 addps %xmm3, %xmm4; \
201 movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
202 mulps %xmm1, %xmm3; \
203 addps %xmm3, %xmm5; \
204 movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
205 mulps %xmm1, %xmm3; \
206 mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
207 addps %xmm3, %xmm6; \
208 movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
209 addps %xmm1, %xmm7; \
210 movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1;
223 subl $128 + LOCAL_BUFFER_SIZE, %esp
238 movss STACK_OFFT, %xmm4
249 leal (, LDC, SIZE), LDC
253 leal (, %eax, SIZE), %eax
261 leal (, %eax, SIZE), %eax
295 sall $2 + BASE_SHIFT, %eax
299 #if defined(LN) || defined(RT)
302 sall $2 + BASE_SHIFT, %eax
304 leal (BB, %eax, 4), BB
312 #if defined(LT) || defined(RN)
323 movaps 0 * SIZE(B), %xmm3
324 movaps 4 * SIZE(B), %xmm7
326 pshufd $0x00, %xmm3, %xmm0
327 pshufd $0x55, %xmm3, %xmm1
328 pshufd $0xaa, %xmm3, %xmm2
329 pshufd $0xff, %xmm3, %xmm3
331 pshufd $0x00, %xmm7, %xmm4
332 pshufd $0x55, %xmm7, %xmm5
333 pshufd $0xaa, %xmm7, %xmm6
334 pshufd $0xff, %xmm7, %xmm7
336 movaps %xmm0, 0 * SIZE(BB)
337 movaps %xmm1, 4 * SIZE(BB)
338 movaps %xmm2, 8 * SIZE(BB)
339 movaps %xmm3, 12 * SIZE(BB)
340 movaps %xmm4, 16 * SIZE(BB)
341 movaps %xmm5, 20 * SIZE(BB)
342 movaps %xmm6, 24 * SIZE(BB)
343 movaps %xmm7, 28 * SIZE(BB)
346 addl $32 * SIZE, %ecx
352 #if defined(LT) || defined(RN)
362 movaps 0 * SIZE(B), %xmm3
364 pshufd $0x00, %xmm3, %xmm0
365 pshufd $0x55, %xmm3, %xmm1
366 pshufd $0xaa, %xmm3, %xmm2
367 pshufd $0xff, %xmm3, %xmm3
369 movaps %xmm0, 0 * SIZE(BB)
370 movaps %xmm1, 4 * SIZE(BB)
371 movaps %xmm2, 8 * SIZE(BB)
372 movaps %xmm3, 12 * SIZE(BB)
378 #if defined(LT) || defined(RN)
385 leal (, LDC, 4), %eax
400 sall $BASE_SHIFT, %eax
404 #if defined(LN) || defined(RT)
407 leal (AA, %eax, SIZE), AA
412 #if defined(LN) || defined(RT)
414 sall $2 + BASE_SHIFT, %eax
415 leal (BB, %eax, 4), BB
418 movss 0 * SIZE(AA), %xmm0
420 movss 4 * SIZE(AA), %xmm1
422 movss 0 * SIZE(BB), %xmm2
424 movss 16 * SIZE(BB), %xmm3
427 #if defined(LT) || defined(RN)
440 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
441 prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
443 movss 4 * SIZE(BB), %xmm2
446 movss 8 * SIZE(BB), %xmm2
448 mulss 12 * SIZE(BB), %xmm0
450 movss 32 * SIZE(BB), %xmm2
452 movss 1 * SIZE(AA), %xmm0
456 movss 20 * SIZE(BB), %xmm3
459 movss 24 * SIZE(BB), %xmm3
461 mulss 28 * SIZE(BB), %xmm0
463 movss 48 * SIZE(BB), %xmm3
465 movss 2 * SIZE(AA), %xmm0
469 movss 36 * SIZE(BB), %xmm2
472 movss 40 * SIZE(BB), %xmm2
474 mulss 44 * SIZE(BB), %xmm0
476 movss 64 * SIZE(BB), %xmm2
478 movss 3 * SIZE(AA), %xmm0
482 movss 52 * SIZE(BB), %xmm3
485 movss 56 * SIZE(BB), %xmm3
487 mulss 60 * SIZE(BB), %xmm0
489 movss 80 * SIZE(BB), %xmm3
491 movss 8 * SIZE(AA), %xmm0
495 movss 68 * SIZE(BB), %xmm2
498 movss 72 * SIZE(BB), %xmm2
500 mulss 76 * SIZE(BB), %xmm1
502 movss 96 * SIZE(BB), %xmm2
504 movss 5 * SIZE(AA), %xmm1
508 movss 84 * SIZE(BB), %xmm3
511 movss 88 * SIZE(BB), %xmm3
513 mulss 92 * SIZE(BB), %xmm1
515 movss 112 * SIZE(BB), %xmm3
517 movss 6 * SIZE(AA), %xmm1
521 movss 100 * SIZE(BB), %xmm2
524 movss 104 * SIZE(BB), %xmm2
526 mulss 108 * SIZE(BB), %xmm1
528 movss 128 * SIZE(BB), %xmm2
530 movss 7 * SIZE(AA), %xmm1
534 movss 116 * SIZE(BB), %xmm3
537 movss 120 * SIZE(BB), %xmm3
539 mulss 124 * SIZE(BB), %xmm1
541 movss 144 * SIZE(BB), %xmm3
543 movss 12 * SIZE(AA), %xmm1
552 #if defined(LT) || defined(RN)
558 andl $7, %eax # eax &= 7 (low three bits, i.e. the value modulo 8)
566 movss 4 * SIZE(BB), %xmm2
569 movss 8 * SIZE(BB), %xmm2
571 mulss 12 * SIZE(BB), %xmm0
573 movss 16 * SIZE(BB), %xmm2
575 movss 1 * SIZE(AA), %xmm0
584 #if defined(LN) || defined(RT)
596 leal (AA, %eax, SIZE), AA
598 sall $2 + BASE_SHIFT, %eax
600 leal (BB, %eax, 4), BB
603 #if defined(LN) || defined(LT)
604 unpcklps %xmm6, %xmm4
605 unpcklps %xmm7, %xmm5
606 unpcklps %xmm5, %xmm4
608 movaps 0 * SIZE(B), %xmm1
612 movss 0 * SIZE(AA), %xmm0
613 movss 1 * SIZE(AA), %xmm1
614 movss 2 * SIZE(AA), %xmm2
615 movss 3 * SIZE(AA), %xmm3
623 #if defined(LN) || defined(LT)
624 movss 0 * SIZE(AA), %xmm4
625 pshufd $0x00, %xmm4, %xmm6
630 movaps 0 * SIZE(B), %xmm6
631 pshufd $0x00, %xmm6, %xmm7
633 pshufd $0x55, %xmm6, %xmm7
636 pshufd $0xaa, %xmm6, %xmm7
639 pshufd $0xff, %xmm6, %xmm7
643 movaps 4 * SIZE(B), %xmm6
644 pshufd $0x55, %xmm6, %xmm7
646 pshufd $0xaa, %xmm6, %xmm7
649 pshufd $0xff, %xmm6, %xmm7
653 movaps 8 * SIZE(B), %xmm6
654 pshufd $0xaa, %xmm6, %xmm7
656 pshufd $0xff, %xmm6, %xmm7
660 movaps 12 * SIZE(B), %xmm6
661 pshufd $0xff, %xmm6, %xmm7
666 movaps 12 * SIZE(B), %xmm6
667 pshufd $0xff, %xmm6, %xmm7
669 pshufd $0xaa, %xmm6, %xmm7
672 pshufd $0x55, %xmm6, %xmm7
675 pshufd $0x00, %xmm6, %xmm7
679 movaps 8 * SIZE(B), %xmm6
680 pshufd $0xaa, %xmm6, %xmm7
682 pshufd $0x55, %xmm6, %xmm7
685 pshufd $0x00, %xmm6, %xmm7
689 movaps 4 * SIZE(B), %xmm6
690 pshufd $0x55, %xmm6, %xmm7
692 pshufd $0x00, %xmm6, %xmm7
696 movaps 0 * SIZE(B), %xmm6
697 pshufd $0x00, %xmm6, %xmm7
701 #if defined(LN) || defined(LT)
702 movaps %xmm1, 0 * SIZE(B)
704 pshufd $0x00, %xmm1, %xmm0
705 pshufd $0x55, %xmm1, %xmm2
706 pshufd $0xaa, %xmm1, %xmm4
707 pshufd $0xff, %xmm1, %xmm6
708 movaps %xmm0, 0 * SIZE(BB)
709 movaps %xmm2, 4 * SIZE(BB)
710 movaps %xmm4, 8 * SIZE(BB)
711 movaps %xmm6, 12 * SIZE(BB)
713 movss %xmm0, 0 * SIZE(AA)
714 movss %xmm1, 1 * SIZE(AA)
715 movss %xmm2, 2 * SIZE(AA)
716 movss %xmm3, 3 * SIZE(AA)
723 leal (LDC, LDC, 2), %eax
725 #if defined(LN) || defined(LT)
727 unpcklps %xmm5, %xmm1
728 unpckhps %xmm5, %xmm0
731 unpcklps %xmm7, %xmm3
732 unpckhps %xmm7, %xmm4
735 unpcklps %xmm3, %xmm1
736 unpckhps %xmm3, %xmm2
739 unpcklps %xmm4, %xmm0
740 unpckhps %xmm4, %xmm6
742 movss %xmm1, 0 * SIZE(CO1)
743 movss %xmm2, 0 * SIZE(CO1, LDC, 1)
744 movss %xmm0, 0 * SIZE(CO1, LDC, 2)
745 movss %xmm6, 0 * SIZE(CO1, %eax, 1)
747 movss %xmm0, 0 * SIZE(CO1)
748 movss %xmm1, 0 * SIZE(CO1, LDC, 1)
749 movss %xmm2, 0 * SIZE(CO1, LDC, 2)
750 movss %xmm3, 0 * SIZE(CO1, %eax, 1)
757 #if defined(LT) || defined(RN)
760 leal (AA, %eax, SIZE), AA
778 sall $BASE_SHIFT, %eax
789 sall $1 + BASE_SHIFT, %eax
793 #if defined(LN) || defined(RT)
796 leal (, %eax, SIZE), %eax
797 leal (AA, %eax, 2), AA
802 #if defined(LN) || defined(RT)
804 sall $2 + BASE_SHIFT, %eax
805 leal (BB, %eax, 4), BB
811 movsd 0 * SIZE(AA), %xmm0
816 movsd 8 * SIZE(AA), %xmm1
818 movaps 0 * SIZE(BB), %xmm2
820 movaps 16 * SIZE(BB), %xmm3
823 #if defined(LT) || defined(RN)
836 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
837 prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
839 movaps 4 * SIZE(BB), %xmm2
842 movaps 8 * SIZE(BB), %xmm2
845 movaps 12 * SIZE(BB), %xmm2
847 movsd 2 * SIZE(AA), %xmm0
849 movaps 32 * SIZE(BB), %xmm2
853 movaps 20 * SIZE(BB), %xmm3
856 movaps 24 * SIZE(BB), %xmm3
859 movaps 28 * SIZE(BB), %xmm3
861 movsd 4 * SIZE(AA), %xmm0
863 movaps 48 * SIZE(BB), %xmm3
867 movaps 36 * SIZE(BB), %xmm2
870 movaps 40 * SIZE(BB), %xmm2
873 movaps 44 * SIZE(BB), %xmm2
875 movsd 6 * SIZE(AA), %xmm0
877 movaps 64 * SIZE(BB), %xmm2
881 movaps 52 * SIZE(BB), %xmm3
884 movaps 56 * SIZE(BB), %xmm3
887 movaps 60 * SIZE(BB), %xmm3
889 movsd 16 * SIZE(AA), %xmm0
891 movaps 80 * SIZE(BB), %xmm3
895 movaps 68 * SIZE(BB), %xmm2
898 movaps 72 * SIZE(BB), %xmm2
901 movaps 76 * SIZE(BB), %xmm2
903 movsd 10 * SIZE(AA), %xmm1
905 movaps 96 * SIZE(BB), %xmm2
909 movaps 84 * SIZE(BB), %xmm3
912 movaps 88 * SIZE(BB), %xmm3
915 movaps 92 * SIZE(BB), %xmm3
917 movsd 12 * SIZE(AA), %xmm1
919 movaps 112 * SIZE(BB), %xmm3
923 movaps 100 * SIZE(BB), %xmm2
926 movaps 104 * SIZE(BB), %xmm2
929 movaps 108 * SIZE(BB), %xmm2
931 movsd 14 * SIZE(AA), %xmm1
933 movaps 128 * SIZE(BB), %xmm2
937 movaps 116 * SIZE(BB), %xmm3
940 movaps 120 * SIZE(BB), %xmm3
943 movaps 124 * SIZE(BB), %xmm3
945 movsd 24 * SIZE(AA), %xmm1
947 movaps 144 * SIZE(BB), %xmm3
956 #if defined(LT) || defined(RN)
962 andl $7, %eax # eax &= 7 (low three bits, i.e. the value modulo 8)
970 movaps 4 * SIZE(BB), %xmm2
973 movaps 8 * SIZE(BB), %xmm2
976 movaps 12 * SIZE(BB), %xmm2
978 movsd 2 * SIZE(AA), %xmm0
980 movaps 16 * SIZE(BB), %xmm2
989 #if defined(LN) || defined(RT)
1001 sall $1 + BASE_SHIFT, %eax
1002 leal (AA, %eax, 1), AA
1003 leal (B, %eax, 2), B
1004 leal (BB, %eax, 8), BB
1007 #if defined(LN) || defined(LT)
1008 unpcklps %xmm6, %xmm4
1009 unpcklps %xmm7, %xmm5
1012 unpcklps %xmm5, %xmm4
1013 unpckhps %xmm5, %xmm6
1015 movaps 0 * SIZE(B), %xmm1
1016 movaps 4 * SIZE(B), %xmm3
1024 movsd 0 * SIZE(AA), %xmm0
1028 movsd 2 * SIZE(AA), %xmm1
1032 movsd 4 * SIZE(AA), %xmm2
1036 movsd 6 * SIZE(AA), %xmm3
1045 movaps 0 * SIZE(AA), %xmm4
1046 pshufd $0xff, %xmm4, %xmm6
1048 pshufd $0xaa, %xmm4, %xmm6
1052 pshufd $0x00, %xmm4, %xmm6
1057 movaps 0 * SIZE(AA), %xmm4
1058 pshufd $0x00, %xmm4, %xmm6
1061 pshufd $0x55, %xmm4, %xmm6
1065 pshufd $0xff, %xmm4, %xmm6
1070 movaps 0 * SIZE(B), %xmm6
1071 pshufd $0x00, %xmm6, %xmm7
1073 pshufd $0x55, %xmm6, %xmm7
1076 pshufd $0xaa, %xmm6, %xmm7
1079 pshufd $0xff, %xmm6, %xmm7
1083 movaps 4 * SIZE(B), %xmm6
1084 pshufd $0x55, %xmm6, %xmm7
1086 pshufd $0xaa, %xmm6, %xmm7
1089 pshufd $0xff, %xmm6, %xmm7
1093 movaps 8 * SIZE(B), %xmm6
1094 pshufd $0xaa, %xmm6, %xmm7
1096 pshufd $0xff, %xmm6, %xmm7
1100 movaps 12 * SIZE(B), %xmm6
1101 pshufd $0xff, %xmm6, %xmm7
1106 movaps 12 * SIZE(B), %xmm6
1107 pshufd $0xff, %xmm6, %xmm7
1109 pshufd $0xaa, %xmm6, %xmm7
1112 pshufd $0x55, %xmm6, %xmm7
1115 pshufd $0x00, %xmm6, %xmm7
1119 movaps 8 * SIZE(B), %xmm6
1120 pshufd $0xaa, %xmm6, %xmm7
1122 pshufd $0x55, %xmm6, %xmm7
1125 pshufd $0x00, %xmm6, %xmm7
1129 movaps 4 * SIZE(B), %xmm6
1130 pshufd $0x55, %xmm6, %xmm7
1132 pshufd $0x00, %xmm6, %xmm7
1136 movaps 0 * SIZE(B), %xmm6
1137 pshufd $0x00, %xmm6, %xmm7
1141 #if defined(LN) || defined(LT)
1142 movaps %xmm1, 0 * SIZE(B)
1143 movaps %xmm3, 4 * SIZE(B)
1145 pshufd $0x00, %xmm1, %xmm0
1146 pshufd $0x55, %xmm1, %xmm2
1147 pshufd $0xaa, %xmm1, %xmm4
1148 pshufd $0xff, %xmm1, %xmm6
1149 movaps %xmm0, 0 * SIZE(BB)
1150 movaps %xmm2, 4 * SIZE(BB)
1151 movaps %xmm4, 8 * SIZE(BB)
1152 movaps %xmm6, 12 * SIZE(BB)
1154 pshufd $0x00, %xmm3, %xmm0
1155 pshufd $0x55, %xmm3, %xmm2
1156 pshufd $0xaa, %xmm3, %xmm4
1157 pshufd $0xff, %xmm3, %xmm6
1158 movaps %xmm0, 16 * SIZE(BB)
1159 movaps %xmm2, 20 * SIZE(BB)
1160 movaps %xmm4, 24 * SIZE(BB)
1161 movaps %xmm6, 28 * SIZE(BB)
1163 movlps %xmm0, 0 * SIZE(AA)
1164 movlps %xmm1, 2 * SIZE(AA)
1165 movlps %xmm2, 4 * SIZE(AA)
1166 movlps %xmm3, 6 * SIZE(AA)
1173 leal (LDC, LDC, 2), %eax
1175 #if defined(LN) || defined(LT)
1177 unpcklps %xmm5, %xmm1
1178 unpckhps %xmm5, %xmm0
1181 unpcklps %xmm7, %xmm3
1182 unpckhps %xmm7, %xmm4
1185 unpcklps %xmm3, %xmm1
1186 unpckhps %xmm3, %xmm2
1189 unpcklps %xmm4, %xmm0
1190 unpckhps %xmm4, %xmm6
1192 movlps %xmm1, 0 * SIZE(CO1)
1193 movlps %xmm2, 0 * SIZE(CO1, LDC, 1)
1194 movlps %xmm0, 0 * SIZE(CO1, LDC, 2)
1195 movlps %xmm6, 0 * SIZE(CO1, %eax, 1)
1197 movlps %xmm0, 0 * SIZE(CO1)
1198 movlps %xmm1, 0 * SIZE(CO1, LDC, 1)
1199 movlps %xmm2, 0 * SIZE(CO1, LDC, 2)
1200 movlps %xmm3, 0 * SIZE(CO1, %eax, 1)
1207 #if defined(LT) || defined(RN)
1210 leal (,%eax, SIZE), %eax
1211 leal (AA, %eax, 2), AA
1229 sall $1 + BASE_SHIFT, %eax
1236 sarl $2, %ebx # i = (m >> 2)
1243 sall $2 + BASE_SHIFT, %eax
1247 #if defined(LN) || defined(RT)
1250 leal (, %eax, SIZE), %eax
1251 leal (AA, %eax, 4), AA
1256 #if defined(LN) || defined(RT)
1258 sall $2 + BASE_SHIFT, %eax
1259 leal (BB, %eax, 4), BB
1262 movaps 0 * SIZE(AA), %xmm0
1264 movaps 16 * SIZE(AA), %xmm1
1266 movaps 0 * SIZE(BB), %xmm2
1268 movaps 16 * SIZE(BB), %xmm3
1271 leal (LDC, LDC, 2), %eax
1273 PREFETCHW -4 * SIZE(CO1)
1274 PREFETCHW -4 * SIZE(CO1, LDC)
1275 PREFETCHW -4 * SIZE(CO1, LDC, 2)
1276 PREFETCHW -4 * SIZE(CO1, %eax)
1278 #if defined(LT) || defined(RN)
1298 addl $128 * SIZE, BB
1305 #if defined(LT) || defined(RN)
1311 andl $7, %eax # eax &= 7 (low three bits, i.e. the value modulo 8)
1319 movaps 4 * SIZE(BB), %xmm2
1322 movaps 8 * SIZE(BB), %xmm2
1324 mulps 12 * SIZE(BB), %xmm0
1326 movaps 16 * SIZE(BB), %xmm2
1328 movaps 4 * SIZE(AA), %xmm0
1337 #if defined(LN) || defined(RT)
1349 sall $2 + BASE_SHIFT, %eax
1350 leal (AA, %eax, 1), AA
1351 leal (B, %eax, 1), B
1352 leal (BB, %eax, 4), BB
1355 #if defined(LN) || defined(LT)
1357 unpcklps %xmm6, %xmm4
1358 unpckhps %xmm6, %xmm0
1361 unpcklps %xmm7, %xmm5
1362 unpckhps %xmm7, %xmm1
1365 unpcklps %xmm5, %xmm4
1366 unpckhps %xmm5, %xmm6
1369 unpcklps %xmm1, %xmm0
1370 unpckhps %xmm1, %xmm2
1372 movaps 0 * SIZE(B), %xmm1
1373 movaps 4 * SIZE(B), %xmm3
1374 movaps 8 * SIZE(B), %xmm5
1375 movaps 12 * SIZE(B), %xmm7
1382 movaps 0 * SIZE(AA), %xmm0
1383 movaps 4 * SIZE(AA), %xmm1
1384 movaps 8 * SIZE(AA), %xmm2
1385 movaps 12 * SIZE(AA), %xmm3
1394 movaps 12 * SIZE(AA), %xmm4
1395 pshufd $0xff, %xmm4, %xmm6
1397 pshufd $0xaa, %xmm4, %xmm6
1400 pshufd $0x55, %xmm4, %xmm6
1403 pshufd $0x00, %xmm4, %xmm6
1407 movaps 8 * SIZE(AA), %xmm4
1408 pshufd $0xaa, %xmm4, %xmm6
1410 pshufd $0x55, %xmm4, %xmm6
1413 pshufd $0x00, %xmm4, %xmm6
1417 movaps 4 * SIZE(AA), %xmm4
1418 pshufd $0x55, %xmm4, %xmm6
1420 pshufd $0x00, %xmm4, %xmm6
1424 movaps 0 * SIZE(AA), %xmm4
1425 pshufd $0x00, %xmm4, %xmm6
1430 movaps 0 * SIZE(AA), %xmm4
1431 pshufd $0x00, %xmm4, %xmm6
1434 pshufd $0x55, %xmm4, %xmm6
1437 pshufd $0xaa, %xmm4, %xmm6
1440 pshufd $0xff, %xmm4, %xmm6
1444 movaps 4 * SIZE(AA), %xmm4
1445 pshufd $0x55, %xmm4, %xmm6
1447 pshufd $0xaa, %xmm4, %xmm6
1450 pshufd $0xff, %xmm4, %xmm6
1454 movaps 8 * SIZE(AA), %xmm4
1455 pshufd $0xaa, %xmm4, %xmm6
1457 pshufd $0xff, %xmm4, %xmm6
1461 movaps 12 * SIZE(AA), %xmm4
1462 pshufd $0xff, %xmm4, %xmm6
1467 movaps 0 * SIZE(B), %xmm6
1468 pshufd $0x00, %xmm6, %xmm7
1470 pshufd $0x55, %xmm6, %xmm7
1473 pshufd $0xaa, %xmm6, %xmm7
1476 pshufd $0xff, %xmm6, %xmm7
1480 movaps 4 * SIZE(B), %xmm6
1481 pshufd $0x55, %xmm6, %xmm7
1483 pshufd $0xaa, %xmm6, %xmm7
1486 pshufd $0xff, %xmm6, %xmm7
1490 movaps 8 * SIZE(B), %xmm6
1491 pshufd $0xaa, %xmm6, %xmm7
1493 pshufd $0xff, %xmm6, %xmm7
1497 movaps 12 * SIZE(B), %xmm6
1498 pshufd $0xff, %xmm6, %xmm7
1503 movaps 12 * SIZE(B), %xmm6
1504 pshufd $0xff, %xmm6, %xmm7
1506 pshufd $0xaa, %xmm6, %xmm7
1509 pshufd $0x55, %xmm6, %xmm7
1512 pshufd $0x00, %xmm6, %xmm7
1516 movaps 8 * SIZE(B), %xmm6
1517 pshufd $0xaa, %xmm6, %xmm7
1519 pshufd $0x55, %xmm6, %xmm7
1522 pshufd $0x00, %xmm6, %xmm7
1526 movaps 4 * SIZE(B), %xmm6
1527 pshufd $0x55, %xmm6, %xmm7
1529 pshufd $0x00, %xmm6, %xmm7
1533 movaps 0 * SIZE(B), %xmm6
1534 pshufd $0x00, %xmm6, %xmm7
1538 #if defined(LN) || defined(LT)
1539 movaps %xmm1, 0 * SIZE(B)
1540 movaps %xmm3, 4 * SIZE(B)
1541 movaps %xmm5, 8 * SIZE(B)
1542 movaps %xmm7, 12 * SIZE(B)
1544 pshufd $0x00, %xmm1, %xmm0
1545 pshufd $0x55, %xmm1, %xmm2
1546 pshufd $0xaa, %xmm1, %xmm4
1547 pshufd $0xff, %xmm1, %xmm6
1548 movaps %xmm0, 0 * SIZE(BB)
1549 movaps %xmm2, 4 * SIZE(BB)
1550 movaps %xmm4, 8 * SIZE(BB)
1551 movaps %xmm6, 12 * SIZE(BB)
1553 pshufd $0x00, %xmm3, %xmm0
1554 pshufd $0x55, %xmm3, %xmm2
1555 pshufd $0xaa, %xmm3, %xmm4
1556 pshufd $0xff, %xmm3, %xmm6
1557 movaps %xmm0, 16 * SIZE(BB)
1558 movaps %xmm2, 20 * SIZE(BB)
1559 movaps %xmm4, 24 * SIZE(BB)
1560 movaps %xmm6, 28 * SIZE(BB)
1562 pshufd $0x00, %xmm5, %xmm0
1563 pshufd $0x55, %xmm5, %xmm2
1564 pshufd $0xaa, %xmm5, %xmm4
1565 pshufd $0xff, %xmm5, %xmm6
1566 movaps %xmm0, 32 * SIZE(BB)
1567 movaps %xmm2, 36 * SIZE(BB)
1568 movaps %xmm4, 40 * SIZE(BB)
1569 movaps %xmm6, 44 * SIZE(BB)
1571 pshufd $0x00, %xmm7, %xmm0
1572 pshufd $0x55, %xmm7, %xmm2
1573 pshufd $0xaa, %xmm7, %xmm4
1574 pshufd $0xff, %xmm7, %xmm6
1575 movaps %xmm0, 48 * SIZE(BB)
1576 movaps %xmm2, 52 * SIZE(BB)
1577 movaps %xmm4, 56 * SIZE(BB)
1578 movaps %xmm6, 60 * SIZE(BB)
1580 movaps %xmm0, 0 * SIZE(AA)
1581 movaps %xmm1, 4 * SIZE(AA)
1582 movaps %xmm2, 8 * SIZE(AA)
1583 movaps %xmm3, 12 * SIZE(AA)
1590 leal (LDC, LDC, 2), %eax
1592 #if defined(LN) || defined(LT)
1594 unpcklps %xmm5, %xmm1
1595 unpckhps %xmm5, %xmm0
1598 unpcklps %xmm7, %xmm3
1599 unpckhps %xmm7, %xmm4
1602 unpcklps %xmm3, %xmm1
1603 unpckhps %xmm3, %xmm2
1606 unpcklps %xmm4, %xmm0
1607 unpckhps %xmm4, %xmm6
1609 movlps %xmm1, 0 * SIZE(CO1)
1610 movhps %xmm1, 2 * SIZE(CO1)
1611 movlps %xmm2, 0 * SIZE(CO1, LDC, 1)
1612 movhps %xmm2, 2 * SIZE(CO1, LDC, 1)
1613 movlps %xmm0, 0 * SIZE(CO1, LDC, 2)
1614 movhps %xmm0, 2 * SIZE(CO1, LDC, 2)
1615 movlps %xmm6, 0 * SIZE(CO1, %eax, 1)
1616 movhps %xmm6, 2 * SIZE(CO1, %eax, 1)
1618 movlps %xmm0, 0 * SIZE(CO1)
1619 movhps %xmm0, 2 * SIZE(CO1)
1620 movlps %xmm1, 0 * SIZE(CO1, LDC, 1)
1621 movhps %xmm1, 2 * SIZE(CO1, LDC, 1)
1622 movlps %xmm2, 0 * SIZE(CO1, LDC, 2)
1623 movhps %xmm2, 2 * SIZE(CO1, LDC, 2)
1624 movlps %xmm3, 0 * SIZE(CO1, %eax, 1)
1625 movhps %xmm3, 2 * SIZE(CO1, %eax, 1)
1632 #if defined(LT) || defined(RN)
1635 leal (,%eax, SIZE), %eax
1636 leal (AA, %eax, 4), AA
1654 sall $2 + BASE_SHIFT, %eax
1665 leal (, %eax, SIZE), %eax
1666 leal (B, %eax, 4), B
1669 #if defined(LT) || defined(RN)
1672 leal (,%eax, SIZE), %eax
1673 leal (B, %eax, 4), B
1702 sall $1 + BASE_SHIFT, %eax
1706 #if defined(LN) || defined(RT)
1709 sall $1 + BASE_SHIFT, %eax
1710 leal (B, %eax, 1), B
1711 leal (BB, %eax, 4), BB
1719 #if defined(LT) || defined(RN)
1730 movaps 0 * SIZE(B), %xmm3
1731 movaps 4 * SIZE(B), %xmm7
1733 pshufd $0x00, %xmm3, %xmm0
1734 pshufd $0x55, %xmm3, %xmm1
1735 pshufd $0xaa, %xmm3, %xmm2
1736 pshufd $0xff, %xmm3, %xmm3
1738 pshufd $0x00, %xmm7, %xmm4
1739 pshufd $0x55, %xmm7, %xmm5
1740 pshufd $0xaa, %xmm7, %xmm6
1741 pshufd $0xff, %xmm7, %xmm7
1743 movaps %xmm0, 0 * SIZE(BB)
1744 movaps %xmm1, 4 * SIZE(BB)
1745 movaps %xmm2, 8 * SIZE(BB)
1746 movaps %xmm3, 12 * SIZE(BB)
1747 movaps %xmm4, 16 * SIZE(BB)
1748 movaps %xmm5, 20 * SIZE(BB)
1749 movaps %xmm6, 24 * SIZE(BB)
1750 movaps %xmm7, 28 * SIZE(BB)
1753 addl $32 * SIZE, %ecx
1759 #if defined(LT) || defined(RN)
1774 movsd 0 * SIZE(B), %xmm3
1776 pshufd $0x00, %xmm3, %xmm0
1777 pshufd $0x55, %xmm3, %xmm1
1779 movaps %xmm0, 0 * SIZE(BB)
1780 movaps %xmm1, 4 * SIZE(BB)
1783 addl $8 * SIZE, %ecx
1789 #if defined(LT) || defined(RN)
1796 leal (, LDC, 2), %eax
1811 sall $BASE_SHIFT, %eax
1815 #if defined(LN) || defined(RT)
1818 leal (AA, %eax, SIZE), AA
1823 #if defined(LN) || defined(RT)
1825 sall $1 + BASE_SHIFT, %eax
1826 leal (BB, %eax, 4), BB
1834 movss 0 * SIZE(AA), %xmm0
1835 movss 4 * SIZE(AA), %xmm1
1836 movss 0 * SIZE(BB), %xmm2
1837 movss 16 * SIZE(BB), %xmm3
1839 #if defined(LT) || defined(RN)
1851 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
1852 prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
1854 mulss 4 * SIZE(BB), %xmm0
1856 movss 8 * SIZE(BB), %xmm2
1858 movss 1 * SIZE(AA), %xmm0
1860 mulss 12 * SIZE(BB), %xmm0
1862 movss 32 * SIZE(BB), %xmm2
1864 movss 2 * SIZE(AA), %xmm0
1866 mulss 20 * SIZE(BB), %xmm0
1868 movss 24 * SIZE(BB), %xmm3
1870 movss 3 * SIZE(AA), %xmm0
1872 mulss 28 * SIZE(BB), %xmm0
1874 movss 48 * SIZE(BB), %xmm3
1876 movss 8 * SIZE(AA), %xmm0
1878 mulss 36 * SIZE(BB), %xmm1
1880 movss 40 * SIZE(BB), %xmm2
1882 movss 5 * SIZE(AA), %xmm1
1884 mulss 44 * SIZE(BB), %xmm1
1886 movss 64 * SIZE(BB), %xmm2
1888 movss 6 * SIZE(AA), %xmm1
1890 mulss 52 * SIZE(BB), %xmm1
1892 movss 56 * SIZE(BB), %xmm3
1894 movss 7 * SIZE(AA), %xmm1
1896 mulss 60 * SIZE(BB), %xmm1
1898 movss 80 * SIZE(BB), %xmm3
1900 movss 12 * SIZE(AA), %xmm1
1909 #if defined(LT) || defined(RN)
1915 andl $7, %eax # eax &= 7 (low three bits, i.e. the value modulo 8)
1922 mulss 4 * SIZE(BB), %xmm0
1924 movss 8 * SIZE(BB), %xmm2
1926 movss 1 * SIZE(AA), %xmm0
1938 #if defined(LN) || defined(RT)
1950 sall $BASE_SHIFT, %eax
1951 leal (AA, %eax, 1), AA
1952 leal (B, %eax, 2), B
1953 leal (BB, %eax, 8), BB
1956 #if defined(LN) || defined(LT)
1957 unpcklps %xmm5, %xmm4
1962 movsd 0 * SIZE(B), %xmm1
1966 movss 0 * SIZE(AA), %xmm0
1967 movss 1 * SIZE(AA), %xmm1
1973 #if defined(LN) || defined(LT)
1974 movss 0 * SIZE(AA), %xmm4
1975 pshufd $0x00, %xmm4, %xmm6
1980 movaps 0 * SIZE(B), %xmm6
1981 pshufd $0x00, %xmm6, %xmm7
1983 pshufd $0x55, %xmm6, %xmm7
1987 pshufd $0xff, %xmm6, %xmm7
1992 movaps 0 * SIZE(B), %xmm6
1993 pshufd $0xff, %xmm6, %xmm7
1995 pshufd $0xaa, %xmm6, %xmm7
1999 pshufd $0x00, %xmm6, %xmm7
2003 #if defined(LN) || defined(LT)
2004 movlps %xmm1, 0 * SIZE(B)
2006 pshufd $0x00, %xmm1, %xmm0
2007 pshufd $0x55, %xmm1, %xmm2
2008 movaps %xmm0, 0 * SIZE(BB)
2009 movaps %xmm2, 4 * SIZE(BB)
2011 movss %xmm0, 0 * SIZE(AA)
2012 movss %xmm1, 1 * SIZE(AA)
2019 #if defined(LN) || defined(LT)
2020 pshufd $1, %xmm1, %xmm3
2022 movss %xmm1, 0 * SIZE(CO1)
2023 movss %xmm3, 0 * SIZE(CO1, LDC)
2025 movss %xmm0, 0 * SIZE(CO1)
2026 movss %xmm1, 0 * SIZE(CO1, LDC)
2033 #if defined(LT) || defined(RN)
2036 leal (AA, %eax, SIZE), AA
2054 sall $BASE_SHIFT, %eax
2066 sall $1 + BASE_SHIFT, %eax
2070 #if defined(LN) || defined(RT)
2073 leal (, %eax, SIZE), %eax
2074 leal (AA, %eax, 2), AA
2079 #if defined(LN) || defined(RT)
2081 sall $1 + BASE_SHIFT, %eax
2082 leal (BB, %eax, 4), BB
2093 movsd 0 * SIZE(AA), %xmm0
2097 movsd 8 * SIZE(AA), %xmm1
2098 movaps 0 * SIZE(BB), %xmm2
2099 movaps 16 * SIZE(BB), %xmm3
2101 #if defined(LT) || defined(RN)
2112 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
2113 prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
2118 movaps 4 * SIZE(BB), %xmm2
2120 movsd 2 * SIZE(AA), %xmm0
2122 movaps 8 * SIZE(BB), %xmm2
2126 movaps 12 * SIZE(BB), %xmm2
2128 movsd 4 * SIZE(AA), %xmm0
2130 movaps 32 * SIZE(BB), %xmm2
2134 movaps 20 * SIZE(BB), %xmm3
2136 movsd 6 * SIZE(AA), %xmm0
2138 movaps 24 * SIZE(BB), %xmm3
2142 movaps 28 * SIZE(BB), %xmm3
2144 movsd 16 * SIZE(AA), %xmm0
2146 movaps 48 * SIZE(BB), %xmm3
2150 movaps 36 * SIZE(BB), %xmm2
2152 movsd 10 * SIZE(AA), %xmm1
2154 movaps 40 * SIZE(BB), %xmm2
2158 movaps 44 * SIZE(BB), %xmm2
2160 movsd 12 * SIZE(AA), %xmm1
2162 movaps 64 * SIZE(BB), %xmm2
2166 movaps 52 * SIZE(BB), %xmm3
2168 movsd 14 * SIZE(AA), %xmm1
2170 movaps 56 * SIZE(BB), %xmm3
2174 movaps 60 * SIZE(BB), %xmm3
2176 movsd 24 * SIZE(AA), %xmm1
2178 movaps 80 * SIZE(BB), %xmm3
2187 #if defined(LT) || defined(RN)
2193 andl $7, %eax # eax &= 7 (low three bits, i.e. the value modulo 8)
2201 movaps 4 * SIZE(BB), %xmm2
2203 movsd 2 * SIZE(AA), %xmm0
2205 movaps 8 * SIZE(BB), %xmm2
2217 #if defined(LN) || defined(RT)
2229 sall $BASE_SHIFT, %eax
2230 leal (AA, %eax, 2), AA
2231 leal (B, %eax, 2), B
2232 leal (BB, %eax, 8), BB
2235 #if defined(LN) || defined(LT)
2236 unpcklps %xmm6, %xmm4
2237 unpcklps %xmm7, %xmm5
2240 unpcklps %xmm5, %xmm4
2241 unpckhps %xmm5, %xmm6
2246 movsd 0 * SIZE(B), %xmm1
2250 movsd 2 * SIZE(B), %xmm3
2258 movsd 0 * SIZE(AA), %xmm0
2262 movsd 2 * SIZE(AA), %xmm1
2269 movaps 0 * SIZE(AA), %xmm4
2270 pshufd $0xff, %xmm4, %xmm6
2272 pshufd $0xaa, %xmm4, %xmm6
2276 pshufd $0x00, %xmm4, %xmm6
2281 movaps 0 * SIZE(AA), %xmm4
2282 pshufd $0x00, %xmm4, %xmm6
2284 pshufd $0x55, %xmm4, %xmm6
2288 pshufd $0xff, %xmm4, %xmm6
2293 movaps 0 * SIZE(B), %xmm6
2294 pshufd $0x00, %xmm6, %xmm7
2296 pshufd $0x55, %xmm6, %xmm7
2300 pshufd $0xff, %xmm6, %xmm7
2305 movaps 0 * SIZE(B), %xmm6
2306 pshufd $0xff, %xmm6, %xmm7
2308 pshufd $0xaa, %xmm6, %xmm7
2312 pshufd $0x00, %xmm6, %xmm7
2316 #if defined(LN) || defined(LT)
2317 movlps %xmm1, 0 * SIZE(B)
2318 movlps %xmm3, 2 * SIZE(B)
2320 pshufd $0x00, %xmm1, %xmm0
2321 pshufd $0x55, %xmm1, %xmm2
2322 movaps %xmm0, 0 * SIZE(BB)
2323 movaps %xmm2, 4 * SIZE(BB)
2325 pshufd $0x00, %xmm3, %xmm0
2326 pshufd $0x55, %xmm3, %xmm2
2327 movaps %xmm0, 8 * SIZE(BB)
2328 movaps %xmm2, 12 * SIZE(BB)
2330 movlps %xmm0, 0 * SIZE(AA)
2331 movlps %xmm1, 2 * SIZE(AA)
2338 #if defined(LN) || defined(LT)
2339 unpcklps %xmm3, %xmm1
2341 movlps %xmm1, 0 * SIZE(CO1)
2342 movhps %xmm1, 0 * SIZE(CO1, LDC)
2344 movlps %xmm0, 0 * SIZE(CO1)
2345 movlps %xmm1, 0 * SIZE(CO1, LDC)
2352 #if defined(LT) || defined(RN)
2355 leal (,%eax, SIZE), %eax
2356 leal (AA, %eax, 2), AA
2374 sall $1 + BASE_SHIFT, %eax
2381 sarl $2, %ebx # i = (m >> 2)
2388 sall $2 + BASE_SHIFT, %eax
2392 #if defined(LN) || defined(RT)
2395 leal (, %eax, SIZE), %eax
2396 leal (AA, %eax, 4), AA
2401 #if defined(LN) || defined(RT)
2403 sall $1 + BASE_SHIFT, %eax
2404 leal (BB, %eax, 4), BB
2412 movaps 0 * SIZE(AA), %xmm0
2413 movaps 16 * SIZE(AA), %xmm1
2414 movaps 0 * SIZE(BB), %xmm2
2415 movaps 16 * SIZE(BB), %xmm3
2417 PREFETCHW -4 * SIZE(CO1)
2418 PREFETCHW -4 * SIZE(CO1, LDC)
2420 #if defined(LT) || defined(RN)
2432 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
2433 prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
2435 mulps 4 * SIZE(BB), %xmm0
2437 movaps 8 * SIZE(BB), %xmm2
2439 movaps 4 * SIZE(AA), %xmm0
2442 mulps 12 * SIZE(BB), %xmm0
2444 movaps 32 * SIZE(BB), %xmm2
2446 movaps 8 * SIZE(AA), %xmm0
2449 mulps 20 * SIZE(BB), %xmm0
2451 movaps 24 * SIZE(BB), %xmm3
2453 movaps 12 * SIZE(AA), %xmm0
2456 mulps 28 * SIZE(BB), %xmm0
2458 movaps 48 * SIZE(BB), %xmm3
2460 movaps 32 * SIZE(AA), %xmm0
2462 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
2463 prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
2466 mulps 36 * SIZE(BB), %xmm1
2468 movaps 40 * SIZE(BB), %xmm2
2470 movaps 20 * SIZE(AA), %xmm1
2473 mulps 44 * SIZE(BB), %xmm1
2475 movaps 64 * SIZE(BB), %xmm2
2477 movaps 24 * SIZE(AA), %xmm1
2480 mulps 52 * SIZE(BB), %xmm1
2482 movaps 56 * SIZE(BB), %xmm3
2484 movaps 28 * SIZE(AA), %xmm1
2487 mulps 60 * SIZE(BB), %xmm1
2489 movaps 80 * SIZE(BB), %xmm3
2491 movaps 48 * SIZE(AA), %xmm1
2500 #if defined(LT) || defined(RN)
2506 andl $7, %eax # eax &= 7 (low three bits, i.e. the value modulo 8)
2513 mulps 4 * SIZE(BB), %xmm0
2515 movaps 8 * SIZE(BB), %xmm2
2517 movaps 4 * SIZE(AA), %xmm0
2526 #if defined(LN) || defined(RT)
2538 sall $1 + BASE_SHIFT, %eax
2539 leal (AA, %eax, 2), AA
2540 leal (B, %eax, 1), B
2541 leal (BB, %eax, 4), BB
2544 #if defined(LN) || defined(LT)
2546 unpcklps %xmm6, %xmm4
2547 unpckhps %xmm6, %xmm0
2550 unpcklps %xmm7, %xmm5
2551 unpckhps %xmm7, %xmm1
2554 unpcklps %xmm5, %xmm4
2555 unpckhps %xmm5, %xmm6
2558 unpcklps %xmm1, %xmm0
2559 unpckhps %xmm1, %xmm2
2564 movsd 0 * SIZE(B), %xmm1
2568 movsd 2 * SIZE(B), %xmm3
2572 movsd 4 * SIZE(B), %xmm5
2576 movsd 6 * SIZE(B), %xmm7
2583 movaps 0 * SIZE(AA), %xmm0
2584 movaps 4 * SIZE(AA), %xmm1
2591 movaps 12 * SIZE(AA), %xmm4
2592 pshufd $0xff, %xmm4, %xmm6
2594 pshufd $0xaa, %xmm4, %xmm6
2597 pshufd $0x55, %xmm4, %xmm6
2600 pshufd $0x00, %xmm4, %xmm6
2604 movaps 8 * SIZE(AA), %xmm4
2605 pshufd $0xaa, %xmm4, %xmm6
2607 pshufd $0x55, %xmm4, %xmm6
2610 pshufd $0x00, %xmm4, %xmm6
2614 movaps 4 * SIZE(AA), %xmm4
2615 pshufd $0x55, %xmm4, %xmm6
2617 pshufd $0x00, %xmm4, %xmm6
2621 movaps 0 * SIZE(AA), %xmm4
2622 pshufd $0x00, %xmm4, %xmm6
2627 movaps 0 * SIZE(AA), %xmm4
2628 pshufd $0x00, %xmm4, %xmm6
2631 pshufd $0x55, %xmm4, %xmm6
2634 pshufd $0xaa, %xmm4, %xmm6
2637 pshufd $0xff, %xmm4, %xmm6
2641 movaps 4 * SIZE(AA), %xmm4
2642 pshufd $0x55, %xmm4, %xmm6
2644 pshufd $0xaa, %xmm4, %xmm6
2647 pshufd $0xff, %xmm4, %xmm6
2651 movaps 8 * SIZE(AA), %xmm4
2652 pshufd $0xaa, %xmm4, %xmm6
2654 pshufd $0xff, %xmm4, %xmm6
2658 movaps 12 * SIZE(AA), %xmm4
2659 pshufd $0xff, %xmm4, %xmm6
2664 movaps 0 * SIZE(B), %xmm6
2665 pshufd $0x00, %xmm6, %xmm7
2667 pshufd $0x55, %xmm6, %xmm7
2671 pshufd $0xff, %xmm6, %xmm7
2676 movaps 0 * SIZE(B), %xmm6
2677 pshufd $0xff, %xmm6, %xmm7
2679 pshufd $0xaa, %xmm6, %xmm7
2683 pshufd $0x00, %xmm6, %xmm7
2687 #if defined(LN) || defined(LT)
2688 movlps %xmm1, 0 * SIZE(B)
2689 movlps %xmm3, 2 * SIZE(B)
2690 movlps %xmm5, 4 * SIZE(B)
2691 movlps %xmm7, 6 * SIZE(B)
2693 pshufd $0x00, %xmm1, %xmm0
2694 pshufd $0x55, %xmm1, %xmm2
2695 movaps %xmm0, 0 * SIZE(BB)
2696 movaps %xmm2, 4 * SIZE(BB)
2698 pshufd $0x00, %xmm3, %xmm0
2699 pshufd $0x55, %xmm3, %xmm2
2700 movaps %xmm0, 8 * SIZE(BB)
2701 movaps %xmm2, 12 * SIZE(BB)
2703 pshufd $0x00, %xmm5, %xmm0
2704 pshufd $0x55, %xmm5, %xmm2
2705 movaps %xmm0, 16 * SIZE(BB)
2706 movaps %xmm2, 20 * SIZE(BB)
2708 pshufd $0x00, %xmm7, %xmm0
2709 pshufd $0x55, %xmm7, %xmm2
2710 movaps %xmm0, 24 * SIZE(BB)
2711 movaps %xmm2, 28 * SIZE(BB)
2713 movaps %xmm0, 0 * SIZE(AA)
2714 movaps %xmm1, 4 * SIZE(AA)
2721 #if defined(LN) || defined(LT)
2722 unpcklps %xmm5, %xmm1
2723 unpcklps %xmm7, %xmm3
2726 unpcklps %xmm3, %xmm1
2727 unpckhps %xmm3, %xmm2
2729 movlps %xmm1, 0 * SIZE(CO1)
2730 movhps %xmm1, 2 * SIZE(CO1)
2731 movlps %xmm2, 0 * SIZE(CO1, LDC, 1)
2732 movhps %xmm2, 2 * SIZE(CO1, LDC, 1)
2734 movlps %xmm0, 0 * SIZE(CO1)
2735 movhps %xmm0, 2 * SIZE(CO1)
2736 movlps %xmm1, 0 * SIZE(CO1, LDC, 1)
2737 movhps %xmm1, 2 * SIZE(CO1, LDC, 1)
2744 #if defined(LT) || defined(RN)
2747 leal (,%eax, SIZE), %eax
2748 leal (AA, %eax, 4), AA
2766 sall $2 + BASE_SHIFT, %eax
2777 leal (, %eax, SIZE), %eax
2778 leal (B, %eax, 2), B
2781 #if defined(LT) || defined(RN)
2784 leal (,%eax, SIZE), %eax
2785 leal (B, %eax, 2), B
2811 sall $BASE_SHIFT, %eax
2815 #if defined(LN) || defined(RT)
2818 sall $BASE_SHIFT, %eax
2819 leal (B, %eax, 1), B
2820 leal (BB, %eax, 4), BB
2828 #if defined(LT) || defined(RN)
2839 movsd 0 * SIZE(B), %xmm3
2840 movhps 2 * SIZE(B), %xmm3
2841 movsd 4 * SIZE(B), %xmm7
2842 movhps 6 * SIZE(B), %xmm7
2844 pshufd $0x00, %xmm3, %xmm0
2845 pshufd $0x55, %xmm3, %xmm1
2846 pshufd $0xaa, %xmm3, %xmm2
2847 pshufd $0xff, %xmm3, %xmm3
2849 pshufd $0x00, %xmm7, %xmm4
2850 pshufd $0x55, %xmm7, %xmm5
2851 pshufd $0xaa, %xmm7, %xmm6
2852 pshufd $0xff, %xmm7, %xmm7
2854 movaps %xmm0, 0 * SIZE(BB)
2855 movaps %xmm1, 4 * SIZE(BB)
2856 movaps %xmm2, 8 * SIZE(BB)
2857 movaps %xmm3, 12 * SIZE(BB)
2858 movaps %xmm4, 16 * SIZE(BB)
2859 movaps %xmm5, 20 * SIZE(BB)
2860 movaps %xmm6, 24 * SIZE(BB)
2861 movaps %xmm7, 28 * SIZE(BB)
2870 #if defined(LT) || defined(RN)
2882 movss 0 * SIZE(B), %xmm3
2884 pshufd $0x00, %xmm3, %xmm0
2886 movaps %xmm0, 0 * SIZE(BB)
2895 #if defined(LT) || defined(RN)
2915 sall $BASE_SHIFT, %eax
2919 #if defined(LN) || defined(RT)
2922 leal (AA, %eax, SIZE), AA
2927 #if defined(LN) || defined(RT)
2929 sall $BASE_SHIFT, %eax
2930 leal (BB, %eax, 4), BB
2938 movss 0 * SIZE(AA), %xmm0
2939 movss 4 * SIZE(AA), %xmm1
2940 movss 0 * SIZE(BB), %xmm2
2941 movss 16 * SIZE(BB), %xmm3
2943 #if defined(LT) || defined(RN)
2955 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
2956 prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
2958 movss 1 * SIZE(AA), %xmm0
2960 movss 32 * SIZE(BB), %xmm2
2961 mulss 4 * SIZE(BB), %xmm0
2963 movss 2 * SIZE(AA), %xmm0
2964 mulss 8 * SIZE(BB), %xmm0
2966 movss 3 * SIZE(AA), %xmm0
2967 mulss 12 * SIZE(BB), %xmm0
2969 movss 8 * SIZE(AA), %xmm0
2971 movss 5 * SIZE(AA), %xmm1
2973 movss 48 * SIZE(BB), %xmm3
2974 mulss 20 * SIZE(BB), %xmm1
2976 movss 6 * SIZE(AA), %xmm1
2977 mulss 24 * SIZE(BB), %xmm1
2979 movss 7 * SIZE(AA), %xmm1
2980 mulss 28 * SIZE(BB), %xmm1
2982 movss 12 * SIZE(AA), %xmm1
2991 #if defined(LT) || defined(RN)
2997 andl $7, %eax # eax &= 7, i.e. (k & 7) rather than (k & 1); presumably the leftover count after the unrolled loop (TODO confirm)
3004 movss 1 * SIZE(AA), %xmm0
3006 movss 4 * SIZE(BB), %xmm2
3019 #if defined(LN) || defined(RT)
3027 sall $ BASE_SHIFT, %eax
3028 leal (AA, %eax, 1), AA
3029 leal (B, %eax, 1), B
3030 leal (BB, %eax, 4), BB
3033 #if defined(LN) || defined(LT)
3034 movss 0 * SIZE(B), %xmm1
3037 movss 0 * SIZE(AA), %xmm0
3041 #if defined(LN) || defined(LT)
3042 mulss 0 * SIZE(AA), %xmm1
3045 #if defined(RN) || defined(RT)
3046 mulss 0 * SIZE(B), %xmm0
3049 #if defined(LN) || defined(LT)
3050 movss %xmm1, 0 * SIZE(B)
3052 pshufd $0x00, %xmm1, %xmm0
3053 movaps %xmm0, 0 * SIZE(BB)
3055 movss %xmm0, 0 * SIZE(AA)
3062 #if defined(LN) || defined(LT)
3063 movss %xmm1, 0 * SIZE(CO1)
3065 movss %xmm0, 0 * SIZE(CO1)
3072 #if defined(LT) || defined(RN)
3075 leal (AA, %eax, SIZE), AA
3093 sall $BASE_SHIFT, %eax
3104 sall $1 + BASE_SHIFT, %eax
3108 #if defined(LN) || defined(RT)
3111 leal (, %eax, SIZE), %eax
3112 leal (AA, %eax, 2), AA
3117 #if defined(LN) || defined(RT)
3119 sall $BASE_SHIFT, %eax
3120 leal (BB, %eax, 4), BB
3131 movsd 0 * SIZE(AA), %xmm0
3135 movsd 8 * SIZE(AA), %xmm1
3136 movaps 0 * SIZE(BB), %xmm2
3137 movaps 16 * SIZE(BB), %xmm3
3139 #if defined(LT) || defined(RN)
3151 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
3152 prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
3154 movsd 2 * SIZE(AA), %xmm0
3156 movaps 4 * SIZE(BB), %xmm2
3158 movsd 4 * SIZE(AA), %xmm0
3160 movaps 8 * SIZE(BB), %xmm2
3162 movsd 6 * SIZE(AA), %xmm0
3164 movaps 12 * SIZE(BB), %xmm2
3166 movsd 16 * SIZE(AA), %xmm0
3168 movaps 32 * SIZE(BB), %xmm2
3170 movsd 10 * SIZE(AA), %xmm1
3172 movaps 20 * SIZE(BB), %xmm3
3174 movsd 12 * SIZE(AA), %xmm1
3176 movaps 24 * SIZE(BB), %xmm3
3178 movsd 14 * SIZE(AA), %xmm1
3180 movaps 28 * SIZE(BB), %xmm3
3182 movsd 24 * SIZE(AA), %xmm1
3184 movaps 48 * SIZE(BB), %xmm3
3193 #if defined(LT) || defined(RN)
3199 andl $7, %eax # eax &= 7, i.e. (k & 7) rather than (k & 1); presumably the leftover count after the unrolled loop (TODO confirm)
3207 movsd 2 * SIZE(AA), %xmm0
3208 movaps 4 * SIZE(BB), %xmm2
3221 #if defined(LN) || defined(RT)
3233 sall $ BASE_SHIFT, %eax
3234 leal (AA, %eax, 2), AA
3235 leal (B, %eax, 1), B
3236 leal (BB, %eax, 4), BB
3239 #if defined(LN) || defined(LT)
3240 pshufd $1, %xmm4, %xmm6
3242 movss 0 * SIZE(B), %xmm1
3243 movss 1 * SIZE(B), %xmm3
3251 movsd 0 * SIZE(AA), %xmm0
3257 movaps 0 * SIZE(AA), %xmm4
3258 pshufd $0xff, %xmm4, %xmm6
3260 pshufd $0xaa, %xmm4, %xmm6
3264 pshufd $0x00, %xmm4, %xmm6
3269 movaps 0 * SIZE(AA), %xmm4
3270 pshufd $0x00, %xmm4, %xmm6
3272 pshufd $0x55, %xmm4, %xmm6
3276 pshufd $0xff, %xmm4, %xmm6
3280 #if defined(RN) || defined(RT)
3281 movss 0 * SIZE(B), %xmm6
3282 pshufd $0x00, %xmm6, %xmm7
3286 #if defined(LN) || defined(LT)
3287 movss %xmm1, 0 * SIZE(B)
3288 movss %xmm3, 1 * SIZE(B)
3290 pshufd $0x00, %xmm1, %xmm0
3291 movaps %xmm0, 0 * SIZE(BB)
3292 pshufd $0x00, %xmm3, %xmm0
3293 movaps %xmm0, 4 * SIZE(BB)
3295 movlps %xmm0, 0 * SIZE(AA)
3302 #if defined(LN) || defined(LT)
3303 movss %xmm1, 0 * SIZE(CO1)
3304 movss %xmm3, 1 * SIZE(CO1)
3306 movlps %xmm0, 0 * SIZE(CO1)
3313 #if defined(LT) || defined(RN)
3316 leal (,%eax, SIZE), %eax
3317 leal (AA, %eax, 2), AA
3335 sall $1 + BASE_SHIFT, %eax
3342 sarl $2, %ebx # i = (m >> 2)
3349 sall $2 + BASE_SHIFT, %eax
3353 #if defined(LN) || defined(RT)
3356 leal (, %eax, SIZE), %eax
3357 leal (AA, %eax, 4), AA
3362 #if defined(LN) || defined(RT)
3364 sall $BASE_SHIFT, %eax
3365 leal (BB, %eax, 4), BB
3373 movaps 0 * SIZE(AA), %xmm0
3374 movaps 16 * SIZE(AA), %xmm1
3375 movaps 0 * SIZE(BB), %xmm2
3376 movaps 16 * SIZE(BB), %xmm3
3378 PREFETCHW -4 * SIZE(CO1)
3380 #if defined(LT) || defined(RN)
3392 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
3393 prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
3395 movaps 4 * SIZE(AA), %xmm0
3397 movaps 32 * SIZE(BB), %xmm2
3398 mulps 4 * SIZE(BB), %xmm0
3400 movaps 8 * SIZE(AA), %xmm0
3401 mulps 8 * SIZE(BB), %xmm0
3403 movaps 12 * SIZE(AA), %xmm0
3404 mulps 12 * SIZE(BB), %xmm0
3406 movaps 32 * SIZE(AA), %xmm0
3407 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
3408 prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
3411 movaps 20 * SIZE(AA), %xmm1
3413 movaps 48 * SIZE(BB), %xmm3
3414 mulps 20 * SIZE(BB), %xmm1
3416 movaps 24 * SIZE(AA), %xmm1
3417 mulps 24 * SIZE(BB), %xmm1
3419 movaps 28 * SIZE(AA), %xmm1
3420 mulps 28 * SIZE(BB), %xmm1
3422 movaps 48 * SIZE(AA), %xmm1
3431 #if defined(LT) || defined(RN)
3437 andl $7, %eax # eax &= 7, i.e. (k & 7) rather than (k & 1); presumably the leftover count after the unrolled loop (TODO confirm)
3445 movaps 4 * SIZE(AA), %xmm0
3446 movaps 4 * SIZE(BB), %xmm2
3459 #if defined(LN) || defined(RT)
3471 sall $ BASE_SHIFT, %eax
3472 leal (AA, %eax, 4), AA
3473 leal (B, %eax, 1), B
3474 leal (BB, %eax, 4), BB
3477 #if defined(LN) || defined(LT)
3479 unpcklps %xmm6, %xmm4
3480 unpckhps %xmm6, %xmm0
3483 unpcklps %xmm7, %xmm5
3484 unpckhps %xmm7, %xmm1
3487 unpcklps %xmm5, %xmm4
3488 unpckhps %xmm5, %xmm6
3491 unpcklps %xmm1, %xmm0
3492 unpckhps %xmm1, %xmm2
3494 movss 0 * SIZE(B), %xmm1
3495 movss 1 * SIZE(B), %xmm3
3496 movss 2 * SIZE(B), %xmm5
3497 movss 3 * SIZE(B), %xmm7
3504 movaps 0 * SIZE(AA), %xmm0
3510 movaps 12 * SIZE(AA), %xmm4
3511 pshufd $0xff, %xmm4, %xmm6
3513 pshufd $0xaa, %xmm4, %xmm6
3516 pshufd $0x55, %xmm4, %xmm6
3519 pshufd $0x00, %xmm4, %xmm6
3523 movaps 8 * SIZE(AA), %xmm4
3524 pshufd $0xaa, %xmm4, %xmm6
3526 pshufd $0x55, %xmm4, %xmm6
3529 pshufd $0x00, %xmm4, %xmm6
3533 movaps 4 * SIZE(AA), %xmm4
3534 pshufd $0x55, %xmm4, %xmm6
3536 pshufd $0x00, %xmm4, %xmm6
3540 movaps 0 * SIZE(AA), %xmm4
3541 pshufd $0x00, %xmm4, %xmm6
3546 movaps 0 * SIZE(AA), %xmm4
3547 pshufd $0x00, %xmm4, %xmm6
3550 pshufd $0x55, %xmm4, %xmm6
3553 pshufd $0xaa, %xmm4, %xmm6
3556 pshufd $0xff, %xmm4, %xmm6
3560 movaps 4 * SIZE(AA), %xmm4
3561 pshufd $0x55, %xmm4, %xmm6
3563 pshufd $0xaa, %xmm4, %xmm6
3566 pshufd $0xff, %xmm4, %xmm6
3570 movaps 8 * SIZE(AA), %xmm4
3571 pshufd $0xaa, %xmm4, %xmm6
3573 pshufd $0xff, %xmm4, %xmm6
3577 movaps 12 * SIZE(AA), %xmm4
3578 pshufd $0xff, %xmm4, %xmm6
3582 #if defined(RN) || defined(RT)
3583 movss 0 * SIZE(B), %xmm6
3584 pshufd $0x00, %xmm6, %xmm7
3588 #if defined(LN) || defined(LT)
3589 movss %xmm1, 0 * SIZE(B)
3590 movss %xmm3, 1 * SIZE(B)
3591 movss %xmm5, 2 * SIZE(B)
3592 movss %xmm7, 3 * SIZE(B)
3594 pshufd $0x00, %xmm1, %xmm0
3595 movaps %xmm0, 0 * SIZE(BB)
3596 pshufd $0x00, %xmm3, %xmm0
3597 movaps %xmm0, 4 * SIZE(BB)
3599 pshufd $0x00, %xmm5, %xmm0
3600 movaps %xmm0, 8 * SIZE(BB)
3601 pshufd $0x00, %xmm7, %xmm0
3602 movaps %xmm0, 12 * SIZE(BB)
3604 movss %xmm0, 0 * SIZE(AA)
3605 movss %xmm1, 1 * SIZE(AA)
3606 movss %xmm2, 2 * SIZE(AA)
3607 movss %xmm3, 3 * SIZE(AA)
3614 #if defined(LN) || defined(LT)
3615 unpcklps %xmm5, %xmm1
3616 unpcklps %xmm7, %xmm3
3618 unpcklps %xmm3, %xmm1
3620 movlps %xmm1, 0 * SIZE(CO1)
3621 movhps %xmm1, 2 * SIZE(CO1)
3623 movlps %xmm0, 0 * SIZE(CO1)
3624 movhps %xmm0, 2 * SIZE(CO1)
3631 #if defined(LT) || defined(RN)
3634 leal (,%eax, SIZE), %eax
3635 leal (AA, %eax, 4), AA
3653 sall $2 + BASE_SHIFT, %eax
3664 leal (B, %eax, SIZE), B
3667 #if defined(LT) || defined(RN)
3670 leal (B, %eax, SIZE), B
3684 movl OLD_STACK, %esp