1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin. */
3 /* All rights reserved. */
5 /* Redistribution and use in source and binary forms, with or */
6 /* without modification, are permitted provided that the following */
7 /* conditions are met: */
9 /* 1. Redistributions of source code must retain the above */
10 /* copyright notice, this list of conditions and the following */
13 /* 2. Redistributions in binary form must reproduce the above */
14 /* copyright notice, this list of conditions and the following */
15 /* disclaimer in the documentation and/or other materials */
16 /* provided with the distribution. */
18 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31 /* POSSIBILITY OF SUCH DAMAGE. */
33 /* The views and conclusions contained in the software and */
34 /* documentation are those of the authors and should not be */
35 /* interpreted as representing official policies, either expressed */
36 /* or implied, of The University of Texas at Austin. */
37 /*********************************************************************/
/* Incoming C-function arguments, addressed with STACK() relative to   */
/* %esi (the preserved caller stack pointer): m, n, k, a, b, c, ldc,   */
/* and the TRSM offset.  NOTE(review): the 4-byte gap at offset 16     */
/* (between OLD_K and OLD_A) suggests an unused slot, likely alpha --  */
/* confirm against the caller's argument layout.                       */
44 #define OLD_M 4 + STACK(%esi)
45 #define OLD_N 8 + STACK(%esi)
46 #define OLD_K 12 + STACK(%esi)
47 #define OLD_A 20 + STACK(%esi)
48 #define OLD_B 24 + STACK(%esi)
49 #define OLD_C 28 + STACK(%esi)
50 #define OLD_LDC 32 + STACK(%esi)
51 #define STACK_OFFT 36 + STACK(%esi)
/* Local scratch slots addressed off %esp after the frame is rebuilt:  */
/* saved original stack pointer, TRSM offset, saved A/B panel origins, */
/* and the base of the buffer holding the broadcast-expanded B panel.  */
59 #define OLD_STACK 40(%esp)
60 #define OFFSET 44(%esp)
63 #define AORIG 56(%esp)
64 #define BORIG 60(%esp)
65 #define BUFFER 128(%esp)
/* Per-microarchitecture software-prefetch selection: PREFETCH and     */
/* PREFETCHW choose the read/write prefetch mnemonic, PREFETCHSIZE     */
/* the lookahead distance in elements.  NOTE(review): the matching     */
/* #endif lines for these conditionals are not visible in this         */
/* excerpt.                                                            */
67 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
68 #define PREFETCH prefetch
69 #define PREFETCHW prefetchw
70 #define PREFETCHSIZE (16 * 10 + 8)
73 #if defined(PENTIUM4) || defined(PENTIUMM)
74 #define PREFETCH prefetcht0
75 #define PREFETCHW prefetcht0
76 #define PREFETCHSIZE 96
79 #if defined(PENRYN) || defined(DUNNINGTON)
80 #define PREFETCH prefetcht0
81 #define PREFETCHW prefetcht0
82 #define PREFETCHSIZE 96
91 #if defined(OPTERON) || !defined(HAVE_SSE2)
/* KERNEL1: first (even) step of the 8x-unrolled inner loop.  xmm0     */
/* holds 4 A values; four pre-broadcast B vectors are multiplied in    */
/* and accumulated into xmm4..xmm7.  xmm2 is reloaded 32*SIZE ahead    */
/* for a later step, xmm0 advances to the next 4 A elements, and this  */
/* step also issues the software prefetch for the A stream.            */
99 #define KERNEL1(address) \
100 mulps %xmm0, %xmm2; \
101 addps %xmm2, %xmm4; \
102 movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
103 mulps %xmm0, %xmm2; \
104 PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \
105 addps %xmm2, %xmm5; \
106 movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
107 mulps %xmm0, %xmm2; \
108 mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
109 addps %xmm2, %xmm6; \
110 movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
111 addps %xmm0, %xmm7; \
112 movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0
/* KERNEL2: odd step -- same multiply/accumulate into xmm4..xmm7 but   */
/* with xmm3 as the B operand (offsets 16..28); reloads xmm3 from      */
/* 48*SIZE ahead and xmm0 from the next 4 A elements.                  */
114 #define KERNEL2(address) \
115 mulps %xmm0, %xmm3; \
116 addps %xmm3, %xmm4; \
117 movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
118 mulps %xmm0, %xmm3; \
119 addps %xmm3, %xmm5; \
120 movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
121 mulps %xmm0, %xmm3; \
122 mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
123 addps %xmm3, %xmm6; \
124 movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
125 addps %xmm0, %xmm7; \
126 movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0
/* KERNEL3: even step using xmm2 against B offsets 36..44; reloads     */
/* xmm2 from 64*SIZE ahead and xmm0 from the next 4 A elements.        */
128 #define KERNEL3(address) \
129 mulps %xmm0, %xmm2; \
130 addps %xmm2, %xmm4; \
131 movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
132 mulps %xmm0, %xmm2; \
133 addps %xmm2, %xmm5; \
134 movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
135 mulps %xmm0, %xmm2; \
136 mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
137 addps %xmm2, %xmm6; \
138 movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
139 addps %xmm0, %xmm7; \
140 movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0
/* KERNEL4: odd step using xmm3 against B offsets 52..60; reloads      */
/* xmm3 from 80*SIZE ahead.  xmm0 jumps to 32*SIZE, i.e. the A data    */
/* for the *next* unrolled iteration (KERNEL5..8 consume xmm1).        */
142 #define KERNEL4(address) \
143 mulps %xmm0, %xmm3; \
144 addps %xmm3, %xmm4; \
145 movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
146 mulps %xmm0, %xmm3; \
147 addps %xmm3, %xmm5; \
148 movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
149 mulps %xmm0, %xmm3; \
150 mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
151 addps %xmm3, %xmm6; \
152 movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
153 addps %xmm0, %xmm7; \
154 movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0
/* KERNEL5: second half of the unroll -- xmm1 now carries the A        */
/* values.  Multiplies B offsets 64..76 (xmm2) into xmm4..xmm7,        */
/* reloads xmm2 from 96*SIZE and xmm1 from the next 4 A elements.      */
156 #define KERNEL5(address) \
157 mulps %xmm1, %xmm2; \
158 addps %xmm2, %xmm4; \
159 movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
160 mulps %xmm1, %xmm2; \
161 addps %xmm2, %xmm5; \
162 movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
163 mulps %xmm1, %xmm2; \
164 mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
165 addps %xmm2, %xmm6; \
166 movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
167 addps %xmm1, %xmm7; \
168 movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1
/* KERNEL6: odd step of the second half -- xmm1 (A) against B offsets  */
/* 80..92 (xmm3); reloads xmm3 from 112*SIZE and xmm1 from the next    */
/* 4 A elements.                                                       */
170 #define KERNEL6(address) \
171 mulps %xmm1, %xmm3; \
172 addps %xmm3, %xmm4; \
173 movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
174 mulps %xmm1, %xmm3; \
175 addps %xmm3, %xmm5; \
176 movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
177 mulps %xmm1, %xmm3; \
178 mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
179 addps %xmm3, %xmm6; \
180 movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
181 addps %xmm1, %xmm7; \
182 movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1
/* KERNEL7: even step -- xmm1 (A) against B offsets 96..108 (xmm2);    */
/* reloads xmm2 from 128*SIZE (start of the next BB block) and xmm1    */
/* from the next 4 A elements.                                         */
184 #define KERNEL7(address) \
185 mulps %xmm1, %xmm2; \
186 addps %xmm2, %xmm4; \
187 movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
188 mulps %xmm1, %xmm2; \
189 addps %xmm2, %xmm5; \
190 movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
191 mulps %xmm1, %xmm2; \
192 mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
193 addps %xmm2, %xmm6; \
194 movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
195 addps %xmm1, %xmm7; \
196 movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1
/* KERNEL8: final step of the unroll -- xmm1 (A) against B offsets     */
/* 112..124 (xmm3); reloads xmm3 from 144*SIZE and xmm1 from 48*SIZE,  */
/* priming the registers for the next 8-step iteration.                */
198 #define KERNEL8(address) \
199 mulps %xmm1, %xmm3; \
200 addps %xmm3, %xmm4; \
201 movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
202 mulps %xmm1, %xmm3; \
203 addps %xmm3, %xmm5; \
204 movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
205 mulps %xmm1, %xmm3; \
206 mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
207 addps %xmm3, %xmm6; \
208 movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
209 addps %xmm1, %xmm7; \
210 movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1;
223 subl $128 + LOCAL_BUFFER_SIZE, %esp
238 movss STACK_OFFT, %xmm4
249 leal (, LDC, SIZE), LDC
253 leal (, %eax, SIZE), %eax
261 leal (, %eax, SIZE), %eax
295 sall $2 + BASE_SHIFT, %eax
299 #if defined(LN) || defined(RT)
302 sall $2 + BASE_SHIFT, %eax
304 leal (BB, %eax, 4), BB
312 #if defined(LT) || defined(RN)
/* Expand eight B elements into the BB buffer: each scalar is fanned   */
/* out across a full 4-lane vector with pshufd and stored, so the      */
/* KERNEL macros can consume B through aligned 16-byte movaps loads.   */
323 movaps 0 * SIZE(B), %xmm3
324 movaps 4 * SIZE(B), %xmm7
326 pshufd $0x00, %xmm3, %xmm0
327 pshufd $0x55, %xmm3, %xmm1
328 pshufd $0xaa, %xmm3, %xmm2
329 pshufd $0xff, %xmm3, %xmm3
331 pshufd $0x00, %xmm7, %xmm4
332 pshufd $0x55, %xmm7, %xmm5
333 pshufd $0xaa, %xmm7, %xmm6
334 pshufd $0xff, %xmm7, %xmm7
336 movaps %xmm0, 0 * SIZE(BB)
337 movaps %xmm1, 4 * SIZE(BB)
338 movaps %xmm2, 8 * SIZE(BB)
339 movaps %xmm3, 12 * SIZE(BB)
340 movaps %xmm4, 16 * SIZE(BB)
341 movaps %xmm5, 20 * SIZE(BB)
342 movaps %xmm6, 24 * SIZE(BB)
343 movaps %xmm7, 28 * SIZE(BB)
/* advance the write pointer past the 32 floats just stored            */
/* (NOTE(review): %ecx appears to be the BB cursor here -- confirm)    */
346 addl $32 * SIZE, %ecx
352 #if defined(LT) || defined(RN)
/* Same broadcast expansion for a 4-element tail of B: one vector      */
/* load, four pshufd broadcasts, four aligned stores into BB.          */
362 movaps 0 * SIZE(B), %xmm3
364 pshufd $0x00, %xmm3, %xmm0
365 pshufd $0x55, %xmm3, %xmm1
366 pshufd $0xaa, %xmm3, %xmm2
367 pshufd $0xff, %xmm3, %xmm3
369 movaps %xmm0, 0 * SIZE(BB)
370 movaps %xmm1, 4 * SIZE(BB)
371 movaps %xmm2, 8 * SIZE(BB)
372 movaps %xmm3, 12 * SIZE(BB)
378 #if defined(LT) || defined(RN)
385 leal (, LDC, 4), %eax
396 sarl $2, %ebx # i = (m >> 2)
403 sall $2 + BASE_SHIFT, %eax
407 #if defined(LN) || defined(RT)
410 leal (, %eax, SIZE), %eax
411 leal (AA, %eax, 4), AA
416 #if defined(LN) || defined(RT)
418 sall $2 + BASE_SHIFT, %eax
419 leal (BB, %eax, 4), BB
422 movaps 0 * SIZE(AA), %xmm0
424 movaps 16 * SIZE(AA), %xmm1
426 movaps 0 * SIZE(BB), %xmm2
428 movaps 16 * SIZE(BB), %xmm3
431 leal (LDC, LDC, 2), %eax
433 PREFETCHW 3 * SIZE(CO1)
434 PREFETCHW 3 * SIZE(CO1, LDC)
435 PREFETCHW 3 * SIZE(CO1, LDC, 2)
436 PREFETCHW 3 * SIZE(CO1, %eax)
438 #if defined(LT) || defined(RN)
465 #if defined(LT) || defined(RN)
471 andl $7, %eax # if (k & 1)
479 movaps 4 * SIZE(BB), %xmm2
482 movaps 8 * SIZE(BB), %xmm2
484 mulps 12 * SIZE(BB), %xmm0
486 movaps 16 * SIZE(BB), %xmm2
488 movaps 4 * SIZE(AA), %xmm0
497 #if defined(LN) || defined(RT)
509 sall $2 + BASE_SHIFT, %eax
510 leal (AA, %eax, 1), AA
512 leal (BB, %eax, 4), BB
515 #if defined(LN) || defined(LT)
517 unpcklps %xmm6, %xmm4
518 unpckhps %xmm6, %xmm0
521 unpcklps %xmm7, %xmm5
522 unpckhps %xmm7, %xmm1
525 unpcklps %xmm5, %xmm4
526 unpckhps %xmm5, %xmm6
529 unpcklps %xmm1, %xmm0
530 unpckhps %xmm1, %xmm2
532 movaps 0 * SIZE(B), %xmm1
533 movaps 4 * SIZE(B), %xmm3
534 movaps 8 * SIZE(B), %xmm5
535 movaps 12 * SIZE(B), %xmm7
542 movaps 0 * SIZE(AA), %xmm0
543 movaps 4 * SIZE(AA), %xmm1
544 movaps 8 * SIZE(AA), %xmm2
545 movaps 12 * SIZE(AA), %xmm3
554 movaps 12 * SIZE(AA), %xmm4
555 pshufd $0xff, %xmm4, %xmm6
557 pshufd $0xaa, %xmm4, %xmm6
560 pshufd $0x55, %xmm4, %xmm6
563 pshufd $0x00, %xmm4, %xmm6
567 movaps 8 * SIZE(AA), %xmm4
568 pshufd $0xaa, %xmm4, %xmm6
570 pshufd $0x55, %xmm4, %xmm6
573 pshufd $0x00, %xmm4, %xmm6
577 movaps 4 * SIZE(AA), %xmm4
578 pshufd $0x55, %xmm4, %xmm6
580 pshufd $0x00, %xmm4, %xmm6
584 movaps 0 * SIZE(AA), %xmm4
585 pshufd $0x00, %xmm4, %xmm6
590 movaps 0 * SIZE(AA), %xmm4
591 pshufd $0x00, %xmm4, %xmm6
594 pshufd $0x55, %xmm4, %xmm6
597 pshufd $0xaa, %xmm4, %xmm6
600 pshufd $0xff, %xmm4, %xmm6
604 movaps 4 * SIZE(AA), %xmm4
605 pshufd $0x55, %xmm4, %xmm6
607 pshufd $0xaa, %xmm4, %xmm6
610 pshufd $0xff, %xmm4, %xmm6
614 movaps 8 * SIZE(AA), %xmm4
615 pshufd $0xaa, %xmm4, %xmm6
617 pshufd $0xff, %xmm4, %xmm6
621 movaps 12 * SIZE(AA), %xmm4
622 pshufd $0xff, %xmm4, %xmm6
627 movaps 0 * SIZE(B), %xmm6
628 pshufd $0x00, %xmm6, %xmm7
630 pshufd $0x55, %xmm6, %xmm7
633 pshufd $0xaa, %xmm6, %xmm7
636 pshufd $0xff, %xmm6, %xmm7
640 movaps 4 * SIZE(B), %xmm6
641 pshufd $0x55, %xmm6, %xmm7
643 pshufd $0xaa, %xmm6, %xmm7
646 pshufd $0xff, %xmm6, %xmm7
650 movaps 8 * SIZE(B), %xmm6
651 pshufd $0xaa, %xmm6, %xmm7
653 pshufd $0xff, %xmm6, %xmm7
657 movaps 12 * SIZE(B), %xmm6
658 pshufd $0xff, %xmm6, %xmm7
663 movaps 12 * SIZE(B), %xmm6
664 pshufd $0xff, %xmm6, %xmm7
666 pshufd $0xaa, %xmm6, %xmm7
669 pshufd $0x55, %xmm6, %xmm7
672 pshufd $0x00, %xmm6, %xmm7
676 movaps 8 * SIZE(B), %xmm6
677 pshufd $0xaa, %xmm6, %xmm7
679 pshufd $0x55, %xmm6, %xmm7
682 pshufd $0x00, %xmm6, %xmm7
686 movaps 4 * SIZE(B), %xmm6
687 pshufd $0x55, %xmm6, %xmm7
689 pshufd $0x00, %xmm6, %xmm7
693 movaps 0 * SIZE(B), %xmm6
694 pshufd $0x00, %xmm6, %xmm7
/* Write back the solved 4x4 tile.  Under LN/LT the result lives in    */
/* xmm1/xmm3/xmm5/xmm7: store it back to the packed B panel, then      */
/* refresh the broadcast copies in BB (each scalar fanned out to a     */
/* 4-lane vector) so subsequent kernel passes see the updated values.  */
698 #if defined(LN) || defined(LT)
699 movaps %xmm1, 0 * SIZE(B)
700 movaps %xmm3, 4 * SIZE(B)
701 movaps %xmm5, 8 * SIZE(B)
702 movaps %xmm7, 12 * SIZE(B)
704 pshufd $0x00, %xmm1, %xmm0
705 pshufd $0x55, %xmm1, %xmm2
706 pshufd $0xaa, %xmm1, %xmm4
707 pshufd $0xff, %xmm1, %xmm6
708 movaps %xmm0, 0 * SIZE(BB)
709 movaps %xmm2, 4 * SIZE(BB)
710 movaps %xmm4, 8 * SIZE(BB)
711 movaps %xmm6, 12 * SIZE(BB)
713 pshufd $0x00, %xmm3, %xmm0
714 pshufd $0x55, %xmm3, %xmm2
715 pshufd $0xaa, %xmm3, %xmm4
716 pshufd $0xff, %xmm3, %xmm6
717 movaps %xmm0, 16 * SIZE(BB)
718 movaps %xmm2, 20 * SIZE(BB)
719 movaps %xmm4, 24 * SIZE(BB)
720 movaps %xmm6, 28 * SIZE(BB)
722 pshufd $0x00, %xmm5, %xmm0
723 pshufd $0x55, %xmm5, %xmm2
724 pshufd $0xaa, %xmm5, %xmm4
725 pshufd $0xff, %xmm5, %xmm6
726 movaps %xmm0, 32 * SIZE(BB)
727 movaps %xmm2, 36 * SIZE(BB)
728 movaps %xmm4, 40 * SIZE(BB)
729 movaps %xmm6, 44 * SIZE(BB)
731 pshufd $0x00, %xmm7, %xmm0
732 pshufd $0x55, %xmm7, %xmm2
733 pshufd $0xaa, %xmm7, %xmm4
734 pshufd $0xff, %xmm7, %xmm6
735 movaps %xmm0, 48 * SIZE(BB)
736 movaps %xmm2, 52 * SIZE(BB)
737 movaps %xmm4, 56 * SIZE(BB)
738 movaps %xmm6, 60 * SIZE(BB)
/* RN/RT branch: store the tile back to the packed A panel instead.    */
/* NOTE(review): the #else/#endif separating this from the BB stores   */
/* above is not visible in this excerpt -- confirm in the full file.   */
740 movaps %xmm0, 0 * SIZE(AA)
741 movaps %xmm1, 4 * SIZE(AA)
742 movaps %xmm2, 8 * SIZE(AA)
743 movaps %xmm3, 12 * SIZE(AA)
750 leal (LDC, LDC, 2), %eax
752 #if defined(LN) || defined(LT)
754 unpcklps %xmm5, %xmm1
755 unpckhps %xmm5, %xmm0
758 unpcklps %xmm7, %xmm3
759 unpckhps %xmm7, %xmm4
762 unpcklps %xmm3, %xmm1
763 unpckhps %xmm3, %xmm2
766 unpcklps %xmm4, %xmm0
767 unpckhps %xmm4, %xmm6
769 movlps %xmm1, 0 * SIZE(CO1)
770 movhps %xmm1, 2 * SIZE(CO1)
771 movlps %xmm2, 0 * SIZE(CO1, LDC, 1)
772 movhps %xmm2, 2 * SIZE(CO1, LDC, 1)
773 movlps %xmm0, 0 * SIZE(CO1, LDC, 2)
774 movhps %xmm0, 2 * SIZE(CO1, LDC, 2)
775 movlps %xmm6, 0 * SIZE(CO1, %eax, 1)
776 movhps %xmm6, 2 * SIZE(CO1, %eax, 1)
778 movlps %xmm0, 0 * SIZE(CO1)
779 movhps %xmm0, 2 * SIZE(CO1)
780 movlps %xmm1, 0 * SIZE(CO1, LDC, 1)
781 movhps %xmm1, 2 * SIZE(CO1, LDC, 1)
782 movlps %xmm2, 0 * SIZE(CO1, LDC, 2)
783 movhps %xmm2, 2 * SIZE(CO1, LDC, 2)
784 movlps %xmm3, 0 * SIZE(CO1, %eax, 1)
785 movhps %xmm3, 2 * SIZE(CO1, %eax, 1)
792 #if defined(LT) || defined(RN)
795 leal (,%eax, SIZE), %eax
796 leal (AA, %eax, 4), AA
814 sall $2 + BASE_SHIFT, %eax
828 sall $1 + BASE_SHIFT, %eax
832 #if defined(LN) || defined(RT)
835 leal (, %eax, SIZE), %eax
836 leal (AA, %eax, 2), AA
841 #if defined(LN) || defined(RT)
843 sall $2 + BASE_SHIFT, %eax
844 leal (BB, %eax, 4), BB
850 movsd 0 * SIZE(AA), %xmm0
855 movsd 8 * SIZE(AA), %xmm1
857 movaps 0 * SIZE(BB), %xmm2
859 movaps 16 * SIZE(BB), %xmm3
862 #if defined(LT) || defined(RN)
875 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
876 prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
878 movaps 4 * SIZE(BB), %xmm2
881 movaps 8 * SIZE(BB), %xmm2
884 movaps 12 * SIZE(BB), %xmm2
886 movsd 2 * SIZE(AA), %xmm0
888 movaps 32 * SIZE(BB), %xmm2
892 movaps 20 * SIZE(BB), %xmm3
895 movaps 24 * SIZE(BB), %xmm3
898 movaps 28 * SIZE(BB), %xmm3
900 movsd 4 * SIZE(AA), %xmm0
902 movaps 48 * SIZE(BB), %xmm3
906 movaps 36 * SIZE(BB), %xmm2
909 movaps 40 * SIZE(BB), %xmm2
912 movaps 44 * SIZE(BB), %xmm2
914 movsd 6 * SIZE(AA), %xmm0
916 movaps 64 * SIZE(BB), %xmm2
920 movaps 52 * SIZE(BB), %xmm3
923 movaps 56 * SIZE(BB), %xmm3
926 movaps 60 * SIZE(BB), %xmm3
928 movsd 16 * SIZE(AA), %xmm0
930 movaps 80 * SIZE(BB), %xmm3
934 movaps 68 * SIZE(BB), %xmm2
937 movaps 72 * SIZE(BB), %xmm2
940 movaps 76 * SIZE(BB), %xmm2
942 movsd 10 * SIZE(AA), %xmm1
944 movaps 96 * SIZE(BB), %xmm2
948 movaps 84 * SIZE(BB), %xmm3
951 movaps 88 * SIZE(BB), %xmm3
954 movaps 92 * SIZE(BB), %xmm3
956 movsd 12 * SIZE(AA), %xmm1
958 movaps 112 * SIZE(BB), %xmm3
962 movaps 100 * SIZE(BB), %xmm2
965 movaps 104 * SIZE(BB), %xmm2
968 movaps 108 * SIZE(BB), %xmm2
970 movsd 14 * SIZE(AA), %xmm1
972 movaps 128 * SIZE(BB), %xmm2
976 movaps 116 * SIZE(BB), %xmm3
979 movaps 120 * SIZE(BB), %xmm3
982 movaps 124 * SIZE(BB), %xmm3
984 movsd 24 * SIZE(AA), %xmm1
986 movaps 144 * SIZE(BB), %xmm3
995 #if defined(LT) || defined(RN)
1001 andl $7, %eax # if (k & 1)
1009 movaps 4 * SIZE(BB), %xmm2
1012 movaps 8 * SIZE(BB), %xmm2
1015 movaps 12 * SIZE(BB), %xmm2
1017 movsd 2 * SIZE(AA), %xmm0
1019 movaps 16 * SIZE(BB), %xmm2
1028 #if defined(LN) || defined(RT)
1040 sall $1 + BASE_SHIFT, %eax
1041 leal (AA, %eax, 1), AA
1042 leal (B, %eax, 2), B
1043 leal (BB, %eax, 8), BB
1046 #if defined(LN) || defined(LT)
1047 unpcklps %xmm6, %xmm4
1048 unpcklps %xmm7, %xmm5
1051 unpcklps %xmm5, %xmm4
1052 unpckhps %xmm5, %xmm6
1054 movaps 0 * SIZE(B), %xmm1
1055 movaps 4 * SIZE(B), %xmm3
1063 movsd 0 * SIZE(AA), %xmm0
1067 movsd 2 * SIZE(AA), %xmm1
1071 movsd 4 * SIZE(AA), %xmm2
1075 movsd 6 * SIZE(AA), %xmm3
1084 movaps 0 * SIZE(AA), %xmm4
1085 pshufd $0xff, %xmm4, %xmm6
1087 pshufd $0xaa, %xmm4, %xmm6
1091 pshufd $0x00, %xmm4, %xmm6
1096 movaps 0 * SIZE(AA), %xmm4
1097 pshufd $0x00, %xmm4, %xmm6
1100 pshufd $0x55, %xmm4, %xmm6
1104 pshufd $0xff, %xmm4, %xmm6
1109 movaps 0 * SIZE(B), %xmm6
1110 pshufd $0x00, %xmm6, %xmm7
1112 pshufd $0x55, %xmm6, %xmm7
1115 pshufd $0xaa, %xmm6, %xmm7
1118 pshufd $0xff, %xmm6, %xmm7
1122 movaps 4 * SIZE(B), %xmm6
1123 pshufd $0x55, %xmm6, %xmm7
1125 pshufd $0xaa, %xmm6, %xmm7
1128 pshufd $0xff, %xmm6, %xmm7
1132 movaps 8 * SIZE(B), %xmm6
1133 pshufd $0xaa, %xmm6, %xmm7
1135 pshufd $0xff, %xmm6, %xmm7
1139 movaps 12 * SIZE(B), %xmm6
1140 pshufd $0xff, %xmm6, %xmm7
1145 movaps 12 * SIZE(B), %xmm6
1146 pshufd $0xff, %xmm6, %xmm7
1148 pshufd $0xaa, %xmm6, %xmm7
1151 pshufd $0x55, %xmm6, %xmm7
1154 pshufd $0x00, %xmm6, %xmm7
1158 movaps 8 * SIZE(B), %xmm6
1159 pshufd $0xaa, %xmm6, %xmm7
1161 pshufd $0x55, %xmm6, %xmm7
1164 pshufd $0x00, %xmm6, %xmm7
1168 movaps 4 * SIZE(B), %xmm6
1169 pshufd $0x55, %xmm6, %xmm7
1171 pshufd $0x00, %xmm6, %xmm7
1175 movaps 0 * SIZE(B), %xmm6
1176 pshufd $0x00, %xmm6, %xmm7
1180 #if defined(LN) || defined(LT)
1181 movaps %xmm1, 0 * SIZE(B)
1182 movaps %xmm3, 4 * SIZE(B)
1184 pshufd $0x00, %xmm1, %xmm0
1185 pshufd $0x55, %xmm1, %xmm2
1186 pshufd $0xaa, %xmm1, %xmm4
1187 pshufd $0xff, %xmm1, %xmm6
1188 movaps %xmm0, 0 * SIZE(BB)
1189 movaps %xmm2, 4 * SIZE(BB)
1190 movaps %xmm4, 8 * SIZE(BB)
1191 movaps %xmm6, 12 * SIZE(BB)
1193 pshufd $0x00, %xmm3, %xmm0
1194 pshufd $0x55, %xmm3, %xmm2
1195 pshufd $0xaa, %xmm3, %xmm4
1196 pshufd $0xff, %xmm3, %xmm6
1197 movaps %xmm0, 16 * SIZE(BB)
1198 movaps %xmm2, 20 * SIZE(BB)
1199 movaps %xmm4, 24 * SIZE(BB)
1200 movaps %xmm6, 28 * SIZE(BB)
1202 movlps %xmm0, 0 * SIZE(AA)
1203 movlps %xmm1, 2 * SIZE(AA)
1204 movlps %xmm2, 4 * SIZE(AA)
1205 movlps %xmm3, 6 * SIZE(AA)
1212 leal (LDC, LDC, 2), %eax
1214 #if defined(LN) || defined(LT)
1216 unpcklps %xmm5, %xmm1
1217 unpckhps %xmm5, %xmm0
1220 unpcklps %xmm7, %xmm3
1221 unpckhps %xmm7, %xmm4
1224 unpcklps %xmm3, %xmm1
1225 unpckhps %xmm3, %xmm2
1228 unpcklps %xmm4, %xmm0
1229 unpckhps %xmm4, %xmm6
1231 movlps %xmm1, 0 * SIZE(CO1)
1232 movlps %xmm2, 0 * SIZE(CO1, LDC, 1)
1233 movlps %xmm0, 0 * SIZE(CO1, LDC, 2)
1234 movlps %xmm6, 0 * SIZE(CO1, %eax, 1)
1236 movlps %xmm0, 0 * SIZE(CO1)
1237 movlps %xmm1, 0 * SIZE(CO1, LDC, 1)
1238 movlps %xmm2, 0 * SIZE(CO1, LDC, 2)
1239 movlps %xmm3, 0 * SIZE(CO1, %eax, 1)
1246 #if defined(LT) || defined(RN)
1249 leal (,%eax, SIZE), %eax
1250 leal (AA, %eax, 2), AA
1268 sall $1 + BASE_SHIFT, %eax
1279 sall $BASE_SHIFT, %eax
1283 #if defined(LN) || defined(RT)
1286 leal (AA, %eax, SIZE), AA
1291 #if defined(LN) || defined(RT)
1293 sall $2 + BASE_SHIFT, %eax
1294 leal (BB, %eax, 4), BB
1297 movss 0 * SIZE(AA), %xmm0
1299 movss 4 * SIZE(AA), %xmm1
1301 movss 0 * SIZE(BB), %xmm2
1303 movss 16 * SIZE(BB), %xmm3
1306 #if defined(LT) || defined(RN)
1319 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
1320 prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
1322 movss 4 * SIZE(BB), %xmm2
1325 movss 8 * SIZE(BB), %xmm2
1327 mulss 12 * SIZE(BB), %xmm0
1329 movss 32 * SIZE(BB), %xmm2
1331 movss 1 * SIZE(AA), %xmm0
1335 movss 20 * SIZE(BB), %xmm3
1338 movss 24 * SIZE(BB), %xmm3
1340 mulss 28 * SIZE(BB), %xmm0
1342 movss 48 * SIZE(BB), %xmm3
1344 movss 2 * SIZE(AA), %xmm0
1348 movss 36 * SIZE(BB), %xmm2
1351 movss 40 * SIZE(BB), %xmm2
1353 mulss 44 * SIZE(BB), %xmm0
1355 movss 64 * SIZE(BB), %xmm2
1357 movss 3 * SIZE(AA), %xmm0
1361 movss 52 * SIZE(BB), %xmm3
1364 movss 56 * SIZE(BB), %xmm3
1366 mulss 60 * SIZE(BB), %xmm0
1368 movss 80 * SIZE(BB), %xmm3
1370 movss 8 * SIZE(AA), %xmm0
1374 movss 68 * SIZE(BB), %xmm2
1377 movss 72 * SIZE(BB), %xmm2
1379 mulss 76 * SIZE(BB), %xmm1
1381 movss 96 * SIZE(BB), %xmm2
1383 movss 5 * SIZE(AA), %xmm1
1387 movss 84 * SIZE(BB), %xmm3
1390 movss 88 * SIZE(BB), %xmm3
1392 mulss 92 * SIZE(BB), %xmm1
1394 movss 112 * SIZE(BB), %xmm3
1396 movss 6 * SIZE(AA), %xmm1
1400 movss 100 * SIZE(BB), %xmm2
1403 movss 104 * SIZE(BB), %xmm2
1405 mulss 108 * SIZE(BB), %xmm1
1407 movss 128 * SIZE(BB), %xmm2
1409 movss 7 * SIZE(AA), %xmm1
1413 movss 116 * SIZE(BB), %xmm3
1416 movss 120 * SIZE(BB), %xmm3
1418 mulss 124 * SIZE(BB), %xmm1
1420 movss 144 * SIZE(BB), %xmm3
1422 movss 12 * SIZE(AA), %xmm1
1425 addl $128 * SIZE, BB
1431 #if defined(LT) || defined(RN)
1437 andl $7, %eax # if (k & 1)
1445 movss 4 * SIZE(BB), %xmm2
1448 movss 8 * SIZE(BB), %xmm2
1450 mulss 12 * SIZE(BB), %xmm0
1452 movss 16 * SIZE(BB), %xmm2
1454 movss 1 * SIZE(AA), %xmm0
1463 #if defined(LN) || defined(RT)
1475 leal (AA, %eax, SIZE), AA
1477 sall $2 + BASE_SHIFT, %eax
1478 leal (B, %eax, 1), B
1479 leal (BB, %eax, 4), BB
1482 #if defined(LN) || defined(LT)
1483 unpcklps %xmm6, %xmm4
1484 unpcklps %xmm7, %xmm5
1485 unpcklps %xmm5, %xmm4
1487 movaps 0 * SIZE(B), %xmm1
1491 movss 0 * SIZE(AA), %xmm0
1492 movss 1 * SIZE(AA), %xmm1
1493 movss 2 * SIZE(AA), %xmm2
1494 movss 3 * SIZE(AA), %xmm3
1502 #if defined(LN) || defined(LT)
1503 movss 0 * SIZE(AA), %xmm4
1504 pshufd $0x00, %xmm4, %xmm6
1509 movaps 0 * SIZE(B), %xmm6
1510 pshufd $0x00, %xmm6, %xmm7
1512 pshufd $0x55, %xmm6, %xmm7
1515 pshufd $0xaa, %xmm6, %xmm7
1518 pshufd $0xff, %xmm6, %xmm7
1522 movaps 4 * SIZE(B), %xmm6
1523 pshufd $0x55, %xmm6, %xmm7
1525 pshufd $0xaa, %xmm6, %xmm7
1528 pshufd $0xff, %xmm6, %xmm7
1532 movaps 8 * SIZE(B), %xmm6
1533 pshufd $0xaa, %xmm6, %xmm7
1535 pshufd $0xff, %xmm6, %xmm7
1539 movaps 12 * SIZE(B), %xmm6
1540 pshufd $0xff, %xmm6, %xmm7
1545 movaps 12 * SIZE(B), %xmm6
1546 pshufd $0xff, %xmm6, %xmm7
1548 pshufd $0xaa, %xmm6, %xmm7
1551 pshufd $0x55, %xmm6, %xmm7
1554 pshufd $0x00, %xmm6, %xmm7
1558 movaps 8 * SIZE(B), %xmm6
1559 pshufd $0xaa, %xmm6, %xmm7
1561 pshufd $0x55, %xmm6, %xmm7
1564 pshufd $0x00, %xmm6, %xmm7
1568 movaps 4 * SIZE(B), %xmm6
1569 pshufd $0x55, %xmm6, %xmm7
1571 pshufd $0x00, %xmm6, %xmm7
1575 movaps 0 * SIZE(B), %xmm6
1576 pshufd $0x00, %xmm6, %xmm7
1580 #if defined(LN) || defined(LT)
1581 movaps %xmm1, 0 * SIZE(B)
1583 pshufd $0x00, %xmm1, %xmm0
1584 pshufd $0x55, %xmm1, %xmm2
1585 pshufd $0xaa, %xmm1, %xmm4
1586 pshufd $0xff, %xmm1, %xmm6
1587 movaps %xmm0, 0 * SIZE(BB)
1588 movaps %xmm2, 4 * SIZE(BB)
1589 movaps %xmm4, 8 * SIZE(BB)
1590 movaps %xmm6, 12 * SIZE(BB)
1592 movss %xmm0, 0 * SIZE(AA)
1593 movss %xmm1, 1 * SIZE(AA)
1594 movss %xmm2, 2 * SIZE(AA)
1595 movss %xmm3, 3 * SIZE(AA)
1602 leal (LDC, LDC, 2), %eax
1604 #if defined(LN) || defined(LT)
1606 unpcklps %xmm5, %xmm1
1607 unpckhps %xmm5, %xmm0
1610 unpcklps %xmm7, %xmm3
1611 unpckhps %xmm7, %xmm4
1614 unpcklps %xmm3, %xmm1
1615 unpckhps %xmm3, %xmm2
1618 unpcklps %xmm4, %xmm0
1619 unpckhps %xmm4, %xmm6
1621 movss %xmm1, 0 * SIZE(CO1)
1622 movss %xmm2, 0 * SIZE(CO1, LDC, 1)
1623 movss %xmm0, 0 * SIZE(CO1, LDC, 2)
1624 movss %xmm6, 0 * SIZE(CO1, %eax, 1)
1626 movss %xmm0, 0 * SIZE(CO1)
1627 movss %xmm1, 0 * SIZE(CO1, LDC, 1)
1628 movss %xmm2, 0 * SIZE(CO1, LDC, 2)
1629 movss %xmm3, 0 * SIZE(CO1, %eax, 1)
1636 #if defined(LT) || defined(RN)
1639 leal (AA, %eax, SIZE), AA
1657 sall $BASE_SHIFT, %eax
1665 leal (, %eax, SIZE), %eax
1666 leal (B, %eax, 4), B
1669 #if defined(LT) || defined(RN)
1672 leal (,%eax, SIZE), %eax
1673 leal (B, %eax, 4), B
1702 sall $1 + BASE_SHIFT, %eax
1706 #if defined(LN) || defined(RT)
1709 sall $1 + BASE_SHIFT, %eax
1710 leal (B, %eax, 1), B
1711 leal (BB, %eax, 4), BB
1719 #if defined(LT) || defined(RN)
/* Broadcast-expand eight B elements into BB for this n-loop section:  */
/* two vector loads, eight pshufd broadcasts, eight aligned stores,    */
/* then advance the write cursor past the 32 floats written.           */
1730 movaps 0 * SIZE(B), %xmm3
1731 movaps 4 * SIZE(B), %xmm7
1733 pshufd $0x00, %xmm3, %xmm0
1734 pshufd $0x55, %xmm3, %xmm1
1735 pshufd $0xaa, %xmm3, %xmm2
1736 pshufd $0xff, %xmm3, %xmm3
1738 pshufd $0x00, %xmm7, %xmm4
1739 pshufd $0x55, %xmm7, %xmm5
1740 pshufd $0xaa, %xmm7, %xmm6
1741 pshufd $0xff, %xmm7, %xmm7
1743 movaps %xmm0, 0 * SIZE(BB)
1744 movaps %xmm1, 4 * SIZE(BB)
1745 movaps %xmm2, 8 * SIZE(BB)
1746 movaps %xmm3, 12 * SIZE(BB)
1747 movaps %xmm4, 16 * SIZE(BB)
1748 movaps %xmm5, 20 * SIZE(BB)
1749 movaps %xmm6, 24 * SIZE(BB)
1750 movaps %xmm7, 28 * SIZE(BB)
1753 addl $32 * SIZE, %ecx
1759 #if defined(LT) || defined(RN)
/* Broadcast-expand a 2-element tail of B: load two scalars with       */
/* movsd, fan each across 4 lanes, store 8 floats, advance cursor.     */
1774 movsd 0 * SIZE(B), %xmm3
1776 pshufd $0x00, %xmm3, %xmm0
1777 pshufd $0x55, %xmm3, %xmm1
1779 movaps %xmm0, 0 * SIZE(BB)
1780 movaps %xmm1, 4 * SIZE(BB)
1783 addl $8 * SIZE, %ecx
1789 #if defined(LT) || defined(RN)
1796 leal (, LDC, 2), %eax
1807 sarl $2, %ebx # i = (m >> 2)
1814 sall $2 + BASE_SHIFT, %eax
1818 #if defined(LN) || defined(RT)
1821 leal (, %eax, SIZE), %eax
1822 leal (AA, %eax, 4), AA
1827 #if defined(LN) || defined(RT)
1829 sall $1 + BASE_SHIFT, %eax
1830 leal (BB, %eax, 4), BB
1838 movaps 0 * SIZE(AA), %xmm0
1839 movaps 16 * SIZE(AA), %xmm1
1840 movaps 0 * SIZE(BB), %xmm2
1841 movaps 16 * SIZE(BB), %xmm3
1843 PREFETCHW 3 * SIZE(CO1)
1844 PREFETCHW 3 * SIZE(CO1, LDC)
1846 #if defined(LT) || defined(RN)
1858 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
1859 prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
1861 mulps 4 * SIZE(BB), %xmm0
1863 movaps 8 * SIZE(BB), %xmm2
1865 movaps 4 * SIZE(AA), %xmm0
1868 mulps 12 * SIZE(BB), %xmm0
1870 movaps 32 * SIZE(BB), %xmm2
1872 movaps 8 * SIZE(AA), %xmm0
1875 mulps 20 * SIZE(BB), %xmm0
1877 movaps 24 * SIZE(BB), %xmm3
1879 movaps 12 * SIZE(AA), %xmm0
1882 mulps 28 * SIZE(BB), %xmm0
1884 movaps 48 * SIZE(BB), %xmm3
1886 movaps 32 * SIZE(AA), %xmm0
1888 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
1889 prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
1892 mulps 36 * SIZE(BB), %xmm1
1894 movaps 40 * SIZE(BB), %xmm2
1896 movaps 20 * SIZE(AA), %xmm1
1899 mulps 44 * SIZE(BB), %xmm1
1901 movaps 64 * SIZE(BB), %xmm2
1903 movaps 24 * SIZE(AA), %xmm1
1906 mulps 52 * SIZE(BB), %xmm1
1908 movaps 56 * SIZE(BB), %xmm3
1910 movaps 28 * SIZE(AA), %xmm1
1913 mulps 60 * SIZE(BB), %xmm1
1915 movaps 80 * SIZE(BB), %xmm3
1917 movaps 48 * SIZE(AA), %xmm1
1926 #if defined(LT) || defined(RN)
1932 andl $7, %eax # if (k & 1)
1939 mulps 4 * SIZE(BB), %xmm0
1941 movaps 8 * SIZE(BB), %xmm2
1943 movaps 4 * SIZE(AA), %xmm0
1952 #if defined(LN) || defined(RT)
1964 sall $1 + BASE_SHIFT, %eax
1965 leal (AA, %eax, 2), AA
1966 leal (B, %eax, 1), B
1967 leal (BB, %eax, 4), BB
1970 #if defined(LN) || defined(LT)
1972 unpcklps %xmm6, %xmm4
1973 unpckhps %xmm6, %xmm0
1976 unpcklps %xmm7, %xmm5
1977 unpckhps %xmm7, %xmm1
1980 unpcklps %xmm5, %xmm4
1981 unpckhps %xmm5, %xmm6
1984 unpcklps %xmm1, %xmm0
1985 unpckhps %xmm1, %xmm2
1990 movsd 0 * SIZE(B), %xmm1
1994 movsd 2 * SIZE(B), %xmm3
1998 movsd 4 * SIZE(B), %xmm5
2002 movsd 6 * SIZE(B), %xmm7
2009 movaps 0 * SIZE(AA), %xmm0
2010 movaps 4 * SIZE(AA), %xmm1
2017 movaps 12 * SIZE(AA), %xmm4
2018 pshufd $0xff, %xmm4, %xmm6
2020 pshufd $0xaa, %xmm4, %xmm6
2023 pshufd $0x55, %xmm4, %xmm6
2026 pshufd $0x00, %xmm4, %xmm6
2030 movaps 8 * SIZE(AA), %xmm4
2031 pshufd $0xaa, %xmm4, %xmm6
2033 pshufd $0x55, %xmm4, %xmm6
2036 pshufd $0x00, %xmm4, %xmm6
2040 movaps 4 * SIZE(AA), %xmm4
2041 pshufd $0x55, %xmm4, %xmm6
2043 pshufd $0x00, %xmm4, %xmm6
2047 movaps 0 * SIZE(AA), %xmm4
2048 pshufd $0x00, %xmm4, %xmm6
2053 movaps 0 * SIZE(AA), %xmm4
2054 pshufd $0x00, %xmm4, %xmm6
2057 pshufd $0x55, %xmm4, %xmm6
2060 pshufd $0xaa, %xmm4, %xmm6
2063 pshufd $0xff, %xmm4, %xmm6
2067 movaps 4 * SIZE(AA), %xmm4
2068 pshufd $0x55, %xmm4, %xmm6
2070 pshufd $0xaa, %xmm4, %xmm6
2073 pshufd $0xff, %xmm4, %xmm6
2077 movaps 8 * SIZE(AA), %xmm4
2078 pshufd $0xaa, %xmm4, %xmm6
2080 pshufd $0xff, %xmm4, %xmm6
2084 movaps 12 * SIZE(AA), %xmm4
2085 pshufd $0xff, %xmm4, %xmm6
2090 movaps 0 * SIZE(B), %xmm6
2091 pshufd $0x00, %xmm6, %xmm7
2093 pshufd $0x55, %xmm6, %xmm7
2097 pshufd $0xff, %xmm6, %xmm7
2102 movaps 0 * SIZE(B), %xmm6
2103 pshufd $0xff, %xmm6, %xmm7
2105 pshufd $0xaa, %xmm6, %xmm7
2109 pshufd $0x00, %xmm6, %xmm7
2113 #if defined(LN) || defined(LT)
2114 movlps %xmm1, 0 * SIZE(B)
2115 movlps %xmm3, 2 * SIZE(B)
2116 movlps %xmm5, 4 * SIZE(B)
2117 movlps %xmm7, 6 * SIZE(B)
2119 pshufd $0x00, %xmm1, %xmm0
2120 pshufd $0x55, %xmm1, %xmm2
2121 movaps %xmm0, 0 * SIZE(BB)
2122 movaps %xmm2, 4 * SIZE(BB)
2124 pshufd $0x00, %xmm3, %xmm0
2125 pshufd $0x55, %xmm3, %xmm2
2126 movaps %xmm0, 8 * SIZE(BB)
2127 movaps %xmm2, 12 * SIZE(BB)
2129 pshufd $0x00, %xmm5, %xmm0
2130 pshufd $0x55, %xmm5, %xmm2
2131 movaps %xmm0, 16 * SIZE(BB)
2132 movaps %xmm2, 20 * SIZE(BB)
2134 pshufd $0x00, %xmm7, %xmm0
2135 pshufd $0x55, %xmm7, %xmm2
2136 movaps %xmm0, 24 * SIZE(BB)
2137 movaps %xmm2, 28 * SIZE(BB)
2139 movaps %xmm0, 0 * SIZE(AA)
2140 movaps %xmm1, 4 * SIZE(AA)
2147 #if defined(LN) || defined(LT)
2148 unpcklps %xmm5, %xmm1
2149 unpcklps %xmm7, %xmm3
2152 unpcklps %xmm3, %xmm1
2153 unpckhps %xmm3, %xmm2
2155 movlps %xmm1, 0 * SIZE(CO1)
2156 movhps %xmm1, 2 * SIZE(CO1)
2157 movlps %xmm2, 0 * SIZE(CO1, LDC, 1)
2158 movhps %xmm2, 2 * SIZE(CO1, LDC, 1)
2160 movlps %xmm0, 0 * SIZE(CO1)
2161 movhps %xmm0, 2 * SIZE(CO1)
2162 movlps %xmm1, 0 * SIZE(CO1, LDC, 1)
2163 movhps %xmm1, 2 * SIZE(CO1, LDC, 1)
2170 #if defined(LT) || defined(RN)
2173 leal (,%eax, SIZE), %eax
2174 leal (AA, %eax, 4), AA
2192 sall $2 + BASE_SHIFT, %eax
2206 sall $1 + BASE_SHIFT, %eax
2210 #if defined(LN) || defined(RT)
2213 leal (, %eax, SIZE), %eax
2214 leal (AA, %eax, 2), AA
2219 #if defined(LN) || defined(RT)
2221 sall $1 + BASE_SHIFT, %eax
2222 leal (BB, %eax, 4), BB
2233 movsd 0 * SIZE(AA), %xmm0
2237 movsd 8 * SIZE(AA), %xmm1
2238 movaps 0 * SIZE(BB), %xmm2
2239 movaps 16 * SIZE(BB), %xmm3
2241 #if defined(LT) || defined(RN)
2252 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
2253 prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
2258 movaps 4 * SIZE(BB), %xmm2
2260 movsd 2 * SIZE(AA), %xmm0
2262 movaps 8 * SIZE(BB), %xmm2
2266 movaps 12 * SIZE(BB), %xmm2
2268 movsd 4 * SIZE(AA), %xmm0
2270 movaps 32 * SIZE(BB), %xmm2
2274 movaps 20 * SIZE(BB), %xmm3
2276 movsd 6 * SIZE(AA), %xmm0
2278 movaps 24 * SIZE(BB), %xmm3
2282 movaps 28 * SIZE(BB), %xmm3
2284 movsd 16 * SIZE(AA), %xmm0
2286 movaps 48 * SIZE(BB), %xmm3
2290 movaps 36 * SIZE(BB), %xmm2
2292 movsd 10 * SIZE(AA), %xmm1
2294 movaps 40 * SIZE(BB), %xmm2
2298 movaps 44 * SIZE(BB), %xmm2
2300 movsd 12 * SIZE(AA), %xmm1
2302 movaps 64 * SIZE(BB), %xmm2
2306 movaps 52 * SIZE(BB), %xmm3
2308 movsd 14 * SIZE(AA), %xmm1
2310 movaps 56 * SIZE(BB), %xmm3
2314 movaps 60 * SIZE(BB), %xmm3
2316 movsd 24 * SIZE(AA), %xmm1
2318 movaps 80 * SIZE(BB), %xmm3
2327 #if defined(LT) || defined(RN)
2333 andl $7, %eax # if (k & 1)
2341 movaps 4 * SIZE(BB), %xmm2
2343 movsd 2 * SIZE(AA), %xmm0
2345 movaps 8 * SIZE(BB), %xmm2
2357 #if defined(LN) || defined(RT)
2369 sall $BASE_SHIFT, %eax
2370 leal (AA, %eax, 2), AA
2371 leal (B, %eax, 2), B
2372 leal (BB, %eax, 8), BB
2375 #if defined(LN) || defined(LT)
2376 unpcklps %xmm6, %xmm4
2377 unpcklps %xmm7, %xmm5
2380 unpcklps %xmm5, %xmm4
2381 unpckhps %xmm5, %xmm6
2386 movsd 0 * SIZE(B), %xmm1
2390 movsd 2 * SIZE(B), %xmm3
2398 movsd 0 * SIZE(AA), %xmm0
2402 movsd 2 * SIZE(AA), %xmm1
2409 movaps 0 * SIZE(AA), %xmm4
2410 pshufd $0xff, %xmm4, %xmm6
2412 pshufd $0xaa, %xmm4, %xmm6
2416 pshufd $0x00, %xmm4, %xmm6
2421 movaps 0 * SIZE(AA), %xmm4
2422 pshufd $0x00, %xmm4, %xmm6
2424 pshufd $0x55, %xmm4, %xmm6
2428 pshufd $0xff, %xmm4, %xmm6
2433 movaps 0 * SIZE(B), %xmm6
2434 pshufd $0x00, %xmm6, %xmm7
2436 pshufd $0x55, %xmm6, %xmm7
2440 pshufd $0xff, %xmm6, %xmm7
2445 movaps 0 * SIZE(B), %xmm6
2446 pshufd $0xff, %xmm6, %xmm7
2448 pshufd $0xaa, %xmm6, %xmm7
2452 pshufd $0x00, %xmm6, %xmm7
/* Write the solved 2x2 block back into the packed panels and into C.
   LN/LT store through B (and re-broadcast into BB); RN/RT store
   through AA.  #endif lines are outside this excerpt. */
2456 #if defined(LN) || defined(LT)
2457 movlps %xmm1, 0 * SIZE(B)
2458 movlps %xmm3, 2 * SIZE(B)
/* Re-expand each solved scalar to all four lanes of BB so the next
   GEMM update can consume it with movaps. */
2460 pshufd $0x00, %xmm1, %xmm0
2461 pshufd $0x55, %xmm1, %xmm2
2462 movaps %xmm0, 0 * SIZE(BB)
2463 movaps %xmm2, 4 * SIZE(BB)
2465 pshufd $0x00, %xmm3, %xmm0
2466 pshufd $0x55, %xmm3, %xmm2
2467 movaps %xmm0, 8 * SIZE(BB)
2468 movaps %xmm2, 12 * SIZE(BB)
2470 movlps %xmm0, 0 * SIZE(AA)
2471 movlps %xmm1, 2 * SIZE(AA)
/* Store the result tile to C: column 0 at CO1, column 1 at CO1+LDC. */
2478 #if defined(LN) || defined(LT)
2479 unpcklps %xmm3, %xmm1
2481 movlps %xmm1, 0 * SIZE(CO1)
2482 movhps %xmm1, 0 * SIZE(CO1, LDC)
2484 movlps %xmm0, 0 * SIZE(CO1)
2485 movlps %xmm1, 0 * SIZE(CO1, LDC)
/* Per-variant pointer bookkeeping for the next micro-tile / column. */
2492 #if defined(LT) || defined(RN)
2495 leal (,%eax, SIZE), %eax
2496 leal (AA, %eax, 2), AA
2514 sall $1 + BASE_SHIFT, %eax
2525 sall $BASE_SHIFT, %eax
2529 #if defined(LN) || defined(RT)
2532 leal (AA, %eax, SIZE), AA
2537 #if defined(LN) || defined(RT)
2539 sall $1 + BASE_SHIFT, %eax
2540 leal (BB, %eax, 4), BB
/* M=1 remainder of the N=2 column block: pure scalar (movss/mulss)
   version of the same pipelined loop.  Excerpt is non-contiguous; the
   addss accumulate instructions between these lines are not visible. */
2548 movss 0 * SIZE(AA), %xmm0
2549 movss 4 * SIZE(AA), %xmm1
2550 movss 0 * SIZE(BB), %xmm2
2551 movss 16 * SIZE(BB), %xmm3
2553 #if defined(LT) || defined(RN)
/* AMD-family cores benefit from explicit prefetch of the A panel. */
2565 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
2566 prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
2568 mulss 4 * SIZE(BB), %xmm0
2570 movss 8 * SIZE(BB), %xmm2
2572 movss 1 * SIZE(AA), %xmm0
2574 mulss 12 * SIZE(BB), %xmm0
2576 movss 32 * SIZE(BB), %xmm2
2578 movss 2 * SIZE(AA), %xmm0
2580 mulss 20 * SIZE(BB), %xmm0
2582 movss 24 * SIZE(BB), %xmm3
2584 movss 3 * SIZE(AA), %xmm0
2586 mulss 28 * SIZE(BB), %xmm0
2588 movss 48 * SIZE(BB), %xmm3
2590 movss 8 * SIZE(AA), %xmm0
2592 mulss 36 * SIZE(BB), %xmm1
2594 movss 40 * SIZE(BB), %xmm2
2596 movss 5 * SIZE(AA), %xmm1
2598 mulss 44 * SIZE(BB), %xmm1
2600 movss 64 * SIZE(BB), %xmm2
2602 movss 6 * SIZE(AA), %xmm1
2604 mulss 52 * SIZE(BB), %xmm1
2606 movss 56 * SIZE(BB), %xmm3
2608 movss 7 * SIZE(AA), %xmm1
2610 mulss 60 * SIZE(BB), %xmm1
2612 movss 80 * SIZE(BB), %xmm3
2614 movss 12 * SIZE(AA), %xmm1
/* k & 7 leftover iterations, then pointer rebase: AA advances 1
   element per k (M=1), B by 2 (N=2), BB by 8 (broadcast-expanded). */
2623 #if defined(LT) || defined(RN)
2629 andl $7, %eax # if (k & 1)
2636 mulss 4 * SIZE(BB), %xmm0
2638 movss 8 * SIZE(BB), %xmm2
2640 movss 1 * SIZE(AA), %xmm0
2652 #if defined(LN) || defined(RT)
2664 sall $BASE_SHIFT, %eax
2665 leal (AA, %eax, 1), AA
2666 leal (B, %eax, 2), B
2667 leal (BB, %eax, 8), BB
/* Solve + writeback for the 1x2 tile, then bookkeeping that ends the
   N=2 column block and sets up the N=1 (single right-hand-side)
   block.  #endif lines are outside this excerpt. */
2670 #if defined(LN) || defined(LT)
2671 unpcklps %xmm5, %xmm4
2676 movsd 0 * SIZE(B), %xmm1
2680 movss 0 * SIZE(AA), %xmm0
2681 movss 1 * SIZE(AA), %xmm1
2687 #if defined(LN) || defined(LT)
/* M=1: the A "triangle" is a single diagonal entry, broadcast once. */
2688 movss 0 * SIZE(AA), %xmm4
2689 pshufd $0x00, %xmm4, %xmm6
/* RN path: forward substitution through the 2x2 B triangle. */
2694 movaps 0 * SIZE(B), %xmm6
2695 pshufd $0x00, %xmm6, %xmm7
2697 pshufd $0x55, %xmm6, %xmm7
2701 pshufd $0xff, %xmm6, %xmm7
/* RT path: same triangle, backward order. */
2706 movaps 0 * SIZE(B), %xmm6
2707 pshufd $0xff, %xmm6, %xmm7
2709 pshufd $0xaa, %xmm6, %xmm7
2713 pshufd $0x00, %xmm6, %xmm7
/* Store the solved values back to B (re-broadcast into BB) or AA. */
2717 #if defined(LN) || defined(LT)
2718 movlps %xmm1, 0 * SIZE(B)
2720 pshufd $0x00, %xmm1, %xmm0
2721 pshufd $0x55, %xmm1, %xmm2
2722 movaps %xmm0, 0 * SIZE(BB)
2723 movaps %xmm2, 4 * SIZE(BB)
2725 movss %xmm0, 0 * SIZE(AA)
2726 movss %xmm1, 1 * SIZE(AA)
/* Store 1x2 result to C: one scalar per column (CO1 and CO1+LDC). */
2733 #if defined(LN) || defined(LT)
2734 pshufd $1, %xmm1, %xmm3
2736 movss %xmm1, 0 * SIZE(CO1)
2737 movss %xmm3, 0 * SIZE(CO1, LDC)
2739 movss %xmm0, 0 * SIZE(CO1)
2740 movss %xmm1, 0 * SIZE(CO1, LDC)
/* End-of-column bookkeeping: advance AA/B per variant, then begin the
   N=1 setup (B advances by 1 column, BB by its 4x expansion). */
2747 #if defined(LT) || defined(RN)
2750 leal (AA, %eax, SIZE), AA
2768 sall $BASE_SHIFT, %eax
2776 leal (, %eax, SIZE), %eax
2777 leal (B, %eax, 2), B
2780 #if defined(LT) || defined(RN)
2783 leal (,%eax, SIZE), %eax
2784 leal (B, %eax, 2), B
2810 sall $BASE_SHIFT, %eax
2814 #if defined(LN) || defined(RT)
2817 sall $BASE_SHIFT, %eax
2818 leal (B, %eax, 1), B
2819 leal (BB, %eax, 4), BB
/* N=1 column block: pre-expansion of the B panel.  Eight consecutive
   B scalars are each broadcast to a full xmm register and stored into
   BB, so the GEMM loop can read aligned 4-wide operands. */
2827 #if defined(LT) || defined(RN)
2838 movsd 0 * SIZE(B), %xmm3
2839 movhps 2 * SIZE(B), %xmm3
2840 movsd 4 * SIZE(B), %xmm7
2841 movhps 6 * SIZE(B), %xmm7
2843 pshufd $0x00, %xmm3, %xmm0
2844 pshufd $0x55, %xmm3, %xmm1
2845 pshufd $0xaa, %xmm3, %xmm2
2846 pshufd $0xff, %xmm3, %xmm3
2848 pshufd $0x00, %xmm7, %xmm4
2849 pshufd $0x55, %xmm7, %xmm5
2850 pshufd $0xaa, %xmm7, %xmm6
2851 pshufd $0xff, %xmm7, %xmm7
2853 movaps %xmm0, 0 * SIZE(BB)
2854 movaps %xmm1, 4 * SIZE(BB)
2855 movaps %xmm2, 8 * SIZE(BB)
2856 movaps %xmm3, 12 * SIZE(BB)
2857 movaps %xmm4, 16 * SIZE(BB)
2858 movaps %xmm5, 20 * SIZE(BB)
2859 movaps %xmm6, 24 * SIZE(BB)
2860 movaps %xmm7, 28 * SIZE(BB)
/* Tail of the copy: single leftover B element, same broadcast. */
2869 #if defined(LT) || defined(RN)
2881 movss 0 * SIZE(B), %xmm3
2883 pshufd $0x00, %xmm3, %xmm0
2885 movaps %xmm0, 0 * SIZE(BB)
/* Row-loop setup: i = m >> 2 iterations of the 4-wide micro-kernel;
   AA rebased by 4 elements per k for the M=4 tile. */
2894 #if defined(LT) || defined(RN)
2910 sarl $2, %ebx # i = (m >> 2)
2917 sall $2 + BASE_SHIFT, %eax
2921 #if defined(LN) || defined(RT)
2924 leal (, %eax, SIZE), %eax
2925 leal (AA, %eax, 4), AA
2930 #if defined(LN) || defined(RT)
2932 sall $BASE_SHIFT, %eax
2933 leal (BB, %eax, 4), BB
/* M=4 / N=1 micro-kernel: packed 4-wide mulps against the broadcast
   BB panel.  Non-contiguous excerpt — the addps accumulates and loop
   label/branch are not visible. */
2941 movaps 0 * SIZE(AA), %xmm0
2942 movaps 16 * SIZE(AA), %xmm1
2943 movaps 0 * SIZE(BB), %xmm2
2944 movaps 16 * SIZE(BB), %xmm3
/* Prefetch the C destination for write before the loop. */
2946 PREFETCHW 3 * SIZE(CO1)
2948 #if defined(LT) || defined(RN)
2960 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
2961 prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
2963 movaps 4 * SIZE(AA), %xmm0
2965 movaps 32 * SIZE(BB), %xmm2
2966 mulps 4 * SIZE(BB), %xmm0
2968 movaps 8 * SIZE(AA), %xmm0
2969 mulps 8 * SIZE(BB), %xmm0
2971 movaps 12 * SIZE(AA), %xmm0
2972 mulps 12 * SIZE(BB), %xmm0
2974 movaps 32 * SIZE(AA), %xmm0
2975 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
2976 prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
2979 movaps 20 * SIZE(AA), %xmm1
2981 movaps 48 * SIZE(BB), %xmm3
2982 mulps 20 * SIZE(BB), %xmm1
2984 movaps 24 * SIZE(AA), %xmm1
2985 mulps 24 * SIZE(BB), %xmm1
2987 movaps 28 * SIZE(AA), %xmm1
2988 mulps 28 * SIZE(BB), %xmm1
2990 movaps 48 * SIZE(AA), %xmm1
/* k & 7 remainder, then rebase: AA by 4 per k, B by 1 (N=1), BB by 4. */
2999 #if defined(LT) || defined(RN)
3005 andl $7, %eax # if (k & 1)
3013 movaps 4 * SIZE(AA), %xmm0
3014 movaps 4 * SIZE(BB), %xmm2
3027 #if defined(LN) || defined(RT)
3039 sall $ BASE_SHIFT, %eax
3040 leal (AA, %eax, 4), AA
3041 leal (B, %eax, 1), B
3042 leal (BB, %eax, 4), BB
/* Solve + writeback for the 4x1 tile.  LN/LT walk a packed 4x4 A
   triangle (four movaps rows, pshufd lane-selects in backward or
   forward diagonal order); RN/RT divide by the single B diagonal
   entry.  The mulss/subss arithmetic between loads is not visible in
   this excerpt. */
3045 #if defined(LN) || defined(LT)
/* Transpose the four accumulators into row order. */
3047 unpcklps %xmm6, %xmm4
3048 unpckhps %xmm6, %xmm0
3051 unpcklps %xmm7, %xmm5
3052 unpckhps %xmm7, %xmm1
3055 unpcklps %xmm5, %xmm4
3056 unpckhps %xmm5, %xmm6
3059 unpcklps %xmm1, %xmm0
3060 unpckhps %xmm1, %xmm2
3062 movss 0 * SIZE(B), %xmm1
3063 movss 1 * SIZE(B), %xmm3
3064 movss 2 * SIZE(B), %xmm5
3065 movss 3 * SIZE(B), %xmm7
3072 movaps 0 * SIZE(AA), %xmm0
/* LN: start from the last row (12*SIZE) and substitute backward. */
3078 movaps 12 * SIZE(AA), %xmm4
3079 pshufd $0xff, %xmm4, %xmm6
3081 pshufd $0xaa, %xmm4, %xmm6
3084 pshufd $0x55, %xmm4, %xmm6
3087 pshufd $0x00, %xmm4, %xmm6
3091 movaps 8 * SIZE(AA), %xmm4
3092 pshufd $0xaa, %xmm4, %xmm6
3094 pshufd $0x55, %xmm4, %xmm6
3097 pshufd $0x00, %xmm4, %xmm6
3101 movaps 4 * SIZE(AA), %xmm4
3102 pshufd $0x55, %xmm4, %xmm6
3104 pshufd $0x00, %xmm4, %xmm6
3108 movaps 0 * SIZE(AA), %xmm4
3109 pshufd $0x00, %xmm4, %xmm6
/* LT: first row forward (0x00..0xff), walking down the triangle. */
3114 movaps 0 * SIZE(AA), %xmm4
3115 pshufd $0x00, %xmm4, %xmm6
3118 pshufd $0x55, %xmm4, %xmm6
3121 pshufd $0xaa, %xmm4, %xmm6
3124 pshufd $0xff, %xmm4, %xmm6
3128 movaps 4 * SIZE(AA), %xmm4
3129 pshufd $0x55, %xmm4, %xmm6
3131 pshufd $0xaa, %xmm4, %xmm6
3134 pshufd $0xff, %xmm4, %xmm6
3138 movaps 8 * SIZE(AA), %xmm4
3139 pshufd $0xaa, %xmm4, %xmm6
3141 pshufd $0xff, %xmm4, %xmm6
3145 movaps 12 * SIZE(AA), %xmm4
3146 pshufd $0xff, %xmm4, %xmm6
/* RN/RT: N=1, so B's triangle is a single broadcast scalar. */
3150 #if defined(RN) || defined(RT)
3151 movss 0 * SIZE(B), %xmm6
3152 pshufd $0x00, %xmm6, %xmm7
/* Write solved values back to the packed panels. */
3156 #if defined(LN) || defined(LT)
3157 movss %xmm1, 0 * SIZE(B)
3158 movss %xmm3, 1 * SIZE(B)
3159 movss %xmm5, 2 * SIZE(B)
3160 movss %xmm7, 3 * SIZE(B)
3162 pshufd $0x00, %xmm1, %xmm0
3163 movaps %xmm0, 0 * SIZE(BB)
3164 pshufd $0x00, %xmm3, %xmm0
3165 movaps %xmm0, 4 * SIZE(BB)
3167 pshufd $0x00, %xmm5, %xmm0
3168 movaps %xmm0, 8 * SIZE(BB)
3169 pshufd $0x00, %xmm7, %xmm0
3170 movaps %xmm0, 12 * SIZE(BB)
3172 movss %xmm0, 0 * SIZE(AA)
3173 movss %xmm1, 1 * SIZE(AA)
3174 movss %xmm2, 2 * SIZE(AA)
3175 movss %xmm3, 3 * SIZE(AA)
/* Store the 4x1 result column to C, then advance AA for the next tile. */
3182 #if defined(LN) || defined(LT)
3183 unpcklps %xmm5, %xmm1
3184 unpcklps %xmm7, %xmm3
3186 unpcklps %xmm3, %xmm1
3188 movlps %xmm1, 0 * SIZE(CO1)
3189 movhps %xmm1, 2 * SIZE(CO1)
3191 movlps %xmm0, 0 * SIZE(CO1)
3192 movhps %xmm0, 2 * SIZE(CO1)
3199 #if defined(LT) || defined(RN)
3202 leal (,%eax, SIZE), %eax
3203 leal (AA, %eax, 4), AA
/* M=2 / N=1 tile: pointer setup, pipelined movsd/movaps loop
   fragment, remainder, solve, and writeback.  Non-contiguous excerpt;
   accumulate instructions and loop labels are not visible. */
3221 sall $2 + BASE_SHIFT, %eax
3235 sall $1 + BASE_SHIFT, %eax
3239 #if defined(LN) || defined(RT)
3242 sall $1 + BASE_SHIFT, %eax
3243 leal (, %eax, SIZE), %eax
3244 leal (AA, %eax, 2), AA
3249 #if defined(LN) || defined(RT)
3251 sall $BASE_SHIFT, %eax
3252 leal (BB, %eax, 4), BB
/* Prime the pipeline: two A pairs, two broadcast B quads. */
3263 movsd 0 * SIZE(AA), %xmm0
3267 movsd 8 * SIZE(AA), %xmm1
3268 movaps 0 * SIZE(BB), %xmm2
3269 movaps 16 * SIZE(BB), %xmm3
3271 #if defined(LT) || defined(RN)
3283 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
3284 prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
3286 movsd 2 * SIZE(AA), %xmm0
3288 movaps 4 * SIZE(BB), %xmm2
3290 movsd 4 * SIZE(AA), %xmm0
3292 movaps 8 * SIZE(BB), %xmm2
3294 movsd 6 * SIZE(AA), %xmm0
3296 movaps 12 * SIZE(BB), %xmm2
3298 movsd 16 * SIZE(AA), %xmm0
3300 movaps 32 * SIZE(BB), %xmm2
3302 movsd 10 * SIZE(AA), %xmm1
3304 movaps 20 * SIZE(BB), %xmm3
3306 movsd 12 * SIZE(AA), %xmm1
3308 movaps 24 * SIZE(BB), %xmm3
3310 movsd 14 * SIZE(AA), %xmm1
3312 movaps 28 * SIZE(BB), %xmm3
3314 movsd 24 * SIZE(AA), %xmm1
3316 movaps 48 * SIZE(BB), %xmm3
/* k & 7 remainder, then rebase: AA by 2 per k, B by 1, BB by 4. */
3325 #if defined(LT) || defined(RN)
3331 andl $7, %eax # if (k & 1)
3339 movsd 2 * SIZE(AA), %xmm0
3340 movaps 4 * SIZE(BB), %xmm2
3353 #if defined(LN) || defined(RT)
3365 sall $ BASE_SHIFT, %eax
3366 leal (AA, %eax, 2), AA
3367 leal (B, %eax, 1), B
3368 leal (BB, %eax, 4), BB
/* Solve the 2x1 tile: 2x2 A triangle (LN backward / LT forward) or
   the single B diagonal entry (RN/RT). */
3371 #if defined(LN) || defined(LT)
3372 pshufd $1, %xmm4, %xmm6
3374 movss 0 * SIZE(B), %xmm1
3375 movss 1 * SIZE(B), %xmm3
3383 movsd 0 * SIZE(AA), %xmm0
3389 movaps 0 * SIZE(AA), %xmm4
3390 pshufd $0xff, %xmm4, %xmm6
3392 pshufd $0xaa, %xmm4, %xmm6
3396 pshufd $0x00, %xmm4, %xmm6
3401 movaps 0 * SIZE(AA), %xmm4
3402 pshufd $0x00, %xmm4, %xmm6
3404 pshufd $0x55, %xmm4, %xmm6
3408 pshufd $0xff, %xmm4, %xmm6
3412 #if defined(RN) || defined(RT)
3413 movss 0 * SIZE(B), %xmm6
3414 pshufd $0x00, %xmm6, %xmm7
/* Write back to B/BB (or AA) and store the two results to C. */
3418 #if defined(LN) || defined(LT)
3419 movss %xmm1, 0 * SIZE(B)
3420 movss %xmm3, 1 * SIZE(B)
3422 pshufd $0x00, %xmm1, %xmm0
3423 movaps %xmm0, 0 * SIZE(BB)
3424 pshufd $0x00, %xmm3, %xmm0
3425 movaps %xmm0, 4 * SIZE(BB)
3427 movlps %xmm0, 0 * SIZE(AA)
3434 #if defined(LN) || defined(LT)
3435 movss %xmm1, 0 * SIZE(CO1)
3436 movss %xmm3, 1 * SIZE(CO1)
3438 movlps %xmm0, 0 * SIZE(CO1)
3445 #if defined(LT) || defined(RN)
3448 leal (,%eax, SIZE), %eax
3449 leal (AA, %eax, 2), AA
/* M=1 / N=1 tile: fully scalar kernel, trivial 1x1 solve (a single
   mulss by the precomputed inverse diagonal — presumably; the packed
   panels hold inverted diagonals in these kernels, TODO confirm),
   then final pointer bookkeeping and stack restore. */
3467 sall $1 + BASE_SHIFT, %eax
3478 sall $BASE_SHIFT, %eax
3482 #if defined(LN) || defined(RT)
3485 leal (AA, %eax, SIZE), AA
3490 #if defined(LN) || defined(RT)
3492 sall $BASE_SHIFT, %eax
3493 leal (BB, %eax, 4), BB
3501 movss 0 * SIZE(AA), %xmm0
3502 movss 4 * SIZE(AA), %xmm1
3503 movss 0 * SIZE(BB), %xmm2
3504 movss 16 * SIZE(BB), %xmm3
3506 #if defined(LT) || defined(RN)
3518 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
3519 prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
3521 movss 1 * SIZE(AA), %xmm0
3523 movss 32 * SIZE(BB), %xmm2
3524 mulss 4 * SIZE(BB), %xmm0
3526 movss 2 * SIZE(AA), %xmm0
3527 mulss 8 * SIZE(BB), %xmm0
3529 movss 3 * SIZE(AA), %xmm0
3530 mulss 12 * SIZE(BB), %xmm0
3532 movss 8 * SIZE(AA), %xmm0
3534 movss 5 * SIZE(AA), %xmm1
3536 movss 48 * SIZE(BB), %xmm3
3537 mulss 20 * SIZE(BB), %xmm1
3539 movss 6 * SIZE(AA), %xmm1
3540 mulss 24 * SIZE(BB), %xmm1
3542 movss 7 * SIZE(AA), %xmm1
3543 mulss 28 * SIZE(BB), %xmm1
3545 movss 12 * SIZE(AA), %xmm1
/* k & 7 remainder, then rebase: every pointer advances 1 per k. */
3554 #if defined(LT) || defined(RN)
3560 andl $7, %eax # if (k & 1)
3567 movss 1 * SIZE(AA), %xmm0
3569 movss 4 * SIZE(BB), %xmm2
3582 #if defined(LN) || defined(RT)
3590 sall $ BASE_SHIFT, %eax
3591 leal (AA, %eax, 1), AA
3592 leal (B, %eax, 1), B
3593 leal (BB, %eax, 4), BB
/* 1x1 solve: one multiply by the diagonal entry from A or B. */
3596 #if defined(LN) || defined(LT)
3597 movss 0 * SIZE(B), %xmm1
3600 movss 0 * SIZE(AA), %xmm0
3604 #if defined(LN) || defined(LT)
3605 mulss 0 * SIZE(AA), %xmm1
3608 #if defined(RN) || defined(RT)
3609 mulss 0 * SIZE(B), %xmm0
/* Write back to the packed panels and to C. */
3612 #if defined(LN) || defined(LT)
3613 movss %xmm1, 0 * SIZE(B)
3615 pshufd $0x00, %xmm1, %xmm0
3616 movaps %xmm0, 0 * SIZE(BB)
3618 movss %xmm0, 0 * SIZE(AA)
3625 #if defined(LN) || defined(LT)
3626 movss %xmm1, 0 * SIZE(CO1)
3628 movss %xmm0, 0 * SIZE(CO1)
/* Final bookkeeping and epilogue fragment: restore the caller's
   stack pointer saved at entry (prologue is outside this excerpt). */
3635 #if defined(LT) || defined(RN)
3638 leal (AA, %eax, SIZE), AA
3656 sall $BASE_SHIFT, %eax
3664 leal (B, %eax, SIZE), B
3667 #if defined(LT) || defined(RN)
3670 leal (B, %eax, SIZE), B
3683 movl OLD_STACK, %esp