/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT           */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,           */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
/* DISCLAIMED.  IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT         */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,        */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE         */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR              */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF        */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT         */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT        */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE               */
/* POSSIBILITY OF SUCH DAMAGE.                                       */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
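
/* Excerpt of a GotoBLAS/OpenBLAS-style x86 (32-bit) SSE2 kernel.  The */
/* LN/LT/RN/RT conditionals, the OFFSET argument, and the             */
/* back-substitution sequences below suggest a double-precision TRSM  */
/* kernel with 2x4 register blocking (accumulators %xmm4-%xmm7).      */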
#define OLD_M		 4 + STACK + ARGS(%esi)
#define OLD_N		 8 + STACK + ARGS(%esi)
#define OLD_K		12 + STACK + ARGS(%esi)
#define OLD_ALPHA	16 + STACK + ARGS(%esi)
#define OLD_A		24 + STACK + ARGS(%esi)
#define OLD_B		28 + STACK + ARGS(%esi)
#define OLD_C		32 + STACK + ARGS(%esi)
#define OLD_LDC		36 + STACK + ARGS(%esi)
#define OLD_OFFT	40 + STACK + ARGS(%esi)
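
/* The OLD_* arguments are addressed through %esi, which keeps the    */
/* caller's stack pointer ("movl %esp, %esi" in the prologue below),  */
/* so they stay reachable after %esp is realigned.                    */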
#define OLD_STACK	 40(%esp)
#define OFFSET		 44(%esp)
#define AORIG		 56(%esp)
#define BORIG		 60(%esp)
#define BUFFER		128(%esp)

#define STACK_ALIGN	4096
#define STACK_OFFSET	1024

#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCH	prefetch
#define PREFETCHSIZE	(8 * 10 + 4)
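
/* KERNEL1..KERNEL8 unroll the inner k-loop eight times.  Each macro  */
/* multiplies one pair of A elements (%xmm0 or %xmm1) by four         */
/* duplicated pairs of B elements and accumulates into %xmm4-%xmm7,   */
/* interleaving the loads for the next step to hide latency.          */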
#define KERNEL1(address) \
	mulpd %xmm0, %xmm2; \
	addpd %xmm2, %xmm4; \
	PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \
	movapd 2 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulpd %xmm0, %xmm2; \
	addpd %xmm2, %xmm5; \
	movapd 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulpd %xmm0, %xmm2; \
	mulpd 6 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
	addpd %xmm2, %xmm6; \
	movapd 16 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	addpd %xmm0, %xmm7; \
	movapd 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0

#define KERNEL2(address) \
	mulpd %xmm0, %xmm3; \
	addpd %xmm3, %xmm4; \
	movapd 10 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulpd %xmm0, %xmm3; \
	addpd %xmm3, %xmm5; \
	movapd 12 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulpd %xmm0, %xmm3; \
	mulpd 14 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
	addpd %xmm3, %xmm6; \
	movapd 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	addpd %xmm0, %xmm7; \
	movapd 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0
#define KERNEL3(address) \
	mulpd %xmm0, %xmm2; \
	addpd %xmm2, %xmm4; \
	movapd 18 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulpd %xmm0, %xmm2; \
	addpd %xmm2, %xmm5; \
	movapd 20 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulpd %xmm0, %xmm2; \
	mulpd 22 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
	addpd %xmm2, %xmm6; \
	movapd 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	addpd %xmm0, %xmm7; \
	movapd 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0

#define KERNEL4(address) \
	mulpd %xmm0, %xmm3; \
	addpd %xmm3, %xmm4; \
	movapd 26 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulpd %xmm0, %xmm3; \
	addpd %xmm3, %xmm5; \
	movapd 28 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulpd %xmm0, %xmm3; \
	mulpd 30 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
	addpd %xmm3, %xmm6; \
	movapd 40 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	addpd %xmm0, %xmm7; \
	movapd 16 * SIZE + (address) * 1 * SIZE(AA), %xmm0

#define KERNEL5(address) \
	PREFETCH (PREFETCHSIZE + 8) * SIZE + (address) * 1 * SIZE(AA); \
	mulpd %xmm1, %xmm2; \
	addpd %xmm2, %xmm4; \
	movapd 34 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulpd %xmm1, %xmm2; \
	addpd %xmm2, %xmm5; \
	movapd 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulpd %xmm1, %xmm2; \
	mulpd 38 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
	addpd %xmm2, %xmm6; \
	movapd 48 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	addpd %xmm1, %xmm7; \
	movapd 10 * SIZE + (address) * 1 * SIZE(AA), %xmm1

#define KERNEL6(address) \
	mulpd %xmm1, %xmm3; \
	addpd %xmm3, %xmm4; \
	movapd 42 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulpd %xmm1, %xmm3; \
	addpd %xmm3, %xmm5; \
	movapd 44 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulpd %xmm1, %xmm3; \
	mulpd 46 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
	addpd %xmm3, %xmm6; \
	movapd 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	addpd %xmm1, %xmm7; \
	movapd 12 * SIZE + (address) * 1 * SIZE(AA), %xmm1

#define KERNEL7(address) \
	mulpd %xmm1, %xmm2; \
	addpd %xmm2, %xmm4; \
	movapd 50 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulpd %xmm1, %xmm2; \
	addpd %xmm2, %xmm5; \
	movapd 52 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulpd %xmm1, %xmm2; \
	mulpd 54 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
	addpd %xmm2, %xmm6; \
	movapd 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	addpd %xmm1, %xmm7; \
	movapd 14 * SIZE + (address) * 1 * SIZE(AA), %xmm1

#define KERNEL8(address) \
	mulpd %xmm1, %xmm3; \
	addpd %xmm3, %xmm4; \
	movapd 58 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulpd %xmm1, %xmm3; \
	addpd %xmm3, %xmm5; \
	movapd 60 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulpd %xmm1, %xmm3; \
	mulpd 62 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
	addpd %xmm3, %xmm6; \
	movapd 72 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	addpd %xmm1, %xmm7; \
	movapd 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1
	movl %esp, %esi		# save old stack

	subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp
	andl $-STACK_ALIGN, %esp
	addl $STACK_OFFSET, %esp
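
/* Reserve the local frame, round %esp down to a STACK_ALIGN (4KB)    */
/* boundary, then nudge it up by STACK_OFFSET, presumably so the      */
/* local buffer does not alias other page-aligned streams in cache.   */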
	leal (, LDC, SIZE), LDC

	leal (, %eax, SIZE), %eax

	leal (, %eax, SIZE), %eax

	sall $2 + BASE_SHIFT, %eax

#if defined(LN) || defined(RT)

	leal (, %eax, SIZE), %eax

	leal (BB, %eax, 8), BB

#if defined(LT) || defined(RN)
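
/* Pack B into BB, storing every element twice in adjacent slots so   */
/* the kernel can load a duplicated pair with one aligned movapd (an  */
/* SSE2-only substitute for SSE3's movddup).                          */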
#define COPYPREFETCH 40

	prefetchnta (COPYPREFETCH) * SIZE(B)

	movq 0 * SIZE(B), %mm0
	movq 1 * SIZE(B), %mm1
	movq 2 * SIZE(B), %mm2
	movq 3 * SIZE(B), %mm3
	movq 4 * SIZE(B), %mm4
	movq 5 * SIZE(B), %mm5
	movq 6 * SIZE(B), %mm6
	movq 7 * SIZE(B), %mm7

	movq %mm0, 0 * SIZE(BB)
	movq %mm0, 1 * SIZE(BB)
	movq %mm1, 2 * SIZE(BB)
	movq %mm1, 3 * SIZE(BB)
	movq %mm2, 4 * SIZE(BB)
	movq %mm2, 5 * SIZE(BB)
	movq %mm3, 6 * SIZE(BB)
	movq %mm3, 7 * SIZE(BB)

	movq %mm4, 8 * SIZE(BB)
	movq %mm4, 9 * SIZE(BB)
	movq %mm5, 10 * SIZE(BB)
	movq %mm5, 11 * SIZE(BB)
	movq %mm6, 12 * SIZE(BB)
	movq %mm6, 13 * SIZE(BB)
	movq %mm7, 14 * SIZE(BB)
	movq %mm7, 15 * SIZE(BB)

#if defined(LT) || defined(RN)

	movq 0 * SIZE(B), %mm0
	movq 1 * SIZE(B), %mm1
	movq 2 * SIZE(B), %mm2
	movq 3 * SIZE(B), %mm3

	movq %mm0, 0 * SIZE(BB)
	movq %mm0, 1 * SIZE(BB)
	movq %mm1, 2 * SIZE(BB)
	movq %mm1, 3 * SIZE(BB)
	movq %mm2, 4 * SIZE(BB)
	movq %mm2, 5 * SIZE(BB)
	movq %mm3, 6 * SIZE(BB)
	movq %mm3, 7 * SIZE(BB)

#if defined(LT) || defined(RN)

	leal (, LDC, 4), %eax

	sarl $1, %ebx		# i = (m >> 1)

	sall $1 + BASE_SHIFT, %eax

#if defined(LN) || defined(RT)

	leal (, %eax, SIZE), %eax
	leal (AA, %eax, 2), AA

#if defined(LN) || defined(RT)

	sall $3 + BASE_SHIFT, %eax

	movapd 0 * SIZE(AA), %xmm0
	movapd 8 * SIZE(AA), %xmm1
	movapd 0 * SIZE(BB), %xmm2
	movapd 8 * SIZE(BB), %xmm3

	leal (LDC, LDC, 2), %eax

	prefetchw -2 * SIZE(CO1)
	prefetchw -2 * SIZE(CO1, LDC)
	prefetchw -2 * SIZE(CO1, LDC, 2)
	prefetchw -2 * SIZE(CO1, %eax)

	prefetchw 1 * SIZE(CO1)
	prefetchw 1 * SIZE(CO1, LDC)
	prefetchw 1 * SIZE(CO1, LDC, 2)
	prefetchw 1 * SIZE(CO1, %eax)
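
/* Prefetch the C tile for writing; the negative- and positive-offset */
/* variants presumably belong to the elided backward (LN/RT) and      */
/* forward (LT/RN) addressing branches.                               */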
#if defined(LT) || defined(RN)

	addl $128 * 4 * SIZE, BB
	addl $128 * 1 * SIZE, AA

	leal (AA, %eax, 1), AA
	leal (BB, %eax, 4), BB

#if defined(LT) || defined(RN)

	andl $7, %eax		# if (k & 7)

	movapd 2 * SIZE(BB), %xmm2
	movapd 4 * SIZE(BB), %xmm2
	mulpd 6 * SIZE(BB), %xmm0
	movapd 8 * SIZE(BB), %xmm2
	movapd 2 * SIZE(AA), %xmm0

#if defined(LN) || defined(RT)

	leal (, %eax, SIZE), %eax
	leal (AA, %eax, 2), AA
	leal (BB, %eax, 8), BB

#if defined(LN) || defined(LT)

	unpcklpd %xmm5, %xmm4
	unpckhpd %xmm5, %xmm0

	unpcklpd %xmm7, %xmm6
	unpckhpd %xmm7, %xmm1
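
/* unpcklpd/unpckhpd recombine the accumulator halves so each         */
/* register holds consecutive elements of one row of the 2x4 result   */
/* (a 2x2 in-register transpose).                                     */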
	movapd 0 * SIZE(B), %xmm2
	movapd 2 * SIZE(B), %xmm5
	movapd 4 * SIZE(B), %xmm3
	movapd 6 * SIZE(B), %xmm7

	movapd 0 * SIZE(AA), %xmm0
	movapd 2 * SIZE(AA), %xmm1
	movapd 4 * SIZE(AA), %xmm2
	movapd 6 * SIZE(AA), %xmm3
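
/* In the solve below, a "movlpd x; movhpd x" pair from the same      */
/* address broadcasts one double into both halves of an %xmm          */
/* register, the SSE2 idiom for splatting a factor across two lanes.  */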
	movlpd 3 * SIZE(AA), %xmm4
	movhpd 3 * SIZE(AA), %xmm4

	movlpd 2 * SIZE(AA), %xmm4
	movhpd 2 * SIZE(AA), %xmm4

	movlpd 0 * SIZE(AA), %xmm4
	movhpd 0 * SIZE(AA), %xmm4

	movlpd 0 * SIZE(AA), %xmm4
	movhpd 0 * SIZE(AA), %xmm4

	movlpd 1 * SIZE(AA), %xmm4
	movhpd 1 * SIZE(AA), %xmm4

	movlpd 3 * SIZE(AA), %xmm4
	movhpd 3 * SIZE(AA), %xmm4

	movlpd 0 * SIZE(B), %xmm4
	movhpd 0 * SIZE(B), %xmm4

	movlpd 1 * SIZE(B), %xmm4
	movhpd 1 * SIZE(B), %xmm4

	movlpd 2 * SIZE(B), %xmm4
	movhpd 2 * SIZE(B), %xmm4

	movlpd 3 * SIZE(B), %xmm4
	movhpd 3 * SIZE(B), %xmm4

	movlpd 5 * SIZE(B), %xmm4
	movhpd 5 * SIZE(B), %xmm4

	movlpd 6 * SIZE(B), %xmm4
	movhpd 6 * SIZE(B), %xmm4

	movlpd 7 * SIZE(B), %xmm4
	movhpd 7 * SIZE(B), %xmm4

	movlpd 10 * SIZE(B), %xmm4
	movhpd 10 * SIZE(B), %xmm4

	movlpd 11 * SIZE(B), %xmm4
	movhpd 11 * SIZE(B), %xmm4

	movlpd 15 * SIZE(B), %xmm4
	movhpd 15 * SIZE(B), %xmm4

	movlpd 15 * SIZE(B), %xmm4
	movhpd 15 * SIZE(B), %xmm4

	movlpd 14 * SIZE(B), %xmm4
	movhpd 14 * SIZE(B), %xmm4

	movlpd 13 * SIZE(B), %xmm4
	movhpd 13 * SIZE(B), %xmm4

	movlpd 12 * SIZE(B), %xmm4
	movhpd 12 * SIZE(B), %xmm4

	movlpd 10 * SIZE(B), %xmm4
	movhpd 10 * SIZE(B), %xmm4

	movlpd 9 * SIZE(B), %xmm4
	movhpd 9 * SIZE(B), %xmm4

	movlpd 8 * SIZE(B), %xmm4
	movhpd 8 * SIZE(B), %xmm4

	movlpd 5 * SIZE(B), %xmm4
	movhpd 5 * SIZE(B), %xmm4

	movlpd 4 * SIZE(B), %xmm4
	movhpd 4 * SIZE(B), %xmm4

	movlpd 0 * SIZE(B), %xmm4
	movhpd 0 * SIZE(B), %xmm4
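
/* Write the solved block back to the packed buffers (B, with a       */
/* duplicated copy in BB, or AA in the other branch) so subsequent    */
/* iterations see the updated values, before storing to C below.      */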
#if defined(LN) || defined(LT)

	movapd %xmm2, 0 * SIZE(B)
	movapd %xmm5, 2 * SIZE(B)
	movapd %xmm3, 4 * SIZE(B)
	movapd %xmm7, 6 * SIZE(B)

	movlpd %xmm2, 0 * SIZE(BB)
	movlpd %xmm2, 1 * SIZE(BB)
	movhpd %xmm2, 2 * SIZE(BB)
	movhpd %xmm2, 3 * SIZE(BB)
	movlpd %xmm5, 4 * SIZE(BB)
	movlpd %xmm5, 5 * SIZE(BB)
	movhpd %xmm5, 6 * SIZE(BB)
	movhpd %xmm5, 7 * SIZE(BB)
	movlpd %xmm3, 8 * SIZE(BB)
	movlpd %xmm3, 9 * SIZE(BB)
	movhpd %xmm3, 10 * SIZE(BB)
	movhpd %xmm3, 11 * SIZE(BB)
	movlpd %xmm7, 12 * SIZE(BB)
	movlpd %xmm7, 13 * SIZE(BB)
	movhpd %xmm7, 14 * SIZE(BB)
	movhpd %xmm7, 15 * SIZE(BB)

	movapd %xmm0, 0 * SIZE(AA)
	movapd %xmm1, 2 * SIZE(AA)
	movapd %xmm2, 4 * SIZE(AA)
	movapd %xmm3, 6 * SIZE(AA)

	leal (LDC, LDC, 2), %eax
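
/* Store the 2x4 tile to C: columns at CO1, CO1+LDC, CO1+2*LDC and    */
/* CO1+3*LDC (%eax holds 3*LDC), with movlpd/movhpd splitting each    */
/* register into its two rows.                                        */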
#if defined(LN) || defined(LT)
	movlpd %xmm2, 0 * SIZE(CO1)
	movlpd %xmm3, 1 * SIZE(CO1)
	movhpd %xmm2, 0 * SIZE(CO1, LDC, 1)
	movhpd %xmm3, 1 * SIZE(CO1, LDC, 1)
	movlpd %xmm5, 0 * SIZE(CO1, LDC, 2)
	movlpd %xmm7, 1 * SIZE(CO1, LDC, 2)
	movhpd %xmm5, 0 * SIZE(CO1, %eax, 1)
	movhpd %xmm7, 1 * SIZE(CO1, %eax, 1)

	movlpd %xmm0, 0 * SIZE(CO1)
	movhpd %xmm0, 1 * SIZE(CO1)
	movlpd %xmm1, 0 * SIZE(CO1, LDC, 1)
	movhpd %xmm1, 1 * SIZE(CO1, LDC, 1)
	movlpd %xmm2, 0 * SIZE(CO1, LDC, 2)
	movhpd %xmm2, 1 * SIZE(CO1, LDC, 2)
	movlpd %xmm3, 0 * SIZE(CO1, %eax, 1)
	movhpd %xmm3, 1 * SIZE(CO1, %eax, 1)

#if defined(LT) || defined(RN)

	leal (, %eax, SIZE), %eax
	leal (AA, %eax, 2), AA

	sall $1 + BASE_SHIFT, %eax

	testl $1, %ebx		# if (m & 1)

	sall $BASE_SHIFT, %eax

#if defined(LN) || defined(RT)

	leal (AA, %eax, SIZE), AA

#if defined(LN) || defined(RT)

	sall $3 + BASE_SHIFT, %eax
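
/* m & 1 tail: the same computation on a single row of A, using       */
/* scalar movlpd/mulsd in place of the packed movapd/mulpd pairs.     */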
	movlpd 0 * SIZE(AA), %xmm0
	movlpd 4 * SIZE(AA), %xmm1
	movlpd 0 * SIZE(BB), %xmm2
	movlpd 8 * SIZE(BB), %xmm3

#if defined(LT) || defined(RN)

#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)

	movlpd 2 * SIZE(BB), %xmm2
	movlpd 4 * SIZE(BB), %xmm2
	mulsd 6 * SIZE(BB), %xmm0
	movlpd 16 * SIZE(BB), %xmm2
	movlpd 1 * SIZE(AA), %xmm0

	movlpd 10 * SIZE(BB), %xmm3
	movlpd 12 * SIZE(BB), %xmm3
	mulsd 14 * SIZE(BB), %xmm0
	movlpd 24 * SIZE(BB), %xmm3
	movlpd 2 * SIZE(AA), %xmm0

	movlpd 18 * SIZE(BB), %xmm2
	movlpd 20 * SIZE(BB), %xmm2
	mulsd 22 * SIZE(BB), %xmm0
	movlpd 32 * SIZE(BB), %xmm2
	movlpd 3 * SIZE(AA), %xmm0

	movlpd 26 * SIZE(BB), %xmm3
	movlpd 28 * SIZE(BB), %xmm3
	mulsd 30 * SIZE(BB), %xmm0
	movlpd 40 * SIZE(BB), %xmm3
	movlpd 8 * SIZE(AA), %xmm0

#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
	PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)

	movlpd 34 * SIZE(BB), %xmm2
	movlpd 36 * SIZE(BB), %xmm2
	mulsd 38 * SIZE(BB), %xmm1
	movlpd 48 * SIZE(BB), %xmm2
	movlpd 5 * SIZE(AA), %xmm1

	movlpd 42 * SIZE(BB), %xmm3
	movlpd 44 * SIZE(BB), %xmm3
	mulsd 46 * SIZE(BB), %xmm1
	movlpd 56 * SIZE(BB), %xmm3
	movlpd 6 * SIZE(AA), %xmm1

	movlpd 50 * SIZE(BB), %xmm2
	movlpd 52 * SIZE(BB), %xmm2
	mulsd 54 * SIZE(BB), %xmm1
	movlpd 64 * SIZE(BB), %xmm2
	movlpd 7 * SIZE(AA), %xmm1

	movlpd 58 * SIZE(BB), %xmm3
	movlpd 60 * SIZE(BB), %xmm3
	mulsd 62 * SIZE(BB), %xmm1
	movlpd 72 * SIZE(BB), %xmm3
	movlpd 12 * SIZE(AA), %xmm1

#if defined(LT) || defined(RN)

	andl $7, %eax		# if (k & 7)

	movlpd 2 * SIZE(BB), %xmm2
	movlpd 4 * SIZE(BB), %xmm2
	mulsd 6 * SIZE(BB), %xmm0
	movlpd 8 * SIZE(BB), %xmm2
	movlpd 1 * SIZE(AA), %xmm0
#if defined(LN) || defined(RT)

	leal (, %eax, SIZE), %eax

	leal (B, %eax, 4), B
	leal (BB, %eax, 8), BB

#if defined(LN) || defined(LT)
	unpcklpd %xmm5, %xmm4
	unpcklpd %xmm7, %xmm6

	movapd 0 * SIZE(B), %xmm2
	movapd 2 * SIZE(B), %xmm5

	movlpd 0 * SIZE(AA), %xmm0
	movlpd 1 * SIZE(AA), %xmm1
	movlpd 2 * SIZE(AA), %xmm2
	movlpd 3 * SIZE(AA), %xmm3

	movlpd 0 * SIZE(AA), %xmm4
	movhpd 0 * SIZE(AA), %xmm4

	movlpd 0 * SIZE(AA), %xmm4
	movhpd 0 * SIZE(AA), %xmm4

	movlpd 0 * SIZE(B), %xmm4
	movlpd 1 * SIZE(B), %xmm4
	movlpd 2 * SIZE(B), %xmm4
	movlpd 3 * SIZE(B), %xmm4

	movlpd 5 * SIZE(B), %xmm4
	movlpd 6 * SIZE(B), %xmm4
	movlpd 7 * SIZE(B), %xmm4

	movlpd 10 * SIZE(B), %xmm4
	movlpd 11 * SIZE(B), %xmm4

	movlpd 15 * SIZE(B), %xmm4

	movlpd 15 * SIZE(B), %xmm4
	movlpd 14 * SIZE(B), %xmm4
	movlpd 13 * SIZE(B), %xmm4
	movlpd 12 * SIZE(B), %xmm4

	movlpd 10 * SIZE(B), %xmm4
	movlpd 9 * SIZE(B), %xmm4
	movlpd 8 * SIZE(B), %xmm4

	movlpd 5 * SIZE(B), %xmm4
	movlpd 4 * SIZE(B), %xmm4

	movlpd 0 * SIZE(B), %xmm4

#if defined(LN) || defined(LT)
	movapd %xmm2, 0 * SIZE(B)
	movapd %xmm5, 2 * SIZE(B)

	movlpd %xmm2, 0 * SIZE(BB)
	movlpd %xmm2, 1 * SIZE(BB)
	movhpd %xmm2, 2 * SIZE(BB)
	movhpd %xmm2, 3 * SIZE(BB)
	movlpd %xmm5, 4 * SIZE(BB)
	movlpd %xmm5, 5 * SIZE(BB)
	movhpd %xmm5, 6 * SIZE(BB)
	movhpd %xmm5, 7 * SIZE(BB)

	movlpd %xmm0, 0 * SIZE(AA)
	movlpd %xmm1, 1 * SIZE(AA)
	movlpd %xmm2, 2 * SIZE(AA)
	movlpd %xmm3, 3 * SIZE(AA)

	leal (LDC, LDC, 2), %eax

#if defined(LN) || defined(LT)
	movlpd %xmm2, 0 * SIZE(CO1)
	movhpd %xmm2, 0 * SIZE(CO1, LDC, 1)
	movlpd %xmm5, 0 * SIZE(CO1, LDC, 2)
	movhpd %xmm5, 0 * SIZE(CO1, %eax, 1)

	movlpd %xmm0, 0 * SIZE(CO1)
	movlpd %xmm1, 0 * SIZE(CO1, LDC, 1)
	movlpd %xmm2, 0 * SIZE(CO1, LDC, 2)
	movlpd %xmm3, 0 * SIZE(CO1, %eax, 1)
#if defined(LT) || defined(RN)

	leal (AA, %eax, SIZE), AA

	sall $BASE_SHIFT, %eax

	leal (, %eax, SIZE), %eax
	leal (B, %eax, 4), B

#if defined(LT) || defined(RN)

	leal (, %eax, SIZE), %eax
	leal (B, %eax, 4), B

	sall $1 + BASE_SHIFT, %eax

#if defined(LN) || defined(RT)

	leal (, %eax, SIZE), %eax
	leal (B, %eax, 2), B
	leal (BB, %eax, 4), BB

#if defined(LT) || defined(RN)

#define COPYPREFETCH 40

	prefetchnta (COPYPREFETCH) * SIZE(B)

	movq 0 * SIZE(B), %mm0
	movq 1 * SIZE(B), %mm1
	movq 2 * SIZE(B), %mm2
	movq 3 * SIZE(B), %mm3
	movq 4 * SIZE(B), %mm4
	movq 5 * SIZE(B), %mm5
	movq 6 * SIZE(B), %mm6
	movq 7 * SIZE(B), %mm7

	movq %mm0, 0 * SIZE(BB)
	movq %mm0, 1 * SIZE(BB)
	movq %mm1, 2 * SIZE(BB)
	movq %mm1, 3 * SIZE(BB)
	movq %mm2, 4 * SIZE(BB)
	movq %mm2, 5 * SIZE(BB)
	movq %mm3, 6 * SIZE(BB)
	movq %mm3, 7 * SIZE(BB)

	movq %mm4, 8 * SIZE(BB)
	movq %mm4, 9 * SIZE(BB)
	movq %mm5, 10 * SIZE(BB)
	movq %mm5, 11 * SIZE(BB)
	movq %mm6, 12 * SIZE(BB)
	movq %mm6, 13 * SIZE(BB)
	movq %mm7, 14 * SIZE(BB)
	movq %mm7, 15 * SIZE(BB)

#if defined(LT) || defined(RN)

	movq 0 * SIZE(B), %mm0
	movq 1 * SIZE(B), %mm1

	movq %mm0, 0 * SIZE(BB)
	movq %mm0, 1 * SIZE(BB)
	movq %mm1, 2 * SIZE(BB)
	movq %mm1, 3 * SIZE(BB)
#if defined(LT) || defined(RN)

	leal (, LDC, 2), %eax

	sarl $1, %ebx		# i = (m >> 1)

	sall $1 + BASE_SHIFT, %eax

#if defined(LN) || defined(RT)

	leal (, %eax, SIZE), %eax
	leal (AA, %eax, 2), AA

#if defined(LN) || defined(RT)

	sall $2 + BASE_SHIFT, %eax

	movapd 0 * SIZE(AA), %xmm0
	movapd 8 * SIZE(AA), %xmm1
	movapd 0 * SIZE(BB), %xmm2
	movapd 8 * SIZE(BB), %xmm3
	prefetchw -2 * SIZE(CO1)
	prefetchw -2 * SIZE(CO1, LDC)

	prefetchw 1 * SIZE(CO1)
	prefetchw 1 * SIZE(CO1, LDC)

#if defined(LT) || defined(RN)

#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
	prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)

	mulpd 2 * SIZE(BB), %xmm0
	movapd 4 * SIZE(BB), %xmm2
	movapd 2 * SIZE(AA), %xmm0

	mulpd 6 * SIZE(BB), %xmm0
	movapd 16 * SIZE(BB), %xmm2
	movapd 4 * SIZE(AA), %xmm0

	mulpd 10 * SIZE(BB), %xmm0
	movapd 12 * SIZE(BB), %xmm3
	movapd 6 * SIZE(AA), %xmm0

	mulpd 14 * SIZE(BB), %xmm0
	movapd 24 * SIZE(BB), %xmm3
	movapd 16 * SIZE(AA), %xmm0

#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
	prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)

	mulpd 18 * SIZE(BB), %xmm1
	movapd 20 * SIZE(BB), %xmm2
	movapd 10 * SIZE(AA), %xmm1

	mulpd 22 * SIZE(BB), %xmm1
	movapd 32 * SIZE(BB), %xmm2
	movapd 12 * SIZE(AA), %xmm1

	mulpd 26 * SIZE(BB), %xmm1
	movapd 28 * SIZE(BB), %xmm3
	movapd 14 * SIZE(AA), %xmm1

	mulpd 30 * SIZE(BB), %xmm1
	movapd 40 * SIZE(BB), %xmm3
	movapd 24 * SIZE(AA), %xmm1

#if defined(LT) || defined(RN)
	andl $7, %eax		# if (k & 7)

	mulpd 2 * SIZE(BB), %xmm0
	movapd 4 * SIZE(BB), %xmm2
	movapd 2 * SIZE(AA), %xmm0
#if defined(LN) || defined(RT)

	leal (, %eax, SIZE), %eax
	leal (AA, %eax, 2), AA
	leal (B, %eax, 2), B
	leal (BB, %eax, 4), BB

#if defined(LN) || defined(LT)

	unpcklpd %xmm5, %xmm4
	unpckhpd %xmm5, %xmm0

	movapd 0 * SIZE(B), %xmm2
	movapd 2 * SIZE(B), %xmm3

	movapd 0 * SIZE(AA), %xmm0
	movapd 2 * SIZE(AA), %xmm1

	movlpd 3 * SIZE(AA), %xmm4
	movhpd 3 * SIZE(AA), %xmm4

	movlpd 2 * SIZE(AA), %xmm4
	movhpd 2 * SIZE(AA), %xmm4

	movlpd 0 * SIZE(AA), %xmm4
	movhpd 0 * SIZE(AA), %xmm4

	movlpd 0 * SIZE(AA), %xmm4
	movhpd 0 * SIZE(AA), %xmm4

	movlpd 1 * SIZE(AA), %xmm4
	movhpd 1 * SIZE(AA), %xmm4

	movlpd 3 * SIZE(AA), %xmm4
	movhpd 3 * SIZE(AA), %xmm4

	movlpd 0 * SIZE(B), %xmm4
	movhpd 0 * SIZE(B), %xmm4

	movlpd 1 * SIZE(B), %xmm4
	movhpd 1 * SIZE(B), %xmm4

	movlpd 3 * SIZE(B), %xmm4
	movhpd 3 * SIZE(B), %xmm4

	movlpd 3 * SIZE(B), %xmm4
	movhpd 3 * SIZE(B), %xmm4

	movlpd 2 * SIZE(B), %xmm4
	movhpd 2 * SIZE(B), %xmm4

	movlpd 0 * SIZE(B), %xmm4
	movhpd 0 * SIZE(B), %xmm4

#if defined(LN) || defined(LT)

	movapd %xmm2, 0 * SIZE(B)
	movapd %xmm3, 2 * SIZE(B)

	movlpd %xmm2, 0 * SIZE(BB)
	movlpd %xmm2, 1 * SIZE(BB)
	movhpd %xmm2, 2 * SIZE(BB)
	movhpd %xmm2, 3 * SIZE(BB)
	movlpd %xmm3, 4 * SIZE(BB)
	movlpd %xmm3, 5 * SIZE(BB)
	movhpd %xmm3, 6 * SIZE(BB)
	movhpd %xmm3, 7 * SIZE(BB)

	movapd %xmm0, 0 * SIZE(AA)
	movapd %xmm1, 2 * SIZE(AA)

#if defined(LN) || defined(LT)
	movlpd %xmm2, 0 * SIZE(CO1)
	movlpd %xmm3, 1 * SIZE(CO1)
	movhpd %xmm2, 0 * SIZE(CO1, LDC, 1)
	movhpd %xmm3, 1 * SIZE(CO1, LDC, 1)

	movlpd %xmm0, 0 * SIZE(CO1)
	movhpd %xmm0, 1 * SIZE(CO1)
	movlpd %xmm1, 0 * SIZE(CO1, LDC, 1)
	movhpd %xmm1, 1 * SIZE(CO1, LDC, 1)
#if defined(LT) || defined(RN)

	leal (, %eax, SIZE), %eax
	leal (AA, %eax, 2), AA

	sall $1 + BASE_SHIFT, %eax

	testl $1, %ebx		# if (m & 1)

	sall $BASE_SHIFT, %eax

#if defined(LN) || defined(RT)

	leal (AA, %eax, SIZE), AA

#if defined(LN) || defined(RT)

	sall $2 + BASE_SHIFT, %eax

	movlpd 0 * SIZE(AA), %xmm0
	movlpd 4 * SIZE(AA), %xmm1
	movlpd 0 * SIZE(BB), %xmm2
	movlpd 8 * SIZE(BB), %xmm3
#if defined(LT) || defined(RN)

	PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)

	mulsd 2 * SIZE(BB), %xmm0
	movlpd 4 * SIZE(BB), %xmm2
	movlpd 1 * SIZE(AA), %xmm0

	mulsd 6 * SIZE(BB), %xmm0
	movlpd 16 * SIZE(BB), %xmm2
	movlpd 2 * SIZE(AA), %xmm0

	mulsd 10 * SIZE(BB), %xmm0
	movlpd 12 * SIZE(BB), %xmm3
	movlpd 3 * SIZE(AA), %xmm0

	mulsd 14 * SIZE(BB), %xmm0
	movlpd 24 * SIZE(BB), %xmm3
	movlpd 8 * SIZE(AA), %xmm0

	mulsd 18 * SIZE(BB), %xmm1
	movlpd 20 * SIZE(BB), %xmm2
	movlpd 5 * SIZE(AA), %xmm1

	mulsd 22 * SIZE(BB), %xmm1
	movlpd 32 * SIZE(BB), %xmm2
	movlpd 6 * SIZE(AA), %xmm1

	mulsd 26 * SIZE(BB), %xmm1
	movlpd 28 * SIZE(BB), %xmm3
	movlpd 7 * SIZE(AA), %xmm1

	mulsd 30 * SIZE(BB), %xmm1
	movlpd 40 * SIZE(BB), %xmm3
	movlpd 12 * SIZE(AA), %xmm1
#if defined(LT) || defined(RN)

	andl $7, %eax		# if (k & 7)

	mulsd 2 * SIZE(BB), %xmm0
	movlpd 4 * SIZE(BB), %xmm2
	movlpd 1 * SIZE(AA), %xmm0
#if defined(LN) || defined(RT)

	leal (, %eax, SIZE), %eax

	leal (B, %eax, 2), B
	leal (BB, %eax, 4), BB

#if defined(LN) || defined(LT)
	unpcklpd %xmm5, %xmm4

	movapd 0 * SIZE(B), %xmm2

	movlpd 0 * SIZE(AA), %xmm0
	movlpd 1 * SIZE(AA), %xmm1

	movlpd 0 * SIZE(AA), %xmm4
	movhpd 0 * SIZE(AA), %xmm4

	movlpd 0 * SIZE(AA), %xmm4
	movhpd 0 * SIZE(AA), %xmm4

	movlpd 0 * SIZE(B), %xmm4
	movlpd 1 * SIZE(B), %xmm4

	movlpd 3 * SIZE(B), %xmm4

	movlpd 3 * SIZE(B), %xmm4
	movlpd 2 * SIZE(B), %xmm4

	movlpd 0 * SIZE(B), %xmm4

#if defined(LN) || defined(LT)
	movapd %xmm2, 0 * SIZE(B)

	movlpd %xmm2, 0 * SIZE(BB)
	movlpd %xmm2, 1 * SIZE(BB)
	movhpd %xmm2, 2 * SIZE(BB)
	movhpd %xmm2, 3 * SIZE(BB)

	movlpd %xmm0, 0 * SIZE(AA)
	movlpd %xmm1, 1 * SIZE(AA)

#if defined(LN) || defined(LT)
	movlpd %xmm2, 0 * SIZE(CO1)
	movhpd %xmm2, 0 * SIZE(CO1, LDC, 1)

	movlpd %xmm0, 0 * SIZE(CO1)
	movlpd %xmm1, 0 * SIZE(CO1, LDC, 1)

#if defined(LT) || defined(RN)

	leal (AA, %eax, SIZE), AA

	sall $BASE_SHIFT, %eax

	leal (, %eax, SIZE), %eax
	leal (B, %eax, 2), B

#if defined(LT) || defined(RN)

	leal (, %eax, SIZE), %eax
	leal (B, %eax, 2), B
	sall $BASE_SHIFT, %eax

#if defined(LN) || defined(RT)

	leal (, %eax, SIZE), %eax
	leal (B, %eax, 1), B
	leal (BB, %eax, 2), BB

#if defined(LT) || defined(RN)

#define COPYPREFETCH 40

	prefetchnta (COPYPREFETCH) * SIZE(B)

	movq 0 * SIZE(B), %mm0
	movq 1 * SIZE(B), %mm1
	movq 2 * SIZE(B), %mm2
	movq 3 * SIZE(B), %mm3
	movq 4 * SIZE(B), %mm4
	movq 5 * SIZE(B), %mm5
	movq 6 * SIZE(B), %mm6
	movq 7 * SIZE(B), %mm7

	movq %mm0, 0 * SIZE(BB)
	movq %mm0, 1 * SIZE(BB)
	movq %mm1, 2 * SIZE(BB)
	movq %mm1, 3 * SIZE(BB)
	movq %mm2, 4 * SIZE(BB)
	movq %mm2, 5 * SIZE(BB)
	movq %mm3, 6 * SIZE(BB)
	movq %mm3, 7 * SIZE(BB)

	movq %mm4, 8 * SIZE(BB)
	movq %mm4, 9 * SIZE(BB)
	movq %mm5, 10 * SIZE(BB)
	movq %mm5, 11 * SIZE(BB)
	movq %mm6, 12 * SIZE(BB)
	movq %mm6, 13 * SIZE(BB)
	movq %mm7, 14 * SIZE(BB)
	movq %mm7, 15 * SIZE(BB)
#if defined(LT) || defined(RN)

	movq 0 * SIZE(B), %mm0

	movq %mm0, 0 * SIZE(BB)
	movq %mm0, 1 * SIZE(BB)

#if defined(LT) || defined(RN)

	sarl $1, %ebx		# i = (m >> 1)

	sall $1 + BASE_SHIFT, %eax

#if defined(LN) || defined(RT)

	leal (, %eax, SIZE), %eax
	leal (AA, %eax, 2), AA

#if defined(LN) || defined(RT)

	sall $1 + BASE_SHIFT, %eax

	movapd 0 * SIZE(AA), %xmm0
	movapd 8 * SIZE(AA), %xmm1
	movapd 0 * SIZE(BB), %xmm2
	movapd 8 * SIZE(BB), %xmm3
	prefetchw -2 * SIZE(CO1)

	prefetchw 1 * SIZE(CO1)

#if defined(LT) || defined(RN)

	prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
	movapd 16 * SIZE(BB), %xmm2

	movapd 2 * SIZE(AA), %xmm0
	mulpd 2 * SIZE(BB), %xmm0

	movapd 4 * SIZE(AA), %xmm0
	mulpd 4 * SIZE(BB), %xmm0

	movapd 6 * SIZE(AA), %xmm0
	mulpd 6 * SIZE(BB), %xmm0

	movapd 16 * SIZE(AA), %xmm0
	prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)

	movapd 24 * SIZE(BB), %xmm3

	movapd 10 * SIZE(AA), %xmm1
	mulpd 10 * SIZE(BB), %xmm1

	movapd 12 * SIZE(AA), %xmm1
	mulpd 12 * SIZE(BB), %xmm1

	movapd 14 * SIZE(AA), %xmm1
	mulpd 14 * SIZE(BB), %xmm1

	movapd 24 * SIZE(AA), %xmm1

#if defined(LT) || defined(RN)
	andl $7, %eax		# if (k & 7)

	movapd 2 * SIZE(AA), %xmm0
	movapd 2 * SIZE(BB), %xmm2
#if defined(LN) || defined(RT)

	leal (, %eax, SIZE), %eax
	leal (AA, %eax, 2), AA
	leal (B, %eax, 1), B
	leal (BB, %eax, 2), BB

#if defined(LN) || defined(LT)

	movapd 0 * SIZE(B), %xmm2

	movapd 0 * SIZE(AA), %xmm0

	unpckhpd %xmm3, %xmm3

	movlpd 3 * SIZE(AA), %xmm4
	movlpd 2 * SIZE(AA), %xmm4
	movlpd 0 * SIZE(AA), %xmm4

	unpcklpd %xmm3, %xmm2

	unpckhpd %xmm3, %xmm3

	movlpd 0 * SIZE(AA), %xmm4
	movlpd 1 * SIZE(AA), %xmm4
	movlpd 3 * SIZE(AA), %xmm4

	unpcklpd %xmm3, %xmm2

	movlpd 0 * SIZE(B), %xmm4
	movhpd 0 * SIZE(B), %xmm4

	movlpd 0 * SIZE(B), %xmm4
	movhpd 0 * SIZE(B), %xmm4

#if defined(LN) || defined(LT)

	movapd %xmm2, 0 * SIZE(B)

	movlpd %xmm2, 0 * SIZE(BB)
	movlpd %xmm2, 1 * SIZE(BB)
	movhpd %xmm2, 2 * SIZE(BB)
	movhpd %xmm2, 3 * SIZE(BB)

	movapd %xmm0, 0 * SIZE(AA)

#if defined(LN) || defined(LT)
	movlpd %xmm2, 0 * SIZE(CO1)
	movhpd %xmm2, 1 * SIZE(CO1)

	movlpd %xmm0, 0 * SIZE(CO1)
	movhpd %xmm0, 1 * SIZE(CO1)

#if defined(LT) || defined(RN)

	leal (, %eax, SIZE), %eax
	leal (AA, %eax, 2), AA

	sall $1 + BASE_SHIFT, %eax
	testl $1, %ebx		# if (m & 1)
	sall $BASE_SHIFT, %eax

#if defined(LN) || defined(RT)

	leal (AA, %eax, SIZE), AA

#if defined(LN) || defined(RT)

	sall $1 + BASE_SHIFT, %eax

	movlpd 0 * SIZE(AA), %xmm0
	movlpd 4 * SIZE(AA), %xmm1
	movlpd 0 * SIZE(BB), %xmm2
	movlpd 8 * SIZE(BB), %xmm3

#if defined(LT) || defined(RN)

	prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)

	movlpd 1 * SIZE(AA), %xmm0
	mulsd 2 * SIZE(BB), %xmm0

	movlpd 16 * SIZE(BB), %xmm2

	movlpd 2 * SIZE(AA), %xmm0
	mulsd 4 * SIZE(BB), %xmm0

	movlpd 3 * SIZE(AA), %xmm0
	mulsd 6 * SIZE(BB), %xmm0

	movlpd 8 * SIZE(AA), %xmm0

	movlpd 5 * SIZE(AA), %xmm1
	mulsd 10 * SIZE(BB), %xmm1

	movlpd 24 * SIZE(BB), %xmm3

	movlpd 6 * SIZE(AA), %xmm1
	mulsd 12 * SIZE(BB), %xmm1

	movlpd 7 * SIZE(AA), %xmm1
	mulsd 14 * SIZE(BB), %xmm1

	movlpd 12 * SIZE(AA), %xmm1

#if defined(LT) || defined(RN)
	andl $7, %eax		# if (k & 7)
	movlpd 2 * SIZE(BB), %xmm2
	movlpd 1 * SIZE(AA), %xmm0

#if defined(LN) || defined(RT)

	leal (, %eax, SIZE), %eax

	leal (BB, %eax, 2), BB

#if defined(LN) || defined(LT)

	movlpd 0 * SIZE(B), %xmm2

	movlpd 0 * SIZE(AA), %xmm0

	movlpd 0 * SIZE(AA), %xmm4

	movlpd 0 * SIZE(AA), %xmm4

	movlpd 0 * SIZE(B), %xmm4

	movlpd 0 * SIZE(B), %xmm4

#if defined(LN) || defined(LT)

	movlpd %xmm2, 0 * SIZE(B)

	movlpd %xmm2, 0 * SIZE(BB)
	movlpd %xmm2, 1 * SIZE(BB)

	movlpd %xmm0, 0 * SIZE(AA)

#if defined(LN) || defined(LT)
	movlpd %xmm2, 0 * SIZE(CO1)

	movlpd %xmm0, 0 * SIZE(CO1)

#if defined(LT) || defined(RN)

	leal (AA, %eax, SIZE), AA

	sall $BASE_SHIFT, %eax

	leal (B, %eax, SIZE), B

#if defined(LT) || defined(RN)

	leal (B, %eax, SIZE), B
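
/* Epilogue: restore the caller's stack pointer saved at entry. */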
	movl OLD_STACK, %esp