/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT           */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,           */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
/* DISCLAIMED.  IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT         */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,        */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE         */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR              */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF        */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT         */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT        */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE               */
/* POSSIBILITY OF SUCH DAMAGE.                                       */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/
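
/* Single-precision complex TRSM kernel for 32-bit x86 using SSE.
   The LN/LT/RN/RT conditionals select the side (Left/Right) and
   transpose (No/Trans) variant of the triangular solve, and
   ZBASE_SHIFT scales byte offsets by the size of one complex
   element.  (Summary inferred from the code; the names follow the
   usual GotoBLAS/OpenBLAS kernel conventions.) */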
#define STACK_M		 4 + STACK + ARGS(%esi)
#define STACK_N		 8 + STACK + ARGS(%esi)
#define STACK_K		12 + STACK + ARGS(%esi)
#define STACK_A		24 + STACK + ARGS(%esi)
#define STACK_B		28 + STACK + ARGS(%esi)
#define STACK_C		32 + STACK + ARGS(%esi)
#define STACK_LDC	36 + STACK + ARGS(%esi)
#define STACK_OFFT	40 + STACK + ARGS(%esi)
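
/* The kernel arguments are addressed relative to %esi, which the
   prologue loads with the caller's stack pointer before %esp is
   realigned (see "movl %esp, %esi" below). */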
#define POSINV		  0(%esp)
#define OLD_STACK	 40(%esp)
#define OFFSET		 48(%esp)
#define AORIG		 60(%esp)
#define BORIG		 64(%esp)
#define BUFFER		128(%esp)
#define STACK_ALIGN	4096
#define STACK_OFFSET	1024
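
/* The locals above live in a scratch area reserved by the prologue;
   STACK_ALIGN/STACK_OFFSET realign %esp to a page boundary and then
   bias it, presumably to keep the local A and B work buffers from
   mapping onto the same cache sets. */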
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
#define PREFETCHSIZE (16 * 10 + 8)
#define WPREFETCHSIZE 112
#define PREFETCH     prefetch
#define PREFETCHW    prefetchw
#endif
#if defined(PENTIUM4) || defined(PENTIUMM)
#define PREFETCH     prefetcht1
#define PREFETCHSIZE 168
#define PREFETCHW    prefetcht0
#endif
#if defined(PENRYN) || defined(DUNNINGTON)
#define PREFETCH     prefetcht1
#define PREFETCHSIZE 168
#define PREFETCHW    prefetcht0
#endif
#if defined(OPTERON) || !defined(HAVE_SSE2)
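
/* KERNEL1..KERNEL8 below are the 8x-unrolled inner-loop steps.  Each
   macro performs four 4-wide multiply/accumulate pairs against the
   splatted panel in BB (accumulating into %xmm4..%xmm7) while
   streaming the next A values into %xmm0/%xmm1 and rotating the BB
   operands through %xmm2/%xmm3. */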
#define KERNEL1(address) \
	mulps	%xmm0, %xmm2; \
	PREFETCH (PREFETCHSIZE +  0) * SIZE + (address) * 1 * SIZE(AA); \
	addps	%xmm2, %xmm4; \
	movaps	 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulps	%xmm0, %xmm2; \
	addps	%xmm2, %xmm5; \
	movaps	 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulps	%xmm0, %xmm2; \
	mulps	12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
	addps	%xmm2, %xmm6; \
	movaps	32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	addps	%xmm0, %xmm7; \
	movaps	 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0

#define KERNEL2(address) \
	mulps	%xmm0, %xmm3; \
	addps	%xmm3, %xmm4; \
	movaps	20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulps	%xmm0, %xmm3; \
	addps	%xmm3, %xmm5; \
	movaps	24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulps	%xmm0, %xmm3; \
	mulps	28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
	addps	%xmm3, %xmm6; \
	movaps	48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	addps	%xmm0, %xmm7; \
	movaps	 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0

#define KERNEL3(address) \
	mulps	%xmm0, %xmm2; \
	addps	%xmm2, %xmm4; \
	movaps	36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulps	%xmm0, %xmm2; \
	addps	%xmm2, %xmm5; \
	movaps	40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulps	%xmm0, %xmm2; \
	mulps	44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
	addps	%xmm2, %xmm6; \
	movaps	64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	addps	%xmm0, %xmm7; \
	movaps	12 * SIZE + (address) * 1 * SIZE(AA), %xmm0

#define KERNEL4(address) \
	mulps	%xmm0, %xmm3; \
	addps	%xmm3, %xmm4; \
	movaps	52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulps	%xmm0, %xmm3; \
	addps	%xmm3, %xmm5; \
	movaps	56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulps	%xmm0, %xmm3; \
	mulps	60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
	addps	%xmm3, %xmm6; \
	movaps	80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	addps	%xmm0, %xmm7; \
	movaps	32 * SIZE + (address) * 1 * SIZE(AA), %xmm0

#define KERNEL5(address) \
	mulps	%xmm1, %xmm2; \
	addps	%xmm2, %xmm4; \
	movaps	68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulps	%xmm1, %xmm2; \
	addps	%xmm2, %xmm5; \
	movaps	72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulps	%xmm1, %xmm2; \
	mulps	76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
	addps	%xmm2, %xmm6; \
	movaps	96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	addps	%xmm1, %xmm7; \
	movaps	20 * SIZE + (address) * 1 * SIZE(AA), %xmm1

#define KERNEL6(address) \
	mulps	%xmm1, %xmm3; \
	addps	%xmm3, %xmm4; \
	movaps	84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulps	%xmm1, %xmm3; \
	addps	%xmm3, %xmm5; \
	movaps	88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulps	%xmm1, %xmm3; \
	mulps	92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
	addps	%xmm3, %xmm6; \
	movaps	112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	addps	%xmm1, %xmm7; \
	movaps	24 * SIZE + (address) * 1 * SIZE(AA), %xmm1

#define KERNEL7(address) \
	mulps	%xmm1, %xmm2; \
	addps	%xmm2, %xmm4; \
	movaps	100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulps	%xmm1, %xmm2; \
	addps	%xmm2, %xmm5; \
	movaps	104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	mulps	%xmm1, %xmm2; \
	mulps	108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
	addps	%xmm2, %xmm6; \
	movaps	128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
	addps	%xmm1, %xmm7; \
	movaps	28 * SIZE + (address) * 1 * SIZE(AA), %xmm1

#define KERNEL8(address) \
	mulps	%xmm1, %xmm3; \
	addps	%xmm3, %xmm4; \
	movaps	116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulps	%xmm1, %xmm3; \
	addps	%xmm3, %xmm5; \
	movaps	120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	mulps	%xmm1, %xmm3; \
	mulps	124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
	addps	%xmm3, %xmm6; \
	movaps	144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
	addps	%xmm1, %xmm7; \
	movaps	48 * SIZE + (address) * 1 * SIZE(AA), %xmm1
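
/* A rough C sketch of the arithmetic one full KERNEL1..KERNEL8 pass
   performs, assuming the layout implied by the addressing above
   (8 k-steps per pass; BB advances 128 floats and AA 32 floats per
   pass; every operation below is a 4-float SSE vector operation):

       for (int k = 0; k < 8; k++)              // KERNEL1 .. KERNEL8
           for (int j = 0; j < 4; j++)          // acc = xmm4 .. xmm7
               acc[j] += a[k] * bb[4 * k + j];  // mulps + addps

   The sign fixups and real/imaginary recombination of the complex
   products are applied after the loop (POSINV, shufps $0xb1). */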
	movl	%esp, %esi		# save old stack
	subl	$128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp
	andl	$-STACK_ALIGN, %esp	# align stack
	addl	$STACK_OFFSET, %esp
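
/* %esp now points STACK_OFFSET bytes past a STACK_ALIGN-aligned
   boundary, with at least 128 + LOCAL_BUFFER_SIZE bytes reserved
   below the caller's frame; %esi still addresses the incoming
   arguments. */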
	movss	STACK_OFFT, %xmm4

	movss	%xmm7,  0 + POSINV
	movss	%xmm2,  4 + POSINV
	movss	%xmm7,  8 + POSINV
	movss	%xmm2, 12 + POSINV

	movss	%xmm2,  0 + POSINV
	movss	%xmm7,  4 + POSINV
	movss	%xmm2,  8 + POSINV
	movss	%xmm7, 12 + POSINV
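
/* POSINV is built as a lane-alternating sign mask: the same two
   32-bit values are stored in opposite orders by the two blocks
   above, which presumably correspond to the plain and conjugated
   variants.  It is consumed below (movaps POSINV, %xmm0) to flip
   the signs of either the real or the imaginary lanes during the
   complex-multiply fixup. */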
	sall	$ZBASE_SHIFT, LDC
	sall	$ZBASE_SHIFT, %eax
	sall	$ZBASE_SHIFT, %eax
	sall	$ZBASE_SHIFT, %eax
#if defined(LN) || defined(RT)
	sall	$ZBASE_SHIFT, %eax
	leal	(BB, %eax, 4), BB
#if defined(LT) || defined(RN)
	movaps	 0 * SIZE(B), %xmm3
	movaps	 4 * SIZE(B), %xmm7

	pshufd	$0x00, %xmm3, %xmm0
	pshufd	$0x55, %xmm3, %xmm1
	pshufd	$0xaa, %xmm3, %xmm2
	pshufd	$0xff, %xmm3, %xmm3

	movaps	%xmm0,  0 * SIZE(BB)
	movaps	%xmm1,  4 * SIZE(BB)
	movaps	%xmm2,  8 * SIZE(BB)
	movaps	%xmm3, 12 * SIZE(BB)

	pshufd	$0x00, %xmm7, %xmm4
	pshufd	$0x55, %xmm7, %xmm5
	pshufd	$0xaa, %xmm7, %xmm6
	pshufd	$0xff, %xmm7, %xmm7

	movaps	%xmm4, 16 * SIZE(BB)
	movaps	%xmm5, 20 * SIZE(BB)
	movaps	%xmm6, 24 * SIZE(BB)
	movaps	%xmm7, 28 * SIZE(BB)
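
/* Each pshufd above broadcasts one float of B across a full vector
   ($0x00 = element 0, $0x55 = 1, $0xaa = 2, $0xff = 3), so the BB
   buffer holds every real and imaginary part of the B panel
   duplicated four times.  The kernel macros can then use plain
   mulps against BB with no shuffling inside the hot loop. */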
#if defined(LT) || defined(RN)
	movsd	 0 * SIZE(B), %xmm3

	pshufd	$0x00, %xmm3, %xmm0
	pshufd	$0x55, %xmm3, %xmm1

	movaps	%xmm0,  0 * SIZE(BB)
	movaps	%xmm1,  4 * SIZE(BB)

	addl	$ 2 * SIZE, %edi
	addl	$ 8 * SIZE, %ecx
#if defined(LT) || defined(RN)
	sall	$1 + ZBASE_SHIFT, %eax
#if defined(LN) || defined(RT)
	sall	$1 + ZBASE_SHIFT, %eax
	leal	BUFFER, BB	# boffset1 = boffset
#if defined(LN) || defined(RT)
	sall	$2 + ZBASE_SHIFT, %eax
	movaps	 0 * SIZE(AA), %xmm0
	movaps	16 * SIZE(AA), %xmm1
	movaps	 0 * SIZE(BB), %xmm2
	movaps	16 * SIZE(BB), %xmm3

	PREFETCHW 3 * SIZE(CO1)
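
/* Prefetch the destination C tile for writing before entering the
   accumulation loop, so ownership of the destination lines is
   requested off the critical path of the stores at the end of the
   block. */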
#if defined(LT) || defined(RN)
	movaps	 4 * SIZE(BB), %xmm2
	movaps	 4 * SIZE(AA), %xmm0
	movaps	 8 * SIZE(BB), %xmm2
	movaps	12 * SIZE(BB), %xmm2
	movaps	 8 * SIZE(AA), %xmm0
	movaps	32 * SIZE(BB), %xmm2
	movaps	20 * SIZE(BB), %xmm3
	movaps	12 * SIZE(AA), %xmm0
	movaps	24 * SIZE(BB), %xmm3
	movaps	28 * SIZE(BB), %xmm3
	movaps	32 * SIZE(AA), %xmm0
	movaps	48 * SIZE(BB), %xmm3
	movaps	36 * SIZE(BB), %xmm2
	movaps	20 * SIZE(AA), %xmm1
	movaps	40 * SIZE(BB), %xmm2
	movaps	44 * SIZE(BB), %xmm2
	movaps	24 * SIZE(AA), %xmm1
	movaps	64 * SIZE(BB), %xmm2
	movaps	52 * SIZE(BB), %xmm3
	movaps	28 * SIZE(AA), %xmm1
	movaps	56 * SIZE(BB), %xmm3
	movaps	60 * SIZE(BB), %xmm3
	movaps	48 * SIZE(AA), %xmm1
	movaps	80 * SIZE(BB), %xmm3
#if defined(LT) || defined(RN)
	andl	$7, %eax	# remaining iterations (k & 7)
	mulps	 4 * SIZE(BB), %xmm0
	movaps	 8 * SIZE(BB), %xmm2
	movaps	 4 * SIZE(AA), %xmm0
	shufps	$0xb1, %xmm5, %xmm5
#if defined(LN) || defined(LT)
#if defined(LN) || defined(RT)
	sall	$ZBASE_SHIFT, %eax
	leal	(AA, %eax, 2), AA
	leal	(BB, %eax, 4), BB
#if defined(LN) || defined(LT)
	unpcklpd %xmm6, %xmm4
	unpckhpd %xmm6, %xmm5

	movsd	 0 * SIZE(B), %xmm2
	movsd	 2 * SIZE(B), %xmm3
	movaps	 0 * SIZE(AA), %xmm1
	movaps	 4 * SIZE(AA), %xmm5

	pshufd	$0xee, %xmm5, %xmm6
	pshufd	$0xbb, %xmm5, %xmm7

	pshufd	$0xa0, %xmm3, %xmm4
	pshufd	$0xf5, %xmm3, %xmm3

	pshufd	$0x44, %xmm5, %xmm6
	pshufd	$0x11, %xmm5, %xmm7

	pshufd	$0xa0, %xmm3, %xmm4
	pshufd	$0xf5, %xmm3, %xmm1

	movaps	 0 * SIZE(AA), %xmm5

	pshufd	$0x44, %xmm5, %xmm6
	pshufd	$0x11, %xmm5, %xmm7

	pshufd	$0xa0, %xmm2, %xmm4
	pshufd	$0xf5, %xmm2, %xmm2

	movaps	 0 * SIZE(AA), %xmm5

	pshufd	$0x44, %xmm5, %xmm6
	pshufd	$0x11, %xmm5, %xmm7

	pshufd	$0xa0, %xmm2, %xmm4
	pshufd	$0xf5, %xmm2, %xmm2

	pshufd	$0xee, %xmm5, %xmm6
	pshufd	$0xbb, %xmm5, %xmm7

	pshufd	$0xa0, %xmm2, %xmm4
	pshufd	$0xf5, %xmm2, %xmm1

	movaps	 4 * SIZE(AA), %xmm5

	pshufd	$0xee, %xmm5, %xmm6
	pshufd	$0xbb, %xmm5, %xmm7

	pshufd	$0xa0, %xmm3, %xmm4
	pshufd	$0xf5, %xmm3, %xmm3
#if defined(RN) || defined(RT)
	movaps	 0 * SIZE(B), %xmm4

	pshufd	$0x44, %xmm4, %xmm6
	pshufd	$0x11, %xmm4, %xmm7

	pshufd	$0xa0, %xmm1, %xmm3
	pshufd	$0xf5, %xmm1, %xmm1
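
/* Substitution step of the triangular solve.  pshufd $0x44/$0x11
   replicate one complex element of the diagonal (presumably stored
   pre-inverted, so the solve multiplies instead of divides) as
   (re,im,re,im) and (im,re,im,re); $0xa0/$0xf5 duplicate the real
   and imaginary parts of the value being solved.  Combined with the
   POSINV mask these shuffles realize a 4-wide complex multiply,
   a*b or a*conj(b) depending on the conjugation variant. */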
#if defined(LN) || defined(LT)
	movlps	%xmm2,  0 * SIZE(B)
	movlps	%xmm3,  2 * SIZE(B)

	pshufd	$0x00, %xmm2, %xmm0
	pshufd	$0x55, %xmm2, %xmm1

	movaps	%xmm0,  0 * SIZE(BB)
	movaps	%xmm1,  4 * SIZE(BB)

	pshufd	$0x00, %xmm3, %xmm0
	pshufd	$0x55, %xmm3, %xmm1

	movaps	%xmm0,  8 * SIZE(BB)
	movaps	%xmm1, 12 * SIZE(BB)

	movlps	%xmm2,  0 * SIZE(CO1)
	movlps	%xmm3,  2 * SIZE(CO1)

	movaps	%xmm1,  0 * SIZE(AA)

	movlps	%xmm1,  0 * SIZE(CO1)
	movhps	%xmm1,  2 * SIZE(CO1)
#if defined(LT) || defined(RN)
	sall	$1 + ZBASE_SHIFT, %eax
	sall	$1 + ZBASE_SHIFT, %eax
	sall	$ZBASE_SHIFT, %eax
#if defined(LN) || defined(RT)
	sall	$ZBASE_SHIFT, %eax
	leal	BUFFER, BB	# boffset1 = boffset
#if defined(LN) || defined(RT)
	sall	$2 + ZBASE_SHIFT, %eax
	movsd	 0 * SIZE(AA), %xmm0
	movsd	 8 * SIZE(AA), %xmm1
	movaps	 0 * SIZE(BB), %xmm2
	movaps	16 * SIZE(BB), %xmm3
#if defined(LT) || defined(RN)
	movaps	 4 * SIZE(BB), %xmm2
	movsd	 2 * SIZE(AA), %xmm0
	movaps	 8 * SIZE(BB), %xmm2
	movaps	12 * SIZE(BB), %xmm2
	movsd	 4 * SIZE(AA), %xmm0
	movaps	32 * SIZE(BB), %xmm2
	movaps	20 * SIZE(BB), %xmm3
	movsd	 6 * SIZE(AA), %xmm0
	movaps	24 * SIZE(BB), %xmm3
	movaps	28 * SIZE(BB), %xmm3
	movsd	16 * SIZE(AA), %xmm0
	movaps	48 * SIZE(BB), %xmm3
	movaps	36 * SIZE(BB), %xmm2
	movsd	10 * SIZE(AA), %xmm1
	movaps	40 * SIZE(BB), %xmm2
	movaps	44 * SIZE(BB), %xmm2
	movsd	12 * SIZE(AA), %xmm1
	movaps	64 * SIZE(BB), %xmm2
	movaps	52 * SIZE(BB), %xmm3
	movsd	14 * SIZE(AA), %xmm1
	movaps	56 * SIZE(BB), %xmm3
	movaps	60 * SIZE(BB), %xmm3
	movsd	24 * SIZE(AA), %xmm1
	movaps	80 * SIZE(BB), %xmm3
#if defined(LT) || defined(RN)
	andl	$7, %eax	# remaining iterations (k & 7)
	mulps	 4 * SIZE(BB), %xmm0
	movaps	 8 * SIZE(BB), %xmm2
	movsd	 2 * SIZE(AA), %xmm0
	shufps	$0xb1, %xmm5, %xmm5
#if defined(LN) || defined(LT)
#if defined(LN) || defined(RT)
	sall	$ZBASE_SHIFT, %eax
	leal	(BB, %eax, 4), BB
#if defined(LN) || defined(LT)
	movsd	 0 * SIZE(B), %xmm2
	movsd	 0 * SIZE(AA), %xmm1
#if defined(LN) || defined(LT)
	movaps	 0 * SIZE(AA), %xmm5

	pshufd	$0x44, %xmm5, %xmm6
	pshufd	$0x11, %xmm5, %xmm7

	pshufd	$0xa0, %xmm2, %xmm4
	pshufd	$0xf5, %xmm2, %xmm2
#if defined(RN) || defined(RT)
	movaps	 0 * SIZE(B), %xmm4

	pshufd	$0x44, %xmm4, %xmm6
	pshufd	$0x11, %xmm4, %xmm7

	pshufd	$0xa0, %xmm1, %xmm3
	pshufd	$0xf5, %xmm1, %xmm1
#if defined(LN) || defined(LT)
	movlps	%xmm2,  0 * SIZE(B)

	pshufd	$0x00, %xmm2, %xmm0
	pshufd	$0x55, %xmm2, %xmm1

	movaps	%xmm0,  0 * SIZE(BB)
	movaps	%xmm1,  4 * SIZE(BB)

	movlps	%xmm2,  0 * SIZE(CO1)

	movlps	%xmm1,  0 * SIZE(AA)

	movlps	%xmm1,  0 * SIZE(CO1)
#if defined(LT) || defined(RN)
	sall	$ZBASE_SHIFT, %eax
	sall	$ZBASE_SHIFT, %eax
	sall	$ZBASE_SHIFT, %eax
#if defined(LT) || defined(RN)
	sall	$ZBASE_SHIFT, %eax
	sall	$1 + ZBASE_SHIFT, %eax
#if defined(LN) || defined(RT)
	sall	$1 + ZBASE_SHIFT, %eax
	leal	(BB, %eax, 4), BB
#if defined(LT) || defined(RN)
	movaps	 0 * SIZE(B), %xmm3
	movaps	 4 * SIZE(B), %xmm7

	pshufd	$0x00, %xmm3, %xmm0
	pshufd	$0x55, %xmm3, %xmm1
	pshufd	$0xaa, %xmm3, %xmm2
	pshufd	$0xff, %xmm3, %xmm3

	movaps	%xmm0,  0 * SIZE(BB)
	movaps	%xmm1,  4 * SIZE(BB)
	movaps	%xmm2,  8 * SIZE(BB)
	movaps	%xmm3, 12 * SIZE(BB)

	pshufd	$0x00, %xmm7, %xmm4
	pshufd	$0x55, %xmm7, %xmm5
	pshufd	$0xaa, %xmm7, %xmm6
	pshufd	$0xff, %xmm7, %xmm7

	movaps	%xmm4, 16 * SIZE(BB)
	movaps	%xmm5, 20 * SIZE(BB)
	movaps	%xmm6, 24 * SIZE(BB)
	movaps	%xmm7, 28 * SIZE(BB)
#if defined(LT) || defined(RN)
	movaps	 0 * SIZE(B), %xmm3

	pshufd	$0x00, %xmm3, %xmm0
	pshufd	$0x55, %xmm3, %xmm1
	pshufd	$0xaa, %xmm3, %xmm2
	pshufd	$0xff, %xmm3, %xmm3

	movaps	%xmm0,  0 * SIZE(BB)
	movaps	%xmm1,  4 * SIZE(BB)
	movaps	%xmm2,  8 * SIZE(BB)
	movaps	%xmm3, 12 * SIZE(BB)
#if defined(LT) || defined(RN)
	leal	(, LDC, 2), %eax
	sall	$1 + ZBASE_SHIFT, %eax
#if defined(LN) || defined(RT)
	sall	$1 + ZBASE_SHIFT, %eax
	leal	BUFFER, BB	# boffset1 = boffset
#if defined(LN) || defined(RT)
	sall	$3 + ZBASE_SHIFT, %eax
	movaps	 0 * SIZE(AA), %xmm0
	movaps	16 * SIZE(AA), %xmm1
	movaps	 0 * SIZE(BB), %xmm2
	movaps	16 * SIZE(BB), %xmm3

	PREFETCHW 3 * SIZE(CO1)
	PREFETCHW 3 * SIZE(CO1, LDC)
#if defined(LT) || defined(RN)
	addl	$ 32 * SIZE, AA
	addl	$128 * SIZE, BB
#if defined(LT) || defined(RN)
	andl	$7, %eax	# remaining iterations (k & 7)
	movaps	 4 * SIZE(BB), %xmm2
	movaps	 8 * SIZE(BB), %xmm2
	mulps	12 * SIZE(BB), %xmm0
	movaps	16 * SIZE(BB), %xmm2
	movaps	 4 * SIZE(AA), %xmm0

	movaps	POSINV, %xmm0

	shufps	$0xb1, %xmm5, %xmm5
	shufps	$0xb1, %xmm7, %xmm7
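
/* shufps $0xb1 swaps the two floats inside each complex pair
   (re,im -> im,re).  Applied to the odd accumulators and combined
   with the POSINV sign mask just loaded into %xmm0, this folds the
   cross terms of the complex products held in %xmm4..%xmm7 into
   final (real, imaginary) form. */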
#if defined(LN) || defined(LT)
#if defined(LN) || defined(RT)
	sall	$ZBASE_SHIFT, %eax
	leal	(AA, %eax, 2), AA
	leal	(B,  %eax, 2), B
	leal	(BB, %eax, 8), BB
#if defined(LN) || defined(LT)
	unpcklpd %xmm6, %xmm4
	unpckhpd %xmm6, %xmm5

	movaps	 0 * SIZE(B), %xmm2
	movaps	 4 * SIZE(B), %xmm3
	movaps	 0 * SIZE(AA), %xmm1
	movaps	 4 * SIZE(AA), %xmm5

	movaps	 4 * SIZE(AA), %xmm5

	pshufd	$0xee, %xmm5, %xmm6
	pshufd	$0xbb, %xmm5, %xmm7

	pshufd	$0xa0, %xmm3, %xmm4
	pshufd	$0xf5, %xmm3, %xmm3

	pshufd	$0x44, %xmm5, %xmm6
	pshufd	$0x11, %xmm5, %xmm7

	pshufd	$0xa0, %xmm3, %xmm4
	pshufd	$0xf5, %xmm3, %xmm1

	movaps	 0 * SIZE(AA), %xmm5

	pshufd	$0x44, %xmm5, %xmm6
	pshufd	$0x11, %xmm5, %xmm7

	pshufd	$0xa0, %xmm2, %xmm4
	pshufd	$0xf5, %xmm2, %xmm2

	movaps	 0 * SIZE(AA), %xmm5

	pshufd	$0x44, %xmm5, %xmm6
	pshufd	$0x11, %xmm5, %xmm7

	pshufd	$0xa0, %xmm2, %xmm4
	pshufd	$0xf5, %xmm2, %xmm2

	pshufd	$0xee, %xmm5, %xmm6
	pshufd	$0xbb, %xmm5, %xmm7

	pshufd	$0xa0, %xmm2, %xmm4
	pshufd	$0xf5, %xmm2, %xmm1

	movaps	 4 * SIZE(AA), %xmm5

	pshufd	$0xee, %xmm5, %xmm6
	pshufd	$0xbb, %xmm5, %xmm7

	pshufd	$0xa0, %xmm3, %xmm4
	pshufd	$0xf5, %xmm3, %xmm3

	movaps	 0 * SIZE(B), %xmm4

	pshufd	$0x44, %xmm4, %xmm6
	pshufd	$0x11, %xmm4, %xmm7

	pshufd	$0xa0, %xmm1, %xmm3
	pshufd	$0xf5, %xmm1, %xmm1

	pshufd	$0xee, %xmm4, %xmm6
	pshufd	$0xbb, %xmm4, %xmm7

	pshufd	$0xa0, %xmm1, %xmm3
	pshufd	$0xf5, %xmm1, %xmm2

	movaps	 4 * SIZE(B), %xmm4

	pshufd	$0xee, %xmm4, %xmm6
	pshufd	$0xbb, %xmm4, %xmm7

	pshufd	$0xa0, %xmm5, %xmm3
	pshufd	$0xf5, %xmm5, %xmm5

	movaps	 4 * SIZE(B), %xmm4

	pshufd	$0xee, %xmm4, %xmm6
	pshufd	$0xbb, %xmm4, %xmm7

	pshufd	$0xa0, %xmm5, %xmm3
	pshufd	$0xf5, %xmm5, %xmm5

	pshufd	$0x44, %xmm4, %xmm6
	pshufd	$0x11, %xmm4, %xmm7

	pshufd	$0xa0, %xmm5, %xmm3
	pshufd	$0xf5, %xmm5, %xmm2

	movaps	 0 * SIZE(B), %xmm4

	pshufd	$0x44, %xmm4, %xmm6
	pshufd	$0x11, %xmm4, %xmm7

	pshufd	$0xa0, %xmm1, %xmm3
	pshufd	$0xf5, %xmm1, %xmm1
#if defined(LN) || defined(LT)
	movaps	%xmm2,  0 * SIZE(B)
	movaps	%xmm3,  4 * SIZE(B)

	pshufd	$0x00, %xmm2, %xmm0
	pshufd	$0x55, %xmm2, %xmm1
	pshufd	$0xaa, %xmm2, %xmm4
	pshufd	$0xff, %xmm2, %xmm5

	movaps	%xmm0,  0 * SIZE(BB)
	movaps	%xmm1,  4 * SIZE(BB)
	movaps	%xmm4,  8 * SIZE(BB)
	movaps	%xmm5, 12 * SIZE(BB)

	pshufd	$0x00, %xmm3, %xmm0
	pshufd	$0x55, %xmm3, %xmm1
	pshufd	$0xaa, %xmm3, %xmm4
	pshufd	$0xff, %xmm3, %xmm5

	movaps	%xmm0, 16 * SIZE(BB)
	movaps	%xmm1, 20 * SIZE(BB)
	movaps	%xmm4, 24 * SIZE(BB)
	movaps	%xmm5, 28 * SIZE(BB)

	movlps	%xmm2,  0 * SIZE(CO1)
	movlps	%xmm3,  2 * SIZE(CO1)
	movhps	%xmm2,  0 * SIZE(CO1, LDC)
	movhps	%xmm3,  2 * SIZE(CO1, LDC)

	movaps	%xmm1,  0 * SIZE(AA)
	movaps	%xmm5,  4 * SIZE(AA)

	movlps	%xmm1,  0 * SIZE(CO1)
	movhps	%xmm1,  2 * SIZE(CO1)
	movlps	%xmm5,  0 * SIZE(CO1, LDC)
	movhps	%xmm5,  2 * SIZE(CO1, LDC)
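
/* Result write-back.  In the LN/LT case the solved block is stored
   three ways: back into the packed B panel, splatted into BB so the
   remaining kernels of this pass see the updated values, and into
   the C tile at CO1 / CO1+LDC; the other variants go back through
   the packed A panel (AA) instead. */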
#if defined(LT) || defined(RN)
	sall	$1 + ZBASE_SHIFT, %eax
	sall	$1 + ZBASE_SHIFT, %eax
	sall	$ZBASE_SHIFT, %eax
#if defined(LN) || defined(RT)
	sall	$ZBASE_SHIFT, %eax
	leal	BUFFER, BB	# boffset1 = boffset
#if defined(LN) || defined(RT)
	sall	$3 + ZBASE_SHIFT, %eax
	movsd	 0 * SIZE(AA), %xmm0
	movsd	 8 * SIZE(AA), %xmm1
	movaps	 0 * SIZE(BB), %xmm2
	movaps	16 * SIZE(BB), %xmm3
#if defined(LT) || defined(RN)
	prefetcht1 (PREFETCHSIZE +  0) * SIZE(AA)
	movaps	 4 * SIZE(BB), %xmm2
	movaps	 8 * SIZE(BB), %xmm2
	mulps	12 * SIZE(BB), %xmm0
	movaps	32 * SIZE(BB), %xmm2
	movsd	 2 * SIZE(AA), %xmm0
	movaps	20 * SIZE(BB), %xmm3
	movaps	24 * SIZE(BB), %xmm3
	mulps	28 * SIZE(BB), %xmm0
	movaps	48 * SIZE(BB), %xmm3
	movsd	 4 * SIZE(AA), %xmm0
	movaps	36 * SIZE(BB), %xmm2
	movaps	40 * SIZE(BB), %xmm2
	mulps	44 * SIZE(BB), %xmm0
	movaps	64 * SIZE(BB), %xmm2
	movsd	 6 * SIZE(AA), %xmm0
	movaps	52 * SIZE(BB), %xmm3
	movaps	56 * SIZE(BB), %xmm3
	mulps	60 * SIZE(BB), %xmm0
	movaps	80 * SIZE(BB), %xmm3
	movsd	16 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
	prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
	movaps	68 * SIZE(BB), %xmm2
	movaps	72 * SIZE(BB), %xmm2
	mulps	76 * SIZE(BB), %xmm1
	movaps	96 * SIZE(BB), %xmm2
	movsd	10 * SIZE(AA), %xmm1
	movaps	84 * SIZE(BB), %xmm3
	movaps	88 * SIZE(BB), %xmm3
	mulps	92 * SIZE(BB), %xmm1
	movaps	112 * SIZE(BB), %xmm3
	movsd	12 * SIZE(AA), %xmm1
	movaps	100 * SIZE(BB), %xmm2
	movaps	104 * SIZE(BB), %xmm2
	mulps	108 * SIZE(BB), %xmm1
	movaps	128 * SIZE(BB), %xmm2
	movsd	14 * SIZE(AA), %xmm1
	movaps	116 * SIZE(BB), %xmm3
	movaps	120 * SIZE(BB), %xmm3
	mulps	124 * SIZE(BB), %xmm1
	movaps	144 * SIZE(BB), %xmm3
	movsd	24 * SIZE(AA), %xmm1

	addl	$ 16 * SIZE, AA
	addl	$128 * SIZE, BB
#if defined(LT) || defined(RN)
	andl	$7, %eax	# remaining iterations (k & 7)
	movaps	 4 * SIZE(BB), %xmm2
	movaps	 8 * SIZE(BB), %xmm2
	mulps	12 * SIZE(BB), %xmm0
	movaps	16 * SIZE(BB), %xmm2
	movsd	 2 * SIZE(AA), %xmm0

	movaps	POSINV, %xmm0

	shufps	$0xb1, %xmm5, %xmm5
	shufps	$0xb1, %xmm7, %xmm7
#if defined(LN) || defined(LT)
#if defined(LN) || defined(RT)
	sall	$ZBASE_SHIFT, %eax
	leal	(AA, %eax, 1), AA
	leal	(B,  %eax, 2), B
	leal	(BB, %eax, 8), BB
#if defined(LN) || defined(LT)
	unpcklpd %xmm6, %xmm4

	movaps	 0 * SIZE(B), %xmm2
	movsd	 0 * SIZE(AA), %xmm1
	movsd	 2 * SIZE(AA), %xmm5
#if defined(LN) || defined(LT)
	movaps	 0 * SIZE(AA), %xmm5

	pshufd	$0x44, %xmm5, %xmm6
	pshufd	$0x11, %xmm5, %xmm7

	pshufd	$0xa0, %xmm2, %xmm4
	pshufd	$0xf5, %xmm2, %xmm2

	movaps	 0 * SIZE(B), %xmm4

	pshufd	$0x44, %xmm4, %xmm6
	pshufd	$0x11, %xmm4, %xmm7

	pshufd	$0xa0, %xmm1, %xmm3
	pshufd	$0xf5, %xmm1, %xmm1

	pshufd	$0xee, %xmm4, %xmm6
	pshufd	$0xbb, %xmm4, %xmm7

	pshufd	$0xa0, %xmm1, %xmm3
	pshufd	$0xf5, %xmm1, %xmm2

	movaps	 4 * SIZE(B), %xmm4

	pshufd	$0xee, %xmm4, %xmm6
	pshufd	$0xbb, %xmm4, %xmm7

	pshufd	$0xa0, %xmm5, %xmm3
	pshufd	$0xf5, %xmm5, %xmm5

	movaps	 4 * SIZE(B), %xmm4

	pshufd	$0xee, %xmm4, %xmm6
	pshufd	$0xbb, %xmm4, %xmm7

	pshufd	$0xa0, %xmm5, %xmm3
	pshufd	$0xf5, %xmm5, %xmm5

	pshufd	$0x44, %xmm4, %xmm6
	pshufd	$0x11, %xmm4, %xmm7

	pshufd	$0xa0, %xmm5, %xmm3
	pshufd	$0xf5, %xmm5, %xmm2

	movaps	 0 * SIZE(B), %xmm4

	pshufd	$0x44, %xmm4, %xmm6
	pshufd	$0x11, %xmm4, %xmm7

	pshufd	$0xa0, %xmm1, %xmm3
	pshufd	$0xf5, %xmm1, %xmm1
#if defined(LN) || defined(LT)
	movaps	%xmm2,  0 * SIZE(B)

	pshufd	$0x00, %xmm2, %xmm0
	pshufd	$0x55, %xmm2, %xmm1
	pshufd	$0xaa, %xmm2, %xmm4
	pshufd	$0xff, %xmm2, %xmm5

	movaps	%xmm0,  0 * SIZE(BB)
	movaps	%xmm1,  4 * SIZE(BB)
	movaps	%xmm4,  8 * SIZE(BB)
	movaps	%xmm5, 12 * SIZE(BB)

	movlps	%xmm2,  0 * SIZE(CO1)
	movhps	%xmm2,  0 * SIZE(CO1, LDC)

	movlps	%xmm1,  0 * SIZE(AA)
	movlps	%xmm5,  2 * SIZE(AA)

	movlps	%xmm1,  0 * SIZE(CO1)
	movlps	%xmm5,  0 * SIZE(CO1, LDC)
#if defined(LT) || defined(RN)
	sall	$ZBASE_SHIFT, %eax
	sall	$ZBASE_SHIFT, %eax
	sall	$1 + ZBASE_SHIFT, %eax
#if defined(LT) || defined(RN)
	sall	$1 + ZBASE_SHIFT, %eax

	movl	OLD_STACK, %esp	# restore the stack pointer saved at entry