1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin. */
3 /* All rights reserved. */
5 /* Redistribution and use in source and binary forms, with or */
6 /* without modification, are permitted provided that the following */
7 /* conditions are met: */
9 /* 1. Redistributions of source code must retain the above */
10 /* copyright notice, this list of conditions and the following */
13 /* 2. Redistributions in binary form must reproduce the above */
14 /* copyright notice, this list of conditions and the following */
15 /* disclaimer in the documentation and/or other materials */
16 /* provided with the distribution. */
18 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31 /* POSSIBILITY OF SUCH DAMAGE. */
33 /* The views and conclusions contained in the software and */
34 /* documentation are those of the authors and should not be */
35 /* interpreted as representing official policies, either expressed */
36 /* or implied, of The University of Texas at Austin. */
37 /*********************************************************************/
/* Offsets of the caller-pushed arguments, addressed through %esi,
 * which holds the caller's stack pointer saved at function entry
 * (see the prologue: movl %esp, %esi).  STACK and ARGS come from
 * build headers not visible in this excerpt.
 * NOTE(review): the offsets jump from 12 to 24 -- presumably the
 * alpha_r/alpha_i arguments sit in between; confirm in the full file. */
45 #define STACK_M 4 + STACK + ARGS(%esi)
46 #define STACK_N 8 + STACK + ARGS(%esi)
47 #define STACK_K 12 + STACK + ARGS(%esi)
48 #define STACK_A 24 + STACK + ARGS(%esi)
49 #define STACK_B 28 + STACK + ARGS(%esi)
50 #define STACK_C 32 + STACK + ARGS(%esi)
51 #define STACK_LDC 36 + STACK + ARGS(%esi)
52 #define STACK_OFFT 40 + STACK + ARGS(%esi)
/* Layout of locals on the realigned stack (%esp after the
 * sub/and/add alignment sequence in the prologue).  Gaps between
 * the offsets correspond to further locals elided from this excerpt. */
54 #define POSINV 0(%esp)      /* 16-byte pattern built from %xmm7/%xmm2 at entry */
61 #define OLD_STACK 40(%esp)  /* caller's %esp; restored via movl OLD_STACK, %esp */
62 #define OFFSET 48(%esp)
65 #define AORIG 60(%esp)
66 #define BORIG 64(%esp)
67 #define BUFFER 128(%esp)    /* packed, 4-way-replicated copy of B (the BB stream) */
/* The local buffer is carved out of the stack and aligned down to a
 * STACK_ALIGN boundary; STACK_OFFSET then biases %esp back up so the
 * locals do not sit exactly on the page-aligned boundary. */
75 #define STACK_ALIGN 4096
76 #define STACK_OFFSET 1024
/* Per-microarchitecture prefetch tuning: each conditional group picks
 * the prefetch instruction flavour and look-ahead distance used on the
 * A-matrix stream (and, on AMD parts, a write-prefetch size for the
 * packing buffer). */
78 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
79 #define PREFETCHSIZE (16 * 10 + 8)
80 #define WPREFETCHSIZE 112
81 #define PREFETCH prefetch
82 #define PREFETCHW prefetchw      /* AMD write-intent prefetch */
85 #if defined(PENTIUM4) || defined(PENTIUMM)
86 #define PREFETCH prefetcht1
87 #define PREFETCHSIZE 168
88 #define PREFETCHW prefetcht0
91 #if defined(PENRYN) || defined(DUNNINGTON)
92 #define PREFETCH prefetcht1
93 #define PREFETCHSIZE 168
94 #define PREFETCHW prefetcht0
97 #if defined(OPTERON) || !defined(HAVE_SSE2)
/* KERNEL1 -- first of eight software-pipelined sub-steps of the
 * unrolled inner-product loop.  Multiplies the packed A operand in
 * %xmm0 by four consecutive replicated B vectors from the BB stream
 * and accumulates into %xmm4..%xmm7; reloads %xmm2 (B) and %xmm0 (A)
 * for later steps, and prefetches ahead in the A stream. */
105 #define KERNEL1(address) \
106 mulps %xmm0, %xmm2; \
107 PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \
108 addps %xmm2, %xmm4; \
109 movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
110 mulps %xmm0, %xmm2; \
111 addps %xmm2, %xmm5; \
112 movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
113 mulps %xmm0, %xmm2; \
114 mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
115 addps %xmm2, %xmm6; \
116 movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
117 addps %xmm0, %xmm7; \
118 movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0
/* KERNEL2 -- second unrolled sub-step: same multiply/accumulate
 * pattern into %xmm4..%xmm7 but with %xmm3 as the B scratch register
 * (B offsets 16..28) and no prefetch; reloads %xmm0 from 8*SIZE
 * ahead in the A stream. */
120 #define KERNEL2(address) \
121 mulps %xmm0, %xmm3; \
122 addps %xmm3, %xmm4; \
123 movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
124 mulps %xmm0, %xmm3; \
125 addps %xmm3, %xmm5; \
126 movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
127 mulps %xmm0, %xmm3; \
128 mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
129 addps %xmm3, %xmm6; \
130 movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
131 addps %xmm0, %xmm7; \
132 movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0
/* KERNEL3 -- third unrolled sub-step: B operand back in %xmm2
 * (B offsets 32..44), accumulating into %xmm4..%xmm7; reloads %xmm0
 * from 12*SIZE in the A stream. */
134 #define KERNEL3(address) \
135 mulps %xmm0, %xmm2; \
136 addps %xmm2, %xmm4; \
137 movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
138 mulps %xmm0, %xmm2; \
139 addps %xmm2, %xmm5; \
140 movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
141 mulps %xmm0, %xmm2; \
142 mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
143 addps %xmm2, %xmm6; \
144 movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
145 addps %xmm0, %xmm7; \
146 movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0
/* KERNEL4 -- fourth unrolled sub-step (B offsets 48..60, scratch
 * %xmm3).  The final load refills %xmm0 from 32*SIZE: the first A
 * vector of the NEXT unrolled iteration, since steps 5..8 consume the
 * other A register, %xmm1. */
148 #define KERNEL4(address) \
149 mulps %xmm0, %xmm3; \
150 addps %xmm3, %xmm4; \
151 movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
152 mulps %xmm0, %xmm3; \
153 addps %xmm3, %xmm5; \
154 movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
155 mulps %xmm0, %xmm3; \
156 mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
157 addps %xmm3, %xmm6; \
158 movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
159 addps %xmm0, %xmm7; \
160 movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0
/* KERNEL5 -- fifth unrolled sub-step: switches to the second A
 * register, %xmm1, against B offsets 64..76 (scratch %xmm2),
 * still accumulating into %xmm4..%xmm7; reloads %xmm1 from 20*SIZE. */
162 #define KERNEL5(address) \
163 mulps %xmm1, %xmm2; \
164 addps %xmm2, %xmm4; \
165 movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
166 mulps %xmm1, %xmm2; \
167 addps %xmm2, %xmm5; \
168 movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
169 mulps %xmm1, %xmm2; \
170 mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
171 addps %xmm2, %xmm6; \
172 movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
173 addps %xmm1, %xmm7; \
174 movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1
/* KERNEL6 -- sixth unrolled sub-step: %xmm1 against B offsets 80..92
 * (scratch %xmm3), accumulating into %xmm4..%xmm7; reloads %xmm1
 * from 24*SIZE in the A stream. */
176 #define KERNEL6(address) \
177 mulps %xmm1, %xmm3; \
178 addps %xmm3, %xmm4; \
179 movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
180 mulps %xmm1, %xmm3; \
181 addps %xmm3, %xmm5; \
182 movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
183 mulps %xmm1, %xmm3; \
184 mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
185 addps %xmm3, %xmm6; \
186 movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
187 addps %xmm1, %xmm7; \
188 movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1
/* KERNEL7 -- seventh unrolled sub-step: %xmm1 against B offsets
 * 96..108 (scratch %xmm2); the 128*SIZE load of %xmm2 primes the
 * next unrolled iteration's B stream.  Reloads %xmm1 from 28*SIZE. */
190 #define KERNEL7(address) \
191 mulps %xmm1, %xmm2; \
192 addps %xmm2, %xmm4; \
193 movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
194 mulps %xmm1, %xmm2; \
195 addps %xmm2, %xmm5; \
196 movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
197 mulps %xmm1, %xmm2; \
198 mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
199 addps %xmm2, %xmm6; \
200 movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
201 addps %xmm1, %xmm7; \
202 movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1
/* KERNEL8 -- last unrolled sub-step: %xmm1 against B offsets 112..124
 * (scratch %xmm3).  The trailing loads (144*SIZE into %xmm3, 48*SIZE
 * into %xmm1) prime the registers for the next unrolled iteration. */
204 #define KERNEL8(address) \
205 mulps %xmm1, %xmm3; \
206 addps %xmm3, %xmm4; \
207 movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
208 mulps %xmm1, %xmm3; \
209 addps %xmm3, %xmm5; \
210 movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
211 mulps %xmm1, %xmm3; \
212 mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
213 addps %xmm3, %xmm6; \
214 movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
215 addps %xmm1, %xmm7; \
216 movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1;
/* Prologue stack setup: preserve the caller's %esp in %esi (the
 * argument macros above address the incoming stack through %esi),
 * then carve out the local buffer and realign %esp down to a
 * STACK_ALIGN boundary, biased back up by STACK_OFFSET. */
227 movl %esp, %esi # save old stack
229 subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp
230 andl $-STACK_ALIGN, %esp # align stack
231 addl $STACK_OFFSET, %esp
/* Read the OFFSET argument into %xmm4, then fill the 16-byte POSINV
 * slot with an alternating %xmm7/%xmm2 pattern.  Two opposite
 * interleavings are stored; they are alternatives chosen by
 * preprocessor conditionals whose directives are elided from this
 * excerpt -- NOTE(review): presumably selecting the sign mask for
 * conjugated vs. non-conjugated complex variants; confirm against
 * the full file. */
248 movss STACK_OFFT, %xmm4
256 movss %xmm7, 0 + POSINV
257 movss %xmm2, 4 + POSINV
258 movss %xmm7, 8 + POSINV
259 movss %xmm2, 12 + POSINV
261 movss %xmm2, 0 + POSINV
262 movss %xmm7, 4 + POSINV
263 movss %xmm2, 8 + POSINV
264 movss %xmm7, 12 + POSINV
275 sall $ZBASE_SHIFT, LDC
279 sall $ZBASE_SHIFT, %eax
287 sall $ZBASE_SHIFT, %eax
323 sall $1 + ZBASE_SHIFT, %eax
327 #if defined(LN) || defined(RT)
330 sall $1 + ZBASE_SHIFT, %eax
332 leal (BB, %eax, 4), BB
340 #if defined(LT) || defined(RN)
351 movaps 0 * SIZE(B), %xmm3
352 movaps 4 * SIZE(B), %xmm7
354 pshufd $0x00, %xmm3, %xmm0
355 pshufd $0x55, %xmm3, %xmm1
356 pshufd $0xaa, %xmm3, %xmm2
357 pshufd $0xff, %xmm3, %xmm3
359 movaps %xmm0, 0 * SIZE(BB)
360 movaps %xmm1, 4 * SIZE(BB)
361 movaps %xmm2, 8 * SIZE(BB)
362 movaps %xmm3, 12 * SIZE(BB)
364 pshufd $0x00, %xmm7, %xmm4
365 pshufd $0x55, %xmm7, %xmm5
366 pshufd $0xaa, %xmm7, %xmm6
367 pshufd $0xff, %xmm7, %xmm7
369 movaps %xmm4, 16 * SIZE(BB)
370 movaps %xmm5, 20 * SIZE(BB)
371 movaps %xmm6, 24 * SIZE(BB)
372 movaps %xmm7, 28 * SIZE(BB)
382 #if defined(LT) || defined(RN)
394 movaps 0 * SIZE(B), %xmm3
396 pshufd $0x00, %xmm3, %xmm0
397 pshufd $0x55, %xmm3, %xmm1
398 pshufd $0xaa, %xmm3, %xmm2
399 pshufd $0xff, %xmm3, %xmm3
401 movaps %xmm0, 0 * SIZE(BB)
402 movaps %xmm1, 4 * SIZE(BB)
403 movaps %xmm2, 8 * SIZE(BB)
404 movaps %xmm3, 12 * SIZE(BB)
410 #if defined(LT) || defined(RN)
418 leal (, LDC, 2), %eax
438 sall $ZBASE_SHIFT, %eax
442 #if defined(LN) || defined(RT)
447 sall $ZBASE_SHIFT, %eax
451 leal BUFFER, BB # boffset1 = boffset
453 #if defined(LN) || defined(RT)
455 sall $3 + ZBASE_SHIFT, %eax
467 movsd 0 * SIZE(AA), %xmm0
471 movsd 8 * SIZE(AA), %xmm1
472 movaps 0 * SIZE(BB), %xmm2
473 movaps 16 * SIZE(BB), %xmm3
475 #if defined(LT) || defined(RN)
487 prefetcht1 (PREFETCHSIZE + 0) * SIZE(AA)
489 movaps 4 * SIZE(BB), %xmm2
492 movaps 8 * SIZE(BB), %xmm2
494 mulps 12 * SIZE(BB), %xmm0
496 movaps 32 * SIZE(BB), %xmm2
498 movsd 2 * SIZE(AA), %xmm0
501 movaps 20 * SIZE(BB), %xmm3
504 movaps 24 * SIZE(BB), %xmm3
506 mulps 28 * SIZE(BB), %xmm0
508 movaps 48 * SIZE(BB), %xmm3
510 movsd 4 * SIZE(AA), %xmm0
513 movaps 36 * SIZE(BB), %xmm2
516 movaps 40 * SIZE(BB), %xmm2
518 mulps 44 * SIZE(BB), %xmm0
520 movaps 64 * SIZE(BB), %xmm2
522 movsd 6 * SIZE(AA), %xmm0
525 movaps 52 * SIZE(BB), %xmm3
528 movaps 56 * SIZE(BB), %xmm3
530 mulps 60 * SIZE(BB), %xmm0
532 movaps 80 * SIZE(BB), %xmm3
534 movsd 16 * SIZE(AA), %xmm0
536 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
537 prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
540 movaps 68 * SIZE(BB), %xmm2
543 movaps 72 * SIZE(BB), %xmm2
545 mulps 76 * SIZE(BB), %xmm1
547 movaps 96 * SIZE(BB), %xmm2
549 movsd 10 * SIZE(AA), %xmm1
552 movaps 84 * SIZE(BB), %xmm3
555 movaps 88 * SIZE(BB), %xmm3
557 mulps 92 * SIZE(BB), %xmm1
559 movaps 112 * SIZE(BB), %xmm3
561 movsd 12 * SIZE(AA), %xmm1
564 movaps 100 * SIZE(BB), %xmm2
567 movaps 104 * SIZE(BB), %xmm2
569 mulps 108 * SIZE(BB), %xmm1
571 movaps 128 * SIZE(BB), %xmm2
573 movsd 14 * SIZE(AA), %xmm1
576 movaps 116 * SIZE(BB), %xmm3
579 movaps 120 * SIZE(BB), %xmm3
581 mulps 124 * SIZE(BB), %xmm1
583 movaps 144 * SIZE(BB), %xmm3
585 movsd 24 * SIZE(AA), %xmm1
593 #if defined(LT) || defined(RN)
599 andl $7, %eax # if (k & 1)
607 movaps 4 * SIZE(BB), %xmm2
610 movaps 8 * SIZE(BB), %xmm2
612 mulps 12 * SIZE(BB), %xmm0
614 movaps 16 * SIZE(BB), %xmm2
616 movsd 2 * SIZE(AA), %xmm0
627 shufps $0xb1, %xmm5, %xmm5
628 shufps $0xb1, %xmm7, %xmm7
630 #if defined(LN) || defined(LT)
646 #if defined(LN) || defined(RT)
658 sall $ZBASE_SHIFT, %eax
659 leal (AA, %eax, 1), AA
661 leal (BB, %eax, 8), BB
664 #if defined(LN) || defined(LT)
665 unpcklpd %xmm6, %xmm4
667 movaps 0 * SIZE(B), %xmm2
674 movsd 0 * SIZE(AA), %xmm1
678 movsd 2 * SIZE(AA), %xmm5
684 #if defined(LN) || defined(LT)
685 movaps 0 * SIZE(AA), %xmm5
687 pshufd $0x44, %xmm5, %xmm6
688 pshufd $0x11, %xmm5, %xmm7
690 pshufd $0xa0, %xmm2, %xmm4
691 pshufd $0xf5, %xmm2, %xmm2
705 movaps 0 * SIZE(B), %xmm4
707 pshufd $0x44, %xmm4, %xmm6
708 pshufd $0x11, %xmm4, %xmm7
710 pshufd $0xa0, %xmm1, %xmm3
711 pshufd $0xf5, %xmm1, %xmm1
724 pshufd $0xee, %xmm4, %xmm6
725 pshufd $0xbb, %xmm4, %xmm7
727 pshufd $0xa0, %xmm1, %xmm3
728 pshufd $0xf5, %xmm1, %xmm2
742 movaps 4 * SIZE(B), %xmm4
744 pshufd $0xee, %xmm4, %xmm6
745 pshufd $0xbb, %xmm4, %xmm7
747 pshufd $0xa0, %xmm5, %xmm3
748 pshufd $0xf5, %xmm5, %xmm5
763 movaps 4 * SIZE(B), %xmm4
765 pshufd $0xee, %xmm4, %xmm6
766 pshufd $0xbb, %xmm4, %xmm7
768 pshufd $0xa0, %xmm5, %xmm3
769 pshufd $0xf5, %xmm5, %xmm5
782 pshufd $0x44, %xmm4, %xmm6
783 pshufd $0x11, %xmm4, %xmm7
785 pshufd $0xa0, %xmm5, %xmm3
786 pshufd $0xf5, %xmm5, %xmm2
800 movaps 0 * SIZE(B), %xmm4
802 pshufd $0x44, %xmm4, %xmm6
803 pshufd $0x11, %xmm4, %xmm7
805 pshufd $0xa0, %xmm1, %xmm3
806 pshufd $0xf5, %xmm1, %xmm1
824 #if defined(LN) || defined(LT)
825 movaps %xmm2, 0 * SIZE(B)
827 pshufd $0x00, %xmm2, %xmm0
828 pshufd $0x55, %xmm2, %xmm1
829 pshufd $0xaa, %xmm2, %xmm4
830 pshufd $0xff, %xmm2, %xmm5
832 movaps %xmm0, 0 * SIZE(BB)
833 movaps %xmm1, 4 * SIZE(BB)
834 movaps %xmm4, 8 * SIZE(BB)
835 movaps %xmm5, 12 * SIZE(BB)
837 movlps %xmm2, 0 * SIZE(CO1)
838 movhps %xmm2, 0 * SIZE(CO1, LDC)
840 movlps %xmm1, 0 * SIZE(AA)
841 movlps %xmm5, 2 * SIZE(AA)
843 movlps %xmm1, 0 * SIZE(CO1)
844 movlps %xmm5, 0 * SIZE(CO1, LDC)
851 #if defined(LT) || defined(RN)
854 sall $ZBASE_SHIFT, %eax
873 sall $ZBASE_SHIFT, %eax
887 sall $1 + ZBASE_SHIFT, %eax
891 #if defined(LN) || defined(RT)
896 sall $1 + ZBASE_SHIFT, %eax
900 leal BUFFER, BB # boffset1 = boffset
902 #if defined(LN) || defined(RT)
904 sall $3 + ZBASE_SHIFT, %eax
908 movaps 0 * SIZE(AA), %xmm0
910 movaps 16 * SIZE(AA), %xmm1
912 movaps 0 * SIZE(BB), %xmm2
914 movaps 16 * SIZE(BB), %xmm3
917 PREFETCHW -4 * SIZE(CO1)
918 PREFETCHW -4 * SIZE(CO1, LDC)
920 #if defined(LT) || defined(RN)
947 #if defined(LT) || defined(RN)
953 andl $7, %eax # if (k & 1)
961 movaps 4 * SIZE(BB), %xmm2
964 movaps 8 * SIZE(BB), %xmm2
966 mulps 12 * SIZE(BB), %xmm0
968 movaps 16 * SIZE(BB), %xmm2
970 movaps 4 * SIZE(AA), %xmm0
981 shufps $0xb1, %xmm5, %xmm5
982 shufps $0xb1, %xmm7, %xmm7
984 #if defined(LN) || defined(LT)
1000 #if defined(LN) || defined(RT)
1012 sall $ZBASE_SHIFT, %eax
1013 leal (AA, %eax, 2), AA
1014 leal (B, %eax, 2), B
1015 leal (BB, %eax, 8), BB
1018 #if defined(LN) || defined(LT)
1020 unpcklpd %xmm6, %xmm4
1021 unpckhpd %xmm6, %xmm5
1023 movaps 0 * SIZE(B), %xmm2
1024 movaps 4 * SIZE(B), %xmm3
1029 movaps 0 * SIZE(AA), %xmm1
1030 movaps 4 * SIZE(AA), %xmm5
1037 movaps 4 * SIZE(AA), %xmm5
1039 pshufd $0xee, %xmm5, %xmm6
1040 pshufd $0xbb, %xmm5, %xmm7
1042 pshufd $0xa0, %xmm3, %xmm4
1043 pshufd $0xf5, %xmm3, %xmm3
1055 pshufd $0x44, %xmm5, %xmm6
1056 pshufd $0x11, %xmm5, %xmm7
1058 pshufd $0xa0, %xmm3, %xmm4
1059 pshufd $0xf5, %xmm3, %xmm1
1072 movaps 0 * SIZE(AA), %xmm5
1074 pshufd $0x44, %xmm5, %xmm6
1075 pshufd $0x11, %xmm5, %xmm7
1077 pshufd $0xa0, %xmm2, %xmm4
1078 pshufd $0xf5, %xmm2, %xmm2
1092 movaps 0 * SIZE(AA), %xmm5
1094 pshufd $0x44, %xmm5, %xmm6
1095 pshufd $0x11, %xmm5, %xmm7
1097 pshufd $0xa0, %xmm2, %xmm4
1098 pshufd $0xf5, %xmm2, %xmm2
1110 pshufd $0xee, %xmm5, %xmm6
1111 pshufd $0xbb, %xmm5, %xmm7
1113 pshufd $0xa0, %xmm2, %xmm4
1114 pshufd $0xf5, %xmm2, %xmm1
1127 movaps 4 * SIZE(AA), %xmm5
1129 pshufd $0xee, %xmm5, %xmm6
1130 pshufd $0xbb, %xmm5, %xmm7
1132 pshufd $0xa0, %xmm3, %xmm4
1133 pshufd $0xf5, %xmm3, %xmm3
1147 movaps 0 * SIZE(B), %xmm4
1149 pshufd $0x44, %xmm4, %xmm6
1150 pshufd $0x11, %xmm4, %xmm7
1152 pshufd $0xa0, %xmm1, %xmm3
1153 pshufd $0xf5, %xmm1, %xmm1
1166 pshufd $0xee, %xmm4, %xmm6
1167 pshufd $0xbb, %xmm4, %xmm7
1169 pshufd $0xa0, %xmm1, %xmm3
1170 pshufd $0xf5, %xmm1, %xmm2
1184 movaps 4 * SIZE(B), %xmm4
1186 pshufd $0xee, %xmm4, %xmm6
1187 pshufd $0xbb, %xmm4, %xmm7
1189 pshufd $0xa0, %xmm5, %xmm3
1190 pshufd $0xf5, %xmm5, %xmm5
1205 movaps 4 * SIZE(B), %xmm4
1207 pshufd $0xee, %xmm4, %xmm6
1208 pshufd $0xbb, %xmm4, %xmm7
1210 pshufd $0xa0, %xmm5, %xmm3
1211 pshufd $0xf5, %xmm5, %xmm5
1224 pshufd $0x44, %xmm4, %xmm6
1225 pshufd $0x11, %xmm4, %xmm7
1227 pshufd $0xa0, %xmm5, %xmm3
1228 pshufd $0xf5, %xmm5, %xmm2
1242 movaps 0 * SIZE(B), %xmm4
1244 pshufd $0x44, %xmm4, %xmm6
1245 pshufd $0x11, %xmm4, %xmm7
1247 pshufd $0xa0, %xmm1, %xmm3
1248 pshufd $0xf5, %xmm1, %xmm1
1266 #if defined(LN) || defined(LT)
1267 movaps %xmm2, 0 * SIZE(B)
1268 movaps %xmm3, 4 * SIZE(B)
1270 pshufd $0x00, %xmm2, %xmm0
1271 pshufd $0x55, %xmm2, %xmm1
1272 pshufd $0xaa, %xmm2, %xmm4
1273 pshufd $0xff, %xmm2, %xmm5
1275 movaps %xmm0, 0 * SIZE(BB)
1276 movaps %xmm1, 4 * SIZE(BB)
1277 movaps %xmm4, 8 * SIZE(BB)
1278 movaps %xmm5, 12 * SIZE(BB)
1280 pshufd $0x00, %xmm3, %xmm0
1281 pshufd $0x55, %xmm3, %xmm1
1282 pshufd $0xaa, %xmm3, %xmm4
1283 pshufd $0xff, %xmm3, %xmm5
1285 movaps %xmm0, 16 * SIZE(BB)
1286 movaps %xmm1, 20 * SIZE(BB)
1287 movaps %xmm4, 24 * SIZE(BB)
1288 movaps %xmm5, 28 * SIZE(BB)
1290 movlps %xmm2, 0 * SIZE(CO1)
1291 movlps %xmm3, 2 * SIZE(CO1)
1292 movhps %xmm2, 0 * SIZE(CO1, LDC)
1293 movhps %xmm3, 2 * SIZE(CO1, LDC)
1295 movaps %xmm1, 0 * SIZE(AA)
1296 movaps %xmm5, 4 * SIZE(AA)
1298 movlps %xmm1, 0 * SIZE(CO1)
1299 movhps %xmm1, 2 * SIZE(CO1)
1301 movlps %xmm5, 0 * SIZE(CO1, LDC)
1302 movhps %xmm5, 2 * SIZE(CO1, LDC)
1309 #if defined(LT) || defined(RN)
1312 sall $1 + ZBASE_SHIFT, %eax
1331 sall $1 + ZBASE_SHIFT, %eax
1342 sall $1 + ZBASE_SHIFT, %eax
1346 #if defined(LT) || defined(RN)
1349 sall $1 + ZBASE_SHIFT, %eax
1382 sall $ZBASE_SHIFT, %eax
1386 #if defined(LN) || defined(RT)
1389 sall $ZBASE_SHIFT, %eax
1391 leal (BB, %eax, 4), BB
1399 #if defined(LT) || defined(RN)
1410 movaps 0 * SIZE(B), %xmm3
1411 movaps 4 * SIZE(B), %xmm7
1413 pshufd $0x00, %xmm3, %xmm0
1414 pshufd $0x55, %xmm3, %xmm1
1415 pshufd $0xaa, %xmm3, %xmm2
1416 pshufd $0xff, %xmm3, %xmm3
1418 movaps %xmm0, 0 * SIZE(BB)
1419 movaps %xmm1, 4 * SIZE(BB)
1420 movaps %xmm2, 8 * SIZE(BB)
1421 movaps %xmm3, 12 * SIZE(BB)
1423 pshufd $0x00, %xmm7, %xmm4
1424 pshufd $0x55, %xmm7, %xmm5
1425 pshufd $0xaa, %xmm7, %xmm6
1426 pshufd $0xff, %xmm7, %xmm7
1428 movaps %xmm4, 16 * SIZE(BB)
1429 movaps %xmm5, 20 * SIZE(BB)
1430 movaps %xmm6, 24 * SIZE(BB)
1431 movaps %xmm7, 28 * SIZE(BB)
1440 #if defined(LT) || defined(RN)
1455 movsd 0 * SIZE(B), %xmm3
1457 pshufd $0x00, %xmm3, %xmm0
1458 pshufd $0x55, %xmm3, %xmm1
1460 movaps %xmm0, 0 * SIZE(BB)
1461 movaps %xmm1, 4 * SIZE(BB)
1463 addl $ 2 * SIZE, %edi
1464 addl $ 8 * SIZE, %ecx
1470 #if defined(LT) || defined(RN)
1494 sall $ZBASE_SHIFT, %eax
1498 #if defined(LN) || defined(RT)
1503 sall $ZBASE_SHIFT, %eax
1507 leal BUFFER, BB # boffset1 = boffset
1509 #if defined(LN) || defined(RT)
1511 sall $2 + ZBASE_SHIFT, %eax
1518 movsd 0 * SIZE(AA), %xmm0
1523 movsd 8 * SIZE(AA), %xmm1
1525 movaps 0 * SIZE(BB), %xmm2
1527 movaps 16 * SIZE(BB), %xmm3
1530 #if defined(LT) || defined(RN)
1543 movaps 4 * SIZE(BB), %xmm2
1545 movsd 2 * SIZE(AA), %xmm0
1547 movaps 8 * SIZE(BB), %xmm2
1550 movaps 12 * SIZE(BB), %xmm2
1552 movsd 4 * SIZE(AA), %xmm0
1554 movaps 32 * SIZE(BB), %xmm2
1557 movaps 20 * SIZE(BB), %xmm3
1559 movsd 6 * SIZE(AA), %xmm0
1561 movaps 24 * SIZE(BB), %xmm3
1564 movaps 28 * SIZE(BB), %xmm3
1566 movsd 16 * SIZE(AA), %xmm0
1568 movaps 48 * SIZE(BB), %xmm3
1571 movaps 36 * SIZE(BB), %xmm2
1573 movsd 10 * SIZE(AA), %xmm1
1575 movaps 40 * SIZE(BB), %xmm2
1578 movaps 44 * SIZE(BB), %xmm2
1580 movsd 12 * SIZE(AA), %xmm1
1582 movaps 64 * SIZE(BB), %xmm2
1585 movaps 52 * SIZE(BB), %xmm3
1587 movsd 14 * SIZE(AA), %xmm1
1589 movaps 56 * SIZE(BB), %xmm3
1592 movaps 60 * SIZE(BB), %xmm3
1594 movsd 24 * SIZE(AA), %xmm1
1596 movaps 80 * SIZE(BB), %xmm3
1598 addl $ 16 * SIZE, AA
1599 addl $ 64 * SIZE, BB
1605 #if defined(LT) || defined(RN)
1611 andl $7, %eax # if (k & 1)
1618 mulps 4 * SIZE(BB), %xmm0
1620 movaps 8 * SIZE(BB), %xmm2
1622 movsd 2 * SIZE(AA), %xmm0
1634 movaps POSINV, %xmm0
1636 shufps $0xb1, %xmm5, %xmm5
1638 #if defined(LN) || defined(LT)
1650 #if defined(LN) || defined(RT)
1658 sall $ZBASE_SHIFT, %eax
1661 leal (BB, %eax, 4), BB
1664 #if defined(LN) || defined(LT)
1668 movsd 0 * SIZE(B), %xmm2
1675 movsd 0 * SIZE(AA), %xmm1
1680 #if defined(LN) || defined(LT)
1681 movaps 0 * SIZE(AA), %xmm5
1683 pshufd $0x44, %xmm5, %xmm6
1684 pshufd $0x11, %xmm5, %xmm7
1686 pshufd $0xa0, %xmm2, %xmm4
1687 pshufd $0xf5, %xmm2, %xmm2
1700 #if defined(RN) || defined(RT)
1701 movaps 0 * SIZE(B), %xmm4
1703 pshufd $0x44, %xmm4, %xmm6
1704 pshufd $0x11, %xmm4, %xmm7
1706 pshufd $0xa0, %xmm1, %xmm3
1707 pshufd $0xf5, %xmm1, %xmm1
1725 #if defined(LN) || defined(LT)
1726 movlps %xmm2, 0 * SIZE(B)
1728 pshufd $0x00, %xmm2, %xmm0
1729 pshufd $0x55, %xmm2, %xmm1
1731 movaps %xmm0, 0 * SIZE(BB)
1732 movaps %xmm1, 4 * SIZE(BB)
1734 movlps %xmm2, 0 * SIZE(CO1)
1736 movlps %xmm1, 0 * SIZE(AA)
1738 movlps %xmm1, 0 * SIZE(CO1)
1745 #if defined(LT) || defined(RN)
1748 sall $ZBASE_SHIFT, %eax
1767 sall $ZBASE_SHIFT, %eax
1781 sall $1 + ZBASE_SHIFT, %eax
1785 #if defined(LN) || defined(RT)
1790 sall $1 + ZBASE_SHIFT, %eax
1794 leal BUFFER, BB # boffset1 = boffset
1796 #if defined(LN) || defined(RT)
1798 sall $2 + ZBASE_SHIFT, %eax
1807 movaps 0 * SIZE(AA), %xmm0
1808 movaps 16 * SIZE(AA), %xmm1
1809 movaps 0 * SIZE(BB), %xmm2
1810 movaps 16 * SIZE(BB), %xmm3
1812 PREFETCHW -4 * SIZE(CO1)
1814 #if defined(LT) || defined(RN)
1827 movaps 4 * SIZE(BB), %xmm2
1829 movaps 4 * SIZE(AA), %xmm0
1831 movaps 8 * SIZE(BB), %xmm2
1834 movaps 12 * SIZE(BB), %xmm2
1836 movaps 8 * SIZE(AA), %xmm0
1838 movaps 32 * SIZE(BB), %xmm2
1841 movaps 20 * SIZE(BB), %xmm3
1843 movaps 12 * SIZE(AA), %xmm0
1845 movaps 24 * SIZE(BB), %xmm3
1848 movaps 28 * SIZE(BB), %xmm3
1850 movaps 32 * SIZE(AA), %xmm0
1852 movaps 48 * SIZE(BB), %xmm3
1855 movaps 36 * SIZE(BB), %xmm2
1857 movaps 20 * SIZE(AA), %xmm1
1859 movaps 40 * SIZE(BB), %xmm2
1862 movaps 44 * SIZE(BB), %xmm2
1864 movaps 24 * SIZE(AA), %xmm1
1866 movaps 64 * SIZE(BB), %xmm2
1869 movaps 52 * SIZE(BB), %xmm3
1871 movaps 28 * SIZE(AA), %xmm1
1873 movaps 56 * SIZE(BB), %xmm3
1876 movaps 60 * SIZE(BB), %xmm3
1878 movaps 48 * SIZE(AA), %xmm1
1880 movaps 80 * SIZE(BB), %xmm3
1882 addl $ 32 * SIZE, AA
1883 addl $ 64 * SIZE, BB
1889 #if defined(LT) || defined(RN)
1895 andl $7, %eax # if (k & 1)
1902 mulps 4 * SIZE(BB), %xmm0
1904 movaps 8 * SIZE(BB), %xmm2
1906 movaps 4 * SIZE(AA), %xmm0
1918 movaps POSINV, %xmm0
1920 shufps $0xb1, %xmm5, %xmm5
1922 #if defined(LN) || defined(LT)
1934 #if defined(LN) || defined(RT)
1946 sall $ZBASE_SHIFT, %eax
1947 leal (AA, %eax, 2), AA
1948 leal (B, %eax, 1), B
1949 leal (BB, %eax, 4), BB
1952 #if defined(LN) || defined(LT)
1954 unpcklpd %xmm6, %xmm4
1955 unpckhpd %xmm6, %xmm5
1960 movsd 0 * SIZE(B), %xmm2
1964 movsd 2 * SIZE(B), %xmm3
1969 movaps 0 * SIZE(AA), %xmm1
1975 movaps 4 * SIZE(AA), %xmm5
1977 pshufd $0xee, %xmm5, %xmm6
1978 pshufd $0xbb, %xmm5, %xmm7
1980 pshufd $0xa0, %xmm3, %xmm4
1981 pshufd $0xf5, %xmm3, %xmm3
1993 pshufd $0x44, %xmm5, %xmm6
1994 pshufd $0x11, %xmm5, %xmm7
1996 pshufd $0xa0, %xmm3, %xmm4
1997 pshufd $0xf5, %xmm3, %xmm1
2010 movaps 0 * SIZE(AA), %xmm5
2012 pshufd $0x44, %xmm5, %xmm6
2013 pshufd $0x11, %xmm5, %xmm7
2015 pshufd $0xa0, %xmm2, %xmm4
2016 pshufd $0xf5, %xmm2, %xmm2
2030 movaps 0 * SIZE(AA), %xmm5
2032 pshufd $0x44, %xmm5, %xmm6
2033 pshufd $0x11, %xmm5, %xmm7
2035 pshufd $0xa0, %xmm2, %xmm4
2036 pshufd $0xf5, %xmm2, %xmm2
2048 pshufd $0xee, %xmm5, %xmm6
2049 pshufd $0xbb, %xmm5, %xmm7
2051 pshufd $0xa0, %xmm2, %xmm4
2052 pshufd $0xf5, %xmm2, %xmm1
2065 movaps 4 * SIZE(AA), %xmm5
2067 pshufd $0xee, %xmm5, %xmm6
2068 pshufd $0xbb, %xmm5, %xmm7
2070 pshufd $0xa0, %xmm3, %xmm4
2071 pshufd $0xf5, %xmm3, %xmm3
2084 #if defined(RN) || defined(RT)
2085 movaps 0 * SIZE(B), %xmm4
2087 pshufd $0x44, %xmm4, %xmm6
2088 pshufd $0x11, %xmm4, %xmm7
2090 pshufd $0xa0, %xmm1, %xmm3
2091 pshufd $0xf5, %xmm1, %xmm1
2109 #if defined(LN) || defined(LT)
2110 movlps %xmm2, 0 * SIZE(B)
2111 movlps %xmm3, 2 * SIZE(B)
2113 pshufd $0x00, %xmm2, %xmm0
2114 pshufd $0x55, %xmm2, %xmm1
2116 movaps %xmm0, 0 * SIZE(BB)
2117 movaps %xmm1, 4 * SIZE(BB)
2119 pshufd $0x00, %xmm3, %xmm0
2120 pshufd $0x55, %xmm3, %xmm1
2122 movaps %xmm0, 8 * SIZE(BB)
2123 movaps %xmm1, 12 * SIZE(BB)
2125 movlps %xmm2, 0 * SIZE(CO1)
2126 movlps %xmm3, 2 * SIZE(CO1)
2128 movaps %xmm1, 0 * SIZE(AA)
2130 movlps %xmm1, 0 * SIZE(CO1)
2131 movhps %xmm1, 2 * SIZE(CO1)
2138 #if defined(LT) || defined(RN)
2141 sall $1 + ZBASE_SHIFT, %eax
2160 sall $1 + ZBASE_SHIFT, %eax
2171 sall $ZBASE_SHIFT, %eax
2175 #if defined(LT) || defined(RN)
2178 sall $ZBASE_SHIFT, %eax
2194 movl OLD_STACK, %esp