1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin. */
3 /* All rights reserved. */
5 /* Redistribution and use in source and binary forms, with or */
6 /* without modification, are permitted provided that the following */
7 /* conditions are met: */
9 /* 1. Redistributions of source code must retain the above */
10 /* copyright notice, this list of conditions and the following */
13 /* 2. Redistributions in binary form must reproduce the above */
14 /* copyright notice, this list of conditions and the following */
15 /* disclaimer in the documentation and/or other materials */
16 /* provided with the distribution. */
18 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31 /* POSSIBILITY OF SUCH DAMAGE. */
33 /* The views and conclusions contained in the software and */
34 /* documentation are those of the authors and should not be */
35 /* interpreted as representing official policies, either expressed */
36 /* or implied, of The University of Texas at Austin. */
37 /*********************************************************************/
/* Incoming arguments, addressed relative to %esi, which the prologue
   loads with the caller's stack pointer ("movl %esp, %esi" below)
   before %esp is realigned.  STACK and ARGS are frame-layout constants
   defined elsewhere in the build.
   NOTE(review): offsets 16/20 (presumably alpha_r/alpha_i) are not
   visible in this excerpt of the file — confirm against the full source. */
45 #define STACK_M 4 + STACK + ARGS(%esi)
46 #define STACK_N 8 + STACK + ARGS(%esi)
47 #define STACK_K 12 + STACK + ARGS(%esi)
48 #define STACK_A 24 + STACK + ARGS(%esi)
49 #define STACK_B 28 + STACK + ARGS(%esi)
50 #define STACK_C 32 + STACK + ARGS(%esi)
51 #define STACK_LDC 36 + STACK + ARGS(%esi)
52 #define STACK_OFFT 40 + STACK + ARGS(%esi)
/* Scratch slots carved out of the realigned local frame (%esp-relative).
   POSINV holds a sign mask written via the movss stores below; OLD_STACK
   holds the saved %esp restored at function exit; BUFFER is the packed
   copy of the B panel ("leal BUFFER, BB" below). */
54 #define POSINV 0(%esp)
61 #define OLD_STACK 40(%esp)
62 #define OFFSET 48(%esp)
65 #define AORIG 60(%esp)
66 #define BORIG 64(%esp)
67 #define BUFFER 128(%esp)
/* The prologue rounds %esp down to a STACK_ALIGN boundary and then adds
   STACK_OFFSET back ("andl $-STACK_ALIGN, %esp" / "addl $STACK_OFFSET, %esp"),
   giving an aligned frame with known low-order address bits. */
75 #define STACK_ALIGN 4096
76 #define STACK_OFFSET 1024
/* Per-microarchitecture prefetch tuning: selects which prefetch opcode
   variant to use for the read (PREFETCH) and write (PREFETCHW) streams,
   and how far ahead to prefetch (PREFETCHSIZE, in SIZE-element units).
   AMD cores use prefetch/prefetchw; Intel cores use prefetcht1/prefetcht0.
   NOTE(review): the matching #endif lines for these #if groups are not
   visible in this excerpt — they exist in the full source. */
78 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
79 #define PREFETCHSIZE (16 * 10 + 8)
80 #define WPREFETCHSIZE 112
81 #define PREFETCH prefetch
82 #define PREFETCHW prefetchw
85 #if defined(PENTIUM4) || defined(PENTIUMM)
86 #define PREFETCH prefetcht1
87 #define PREFETCHSIZE 168
88 #define PREFETCHW prefetcht0
91 #if defined(PENRYN) || defined(DUNNINGTON)
92 #define PREFETCH prefetcht1
93 #define PREFETCHSIZE 168
94 #define PREFETCHW prefetcht0
/* Fallback path selector; the body guarded by this #if is not visible
   in this excerpt. */
97 #if defined(OPTERON) || !defined(HAVE_SSE2)
/* KERNEL1..KERNEL8: eight software-pipelined steps of the unrolled inner
   product.  Each step multiplies the current A vector (%xmm0 for steps 1-4,
   %xmm1 for steps 5-8) by four consecutive broadcast B vectors loaded from
   BB, accumulating into %xmm4-%xmm7, and preloads the B vector and A vector
   needed by a later step so loads overlap the multiplies.
   KERNEL1 additionally issues a software prefetch on the A stream
   (PREFETCH/PREFETCHSIZE chosen per-CPU above).  %xmm2 carries the B
   operand on odd steps, %xmm3 on even steps. */
105 #define KERNEL1(address) \
106 mulps %xmm0, %xmm2; \
107 PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \
108 addps %xmm2, %xmm4; \
109 movaps 4 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
110 mulps %xmm0, %xmm2; \
111 addps %xmm2, %xmm5; \
112 movaps 8 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
113 mulps %xmm0, %xmm2; \
114 mulps 12 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
115 addps %xmm2, %xmm6; \
116 movaps 32 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
117 addps %xmm0, %xmm7; \
118 movaps 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0
/* KERNEL2: second pipelined step.  Same accumulation pattern as KERNEL1
   (A in %xmm0, results into %xmm4-%xmm7) but the B operand cycles through
   %xmm3, reading BB offsets 16..28 and preloading offset 48 plus the next
   A vector (AA offset 8). */
120 #define KERNEL2(address) \
121 mulps %xmm0, %xmm3; \
122 addps %xmm3, %xmm4; \
123 movaps 20 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
124 mulps %xmm0, %xmm3; \
125 addps %xmm3, %xmm5; \
126 movaps 24 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
127 mulps %xmm0, %xmm3; \
128 mulps 28 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
129 addps %xmm3, %xmm6; \
130 movaps 48 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
131 addps %xmm0, %xmm7; \
132 movaps 8 * SIZE + (address) * 1 * SIZE(AA), %xmm0
/* KERNEL3: third pipelined step.  A in %xmm0, B operand in %xmm2,
   consuming BB offsets 36..44 (offset 32 was preloaded by KERNEL1) and
   preloading BB offset 64 and the next A vector (AA offset 12). */
134 #define KERNEL3(address) \
135 mulps %xmm0, %xmm2; \
136 addps %xmm2, %xmm4; \
137 movaps 36 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
138 mulps %xmm0, %xmm2; \
139 addps %xmm2, %xmm5; \
140 movaps 40 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
141 mulps %xmm0, %xmm2; \
142 mulps 44 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
143 addps %xmm2, %xmm6; \
144 movaps 64 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
145 addps %xmm0, %xmm7; \
146 movaps 12 * SIZE + (address) * 1 * SIZE(AA), %xmm0
/* KERNEL4: fourth pipelined step; last step that uses %xmm0 as the A
   operand.  Consumes BB offsets 52..60, preloads BB offset 80, and
   refills %xmm0 from AA offset 32 for the *next* 8-step group (offsets
   16..28 of this group feed %xmm1 in KERNEL5-8). */
148 #define KERNEL4(address) \
149 mulps %xmm0, %xmm3; \
150 addps %xmm3, %xmm4; \
151 movaps 52 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
152 mulps %xmm0, %xmm3; \
153 addps %xmm3, %xmm5; \
154 movaps 56 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
155 mulps %xmm0, %xmm3; \
156 mulps 60 * SIZE + (address) * 4 * SIZE(BB), %xmm0; \
157 addps %xmm3, %xmm6; \
158 movaps 80 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
159 addps %xmm0, %xmm7; \
160 movaps 32 * SIZE + (address) * 1 * SIZE(AA), %xmm0
/* KERNEL5: fifth pipelined step; switches the A operand to %xmm1
   (preloaded outside the macro group).  Consumes BB offsets 68..76,
   preloads BB offset 96 and the next A vector (AA offset 20). */
162 #define KERNEL5(address) \
163 mulps %xmm1, %xmm2; \
164 addps %xmm2, %xmm4; \
165 movaps 68 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
166 mulps %xmm1, %xmm2; \
167 addps %xmm2, %xmm5; \
168 movaps 72 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
169 mulps %xmm1, %xmm2; \
170 mulps 76 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
171 addps %xmm2, %xmm6; \
172 movaps 96 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
173 addps %xmm1, %xmm7; \
174 movaps 20 * SIZE + (address) * 1 * SIZE(AA), %xmm1
/* KERNEL6: sixth pipelined step.  A in %xmm1, B operand in %xmm3,
   consuming BB offsets 84..92, preloading BB offset 112 and the next
   A vector (AA offset 24). */
176 #define KERNEL6(address) \
177 mulps %xmm1, %xmm3; \
178 addps %xmm3, %xmm4; \
179 movaps 84 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
180 mulps %xmm1, %xmm3; \
181 addps %xmm3, %xmm5; \
182 movaps 88 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
183 mulps %xmm1, %xmm3; \
184 mulps 92 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
185 addps %xmm3, %xmm6; \
186 movaps 112 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
187 addps %xmm1, %xmm7; \
188 movaps 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1
/* KERNEL7: seventh pipelined step.  A in %xmm1, B operand in %xmm2,
   consuming BB offsets 100..108, preloading BB offset 128 (first vector
   of the following 128-element B group) and the next A vector
   (AA offset 28). */
190 #define KERNEL7(address) \
191 mulps %xmm1, %xmm2; \
192 addps %xmm2, %xmm4; \
193 movaps 100 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
194 mulps %xmm1, %xmm2; \
195 addps %xmm2, %xmm5; \
196 movaps 104 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
197 mulps %xmm1, %xmm2; \
198 mulps 108 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
199 addps %xmm2, %xmm6; \
200 movaps 128 * SIZE + (address) * 4 * SIZE(BB), %xmm2; \
201 addps %xmm1, %xmm7; \
202 movaps 28 * SIZE + (address) * 1 * SIZE(AA), %xmm1
/* KERNEL8: eighth and final pipelined step of the group.  A in %xmm1,
   B operand in %xmm3, consuming BB offsets 116..124, preloading BB
   offset 144 and refilling %xmm1 from AA offset 48 for the next group.
   After the group, the loop advances AA/BB past the consumed data
   (see the addl adjustments in the main loop, not part of this macro). */
204 #define KERNEL8(address) \
205 mulps %xmm1, %xmm3; \
206 addps %xmm3, %xmm4; \
207 movaps 116 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
208 mulps %xmm1, %xmm3; \
209 addps %xmm3, %xmm5; \
210 movaps 120 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
211 mulps %xmm1, %xmm3; \
212 mulps 124 * SIZE + (address) * 4 * SIZE(BB), %xmm1; \
213 addps %xmm3, %xmm6; \
214 movaps 144 * SIZE + (address) * 4 * SIZE(BB), %xmm3; \
215 addps %xmm1, %xmm7; \
216 movaps 48 * SIZE + (address) * 1 * SIZE(AA), %xmm1;
227 movl %esp, %esi # save old stack
229 subl $128 + LOCAL_BUFFER_SIZE + STACK_OFFSET, %esp
230 andl $-STACK_ALIGN, %esp # align stack
231 addl $STACK_OFFSET, %esp
248 movss STACK_OFFT, %xmm4
256 movss %xmm7, 0 + POSINV
257 movss %xmm2, 4 + POSINV
258 movss %xmm7, 8 + POSINV
259 movss %xmm2, 12 + POSINV
261 movss %xmm2, 0 + POSINV
262 movss %xmm7, 4 + POSINV
263 movss %xmm2, 8 + POSINV
264 movss %xmm7, 12 + POSINV
275 sall $ZBASE_SHIFT, LDC
279 sall $ZBASE_SHIFT, %eax
287 sall $ZBASE_SHIFT, %eax
323 sall $1 + ZBASE_SHIFT, %eax
327 #if defined(LN) || defined(RT)
330 sall $1 + ZBASE_SHIFT, %eax
332 leal (BB, %eax, 4), BB
340 #if defined(LT) || defined(RN)
351 movaps 0 * SIZE(B), %xmm3
352 movaps 4 * SIZE(B), %xmm7
354 pshufd $0x00, %xmm3, %xmm0
355 pshufd $0x55, %xmm3, %xmm1
356 pshufd $0xaa, %xmm3, %xmm2
357 pshufd $0xff, %xmm3, %xmm3
359 movaps %xmm0, 0 * SIZE(BB)
360 movaps %xmm1, 4 * SIZE(BB)
361 movaps %xmm2, 8 * SIZE(BB)
362 movaps %xmm3, 12 * SIZE(BB)
364 pshufd $0x00, %xmm7, %xmm4
365 pshufd $0x55, %xmm7, %xmm5
366 pshufd $0xaa, %xmm7, %xmm6
367 pshufd $0xff, %xmm7, %xmm7
369 movaps %xmm4, 16 * SIZE(BB)
370 movaps %xmm5, 20 * SIZE(BB)
371 movaps %xmm6, 24 * SIZE(BB)
372 movaps %xmm7, 28 * SIZE(BB)
382 #if defined(LT) || defined(RN)
394 movaps 0 * SIZE(B), %xmm3
396 pshufd $0x00, %xmm3, %xmm0
397 pshufd $0x55, %xmm3, %xmm1
398 pshufd $0xaa, %xmm3, %xmm2
399 pshufd $0xff, %xmm3, %xmm3
401 movaps %xmm0, 0 * SIZE(BB)
402 movaps %xmm1, 4 * SIZE(BB)
403 movaps %xmm2, 8 * SIZE(BB)
404 movaps %xmm3, 12 * SIZE(BB)
410 #if defined(LT) || defined(RN)
418 leal (, LDC, 2), %eax
438 sall $1 + ZBASE_SHIFT, %eax
442 #if defined(LN) || defined(RT)
447 sall $1 + ZBASE_SHIFT, %eax
451 leal BUFFER, BB # boffset1 = boffset
453 #if defined(LN) || defined(RT)
455 sall $3 + ZBASE_SHIFT, %eax
459 movaps 0 * SIZE(AA), %xmm0
461 movaps 16 * SIZE(AA), %xmm1
463 movaps 0 * SIZE(BB), %xmm2
465 movaps 16 * SIZE(BB), %xmm3
468 PREFETCHW 3 * SIZE(CO1)
469 PREFETCHW 3 * SIZE(CO1, LDC)
471 #if defined(LT) || defined(RN)
498 #if defined(LT) || defined(RN)
504 andl $7, %eax # if (k & 1)
512 movaps 4 * SIZE(BB), %xmm2
515 movaps 8 * SIZE(BB), %xmm2
517 mulps 12 * SIZE(BB), %xmm0
519 movaps 16 * SIZE(BB), %xmm2
521 movaps 4 * SIZE(AA), %xmm0
532 shufps $0xb1, %xmm5, %xmm5
533 shufps $0xb1, %xmm7, %xmm7
535 #if defined(LN) || defined(LT)
551 #if defined(LN) || defined(RT)
563 sall $ZBASE_SHIFT, %eax
564 leal (AA, %eax, 2), AA
566 leal (BB, %eax, 8), BB
569 #if defined(LN) || defined(LT)
571 unpcklpd %xmm6, %xmm4
572 unpckhpd %xmm6, %xmm5
574 movaps 0 * SIZE(B), %xmm2
575 movaps 4 * SIZE(B), %xmm3
580 movaps 0 * SIZE(AA), %xmm1
581 movaps 4 * SIZE(AA), %xmm5
588 movaps 4 * SIZE(AA), %xmm5
590 pshufd $0xee, %xmm5, %xmm6
591 pshufd $0xbb, %xmm5, %xmm7
593 pshufd $0xa0, %xmm3, %xmm4
594 pshufd $0xf5, %xmm3, %xmm3
606 pshufd $0x44, %xmm5, %xmm6
607 pshufd $0x11, %xmm5, %xmm7
609 pshufd $0xa0, %xmm3, %xmm4
610 pshufd $0xf5, %xmm3, %xmm1
623 movaps 0 * SIZE(AA), %xmm5
625 pshufd $0x44, %xmm5, %xmm6
626 pshufd $0x11, %xmm5, %xmm7
628 pshufd $0xa0, %xmm2, %xmm4
629 pshufd $0xf5, %xmm2, %xmm2
643 movaps 0 * SIZE(AA), %xmm5
645 pshufd $0x44, %xmm5, %xmm6
646 pshufd $0x11, %xmm5, %xmm7
648 pshufd $0xa0, %xmm2, %xmm4
649 pshufd $0xf5, %xmm2, %xmm2
661 pshufd $0xee, %xmm5, %xmm6
662 pshufd $0xbb, %xmm5, %xmm7
664 pshufd $0xa0, %xmm2, %xmm4
665 pshufd $0xf5, %xmm2, %xmm1
678 movaps 4 * SIZE(AA), %xmm5
680 pshufd $0xee, %xmm5, %xmm6
681 pshufd $0xbb, %xmm5, %xmm7
683 pshufd $0xa0, %xmm3, %xmm4
684 pshufd $0xf5, %xmm3, %xmm3
698 movaps 0 * SIZE(B), %xmm4
700 pshufd $0x44, %xmm4, %xmm6
701 pshufd $0x11, %xmm4, %xmm7
703 pshufd $0xa0, %xmm1, %xmm3
704 pshufd $0xf5, %xmm1, %xmm1
717 pshufd $0xee, %xmm4, %xmm6
718 pshufd $0xbb, %xmm4, %xmm7
720 pshufd $0xa0, %xmm1, %xmm3
721 pshufd $0xf5, %xmm1, %xmm2
735 movaps 4 * SIZE(B), %xmm4
737 pshufd $0xee, %xmm4, %xmm6
738 pshufd $0xbb, %xmm4, %xmm7
740 pshufd $0xa0, %xmm5, %xmm3
741 pshufd $0xf5, %xmm5, %xmm5
756 movaps 4 * SIZE(B), %xmm4
758 pshufd $0xee, %xmm4, %xmm6
759 pshufd $0xbb, %xmm4, %xmm7
761 pshufd $0xa0, %xmm5, %xmm3
762 pshufd $0xf5, %xmm5, %xmm5
775 pshufd $0x44, %xmm4, %xmm6
776 pshufd $0x11, %xmm4, %xmm7
778 pshufd $0xa0, %xmm5, %xmm3
779 pshufd $0xf5, %xmm5, %xmm2
793 movaps 0 * SIZE(B), %xmm4
795 pshufd $0x44, %xmm4, %xmm6
796 pshufd $0x11, %xmm4, %xmm7
798 pshufd $0xa0, %xmm1, %xmm3
799 pshufd $0xf5, %xmm1, %xmm1
817 #if defined(LN) || defined(LT)
818 movaps %xmm2, 0 * SIZE(B)
819 movaps %xmm3, 4 * SIZE(B)
821 pshufd $0x00, %xmm2, %xmm0
822 pshufd $0x55, %xmm2, %xmm1
823 pshufd $0xaa, %xmm2, %xmm4
824 pshufd $0xff, %xmm2, %xmm5
826 movaps %xmm0, 0 * SIZE(BB)
827 movaps %xmm1, 4 * SIZE(BB)
828 movaps %xmm4, 8 * SIZE(BB)
829 movaps %xmm5, 12 * SIZE(BB)
831 pshufd $0x00, %xmm3, %xmm0
832 pshufd $0x55, %xmm3, %xmm1
833 pshufd $0xaa, %xmm3, %xmm4
834 pshufd $0xff, %xmm3, %xmm5
836 movaps %xmm0, 16 * SIZE(BB)
837 movaps %xmm1, 20 * SIZE(BB)
838 movaps %xmm4, 24 * SIZE(BB)
839 movaps %xmm5, 28 * SIZE(BB)
841 movlps %xmm2, 0 * SIZE(CO1)
842 movlps %xmm3, 2 * SIZE(CO1)
843 movhps %xmm2, 0 * SIZE(CO1, LDC)
844 movhps %xmm3, 2 * SIZE(CO1, LDC)
846 movaps %xmm1, 0 * SIZE(AA)
847 movaps %xmm5, 4 * SIZE(AA)
849 movlps %xmm1, 0 * SIZE(CO1)
850 movhps %xmm1, 2 * SIZE(CO1)
852 movlps %xmm5, 0 * SIZE(CO1, LDC)
853 movhps %xmm5, 2 * SIZE(CO1, LDC)
860 #if defined(LT) || defined(RN)
863 sall $1 + ZBASE_SHIFT, %eax
882 sall $1 + ZBASE_SHIFT, %eax
899 sall $ZBASE_SHIFT, %eax
903 #if defined(LN) || defined(RT)
908 sall $ZBASE_SHIFT, %eax
912 leal BUFFER, BB # boffset1 = boffset
914 #if defined(LN) || defined(RT)
916 sall $3 + ZBASE_SHIFT, %eax
928 movsd 0 * SIZE(AA), %xmm0
932 movsd 8 * SIZE(AA), %xmm1
933 movaps 0 * SIZE(BB), %xmm2
934 movaps 16 * SIZE(BB), %xmm3
936 #if defined(LT) || defined(RN)
948 prefetcht1 (PREFETCHSIZE + 0) * SIZE(AA)
950 movaps 4 * SIZE(BB), %xmm2
953 movaps 8 * SIZE(BB), %xmm2
955 mulps 12 * SIZE(BB), %xmm0
957 movaps 32 * SIZE(BB), %xmm2
959 movsd 2 * SIZE(AA), %xmm0
962 movaps 20 * SIZE(BB), %xmm3
965 movaps 24 * SIZE(BB), %xmm3
967 mulps 28 * SIZE(BB), %xmm0
969 movaps 48 * SIZE(BB), %xmm3
971 movsd 4 * SIZE(AA), %xmm0
974 movaps 36 * SIZE(BB), %xmm2
977 movaps 40 * SIZE(BB), %xmm2
979 mulps 44 * SIZE(BB), %xmm0
981 movaps 64 * SIZE(BB), %xmm2
983 movsd 6 * SIZE(AA), %xmm0
986 movaps 52 * SIZE(BB), %xmm3
989 movaps 56 * SIZE(BB), %xmm3
991 mulps 60 * SIZE(BB), %xmm0
993 movaps 80 * SIZE(BB), %xmm3
995 movsd 16 * SIZE(AA), %xmm0
997 #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
998 prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
1001 movaps 68 * SIZE(BB), %xmm2
1004 movaps 72 * SIZE(BB), %xmm2
1006 mulps 76 * SIZE(BB), %xmm1
1008 movaps 96 * SIZE(BB), %xmm2
1010 movsd 10 * SIZE(AA), %xmm1
1013 movaps 84 * SIZE(BB), %xmm3
1016 movaps 88 * SIZE(BB), %xmm3
1018 mulps 92 * SIZE(BB), %xmm1
1020 movaps 112 * SIZE(BB), %xmm3
1022 movsd 12 * SIZE(AA), %xmm1
1025 movaps 100 * SIZE(BB), %xmm2
1028 movaps 104 * SIZE(BB), %xmm2
1030 mulps 108 * SIZE(BB), %xmm1
1032 movaps 128 * SIZE(BB), %xmm2
1034 movsd 14 * SIZE(AA), %xmm1
1037 movaps 116 * SIZE(BB), %xmm3
1040 movaps 120 * SIZE(BB), %xmm3
1042 mulps 124 * SIZE(BB), %xmm1
1044 movaps 144 * SIZE(BB), %xmm3
1046 movsd 24 * SIZE(AA), %xmm1
1047 addl $ 16 * SIZE, AA
1048 addl $128 * SIZE, BB
1054 #if defined(LT) || defined(RN)
1060 andl $7, %eax # if (k & 1)
1068 movaps 4 * SIZE(BB), %xmm2
1071 movaps 8 * SIZE(BB), %xmm2
1073 mulps 12 * SIZE(BB), %xmm0
1075 movaps 16 * SIZE(BB), %xmm2
1077 movsd 2 * SIZE(AA), %xmm0
1086 movaps POSINV, %xmm0
1088 shufps $0xb1, %xmm5, %xmm5
1089 shufps $0xb1, %xmm7, %xmm7
1091 #if defined(LN) || defined(LT)
1107 #if defined(LN) || defined(RT)
1119 sall $ZBASE_SHIFT, %eax
1120 leal (AA, %eax, 1), AA
1121 leal (B, %eax, 2), B
1122 leal (BB, %eax, 8), BB
1125 #if defined(LN) || defined(LT)
1126 unpcklpd %xmm6, %xmm4
1128 movaps 0 * SIZE(B), %xmm2
1135 movsd 0 * SIZE(AA), %xmm1
1139 movsd 2 * SIZE(AA), %xmm5
1145 #if defined(LN) || defined(LT)
1146 movaps 0 * SIZE(AA), %xmm5
1148 pshufd $0x44, %xmm5, %xmm6
1149 pshufd $0x11, %xmm5, %xmm7
1151 pshufd $0xa0, %xmm2, %xmm4
1152 pshufd $0xf5, %xmm2, %xmm2
1166 movaps 0 * SIZE(B), %xmm4
1168 pshufd $0x44, %xmm4, %xmm6
1169 pshufd $0x11, %xmm4, %xmm7
1171 pshufd $0xa0, %xmm1, %xmm3
1172 pshufd $0xf5, %xmm1, %xmm1
1185 pshufd $0xee, %xmm4, %xmm6
1186 pshufd $0xbb, %xmm4, %xmm7
1188 pshufd $0xa0, %xmm1, %xmm3
1189 pshufd $0xf5, %xmm1, %xmm2
1203 movaps 4 * SIZE(B), %xmm4
1205 pshufd $0xee, %xmm4, %xmm6
1206 pshufd $0xbb, %xmm4, %xmm7
1208 pshufd $0xa0, %xmm5, %xmm3
1209 pshufd $0xf5, %xmm5, %xmm5
1224 movaps 4 * SIZE(B), %xmm4
1226 pshufd $0xee, %xmm4, %xmm6
1227 pshufd $0xbb, %xmm4, %xmm7
1229 pshufd $0xa0, %xmm5, %xmm3
1230 pshufd $0xf5, %xmm5, %xmm5
1243 pshufd $0x44, %xmm4, %xmm6
1244 pshufd $0x11, %xmm4, %xmm7
1246 pshufd $0xa0, %xmm5, %xmm3
1247 pshufd $0xf5, %xmm5, %xmm2
1261 movaps 0 * SIZE(B), %xmm4
1263 pshufd $0x44, %xmm4, %xmm6
1264 pshufd $0x11, %xmm4, %xmm7
1266 pshufd $0xa0, %xmm1, %xmm3
1267 pshufd $0xf5, %xmm1, %xmm1
1285 #if defined(LN) || defined(LT)
1286 movaps %xmm2, 0 * SIZE(B)
1288 pshufd $0x00, %xmm2, %xmm0
1289 pshufd $0x55, %xmm2, %xmm1
1290 pshufd $0xaa, %xmm2, %xmm4
1291 pshufd $0xff, %xmm2, %xmm5
1293 movaps %xmm0, 0 * SIZE(BB)
1294 movaps %xmm1, 4 * SIZE(BB)
1295 movaps %xmm4, 8 * SIZE(BB)
1296 movaps %xmm5, 12 * SIZE(BB)
1298 movlps %xmm2, 0 * SIZE(CO1)
1299 movhps %xmm2, 0 * SIZE(CO1, LDC)
1301 movlps %xmm1, 0 * SIZE(AA)
1302 movlps %xmm5, 2 * SIZE(AA)
1304 movlps %xmm1, 0 * SIZE(CO1)
1305 movlps %xmm5, 0 * SIZE(CO1, LDC)
1312 #if defined(LT) || defined(RN)
1315 sall $ZBASE_SHIFT, %eax
1334 sall $ZBASE_SHIFT, %eax
1342 sall $1 + ZBASE_SHIFT, %eax
1346 #if defined(LT) || defined(RN)
1349 sall $1 + ZBASE_SHIFT, %eax
1382 sall $ZBASE_SHIFT, %eax
1386 #if defined(LN) || defined(RT)
1389 sall $ZBASE_SHIFT, %eax
1391 leal (BB, %eax, 4), BB
1399 #if defined(LT) || defined(RN)
1410 movaps 0 * SIZE(B), %xmm3
1411 movaps 4 * SIZE(B), %xmm7
1413 pshufd $0x00, %xmm3, %xmm0
1414 pshufd $0x55, %xmm3, %xmm1
1415 pshufd $0xaa, %xmm3, %xmm2
1416 pshufd $0xff, %xmm3, %xmm3
1418 movaps %xmm0, 0 * SIZE(BB)
1419 movaps %xmm1, 4 * SIZE(BB)
1420 movaps %xmm2, 8 * SIZE(BB)
1421 movaps %xmm3, 12 * SIZE(BB)
1423 pshufd $0x00, %xmm7, %xmm4
1424 pshufd $0x55, %xmm7, %xmm5
1425 pshufd $0xaa, %xmm7, %xmm6
1426 pshufd $0xff, %xmm7, %xmm7
1428 movaps %xmm4, 16 * SIZE(BB)
1429 movaps %xmm5, 20 * SIZE(BB)
1430 movaps %xmm6, 24 * SIZE(BB)
1431 movaps %xmm7, 28 * SIZE(BB)
1440 #if defined(LT) || defined(RN)
1455 movsd 0 * SIZE(B), %xmm3
1457 pshufd $0x00, %xmm3, %xmm0
1458 pshufd $0x55, %xmm3, %xmm1
1460 movaps %xmm0, 0 * SIZE(BB)
1461 movaps %xmm1, 4 * SIZE(BB)
1463 addl $ 2 * SIZE, %edi
1464 addl $ 8 * SIZE, %ecx
1470 #if defined(LT) || defined(RN)
1496 sall $1 + ZBASE_SHIFT, %eax
1500 #if defined(LN) || defined(RT)
1505 sall $1 + ZBASE_SHIFT, %eax
1509 leal BUFFER, BB # boffset1 = boffset
1511 #if defined(LN) || defined(RT)
1513 sall $2 + ZBASE_SHIFT, %eax
1522 movaps 0 * SIZE(AA), %xmm0
1523 movaps 16 * SIZE(AA), %xmm1
1524 movaps 0 * SIZE(BB), %xmm2
1525 movaps 16 * SIZE(BB), %xmm3
1527 PREFETCHW 3 * SIZE(CO1)
1529 #if defined(LT) || defined(RN)
1542 movaps 4 * SIZE(BB), %xmm2
1544 movaps 4 * SIZE(AA), %xmm0
1546 movaps 8 * SIZE(BB), %xmm2
1549 movaps 12 * SIZE(BB), %xmm2
1551 movaps 8 * SIZE(AA), %xmm0
1553 movaps 32 * SIZE(BB), %xmm2
1556 movaps 20 * SIZE(BB), %xmm3
1558 movaps 12 * SIZE(AA), %xmm0
1560 movaps 24 * SIZE(BB), %xmm3
1563 movaps 28 * SIZE(BB), %xmm3
1565 movaps 32 * SIZE(AA), %xmm0
1567 movaps 48 * SIZE(BB), %xmm3
1570 movaps 36 * SIZE(BB), %xmm2
1572 movaps 20 * SIZE(AA), %xmm1
1574 movaps 40 * SIZE(BB), %xmm2
1577 movaps 44 * SIZE(BB), %xmm2
1579 movaps 24 * SIZE(AA), %xmm1
1581 movaps 64 * SIZE(BB), %xmm2
1584 movaps 52 * SIZE(BB), %xmm3
1586 movaps 28 * SIZE(AA), %xmm1
1588 movaps 56 * SIZE(BB), %xmm3
1591 movaps 60 * SIZE(BB), %xmm3
1593 movaps 48 * SIZE(AA), %xmm1
1595 movaps 80 * SIZE(BB), %xmm3
1597 addl $ 32 * SIZE, AA
1598 addl $ 64 * SIZE, BB
1604 #if defined(LT) || defined(RN)
1610 andl $7, %eax # if (k & 1)
1617 mulps 4 * SIZE(BB), %xmm0
1619 movaps 8 * SIZE(BB), %xmm2
1621 movaps 4 * SIZE(AA), %xmm0
1633 movaps POSINV, %xmm0
1635 shufps $0xb1, %xmm5, %xmm5
1637 #if defined(LN) || defined(LT)
1649 #if defined(LN) || defined(RT)
1661 sall $ZBASE_SHIFT, %eax
1662 leal (AA, %eax, 2), AA
1663 leal (B, %eax, 1), B
1664 leal (BB, %eax, 4), BB
1667 #if defined(LN) || defined(LT)
1669 unpcklpd %xmm6, %xmm4
1670 unpckhpd %xmm6, %xmm5
1675 movsd 0 * SIZE(B), %xmm2
1679 movsd 2 * SIZE(B), %xmm3
1684 movaps 0 * SIZE(AA), %xmm1
1690 movaps 4 * SIZE(AA), %xmm5
1692 pshufd $0xee, %xmm5, %xmm6
1693 pshufd $0xbb, %xmm5, %xmm7
1695 pshufd $0xa0, %xmm3, %xmm4
1696 pshufd $0xf5, %xmm3, %xmm3
1708 pshufd $0x44, %xmm5, %xmm6
1709 pshufd $0x11, %xmm5, %xmm7
1711 pshufd $0xa0, %xmm3, %xmm4
1712 pshufd $0xf5, %xmm3, %xmm1
1725 movaps 0 * SIZE(AA), %xmm5
1727 pshufd $0x44, %xmm5, %xmm6
1728 pshufd $0x11, %xmm5, %xmm7
1730 pshufd $0xa0, %xmm2, %xmm4
1731 pshufd $0xf5, %xmm2, %xmm2
1745 movaps 0 * SIZE(AA), %xmm5
1747 pshufd $0x44, %xmm5, %xmm6
1748 pshufd $0x11, %xmm5, %xmm7
1750 pshufd $0xa0, %xmm2, %xmm4
1751 pshufd $0xf5, %xmm2, %xmm2
1763 pshufd $0xee, %xmm5, %xmm6
1764 pshufd $0xbb, %xmm5, %xmm7
1766 pshufd $0xa0, %xmm2, %xmm4
1767 pshufd $0xf5, %xmm2, %xmm1
1780 movaps 4 * SIZE(AA), %xmm5
1782 pshufd $0xee, %xmm5, %xmm6
1783 pshufd $0xbb, %xmm5, %xmm7
1785 pshufd $0xa0, %xmm3, %xmm4
1786 pshufd $0xf5, %xmm3, %xmm3
1799 #if defined(RN) || defined(RT)
1800 movaps 0 * SIZE(B), %xmm4
1802 pshufd $0x44, %xmm4, %xmm6
1803 pshufd $0x11, %xmm4, %xmm7
1805 pshufd $0xa0, %xmm1, %xmm3
1806 pshufd $0xf5, %xmm1, %xmm1
1824 #if defined(LN) || defined(LT)
1825 movlps %xmm2, 0 * SIZE(B)
1826 movlps %xmm3, 2 * SIZE(B)
1828 pshufd $0x00, %xmm2, %xmm0
1829 pshufd $0x55, %xmm2, %xmm1
1831 movaps %xmm0, 0 * SIZE(BB)
1832 movaps %xmm1, 4 * SIZE(BB)
1834 pshufd $0x00, %xmm3, %xmm0
1835 pshufd $0x55, %xmm3, %xmm1
1837 movaps %xmm0, 8 * SIZE(BB)
1838 movaps %xmm1, 12 * SIZE(BB)
1840 movlps %xmm2, 0 * SIZE(CO1)
1841 movlps %xmm3, 2 * SIZE(CO1)
1843 movaps %xmm1, 0 * SIZE(AA)
1845 movlps %xmm1, 0 * SIZE(CO1)
1846 movhps %xmm1, 2 * SIZE(CO1)
1853 #if defined(LT) || defined(RN)
1856 sall $1 + ZBASE_SHIFT, %eax
1875 sall $1 + ZBASE_SHIFT, %eax
1890 sall $ZBASE_SHIFT, %eax
1894 #if defined(LN) || defined(RT)
1899 sall $ZBASE_SHIFT, %eax
1903 leal BUFFER, BB # boffset1 = boffset
1905 #if defined(LN) || defined(RT)
1907 sall $2 + ZBASE_SHIFT, %eax
1914 movsd 0 * SIZE(AA), %xmm0
1919 movsd 8 * SIZE(AA), %xmm1
1921 movaps 0 * SIZE(BB), %xmm2
1923 movaps 16 * SIZE(BB), %xmm3
1926 #if defined(LT) || defined(RN)
1939 movaps 4 * SIZE(BB), %xmm2
1941 movsd 2 * SIZE(AA), %xmm0
1943 movaps 8 * SIZE(BB), %xmm2
1946 movaps 12 * SIZE(BB), %xmm2
1948 movsd 4 * SIZE(AA), %xmm0
1950 movaps 32 * SIZE(BB), %xmm2
1953 movaps 20 * SIZE(BB), %xmm3
1955 movsd 6 * SIZE(AA), %xmm0
1957 movaps 24 * SIZE(BB), %xmm3
1960 movaps 28 * SIZE(BB), %xmm3
1962 movsd 16 * SIZE(AA), %xmm0
1964 movaps 48 * SIZE(BB), %xmm3
1967 movaps 36 * SIZE(BB), %xmm2
1969 movsd 10 * SIZE(AA), %xmm1
1971 movaps 40 * SIZE(BB), %xmm2
1974 movaps 44 * SIZE(BB), %xmm2
1976 movsd 12 * SIZE(AA), %xmm1
1978 movaps 64 * SIZE(BB), %xmm2
1981 movaps 52 * SIZE(BB), %xmm3
1983 movsd 14 * SIZE(AA), %xmm1
1985 movaps 56 * SIZE(BB), %xmm3
1988 movaps 60 * SIZE(BB), %xmm3
1990 movsd 24 * SIZE(AA), %xmm1
1992 movaps 80 * SIZE(BB), %xmm3
1994 addl $ 16 * SIZE, AA
1995 addl $ 64 * SIZE, BB
2001 #if defined(LT) || defined(RN)
2007 andl $7, %eax # if (k & 1)
2014 mulps 4 * SIZE(BB), %xmm0
2016 movaps 8 * SIZE(BB), %xmm2
2018 movsd 2 * SIZE(AA), %xmm0
2030 movaps POSINV, %xmm0
2032 shufps $0xb1, %xmm5, %xmm5
2034 #if defined(LN) || defined(LT)
2046 #if defined(LN) || defined(RT)
2054 sall $ZBASE_SHIFT, %eax
2057 leal (BB, %eax, 4), BB
2060 #if defined(LN) || defined(LT)
2064 movsd 0 * SIZE(B), %xmm2
2071 movsd 0 * SIZE(AA), %xmm1
2076 #if defined(LN) || defined(LT)
2077 movaps 0 * SIZE(AA), %xmm5
2079 pshufd $0x44, %xmm5, %xmm6
2080 pshufd $0x11, %xmm5, %xmm7
2082 pshufd $0xa0, %xmm2, %xmm4
2083 pshufd $0xf5, %xmm2, %xmm2
2096 #if defined(RN) || defined(RT)
2097 movaps 0 * SIZE(B), %xmm4
2099 pshufd $0x44, %xmm4, %xmm6
2100 pshufd $0x11, %xmm4, %xmm7
2102 pshufd $0xa0, %xmm1, %xmm3
2103 pshufd $0xf5, %xmm1, %xmm1
2121 #if defined(LN) || defined(LT)
2122 movlps %xmm2, 0 * SIZE(B)
2124 pshufd $0x00, %xmm2, %xmm0
2125 pshufd $0x55, %xmm2, %xmm1
2127 movaps %xmm0, 0 * SIZE(BB)
2128 movaps %xmm1, 4 * SIZE(BB)
2130 movlps %xmm2, 0 * SIZE(CO1)
2132 movlps %xmm1, 0 * SIZE(AA)
2134 movlps %xmm1, 0 * SIZE(CO1)
2141 #if defined(LT) || defined(RN)
2144 sall $ZBASE_SHIFT, %eax
2163 sall $ZBASE_SHIFT, %eax
2171 sall $ZBASE_SHIFT, %eax
2175 #if defined(LT) || defined(RN)
2178 sall $ZBASE_SHIFT, %eax
2194 movl OLD_STACK, %esp