1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin. */
3 /* All rights reserved. */
5 /* Redistribution and use in source and binary forms, with or */
6 /* without modification, are permitted provided that the following */
7 /* conditions are met: */
9 /* 1. Redistributions of source code must retain the above */
10 /* copyright notice, this list of conditions and the following */
13 /* 2. Redistributions in binary form must reproduce the above */
14 /* copyright notice, this list of conditions and the following */
15 /* disclaimer in the documentation and/or other materials */
16 /* provided with the distribution. */
18 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31 /* POSSIBILITY OF SUCH DAMAGE. */
33 /* The views and conclusions contained in the software and */
34 /* documentation are those of the authors and should not be */
35 /* interpreted as representing official policies, either expressed */
36 /* or implied, of The University of Texas at Austin. */
37 /*********************************************************************/
/* ------------------------------------------------------------------ */
/* Per-microarchitecture prefetch configuration: each CPU family gets */
/* a prefetch opcode and a look-ahead distance (in elements).  Intel  */
/* targets use prefetcht0 for both read and write streams; AMD        */
/* targets use the AMD prefetch/prefetchw pair.                       */
/* NOTE(review): this chunk is gap-sampled (embedded line numbers     */
/* jump), so most of the guarding #if/#elif/#endif lines are not      */
/* visible; which CPU macro guards each unguarded group below cannot  */
/* be confirmed from here.                                            */
43 #define PREFETCH prefetcht0
44 #define PREFETCHW prefetcht0
45 #define PREFETCHSIZE (16 * 12)
49 #define PREFETCH prefetcht0
50 #define PREFETCHW prefetcht0
51 #define PREFETCHSIZE (16 * 12)
54 #if defined(PENRYN) || defined(DUNNINGTON)
55 #define PREFETCH prefetcht0
56 #define PREFETCHW prefetcht0
57 #define PREFETCHSIZE (16 * 12)
60 #if defined(NEHALEM) || defined(SANDYBRIDGE)
61 #define PREFETCH prefetcht0
62 #define PREFETCHW prefetcht0
63 #define PREFETCHSIZE (16 * 12)
67 #define PREFETCH prefetcht0
68 #define PREFETCHW prefetcht0
69 #define PREFETCHSIZE (16 * 20)
/* AMD branch: prefetchw requests the line in a writeable state.      */
73 #define PREFETCH prefetch
74 #define PREFETCHW prefetchw
75 #define PREFETCHSIZE (16 * 8)
79 #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
80 #define PREFETCH prefetch
81 #define PREFETCHW prefetchw
82 #define PREFETCHSIZE (16 * 16)
86 #define PREFETCH prefetcht0
87 #define PREFETCHW prefetcht0
88 #define PREFETCHSIZE (16 * 24)
92 #define PREFETCH prefetcht0
93 #define PREFETCHW prefetcht0
94 #define PREFETCHSIZE (16 * 20)
/* Stack-slot offsets of the arguments passed on the stack, relative  */
/* to %rsp after the prologue subtracts STACKSIZE.  Two groups are    */
/* visible: offsets 8/16/24 (three stack args) and 40..80 (six stack  */
/* args) -- presumably the SysV vs. Windows x64 ABI branches of an    */
/* #if that is not visible in this gap-sampled chunk; confirm against */
/* the full file.                                                     */
101 #define OLD_Y 8 + STACKSIZE(%rsp)
102 #define OLD_INCY 16 + STACKSIZE(%rsp)
103 #define OLD_BUFFER 24 + STACKSIZE(%rsp)
114 #define STACKSIZE 256
116 #define OLD_LDA 40 + STACKSIZE(%rsp)
117 #define OLD_X 48 + STACKSIZE(%rsp)
118 #define OLD_INCX 56 + STACKSIZE(%rsp)
119 #define OLD_Y 64 + STACKSIZE(%rsp)
120 #define OLD_INCY 72 + STACKSIZE(%rsp)
121 #define OLD_BUFFER 80 + STACKSIZE(%rsp)
/* Prologue fragment: reserve the local frame, then spill xmm6-xmm15  */
/* to it.  xmm6-xmm15 are callee-saved only under the Windows x64     */
/* ABI, so this spill block is presumably inside a Windows-only       */
/* conditional whose guard is not visible in this chunk.  Offsets     */
/* start at 64; slots 0..63 are presumably used for other saves on    */
/* lines not shown here.                                              */
171 subq $STACKSIZE, %rsp
182 movups %xmm6, 64(%rsp)
183 movups %xmm7, 80(%rsp)
184 movups %xmm8, 96(%rsp)
185 movups %xmm9, 112(%rsp)
186 movups %xmm10, 128(%rsp)
187 movups %xmm11, 144(%rsp)
188 movups %xmm12, 160(%rsp)
189 movups %xmm13, 176(%rsp)
190 movups %xmm14, 192(%rsp)
191 movups %xmm15, 208(%rsp)
/* Load the bounce-buffer pointer and scale the element-count strides */
/* INCX/INCY/LDA into byte strides (SIZE = sizeof(FLOAT), supplied by */
/* a macro defined outside this chunk).                               */
202 movq OLD_BUFFER, BUFFER
204 leaq (,INCX, SIZE), INCX
205 leaq (,INCY, SIZE), INCY
206 leaq (,LDA, SIZE), LDA
/* Splat the scalar alpha into all four lanes of ALPHA.               */
218 shufps $0, ALPHA, ALPHA
/* Gather 8 elements of X.  Every load reads 0*SIZE(X), so X is       */
/* presumably advanced by INCX between loads on lines missing from    */
/* this gap-sampled view -- TODO confirm against the full file.       */
228 movss 0 * SIZE(X), %xmm1
230 movss 0 * SIZE(X), %xmm2
232 movss 0 * SIZE(X), %xmm3
234 movss 0 * SIZE(X), %xmm4
236 movss 0 * SIZE(X), %xmm5
238 movss 0 * SIZE(X), %xmm6
240 movss 0 * SIZE(X), %xmm7
242 movss 0 * SIZE(X), %xmm8
/* Store the 8 gathered elements contiguously into the buffer (XX),   */
/* so the kernel below can use unit-stride, alignable accesses.       */
254 movss %xmm1, 0 * SIZE(XX)
255 movss %xmm2, 1 * SIZE(XX)
256 movss %xmm3, 2 * SIZE(XX)
257 movss %xmm4, 3 * SIZE(XX)
258 movss %xmm5, 4 * SIZE(XX)
259 movss %xmm6, 5 * SIZE(XX)
260 movss %xmm7, 6 * SIZE(XX)
261 movss %xmm8, 7 * SIZE(XX)
/* Remainder loop body: copy the leftover (n mod 8) X elements into   */
/* the buffer one at a time.  Loop label/counter lines are not        */
/* visible in this chunk.                                             */
275 movss 0 * SIZE(X), %xmm1
280 movss %xmm1, 0 * SIZE(XX)
288 /* now we don't need original X */
/* Pack Y the same way: gather 8 strided elements from YY and store   */
/* them contiguously.  As with X above, YY is presumably bumped by    */
/* INCY between the identical-looking 0*SIZE(YY) loads on lines       */
/* missing from this view.                                            */
306 movss 0 * SIZE(YY), %xmm0
308 movss 0 * SIZE(YY), %xmm1
310 movss 0 * SIZE(YY), %xmm2
312 movss 0 * SIZE(YY), %xmm3
314 movss 0 * SIZE(YY), %xmm4
316 movss 0 * SIZE(YY), %xmm5
318 movss 0 * SIZE(YY), %xmm6
320 movss 0 * SIZE(YY), %xmm7
323 movss %xmm0, 0 * SIZE(XX)
324 movss %xmm1, 1 * SIZE(XX)
325 movss %xmm2, 2 * SIZE(XX)
326 movss %xmm3, 3 * SIZE(XX)
327 movss %xmm4, 4 * SIZE(XX)
328 movss %xmm5, 5 * SIZE(XX)
329 movss %xmm6, 6 * SIZE(XX)
330 movss %xmm7, 7 * SIZE(XX)
/* Scalar remainder of the Y-packing loop.                            */
344 movss 0 * SIZE(YY), %xmm0
347 movss %xmm0, 0 * SIZE(XX)
/* Start of a 4-column panel (columns IS..IS+3).  Load four packed x  */
/* values and splat each lane into its own register (atemp1..atemp4)  */
/* for the four per-column multiplies.                                */
366 movaps 0 * SIZE(NEW_X, IS, SIZE), atemp4
368 pshufd $0x00, atemp4, atemp1
369 pshufd $0x55, atemp4, atemp2
370 pshufd $0xaa, atemp4, atemp3
371 pshufd $0xff, atemp4, atemp4
/* Prime the pipeline: first 8 x values, the first 4 rows of each of  */
/* the four A columns (A1, A1+LDA, A2, A2+LDA), and the matching y    */
/* block.  The movsd+movhps pair is a 4-float load that tolerates     */
/* unaligned column starts.                                           */
378 movaps 0 * SIZE(NEW_X), xtemp1
379 movaps 4 * SIZE(NEW_X), xtemp2
381 movsd 0 * SIZE(A1), a1
382 movhps 2 * SIZE(A1), a1
383 movsd 0 * SIZE(A1, LDA, 1), a2
384 movhps 2 * SIZE(A1, LDA, 1), a2
385 movsd 0 * SIZE(A2), a3
386 movhps 2 * SIZE(A2), a3
387 movsd 0 * SIZE(A2, LDA, 1), a4
388 movhps 2 * SIZE(A2, LDA, 1), a4
390 movsd 0 * SIZE(NEW_Y), yy1
391 movhps 2 * SIZE(NEW_Y), yy1
/* Main unrolled loop fragment (16 rows per iteration, processed in   */
/* 4-row groups).  Only the load/store/prefetch instructions survive  */
/* in this gap-sampled view; the arithmetic (mul/add accumulating     */
/* into yy1 and the xsum registers) sits on the missing lines.        */
/* Pattern per 4-row group: load next rows of the four columns,       */
/* software-prefetch ahead in one stream, store the finished y block, */
/* and reload the next y block.                                       */
/* --- rows +4..+7 --- */
407 movsd 4 * SIZE(A1), a1
408 movhps 6 * SIZE(A1), a1
410 PREFETCH PREFETCHSIZE(A1)
417 movsd 4 * SIZE(A1, LDA, 1), a2
418 movhps 6 * SIZE(A1, LDA, 1), a2
425 movsd 4 * SIZE(A2), a3
426 movhps 6 * SIZE(A2), a3
/* Core2/Penryn/Dunnington skip the extra X/Y prefetches (presumably  */
/* they hurt on those cores -- the matching #endif is not visible).   */
428 #if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON)
429 PREFETCH PREFETCHSIZE(XX)
433 movaps 8 * SIZE(XX), xtemp1
438 movsd 4 * SIZE(A2, LDA, 1), a4
439 movhps 6 * SIZE(A2, LDA, 1), a4
441 movlps yy1, 0 * SIZE(YY)
442 movhps yy1, 2 * SIZE(YY)
443 movsd 4 * SIZE(YY), yy1
444 movhps 6 * SIZE(YY), yy1
/* --- rows +8..+11 --- */
451 movsd 8 * SIZE(A1), a1
452 movhps 10 * SIZE(A1), a1
454 PREFETCH PREFETCHSIZE(A1, LDA, 1)
461 movsd 8 * SIZE(A1, LDA, 1), a2
462 movhps 10 * SIZE(A1, LDA, 1), a2
469 movsd 8 * SIZE(A2), a3
470 movhps 10 * SIZE(A2), a3
473 movaps 12 * SIZE(XX), xtemp2
478 movsd 8 * SIZE(A2, LDA, 1), a4
479 movhps 10 * SIZE(A2, LDA, 1), a4
481 movlps yy1, 4 * SIZE(YY)
482 movhps yy1, 6 * SIZE(YY)
483 movsd 8 * SIZE(YY), yy1
484 movhps 10 * SIZE(YY), yy1
/* --- rows +12..+15 --- */
492 movsd 12 * SIZE(A1), a1
493 movhps 14 * SIZE(A1), a1
495 PREFETCH PREFETCHSIZE(A2)
502 movsd 12 * SIZE(A1, LDA, 1), a2
503 movhps 14 * SIZE(A1, LDA, 1), a2
510 movsd 12 * SIZE(A2), a3
511 movhps 14 * SIZE(A2), a3
513 #if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON)
514 PREFETCHW PREFETCHSIZE(YY)
518 movaps 16 * SIZE(XX), xtemp1
523 movsd 12 * SIZE(A2, LDA, 1), a4
524 movhps 14 * SIZE(A2, LDA, 1), a4
526 movlps yy1, 8 * SIZE(YY)
527 movhps yy1, 10 * SIZE(YY)
528 movsd 12 * SIZE(YY), yy1
529 movhps 14 * SIZE(YY), yy1
/* --- rows +16..+19 (wraps into the next iteration) --- */
536 movsd 16 * SIZE(A1), a1
537 movhps 18 * SIZE(A1), a1
539 PREFETCH PREFETCHSIZE(A2, LDA, 1)
546 movsd 16 * SIZE(A1, LDA, 1), a2
547 movhps 18 * SIZE(A1, LDA, 1), a2
554 movsd 16 * SIZE(A2), a3
555 movhps 18 * SIZE(A2), a3
558 movaps 20 * SIZE(XX), xtemp2
563 movsd 16 * SIZE(A2, LDA, 1), a4
564 movhps 18 * SIZE(A2, LDA, 1), a4
566 movlps yy1, 12 * SIZE(YY)
567 movhps yy1, 14 * SIZE(YY)
568 movsd 16 * SIZE(YY), yy1
569 movhps 18 * SIZE(YY), yy1
/* Cleanup-loop fragment: same 4-row load/store pattern as the main   */
/* loop above but with no prefetches -- presumably the 8-row (mod-16) */
/* tail of the panel.  Arithmetic lines are missing from this view.   */
589 movsd 4 * SIZE(A1), a1
590 movhps 6 * SIZE(A1), a1
597 movsd 4 * SIZE(A1, LDA, 1), a2
598 movhps 6 * SIZE(A1, LDA, 1), a2
605 movsd 4 * SIZE(A2), a3
606 movhps 6 * SIZE(A2), a3
609 movaps 8 * SIZE(XX), xtemp1
614 movsd 4 * SIZE(A2, LDA, 1), a4
615 movhps 6 * SIZE(A2, LDA, 1), a4
617 movlps yy1, 0 * SIZE(YY)
618 movhps yy1, 2 * SIZE(YY)
619 movsd 4 * SIZE(YY), yy1
620 movhps 6 * SIZE(YY), yy1
627 movsd 8 * SIZE(A1), a1
628 movhps 10 * SIZE(A1), a1
635 movsd 8 * SIZE(A1, LDA, 1), a2
636 movhps 10 * SIZE(A1, LDA, 1), a2
643 movsd 8 * SIZE(A2), a3
644 movhps 10 * SIZE(A2), a3
647 movaps 12 * SIZE(XX), xtemp2
652 movsd 8 * SIZE(A2, LDA, 1), a4
653 movhps 10 * SIZE(A2, LDA, 1), a4
655 movlps yy1, 4 * SIZE(YY)
656 movhps yy1, 6 * SIZE(YY)
657 movsd 8 * SIZE(YY), yy1
658 movhps 10 * SIZE(YY), yy1
/* Flush the last computed y block of this stretch.                   */
694 movlps yy1, 0 * SIZE(YY)
695 movhps yy1, 2 * SIZE(YY)
696 movsd 4 * SIZE(YY), yy1
697 movhps 6 * SIZE(YY), yy1
/* Diagonal (triangular) corner of the 4-column panel: scalar/partial */
/* loads walk the staircase of the symmetric block.  The multiplies   */
/* and the masking of already-processed elements are on lines not     */
/* visible here.                                                      */
706 movaps 0 * SIZE(NEW_X, IS, SIZE), atemp1
708 movss 0 * SIZE(A1), a1
709 movss 0 * SIZE(A1, LDA, 1), a2
710 movss 0 * SIZE(A2), a3
711 movss 0 * SIZE(A2, LDA, 1), a4
720 movsd 0 * SIZE(A1, LDA, 1), a1
721 movss 1 * SIZE(A2), a2
722 movhps 1 * SIZE(A2, LDA, 1), a2
729 movsd 0 * SIZE(A2), a1
730 movss 2 * SIZE(A2), a2
731 movhps 2 * SIZE(A2, LDA, 1), a2
738 movsd 0 * SIZE(A2, LDA, 1), a1
739 movhps 2 * SIZE(A2, LDA, 1), a1
/* 4x4 transpose of the four per-column partial-sum registers via     */
/* unpcklps/unpckhps so the lanes can be summed horizontally --       */
/* NOTE(review): inferred from the classic SSE transpose idiom; the   */
/* interleaving arithmetic lines are missing, so confirm against the  */
/* full file.                                                         */
747 unpcklps xsum3, xsum1
748 unpckhps xsum3, xtemp1
751 unpcklps xsum4, xsum2
752 unpckhps xsum4, xtemp2
755 unpcklps xsum2, xsum1
756 unpckhps xsum2, xsum3
759 unpcklps xtemp2, xtemp1
760 unpckhps xtemp2, xsum4
/* Store the updated 4-element y block.                               */
774 movlps yy1, 0 * SIZE(YY)
775 movhps yy1, 2 * SIZE(YY)
/* 2-column panel (n mod 4 >= 2): same structure as the 4-column      */
/* path but with only two splatted x values and columns A1, A1+LDA.   */
792 movsd 0 * SIZE(NEW_X, IS, SIZE), atemp4
794 pshufd $0x00, atemp4, atemp1
795 pshufd $0x55, atemp4, atemp2
800 movaps 0 * SIZE(NEW_X), xtemp1
802 movsd 0 * SIZE(A1), a1
803 movhps 2 * SIZE(A1), a1
804 movsd 0 * SIZE(A1, LDA, 1), a2
805 movhps 2 * SIZE(A1, LDA, 1), a2
807 movsd 0 * SIZE(NEW_Y), yy1
808 movhps 2 * SIZE(NEW_Y), yy1
/* Loop body fragment: advance 4 rows per iteration.                  */
824 movsd 4 * SIZE(A1), a1
825 movhps 6 * SIZE(A1), a1
828 movaps 4 * SIZE(XX), xtemp1
833 movsd 4 * SIZE(A1, LDA, 1), a2
834 movhps 6 * SIZE(A1, LDA, 1), a2
836 movlps yy1, 0 * SIZE(YY)
837 movhps yy1, 2 * SIZE(YY)
838 movsd 4 * SIZE(YY), yy1
839 movhps 6 * SIZE(YY), yy1
/* Diagonal corner of the 2x2 block, then horizontal reduction of the */
/* two column sums and the final partial store.                       */
850 movsd 0 * SIZE(NEW_X, IS, SIZE), atemp1
852 movss 0 * SIZE(A1), a1
853 movss 0 * SIZE(A1, LDA, 1), a2
860 movsd 0 * SIZE(A1, LDA, 1), a1
870 unpcklps xsum2, xsum1
881 movlps yy1, 0 * SIZE(YY)
/* Final single-column panel (n odd): fully scalar movss traffic.     */
/* Splat the last x value, then walk the remaining rows one at a      */
/* time, updating y in place.  Arithmetic lines are missing from      */
/* this gap-sampled view.                                             */
892 movss 0 * SIZE(NEW_X, IS, SIZE), atemp1
894 pshufd $0x00, atemp1, atemp1
899 movss 0 * SIZE(NEW_Y), yy1
901 movss 0 * SIZE(NEW_X), xtemp1
902 movss 1 * SIZE(NEW_X), xtemp2
904 movss 0 * SIZE(A1), a1
905 movss 1 * SIZE(A1), a2
/* Two-element software pipeline: store the finished y element and    */
/* load the next x/a/y operands.                                      */
917 movss 2 * SIZE(XX), xtemp1
922 movss 2 * SIZE(A1), a1
924 movss yy1, 0 * SIZE(YY)
925 movss 1 * SIZE(YY), yy1
928 movss 3 * SIZE(XX), xtemp2
933 movss 3 * SIZE(A1), a2
935 movss yy1, 1 * SIZE(YY)
936 movss 2 * SIZE(YY), yy1
/* Diagonal element and final reduction/store for this column.        */
947 movsd 0 * SIZE(NEW_X, IS, SIZE), atemp1
949 movss 0 * SIZE(A1), a1
959 unpcklps xsum2, xsum1
969 movss yy1, 0 * SIZE(YY)
/* Copy the finished result from the contiguous NEW_Y buffer back to  */
/* the caller's strided y, 8 elements per iteration.  All stores      */
/* target 0*SIZE(Y), so Y is presumably advanced by INCY between      */
/* stores on lines missing from this view -- TODO confirm.            */
984 movss 0 * SIZE(NEW_Y), %xmm0
985 movss 1 * SIZE(NEW_Y), %xmm1
986 movss 2 * SIZE(NEW_Y), %xmm2
987 movss 3 * SIZE(NEW_Y), %xmm3
988 movss 4 * SIZE(NEW_Y), %xmm4
989 movss 5 * SIZE(NEW_Y), %xmm5
990 movss 6 * SIZE(NEW_Y), %xmm6
991 movss 7 * SIZE(NEW_Y), %xmm7
993 movss %xmm0, 0 * SIZE(Y)
995 movss %xmm1, 0 * SIZE(Y)
997 movss %xmm2, 0 * SIZE(Y)
999 movss %xmm3, 0 * SIZE(Y)
1001 movss %xmm4, 0 * SIZE(Y)
1003 movss %xmm5, 0 * SIZE(Y)
1005 movss %xmm6, 0 * SIZE(Y)
1007 movss %xmm7, 0 * SIZE(Y)
1010 addq $8 * SIZE, NEW_Y
/* Scalar remainder of the scatter loop.                              */
1022 movss 0 * SIZE(NEW_Y), %xmm0
1024 movss %xmm0, 0 * SIZE(Y)
1027 addq $1 * SIZE, NEW_Y
/* Epilogue fragment: restore xmm6-xmm15 from the same frame slots    */
/* used by the prologue, then release the frame.  The matching `ret`  */
/* and any GP-register restores are on lines not visible here.        */
1045 movups 64(%rsp), %xmm6
1046 movups 80(%rsp), %xmm7
1047 movups 96(%rsp), %xmm8
1048 movups 112(%rsp), %xmm9
1049 movups 128(%rsp), %xmm10
1050 movups 144(%rsp), %xmm11
1051 movups 160(%rsp), %xmm12
1052 movups 176(%rsp), %xmm13
1053 movups 192(%rsp), %xmm14
1054 movups 208(%rsp), %xmm15
1057 addq $STACKSIZE, %rsp