1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin. */
3 /* All rights reserved. */
5 /* Redistribution and use in source and binary forms, with or */
6 /* without modification, are permitted provided that the following */
7 /* conditions are met: */
9 /* 1. Redistributions of source code must retain the above */
10 /* copyright notice, this list of conditions and the following */
13 /* 2. Redistributions in binary form must reproduce the above */
14 /* copyright notice, this list of conditions and the following */
15 /* disclaimer in the documentation and/or other materials */
16 /* provided with the distribution. */
18 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31 /* POSSIBILITY OF SUCH DAMAGE. */
33 /* The views and conclusions contained in the software and */
34 /* documentation are those of the authors and should not be */
35 /* interpreted as representing official policies, either expressed */
36 /* or implied, of The University of Texas at Austin. */
37 /*********************************************************************/
/* Per-microarchitecture prefetch policy: which prefetch instruction to  */
/* use for reads (PREFETCH), for soon-to-be-written lines (PREFETCHW),   */
/* and how far ahead to prefetch, in elements (PREFETCHSIZE).            */
/* NOTE(review): this chunk is a sampled excerpt — most of the #ifdef    */
/* guards selecting each CPU variant, and the matching #endif lines, are */
/* not visible here.  The leading numerals on every line are original-   */
/* file line numbers left over from extraction, not assembly tokens.     */
43 #define PREFETCH prefetcht0
44 #define PREFETCHW prefetcht0
45 #define PREFETCHSIZE (16 * 12)
49 #define PREFETCH prefetcht0
50 #define PREFETCHW prefetcht0
51 #define PREFETCHSIZE (16 * 12)
54 #if defined(PENRYN) || defined(DUNNINGTON)
55 #define PREFETCH prefetcht0
56 #define PREFETCHW prefetcht0
57 #define PREFETCHSIZE (16 * 12)
60 #if defined(NEHALEM) || defined(SANDYBRIDGE)
61 #define PREFETCH prefetcht0
62 #define PREFETCHW prefetcht0
/* Nehalem/Sandy Bridge get a longer lookahead than the older Intel cores. */
63 #define PREFETCHSIZE (16 * 24)
67 #define PREFETCH prefetcht0
68 #define PREFETCHW prefetcht0
69 #define PREFETCHSIZE (16 * 20)
/* AMD-style variants: plain `prefetch` plus `prefetchw` (write intent).  */
73 #define PREFETCH prefetch
74 #define PREFETCHW prefetchw
75 #define PREFETCHSIZE (16 * 8)
79 #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
80 #define PREFETCH prefetch
81 #define PREFETCHW prefetchw
82 #define PREFETCHSIZE (16 * 16)
86 #define PREFETCH prefetcht0
87 #define PREFETCHW prefetcht0
88 #define PREFETCHSIZE (8 * 24)
92 #define PREFETCH prefetcht0
93 #define PREFETCHW prefetcht0
94 #define PREFETCHSIZE (16 * 20)
/* Stack-relative addresses of the arguments that arrive on the stack    */
/* (everything past the register-passed ones), valid after the prologue  */
/* has dropped %rsp by STACKSIZE.                                        */
/* NOTE(review): two ABI layouts are interleaved here; the #ifdef        */
/* WINDOWS_ABI / #else guards are not visible in this excerpt.  The      */
/* 8/16/24 offsets match a System V layout (6 integer args in regs),     */
/* the 40..80 offsets match Microsoft x64 (4 register args + 32-byte     */
/* shadow space) — TODO confirm against the full file.                   */
101 #define OLD_Y 8 + STACKSIZE(%rsp)
102 #define OLD_INCY 16 + STACKSIZE(%rsp)
103 #define OLD_BUFFER 24 + STACKSIZE(%rsp)
/* 256-byte scratch frame: enough for ten 16-byte xmm save slots         */
/* (64..208 below) plus padding.                                         */
114 #define STACKSIZE 256
116 #define OLD_LDA 40 + STACKSIZE(%rsp)
117 #define OLD_X 48 + STACKSIZE(%rsp)
118 #define OLD_INCX 56 + STACKSIZE(%rsp)
119 #define OLD_Y 64 + STACKSIZE(%rsp)
120 #define OLD_INCY 72 + STACKSIZE(%rsp)
121 #define OLD_BUFFER 80 + STACKSIZE(%rsp)
/* Prologue fragment: reserve the scratch frame, then spill xmm6-xmm15   */
/* to 64..208(%rsp).  Saving xmm6-xmm15 is required only by the          */
/* Microsoft x64 ABI (they are callee-saved there, volatile on SysV), so */
/* presumably this is the WINDOWS_ABI branch — its guard, and the GP     */
/* register saves that normally accompany it, are not visible in this    */
/* sampled excerpt.  Restored at L306-L315 below.                        */
170 subq $STACKSIZE, %rsp
181 movups %xmm6, 64(%rsp)
182 movups %xmm7, 80(%rsp)
183 movups %xmm8, 96(%rsp)
184 movups %xmm9, 112(%rsp)
185 movups %xmm10, 128(%rsp)
186 movups %xmm11, 144(%rsp)
187 movups %xmm12, 160(%rsp)
188 movups %xmm13, 176(%rsp)
189 movups %xmm14, 192(%rsp)
190 movups %xmm15, 208(%rsp)
/* Argument setup + gather of the strided input vector X into a          */
/* contiguous bounce buffer so the main kernel can use aligned packed    */
/* loads regardless of INCX.                                             */
201 movq OLD_BUFFER, BUFFER
/* Convert element-count strides to byte strides (scale by SIZE).        */
203 leaq (,INCX, SIZE), INCX
204 leaq (,INCY, SIZE), INCY
205 leaq (,LDA, SIZE), LDA
/* Duplicate alpha's low double into both 64-bit lanes for packed use.   */
217 unpcklpd ALPHA, ALPHA
/* Gather 8 elements of X: movsd fills the low qword of each register,   */
/* movhpd the high qword.  NOTE(review): the `addq INCX, X` advances     */
/* that must sit between these loads are not visible in this sampled     */
/* excerpt — all loads appear to read offset 0 only because of that.     */
227 movsd 0 * SIZE(X), %xmm1
229 movhpd 0 * SIZE(X), %xmm1
231 movsd 0 * SIZE(X), %xmm2
233 movhpd 0 * SIZE(X), %xmm2
235 movsd 0 * SIZE(X), %xmm3
237 movhpd 0 * SIZE(X), %xmm3
239 movsd 0 * SIZE(X), %xmm4
241 movhpd 0 * SIZE(X), %xmm4
/* Store the 8 gathered elements contiguously (aligned) into the buffer. */
249 movapd %xmm1, 0 * SIZE(XX)
250 movapd %xmm2, 2 * SIZE(XX)
251 movapd %xmm3, 4 * SIZE(XX)
252 movapd %xmm4, 6 * SIZE(XX)
/* Scalar tail of the copy loop: one element at a time.                  */
266 movsd 0 * SIZE(X), %xmm1
271 movlpd %xmm1, 0 * SIZE(XX)
279 /* now we don't need original X */
/* Same gather as above, now for Y: copy the strided output vector into  */
/* a second contiguous region of the bounce buffer.  NOTE(review): the   */
/* `addq INCY, YY` advances between the paired movsd/movhpd loads are    */
/* not visible in this sampled excerpt.                                  */
297 movsd 0 * SIZE(YY), %xmm0
299 movhpd 0 * SIZE(YY), %xmm0
301 movsd 0 * SIZE(YY), %xmm1
303 movhpd 0 * SIZE(YY), %xmm1
305 movsd 0 * SIZE(YY), %xmm2
307 movhpd 0 * SIZE(YY), %xmm2
309 movsd 0 * SIZE(YY), %xmm3
311 movhpd 0 * SIZE(YY), %xmm3
/* Aligned contiguous stores of the 8 gathered y elements.               */
314 movapd %xmm0, 0 * SIZE(XX)
315 movapd %xmm1, 2 * SIZE(XX)
316 movapd %xmm2, 4 * SIZE(XX)
317 movapd %xmm3, 6 * SIZE(XX)
/* Scalar tail of the Y copy.                                            */
331 movsd 0 * SIZE(YY), %xmm0
334 movsd %xmm0, 0 * SIZE(XX)
/* Setup for a 4-column panel at column index IS: broadcast              */
/* x[IS..IS+3] into atemp1..atemp4 (one value duplicated across both     */
/* lanes of each register).  Two equivalent code paths appear —          */
/* SSE3 movddup vs. a movsd+movhpd pair from the same address —          */
/* presumably selected by an #ifdef (e.g. HAVE_SSE3) that is not         */
/* visible in this sampled excerpt.                                      */
354 movddup 0 * SIZE(NEW_X, IS, SIZE), atemp1
355 movddup 1 * SIZE(NEW_X, IS, SIZE), atemp2
356 movddup 2 * SIZE(NEW_X, IS, SIZE), atemp3
357 movddup 3 * SIZE(NEW_X, IS, SIZE), atemp4
359 movsd 0 * SIZE(NEW_X, IS, SIZE), atemp1
360 movhpd 0 * SIZE(NEW_X, IS, SIZE), atemp1
361 movsd 1 * SIZE(NEW_X, IS, SIZE), atemp2
362 movhpd 1 * SIZE(NEW_X, IS, SIZE), atemp2
363 movsd 2 * SIZE(NEW_X, IS, SIZE), atemp3
364 movhpd 2 * SIZE(NEW_X, IS, SIZE), atemp3
365 movsd 3 * SIZE(NEW_X, IS, SIZE), atemp4
366 movhpd 3 * SIZE(NEW_X, IS, SIZE), atemp4
/* Prime the software pipeline: first two x pairs from the contiguous    */
/* buffer, the first column pairs of A (columns A1 and A1+LDA), and the  */
/* first four y accumulators.                                            */
374 movapd 0 * SIZE(NEW_X), xtemp1
375 movapd 2 * SIZE(NEW_X), xtemp2
377 movsd 0 * SIZE(A1), a1
378 movhpd 1 * SIZE(A1), a1
379 movsd 2 * SIZE(A1), a2
380 movhpd 3 * SIZE(A1), a2
381 movsd 0 * SIZE(A1, LDA, 1), a3
382 movhpd 1 * SIZE(A1, LDA, 1), a3
384 movsd 0 * SIZE(NEW_Y), yy1
385 movhpd 1 * SIZE(NEW_Y), yy1
386 movsd 2 * SIZE(NEW_Y), yy2
387 movhpd 3 * SIZE(NEW_Y), yy2
/* Fragments of the unrolled inner loop over rows (8 elements per        */
/* iteration, 4 matrix columns A1, A1+LDA, A2, A2+LDA).  Each            */
/* movsd/movhpd pair assembles two consecutive doubles of a column into  */
/* one xmm register; PREFETCH/PREFETCHW lines keep the column, x-buffer  */
/* and y-buffer streams ahead of the loads.                              */
/* NOTE(review): the mulpd/addpd arithmetic, the loop label, counter     */
/* decrement and branch are not visible in this sampled excerpt — only   */
/* the memory traffic is shown.                                          */
403 movsd 2 * SIZE(A1, LDA, 1), a1
404 movhpd 3 * SIZE(A1, LDA, 1), a1
406 PREFETCH PREFETCHSIZE(A1)
413 movsd 0 * SIZE(A2), a2
414 movhpd 1 * SIZE(A2), a2
421 movsd 2 * SIZE(A2), a3
422 movhpd 3 * SIZE(A2), a3
/* Core2/Penryn/Dunnington skip the x-buffer prefetch (presumably their  */
/* hardware prefetchers cover this stream; guard shown, #endif not       */
/* visible here).                                                        */
424 #if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON)
425 PREFETCH PREFETCHSIZE(XX)
433 movsd 0 * SIZE(A2, LDA, 1), a1
434 movhpd 1 * SIZE(A2, LDA, 1), a1
441 movsd 2 * SIZE(A2, LDA, 1), a2
442 movhpd 3 * SIZE(A2, LDA, 1), a2
444 PREFETCH PREFETCHSIZE(A1, LDA, 1)
451 movsd 4 * SIZE(A1), a3
452 movhpd 5 * SIZE(A1), a3
455 movapd 4 * SIZE(XX), xtemp1
460 movsd 6 * SIZE(A1), a1
461 movhpd 7 * SIZE(A1), a1
464 movapd 6 * SIZE(XX), xtemp2
469 movsd 4 * SIZE(A1, LDA, 1), a2
470 movhpd 5 * SIZE(A1, LDA, 1), a2
/* Write back updated y[0..3] and reload y[4..7] for the next half of    */
/* the unrolled body (store low then high lane of each accumulator).     */
472 movsd yy1, 0 * SIZE(YY)
473 movhpd yy1, 1 * SIZE(YY)
474 movsd 4 * SIZE(YY), yy1
475 movhpd 5 * SIZE(YY), yy1
477 movsd yy2, 2 * SIZE(YY)
478 movhpd yy2, 3 * SIZE(YY)
479 movsd 6 * SIZE(YY), yy2
480 movhpd 7 * SIZE(YY), yy2
487 movsd 6 * SIZE(A1, LDA, 1), a3
488 movhpd 7 * SIZE(A1, LDA, 1), a3
490 PREFETCH PREFETCHSIZE(A2)
497 movsd 4 * SIZE(A2), a1
498 movhpd 5 * SIZE(A2), a1
505 movsd 6 * SIZE(A2), a2
506 movhpd 7 * SIZE(A2), a2
508 #if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON)
/* Write-intent prefetch on the y stream (it is stored to above/below).  */
509 PREFETCHW PREFETCHSIZE(YY)
517 movsd 4 * SIZE(A2, LDA, 1), a3
518 movhpd 5 * SIZE(A2, LDA, 1), a3
525 movsd 6 * SIZE(A2, LDA, 1), a1
526 movhpd 7 * SIZE(A2, LDA, 1), a1
528 PREFETCH PREFETCHSIZE(A2, LDA, 1)
/* Loads at offsets 8..11 rotate the pipeline into the next iteration.   */
535 movsd 10 * SIZE(A1), a2
536 movhpd 11 * SIZE(A1), a2
539 movapd 8 * SIZE(XX), xtemp1
544 movsd 8 * SIZE(A1, LDA, 1), a3
545 movhpd 9 * SIZE(A1, LDA, 1), a3
548 movapd 10 * SIZE(XX), xtemp2
553 movsd 8 * SIZE(A1), a1
554 movhpd 9 * SIZE(A1), a1
556 movsd yy1, 4 * SIZE(YY)
557 movhpd yy1, 5 * SIZE(YY)
558 movsd 8 * SIZE(YY), yy1
559 movhpd 9 * SIZE(YY), yy1
561 movsd yy2, 6 * SIZE(YY)
562 movhpd yy2, 7 * SIZE(YY)
563 movsd 10 * SIZE(YY), yy2
564 movhpd 11 * SIZE(YY), yy2
/* Remainder path of the 4-column panel: one 4-element pass over the     */
/* same four columns, without the deep unroll or prefetching.            */
/* NOTE(review): arithmetic lines and the surrounding label/branch       */
/* structure are not visible in this sampled excerpt.                    */
584 movsd 2 * SIZE(A1, LDA, 1), a1
585 movhpd 3 * SIZE(A1, LDA, 1), a1
592 movsd 0 * SIZE(A2), a2
593 movhpd 1 * SIZE(A2), a2
600 movsd 2 * SIZE(A2), a3
601 movhpd 3 * SIZE(A2), a3
608 movsd 0 * SIZE(A2, LDA, 1), a1
609 movhpd 1 * SIZE(A2, LDA, 1), a1
616 movsd 2 * SIZE(A2, LDA, 1), a2
617 movhpd 3 * SIZE(A2, LDA, 1), a2
626 movapd 4 * SIZE(XX), xtemp1
633 movapd 6 * SIZE(XX), xtemp2
/* Flush y[0..3], reload y[4..7] exactly as in the main loop body.       */
639 movsd yy1, 0 * SIZE(YY)
640 movhpd yy1, 1 * SIZE(YY)
641 movsd 4 * SIZE(YY), yy1
642 movhpd 5 * SIZE(YY), yy1
644 movsd yy2, 2 * SIZE(YY)
645 movhpd yy2, 3 * SIZE(YY)
646 movsd 6 * SIZE(YY), yy2
647 movhpd 7 * SIZE(YY), yy2
/* Diagonal (triangular corner) handling for the 4x4 block on the        */
/* symmetric diagonal, followed by the panel's final reduction and       */
/* writeback.  The repeated single-register loads into a1 below are      */
/* alternative per-row variants — presumably selected by #ifdefs (e.g.   */
/* LOWER vs upper storage) that are not visible in this sampled excerpt. */
/* Merge the high lanes of the broadcast x values pairwise.              */
656 unpckhpd atemp2, atemp1
657 unpckhpd atemp4, atemp3
/* Assemble mixed column/row element pairs of the diagonal block:        */
/* movsd = low lane, movhpd = high lane of a1.                           */
659 movsd 0 * SIZE(A1), a1
660 movhpd 0 * SIZE(A1, LDA, 1), a1
664 movsd 0 * SIZE(A1, LDA, 1), a1
665 movhpd 1 * SIZE(A1, LDA, 1), a1
669 movsd 0 * SIZE(A2), a1
670 movhpd 1 * SIZE(A2), a1
674 movsd 0 * SIZE(A2, LDA, 1), a1
675 movhpd 1 * SIZE(A2, LDA, 1), a1
679 movsd 0 * SIZE(A2), a1
680 movhpd 0 * SIZE(A2, LDA, 1), a1
684 movsd 1 * SIZE(A2), a1
685 movhpd 1 * SIZE(A2, LDA, 1), a1
689 movsd 2 * SIZE(A2), a1
690 movhpd 2 * SIZE(A2, LDA, 1), a1
694 movsd 2 * SIZE(A2, LDA, 1), a1
695 movhpd 3 * SIZE(A2, LDA, 1), a1
/* Horizontal combine of the four per-column dot-product accumulators:   */
/* low lanes into xsum1/xsum3, high lanes folded via atemp1/atemp3.      */
703 unpcklpd xsum2, xsum1
704 unpcklpd xsum4, xsum3
706 unpckhpd xsum2, atemp1
707 unpckhpd xsum4, atemp3
/* Store the four updated y elements of this panel back to the buffer.   */
719 movsd yy1, 0 * SIZE(YY)
720 movhpd yy1, 1 * SIZE(YY)
721 movsd yy2, 2 * SIZE(YY)
722 movhpd yy2, 3 * SIZE(YY)
/* 2-column remainder panel: same structure as the 4-column path but     */
/* for two columns (A1, A1+LDA) and two broadcast x values.              */
/* movddup (SSE3) vs. movsd+movhpd broadcast variants again appear side  */
/* by side — guard not visible in this sampled excerpt.                  */
742 movddup 0 * SIZE(NEW_X, IS, SIZE), atemp1
743 movddup 1 * SIZE(NEW_X, IS, SIZE), atemp2
745 movsd 0 * SIZE(NEW_X, IS, SIZE), atemp1
746 movhpd 0 * SIZE(NEW_X, IS, SIZE), atemp1
747 movsd 1 * SIZE(NEW_X, IS, SIZE), atemp2
748 movhpd 1 * SIZE(NEW_X, IS, SIZE), atemp2
/* Prime: first x pair, first y pair, first element pairs of the two     */
/* columns.                                                              */
754 movapd 0 * SIZE(NEW_X), xtemp1
756 movsd 0 * SIZE(NEW_Y), yy1
757 movhpd 1 * SIZE(NEW_Y), yy1
759 movsd 0 * SIZE(A1), a1
760 movhpd 1 * SIZE(A1), a1
761 movsd 0 * SIZE(A1, LDA, 1), a2
762 movhpd 1 * SIZE(A1, LDA, 1), a2
/* Inner-loop memory traffic (arithmetic not visible in this excerpt):   */
/* rotate in the next element pairs and stream y through the buffer.     */
778 movsd 2 * SIZE(A1), a1
779 movhpd 3 * SIZE(A1), a1
782 movapd 2 * SIZE(XX), xtemp1
787 movsd 2 * SIZE(A1, LDA, 1), a2
788 movhpd 3 * SIZE(A1, LDA, 1), a2
790 movsd yy1, 0 * SIZE(YY)
791 movhpd yy1, 1 * SIZE(YY)
792 movsd 2 * SIZE(YY), yy1
793 movhpd 3 * SIZE(YY), yy1
/* 2x2 diagonal block handling and final reduction, as in the 4-wide     */
/* panel: merge broadcast highs, load the triangular element pair,       */
/* combine accumulator lanes, store the two y results.                   */
804 unpckhpd atemp2, atemp1
806 movsd 0 * SIZE(A1), a1
807 movhpd 0 * SIZE(A1, LDA, 1), a1
811 movsd 0 * SIZE(A1, LDA, 1), a1
812 movhpd 1 * SIZE(A1, LDA, 1), a1
819 unpcklpd xsum2, xsum1
820 unpckhpd xsum2, atemp1
829 movsd yy1, 0 * SIZE(YY)
830 movhpd yy1, 1 * SIZE(YY)
/* Final 1-column remainder: fully scalar (movsd only, one double at a   */
/* time) over the last column.  Broadcast variants as above; arithmetic  */
/* and loop control not visible in this sampled excerpt.                 */
844 movddup 0 * SIZE(NEW_X, IS, SIZE), atemp1
846 movsd 0 * SIZE(NEW_X, IS, SIZE), atemp1
847 movhpd 0 * SIZE(NEW_X, IS, SIZE), atemp1
852 movsd 0 * SIZE(NEW_X), xtemp1
853 movsd 0 * SIZE(NEW_Y), yy1
854 movsd 0 * SIZE(A1), a1
/* Scalar loop body: next column element / x element, y write + reload.  */
870 movsd 1 * SIZE(A1), a1
872 movsd 1 * SIZE(XX), xtemp1
874 movsd yy1, 0 * SIZE(YY)
875 movsd 1 * SIZE(YY), yy1
/* Diagonal element + last y store of the column.                        */
886 movsd 0 * SIZE(A1), a1
892 movsd yy1, 0 * SIZE(YY)
/* Copy the finished result out of the contiguous NEW_Y buffer back to   */
/* the caller's strided Y: aligned packed loads, then per-element        */
/* movsd (low lane) / movhpd (high lane) stores.  NOTE(review): the      */
/* `addq INCY, Y` advances between the stores are not visible in this    */
/* sampled excerpt — all stores appear to hit offset 0 only because of   */
/* that.                                                                 */
905 movapd 0 * SIZE(NEW_Y), %xmm0
906 movapd 2 * SIZE(NEW_Y), %xmm1
907 movapd 4 * SIZE(NEW_Y), %xmm2
908 movapd 6 * SIZE(NEW_Y), %xmm3
910 movsd %xmm0, 0 * SIZE(Y)
912 movhpd %xmm0, 0 * SIZE(Y)
914 movsd %xmm1, 0 * SIZE(Y)
916 movhpd %xmm1, 0 * SIZE(Y)
918 movsd %xmm2, 0 * SIZE(Y)
920 movhpd %xmm2, 0 * SIZE(Y)
922 movsd %xmm3, 0 * SIZE(Y)
924 movhpd %xmm3, 0 * SIZE(Y)
/* Advance the buffer pointer past the 8 elements just copied.           */
927 addq $8 * SIZE, NEW_Y
/* Scalar tail: one element per pass.                                    */
939 movsd 0 * SIZE(NEW_Y), %xmm0
941 movsd %xmm0, 0 * SIZE(Y)
944 addq $1 * SIZE, NEW_Y
/* Epilogue fragment: restore xmm6-xmm15 from the slots written in the   */
/* prologue (L72-L82) and release the scratch frame.  The GP register    */
/* restores and the final `ret` are not visible in this sampled excerpt. */
962 movups 64(%rsp), %xmm6
963 movups 80(%rsp), %xmm7
964 movups 96(%rsp), %xmm8
965 movups 112(%rsp), %xmm9
966 movups 128(%rsp), %xmm10
967 movups 144(%rsp), %xmm11
968 movups 160(%rsp), %xmm12
969 movups 176(%rsp), %xmm13
970 movups 192(%rsp), %xmm14
971 movups 208(%rsp), %xmm15
974 addq $STACKSIZE, %rsp