1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin. */
3 /* All rights reserved. */
5 /* Redistribution and use in source and binary forms, with or */
6 /* without modification, are permitted provided that the following */
7 /* conditions are met: */
9 /* 1. Redistributions of source code must retain the above */
10 /* copyright notice, this list of conditions and the following */
13 /* 2. Redistributions in binary form must reproduce the above */
14 /* copyright notice, this list of conditions and the following */
15 /* disclaimer in the documentation and/or other materials */
16 /* provided with the distribution. */
18 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31 /* POSSIBILITY OF SUCH DAMAGE. */
33 /* The views and conclusions contained in the software and */
34 /* documentation are those of the authors and should not be */
35 /* interpreted as representing official policies, either expressed */
36 /* or implied, of The University of Texas at Austin. */
37 /*********************************************************************/
/*
 * Prefetch configuration.  Every group below defines the same three macros:
 *   PREFETCH     - instruction used for plain prefetches of A and XX
 *   PREFETCHW    - instruction used to prefetch YY, which the kernel also
 *                  stores to
 *   PREFETCHSIZE - byte displacement used as `PREFETCH PREFETCHSIZE(ptr)`
 * NOTE(review): only some of the selecting #if lines, and none of the
 * matching #endif lines, are visible in this excerpt.  Presumably each group
 * is chosen by a per-CPU conditional -- confirm against the full file.
 */
43 #define PREFETCH prefetcht0
44 #define PREFETCHW prefetcht0
45 #define PREFETCHSIZE (16 * 12)
/* Same values as the group above; its guard is not visible here. */
49 #define PREFETCH prefetcht0
50 #define PREFETCHW prefetcht0
51 #define PREFETCHSIZE (16 * 12)
/* PENRYN / DUNNINGTON targets. */
54 #if defined(PENRYN) || defined(DUNNINGTON)
55 #define PREFETCH prefetcht0
56 #define PREFETCHW prefetcht0
57 #define PREFETCHSIZE (16 * 12)
/* NEHALEM / SANDYBRIDGE targets. */
60 #if defined(NEHALEM) || defined(SANDYBRIDGE)
61 #define PREFETCH prefetcht0
62 #define PREFETCHW prefetcht0
63 #define PREFETCHSIZE (16 * 12)
/* Larger look-ahead distance (16 * 20 bytes); guard not visible here. */
67 #define PREFETCH prefetcht0
68 #define PREFETCHW prefetcht0
69 #define PREFETCHSIZE (16 * 20)
/*
 * This group and the next switch to the `prefetch` / `prefetchw` mnemonics,
 * so loads and stores get distinct prefetch instructions.  Guard for this
 * group is not visible here.
 */
73 #define PREFETCH prefetch
74 #define PREFETCHW prefetchw
75 #define PREFETCHSIZE (16 * 8)
/* BARCELONA / SHANGHAI / BOBCAT / BULLDOZER targets. */
79 #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
80 #define PREFETCH prefetch
81 #define PREFETCHW prefetchw
82 #define PREFETCHSIZE (16 * 16)
/* Only group whose distance is written as a multiple of 8 (8 * 24 bytes). */
86 #define PREFETCH prefetcht0
87 #define PREFETCHW prefetcht0
88 #define PREFETCHSIZE (8 * 24)
/* Guard not visible here. */
92 #define PREFETCH prefetcht0
93 #define PREFETCHW prefetcht0
94 #define PREFETCHSIZE (16 * 20)
/*
 * Locations of the stack-passed arguments, expressed relative to %rsp as it
 * stands after the prologue has subtracted STACKSIZE.  The macros expand
 * lazily, so referring to STACKSIZE before its #define is fine.
 *
 * Two layouts appear in this excerpt: a short one (offsets 8..24) and a
 * longer one (offsets 40..80) that also fetches LDA, X and INCX from the
 * stack and is paired with STACKSIZE 256.
 * NOTE(review): the conditional that chooses between them is not visible;
 * presumably the longer layout is for an ABI that passes fewer arguments in
 * registers (e.g. Windows x64) -- confirm against the full file.
 */
101 #define OLD_Y 8 + STACKSIZE(%rsp)
102 #define OLD_INCY 16 + STACKSIZE(%rsp)
103 #define OLD_BUFFER 24 + STACKSIZE(%rsp)
/* Frame size used with the second layout; it also has to hold the */
/* xmm6-xmm15 spill slots at 64..208(%rsp) written by the prologue. */
114 #define STACKSIZE 256
116 #define OLD_LDA 40 + STACKSIZE(%rsp)
117 #define OLD_X 48 + STACKSIZE(%rsp)
118 #define OLD_INCX 56 + STACKSIZE(%rsp)
119 #define OLD_Y 64 + STACKSIZE(%rsp)
120 #define OLD_INCY 72 + STACKSIZE(%rsp)
121 #define OLD_BUFFER 80 + STACKSIZE(%rsp)
/*
 * Prologue: reserve the local frame, spill xmm6-xmm15, fetch the stack
 * arguments and convert the strides to byte units.
 * NOTE(review): the entry label, the integer-register saves and the other
 * argument loads are not visible in this excerpt.
 */
171 subq $STACKSIZE, %rsp
/*
 * Save xmm6-xmm15 in the frame at offsets 64..208; they are reloaded from
 * the same slots just before the frame is released.  movups is used, so no
 * alignment of %rsp is required.  These registers are callee-saved in the
 * Microsoft x64 convention -- presumably this spill sits under a Windows
 * ABI guard that is not visible here; confirm.
 */
182 movups %xmm6, 64(%rsp)
183 movups %xmm7, 80(%rsp)
184 movups %xmm8, 96(%rsp)
185 movups %xmm9, 112(%rsp)
186 movups %xmm10, 128(%rsp)
187 movups %xmm11, 144(%rsp)
188 movups %xmm12, 160(%rsp)
189 movups %xmm13, 176(%rsp)
190 movups %xmm14, 192(%rsp)
191 movups %xmm15, 208(%rsp)
/* Work-buffer pointer comes from the caller's stack arguments. */
202 movq OLD_BUFFER, BUFFER
/* Multiply INCX, INCY and LDA by SIZE so that from here on they can be */
/* used directly as byte offsets (lea leaves the flags untouched). */
204 leaq (,INCX, SIZE), INCX
205 leaq (,INCY, SIZE), INCY
206 leaq (,LDA, SIZE), LDA
/*
 * Pack X into the contiguous, 16-byte-aligned work area addressed by XX.
 * NOTE(review): loop labels, branches, pointer updates and any use of ALPHA
 * on the loaded values are not visible in this excerpt.  Every load below
 * addresses 0 * SIZE(X), so X is presumably advanced (by INCX) between
 * consecutive loads -- confirm against the full file.
 */
/* Duplicate the low double of ALPHA into both lanes. */
211 unpcklpd ALPHA, ALPHA
/* Gather eight elements as four pairs: movsd fills the low lane (and */
/* zeroes the high lane), movhpd then fills the high lane. */
221 movsd 0 * SIZE(X), %xmm1
223 movhpd 0 * SIZE(X), %xmm1
225 movsd 0 * SIZE(X), %xmm2
227 movhpd 0 * SIZE(X), %xmm2
229 movsd 0 * SIZE(X), %xmm3
231 movhpd 0 * SIZE(X), %xmm3
233 movsd 0 * SIZE(X), %xmm4
235 movhpd 0 * SIZE(X), %xmm4
/* Store the four pairs back to back; movapd requires XX to be 16-byte */
/* aligned. */
243 movapd %xmm1, 0 * SIZE(XX)
244 movapd %xmm2, 2 * SIZE(XX)
245 movapd %xmm3, 4 * SIZE(XX)
246 movapd %xmm4, 6 * SIZE(XX)
/* Single-element path: one scalar load, low lane stored to XX. */
260 movsd 0 * SIZE(X), %xmm1
265 movlpd %xmm1, 0 * SIZE(XX)
273 /* now we don't need original X */
/*
 * Pack the caller's Y values (read through YY) into a contiguous,
 * 16-byte-aligned area addressed by XX; the XX pointer is reused here as
 * the destination cursor.
 * NOTE(review): the setup of YY/XX, the loop control and the pointer
 * updates are not visible.  All loads address 0 * SIZE(YY), so YY is
 * presumably advanced (by INCY) between consecutive loads -- confirm.
 */
/* Gather eight elements as four low/high pairs. */
291 movsd 0 * SIZE(YY), %xmm0
293 movhpd 0 * SIZE(YY), %xmm0
295 movsd 0 * SIZE(YY), %xmm1
297 movhpd 0 * SIZE(YY), %xmm1
299 movsd 0 * SIZE(YY), %xmm2
301 movhpd 0 * SIZE(YY), %xmm2
303 movsd 0 * SIZE(YY), %xmm3
305 movhpd 0 * SIZE(YY), %xmm3
/* Aligned stores of the four pairs. */
308 movapd %xmm0, 0 * SIZE(XX)
309 movapd %xmm1, 2 * SIZE(XX)
310 movapd %xmm2, 4 * SIZE(XX)
311 movapd %xmm3, 6 * SIZE(XX)
/* Single-element path. */
325 movsd 0 * SIZE(YY), %xmm0
328 movsd %xmm0, 0 * SIZE(XX)
/*
 * Four-column block of the kernel.
 * NOTE(review): this excerpt shows only the loads, stores, shuffles and
 * prefetches.  The multiplies/adds, loop counters, labels, branches, the
 * definitions of A1/A2 and the #endif lines are not visible, so the comments
 * below describe data movement only.
 */
/* Advance A by 4 * LDA + 4 * SIZE bytes (LDA is already in bytes); */
/* presumably four columns over and four rows down -- confirm. */
345 leaq 4 * SIZE(A, LDA, 4), A
/* XX -> NEW_X[IS]; YY -> NEW_Y[IS + 4], i.e. four elements past IS. */
347 leaq (NEW_X, IS, SIZE), XX
348 leaq 4 * SIZE(NEW_Y, IS, SIZE), YY
/* Four packed x values for this block (aligned loads). */
350 movapd 0 * SIZE(XX), atemp2
351 movapd 2 * SIZE(XX), atemp4
/* Seed xsum1..xsum4 with element pairs: low lane from column A1, high */
/* lane from A1 (xsum1) or from the column at A1 + LDA (xsum2..xsum4). */
353 movsd 0 * SIZE(A1), xsum1
354 movhpd 1 * SIZE(A1), xsum1
357 movsd 1 * SIZE(A1), xsum2
358 movhpd 1 * SIZE(A1, LDA, 1), xsum2
361 movsd 2 * SIZE(A1), xsum3
362 movhpd 2 * SIZE(A1, LDA, 1), xsum3
365 movsd 3 * SIZE(A1), xsum4
366 movhpd 3 * SIZE(A1, LDA, 1), xsum4
/* Rows 2..3 of the four columns, loaded in turn into a1.  Each load */
/* overwrites the previous one, so the consumers sit on lines not shown. */
369 movsd 2 * SIZE(A1), a1
370 movhpd 3 * SIZE(A1), a1
374 movsd 2 * SIZE(A1, LDA, 1), a1
375 movhpd 3 * SIZE(A1, LDA, 1), a1
379 movsd 2 * SIZE(A2), a1
380 movhpd 3 * SIZE(A2), a1
384 movsd 3 * SIZE(A2), a1
385 movhpd 3 * SIZE(A2, LDA, 1), a1
/* Preload the operands for the rows that follow the 4x4 block: x[4..7], */
/* the first A pairs and y[0..3] relative to YY. */
389 movapd 4 * SIZE(XX), xtemp1
390 movapd 6 * SIZE(XX), xtemp2
392 movsd 4 * SIZE(A1), a1
393 movhpd 5 * SIZE(A1), a1
394 movsd 6 * SIZE(A1), a2
395 movhpd 7 * SIZE(A1), a2
396 movsd 4 * SIZE(A1, LDA, 1), a3
397 movhpd 5 * SIZE(A1, LDA, 1), a3
399 movsd 0 * SIZE(YY), yy1
400 movhpd 1 * SIZE(YY), yy1
401 movsd 2 * SIZE(YY), yy2
402 movhpd 3 * SIZE(YY), yy2
/*
 * Broadcast the four x values: afterwards atemp1/atemp3 hold the low lane
 * of the original atemp2/atemp4 in both lanes, and atemp2/atemp4 hold the
 * high lane in both lanes.  Two equivalent sequences are shown, one using
 * movapd + unpcklpd and one using movddup.
 * NOTE(review): presumably they are alternatives selected by a preprocessor
 * conditional (SSE3 availability) that is not visible -- confirm.
 */
405 movapd atemp2, atemp1
406 unpcklpd atemp1, atemp1
407 unpckhpd atemp2, atemp2
408 movapd atemp4, atemp3
409 unpcklpd atemp3, atemp3
410 unpckhpd atemp4, atemp4
412 movddup atemp2, atemp1
413 unpckhpd atemp2, atemp2
414 movddup atemp4, atemp3
415 unpckhpd atemp4, atemp4
/*
 * Unrolled main loop body.  The visible stores cover YY[0..7] and the
 * loads run ahead to offsets 8..11, so each pass presumably handles eight
 * rows with operands for the next step fetched early -- confirm.  a1/a2/a3
 * rotate through the four columns A1, A1+LDA, A2, A2+LDA.
 */
435 movsd 2 * SIZE(A1, LDA, 1), a1
436 movhpd 3 * SIZE(A1, LDA, 1), a1
438 PREFETCH PREFETCHSIZE(A1)
445 movsd 0 * SIZE(A2), a2
446 movhpd 1 * SIZE(A2), a2
453 movsd 2 * SIZE(A2), a3
454 movhpd 3 * SIZE(A2), a3
/* The XX prefetch is compiled out on CORE2 / PENRYN / DUNNINGTON. */
456 #if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON)
457 PREFETCH PREFETCHSIZE(XX)
465 movsd 0 * SIZE(A2, LDA, 1), a1
466 movhpd 1 * SIZE(A2, LDA, 1), a1
473 movsd 2 * SIZE(A2, LDA, 1), a2
474 movhpd 3 * SIZE(A2, LDA, 1), a2
476 PREFETCH PREFETCHSIZE(A1, LDA, 1)
483 movsd 4 * SIZE(A1), a3
484 movhpd 5 * SIZE(A1), a3
487 movapd 4 * SIZE(XX), xtemp1
492 movsd 6 * SIZE(A1), a1
493 movhpd 7 * SIZE(A1), a1
496 movapd 6 * SIZE(XX), xtemp2
501 movsd 4 * SIZE(A1, LDA, 1), a2
502 movhpd 5 * SIZE(A1, LDA, 1), a2
/* Write back y[0..3] and immediately fetch y[4..7] into the same */
/* registers.  Split movsd/movhpd accesses place no alignment */
/* requirement on YY. */
504 movsd yy1, 0 * SIZE(YY)
505 movhpd yy1, 1 * SIZE(YY)
506 movsd 4 * SIZE(YY), yy1
507 movhpd 5 * SIZE(YY), yy1
509 movsd yy2, 2 * SIZE(YY)
510 movhpd yy2, 3 * SIZE(YY)
511 movsd 6 * SIZE(YY), yy2
512 movhpd 7 * SIZE(YY), yy2
/* Second half of the unrolled body: rows 4..7. */
519 movsd 6 * SIZE(A1, LDA, 1), a3
520 movhpd 7 * SIZE(A1, LDA, 1), a3
522 PREFETCH PREFETCHSIZE(A2)
529 movsd 4 * SIZE(A2), a1
530 movhpd 5 * SIZE(A2), a1
537 movsd 6 * SIZE(A2), a2
538 movhpd 7 * SIZE(A2), a2
/* The YY prefetch is likewise compiled out on CORE2 / PENRYN / */
/* DUNNINGTON. */
540 #if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON)
541 PREFETCHW PREFETCHSIZE(YY)
549 movsd 4 * SIZE(A2, LDA, 1), a3
550 movhpd 5 * SIZE(A2, LDA, 1), a3
557 movsd 6 * SIZE(A2, LDA, 1), a1
558 movhpd 7 * SIZE(A2, LDA, 1), a1
560 PREFETCH PREFETCHSIZE(A2, LDA, 1)
/* Loads at offsets 8..11 fetch operands beyond the eight rows stored in */
/* this pass. */
567 movsd 10 * SIZE(A1), a2
568 movhpd 11 * SIZE(A1), a2
571 movapd 8 * SIZE(XX), xtemp1
576 movsd 8 * SIZE(A1, LDA, 1), a3
577 movhpd 9 * SIZE(A1, LDA, 1), a3
580 movapd 10 * SIZE(XX), xtemp2
585 movsd 8 * SIZE(A1), a1
586 movhpd 9 * SIZE(A1), a1
/* Write back y[4..7], fetch y[8..11]. */
588 movsd yy1, 4 * SIZE(YY)
589 movhpd yy1, 5 * SIZE(YY)
590 movsd 8 * SIZE(YY), yy1
591 movhpd 9 * SIZE(YY), yy1
593 movsd yy2, 6 * SIZE(YY)
594 movhpd yy2, 7 * SIZE(YY)
595 movsd 10 * SIZE(YY), yy2
596 movhpd 11 * SIZE(YY), yy2
/* Four-row step: same column rotation as above, without prefetches; */
/* stores y[0..3] and fetches y[4..7].  Presumably the remainder path */
/* when four rows are left -- the selecting test is not visible. */
619 movsd 2 * SIZE(A1, LDA, 1), a1
620 movhpd 3 * SIZE(A1, LDA, 1), a1
627 movsd 0 * SIZE(A2), a2
628 movhpd 1 * SIZE(A2), a2
635 movsd 2 * SIZE(A2), a3
636 movhpd 3 * SIZE(A2), a3
643 movsd 0 * SIZE(A2, LDA, 1), a1
644 movhpd 1 * SIZE(A2, LDA, 1), a1
651 movsd 2 * SIZE(A2, LDA, 1), a2
652 movhpd 3 * SIZE(A2, LDA, 1), a2
659 movsd 4 * SIZE(A1, LDA, 1), a3
660 movhpd 5 * SIZE(A1, LDA, 1), a3
663 movapd 4 * SIZE(XX), xtemp1
668 movsd 4 * SIZE(A1), a1
669 movhpd 5 * SIZE(A1), a1
672 movapd 6 * SIZE(XX), xtemp2
677 movsd 6 * SIZE(A1), a2
678 movhpd 7 * SIZE(A1), a2
680 movsd yy1, 0 * SIZE(YY)
681 movhpd yy1, 1 * SIZE(YY)
682 movsd 4 * SIZE(YY), yy1
683 movhpd 5 * SIZE(YY), yy1
685 movsd yy2, 2 * SIZE(YY)
686 movhpd yy2, 3 * SIZE(YY)
687 movsd 6 * SIZE(YY), yy2
688 movhpd 7 * SIZE(YY), yy2
/* Two-row step: one pair from each of the remaining three columns, */
/* y[0..1] stored, then scalar operands for the following row fetched. */
705 movsd 0 * SIZE(A1, LDA, 1), a1
706 movhpd 1 * SIZE(A1, LDA, 1), a1
713 movsd 0 * SIZE(A2), a1
714 movhpd 1 * SIZE(A2), a1
721 movsd 0 * SIZE(A2, LDA, 1), a1
722 movhpd 1 * SIZE(A2, LDA, 1), a1
725 movapd 2 * SIZE(XX), xtemp1
730 movsd 2 * SIZE(A1), a1
732 movsd yy1, 0 * SIZE(YY)
733 movhpd yy1, 1 * SIZE(YY)
734 movsd 2 * SIZE(YY), yy1
/* One-row step: scalar loads from the three remaining columns, single */
/* y element stored. */
751 movsd 0 * SIZE(A1, LDA, 1), a1
758 movsd 0 * SIZE(A2), a1
765 movsd 0 * SIZE(A2, LDA, 1), a1
773 movsd yy1, 0 * SIZE(YY)
/*
 * Lane shuffle of the four partial sums.  unpcklpd leaves
 * {xsum1.lo, xsum2.lo} in xsum1 and {xsum3.lo, xsum4.lo} in xsum3;
 * unpckhpd places the xsum2/xsum4 high lanes into the high lanes of
 * atemp1/atemp3.  Presumably atemp1/atemp3 hold copies of xsum1/xsum3 and
 * the halves are then added to finish a horizontal reduction -- the copies
 * and adds are not visible; confirm.
 */
781 unpcklpd xsum2, xsum1
782 unpcklpd xsum4, xsum3
784 unpckhpd xsum2, atemp1
785 unpckhpd xsum4, atemp3
/* Read-modify-write of NEW_Y[IS .. IS+3]; the update applied between */
/* the loads and the stores is not visible. */
794 movsd 0 * SIZE(NEW_Y, IS, SIZE), yy1
795 movhpd 1 * SIZE(NEW_Y, IS, SIZE), yy1
796 movsd 2 * SIZE(NEW_Y, IS, SIZE), yy2
797 movhpd 3 * SIZE(NEW_Y, IS, SIZE), yy2
802 movsd yy1, 0 * SIZE(NEW_Y, IS, SIZE)
803 movhpd yy1, 1 * SIZE(NEW_Y, IS, SIZE)
804 movsd yy2, 2 * SIZE(NEW_Y, IS, SIZE)
805 movhpd yy2, 3 * SIZE(NEW_Y, IS, SIZE)
/*
 * Two-column block.
 * NOTE(review): the test that selects this path, the arithmetic and the
 * labels are not visible in this excerpt; comments describe data movement
 * only.
 */
/* Advance A by 2 * LDA + 2 * SIZE bytes; presumably two columns over */
/* and two rows down -- confirm. */
820 leaq 2 * SIZE(A, LDA, 2), A
/* Two packed x values, NEW_X[IS .. IS+1] (aligned load). */
822 movapd 0 * SIZE(NEW_X, IS, SIZE), atemp2
/* Seed the two partial sums: xsum1 from rows 0..1 of column A1, xsum2 */
/* from row 1 of A1 and row 1 of the column at A1 + LDA. */
824 movsd 0 * SIZE(A1), xsum1
825 movhpd 1 * SIZE(A1), xsum1
828 movsd 1 * SIZE(A1), xsum2
829 movhpd 1 * SIZE(A1, LDA, 1), xsum2
/* Broadcast: atemp1 = low lane of atemp2 in both lanes, atemp2 = its */
/* high lane in both lanes.  Both the movapd + unpcklpd form and the */
/* movddup form appear; presumably alternatives under a conditional that */
/* is not visible -- confirm. */
833 movapd atemp2, atemp1
834 unpcklpd atemp1, atemp1
836 movddup atemp2, atemp1
838 unpckhpd atemp2, atemp2
/* Scalar operands at index 2: the row directly after the 2x2 block in */
/* both columns, with the matching x and y elements. */
843 movsd 2 * SIZE(A1), a1
844 movsd 2 * SIZE(A1, LDA, 1), a2
845 movsd 2 * SIZE(NEW_X, IS, SIZE), xtemp1
846 movsd 2 * SIZE(NEW_Y, IS, SIZE), yy1
860 movsd yy1, 2 * SIZE(NEW_Y, IS, SIZE)
/* Lane shuffle of the two partial sums: the low lanes are gathered into */
/* xsum1, the xsum2 high lane goes into the high lane of atemp1. */
/* Presumably followed by an add; not visible. */
866 unpcklpd xsum2, xsum1
867 unpckhpd xsum2, atemp1
/* Read-modify-write of NEW_Y[IS .. IS+1]. */
873 movsd 0 * SIZE(NEW_Y, IS, SIZE), yy1
874 movhpd 1 * SIZE(NEW_Y, IS, SIZE), yy1
878 movsd yy1, 0 * SIZE(NEW_Y, IS, SIZE)
879 movhpd yy1, 1 * SIZE(NEW_Y, IS, SIZE)
/*
 * Single-element case: load one element of A, NEW_X[IS] and NEW_Y[IS],
 * then store NEW_Y[IS] back.
 * NOTE(review): the arithmetic between the loads and the store, and the
 * test that selects this path, are not visible in this excerpt.
 */
888 movsd 0 * SIZE(A), xsum1
889 movsd 0 * SIZE(NEW_X, IS, SIZE), atemp1
890 movsd 0 * SIZE(NEW_Y, IS, SIZE), yy1
894 movsd yy1, 0 * SIZE(NEW_Y, IS, SIZE)
/*
 * Copy the contiguous result in NEW_Y back out to Y.
 * NOTE(review): loop labels, counters and the updates of Y are not visible.
 * Every store addresses 0 * SIZE(Y), so Y is presumably advanced (by INCY)
 * between consecutive stores -- confirm against the full file.
 */
/* Eight elements per pass: aligned loads from NEW_Y ... */
907 movapd 0 * SIZE(NEW_Y), %xmm0
908 movapd 2 * SIZE(NEW_Y), %xmm1
909 movapd 4 * SIZE(NEW_Y), %xmm2
910 movapd 6 * SIZE(NEW_Y), %xmm3
/* ... written out one element at a time: low lane via movsd, high lane */
/* via movhpd. */
912 movsd %xmm0, 0 * SIZE(Y)
914 movhpd %xmm0, 0 * SIZE(Y)
916 movsd %xmm1, 0 * SIZE(Y)
918 movhpd %xmm1, 0 * SIZE(Y)
920 movsd %xmm2, 0 * SIZE(Y)
922 movhpd %xmm2, 0 * SIZE(Y)
924 movsd %xmm3, 0 * SIZE(Y)
926 movhpd %xmm3, 0 * SIZE(Y)
/* Step the source past the eight elements just copied. */
929 addq $8 * SIZE, NEW_Y
/* Remainder: one element per pass. */
941 movsd 0 * SIZE(NEW_Y), %xmm0
943 movsd %xmm0, 0 * SIZE(Y)
946 addq $1 * SIZE, NEW_Y
/*
 * Epilogue: reload xmm6-xmm15 from the frame slots (64..208) that the
 * prologue filled, then release the STACKSIZE-byte frame.
 * NOTE(review): the integer-register restores and the final ret are not
 * visible in this excerpt.
 */
964 movups 64(%rsp), %xmm6
965 movups 80(%rsp), %xmm7
966 movups 96(%rsp), %xmm8
967 movups 112(%rsp), %xmm9
968 movups 128(%rsp), %xmm10
969 movups 144(%rsp), %xmm11
970 movups 160(%rsp), %xmm12
971 movups 176(%rsp), %xmm13
972 movups 192(%rsp), %xmm14
973 movups 208(%rsp), %xmm15
/* Must match the subq $STACKSIZE in the prologue. */
976 addq $STACKSIZE, %rsp