1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin. */
3 /* All rights reserved. */
5 /* Redistribution and use in source and binary forms, with or */
6 /* without modification, are permitted provided that the following */
7 /* conditions are met: */
9 /* 1. Redistributions of source code must retain the above */
10 /* copyright notice, this list of conditions and the following */
11 /* disclaimer. */
13 /* 2. Redistributions in binary form must reproduce the above */
14 /* copyright notice, this list of conditions and the following */
15 /* disclaimer in the documentation and/or other materials */
16 /* provided with the distribution. */
18 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31 /* POSSIBILITY OF SUCH DAMAGE. */
33 /* The views and conclusions contained in the software and */
34 /* documentation are those of the authors and should not be */
35 /* interpreted as representing official policies, either expressed */
36 /* or implied, of The University of Texas at Austin. */
37 /*********************************************************************/
/* Per-target prefetch configuration.                                  */
/* Each group below defines the same three macros:                     */
/*   PREFETCH     - mnemonic used for the prefetches of A1, A2 and XX  */
/*   PREFETCHW    - mnemonic used for the prefetch of YY               */
/*   PREFETCHSIZE - displacement added to the base register in every   */
/*                  prefetch operand, i.e. PREFETCHSIZE(reg)           */
/* NOTE(review): only some of the preprocessor guards that select      */
/* between these groups are visible in this view; the groups are       */
/* presumably mutually exclusive, one per CPU target - confirm.        */
43 #define PREFETCH prefetcht0
44 #define PREFETCHW prefetcht0
45 #define PREFETCHSIZE (16 * 24)
49 #define PREFETCH prefetcht0
50 #define PREFETCHW prefetcht0
51 #define PREFETCHSIZE (16 * 24)
54 #if defined(PENRYN) || defined(DUNNINGTON)
55 #define PREFETCH prefetcht0
56 #define PREFETCHW prefetcht0
57 #define PREFETCHSIZE (16 * 24)
60 #if defined(NEHALEM) || defined(SANDYBRIDGE)
61 #define PREFETCH prefetcht0
62 #define PREFETCHW prefetcht0
63 #define PREFETCHSIZE (16 * 24)
67 #define PREFETCH prefetcht0
68 #define PREFETCHW prefetcht0
69 #define PREFETCHSIZE (16 * 28)
/* The next two groups use the prefetch / prefetchw mnemonics instead  */
/* of prefetcht0, with shorter look-ahead distances.                   */
73 #define PREFETCH prefetch
74 #define PREFETCHW prefetchw
75 #define PREFETCHSIZE (16 * 12)
79 #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
80 #define PREFETCH prefetch
81 #define PREFETCHW prefetchw
82 #define PREFETCHSIZE (16 * 16)
86 #define PREFETCH prefetcht0
87 #define PREFETCHW prefetcht0
88 #define PREFETCHSIZE (8 * 24)
92 #define PREFETCH prefetcht0
93 #define PREFETCHW prefetcht0
94 #define PREFETCHSIZE (16 * 28)
/* Locations of the arguments that are passed on the stack.            */
/* Every OLD_* macro is an %rsp-relative operand that includes         */
/* STACKSIZE, so it is meant to be used after the prologue has         */
/* subtracted STACKSIZE from %rsp.                                     */
/* Two alternative layouts are defined. In the first only Y, INCY and  */
/* BUFFER are read from the stack; in the second A, LDA, X and INCX    */
/* are read from the stack as well.                                    */
/* NOTE(review): the #if/#else that selects between the two layouts    */
/* is not visible here - presumably non-Windows vs. Windows ABI;       */
/* confirm. The STACKSIZE value for the first layout is also not       */
/* visible.                                                            */
101 #define OLD_Y 8 + STACKSIZE(%rsp)
102 #define OLD_INCY 16 + STACKSIZE(%rsp)
103 #define OLD_BUFFER 24 + STACKSIZE(%rsp)
/* Second layout: 256-byte frame; stack arguments start at offset 40.  */
114 #define STACKSIZE 256
116 #define OLD_A 40 + STACKSIZE(%rsp)
117 #define OLD_LDA 48 + STACKSIZE(%rsp)
118 #define OLD_X 56 + STACKSIZE(%rsp)
119 #define OLD_INCX 64 + STACKSIZE(%rsp)
120 #define OLD_Y 72 + STACKSIZE(%rsp)
121 #define OLD_INCY 80 + STACKSIZE(%rsp)
122 #define OLD_BUFFER 88 + STACKSIZE(%rsp)
/* Registers that hold the scalar alpha: real part in %xmm0, imaginary */
/* part in %xmm1.                                                      */
146 #define ALPHA_R %xmm0
147 #define ALPHA_I %xmm1
/* MOVDDUP(a, b, c):  load the double at a(b) and duplicate it into    */
/*                    both halves of xmm register c.                   */
/* MOVDDUP2(a, b, c): same, but a and b are token-pasted (a##b) to     */
/*                    form the memory operand.                         */
/* First variant: a single movddup instruction, used when the guard    */
/* below is true. Second variant: the same result built from           */
/* movlpd (low half) + movhpd (high half) reading the same address.    */
/* NOTE(review): the #else/#endif lines between the two variants are   */
/* not visible in this view.                                           */
169 #if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
170 #define MOVDDUP(a, b, c) movddup a(b), c
171 #define MOVDDUP2(a, b, c) movddup a##b, c
173 #define MOVDDUP(a, b, c) movlpd a(b), c;movhpd a(b), c
174 #define MOVDDUP2(a, b, c) movlpd a##b, c;movhpd a##b, c
/* Prologue: reserve the local frame.                                  */
186 subq $STACKSIZE, %rsp
/* Spill xmm6-xmm15 into the frame at offsets 64..208; they are        */
/* reloaded from the same slots before the frame is released.          */
/* movups is used, so no 16-byte alignment of %rsp is required here.   */
/* NOTE(review): presumably guarded by a Windows-ABI #ifdef (these     */
/* registers are callee-saved there) - the guard is not visible.       */
197 movups %xmm6, 64(%rsp)
198 movups %xmm7, 80(%rsp)
199 movups %xmm8, 96(%rsp)
200 movups %xmm9, 112(%rsp)
201 movups %xmm10, 128(%rsp)
202 movups %xmm11, 144(%rsp)
203 movups %xmm12, 160(%rsp)
204 movups %xmm13, 176(%rsp)
205 movups %xmm14, 192(%rsp)
206 movups %xmm15, 208(%rsp)
/* Fetch the work-buffer pointer from its stack argument slot.         */
219 movq OLD_BUFFER, BUFFER
/* Scale the two vector increments and the leading dimension by        */
/* 2^ZBASE_SHIFT.                                                      */
/* NOTE(review): presumably converts element counts into byte          */
/* strides; ZBASE_SHIFT is defined outside this file - confirm.        */
221 salq $ZBASE_SHIFT, INCX
222 salq $ZBASE_SHIFT, INCY
223 salq $ZBASE_SHIFT, LDA
/* xmm2 = { low(xmm2), low(xmm3) }                                     */
238 unpcklpd %xmm3, %xmm2
/* Pack alpha into both orderings:                                     */
/*   ALPHA_R = { alpha_r, alpha_i }                                    */
/*   ALPHA_I = { alpha_i, alpha_r }                                    */
240 unpcklpd ALPHA_I, ALPHA_R
241 unpcklpd ALPHA_R, ALPHA_I
/* Stage X into the work area addressed by XX.                         */
/* For each source element, the value at 0 * SIZE(X) and the value at  */
/* 1 * SIZE(X) are each broadcast into a full xmm register. One 16-byte */
/* store is then made to XX, SHUFPD_1 is applied to the same register, */
/* and a second 16-byte store follows, so every source element takes   */
/* 4 * SIZE in the buffer.                                             */
/* NOTE(review): the instructions that advance X, combine the          */
/* broadcast values with alpha and advance XX are not visible in this  */
/* view; SHUFPD_1 is a macro defined elsewhere (presumably swaps the   */
/* two halves) - confirm.                                              */
/* First pair of elements.                                             */
252 MOVDDUP(0 * SIZE, X, %xmm3)
253 MOVDDUP(1 * SIZE, X, %xmm4)
255 MOVDDUP(0 * SIZE, X, %xmm5)
256 MOVDDUP(1 * SIZE, X, %xmm6)
267 movapd %xmm3, 0 * SIZE(XX)
268 SHUFPD_1 %xmm3, %xmm3
270 movapd %xmm3, 2 * SIZE(XX)
272 movapd %xmm5, 4 * SIZE(XX)
273 SHUFPD_1 %xmm5, %xmm5
275 movapd %xmm5, 6 * SIZE(XX)
/* Second pair of elements, stored at buffer offsets 8..14.            */
277 MOVDDUP(0 * SIZE, X, %xmm3)
278 MOVDDUP(1 * SIZE, X, %xmm4)
280 MOVDDUP(0 * SIZE, X, %xmm5)
281 MOVDDUP(1 * SIZE, X, %xmm6)
292 movapd %xmm3, 8 * SIZE(XX)
293 SHUFPD_1 %xmm3, %xmm3
295 movapd %xmm3, 10 * SIZE(XX)
297 movapd %xmm5, 12 * SIZE(XX)
298 SHUFPD_1 %xmm5, %xmm5
300 movapd %xmm5, 14 * SIZE(XX)
/* Same staging for a single element (presumably the remainder loop).  */
314 MOVDDUP(0 * SIZE, X, %xmm3)
315 MOVDDUP(1 * SIZE, X, %xmm4)
323 movapd %xmm3, 0 * SIZE(XX)
324 SHUFPD_1 %xmm3, %xmm3
326 movapd %xmm3, 2 * SIZE(XX)
334 /* now we don't need original X */
/* Stage Y into the work area addressed by XX.                         */
/* Each element is gathered as two doubles (movsd fills the low half,  */
/* movhpd the high half), which places no alignment requirement on     */
/* YY. Four elements are gathered into xmm0-xmm3 and then written with */
/* aligned 16-byte stores at 0, 2, 4 and 6 * SIZE(XX).                 */
/* NOTE(review): every load below uses offsets 0 and 1 from YY, so YY  */
/* is presumably advanced between loads by instructions that are not   */
/* visible in this view - confirm.                                     */
352 movsd 0 * SIZE(YY), %xmm0
353 movhpd 1 * SIZE(YY), %xmm0
355 movsd 0 * SIZE(YY), %xmm1
356 movhpd 1 * SIZE(YY), %xmm1
358 movsd 0 * SIZE(YY), %xmm2
359 movhpd 1 * SIZE(YY), %xmm2
361 movsd 0 * SIZE(YY), %xmm3
362 movhpd 1 * SIZE(YY), %xmm3
365 movapd %xmm0, 0 * SIZE(XX)
366 movapd %xmm1, 2 * SIZE(XX)
367 movapd %xmm2, 4 * SIZE(XX)
368 movapd %xmm3, 6 * SIZE(XX)
/* Same gather/store for one element at a time (presumably the         */
/* remainder loop).                                                    */
382 movsd 0 * SIZE(YY), %xmm0
383 movhpd 1 * SIZE(YY), %xmm0
386 movapd %xmm0, 0 * SIZE(XX)
/* Two-column kernel: works on the columns addressed by A1 and A2.     */
/* NOTE(review): only the loads, stores and prefetches are visible in  */
/* this view; the multiply/add instructions, pointer updates, loop     */
/* labels and branches between them are not. The comments below        */
/* describe data movement only.                                        */
/* atemp1..atemp4 <- 8 doubles of the staged X starting at index I.    */
407 movapd 0 * SIZE(NEW_X, I, SIZE), atemp1
408 movapd 2 * SIZE(NEW_X, I, SIZE), atemp2
409 movapd 4 * SIZE(NEW_X, I, SIZE), atemp3
410 movapd 6 * SIZE(NEW_X, I, SIZE), atemp4
/* yy1, yy2 <- first two elements of the staged Y (gathered half by    */
/* half, so no alignment is assumed for this access).                  */
415 movsd 0 * SIZE(NEW_Y), yy1
416 movhpd 1 * SIZE(NEW_Y), yy1
417 movsd 2 * SIZE(NEW_Y), yy2
418 movhpd 3 * SIZE(NEW_Y), yy2
/* xtemp1..xtemp4 <- first 8 doubles of the staged X (aligned loads).  */
420 movapd 0 * SIZE(NEW_X), xtemp1
421 movapd 2 * SIZE(NEW_X), xtemp2
422 movapd 4 * SIZE(NEW_X), xtemp3
423 movapd 6 * SIZE(NEW_X), xtemp4
/* Preload broadcast matrix values for the first step.                 */
425 MOVDDUP(0 * SIZE, A1, a1)
426 MOVDDUP(2 * SIZE, A2, a2)
427 MOVDDUP(1 * SIZE, A1, a3)
/* Unrolled body: matrix values from A1/A2 are broadcast into a1..a3   */
/* in rotation, the next xtemp values are fetched from XX, and yy1/yy2 */
/* are written back to YY and reloaded with the following elements.    */
/* A1, XX, A2 and YY are each prefetched PREFETCHSIZE ahead.           */
443 MOVDDUP(3 * SIZE, A2, a1)
445 PREFETCH PREFETCHSIZE(A1)
452 MOVDDUP(2 * SIZE, A1, a2)
459 MOVDDUP(0 * SIZE, A2, a3)
466 MOVDDUP(3 * SIZE, A1, a1)
468 PREFETCH PREFETCHSIZE(XX)
471 movapd 12 * SIZE(XX), xtemp3
476 MOVDDUP(1 * SIZE, A2, a2)
479 movapd 8 * SIZE(XX), xtemp1
484 MOVDDUP(4 * SIZE, A1, a3)
487 movapd 14 * SIZE(XX), xtemp4
492 MOVDDUP(6 * SIZE, A2, a1)
/* Store Y element 1 and start loading Y element 3 into the same reg.  */
494 movlpd yy2, 2 * SIZE(YY)
495 movhpd yy2, 3 * SIZE(YY)
496 movsd 6 * SIZE(YY), yy2
497 movhpd 7 * SIZE(YY), yy2
500 movapd 10 * SIZE(XX), xtemp2
505 MOVDDUP(5 * SIZE, A1, a2)
507 PREFETCH PREFETCHSIZE(A2)
/* Store Y element 0 and start loading Y element 2 into the same reg.  */
509 movlpd yy1, 0 * SIZE(YY)
510 movhpd yy1, 1 * SIZE(YY)
511 movsd 4 * SIZE(YY), yy1
512 movhpd 5 * SIZE(YY), yy1
519 MOVDDUP(7 * SIZE, A2, a3)
526 MOVDDUP(6 * SIZE, A1, a1)
533 MOVDDUP(4 * SIZE, A2, a2)
535 PREFETCHW PREFETCHSIZE(YY)
542 MOVDDUP(7 * SIZE, A1, a3)
545 movapd 20 * SIZE(XX), xtemp3
550 MOVDDUP(5 * SIZE, A2, a1)
553 movapd 16 * SIZE(XX), xtemp1
558 MOVDDUP(10 * SIZE, A2, a2)
561 movapd 22 * SIZE(XX), xtemp4
566 MOVDDUP( 9 * SIZE, A1, a3)
/* Store Y element 3; reload yy2 with Y element 5.                     */
568 movlpd yy2, 6 * SIZE(YY)
569 movhpd yy2, 7 * SIZE(YY)
570 movsd 10 * SIZE(YY), yy2
571 movhpd 11 * SIZE(YY), yy2
574 movapd 18 * SIZE(XX), xtemp2
579 MOVDDUP( 8 * SIZE, A1, a1)
/* Store Y element 2; reload yy1 with Y element 4.                     */
581 movlpd yy1, 4 * SIZE(YY)
582 movhpd yy1, 5 * SIZE(YY)
583 movsd 8 * SIZE(YY), yy1
584 movhpd 9 * SIZE(YY), yy1
/* Shorter variant of the same pattern without prefetches              */
/* (presumably a two-element remainder step - confirm).                */
604 MOVDDUP(1 * SIZE, A1, a1)
611 MOVDDUP(3 * SIZE, A2, a2)
618 MOVDDUP(2 * SIZE, A1, a1)
625 MOVDDUP(0 * SIZE, A2, a2)
632 MOVDDUP(3 * SIZE, A1, a1)
639 MOVDDUP(1 * SIZE, A2, a2)
647 movlpd yy2, 2 * SIZE(YY)
648 movhpd yy2, 3 * SIZE(YY)
649 movsd 6 * SIZE(YY), yy2
650 movhpd 7 * SIZE(YY), yy2
658 movlpd yy1, 0 * SIZE(YY)
659 movhpd yy1, 1 * SIZE(YY)
660 movsd 4 * SIZE(YY), yy1
661 movhpd 5 * SIZE(YY), yy1
/* Tail of the two-column step: remaining broadcasts from A1/A2.       */
/* NOTE(review): presumably the diagonal part of the column pair; the  */
/* arithmetic that would show this is not visible - confirm.           */
669 MOVDDUP(0 * SIZE, A1, a1)
670 MOVDDUP(0 * SIZE, A2, a2)
678 MOVDDUP(1 * SIZE, A1, a1)
679 MOVDDUP(1 * SIZE, A2, a2)
686 MOVDDUP(1 * SIZE, A2, a2)
692 MOVDDUP(0 * SIZE, A2, a1)
693 MOVDDUP(2 * SIZE, A2, a2)
701 MOVDDUP(1 * SIZE, A2, a1)
702 MOVDDUP(3 * SIZE, A2, a2)
709 MOVDDUP(1 * SIZE, A2, a1)
/* Write the last two Y elements held in yy1/yy2 back to YY.           */
718 movlpd yy1, 0 * SIZE(YY)
719 movhpd yy1, 1 * SIZE(YY)
720 movlpd yy2, 2 * SIZE(YY)
721 movhpd yy2, 3 * SIZE(YY)
/* Single-column kernel: same data flow as the two-column kernel but   */
/* reads matrix values from A1 only.                                   */
/* NOTE(review): presumably handles the column left over when the      */
/* column count is odd; the test that selects this path, and all       */
/* arithmetic, are not visible in this view - confirm.                 */
/* atemp1, atemp2 <- 4 doubles of the staged X starting at index I.    */
738 movapd 0 * SIZE(NEW_X, I, SIZE), atemp1
739 movapd 2 * SIZE(NEW_X, I, SIZE), atemp2
/* Broadcast the first two doubles of the column.                      */
744 MOVDDUP(0 * SIZE, A1, a1)
745 MOVDDUP(1 * SIZE, A1, a2)
/* xtemp1..xtemp4 <- first 8 doubles of the staged X (aligned loads).  */
747 movapd 0 * SIZE(NEW_X), xtemp1
748 movapd 2 * SIZE(NEW_X), xtemp2
749 movapd 4 * SIZE(NEW_X), xtemp3
750 movapd 6 * SIZE(NEW_X), xtemp4
/* yy1, yy2 <- first two elements of the staged Y.                     */
752 movsd 0 * SIZE(NEW_Y), yy1
753 movhpd 1 * SIZE(NEW_Y), yy1
754 movsd 2 * SIZE(NEW_Y), yy2
755 movhpd 3 * SIZE(NEW_Y), yy2
/* Unrolled body: fetch the next xtemp values from XX, broadcast the   */
/* next column values, and write yy1/yy2 back to YY before reloading   */
/* them with the elements two positions further on.                    */
767 movapd 8 * SIZE(XX), xtemp1
772 MOVDDUP(2 * SIZE, A1, a1)
775 movapd 10 * SIZE(XX), xtemp2
780 MOVDDUP(3 * SIZE, A1, a2)
782 movlpd yy1, 0 * SIZE(YY)
783 movhpd yy1, 1 * SIZE(YY)
784 movsd 4 * SIZE(YY), yy1
785 movhpd 5 * SIZE(YY), yy1
788 movapd 12 * SIZE(XX), xtemp3
793 MOVDDUP(4 * SIZE, A1, a1)
796 movapd 14 * SIZE(XX), xtemp4
801 MOVDDUP(5 * SIZE, A1, a2)
803 movlpd yy2, 2 * SIZE(YY)
804 movhpd yy2, 3 * SIZE(YY)
805 movsd 6 * SIZE(YY), yy2
806 movhpd 7 * SIZE(YY), yy2
/* Tail: one more pair of broadcasts, then store the final element.    */
817 MOVDDUP(0 * SIZE, A1, a1)
820 MOVDDUP(1 * SIZE, A1, a2)
835 movlpd yy1, 0 * SIZE(YY)
836 movhpd yy1, 1 * SIZE(YY)
/* Copy results from the staged vector at NEW_Y back to the caller's   */
/* Y. Four elements are read with aligned 16-byte loads, then each is  */
/* written as two separate doubles (movsd low half, movhpd high half), */
/* which places no alignment requirement on Y.                         */
/* NOTE(review): every store uses offsets 0 and 1 from Y, so Y is      */
/* presumably advanced by INCY between stores by instructions that     */
/* are not visible in this view - confirm.                             */
849 movapd 0 * SIZE(NEW_Y), %xmm0
850 movapd 2 * SIZE(NEW_Y), %xmm1
851 movapd 4 * SIZE(NEW_Y), %xmm2
852 movapd 6 * SIZE(NEW_Y), %xmm3
854 movsd %xmm0, 0 * SIZE(Y)
855 movhpd %xmm0, 1 * SIZE(Y)
857 movsd %xmm1, 0 * SIZE(Y)
858 movhpd %xmm1, 1 * SIZE(Y)
860 movsd %xmm2, 0 * SIZE(Y)
861 movhpd %xmm2, 1 * SIZE(Y)
863 movsd %xmm3, 0 * SIZE(Y)
864 movhpd %xmm3, 1 * SIZE(Y)
/* Step past the four elements (8 doubles) just copied.                */
867 addq $8 * SIZE, NEW_Y
/* Same copy for one element at a time (presumably the remainder       */
/* loop).                                                              */
879 movapd 0 * SIZE(NEW_Y), %xmm0
881 movsd %xmm0, 0 * SIZE(Y)
882 movhpd %xmm0, 1 * SIZE(Y)
885 addq $2 * SIZE, NEW_Y
/* Epilogue: reload xmm6-xmm15 from the frame slots (offsets 64..208)  */
/* they were spilled to on entry, then release the STACKSIZE-byte      */
/* frame.                                                              */
/* NOTE(review): the general-purpose register restores and the return  */
/* instruction are not visible in this view.                           */
902 movups 64(%rsp), %xmm6
903 movups 80(%rsp), %xmm7
904 movups 96(%rsp), %xmm8
905 movups 112(%rsp), %xmm9
906 movups 128(%rsp), %xmm10
907 movups 144(%rsp), %xmm11
908 movups 160(%rsp), %xmm12
909 movups 176(%rsp), %xmm13
910 movups 192(%rsp), %xmm14
911 movups 208(%rsp), %xmm15
914 addq $STACKSIZE, %rsp