1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin. */
3 /* All rights reserved. */
5 /* Redistribution and use in source and binary forms, with or */
6 /* without modification, are permitted provided that the following */
7 /* conditions are met: */
9 /* 1. Redistributions of source code must retain the above */
10 /* copyright notice, this list of conditions and the following */
13 /* 2. Redistributions in binary form must reproduce the above */
14 /* copyright notice, this list of conditions and the following */
15 /* disclaimer in the documentation and/or other materials */
16 /* provided with the distribution. */
18 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31 /* POSSIBILITY OF SUCH DAMAGE. */
33 /* The views and conclusions contained in the software and */
34 /* documentation are those of the authors and should not be */
35 /* interpreted as representing official policies, either expressed */
36 /* or implied, of The University of Texas at Austin. */
37 /*********************************************************************/
/* Per-microarchitecture prefetch tuning.  PREFETCH/PREFETCHW select the
   prefetch instruction used for read / read-for-write streams and
   PREFETCHSIZE the look-ahead distance used by the kernel loops below.
   NOTE(review): this is an excerpt — several of the #if / #endif guards
   that select between these #define groups are not visible here, so some
   groups appear back-to-back without their conditions. */
43 #define PREFETCH prefetcht0
44 #define PREFETCHW prefetcht0
45 #define PREFETCHSIZE (16 * 24)
49 #define PREFETCH prefetcht0
50 #define PREFETCHW prefetcht0
51 #define PREFETCHSIZE (16 * 24)
54 #if defined(PENRYN) || defined(DUNNINGTON)
55 #define PREFETCH prefetcht0
56 #define PREFETCHW prefetcht0
57 #define PREFETCHSIZE (16 * 24)
/* NOTE(review): #endif for the PENRYN/DUNNINGTON group is outside this view. */
60 #if defined(NEHALEM) || defined(SANDYBRIDGE)
61 #define PREFETCH prefetcht0
62 #define PREFETCHW prefetcht0
63 #define PREFETCHSIZE (16 * 24)
/* Larger look-ahead distance for this (unidentified, guard not visible) target. */
67 #define PREFETCH prefetcht0
68 #define PREFETCHW prefetcht0
69 #define PREFETCHSIZE (16 * 28)
/* AMD-style prefetch/prefetchw forms (3DNow!-derived write-intent prefetch). */
73 #define PREFETCH prefetch
74 #define PREFETCHW prefetchw
75 #define PREFETCHSIZE (16 * 12)
79 #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
80 #define PREFETCH prefetch
81 #define PREFETCHW prefetchw
82 #define PREFETCHSIZE (16 * 16)
/* Shorter distance (8 * 24) for this target; guard not visible in excerpt. */
86 #define PREFETCH prefetcht0
87 #define PREFETCHW prefetcht0
88 #define PREFETCHSIZE (8 * 24)
/* Fallback / final target group of the chain. */
92 #define PREFETCH prefetcht0
93 #define PREFETCHW prefetcht0
94 #define PREFETCHSIZE (16 * 12)
/* Offsets (relative to %rsp after the prologue's frame allocation) at which
   the stack-passed arguments are found.  Two OLD_* layouts are defined:
   the 8/16/24 group and the 40..88 group with STACKSIZE 256 — presumably
   the Unix SysV and Windows x64 variants respectively (Win64 passes only
   four args in registers and reserves shadow space); the #ifdef that
   selects between them is outside this excerpt — TODO confirm. */
101 #define OLD_Y 8 + STACKSIZE(%rsp)
102 #define OLD_INCY 16 + STACKSIZE(%rsp)
103 #define OLD_BUFFER 24 + STACKSIZE(%rsp)
114 #define STACKSIZE 256
116 #define OLD_A 40 + STACKSIZE(%rsp)
117 #define OLD_LDA 48 + STACKSIZE(%rsp)
118 #define OLD_X 56 + STACKSIZE(%rsp)
119 #define OLD_INCX 64 + STACKSIZE(%rsp)
120 #define OLD_Y 72 + STACKSIZE(%rsp)
121 #define OLD_INCY 80 + STACKSIZE(%rsp)
122 #define OLD_BUFFER 88 + STACKSIZE(%rsp)
/* Register aliases for the real/imaginary parts of the complex scalar alpha. */
147 #define ALPHA_R %xmm0
148 #define ALPHA_I %xmm1
/* MOVDDUP(off, base, dst): broadcast the double at off(base) into both
   64-bit halves of dst.  Uses the SSE3 movddup instruction when the target
   supports it (and is not an Opteron core), otherwise falls back to an
   equivalent movlpd + movhpd pair.  MOVDDUP2 is the token-pasting variant
   for pre-combined operand text. */
170 #if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BARCELONA_OPTIMIZATION)
171 #define MOVDDUP(a, b, c) movddup a(b), c
172 #define MOVDDUP2(a, b, c) movddup a##b, c
/* NOTE(review): the #else separating the two definitions is not visible. */
174 #define MOVDDUP(a, b, c) movlpd a(b), c;movhpd a(b), c
175 #define MOVDDUP2(a, b, c) movlpd a##b, c;movhpd a##b, c
/* Prologue fragment: allocate the stack frame and spill xmm6-xmm15.
   Saving xmm6-15 is only required by the Windows x64 ABI (they are
   callee-saved there; on SysV all xmm regs are volatile) — NOTE(review):
   the enclosing #ifdef WINDOWS_ABI (or similar) is outside this excerpt. */
187 subq $STACKSIZE, %rsp
198 movups %xmm6, 64(%rsp)
199 movups %xmm7, 80(%rsp)
200 movups %xmm8, 96(%rsp)
201 movups %xmm9, 112(%rsp)
202 movups %xmm10, 128(%rsp)
203 movups %xmm11, 144(%rsp)
204 movups %xmm12, 160(%rsp)
205 movups %xmm13, 176(%rsp)
206 movups %xmm14, 192(%rsp)
207 movups %xmm15, 208(%rsp)
/* Fetch the work-buffer pointer from the caller's stack. */
220 movq OLD_BUFFER, BUFFER
/* Scale the element strides to byte strides; ZBASE_SHIFT is presumably
   log2(sizeof(complex element)) defined in a header not shown — TODO confirm. */
222 salq $ZBASE_SHIFT, INCX
223 salq $ZBASE_SHIFT, INCY
224 salq $ZBASE_SHIFT, LDA
/* Pack alpha for complex multiplication:
   after the first unpcklpd, ALPHA_R = { alpha_r, alpha_i };
   after the second,          ALPHA_I = { alpha_i, alpha_r }.
   (%xmm2/%xmm3 were loaded in lines not shown in this excerpt.) */
232 unpcklpd %xmm3, %xmm2
234 unpcklpd ALPHA_I, ALPHA_R
235 unpcklpd ALPHA_R, ALPHA_I
/* Expand the input vector X into the aligned buffer (XX): each complex
   element is stored twice — once as loaded and once with its two doubles
   swapped (SHUFPD_1) — producing the {re,im} / {im,re} pair layout the
   multiply kernel consumes.  The loop control, pointer advances by INCX,
   and the arithmetic on %xmm4/%xmm6 occur in lines not shown here. */
246 MOVDDUP(0 * SIZE, X, %xmm3)
247 MOVDDUP(1 * SIZE, X, %xmm4)
249 MOVDDUP(0 * SIZE, X, %xmm5)
250 MOVDDUP(1 * SIZE, X, %xmm6)
261 movapd %xmm3, 0 * SIZE(XX)
262 SHUFPD_1 %xmm3, %xmm3
264 movapd %xmm3, 2 * SIZE(XX)
266 movapd %xmm5, 4 * SIZE(XX)
267 SHUFPD_1 %xmm5, %xmm5
269 movapd %xmm5, 6 * SIZE(XX)
/* Second unrolled pair of the same expansion (buffer slots 8..15). */
271 MOVDDUP(0 * SIZE, X, %xmm3)
272 MOVDDUP(1 * SIZE, X, %xmm4)
274 MOVDDUP(0 * SIZE, X, %xmm5)
275 MOVDDUP(1 * SIZE, X, %xmm6)
286 movapd %xmm3, 8 * SIZE(XX)
287 SHUFPD_1 %xmm3, %xmm3
289 movapd %xmm3, 10 * SIZE(XX)
291 movapd %xmm5, 12 * SIZE(XX)
292 SHUFPD_1 %xmm5, %xmm5
294 movapd %xmm5, 14 * SIZE(XX)
/* Remainder path: expand a single trailing element of X. */
308 MOVDDUP(0 * SIZE, X, %xmm3)
309 MOVDDUP(1 * SIZE, X, %xmm4)
317 movapd %xmm3, 0 * SIZE(XX)
318 SHUFPD_1 %xmm3, %xmm3
320 movapd %xmm3, 2 * SIZE(XX)
328 /* now we don't need original X */
/* Gather (possibly strided) Y into the contiguous buffer, four complex
   elements per iteration; YY is advanced by INCY between the loads in
   lines not shown (all loads here read offset 0/1 of YY). */
346 movsd 0 * SIZE(YY), %xmm0
347 movhpd 1 * SIZE(YY), %xmm0
349 movsd 0 * SIZE(YY), %xmm1
350 movhpd 1 * SIZE(YY), %xmm1
352 movsd 0 * SIZE(YY), %xmm2
353 movhpd 1 * SIZE(YY), %xmm2
355 movsd 0 * SIZE(YY), %xmm3
356 movhpd 1 * SIZE(YY), %xmm3
359 movapd %xmm0, 0 * SIZE(XX)
360 movapd %xmm1, 2 * SIZE(XX)
361 movapd %xmm2, 4 * SIZE(XX)
362 movapd %xmm3, 6 * SIZE(XX)
/* Remainder path: copy a single trailing element of Y. */
376 movsd 0 * SIZE(YY), %xmm0
377 movhpd 1 * SIZE(YY), %xmm0
380 movapd %xmm0, 0 * SIZE(XX)
/* Per-iteration setup for processing one pair of matrix columns:
   advance A past the two columns just handled, point XX/YY at the
   current position in the expanded-X and buffered-Y arrays (X buffer
   holds 4 doubles per complex element, Y buffer holds 2 — hence the
   ,4 and ,2 index scales). */
397 leaq 4 * SIZE(A, LDA, 2), A
401 leaq 0 * SIZE(NEW_X, I, 4), XX
402 leaq 4 * SIZE(NEW_Y, I, 2), YY
/* atemp1..4 = the expanded X entries for the two current columns. */
404 movapd 0 * SIZE(XX), atemp1
405 movapd 2 * SIZE(XX), atemp2
406 movapd 4 * SIZE(XX), atemp3
407 movapd 6 * SIZE(XX), atemp4
/* Initialize the column dot-product accumulators from the diagonal
   block of A; the multiplies/adds between these MOVDDUPs are in lines
   not shown in this excerpt. */
409 MOVDDUP(0 * SIZE, A1, xsum1)
410 MOVDDUP(2 * SIZE, A1, xsum2)
416 MOVDDUP(1 * SIZE, A1, a1)
417 MOVDDUP(3 * SIZE, A1, a2)
424 MOVDDUP(3 * SIZE, A1, a2)
430 MOVDDUP(2 * SIZE, A1, a1)
431 MOVDDUP(2 * SIZE, A2, a2)
439 MOVDDUP(3 * SIZE, A1, a1)
440 MOVDDUP(3 * SIZE, A2, a2)
447 MOVDDUP(3 * SIZE, A1, a1)
/* Prime the software pipeline: first A entries, first Y pair, and the
   next group of expanded-X operands for the main loop below. */
453 MOVDDUP(4 * SIZE, A1, a1)
454 MOVDDUP(6 * SIZE, A2, a2)
456 movsd 0 * SIZE(YY), yy1
457 movhpd 1 * SIZE(YY), yy1
458 movsd 2 * SIZE(YY), yy2
459 movhpd 3 * SIZE(YY), yy2
461 movapd 8 * SIZE(XX), xtemp1
462 movapd 10 * SIZE(XX), xtemp2
463 movapd 12 * SIZE(XX), xtemp3
464 movapd 14 * SIZE(XX), xtemp4
/* Main software-pipelined inner loop (excerpt): broadcast-loads of A
   entries (MOVDDUP) and reloads of expanded-X operands are interleaved
   with stores/loads of the running y values, two complex elements ahead
   of the current computation.  The mulpd/addpd (or SSE3 addsub) work
   between these lines is not visible in this excerpt. */
483 MOVDDUP(1 * SIZE, A1, a1)
/* Prefetch the next stripe of column A1. */
485 PREFETCH PREFETCHSIZE(A1)
492 MOVDDUP(3 * SIZE, A2, a2)
499 MOVDDUP(2 * SIZE, A1, a1)
506 MOVDDUP(0 * SIZE, A2, a2)
/* Prefetch ahead in the expanded-X buffer. */
508 PREFETCH PREFETCHSIZE(XX)
511 movapd 12 * SIZE(XX), xtemp3
516 MOVDDUP(3 * SIZE, A1, a1)
519 movapd 8 * SIZE(XX), xtemp1
524 MOVDDUP(1 * SIZE, A2, a2)
527 movapd 14 * SIZE(XX), xtemp4
532 MOVDDUP(4 * SIZE, A1, a1)
/* Retire the second finished y pair and load its replacement. */
534 movlpd yy2, 2 * SIZE(YY)
535 movhpd yy2, 3 * SIZE(YY)
536 movsd 6 * SIZE(YY), yy2
537 movhpd 7 * SIZE(YY), yy2
540 movapd 10 * SIZE(XX), xtemp2
545 MOVDDUP(6 * SIZE, A2, a2)
/* Prefetch the next stripe of column A2. */
547 PREFETCH PREFETCHSIZE(A2)
/* Retire the first finished y pair and load its replacement. */
549 movlpd yy1, 0 * SIZE(YY)
550 movhpd yy1, 1 * SIZE(YY)
551 movsd 4 * SIZE(YY), yy1
552 movhpd 5 * SIZE(YY), yy1
/* Second half of the unrolled iteration (elements 4..7). */
559 MOVDDUP(5 * SIZE, A1, a1)
566 MOVDDUP(7 * SIZE, A2, a2)
573 MOVDDUP(6 * SIZE, A1, a1)
/* Write-intent prefetch of the y stream being updated. */
575 PREFETCHW PREFETCHSIZE(YY)
582 MOVDDUP(4 * SIZE, A2, a2)
585 movapd 20 * SIZE(XX), xtemp3
590 MOVDDUP(7 * SIZE, A1, a1)
593 movapd 16 * SIZE(XX), xtemp1
598 MOVDDUP(5 * SIZE, A2, a2)
601 movapd 22 * SIZE(XX), xtemp4
606 MOVDDUP( 8 * SIZE, A1, a1)
608 movlpd yy2, 6 * SIZE(YY)
609 movhpd yy2, 7 * SIZE(YY)
610 movsd 10 * SIZE(YY), yy2
611 movhpd 11 * SIZE(YY), yy2
614 movapd 18 * SIZE(XX), xtemp2
619 MOVDDUP(10 * SIZE, A2, a2)
621 movlpd yy1, 4 * SIZE(YY)
622 movhpd yy1, 5 * SIZE(YY)
623 movsd 8 * SIZE(YY), yy1
624 movhpd 9 * SIZE(YY), yy1
/* Inner-loop tail (excerpt): one non-unrolled pass over a remaining
   group of elements — same load/store pattern as the main loop but with
   no prefetching and no second unrolled half.  Arithmetic instructions
   between these lines are not visible in this excerpt. */
647 MOVDDUP(1 * SIZE, A1, a1)
654 MOVDDUP(3 * SIZE, A2, a2)
661 MOVDDUP(2 * SIZE, A1, a1)
668 MOVDDUP(0 * SIZE, A2, a2)
671 movapd 12 * SIZE(XX), xtemp3
676 MOVDDUP(3 * SIZE, A1, a1)
679 movapd 8 * SIZE(XX), xtemp1
684 MOVDDUP(1 * SIZE, A2, a2)
687 movapd 14 * SIZE(XX), xtemp4
692 MOVDDUP(4 * SIZE, A1, a1)
/* Retire the computed y pairs and load the next ones. */
694 movlpd yy2, 2 * SIZE(YY)
695 movhpd yy2, 3 * SIZE(YY)
696 movsd 6 * SIZE(YY), yy2
697 movhpd 7 * SIZE(YY), yy2
700 movapd 10 * SIZE(XX), xtemp2
706 movlpd yy1, 0 * SIZE(YY)
707 movhpd yy1, 1 * SIZE(YY)
708 movsd 4 * SIZE(YY), yy1
709 movhpd 5 * SIZE(YY), yy1
/* Scalar remainder for the column pair (excerpt), then fold the column
   dot-product sums (scaled by alpha in lines not shown) into the
   buffered y at the diagonal position NEW_Y + 2*I. */
720 MOVDDUP(1 * SIZE, A1, a2)
728 MOVDDUP(0 * SIZE, A2, a1)
736 MOVDDUP(1 * SIZE, A2, a2)
750 movlpd yy1, 0 * SIZE(YY)
751 movhpd yy1, 1 * SIZE(YY)
/* Read-modify-write of the two buffered y elements for these columns;
   the adds between load and store are not visible in this excerpt. */
757 movsd 0 * SIZE(NEW_Y, I, 2), yy1
758 movhpd 1 * SIZE(NEW_Y, I, 2), yy1
759 movsd 2 * SIZE(NEW_Y, I, 2), yy2
760 movhpd 3 * SIZE(NEW_Y, I, 2), yy2
765 movlpd yy1, 0 * SIZE(NEW_Y, I, 2)
766 movhpd yy1, 1 * SIZE(NEW_Y, I, 2)
767 movlpd yy2, 2 * SIZE(NEW_Y, I, 2)
768 movhpd yy2, 3 * SIZE(NEW_Y, I, 2)
/* Final single-column remainder (when the column count is odd, excerpt):
   process the last 1x1 diagonal block with the corresponding expanded X
   entries and update the single buffered y element in place. */
784 movapd 0 * SIZE(NEW_X, I, 4), atemp1
785 movapd 2 * SIZE(NEW_X, I, 4), atemp2
787 movsd 0 * SIZE(NEW_Y, I, 2), yy1
788 movhpd 1 * SIZE(NEW_Y, I, 2), yy1
791 MOVDDUP(0 * SIZE, A, a1)
792 MOVDDUP(1 * SIZE, A, a2)
799 MOVDDUP(0 * SIZE, A, a1)
/* Store the updated y element back to the buffer. */
805 movlpd yy1, 0 * SIZE(NEW_Y, I, 2)
806 movhpd yy1, 1 * SIZE(NEW_Y, I, 2)
/* Scatter the buffered result back to the user's (possibly strided) Y,
   four complex elements per iteration.  Every store targets offset 0/1
   of Y because Y is advanced by INCY between stores in lines not shown
   in this excerpt. */
819 movapd 0 * SIZE(NEW_Y), %xmm0
820 movapd 2 * SIZE(NEW_Y), %xmm1
821 movapd 4 * SIZE(NEW_Y), %xmm2
822 movapd 6 * SIZE(NEW_Y), %xmm3
824 movsd %xmm0, 0 * SIZE(Y)
825 movhpd %xmm0, 1 * SIZE(Y)
827 movsd %xmm1, 0 * SIZE(Y)
828 movhpd %xmm1, 1 * SIZE(Y)
830 movsd %xmm2, 0 * SIZE(Y)
831 movhpd %xmm2, 1 * SIZE(Y)
833 movsd %xmm3, 0 * SIZE(Y)
834 movhpd %xmm3, 1 * SIZE(Y)
837 addq $8 * SIZE, NEW_Y
/* Remainder path: copy one trailing element back to Y. */
849 movapd 0 * SIZE(NEW_Y), %xmm0
851 movsd %xmm0, 0 * SIZE(Y)
852 movhpd %xmm0, 1 * SIZE(Y)
855 addq $2 * SIZE, NEW_Y
/* Epilogue fragment: restore xmm6-xmm15 from the slots written in the
   prologue (Windows x64 callee-saved set — NOTE(review): enclosing
   #ifdef not visible) and release the frame.  The matching ret and any
   GP-register pops are in lines not shown. */
872 movups 64(%rsp), %xmm6
873 movups 80(%rsp), %xmm7
874 movups 96(%rsp), %xmm8
875 movups 112(%rsp), %xmm9
876 movups 128(%rsp), %xmm10
877 movups 144(%rsp), %xmm11
878 movups 160(%rsp), %xmm12
879 movups 176(%rsp), %xmm13
880 movups 192(%rsp), %xmm14
881 movups 208(%rsp), %xmm15
884 addq $STACKSIZE, %rsp