1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin. */
3 /* All rights reserved. */
5 /* Redistribution and use in source and binary forms, with or */
6 /* without modification, are permitted provided that the following */
7 /* conditions are met: */
9 /* 1. Redistributions of source code must retain the above */
10 /* copyright notice, this list of conditions and the following */
/* disclaimer. */
13 /* 2. Redistributions in binary form must reproduce the above */
14 /* copyright notice, this list of conditions and the following */
15 /* disclaimer in the documentation and/or other materials */
16 /* provided with the distribution. */
18 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31 /* POSSIBILITY OF SUCH DAMAGE. */
33 /* The views and conclusions contained in the software and */
34 /* documentation are those of the authors and should not be */
35 /* interpreted as representing official policies, either expressed */
36 /* or implied, of The University of Texas at Austin. */
37 /*********************************************************************/
/*
 * Prefetch tuning.  Each group of three macros selects:
 *   PREFETCH     - instruction issued on A1, A2 and XX later in this file,
 *   PREFETCHW    - instruction issued on YY, which the same code stores to,
 *   PREFETCHSIZE - displacement added to the base register of each prefetch.
 * NOTE(review): only two of the selecting conditionals are visible in this
 * excerpt and no #endif is; the remaining groups are presumably guarded by
 * per-CPU #ifdef lines -- confirm against the complete file.
 */
43 #define PREFETCH prefetcht0
44 #define PREFETCHW prefetcht0
45 #define PREFETCHSIZE (16 * 24)
49 #define PREFETCH prefetcht0
50 #define PREFETCHW prefetcht0
51 #define PREFETCHSIZE (16 * 24)
54 #if defined(PENRYN) || defined(DUNNINGTON)
55 #define PREFETCH prefetcht0
56 #define PREFETCHW prefetcht0
57 #define PREFETCHSIZE (16 * 24)
60 #if defined(NEHALEM) || defined(SANDYBRIDGE)
61 #define PREFETCH prefetcht0
62 #define PREFETCHW prefetcht0
63 #define PREFETCHSIZE (16 * 24)
/* Same instructions as above, but with a larger displacement (16 * 28). */
67 #define PREFETCH prefetcht0
68 #define PREFETCHW prefetcht0
69 #define PREFETCHSIZE (16 * 28)
/* The only group that uses the prefetch / prefetchw mnemonics. */
73 #define PREFETCH prefetch
74 #define PREFETCHW prefetchw
75 #define PREFETCHSIZE (16 * 12)
80 #define PREFETCH prefetcht0
81 #define PREFETCHW prefetcht0
82 #define PREFETCHSIZE (16 * 24)
86 #define PREFETCH prefetcht0
87 #define PREFETCHW prefetcht0
88 #define PREFETCHSIZE (16 * 12)
/*
 * Memory operands for arguments that arrive on the stack.  Every operand is
 * addressed relative to %rsp with STACKSIZE added, i.e. as seen after the
 * "subq $STACKSIZE, %rsp" in the function prologue.
 * NOTE(review): OLD_Y, OLD_INCY and OLD_BUFFER are defined twice with
 * different offsets; these are presumably two alternative ABI branches whose
 * #if/#else lines (and the first branch's STACKSIZE) are not visible in this
 * excerpt -- confirm against the complete file.
 */
95 #define OLD_Y 8 + STACKSIZE(%rsp)
96 #define OLD_INCY 16 + STACKSIZE(%rsp)
97 #define OLD_BUFFER 24 + STACKSIZE(%rsp)
/* Size of the stack frame reserved by the prologue for this variant. */
108 #define STACKSIZE 256
/* Variant in which A, LDA, X and INCX are also read from the stack. */
110 #define OLD_A 40 + STACKSIZE(%rsp)
111 #define OLD_LDA 48 + STACKSIZE(%rsp)
112 #define OLD_X 56 + STACKSIZE(%rsp)
113 #define OLD_INCX 64 + STACKSIZE(%rsp)
114 #define OLD_Y 72 + STACKSIZE(%rsp)
115 #define OLD_INCY 80 + STACKSIZE(%rsp)
116 #define OLD_BUFFER 88 + STACKSIZE(%rsp)
/*
 * Register aliases for the scalar alpha: ALPHA_R is %xmm0, ALPHA_I is %xmm1.
 * Presumably the real and imaginary parts respectively -- TODO confirm
 * against the caller's argument layout.
 */
140 #define ALPHA_R %xmm0
141 #define ALPHA_I %xmm1
/*
 * MOVDDUP(a, b, c):  load the 64-bit value at a(b) into both halves of c.
 * MOVDDUP2(a, b, c): same, but the memory operand is formed by token-pasting
 *                    a and b (a##b) instead of writing a(b).
 * The first pair uses the movddup instruction.  The second pair produces the
 * same register contents with movlpd (low half) followed by movhpd (high
 * half) from the same address.
 * NOTE(review): the #else / #endif separating the two pairs is not visible
 * in this excerpt.
 */
163 #if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
164 #define MOVDDUP(a, b, c) movddup a(b), c
165 #define MOVDDUP2(a, b, c) movddup a##b, c
167 #define MOVDDUP(a, b, c) movlpd a(b), c;movhpd a(b), c
168 #define MOVDDUP2(a, b, c) movlpd a##b, c;movhpd a##b, c
/*
 * Function prologue (visible part).
 * NOTE(review): register saves, the remaining argument loads and the early
 * exit test are not visible in this excerpt.
 */
/* Reserve STACKSIZE bytes of stack; released by the final addq. */
174 subq $STACKSIZE, %rsp
/* Fetch the work-buffer argument from its stack slot. */
184 movq OLD_BUFFER, BUFFER
/*
 * Scale the two increments and the leading dimension by 2^ZBASE_SHIFT.
 * ZBASE_SHIFT is defined outside this excerpt; presumably this converts
 * element counts into byte strides -- TODO confirm.
 */
186 salq $ZBASE_SHIFT, INCX
187 salq $ZBASE_SHIFT, INCY
188 salq $ZBASE_SHIFT, LDA
/*
 * xmm2 = { xmm2.low, xmm3.low }.  The instructions that set up xmm2 and xmm3
 * beforehand are not visible here.
 */
196 unpcklpd %xmm3, %xmm2
/*
 * Interleave the two alpha registers:
 *   ALPHA_R = { ALPHA_R.low, ALPHA_I.low }
 *   ALPHA_I = { ALPHA_I.low, ALPHA_R.low }
 * The second unpcklpd still sees the original low half of ALPHA_R because
 * the first one only replaced its high half.
 */
198 unpcklpd ALPHA_I, ALPHA_R
199 unpcklpd ALPHA_R, ALPHA_I
/*
 * Expand X into the buffer addressed by XX.
 *
 * Each step broadcasts the values at 0 * SIZE and 1 * SIZE of X into a pair
 * of registers (xmm3/xmm4, then xmm5/xmm6).  xmm3 and xmm5 are each stored
 * with movapd, passed through SHUFPD_1, and stored again 2 * SIZE further
 * on, so every step writes two 16-byte entries to XX.  movapd requires the
 * XX addresses to be 16-byte aligned.
 *
 * NOTE(review): both MOVDDUP pairs read offsets 0 and 1, so X is presumably
 * advanced between them by an instruction not visible in this excerpt.  The
 * arithmetic that combines xmm3..xmm6 (and possibly alpha) before the stores
 * is not visible either.  SHUFPD_1 is a macro defined elsewhere; presumably
 * it swaps the two halves of the register -- TODO confirm.
 */
210 MOVDDUP(0 * SIZE, X, %xmm3)
211 MOVDDUP(1 * SIZE, X, %xmm4)
213 MOVDDUP(0 * SIZE, X, %xmm5)
214 MOVDDUP(1 * SIZE, X, %xmm6)
/* Entries at 0, 2, 4 and 6 * SIZE of XX. */
225 movapd %xmm3, 0 * SIZE(XX)
226 SHUFPD_1 %xmm3, %xmm3
228 movapd %xmm3, 2 * SIZE(XX)
230 movapd %xmm5, 4 * SIZE(XX)
231 SHUFPD_1 %xmm5, %xmm5
233 movapd %xmm5, 6 * SIZE(XX)
/* Same sequence again, filling the entries at 8, 10, 12 and 14 * SIZE. */
235 MOVDDUP(0 * SIZE, X, %xmm3)
236 MOVDDUP(1 * SIZE, X, %xmm4)
238 MOVDDUP(0 * SIZE, X, %xmm5)
239 MOVDDUP(1 * SIZE, X, %xmm6)
250 movapd %xmm3, 8 * SIZE(XX)
251 SHUFPD_1 %xmm3, %xmm3
253 movapd %xmm3, 10 * SIZE(XX)
255 movapd %xmm5, 12 * SIZE(XX)
256 SHUFPD_1 %xmm5, %xmm5
258 movapd %xmm5, 14 * SIZE(XX)
/*
 * One-pair variant of the sequence above: a single pair of loads and two
 * stores.  Presumably the remainder path for leftover elements; the loop
 * control around it is not visible here.
 */
272 MOVDDUP(0 * SIZE, X, %xmm3)
273 MOVDDUP(1 * SIZE, X, %xmm4)
281 movapd %xmm3, 0 * SIZE(XX)
282 SHUFPD_1 %xmm3, %xmm3
284 movapd %xmm3, 2 * SIZE(XX)
/* From here on the original X is no longer needed. */
292 /* now we don't need original X */
/*
 * Copy values from YY into the buffer addressed by XX.
 *
 * Each movsd/movhpd pair assembles one 16-byte register from the two values
 * at 0 * SIZE and 1 * SIZE of YY.  Four such registers are then written with
 * aligned stores to consecutive 16-byte slots of XX.
 *
 * NOTE(review): all four loads use the same offsets, so YY is presumably
 * advanced between them by instructions not visible in this excerpt.
 */
310 movsd 0 * SIZE(YY), %xmm0
311 movhpd 1 * SIZE(YY), %xmm0
313 movsd 0 * SIZE(YY), %xmm1
314 movhpd 1 * SIZE(YY), %xmm1
316 movsd 0 * SIZE(YY), %xmm2
317 movhpd 1 * SIZE(YY), %xmm2
319 movsd 0 * SIZE(YY), %xmm3
320 movhpd 1 * SIZE(YY), %xmm3
/* movapd requires the XX addresses to be 16-byte aligned. */
323 movapd %xmm0, 0 * SIZE(XX)
324 movapd %xmm1, 2 * SIZE(XX)
325 movapd %xmm2, 4 * SIZE(XX)
326 movapd %xmm3, 6 * SIZE(XX)
/* Single-entry variant of the copy above; presumably the remainder path. */
340 movsd 0 * SIZE(YY), %xmm0
341 movhpd 1 * SIZE(YY), %xmm0
344 movapd %xmm0, 0 * SIZE(XX)
/*
 * Set-up executed before the inner loop.
 * NOTE(review): the label, the assignments of A1/A2/I and all arithmetic on
 * the loaded values are not visible in this excerpt.
 */
/* A = A + 2 * LDA + 4 * SIZE (LDA was already scaled in the prologue). */
361 leaq 4 * SIZE(A, LDA, 2), A
/* XX = NEW_X + 4 * I;  YY = NEW_Y + 2 * I + 4 * SIZE. */
365 leaq 0 * SIZE(NEW_X, I, 4), XX
366 leaq 4 * SIZE(NEW_Y, I, 2), YY
/* Load four consecutive 16-byte entries of the expanded X (aligned). */
368 movapd 0 * SIZE(XX), atemp1
369 movapd 2 * SIZE(XX), atemp2
370 movapd 4 * SIZE(XX), atemp3
371 movapd 6 * SIZE(XX), atemp4
/*
 * Broadcast individual values from A1 and A2.  a1 and a2 are reloaded
 * several times; the instructions that consume each value in between are
 * not visible here.
 */
373 MOVDDUP(0 * SIZE, A1, xsum1)
374 MOVDDUP(2 * SIZE, A1, xsum2)
379 MOVDDUP(1 * SIZE, A1, a1)
380 MOVDDUP(3 * SIZE, A1, a2)
387 MOVDDUP(2 * SIZE, A1, a1)
388 MOVDDUP(2 * SIZE, A2, a2)
395 MOVDDUP(3 * SIZE, A1, a1)
396 MOVDDUP(3 * SIZE, A2, a2)
403 MOVDDUP(4 * SIZE, A1, a1)
404 MOVDDUP(6 * SIZE, A2, a2)
/* Load the first two 16-byte entries at YY with scalar loads. */
406 movsd 0 * SIZE(YY), yy1
407 movhpd 1 * SIZE(YY), yy1
408 movsd 2 * SIZE(YY), yy2
409 movhpd 3 * SIZE(YY), yy2
/* Load the next four entries of the expanded X (offsets 8..14 * SIZE). */
411 movapd 8 * SIZE(XX), xtemp1
412 movapd 10 * SIZE(XX), xtemp2
413 movapd 12 * SIZE(XX), xtemp3
414 movapd 14 * SIZE(XX), xtemp4
/*
 * Inner loop body (visible loads, stores and prefetches only).
 *
 * The body is made of two similar halves.  The first half stores yy1/yy2 to
 * offsets 0..3 * SIZE of YY and reloads them from 4..7 * SIZE; the second
 * half stores to 4..7 * SIZE and reloads from 8..11 * SIZE.  The xtemp
 * registers are refilled from XX at 8..14 * SIZE in the first half and at
 * 16..22 * SIZE in the second.
 *
 * NOTE(review): the multiply/add instructions, the pointer increments and
 * the loop branch are not visible in this excerpt, so the exact data flow
 * between these loads and stores cannot be confirmed from here.
 */
433 MOVDDUP(1 * SIZE, A1, a1)
/* Prefetch ahead in A1. */
435 PREFETCH PREFETCHSIZE(A1)
442 MOVDDUP(3 * SIZE, A2, a2)
449 MOVDDUP(2 * SIZE, A1, a1)
456 MOVDDUP(0 * SIZE, A2, a2)
/* Prefetch ahead in the expanded X buffer. */
458 PREFETCH PREFETCHSIZE(XX)
461 movapd 12 * SIZE(XX), xtemp3
466 MOVDDUP(3 * SIZE, A1, a1)
469 movapd 8 * SIZE(XX), xtemp1
474 MOVDDUP(1 * SIZE, A2, a2)
477 movapd 14 * SIZE(XX), xtemp4
482 MOVDDUP(4 * SIZE, A1, a1)
/* Write yy2 back to YY[2..3] and load the entry at YY[6..7]. */
484 movlpd yy2, 2 * SIZE(YY)
485 movhpd yy2, 3 * SIZE(YY)
486 movsd 6 * SIZE(YY), yy2
487 movhpd 7 * SIZE(YY), yy2
490 movapd 10 * SIZE(XX), xtemp2
495 MOVDDUP(6 * SIZE, A2, a2)
/* Prefetch ahead in A2. */
497 PREFETCH PREFETCHSIZE(A2)
/* Write yy1 back to YY[0..1] and load the entry at YY[4..5]. */
499 movlpd yy1, 0 * SIZE(YY)
500 movhpd yy1, 1 * SIZE(YY)
501 movsd 4 * SIZE(YY), yy1
502 movhpd 5 * SIZE(YY), yy1
/* Second half: same pattern with the A1/A2 offsets advanced by 4 * SIZE. */
509 MOVDDUP(5 * SIZE, A1, a1)
516 MOVDDUP(7 * SIZE, A2, a2)
523 MOVDDUP(6 * SIZE, A1, a1)
/* Prefetch ahead in YY using the write-intent variant. */
525 PREFETCHW PREFETCHSIZE(YY)
532 MOVDDUP(4 * SIZE, A2, a2)
535 movapd 20 * SIZE(XX), xtemp3
540 MOVDDUP(7 * SIZE, A1, a1)
543 movapd 16 * SIZE(XX), xtemp1
548 MOVDDUP(5 * SIZE, A2, a2)
551 movapd 22 * SIZE(XX), xtemp4
556 MOVDDUP( 8 * SIZE, A1, a1)
/* Write yy2 back to YY[6..7] and load the entry at YY[10..11]. */
558 movlpd yy2, 6 * SIZE(YY)
559 movhpd yy2, 7 * SIZE(YY)
560 movsd 10 * SIZE(YY), yy2
561 movhpd 11 * SIZE(YY), yy2
564 movapd 18 * SIZE(XX), xtemp2
569 MOVDDUP(10 * SIZE, A2, a2)
/* Write yy1 back to YY[4..5] and load the entry at YY[8..9]. */
571 movlpd yy1, 4 * SIZE(YY)
572 movhpd yy1, 5 * SIZE(YY)
573 movsd 8 * SIZE(YY), yy1
574 movhpd 9 * SIZE(YY), yy1
/*
 * Two shorter sequences that follow the unrolled loop.
 *
 * The first repeats the load/store pattern of the first half of the loop
 * body without any prefetches; the second touches only one entry of YY.
 * NOTE(review): these are presumably the remainder paths for leftover rows;
 * the tests and branches that select them, and the arithmetic between the
 * loads and stores, are not visible in this excerpt.
 */
597 MOVDDUP(1 * SIZE, A1, a1)
604 MOVDDUP(3 * SIZE, A2, a2)
611 MOVDDUP(2 * SIZE, A1, a1)
618 MOVDDUP(0 * SIZE, A2, a2)
621 movapd 12 * SIZE(XX), xtemp3
626 MOVDDUP(3 * SIZE, A1, a1)
629 movapd 8 * SIZE(XX), xtemp1
634 MOVDDUP(1 * SIZE, A2, a2)
637 movapd 14 * SIZE(XX), xtemp4
642 MOVDDUP(4 * SIZE, A1, a1)
/* Write yy2 back to YY[2..3] and load the entry at YY[6..7]. */
644 movlpd yy2, 2 * SIZE(YY)
645 movhpd yy2, 3 * SIZE(YY)
646 movsd 6 * SIZE(YY), yy2
647 movhpd 7 * SIZE(YY), yy2
650 movapd 10 * SIZE(XX), xtemp2
/* Write yy1 back to YY[0..1] and load the entry at YY[4..5]. */
656 movlpd yy1, 0 * SIZE(YY)
657 movhpd yy1, 1 * SIZE(YY)
658 movsd 4 * SIZE(YY), yy1
659 movhpd 5 * SIZE(YY), yy1
/* Shorter sequence: broadcasts from A1/A2, then one store of yy1. */
670 MOVDDUP(1 * SIZE, A1, a2)
678 MOVDDUP(0 * SIZE, A2, a1)
686 MOVDDUP(1 * SIZE, A2, a2)
700 movlpd yy1, 0 * SIZE(YY)
701 movhpd yy1, 1 * SIZE(YY)
/*
 * Read-modify-write of entries of NEW_Y addressed directly through I
 * (NEW_Y + 2 * I) instead of through YY.
 * NOTE(review): the instructions that update yy1/yy2 between the loads and
 * the stores are not visible in this excerpt; presumably this adds the
 * accumulated sums for the current columns -- TODO confirm.
 */
707 movsd 0 * SIZE(NEW_Y, I, 2), yy1
708 movhpd 1 * SIZE(NEW_Y, I, 2), yy1
709 movsd 2 * SIZE(NEW_Y, I, 2), yy2
710 movhpd 3 * SIZE(NEW_Y, I, 2), yy2
715 movlpd yy1, 0 * SIZE(NEW_Y, I, 2)
716 movhpd yy1, 1 * SIZE(NEW_Y, I, 2)
717 movlpd yy2, 2 * SIZE(NEW_Y, I, 2)
718 movhpd yy2, 3 * SIZE(NEW_Y, I, 2)
/*
 * Single-entry variant: loads two entries of the expanded X (NEW_X + 4 * I),
 * one entry of NEW_Y and two values broadcast from A, then stores the NEW_Y
 * entry back.  Presumably handles a final unpaired column; the selecting
 * test is not visible here.
 */
735 movapd 0 * SIZE(NEW_X, I, 4), atemp1
736 movapd 2 * SIZE(NEW_X, I, 4), atemp2
738 movsd 0 * SIZE(NEW_Y, I, 2), yy1
739 movhpd 1 * SIZE(NEW_Y, I, 2), yy1
741 MOVDDUP(0 * SIZE, A, a1)
742 MOVDDUP(1 * SIZE, A, a2)
749 movlpd yy1, 0 * SIZE(NEW_Y, I, 2)
750 movhpd yy1, 1 * SIZE(NEW_Y, I, 2)
/*
 * Copy results from NEW_Y out to Y.
 *
 * Four 16-byte entries are read from NEW_Y with aligned loads, and each is
 * written to Y as two scalar stores (low half to 0 * SIZE, high half to
 * 1 * SIZE).
 * NOTE(review): all four stores use the same offsets, so Y is presumably
 * advanced between them by instructions not visible in this excerpt.
 */
763 movapd 0 * SIZE(NEW_Y), %xmm0
764 movapd 2 * SIZE(NEW_Y), %xmm1
765 movapd 4 * SIZE(NEW_Y), %xmm2
766 movapd 6 * SIZE(NEW_Y), %xmm3
768 movsd %xmm0, 0 * SIZE(Y)
769 movhpd %xmm0, 1 * SIZE(Y)
771 movsd %xmm1, 0 * SIZE(Y)
772 movhpd %xmm1, 1 * SIZE(Y)
774 movsd %xmm2, 0 * SIZE(Y)
775 movhpd %xmm2, 1 * SIZE(Y)
777 movsd %xmm3, 0 * SIZE(Y)
778 movhpd %xmm3, 1 * SIZE(Y)
/* Step NEW_Y past the four entries just copied. */
781 addq $8 * SIZE, NEW_Y
/* Single-entry variant of the copy above; presumably the remainder path. */
793 movapd 0 * SIZE(NEW_Y), %xmm0
795 movsd %xmm0, 0 * SIZE(Y)
796 movhpd %xmm0, 1 * SIZE(Y)
799 addq $2 * SIZE, NEW_Y
/* Release the stack frame reserved by "subq $STACKSIZE, %rsp". */
812 addq $STACKSIZE, %rsp