1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin. */
3 /* All rights reserved. */
5 /* Redistribution and use in source and binary forms, with or */
6 /* without modification, are permitted provided that the following */
7 /* conditions are met: */
9 /* 1. Redistributions of source code must retain the above */
10 /* copyright notice, this list of conditions and the following disclaimer. */
13 /* 2. Redistributions in binary form must reproduce the above */
14 /* copyright notice, this list of conditions and the following */
15 /* disclaimer in the documentation and/or other materials */
16 /* provided with the distribution. */
18 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31 /* POSSIBILITY OF SUCH DAMAGE. */
33 /* The views and conclusions contained in the software and */
34 /* documentation are those of the authors and should not be */
35 /* interpreted as representing official policies, either expressed */
36 /* or implied, of The University of Texas at Austin. */
37 /*********************************************************************/
/*
 * Prefetch tuning macros.  Each group below defines:
 *   PREFETCH     - instruction used for the read prefetches in the main
 *                  loop (applied to A1, A1+LDA, A2, A2+LDA and XX),
 *   PREFETCHW    - instruction used for the prefetch on YY, which the
 *                  main loop both loads and stores,
 *   PREFETCHSIZE - displacement, in bytes, added to the prefetched address.
 * NOTE(review): only two groups have a visible #if guard and no #endif is
 * visible in this excerpt; presumably every group sits inside its own
 * per-CPU conditional -- confirm against the complete file.
 */
/* Group 1: guard not visible here. */
43 #define PREFETCH prefetcht0
44 #define PREFETCHW prefetcht0
45 #define PREFETCHSIZE (16 * 12)
/* Group 2: guard not visible here; same values as group 1. */
49 #define PREFETCH prefetcht0
50 #define PREFETCHW prefetcht0
51 #define PREFETCHSIZE (16 * 12)
/* Group 3: selected when PENRYN or DUNNINGTON is defined. */
54 #if defined(PENRYN) || defined(DUNNINGTON)
55 #define PREFETCH prefetcht0
56 #define PREFETCHW prefetcht0
57 #define PREFETCHSIZE (16 * 12)
/* Group 4: selected when NEHALEM or SANDYBRIDGE is defined. */
60 #if defined(NEHALEM) || defined(SANDYBRIDGE)
61 #define PREFETCH prefetcht0
62 #define PREFETCHW prefetcht0
63 #define PREFETCHSIZE (16 * 12)
/* Group 5: guard not visible here; larger prefetch distance (16 * 20). */
67 #define PREFETCH prefetcht0
68 #define PREFETCHW prefetcht0
69 #define PREFETCHSIZE (16 * 20)
/* Group 6: guard not visible here; uses prefetch/prefetchw instead of */
/* prefetcht0, with the shortest distance in this list (16 * 8). */
73 #define PREFETCH prefetch
74 #define PREFETCHW prefetchw
75 #define PREFETCHSIZE (16 * 8)
/* Group 7: selected when BARCELONA, SHANGHAI, BOBCAT or BULLDOZER is */
/* defined; also uses prefetch/prefetchw. */
79 #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
80 #define PREFETCH prefetch
81 #define PREFETCHW prefetchw
82 #define PREFETCHSIZE (16 * 16)
/* Group 8: guard not visible here; longest distance in this list (16 * 24). */
86 #define PREFETCH prefetcht0
87 #define PREFETCHW prefetcht0
88 #define PREFETCHSIZE (16 * 24)
/* Group 9: guard not visible here. */
92 #define PREFETCH prefetcht0
93 #define PREFETCHW prefetcht0
94 #define PREFETCHSIZE (16 * 20)
/*
 * Locations of the stack-passed arguments.  Every offset is relative to
 * %rsp and includes STACKSIZE, so the macros are only valid after the
 * prologue has executed "subq $STACKSIZE, %rsp" and before the matching
 * "addq $STACKSIZE, %rsp".
 * NOTE(review): two alternative sets of offsets are visible; the
 * conditional that selects between them, and the STACKSIZE value that
 * belongs to the first set, are not visible in this excerpt.  Presumably
 * the sets correspond to two calling conventions -- confirm.
 */
/* First set: only Y, INCY and BUFFER are fetched from the stack. */
101 #define OLD_Y 8 + STACKSIZE(%rsp)
102 #define OLD_INCY 16 + STACKSIZE(%rsp)
103 #define OLD_BUFFER 24 + STACKSIZE(%rsp)
/* Frame size used with the second set.  The prologue stores xmm6-xmm15 at */
/* 64..208(%rsp), i.e. up to byte 224, which fits inside 256 bytes. */
114 #define STACKSIZE 256
/* Second set: LDA, X, INCX, Y, INCY and BUFFER are all fetched from the */
/* stack, in consecutive 8-byte slots starting at offset 40. */
116 #define OLD_LDA 40 + STACKSIZE(%rsp)
117 #define OLD_X 48 + STACKSIZE(%rsp)
118 #define OLD_INCX 56 + STACKSIZE(%rsp)
119 #define OLD_Y 64 + STACKSIZE(%rsp)
120 #define OLD_INCY 72 + STACKSIZE(%rsp)
121 #define OLD_BUFFER 80 + STACKSIZE(%rsp)
/* Prologue: reserve STACKSIZE bytes of stack for the register save area. */
171 subq $STACKSIZE, %rsp
/*
 * Spill xmm6-xmm15 to 64..208(%rsp); the epilogue reloads them from the
 * same slots.  movups is used, so the slots need not be 16-byte aligned.
 * NOTE(review): xmm6-xmm15 are callee-saved under the Microsoft x64
 * convention, so this save presumably lives on a Windows-only path whose
 * guard is not visible in this excerpt -- confirm.
 */
182 movups %xmm6, 64(%rsp)
183 movups %xmm7, 80(%rsp)
184 movups %xmm8, 96(%rsp)
185 movups %xmm9, 112(%rsp)
186 movups %xmm10, 128(%rsp)
187 movups %xmm11, 144(%rsp)
188 movups %xmm12, 160(%rsp)
189 movups %xmm13, 176(%rsp)
190 movups %xmm14, 192(%rsp)
191 movups %xmm15, 208(%rsp)
/* Fetch the stack-passed buffer pointer into BUFFER. */
202 movq OLD_BUFFER, BUFFER
/* Multiply INCX, INCY and LDA by SIZE in place, so later code can add */
/* them to pointers directly (assumes SIZE is the element size in bytes */
/* -- TODO confirm where SIZE is defined). */
204 leaq (,INCX, SIZE), INCX
205 leaq (,INCY, SIZE), INCY
206 leaq (,LDA, SIZE), LDA
/* Replicate the low single-precision lane of ALPHA into all four lanes. */
211 shufps $0, ALPHA, ALPHA
/*
 * Copy X into the work buffer addressed by XX, eight elements per pass.
 * Each load reads one single-precision value at displacement 0 from X.
 * NOTE(review): the instructions between the loads (presumably an advance
 * of X by INCX) and anything applied to the values before they are stored
 * are not visible in this excerpt -- confirm against the complete file.
 * xmm6-xmm8 are used as scratch here; the prologue saves them.
 */
221 movss 0 * SIZE(X), %xmm1
223 movss 0 * SIZE(X), %xmm2
225 movss 0 * SIZE(X), %xmm3
227 movss 0 * SIZE(X), %xmm4
229 movss 0 * SIZE(X), %xmm5
231 movss 0 * SIZE(X), %xmm6
233 movss 0 * SIZE(X), %xmm7
235 movss 0 * SIZE(X), %xmm8
/* Store the eight values to consecutive element slots 0..7 at XX. */
247 movss %xmm1, 0 * SIZE(XX)
248 movss %xmm2, 1 * SIZE(XX)
249 movss %xmm3, 2 * SIZE(XX)
250 movss %xmm4, 3 * SIZE(XX)
251 movss %xmm5, 4 * SIZE(XX)
252 movss %xmm6, 5 * SIZE(XX)
253 movss %xmm7, 6 * SIZE(XX)
254 movss %xmm8, 7 * SIZE(XX)
/* One-element variant of the same copy: one value from X to slot 0 of XX */
/* (presumably the remainder loop; its loop control is not visible here). */
268 movss 0 * SIZE(X), %xmm1
273 movss %xmm1, 0 * SIZE(XX)
281 /* now we don't need original X */
/*
 * Copy eight single-precision values read through YY into consecutive
 * slots at XX.  In this section YY is the source pointer and XX the
 * destination pointer, unlike in the compute loops later in the file.
 * NOTE(review): every load uses displacement 0; the pointer advance
 * between loads (presumably by INCY) and the code that points YY/XX at
 * their source and destination are not visible in this excerpt -- confirm.
 */
299 movss 0 * SIZE(YY), %xmm0
301 movss 0 * SIZE(YY), %xmm1
303 movss 0 * SIZE(YY), %xmm2
305 movss 0 * SIZE(YY), %xmm3
307 movss 0 * SIZE(YY), %xmm4
309 movss 0 * SIZE(YY), %xmm5
311 movss 0 * SIZE(YY), %xmm6
313 movss 0 * SIZE(YY), %xmm7
/* Store the eight values to element slots 0..7 at XX. */
316 movss %xmm0, 0 * SIZE(XX)
317 movss %xmm1, 1 * SIZE(XX)
318 movss %xmm2, 2 * SIZE(XX)
319 movss %xmm3, 3 * SIZE(XX)
320 movss %xmm4, 4 * SIZE(XX)
321 movss %xmm5, 5 * SIZE(XX)
322 movss %xmm6, 6 * SIZE(XX)
323 movss %xmm7, 7 * SIZE(XX)
/* One-element variant: one value from YY to slot 0 of XX (presumably the */
/* remainder loop; its loop control is not visible here). */
337 movss 0 * SIZE(YY), %xmm0
340 movss %xmm0, 0 * SIZE(XX)
/*
 * Four-wide block: processes four matrix lines addressed by A1, A1+LDA,
 * A2 and A2+LDA against the packed buffers NEW_X / NEW_Y at index IS.
 * NOTE(review): labels, loop counters, pointer increments and all
 * multiply/add instructions are missing from this excerpt; the comments
 * below describe only the visible loads, stores, shuffles and prefetches.
 */
/* A += 4 * LDA + 4 * SIZE (LDA was already multiplied by SIZE in the */
/* prologue); presumably steps to the next 4x4 diagonal block -- confirm. */
357 leaq 4 * SIZE(A, LDA, 4), A
/* XX = &NEW_X[IS], YY = &NEW_Y[IS + 4]. */
359 leaq (NEW_X, IS, SIZE), XX
360 leaq 4 * SIZE(NEW_Y, IS, SIZE), YY
/* Aligned 16-byte load of four elements at XX (movaps requires XX to be */
/* 16-byte aligned). */
362 movaps 0 * SIZE(XX), atemp4
/* xsum1 = four elements at A1, loaded as two 8-byte halves so that no */
/* alignment of A1 is required. */
364 movsd 0 * SIZE(A1), xsum1
365 movhps 2 * SIZE(A1), xsum1
/* Scalar loads of individual elements from the four lines; xsum2, xsum3 */
/* and xsum4 are each seeded from A1, and a2/a3/a4 are reloaded for each */
/* of them.  How the scalars are merged is not visible in this excerpt. */
368 movss 1 * SIZE(A1), xsum2
369 movss 1 * SIZE(A1, LDA, 1), a2
370 movss 2 * SIZE(A1, LDA, 1), a3
371 movss 3 * SIZE(A1, LDA, 1), a4
377 movss 2 * SIZE(A1), xsum3
378 movss 2 * SIZE(A1, LDA, 1), a2
379 movss 2 * SIZE(A2), a3
380 movss 3 * SIZE(A2), a4
386 movss 3 * SIZE(A1), xsum4
387 movss 3 * SIZE(A1, LDA, 1), a2
388 movss 3 * SIZE(A2), a3
389 movss 3 * SIZE(A2, LDA, 1), a4
/* Broadcast lanes 0, 1, 2 and 3 of atemp4 into atemp1, atemp2, atemp3 */
/* and atemp4.  atemp4 is written last because it is also the source. */
395 pshufd $0x00, atemp4, atemp1
396 pshufd $0x55, atemp4, atemp2
397 pshufd $0xaa, atemp4, atemp3
398 pshufd $0xff, atemp4, atemp4
/* Preload the next two 4-element groups from XX (aligned loads). */
400 movaps 4 * SIZE(XX), xtemp1
401 movaps 8 * SIZE(XX), xtemp2
/* yy1 = four elements at YY, loaded as two halves. */
403 movsd 0 * SIZE(YY), yy1
404 movhps 2 * SIZE(YY), yy1
/* a1..a4 = elements 4..7 of each of the four lines. */
406 movsd 4 * SIZE(A1), a1
407 movhps 6 * SIZE(A1), a1
408 movsd 4 * SIZE(A1, LDA, 1), a2
409 movhps 6 * SIZE(A1, LDA, 1), a2
410 movsd 4 * SIZE(A2), a3
411 movhps 6 * SIZE(A2), a3
412 movsd 4 * SIZE(A2, LDA, 1), a4
413 movhps 6 * SIZE(A2, LDA, 1), a4
/*
 * Unrolled body, four stages of four elements each.  Every stage reloads
 * a1..a4 four elements further along, prefetches one of the four lines,
 * reloads xtemp1 or xtemp2 from XX, stores the current yy1 to YY and loads
 * the next four elements of YY.  The loads run one stage ahead of the
 * stores (displacements reach 16/18 while the last store is at 12/14).
 */
/* Stage 1: displacements 4/6, prefetch A1. */
432 movsd 4 * SIZE(A1), a1
433 movhps 6 * SIZE(A1), a1
435 PREFETCH PREFETCHSIZE(A1)
442 movsd 4 * SIZE(A1, LDA, 1), a2
443 movhps 6 * SIZE(A1, LDA, 1), a2
450 movsd 4 * SIZE(A2), a3
451 movhps 6 * SIZE(A2), a3
/* XX is prefetched only when none of CORE2, PENRYN, DUNNINGTON is */
/* defined (the matching #endif is not visible in this excerpt). */
453 #if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON)
454 PREFETCH PREFETCHSIZE(XX)
458 movaps 8 * SIZE(XX), xtemp1
463 movsd 4 * SIZE(A2, LDA, 1), a4
464 movhps 6 * SIZE(A2, LDA, 1), a4
/* Write back elements 0..3 of YY, then load elements 4..7. */
466 movlps yy1, 0 * SIZE(YY)
467 movhps yy1, 2 * SIZE(YY)
468 movsd 4 * SIZE(YY), yy1
469 movhps 6 * SIZE(YY), yy1
/* Stage 2: displacements 8/10, prefetch A1+LDA. */
476 movsd 8 * SIZE(A1), a1
477 movhps 10 * SIZE(A1), a1
479 PREFETCH PREFETCHSIZE(A1, LDA, 1)
486 movsd 8 * SIZE(A1, LDA, 1), a2
487 movhps 10 * SIZE(A1, LDA, 1), a2
494 movsd 8 * SIZE(A2), a3
495 movhps 10 * SIZE(A2), a3
498 movaps 12 * SIZE(XX), xtemp2
503 movsd 8 * SIZE(A2, LDA, 1), a4
504 movhps 10 * SIZE(A2, LDA, 1), a4
/* Write back elements 4..7 of YY, then load elements 8..11. */
506 movlps yy1, 4 * SIZE(YY)
507 movhps yy1, 6 * SIZE(YY)
508 movsd 8 * SIZE(YY), yy1
509 movhps 10 * SIZE(YY), yy1
/* Stage 3: displacements 12/14, prefetch A2. */
517 movsd 12 * SIZE(A1), a1
518 movhps 14 * SIZE(A1), a1
520 PREFETCH PREFETCHSIZE(A2)
527 movsd 12 * SIZE(A1, LDA, 1), a2
528 movhps 14 * SIZE(A1, LDA, 1), a2
535 movsd 12 * SIZE(A2), a3
536 movhps 14 * SIZE(A2), a3
/* YY is prefetched with PREFETCHW under the same CPU condition as the */
/* XX prefetch in stage 1. */
538 #if !defined(CORE2) && !defined(PENRYN) && !defined(DUNNINGTON)
539 PREFETCHW PREFETCHSIZE(YY)
543 movaps 16 * SIZE(XX), xtemp1
548 movsd 12 * SIZE(A2, LDA, 1), a4
549 movhps 14 * SIZE(A2, LDA, 1), a4
/* Write back elements 8..11 of YY, then load elements 12..15. */
551 movlps yy1, 8 * SIZE(YY)
552 movhps yy1, 10 * SIZE(YY)
553 movsd 12 * SIZE(YY), yy1
554 movhps 14 * SIZE(YY), yy1
/* Stage 4: displacements 16/18, prefetch A2+LDA. */
561 movsd 16 * SIZE(A1), a1
562 movhps 18 * SIZE(A1), a1
564 PREFETCH PREFETCHSIZE(A2, LDA, 1)
571 movsd 16 * SIZE(A1, LDA, 1), a2
572 movhps 18 * SIZE(A1, LDA, 1), a2
579 movsd 16 * SIZE(A2), a3
580 movhps 18 * SIZE(A2), a3
583 movaps 20 * SIZE(XX), xtemp2
588 movsd 16 * SIZE(A2, LDA, 1), a4
589 movhps 18 * SIZE(A2, LDA, 1), a4
/* Write back elements 12..15 of YY, then load elements 16..19. */
591 movlps yy1, 12 * SIZE(YY)
592 movhps yy1, 14 * SIZE(YY)
593 movsd 16 * SIZE(YY), yy1
594 movhps 18 * SIZE(YY), yy1
/*
 * Two-stage variant of the body above without prefetches; it handles
 * eight elements (presumably the remainder when eight are left; the test
 * that selects it is not visible in this excerpt).
 */
617 movsd 4 * SIZE(A1), a1
618 movhps 6 * SIZE(A1), a1
625 movsd 4 * SIZE(A1, LDA, 1), a2
626 movhps 6 * SIZE(A1, LDA, 1), a2
633 movsd 4 * SIZE(A2), a3
634 movhps 6 * SIZE(A2), a3
637 movaps 8 * SIZE(XX), xtemp1
642 movsd 4 * SIZE(A2, LDA, 1), a4
643 movhps 6 * SIZE(A2, LDA, 1), a4
645 movlps yy1, 0 * SIZE(YY)
646 movhps yy1, 2 * SIZE(YY)
647 movsd 4 * SIZE(YY), yy1
648 movhps 6 * SIZE(YY), yy1
655 movsd 8 * SIZE(A1), a1
656 movhps 10 * SIZE(A1), a1
663 movsd 8 * SIZE(A1, LDA, 1), a2
664 movhps 10 * SIZE(A1, LDA, 1), a2
671 movsd 8 * SIZE(A2), a3
672 movhps 10 * SIZE(A2), a3
675 movaps 12 * SIZE(XX), xtemp2
680 movsd 8 * SIZE(A2, LDA, 1), a4
681 movhps 10 * SIZE(A2, LDA, 1), a4
683 movlps yy1, 4 * SIZE(YY)
684 movhps yy1, 6 * SIZE(YY)
685 movsd 8 * SIZE(YY), yy1
686 movhps 10 * SIZE(YY), yy1
/*
 * Four-element step that preloads only two elements for the next step:
 * a1..a4 and xtemp1 are loaded with movsd (8 bytes each), the full
 * four-element yy1 is stored, and only two elements of YY are reloaded.
 */
703 movsd 4 * SIZE(A1), a1
710 movsd 4 * SIZE(A1, LDA, 1), a2
717 movsd 4 * SIZE(A2), a3
720 movsd 4 * SIZE(XX), xtemp1
725 movsd 4 * SIZE(A2, LDA, 1), a4
727 movlps yy1, 0 * SIZE(YY)
728 movhps yy1, 2 * SIZE(YY)
729 movsd 4 * SIZE(YY), yy1
/*
 * Two-element step that preloads a single element: scalar (movss) loads
 * at displacement 2, the low two elements of yy1 are stored, and one
 * element of YY is reloaded.
 */
749 movss 2 * SIZE(A1), a1
757 movss 2 * SIZE(A1, LDA, 1), a2
765 movss 2 * SIZE(A2), a3
769 movss 2 * SIZE(XX), xtemp1
774 movss 2 * SIZE(A2, LDA, 1), a4
776 movlps yy1, 0 * SIZE(YY)
777 movss 2 * SIZE(YY), yy1
/* Final single-element step: scalar loads at displacement 0 from XX, YY */
/* and the four lines, followed by a scalar store of yy1 to YY. */
789 movss 0 * SIZE(XX), xtemp1
791 movss 0 * SIZE(YY), yy1
793 movss 0 * SIZE(A1), a1
794 movss 0 * SIZE(A1, LDA, 1), a2
795 movss 0 * SIZE(A2), a3
796 movss 0 * SIZE(A2, LDA, 1), a4
822 movss yy1, 0 * SIZE(YY)
/*
 * Interleave the four accumulators xsum1..xsum4 with unpcklps/unpckhps.
 * NOTE(review): this looks like the usual 4x4 transpose that precedes a
 * lane-wise reduction of the accumulators, but the register copies and
 * adds between these steps are not visible in this excerpt -- confirm.
 */
828 unpcklps xsum3, xsum1
829 unpckhps xsum3, xtemp1
832 unpcklps xsum4, xsum2
833 unpckhps xsum4, xtemp2
836 unpcklps xsum2, xsum1
837 unpckhps xsum2, xsum3
840 unpcklps xtemp2, xtemp1
841 unpckhps xtemp2, xsum4
/* Read-modify-write of the four elements NEW_Y[IS .. IS+3]: load them */
/* into yy1, then store yy1 back (the update in between is not visible */
/* in this excerpt). */
853 movsd 0 * SIZE(NEW_Y, IS, SIZE), yy1
854 movhps 2 * SIZE(NEW_Y, IS, SIZE), yy1
858 movsd yy1, 0 * SIZE(NEW_Y, IS, SIZE)
859 movhps yy1, 2 * SIZE(NEW_Y, IS, SIZE)
/*
 * Two-wide tail: same scheme as the four-wide block, restricted to the
 * two lines A1 and A1+LDA.
 * NOTE(review): the branch that selects this path and its arithmetic
 * instructions are not visible in this excerpt.
 */
/* A += 2 * LDA + 2 * SIZE (LDA was already multiplied by SIZE in the */
/* prologue). */
874 leaq 2 * SIZE(A, LDA, 2), A
/* Aligned 16-byte load of four elements at &NEW_X[IS]; only lanes 0 and 1 */
/* are broadcast below. */
876 movaps 0 * SIZE(NEW_X, IS, SIZE), atemp4
/* xsum1 = two elements at A1; xsum2 and a2 are scalar loads of element 1 */
/* of A1 and of A1+LDA. */
881 movsd 0 * SIZE(A1), xsum1
884 movss 1 * SIZE(A1), xsum2
885 movss 1 * SIZE(A1, LDA, 1), a2
/* Broadcast lane 0 of atemp4 into atemp1 and lane 1 into atemp2. */
889 pshufd $0x00, atemp4, atemp1
890 pshufd $0x55, atemp4, atemp2
/* Scalar loads of element 2 of both lines and of NEW_X[IS + 2] and */
/* NEW_Y[IS + 2]; the result is stored back to NEW_Y[IS + 2]. */
895 movss 2 * SIZE(A1), a1
896 movss 2 * SIZE(A1, LDA, 1), a2
897 movss 2 * SIZE(NEW_X, IS, SIZE), xtemp1
898 movss 2 * SIZE(NEW_Y, IS, SIZE), yy1
912 movss yy1, 2 * SIZE(NEW_Y, IS, SIZE)
/* Interleave the low lanes of xsum1 and xsum2. */
918 unpcklps xsum2, xsum1
/* Read-modify-write of the two elements NEW_Y[IS .. IS+1] (the update in */
/* between is not visible in this excerpt). */
926 movsd 0 * SIZE(NEW_Y, IS, SIZE), yy1
930 movlps yy1, 0 * SIZE(NEW_Y, IS, SIZE)
/* One-element tail: NEW_Y[IS] += NEW_X[IS] * A[0], all in single precision. */
/* xsum1 = NEW_X[IS] */
939 movss 0 * SIZE(NEW_X, IS, SIZE), xsum1
/* xsum1 *= A[0] */
940 mulss 0 * SIZE(A), xsum1
/* xsum1 += NEW_Y[IS] */
941 addss 0 * SIZE(NEW_Y, IS, SIZE), xsum1
/* NEW_Y[IS] = xsum1 */
942 movss xsum1, 0 * SIZE(NEW_Y, IS, SIZE)
/*
 * Copy results from the packed buffer NEW_Y back out through Y, eight
 * elements per pass.  Loads read consecutive slots 0..7 of NEW_Y.
 * xmm6 and xmm7 are used as scratch here; the prologue saves them.
 */
955 movss 0 * SIZE(NEW_Y), %xmm0
956 movss 1 * SIZE(NEW_Y), %xmm1
957 movss 2 * SIZE(NEW_Y), %xmm2
958 movss 3 * SIZE(NEW_Y), %xmm3
959 movss 4 * SIZE(NEW_Y), %xmm4
960 movss 5 * SIZE(NEW_Y), %xmm5
961 movss 6 * SIZE(NEW_Y), %xmm6
962 movss 7 * SIZE(NEW_Y), %xmm7
/* Every store writes at displacement 0 from Y.  NOTE(review): the pointer */
/* advance between stores (presumably Y += INCY) is not visible in this */
/* excerpt -- confirm. */
964 movss %xmm0, 0 * SIZE(Y)
966 movss %xmm1, 0 * SIZE(Y)
968 movss %xmm2, 0 * SIZE(Y)
970 movss %xmm3, 0 * SIZE(Y)
972 movss %xmm4, 0 * SIZE(Y)
974 movss %xmm5, 0 * SIZE(Y)
976 movss %xmm6, 0 * SIZE(Y)
978 movss %xmm7, 0 * SIZE(Y)
/* Step the packed source past the eight elements just copied. */
981 addq $8 * SIZE, NEW_Y
/* One-element variant: copy a single value and advance NEW_Y by one */
/* element (presumably the remainder loop; its loop control is not */
/* visible here). */
993 movss 0 * SIZE(NEW_Y), %xmm0
995 movss %xmm0, 0 * SIZE(Y)
998 addq $1 * SIZE, NEW_Y
/* Epilogue: reload xmm6-xmm15 from the slots the prologue wrote */
/* (64..208(%rsp)); %rsp must still hold its post-prologue value here. */
1015 movups 64(%rsp), %xmm6
1016 movups 80(%rsp), %xmm7
1017 movups 96(%rsp), %xmm8
1018 movups 112(%rsp), %xmm9
1019 movups 128(%rsp), %xmm10
1020 movups 144(%rsp), %xmm11
1021 movups 160(%rsp), %xmm12
1022 movups 176(%rsp), %xmm13
1023 movups 192(%rsp), %xmm14
1024 movups 208(%rsp), %xmm15
/* Release the STACKSIZE-byte frame reserved by the prologue. */
1027 addq $STACKSIZE, %rsp