1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin. */
3 /* All rights reserved. */
5 /* Redistribution and use in source and binary forms, with or */
6 /* without modification, are permitted provided that the following */
7 /* conditions are met: */
9 /* 1. Redistributions of source code must retain the above */
10 /* copyright notice, this list of conditions and the following */
11 /* disclaimer. */
13 /* 2. Redistributions in binary form must reproduce the above */
14 /* copyright notice, this list of conditions and the following */
15 /* disclaimer in the documentation and/or other materials */
16 /* provided with the distribution. */
18 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31 /* POSSIBILITY OF SUCH DAMAGE. */
33 /* The views and conclusions contained in the software and */
34 /* documentation are those of the authors and should not be */
35 /* interpreted as representing official policies, either expressed */
36 /* or implied, of The University of Texas at Austin. */
37 /*********************************************************************/
/*
 * Prefetch tuning. Every group below defines the same three macros:
 *   PREFETCH, PREFETCHW - the prefetch mnemonics the kernel emits. Most
 *                         groups map both to prefetcht0; two groups use
 *                         prefetch / prefetchw instead.
 *   PREFETCHSIZE        - a constant written as (16 * k). Its unit and the
 *                         addresses it is applied to are not visible in
 *                         this excerpt.
 *
 * NOTE(review): only three guard lines are visible here (PENRYN/DUNNINGTON,
 * NEHALEM/SANDYBRIDGE, and the BARCELONA/SHANGHAI/BOBCAT/BULLDOZER list).
 * No matching #endif, and no guard for the remaining groups, appears in
 * this view; the embedded source numbering jumps between groups, so those
 * directives presumably sit on the omitted lines. Read as shown, each
 * group would simply redefine the previous one - confirm against the
 * complete file before changing any value.
 */
43 #define PREFETCH prefetcht0
44 #define PREFETCHW prefetcht0
45 #define PREFETCHSIZE (16 * 24)
/* Same values as the group above; its guard is not visible. */
49 #define PREFETCH prefetcht0
50 #define PREFETCHW prefetcht0
51 #define PREFETCHSIZE (16 * 24)
/* Penryn / Dunnington. */
54 #if defined(PENRYN) || defined(DUNNINGTON)
55 #define PREFETCH prefetcht0
56 #define PREFETCHW prefetcht0
57 #define PREFETCHSIZE (16 * 24)
/* Nehalem / Sandy Bridge. */
60 #if defined(NEHALEM) || defined(SANDYBRIDGE)
61 #define PREFETCH prefetcht0
62 #define PREFETCHW prefetcht0
63 #define PREFETCHSIZE (16 * 24)
/* Largest constant in the file (16 * 28); guard not visible. */
67 #define PREFETCH prefetcht0
68 #define PREFETCHW prefetcht0
69 #define PREFETCHSIZE (16 * 28)
/* First group using prefetch / prefetchw; guard not visible. */
73 #define PREFETCH prefetch
74 #define PREFETCHW prefetchw
75 #define PREFETCHSIZE (16 * 12)
/* Barcelona / Shanghai / Bobcat / Bulldozer: prefetch / prefetchw. */
79 #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
80 #define PREFETCH prefetch
81 #define PREFETCHW prefetchw
82 #define PREFETCHSIZE (16 * 16)
/* Guard not visible. */
86 #define PREFETCH prefetcht0
87 #define PREFETCHW prefetcht0
88 #define PREFETCHSIZE (16 * 24)
/* Guard not visible. */
92 #define PREFETCH prefetcht0
93 #define PREFETCHW prefetcht0
94 #define PREFETCHSIZE (16 * 14)
/*
 * Stack-resident arguments. Each OLD_* macro expands to a memory operand
 * of the form  disp + STACKSIZE(%rsp),  i.e. the slot (disp + STACKSIZE)
 * bytes above the current %rsp. The prologue lowers %rsp by STACKSIZE
 * (subq $STACKSIZE, %rsp), so after that adjustment these operands address
 * the slot disp bytes above the value %rsp had before the subtraction.
 *
 * STACKSIZE is referenced by the first group before the #define shown
 * below; that is harmless because the preprocessor expands it at the point
 * of use, not at the point of definition.
 *
 * NOTE(review): two alternative layouts are visible (displacements 8..24
 * and 40..88). Presumably they belong to different calling conventions
 * selected by a guard on the omitted lines, and the first layout may have
 * its own STACKSIZE definition there - confirm in the complete file.
 */
/* Layout 1: Y, INCY and BUFFER only. */
101 #define OLD_Y 8 + STACKSIZE(%rsp)
102 #define OLD_INCY 16 + STACKSIZE(%rsp)
103 #define OLD_BUFFER 24 + STACKSIZE(%rsp)
/* Size of the local frame reserved by the prologue (visible definition). */
114 #define STACKSIZE 256
/* Layout 2: A, LDA, X, INCX, Y, INCY and BUFFER, in 8-byte slots. */
116 #define OLD_A 40 + STACKSIZE(%rsp)
117 #define OLD_LDA 48 + STACKSIZE(%rsp)
118 #define OLD_X 56 + STACKSIZE(%rsp)
119 #define OLD_INCX 64 + STACKSIZE(%rsp)
120 #define OLD_Y 72 + STACKSIZE(%rsp)
121 #define OLD_INCY 80 + STACKSIZE(%rsp)
122 #define OLD_BUFFER 88 + STACKSIZE(%rsp)
/* Register aliases for the two alpha values (presumably the real and */
/* imaginary parts of the scalar, going by the names).                */
146 #define ALPHA_R %xmm0
147 #define ALPHA_I %xmm1
/*
 * MOVDDUP(a, b, c)  : load the 8 bytes at a(b) into both halves of c.
 * MOVDDUP2(a, b, c) : same, but a and b are token-pasted (a##b) to form
 *                     the memory operand.
 *
 * First pair: a single movddup instruction, used when the condition
 * below holds. Second pair: the same effect built from movlpd (fills the
 * low 8 bytes) followed by movhpd (fills the high 8 bytes) reading the
 * same address.
 *
 * NOTE(review): the #else and #endif separating the two pairs are not
 * visible in this excerpt; presumably they are on the omitted lines.
 */
169 #if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
170 #define MOVDDUP(a, b, c) movddup a(b), c
171 #define MOVDDUP2(a, b, c) movddup a##b, c
173 #define MOVDDUP(a, b, c) movlpd a(b), c;movhpd a(b), c
174 #define MOVDDUP2(a, b, c) movlpd a##b, c;movhpd a##b, c
/*
 * Prologue (fragment). Instructions are missing between the visible ones
 * (the embedded numbering jumps), so only what is shown is described.
 */
/* Reserve STACKSIZE bytes of local frame. */
180 subq $STACKSIZE, %rsp
/*
 * Spill xmm6-xmm15 into the frame at offsets 64..208, 16 bytes apart.
 * movups is the unaligned form, so the slots need not be 16-byte aligned.
 * NOTE(review): presumably this is the callee-saved register spill of the
 * Windows x64 convention and is guarded by a conditional on the omitted
 * lines - confirm. The matching restores are not visible in this excerpt.
 */
191 movups %xmm6, 64(%rsp)
192 movups %xmm7, 80(%rsp)
193 movups %xmm8, 96(%rsp)
194 movups %xmm9, 112(%rsp)
195 movups %xmm10, 128(%rsp)
196 movups %xmm11, 144(%rsp)
197 movups %xmm12, 160(%rsp)
198 movups %xmm13, 176(%rsp)
199 movups %xmm14, 192(%rsp)
200 movups %xmm15, 208(%rsp)
/* Load the value in the OLD_BUFFER stack slot into BUFFER (BUFFER is */
/* defined outside this excerpt).                                      */
213 movq OLD_BUFFER, BUFFER
/*
 * Multiply INCX, INCY and LDA by 2^ZBASE_SHIFT.
 * NOTE(review): presumably this converts element counts into byte
 * strides; ZBASE_SHIFT is defined outside this excerpt - confirm.
 */
215 salq $ZBASE_SHIFT, INCX
216 salq $ZBASE_SHIFT, INCY
217 salq $ZBASE_SHIFT, LDA
/*
 * Alpha lane setup (fragment).
 */
/* Interleave the high two lanes of xmm2 and xmm3 into xmm2. The values */
/* of both registers are produced on lines not visible here.            */
232 unpckhps %xmm3, %xmm2
/* Broadcast lane 0 of each alpha register to all four lanes. */
234 shufps $0, ALPHA_R, ALPHA_R
235 shufps $0, ALPHA_I, ALPHA_I
/* Keep a copy of the broadcast ALPHA_I: the next instruction overwrites */
/* ALPHA_I, and the copy is needed to build ALPHA_R afterwards.          */
236 movaps ALPHA_I, %xmm3
/*
 * Interleave the low lanes. Given the broadcasts above, ALPHA_I becomes
 * (i, r, i, r) and ALPHA_R becomes (r, i, r, i), lane 0 first, where r and
 * i are the original lane-0 values of ALPHA_R and ALPHA_I.
 * NOTE(review): any further adjustment (for example a sign flip using
 * xmm2) would be on the omitted lines that follow - confirm.
 */
238 unpcklps ALPHA_R, ALPHA_I
239 unpcklps %xmm3, ALPHA_R
/*
 * Staging of X into the XX work buffer (fragment). Three variants are
 * visible, handling 4, 2 and 1 eight-byte items respectively.
 *
 * NOTE(review): every load below uses displacement 0 from X, so X is
 * presumably advanced between loads on the omitted lines. Lines are also
 * missing between the lane splits and the stores (presumably the scaling
 * by alpha), so the stored values are not simply the loaded ones.
 * Labels, loop counters and branches are not visible either.
 */
/* Variant 1: gather four 8-byte items, two per register (low half via */
/* movsd, high half via movhps).                                        */
250 movsd 0 * SIZE(X), %xmm4
252 movhps 0 * SIZE(X), %xmm4
254 movsd 0 * SIZE(X), %xmm6
256 movhps 0 * SIZE(X), %xmm6
/* Split lanes: movsldup duplicates lanes 0 and 2 (into xmm3/xmm5),     */
/* movshdup duplicates lanes 1 and 3 (in place in xmm4/xmm6). The       */
/* movsldup must come first because movshdup overwrites its own source. */
259 movsldup %xmm4, %xmm3
260 movshdup %xmm4, %xmm4
261 movsldup %xmm6, %xmm5
262 movshdup %xmm6, %xmm6
/* Aligned 16-byte stores into XX; movaps requires these addresses to */
/* be 16-byte aligned.                                                 */
272 movaps %xmm3, 4 * SIZE(XX)
273 movaps %xmm5, 12 * SIZE(XX)
/* 0xb1 selects lanes (1, 0, 3, 2): swap the two lanes of each pair. */
275 shufps $0xb1, %xmm3, %xmm3
276 shufps $0xb1, %xmm5, %xmm5
/* Store the second form just below the first one stored above. */
281 movaps %xmm3, 0 * SIZE(XX)
282 movaps %xmm5, 8 * SIZE(XX)
/* Variant 2: same sequence for two 8-byte items (one register). */
293 movsd 0 * SIZE(X), %xmm4
295 movhps 0 * SIZE(X), %xmm4
298 movsldup %xmm4, %xmm3
299 movshdup %xmm4, %xmm4
306 movaps %xmm3, 4 * SIZE(XX)
308 shufps $0xb1, %xmm3, %xmm3
310 movaps %xmm3, 0 * SIZE(XX)
/* Variant 3: a single 8-byte item; only the low half of the register */
/* is loaded, and movlps stores just 8 bytes at each destination.      */
319 movsd 0 * SIZE(X), %xmm4
322 movsldup %xmm4, %xmm3
323 movshdup %xmm4, %xmm4
330 movlps %xmm3, 2 * SIZE(XX)
332 shufps $0xb1, %xmm3, %xmm3
334 movlps %xmm3, 0 * SIZE(XX)
340 /* now we don't need original X */
/*
 * Staging of Y into the XX buffer (fragment).
 *
 * NOTE(review): all loads use displacement 0 from YY, so YY is presumably
 * advanced between them on the omitted lines; the branch that selects
 * this path, the loop structure and the XX update are not visible.
 */
/* Gather four 8-byte items from YY into xmm0 and xmm1 (low half via */
/* movsd, high half via movhps).                                      */
358 movsd 0 * SIZE(YY), %xmm0
360 movhps 0 * SIZE(YY), %xmm0
362 movsd 0 * SIZE(YY), %xmm1
364 movhps 0 * SIZE(YY), %xmm1
/* Aligned 16-byte stores into XX; the addresses must be 16-byte aligned. */
367 movaps %xmm0, 0 * SIZE(XX)
368 movaps %xmm1, 8 * SIZE(XX)
/* Tail: copy a single 8-byte item. */
382 movsd 0 * SIZE(YY), %xmm0
385 movlps %xmm0, 0 * SIZE(XX)
/*
 * Main computation (fragment). atemp1-4, xtemp1-2, xt2, yy1, a1, NEW_X,
 * I, A1, A2, YY and XX are macros defined outside this excerpt.
 *
 * NOTE(review): the multiply/add instructions, pointer updates, labels
 * and branches are all on omitted lines; only the data movement and lane
 * shuffles are visible, so the arithmetic is not described here.
 */
/* Load four 8-byte items from NEW_X + I*SIZE: items 0 and 2 (offsets 0 */
/* and 4) into atemp2, items 1 and 3 (offsets 2 and 6) into atemp4.     */
406 movsd 0 * SIZE(NEW_X, I, SIZE), atemp2
407 movhps 4 * SIZE(NEW_X, I, SIZE), atemp2
408 movsd 2 * SIZE(NEW_X, I, SIZE), atemp4
409 movhps 6 * SIZE(NEW_X, I, SIZE), atemp4
/*
 * Lane selections: 0xcc picks lanes (0, 3, 0, 3), 0x99 picks (1, 2, 1, 2).
 * Order matters: atemp1/atemp3 must be derived before atemp2/atemp4 are
 * shuffled in place.
 */
411 pshufd $0xcc, atemp2, atemp1
412 pshufd $0x99, atemp2, atemp2
413 pshufd $0xcc, atemp4, atemp3
414 pshufd $0x99, atemp4, atemp4
/* Load 16 bytes of Y as two 8-byte halves, so no alignment is required. */
445 movsd 0 * SIZE(YY), yy1
446 movhps 2 * SIZE(YY), yy1
/* Aligned loads from the XX buffer. */
448 movaps 0 * SIZE(XX), xtemp1
449 movaps 4 * SIZE(XX), xtemp2
/* Load 16 bytes from the A1 column, again as two halves. */
451 movsd 0 * SIZE(A1), a1
452 movhps 2 * SIZE(A1), a1
/* 0xb1 picks lanes (1, 0, 3, 2): a copy of a1 with each pair swapped. */
461 pshufd $0xb1, a1, xt2
/* Same load and pair swap for the A2 column. */
467 movsd 0 * SIZE(A2), a1
468 movhps 2 * SIZE(A2), a1
477 pshufd $0xb1, a1, xt2
/* Write yy1 back to Y as two 8-byte halves. */
483 movlps yy1, 0 * SIZE(YY)
484 movhps yy1, 2 * SIZE(YY)
/* Second visible path: aligned loads from NEW_X + I*SIZE. */
495 movaps 0 * SIZE(NEW_X, I, SIZE), atemp1
496 movaps 4 * SIZE(NEW_X, I, SIZE), atemp2
498 movlps 0 * SIZE(YY), yy1
499 movhps 2 * SIZE(YY), yy1
/* Here a1 combines the first 8 bytes of A1 (low half) with the first */
/* 8 bytes of A2 (high half).                                          */
501 movsd 0 * SIZE(A1), a1
502 movhps 0 * SIZE(A2), a1
/* Then a1 is reloaded with 16 bytes from A2 alone. */
510 movsd 0 * SIZE(A2), a1
511 movhps 2 * SIZE(A2), a1
/* Write yy1 back to Y as two 8-byte halves. */
525 movlps yy1, 0 * SIZE(YY)
526 movhps yy1, 2 * SIZE(YY)
/*
 * Copy results from NEW_Y back to Y, then release the frame (fragment).
 *
 * NOTE(review): every store below uses displacement 0 from Y, so Y is
 * presumably advanced between stores on the omitted lines. The loop
 * structure, the restore of any saved registers and the return
 * instruction are not visible in this excerpt.
 */
/* Aligned load of 32 bytes from NEW_Y (must be 16-byte aligned). */
551 movaps 0 * SIZE(NEW_Y), %xmm0
552 movaps 4 * SIZE(NEW_Y), %xmm1
/* Store them to Y as four separate 8-byte pieces: low and high half of */
/* xmm0, then low and high half of xmm1.                                 */
554 movlps %xmm0, 0 * SIZE(Y)
556 movhps %xmm0, 0 * SIZE(Y)
558 movlps %xmm1, 0 * SIZE(Y)
560 movhps %xmm1, 0 * SIZE(Y)
/* Step NEW_Y past the 8 * SIZE bytes just consumed. */
563 addq $8 * SIZE, NEW_Y
/* Tail: move a single 8-byte item and step NEW_Y by 2 * SIZE. NEW_Y is */
/* advanced before the store, which is fine because the data is already */
/* in xmm0.                                                              */
575 movlps 0 * SIZE(NEW_Y), %xmm0
576 addq $2 * SIZE, NEW_Y
578 movlps %xmm0, 0 * SIZE(Y)
/* Release the STACKSIZE bytes reserved by the prologue. */
592 addq $STACKSIZE, %rsp