1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin. */
3 /* All rights reserved. */
5 /* Redistribution and use in source and binary forms, with or */
6 /* without modification, are permitted provided that the following */
7 /* conditions are met: */
9 /* 1. Redistributions of source code must retain the above */
10 /* copyright notice, this list of conditions and the following disclaimer. */
13 /* 2. Redistributions in binary form must reproduce the above */
14 /* copyright notice, this list of conditions and the following */
15 /* disclaimer in the documentation and/or other materials */
16 /* provided with the distribution. */
18 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31 /* POSSIBILITY OF SUCH DAMAGE. */
33 /* The views and conclusions contained in the software and */
34 /* documentation are those of the authors and should not be */
35 /* interpreted as representing official policies, either expressed */
36 /* or implied, of The University of Texas at Austin. */
37 /*********************************************************************/
/* Forward declaration: init_parameter() is defined further down in this file,
 * where it fills in the blocking parameters (p/q/r) of TABLE_NAME. */
49 static void init_parameter(void);
/*
 * Positional initializer for the gotoblas_t dispatch table named TABLE_NAME.
 *
 * Every kernel symbol carries a "TS" suffix, and the entries are grouped by
 * type prefix in the order s, d, q, c, z, x.  Each group starts with its
 * GEMM unroll factors (UNROLL_M, UNROLL_N and MAX of the two) and is followed
 * by that prefix's kernel symbols.
 *
 * NOTE(review): in this view the `#if ..._UNROLL_M != ..._UNROLL_N` and
 * `#ifdef` lines have no visible `#else` / `#endif`, so consecutive rows that
 * repeat the same "o*copy" symbols are presumably alternative preprocessor
 * branches rather than duplicated fields -- confirm against the complete file.
 * The closing `};` of the initializer is likewise not visible here.
 */
51 gotoblas_t TABLE_NAME = {
/* Packing-buffer offset/alignment defaults (read back later through
 * TABLE_NAME.offsetA and TABLE_NAME.align in init_parameter). */
54 GEMM_DEFAULT_OFFSET_A, GEMM_DEFAULT_OFFSET_B, GEMM_DEFAULT_ALIGN,
/* ---- "s" group (presumably single-precision real -- confirm) ---- */
57 SGEMM_DEFAULT_UNROLL_M, SGEMM_DEFAULT_UNROLL_N, MAX(SGEMM_DEFAULT_UNROLL_M, SGEMM_DEFAULT_UNROLL_N),
/* NOTE(review): the body guarded by HAVE_EXCLUSIVE_CACHE is not visible here. */
58 #ifdef HAVE_EXCLUSIVE_CACHE
64 samax_kTS, samin_kTS, smax_kTS, smin_kTS,
65 isamax_kTS, isamin_kTS, ismax_kTS, ismin_kTS,
66 snrm2_kTS, sasum_kTS, scopy_kTS, sdot_kTS,
68 srot_kTS, saxpy_kTS, sscal_kTS, sswap_kTS,
69 sgemv_nTS, sgemv_tTS, sger_kTS,
72 sgemm_kernelTS, sgemm_betaTS,
/* When UNROLL_M differs from UNROLL_N the "i*" copy routines are listed in
 * addition to the "o*" ones; otherwise only the "o*" symbols appear. */
73 #if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N
74 sgemm_incopyTS, sgemm_itcopyTS,
76 sgemm_oncopyTS, sgemm_otcopyTS,
78 sgemm_oncopyTS, sgemm_otcopyTS,
79 strsm_kernel_LNTS, strsm_kernel_LTTS, strsm_kernel_RNTS, strsm_kernel_RTTS,
80 #if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N
81 strsm_iunucopyTS, strsm_iunncopyTS, strsm_iutucopyTS, strsm_iutncopyTS,
82 strsm_ilnucopyTS, strsm_ilnncopyTS, strsm_iltucopyTS, strsm_iltncopyTS,
84 strsm_ounucopyTS, strsm_ounncopyTS, strsm_outucopyTS, strsm_outncopyTS,
85 strsm_olnucopyTS, strsm_olnncopyTS, strsm_oltucopyTS, strsm_oltncopyTS,
87 strsm_ounucopyTS, strsm_ounncopyTS, strsm_outucopyTS, strsm_outncopyTS,
88 strsm_olnucopyTS, strsm_olnncopyTS, strsm_oltucopyTS, strsm_oltncopyTS,
89 strmm_kernel_RNTS, strmm_kernel_RTTS, strmm_kernel_LNTS, strmm_kernel_LTTS,
90 #if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N
91 strmm_iunucopyTS, strmm_iunncopyTS, strmm_iutucopyTS, strmm_iutncopyTS,
92 strmm_ilnucopyTS, strmm_ilnncopyTS, strmm_iltucopyTS, strmm_iltncopyTS,
94 strmm_ounucopyTS, strmm_ounncopyTS, strmm_outucopyTS, strmm_outncopyTS,
95 strmm_olnucopyTS, strmm_olnncopyTS, strmm_oltucopyTS, strmm_oltncopyTS,
97 strmm_ounucopyTS, strmm_ounncopyTS, strmm_outucopyTS, strmm_outncopyTS,
98 strmm_olnucopyTS, strmm_olnncopyTS, strmm_oltucopyTS, strmm_oltncopyTS,
99 #if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N
100 ssymm_iutcopyTS, ssymm_iltcopyTS,
102 ssymm_outcopyTS, ssymm_oltcopyTS,
104 ssymm_outcopyTS, ssymm_oltcopyTS,
107 sneg_tcopyTS, slaswp_ncopyTS,
/* ---- "d" group (presumably double-precision real -- confirm) ---- */
113 DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N, MAX(DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N),
115 damax_kTS, damin_kTS, dmax_kTS, dmin_kTS,
116 idamax_kTS, idamin_kTS, idmax_kTS, idmin_kTS,
117 dnrm2_kTS, dasum_kTS, dcopy_kTS, ddot_kTS,
118 drot_kTS, daxpy_kTS, dscal_kTS, dswap_kTS,
119 dgemv_nTS, dgemv_tTS, dger_kTS,
120 dsymv_LTS, dsymv_UTS,
122 dgemm_kernelTS, dgemm_betaTS,
123 #if DGEMM_DEFAULT_UNROLL_M != DGEMM_DEFAULT_UNROLL_N
124 dgemm_incopyTS, dgemm_itcopyTS,
126 dgemm_oncopyTS, dgemm_otcopyTS,
128 dgemm_oncopyTS, dgemm_otcopyTS,
129 dtrsm_kernel_LNTS, dtrsm_kernel_LTTS, dtrsm_kernel_RNTS, dtrsm_kernel_RTTS,
130 #if DGEMM_DEFAULT_UNROLL_M != DGEMM_DEFAULT_UNROLL_N
131 dtrsm_iunucopyTS, dtrsm_iunncopyTS, dtrsm_iutucopyTS, dtrsm_iutncopyTS,
132 dtrsm_ilnucopyTS, dtrsm_ilnncopyTS, dtrsm_iltucopyTS, dtrsm_iltncopyTS,
134 dtrsm_ounucopyTS, dtrsm_ounncopyTS, dtrsm_outucopyTS, dtrsm_outncopyTS,
135 dtrsm_olnucopyTS, dtrsm_olnncopyTS, dtrsm_oltucopyTS, dtrsm_oltncopyTS,
137 dtrsm_ounucopyTS, dtrsm_ounncopyTS, dtrsm_outucopyTS, dtrsm_outncopyTS,
138 dtrsm_olnucopyTS, dtrsm_olnncopyTS, dtrsm_oltucopyTS, dtrsm_oltncopyTS,
139 dtrmm_kernel_RNTS, dtrmm_kernel_RTTS, dtrmm_kernel_LNTS, dtrmm_kernel_LTTS,
140 #if DGEMM_DEFAULT_UNROLL_M != DGEMM_DEFAULT_UNROLL_N
141 dtrmm_iunucopyTS, dtrmm_iunncopyTS, dtrmm_iutucopyTS, dtrmm_iutncopyTS,
142 dtrmm_ilnucopyTS, dtrmm_ilnncopyTS, dtrmm_iltucopyTS, dtrmm_iltncopyTS,
144 dtrmm_ounucopyTS, dtrmm_ounncopyTS, dtrmm_outucopyTS, dtrmm_outncopyTS,
145 dtrmm_olnucopyTS, dtrmm_olnncopyTS, dtrmm_oltucopyTS, dtrmm_oltncopyTS,
147 dtrmm_ounucopyTS, dtrmm_ounncopyTS, dtrmm_outucopyTS, dtrmm_outncopyTS,
148 dtrmm_olnucopyTS, dtrmm_olnncopyTS, dtrmm_oltucopyTS, dtrmm_oltncopyTS,
149 #if DGEMM_DEFAULT_UNROLL_M != DGEMM_DEFAULT_UNROLL_N
150 dsymm_iutcopyTS, dsymm_iltcopyTS,
152 dsymm_outcopyTS, dsymm_oltcopyTS,
154 dsymm_outcopyTS, dsymm_oltcopyTS,
157 dneg_tcopyTS, dlaswp_ncopyTS,
/* ---- "q" group (presumably extended/quad-precision real; likely guarded
 * by a preprocessor condition not visible here -- confirm) ---- */
165 QGEMM_DEFAULT_UNROLL_M, QGEMM_DEFAULT_UNROLL_N, MAX(QGEMM_DEFAULT_UNROLL_M, QGEMM_DEFAULT_UNROLL_N),
167 qamax_kTS, qamin_kTS, qmax_kTS, qmin_kTS,
168 iqamax_kTS, iqamin_kTS, iqmax_kTS, iqmin_kTS,
169 qnrm2_kTS, qasum_kTS, qcopy_kTS, qdot_kTS,
170 qrot_kTS, qaxpy_kTS, qscal_kTS, qswap_kTS,
171 qgemv_nTS, qgemv_tTS, qger_kTS,
172 qsymv_LTS, qsymv_UTS,
174 qgemm_kernelTS, qgemm_betaTS,
175 #if QGEMM_DEFAULT_UNROLL_M != QGEMM_DEFAULT_UNROLL_N
176 qgemm_incopyTS, qgemm_itcopyTS,
178 qgemm_oncopyTS, qgemm_otcopyTS,
180 qgemm_oncopyTS, qgemm_otcopyTS,
181 qtrsm_kernel_LNTS, qtrsm_kernel_LTTS, qtrsm_kernel_RNTS, qtrsm_kernel_RTTS,
182 #if QGEMM_DEFAULT_UNROLL_M != QGEMM_DEFAULT_UNROLL_N
183 qtrsm_iunucopyTS, qtrsm_iunncopyTS, qtrsm_iutucopyTS, qtrsm_iutncopyTS,
184 qtrsm_ilnucopyTS, qtrsm_ilnncopyTS, qtrsm_iltucopyTS, qtrsm_iltncopyTS,
186 qtrsm_ounucopyTS, qtrsm_ounncopyTS, qtrsm_outucopyTS, qtrsm_outncopyTS,
187 qtrsm_olnucopyTS, qtrsm_olnncopyTS, qtrsm_oltucopyTS, qtrsm_oltncopyTS,
189 qtrsm_ounucopyTS, qtrsm_ounncopyTS, qtrsm_outucopyTS, qtrsm_outncopyTS,
190 qtrsm_olnucopyTS, qtrsm_olnncopyTS, qtrsm_oltucopyTS, qtrsm_oltncopyTS,
191 qtrmm_kernel_RNTS, qtrmm_kernel_RTTS, qtrmm_kernel_LNTS, qtrmm_kernel_LTTS,
192 #if QGEMM_DEFAULT_UNROLL_M != QGEMM_DEFAULT_UNROLL_N
193 qtrmm_iunucopyTS, qtrmm_iunncopyTS, qtrmm_iutucopyTS, qtrmm_iutncopyTS,
194 qtrmm_ilnucopyTS, qtrmm_ilnncopyTS, qtrmm_iltucopyTS, qtrmm_iltncopyTS,
196 qtrmm_ounucopyTS, qtrmm_ounncopyTS, qtrmm_outucopyTS, qtrmm_outncopyTS,
197 qtrmm_olnucopyTS, qtrmm_olnncopyTS, qtrmm_oltucopyTS, qtrmm_oltncopyTS,
199 qtrmm_ounucopyTS, qtrmm_ounncopyTS, qtrmm_outucopyTS, qtrmm_outncopyTS,
200 qtrmm_olnucopyTS, qtrmm_olnncopyTS, qtrmm_oltucopyTS, qtrmm_oltncopyTS,
201 #if QGEMM_DEFAULT_UNROLL_M != QGEMM_DEFAULT_UNROLL_N
202 qsymm_iutcopyTS, qsymm_iltcopyTS,
204 qsymm_outcopyTS, qsymm_oltcopyTS,
206 qsymm_outcopyTS, qsymm_oltcopyTS,
209 qneg_tcopyTS, qlaswp_ncopyTS,
/* ---- "c" group (presumably single-precision complex -- confirm).  The
 * complex groups additionally list hemv/hemm, extra gemv/ger variants and the
 * gemm3m/symm3m/hemm3m copy routines. ---- */
217 CGEMM_DEFAULT_UNROLL_M, CGEMM_DEFAULT_UNROLL_N, MAX(CGEMM_DEFAULT_UNROLL_M, CGEMM_DEFAULT_UNROLL_N),
219 camax_kTS, camin_kTS, icamax_kTS, icamin_kTS,
220 cnrm2_kTS, casum_kTS, ccopy_kTS,
221 cdotu_kTS, cdotc_kTS, csrot_kTS,
222 caxpy_kTS, caxpyc_kTS, cscal_kTS, cswap_kTS,
224 cgemv_nTS, cgemv_tTS, cgemv_rTS, cgemv_cTS,
225 cgemv_oTS, cgemv_uTS, cgemv_sTS, cgemv_dTS,
226 cgeru_kTS, cgerc_kTS, cgerv_kTS, cgerd_kTS,
227 csymv_LTS, csymv_UTS,
228 chemv_LTS, chemv_UTS, chemv_MTS, chemv_VTS,
230 cgemm_kernel_nTS, cgemm_kernel_lTS, cgemm_kernel_rTS, cgemm_kernel_bTS,
233 #if CGEMM_DEFAULT_UNROLL_M != CGEMM_DEFAULT_UNROLL_N
234 cgemm_incopyTS, cgemm_itcopyTS,
236 cgemm_oncopyTS, cgemm_otcopyTS,
238 cgemm_oncopyTS, cgemm_otcopyTS,
240 ctrsm_kernel_LNTS, ctrsm_kernel_LTTS, ctrsm_kernel_LRTS, ctrsm_kernel_LCTS,
241 ctrsm_kernel_RNTS, ctrsm_kernel_RTTS, ctrsm_kernel_RRTS, ctrsm_kernel_RCTS,
243 #if CGEMM_DEFAULT_UNROLL_M != CGEMM_DEFAULT_UNROLL_N
244 ctrsm_iunucopyTS, ctrsm_iunncopyTS, ctrsm_iutucopyTS, ctrsm_iutncopyTS,
245 ctrsm_ilnucopyTS, ctrsm_ilnncopyTS, ctrsm_iltucopyTS, ctrsm_iltncopyTS,
247 ctrsm_ounucopyTS, ctrsm_ounncopyTS, ctrsm_outucopyTS, ctrsm_outncopyTS,
248 ctrsm_olnucopyTS, ctrsm_olnncopyTS, ctrsm_oltucopyTS, ctrsm_oltncopyTS,
250 ctrsm_ounucopyTS, ctrsm_ounncopyTS, ctrsm_outucopyTS, ctrsm_outncopyTS,
251 ctrsm_olnucopyTS, ctrsm_olnncopyTS, ctrsm_oltucopyTS, ctrsm_oltncopyTS,
253 ctrmm_kernel_RNTS, ctrmm_kernel_RTTS, ctrmm_kernel_RRTS, ctrmm_kernel_RCTS,
254 ctrmm_kernel_LNTS, ctrmm_kernel_LTTS, ctrmm_kernel_LRTS, ctrmm_kernel_LCTS,
256 #if CGEMM_DEFAULT_UNROLL_M != CGEMM_DEFAULT_UNROLL_N
257 ctrmm_iunucopyTS, ctrmm_iunncopyTS, ctrmm_iutucopyTS, ctrmm_iutncopyTS,
258 ctrmm_ilnucopyTS, ctrmm_ilnncopyTS, ctrmm_iltucopyTS, ctrmm_iltncopyTS,
260 ctrmm_ounucopyTS, ctrmm_ounncopyTS, ctrmm_outucopyTS, ctrmm_outncopyTS,
261 ctrmm_olnucopyTS, ctrmm_olnncopyTS, ctrmm_oltucopyTS, ctrmm_oltncopyTS,
263 ctrmm_ounucopyTS, ctrmm_ounncopyTS, ctrmm_outucopyTS, ctrmm_outncopyTS,
264 ctrmm_olnucopyTS, ctrmm_olnncopyTS, ctrmm_oltucopyTS, ctrmm_oltncopyTS,
266 #if CGEMM_DEFAULT_UNROLL_M != CGEMM_DEFAULT_UNROLL_N
267 csymm_iutcopyTS, csymm_iltcopyTS,
269 csymm_outcopyTS, csymm_oltcopyTS,
271 csymm_outcopyTS, csymm_oltcopyTS,
272 #if CGEMM_DEFAULT_UNROLL_M != CGEMM_DEFAULT_UNROLL_N
273 chemm_iutcopyTS, chemm_iltcopyTS,
275 chemm_outcopyTS, chemm_oltcopyTS,
277 chemm_outcopyTS, chemm_oltcopyTS,
281 cgemm3m_incopybTS, cgemm3m_incopyrTS,
282 cgemm3m_incopyiTS, cgemm3m_itcopybTS,
283 cgemm3m_itcopyrTS, cgemm3m_itcopyiTS,
284 cgemm3m_oncopybTS, cgemm3m_oncopyrTS,
285 cgemm3m_oncopyiTS, cgemm3m_otcopybTS,
286 cgemm3m_otcopyrTS, cgemm3m_otcopyiTS,
288 csymm3m_iucopybTS, csymm3m_ilcopybTS,
289 csymm3m_iucopyrTS, csymm3m_ilcopyrTS,
290 csymm3m_iucopyiTS, csymm3m_ilcopyiTS,
291 csymm3m_oucopybTS, csymm3m_olcopybTS,
292 csymm3m_oucopyrTS, csymm3m_olcopyrTS,
293 csymm3m_oucopyiTS, csymm3m_olcopyiTS,
295 chemm3m_iucopybTS, chemm3m_ilcopybTS,
296 chemm3m_iucopyrTS, chemm3m_ilcopyrTS,
297 chemm3m_iucopyiTS, chemm3m_ilcopyiTS,
299 chemm3m_oucopybTS, chemm3m_olcopybTS,
300 chemm3m_oucopyrTS, chemm3m_olcopyrTS,
301 chemm3m_oucopyiTS, chemm3m_olcopyiTS,
304 cneg_tcopyTS, claswp_ncopyTS,
/* ---- "z" group (presumably double-precision complex -- confirm) ---- */
310 ZGEMM_DEFAULT_UNROLL_M, ZGEMM_DEFAULT_UNROLL_N, MAX(ZGEMM_DEFAULT_UNROLL_M, ZGEMM_DEFAULT_UNROLL_N),
312 zamax_kTS, zamin_kTS, izamax_kTS, izamin_kTS,
313 znrm2_kTS, zasum_kTS, zcopy_kTS,
314 zdotu_kTS, zdotc_kTS, zdrot_kTS,
315 zaxpy_kTS, zaxpyc_kTS, zscal_kTS, zswap_kTS,
317 zgemv_nTS, zgemv_tTS, zgemv_rTS, zgemv_cTS,
318 zgemv_oTS, zgemv_uTS, zgemv_sTS, zgemv_dTS,
319 zgeru_kTS, zgerc_kTS, zgerv_kTS, zgerd_kTS,
320 zsymv_LTS, zsymv_UTS,
321 zhemv_LTS, zhemv_UTS, zhemv_MTS, zhemv_VTS,
323 zgemm_kernel_nTS, zgemm_kernel_lTS, zgemm_kernel_rTS, zgemm_kernel_bTS,
326 #if ZGEMM_DEFAULT_UNROLL_M != ZGEMM_DEFAULT_UNROLL_N
327 zgemm_incopyTS, zgemm_itcopyTS,
329 zgemm_oncopyTS, zgemm_otcopyTS,
331 zgemm_oncopyTS, zgemm_otcopyTS,
333 ztrsm_kernel_LNTS, ztrsm_kernel_LTTS, ztrsm_kernel_LRTS, ztrsm_kernel_LCTS,
334 ztrsm_kernel_RNTS, ztrsm_kernel_RTTS, ztrsm_kernel_RRTS, ztrsm_kernel_RCTS,
336 #if ZGEMM_DEFAULT_UNROLL_M != ZGEMM_DEFAULT_UNROLL_N
337 ztrsm_iunucopyTS, ztrsm_iunncopyTS, ztrsm_iutucopyTS, ztrsm_iutncopyTS,
338 ztrsm_ilnucopyTS, ztrsm_ilnncopyTS, ztrsm_iltucopyTS, ztrsm_iltncopyTS,
340 ztrsm_ounucopyTS, ztrsm_ounncopyTS, ztrsm_outucopyTS, ztrsm_outncopyTS,
341 ztrsm_olnucopyTS, ztrsm_olnncopyTS, ztrsm_oltucopyTS, ztrsm_oltncopyTS,
343 ztrsm_ounucopyTS, ztrsm_ounncopyTS, ztrsm_outucopyTS, ztrsm_outncopyTS,
344 ztrsm_olnucopyTS, ztrsm_olnncopyTS, ztrsm_oltucopyTS, ztrsm_oltncopyTS,
346 ztrmm_kernel_RNTS, ztrmm_kernel_RTTS, ztrmm_kernel_RRTS, ztrmm_kernel_RCTS,
347 ztrmm_kernel_LNTS, ztrmm_kernel_LTTS, ztrmm_kernel_LRTS, ztrmm_kernel_LCTS,
349 #if ZGEMM_DEFAULT_UNROLL_M != ZGEMM_DEFAULT_UNROLL_N
350 ztrmm_iunucopyTS, ztrmm_iunncopyTS, ztrmm_iutucopyTS, ztrmm_iutncopyTS,
351 ztrmm_ilnucopyTS, ztrmm_ilnncopyTS, ztrmm_iltucopyTS, ztrmm_iltncopyTS,
353 ztrmm_ounucopyTS, ztrmm_ounncopyTS, ztrmm_outucopyTS, ztrmm_outncopyTS,
354 ztrmm_olnucopyTS, ztrmm_olnncopyTS, ztrmm_oltucopyTS, ztrmm_oltncopyTS,
356 ztrmm_ounucopyTS, ztrmm_ounncopyTS, ztrmm_outucopyTS, ztrmm_outncopyTS,
357 ztrmm_olnucopyTS, ztrmm_olnncopyTS, ztrmm_oltucopyTS, ztrmm_oltncopyTS,
359 #if ZGEMM_DEFAULT_UNROLL_M != ZGEMM_DEFAULT_UNROLL_N
360 zsymm_iutcopyTS, zsymm_iltcopyTS,
362 zsymm_outcopyTS, zsymm_oltcopyTS,
364 zsymm_outcopyTS, zsymm_oltcopyTS,
365 #if ZGEMM_DEFAULT_UNROLL_M != ZGEMM_DEFAULT_UNROLL_N
366 zhemm_iutcopyTS, zhemm_iltcopyTS,
368 zhemm_outcopyTS, zhemm_oltcopyTS,
370 zhemm_outcopyTS, zhemm_oltcopyTS,
374 zgemm3m_incopybTS, zgemm3m_incopyrTS,
375 zgemm3m_incopyiTS, zgemm3m_itcopybTS,
376 zgemm3m_itcopyrTS, zgemm3m_itcopyiTS,
377 zgemm3m_oncopybTS, zgemm3m_oncopyrTS,
378 zgemm3m_oncopyiTS, zgemm3m_otcopybTS,
379 zgemm3m_otcopyrTS, zgemm3m_otcopyiTS,
381 zsymm3m_iucopybTS, zsymm3m_ilcopybTS,
382 zsymm3m_iucopyrTS, zsymm3m_ilcopyrTS,
383 zsymm3m_iucopyiTS, zsymm3m_ilcopyiTS,
384 zsymm3m_oucopybTS, zsymm3m_olcopybTS,
385 zsymm3m_oucopyrTS, zsymm3m_olcopyrTS,
386 zsymm3m_oucopyiTS, zsymm3m_olcopyiTS,
388 zhemm3m_iucopybTS, zhemm3m_ilcopybTS,
389 zhemm3m_iucopyrTS, zhemm3m_ilcopyrTS,
390 zhemm3m_iucopyiTS, zhemm3m_ilcopyiTS,
392 zhemm3m_oucopybTS, zhemm3m_olcopybTS,
393 zhemm3m_oucopyrTS, zhemm3m_olcopyrTS,
394 zhemm3m_oucopyiTS, zhemm3m_olcopyiTS,
397 zneg_tcopyTS, zlaswp_ncopyTS,
/* ---- "x" group (presumably extended/quad-precision complex; likely guarded
 * by a preprocessor condition not visible here -- confirm) ---- */
405 XGEMM_DEFAULT_UNROLL_M, XGEMM_DEFAULT_UNROLL_N, MAX(XGEMM_DEFAULT_UNROLL_M, XGEMM_DEFAULT_UNROLL_N),
407 xamax_kTS, xamin_kTS, ixamax_kTS, ixamin_kTS,
408 xnrm2_kTS, xasum_kTS, xcopy_kTS,
409 xdotu_kTS, xdotc_kTS, xqrot_kTS,
410 xaxpy_kTS, xaxpyc_kTS, xscal_kTS, xswap_kTS,
412 xgemv_nTS, xgemv_tTS, xgemv_rTS, xgemv_cTS,
413 xgemv_oTS, xgemv_uTS, xgemv_sTS, xgemv_dTS,
414 xgeru_kTS, xgerc_kTS, xgerv_kTS, xgerd_kTS,
415 xsymv_LTS, xsymv_UTS,
416 xhemv_LTS, xhemv_UTS, xhemv_MTS, xhemv_VTS,
418 xgemm_kernel_nTS, xgemm_kernel_lTS, xgemm_kernel_rTS, xgemm_kernel_bTS,
421 #if XGEMM_DEFAULT_UNROLL_M != XGEMM_DEFAULT_UNROLL_N
422 xgemm_incopyTS, xgemm_itcopyTS,
424 xgemm_oncopyTS, xgemm_otcopyTS,
426 xgemm_oncopyTS, xgemm_otcopyTS,
428 xtrsm_kernel_LNTS, xtrsm_kernel_LTTS, xtrsm_kernel_LRTS, xtrsm_kernel_LCTS,
429 xtrsm_kernel_RNTS, xtrsm_kernel_RTTS, xtrsm_kernel_RRTS, xtrsm_kernel_RCTS,
431 #if XGEMM_DEFAULT_UNROLL_M != XGEMM_DEFAULT_UNROLL_N
432 xtrsm_iunucopyTS, xtrsm_iunncopyTS, xtrsm_iutucopyTS, xtrsm_iutncopyTS,
433 xtrsm_ilnucopyTS, xtrsm_ilnncopyTS, xtrsm_iltucopyTS, xtrsm_iltncopyTS,
435 xtrsm_ounucopyTS, xtrsm_ounncopyTS, xtrsm_outucopyTS, xtrsm_outncopyTS,
436 xtrsm_olnucopyTS, xtrsm_olnncopyTS, xtrsm_oltucopyTS, xtrsm_oltncopyTS,
438 xtrsm_ounucopyTS, xtrsm_ounncopyTS, xtrsm_outucopyTS, xtrsm_outncopyTS,
439 xtrsm_olnucopyTS, xtrsm_olnncopyTS, xtrsm_oltucopyTS, xtrsm_oltncopyTS,
441 xtrmm_kernel_RNTS, xtrmm_kernel_RTTS, xtrmm_kernel_RRTS, xtrmm_kernel_RCTS,
442 xtrmm_kernel_LNTS, xtrmm_kernel_LTTS, xtrmm_kernel_LRTS, xtrmm_kernel_LCTS,
444 #if XGEMM_DEFAULT_UNROLL_M != XGEMM_DEFAULT_UNROLL_N
445 xtrmm_iunucopyTS, xtrmm_iunncopyTS, xtrmm_iutucopyTS, xtrmm_iutncopyTS,
446 xtrmm_ilnucopyTS, xtrmm_ilnncopyTS, xtrmm_iltucopyTS, xtrmm_iltncopyTS,
448 xtrmm_ounucopyTS, xtrmm_ounncopyTS, xtrmm_outucopyTS, xtrmm_outncopyTS,
449 xtrmm_olnucopyTS, xtrmm_olnncopyTS, xtrmm_oltucopyTS, xtrmm_oltncopyTS,
451 xtrmm_ounucopyTS, xtrmm_ounncopyTS, xtrmm_outucopyTS, xtrmm_outncopyTS,
452 xtrmm_olnucopyTS, xtrmm_olnncopyTS, xtrmm_oltucopyTS, xtrmm_oltncopyTS,
454 #if XGEMM_DEFAULT_UNROLL_M != XGEMM_DEFAULT_UNROLL_N
455 xsymm_iutcopyTS, xsymm_iltcopyTS,
457 xsymm_outcopyTS, xsymm_oltcopyTS,
459 xsymm_outcopyTS, xsymm_oltcopyTS,
460 #if XGEMM_DEFAULT_UNROLL_M != XGEMM_DEFAULT_UNROLL_N
461 xhemm_iutcopyTS, xhemm_iltcopyTS,
463 xhemm_outcopyTS, xhemm_oltcopyTS,
465 xhemm_outcopyTS, xhemm_oltcopyTS,
469 xgemm3m_incopybTS, xgemm3m_incopyrTS,
470 xgemm3m_incopyiTS, xgemm3m_itcopybTS,
471 xgemm3m_itcopyrTS, xgemm3m_itcopyiTS,
472 xgemm3m_oncopybTS, xgemm3m_oncopyrTS,
473 xgemm3m_oncopyiTS, xgemm3m_otcopybTS,
474 xgemm3m_otcopyrTS, xgemm3m_otcopyiTS,
476 xsymm3m_iucopybTS, xsymm3m_ilcopybTS,
477 xsymm3m_iucopyrTS, xsymm3m_ilcopyrTS,
478 xsymm3m_iucopyiTS, xsymm3m_ilcopyiTS,
479 xsymm3m_oucopybTS, xsymm3m_olcopybTS,
480 xsymm3m_oucopyrTS, xsymm3m_olcopyrTS,
481 xsymm3m_oucopyiTS, xsymm3m_olcopyiTS,
483 xhemm3m_iucopybTS, xhemm3m_ilcopybTS,
484 xhemm3m_iucopyrTS, xhemm3m_ilcopyrTS,
485 xhemm3m_iucopyiTS, xhemm3m_ilcopyiTS,
487 xhemm3m_oucopybTS, xhemm3m_olcopybTS,
488 xhemm3m_oucopyrTS, xhemm3m_olcopyrTS,
489 xhemm3m_oucopyiTS, xhemm3m_olcopyiTS,
492 xneg_tcopyTS, xlaswp_ncopyTS,
/* Trailing per-precision NUMOPT constants for s, d and q. */
501 SNUMOPT, DNUMOPT, QNUMOPT,
/*
 * Legacy L2-size probe built on CPUID leaf 2; used by get_l2_size() as its
 * fallback.
 *
 * Calls cpuid(2, ...) and splits the four result registers into 15 one-byte
 * entries of info[] (assuming BITMASK(v, shift, mask) is (v >> shift) & mask):
 * three bytes from eax -- its lowest byte is not captured -- and four bytes
 * each from ebx, ecx and edx.  The loop then walks those 15 entries.
 *
 * NOTE(review): the declaration of info[], the loop body and the return
 * statement are not visible in this view; presumably each byte is looked up
 * in a cache-descriptor table to yield an L2 size -- confirm against the
 * complete file.  cpuid_level is declared but unused in the visible lines.
 */
506 static int get_l2_size_old(void){
507 int i, eax, ebx, ecx, edx, cpuid_level;
510 cpuid(2, &eax, &ebx, &ecx, &edx);
/* eax: bytes 1..3 only */
512 info[ 0] = BITMASK(eax, 8, 0xff);
513 info[ 1] = BITMASK(eax, 16, 0xff);
514 info[ 2] = BITMASK(eax, 24, 0xff);
/* ebx: bytes 0..3 */
516 info[ 3] = BITMASK(ebx, 0, 0xff);
517 info[ 4] = BITMASK(ebx, 8, 0xff);
518 info[ 5] = BITMASK(ebx, 16, 0xff);
519 info[ 6] = BITMASK(ebx, 24, 0xff);
/* ecx: bytes 0..3 */
521 info[ 7] = BITMASK(ecx, 0, 0xff);
522 info[ 8] = BITMASK(ecx, 8, 0xff);
523 info[ 9] = BITMASK(ecx, 16, 0xff);
524 info[10] = BITMASK(ecx, 24, 0xff);
/* edx: bytes 0..3 */
526 info[11] = BITMASK(edx, 0, 0xff);
527 info[12] = BITMASK(edx, 8, 0xff);
528 info[13] = BITMASK(edx, 16, 0xff);
529 info[14] = BITMASK(edx, 24, 0xff);
/* Examine each of the 15 extracted bytes in turn. */
531 for (i = 0; i < 15; i++){
535 /* This table is from http://www.sandpile.org/ia32/cpuid.htm */
/*
 * Reports the L2 cache size, preferring extended CPUID leaf 0x80000006.
 *
 * The candidate value is BITMASK(ecx, 16, 0xffff) of that leaf.  If it is
 * positive it is returned directly; otherwise the function falls back to
 * get_l2_size_old().
 *
 * NOTE(review): the unit is presumably KB (init_parameter scales it with
 * l2 >> 7 and l2 >> 9) -- confirm.  Some original lines between the
 * extraction and the test are not visible in this view.
 */
595 static __inline__ int get_l2_size(void){
597 int eax, ebx, ecx, edx, l2;
599 cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
601 l2 = BITMASK(ecx, 16, 0xffff);
/* A non-positive field means this leaf gave no usable size. */
608 if (l2 > 0) return l2;
610 return get_l2_size_old();
/*
 * Reports the L3 cache size from extended CPUID leaf 0x80000006.
 *
 * Returns BITMASK(edx, 18, 0x3fff) multiplied by 512, with no fallback when
 * the field is zero.
 *
 * NOTE(review): presumably the field counts 512 KB units so the result is in
 * KB -- confirm against the vendor CPUID documentation.
 */
614 static __inline__ int get_l3_size(void){
616 int eax, ebx, ecx, edx;
618 cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
620 return BITMASK(edx, 18, 0x3fff) * 512;
/*
 * Fills in the GEMM blocking parameters of TABLE_NAME.
 *
 * Steps, as visible here:
 *   1. Query the L2 size via get_l2_size().
 *   2. Set every *gemm_q to its compile-time default.
 *   3. Set every *gemm_p in a per-core section, either scaled from l2 or taken
 *      from the *_DEFAULT_P constants.
 *   4. Round each *gemm_p up to a multiple of the matching UNROLL_M.
 *   5. Derive each *gemm_r from BUFFER_SIZE, the p*q panel size, offsetA and
 *      align.
 *
 * NOTE(review): only the first two per-core preprocessor guards are visible
 * in this view.  The remaining sections are presumably each wrapped in their
 * own #ifdef CORE_* (and the q/x and fprintf lines in further guards), so the
 * p assignments below are alternatives, not a sequence -- confirm against the
 * complete file.  The function's closing brace is also not visible.
 */
624 static void init_parameter(void) {
626 int l2 = get_l2_size();
/* q blocking factors always come from the compile-time defaults. */
628 TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q;
629 TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q;
630 TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q;
631 TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q;
633 TABLE_NAME.qgemm_q = QGEMM_DEFAULT_Q;
634 TABLE_NAME.xgemm_q = XGEMM_DEFAULT_Q;
/* Katmai / Coppermine / Banias / Yonah / Athlon: p scales with l2 >> 7.
 * NOTE(review): the debug message below omits Yonah although the guard
 * includes CORE_YONAH. */
637 #if defined(CORE_KATMAI) || defined(CORE_COPPERMINE) || defined(CORE_BANIAS) || defined(CORE_YONAH) || defined(CORE_ATHLON)
640 fprintf(stderr, "Katmai, Coppermine, Banias, Athlon\n");
643 TABLE_NAME.sgemm_p = 64 * (l2 >> 7);
644 TABLE_NAME.dgemm_p = 32 * (l2 >> 7);
645 TABLE_NAME.cgemm_p = 32 * (l2 >> 7);
646 TABLE_NAME.zgemm_p = 16 * (l2 >> 7);
648 TABLE_NAME.qgemm_p = 16 * (l2 >> 7);
649 TABLE_NAME.xgemm_p = 8 * (l2 >> 7);
/* Northwood: p scales with l2 >> 7. */
653 #ifdef CORE_NORTHWOOD
656 fprintf(stderr, "Northwood\n");
659 TABLE_NAME.sgemm_p = 96 * (l2 >> 7);
660 TABLE_NAME.dgemm_p = 48 * (l2 >> 7);
661 TABLE_NAME.cgemm_p = 48 * (l2 >> 7);
662 TABLE_NAME.zgemm_p = 24 * (l2 >> 7);
664 TABLE_NAME.qgemm_p = 24 * (l2 >> 7);
665 TABLE_NAME.xgemm_p = 12 * (l2 >> 7);
/* Atom: fixed p values, independent of l2. */
672 fprintf(stderr, "Atom\n");
675 TABLE_NAME.sgemm_p = 256;
676 TABLE_NAME.dgemm_p = 128;
677 TABLE_NAME.cgemm_p = 128;
678 TABLE_NAME.zgemm_p = 64;
680 TABLE_NAME.qgemm_p = 64;
681 TABLE_NAME.xgemm_p = 32;
/* Prescott: p scales with l2 >> 7. */
688 fprintf(stderr, "Prescott\n");
691 TABLE_NAME.sgemm_p = 56 * (l2 >> 7);
692 TABLE_NAME.dgemm_p = 28 * (l2 >> 7);
693 TABLE_NAME.cgemm_p = 28 * (l2 >> 7);
694 TABLE_NAME.zgemm_p = 14 * (l2 >> 7);
696 TABLE_NAME.qgemm_p = 14 * (l2 >> 7);
697 TABLE_NAME.xgemm_p = 7 * (l2 >> 7);
/* Core2: p scales with l2 >> 9.
 * NOTE(review): the q/x multipliers (92, 46) equal the s/d ones here, unlike
 * the sections above where they continue halving -- confirm this is intended. */
704 fprintf(stderr, "Core2\n");
707 TABLE_NAME.sgemm_p = 92 * (l2 >> 9);
708 TABLE_NAME.dgemm_p = 46 * (l2 >> 9);
709 TABLE_NAME.cgemm_p = 46 * (l2 >> 9);
710 TABLE_NAME.zgemm_p = 23 * (l2 >> 9);
712 TABLE_NAME.qgemm_p = 92 * (l2 >> 9);
713 TABLE_NAME.xgemm_p = 46 * (l2 >> 9);
/* Penryn: p scales with l2 >> 9 plus a small constant. */
720 fprintf(stderr, "Penryn\n");
723 TABLE_NAME.sgemm_p = 42 * (l2 >> 9) + 8;
724 TABLE_NAME.dgemm_p = 42 * (l2 >> 9) + 8;
725 TABLE_NAME.cgemm_p = 21 * (l2 >> 9) + 4;
726 TABLE_NAME.zgemm_p = 21 * (l2 >> 9) + 4;
728 TABLE_NAME.qgemm_p = 42 * (l2 >> 9) + 8;
729 TABLE_NAME.xgemm_p = 21 * (l2 >> 9) + 4;
/* Nehalem: compile-time default p values. */
736 fprintf(stderr, "Nehalem\n");
739 TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
740 TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
741 TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
742 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
744 TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
745 TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
/* Sandybridge: compile-time default p values. */
752 fprintf(stderr, "Sandybridge\n");
755 TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
756 TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
757 TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
758 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
760 TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
761 TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
/* Opteron: fixed base plus a term scaling with l2 >> 7. */
768 fprintf(stderr, "Opteron\n");
771 TABLE_NAME.sgemm_p = 224 + 56 * (l2 >> 7);
772 TABLE_NAME.dgemm_p = 112 + 28 * (l2 >> 7);
773 TABLE_NAME.cgemm_p = 112 + 28 * (l2 >> 7);
774 TABLE_NAME.zgemm_p = 56 + 14 * (l2 >> 7);
776 TABLE_NAME.qgemm_p = 56 + 14 * (l2 >> 7);
777 TABLE_NAME.xgemm_p = 28 + 7 * (l2 >> 7);
/* Barcelona: compile-time default p values. */
784 fprintf(stderr, "Barcelona\n");
787 TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
788 TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
789 TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
790 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
792 TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
793 TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
/* Bobcat: compile-time default p values.
 * NOTE(review): "Bobcate" in the debug string is presumably a typo for
 * "Bobcat"; left untouched here because it is runtime output. */
800 fprintf(stderr, "Bobcate\n");
803 TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
804 TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
805 TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
806 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
808 TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
809 TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
/* Bulldozer: compile-time default p values. */
816 fprintf(stderr, "Bulldozer\n");
819 TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
820 TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
821 TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
822 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
824 TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
825 TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
/* NANO: compile-time default p values. */
832 fprintf(stderr, "NANO\n");
835 TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
836 TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
837 TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
838 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
840 TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
841 TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
/* Round each p up to the next multiple of its UNROLL_M.  The
 * (x + U - 1) & ~(U - 1) idiom is only a correct round-up when U is a power
 * of two. */
846 TABLE_NAME.sgemm_p = (TABLE_NAME.sgemm_p + SGEMM_DEFAULT_UNROLL_M - 1) & ~(SGEMM_DEFAULT_UNROLL_M - 1);
847 TABLE_NAME.dgemm_p = (TABLE_NAME.dgemm_p + DGEMM_DEFAULT_UNROLL_M - 1) & ~(DGEMM_DEFAULT_UNROLL_M - 1);
848 TABLE_NAME.cgemm_p = (TABLE_NAME.cgemm_p + CGEMM_DEFAULT_UNROLL_M - 1) & ~(CGEMM_DEFAULT_UNROLL_M - 1);
849 TABLE_NAME.zgemm_p = (TABLE_NAME.zgemm_p + ZGEMM_DEFAULT_UNROLL_M - 1) & ~(ZGEMM_DEFAULT_UNROLL_M - 1);
850 #ifdef QUAD_PRECISION
851 TABLE_NAME.qgemm_p = (TABLE_NAME.qgemm_p + QGEMM_DEFAULT_UNROLL_M - 1) & ~(QGEMM_DEFAULT_UNROLL_M - 1);
852 TABLE_NAME.xgemm_p = (TABLE_NAME.xgemm_p + XGEMM_DEFAULT_UNROLL_M - 1) & ~(XGEMM_DEFAULT_UNROLL_M - 1);
856 fprintf(stderr, "L2 = %8d DGEMM_P .. %d\n", l2, TABLE_NAME.dgemm_p);
/* r = ((BUFFER_SIZE - aligned(p * q * elem + offsetA)) / (q * elem) - 15) & ~15.
 * The p*q panel footprint (padded with offsetA and masked by ~align) is
 * subtracted from BUFFER_SIZE, the remainder is divided by the byte width of
 * one q-long row, and the result is reduced by 15 and masked to a multiple
 * of 16.  The literals 4, 8, 16, 8, 16, 32 are presumably the element sizes
 * in bytes of the s, d, q, c, z and x types -- confirm. */
859 TABLE_NAME.sgemm_r = (((BUFFER_SIZE -
860 ((TABLE_NAME.sgemm_p * TABLE_NAME.sgemm_q * 4 + TABLE_NAME.offsetA
861 + TABLE_NAME.align) & ~TABLE_NAME.align)
862 ) / (TABLE_NAME.sgemm_q * 4) - 15) & ~15);
864 TABLE_NAME.dgemm_r = (((BUFFER_SIZE -
865 ((TABLE_NAME.dgemm_p * TABLE_NAME.dgemm_q * 8 + TABLE_NAME.offsetA
866 + TABLE_NAME.align) & ~TABLE_NAME.align)
867 ) / (TABLE_NAME.dgemm_q * 8) - 15) & ~15);
870 TABLE_NAME.qgemm_r = (((BUFFER_SIZE -
871 ((TABLE_NAME.qgemm_p * TABLE_NAME.qgemm_q * 16 + TABLE_NAME.offsetA
872 + TABLE_NAME.align) & ~TABLE_NAME.align)
873 ) / (TABLE_NAME.qgemm_q * 16) - 15) & ~15);
876 TABLE_NAME.cgemm_r = (((BUFFER_SIZE -
877 ((TABLE_NAME.cgemm_p * TABLE_NAME.cgemm_q * 8 + TABLE_NAME.offsetA
878 + TABLE_NAME.align) & ~TABLE_NAME.align)
879 ) / (TABLE_NAME.cgemm_q * 8) - 15) & ~15);
881 TABLE_NAME.zgemm_r = (((BUFFER_SIZE -
882 ((TABLE_NAME.zgemm_p * TABLE_NAME.zgemm_q * 16 + TABLE_NAME.offsetA
883 + TABLE_NAME.align) & ~TABLE_NAME.align)
884 ) / (TABLE_NAME.zgemm_q * 16) - 15) & ~15);
887 TABLE_NAME.xgemm_r = (((BUFFER_SIZE -
888 ((TABLE_NAME.xgemm_p * TABLE_NAME.xgemm_q * 32 + TABLE_NAME.offsetA
889 + TABLE_NAME.align) & ~TABLE_NAME.align)
890 ) / (TABLE_NAME.xgemm_q * 32) - 15) & ~15);