1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin. */
3 /* All rights reserved. */
5 /* Redistribution and use in source and binary forms, with or */
6 /* without modification, are permitted provided that the following */
7 /* conditions are met: */
9 /* 1. Redistributions of source code must retain the above */
10 /* copyright notice, this list of conditions and the following disclaimer. */
13 /* 2. Redistributions in binary form must reproduce the above */
14 /* copyright notice, this list of conditions and the following */
15 /* disclaimer in the documentation and/or other materials */
16 /* provided with the distribution. */
18 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31 /* POSSIBILITY OF SUCH DAMAGE. */
33 /* The views and conclusions contained in the software and */
34 /* documentation are those of the authors and should not be */
35 /* interpreted as representing official policies, either expressed */
36 /* or implied, of The University of Texas at Austin. */
37 /*********************************************************************/
/* Forward declaration of the file-local initializer; its definitions appear
   further down in this file, selected by preprocessor conditionals
   (ARCH_ARM64, ARCH_POWER, and a third cpuid-based variant). */
49 static void init_parameter(void);
/*
 * Initializer for the gotoblas_t object whose name is supplied by the
 * TABLE_NAME macro.  No designated initializers are used, so every entry is
 * positional: its meaning is fixed by the member order of gotoblas_t, which
 * is declared elsewhere.  Keep the two in step when editing this list.
 * NOTE(review): the "TS" suffix on the kernel identifiers is presumably
 * rewritten to a target-specific suffix by the build -- confirm.
 */
51 gotoblas_t TABLE_NAME = {
54 GEMM_DEFAULT_OFFSET_A, GEMM_DEFAULT_OFFSET_B, GEMM_DEFAULT_ALIGN,
/* ---- s-prefixed entries (SGEMM_* constants, s* kernels) ---- */
57 SGEMM_DEFAULT_UNROLL_M, SGEMM_DEFAULT_UNROLL_N,
/* NOTE(review): the explicit *_UNROLL_MN constant and the MAX(M, N) entry
   look like alternatives for the same slot -- confirm the #else/#endif pairing. */
58 #ifdef SGEMM_DEFAULT_UNROLL_MN
59 SGEMM_DEFAULT_UNROLL_MN,
61 MAX(SGEMM_DEFAULT_UNROLL_M, SGEMM_DEFAULT_UNROLL_N),
65 #ifdef HAVE_EXCLUSIVE_CACHE
71 samax_kTS, samin_kTS, smax_kTS, smin_kTS,
72 isamax_kTS, isamin_kTS, ismax_kTS, ismin_kTS,
73 snrm2_kTS, sasum_kTS, ssum_kTS, scopy_kTS, sdot_kTS,
75 srot_kTS, saxpy_kTS, sscal_kTS, sswap_kTS,
76 sgemv_nTS, sgemv_tTS, sger_kTS,
79 sgemm_kernelTS, sgemm_betaTS,
/* The i*copy names are only compiled when the M and N unroll constants
   differ.  NOTE(review): the o*copy names that follow presumably fill the
   same slots otherwise -- confirm the #else/#endif pairing.  The same
   pattern repeats for the trsm/trmm/symm entries and for the d/q/c/z/x
   groups further down. */
80 #if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N
81 sgemm_incopyTS, sgemm_itcopyTS,
83 sgemm_oncopyTS, sgemm_otcopyTS,
85 sgemm_oncopyTS, sgemm_otcopyTS,
86 strsm_kernel_LNTS, strsm_kernel_LTTS, strsm_kernel_RNTS, strsm_kernel_RTTS,
87 #if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N
88 strsm_iunucopyTS, strsm_iunncopyTS, strsm_iutucopyTS, strsm_iutncopyTS,
89 strsm_ilnucopyTS, strsm_ilnncopyTS, strsm_iltucopyTS, strsm_iltncopyTS,
91 strsm_ounucopyTS, strsm_ounncopyTS, strsm_outucopyTS, strsm_outncopyTS,
92 strsm_olnucopyTS, strsm_olnncopyTS, strsm_oltucopyTS, strsm_oltncopyTS,
94 strsm_ounucopyTS, strsm_ounncopyTS, strsm_outucopyTS, strsm_outncopyTS,
95 strsm_olnucopyTS, strsm_olnncopyTS, strsm_oltucopyTS, strsm_oltncopyTS,
96 strmm_kernel_RNTS, strmm_kernel_RTTS, strmm_kernel_LNTS, strmm_kernel_LTTS,
97 #if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N
98 strmm_iunucopyTS, strmm_iunncopyTS, strmm_iutucopyTS, strmm_iutncopyTS,
99 strmm_ilnucopyTS, strmm_ilnncopyTS, strmm_iltucopyTS, strmm_iltncopyTS,
101 strmm_ounucopyTS, strmm_ounncopyTS, strmm_outucopyTS, strmm_outncopyTS,
102 strmm_olnucopyTS, strmm_olnncopyTS, strmm_oltucopyTS, strmm_oltncopyTS,
104 strmm_ounucopyTS, strmm_ounncopyTS, strmm_outucopyTS, strmm_outncopyTS,
105 strmm_olnucopyTS, strmm_olnncopyTS, strmm_oltucopyTS, strmm_oltncopyTS,
106 #if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N
107 ssymm_iutcopyTS, ssymm_iltcopyTS,
109 ssymm_outcopyTS, ssymm_oltcopyTS,
111 ssymm_outcopyTS, ssymm_oltcopyTS,
114 sneg_tcopyTS, slaswp_ncopyTS,
/* ---- d-prefixed entries (DGEMM_* constants, d* kernels) ---- */
120 DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N,
121 #ifdef DGEMM_DEFAULT_UNROLL_MN
122 DGEMM_DEFAULT_UNROLL_MN,
124 MAX(DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N),
127 damax_kTS, damin_kTS, dmax_kTS, dmin_kTS,
128 idamax_kTS, idamin_kTS, idmax_kTS, idmin_kTS,
129 dnrm2_kTS, dasum_kTS, dsum_kTS, dcopy_kTS, ddot_kTS,
130 drot_kTS, daxpy_kTS, dscal_kTS, dswap_kTS,
131 dgemv_nTS, dgemv_tTS, dger_kTS,
132 dsymv_LTS, dsymv_UTS,
134 dgemm_kernelTS, dgemm_betaTS,
135 #if DGEMM_DEFAULT_UNROLL_M != DGEMM_DEFAULT_UNROLL_N
136 dgemm_incopyTS, dgemm_itcopyTS,
138 dgemm_oncopyTS, dgemm_otcopyTS,
140 dgemm_oncopyTS, dgemm_otcopyTS,
141 dtrsm_kernel_LNTS, dtrsm_kernel_LTTS, dtrsm_kernel_RNTS, dtrsm_kernel_RTTS,
142 #if DGEMM_DEFAULT_UNROLL_M != DGEMM_DEFAULT_UNROLL_N
143 dtrsm_iunucopyTS, dtrsm_iunncopyTS, dtrsm_iutucopyTS, dtrsm_iutncopyTS,
144 dtrsm_ilnucopyTS, dtrsm_ilnncopyTS, dtrsm_iltucopyTS, dtrsm_iltncopyTS,
146 dtrsm_ounucopyTS, dtrsm_ounncopyTS, dtrsm_outucopyTS, dtrsm_outncopyTS,
147 dtrsm_olnucopyTS, dtrsm_olnncopyTS, dtrsm_oltucopyTS, dtrsm_oltncopyTS,
149 dtrsm_ounucopyTS, dtrsm_ounncopyTS, dtrsm_outucopyTS, dtrsm_outncopyTS,
150 dtrsm_olnucopyTS, dtrsm_olnncopyTS, dtrsm_oltucopyTS, dtrsm_oltncopyTS,
151 dtrmm_kernel_RNTS, dtrmm_kernel_RTTS, dtrmm_kernel_LNTS, dtrmm_kernel_LTTS,
152 #if DGEMM_DEFAULT_UNROLL_M != DGEMM_DEFAULT_UNROLL_N
153 dtrmm_iunucopyTS, dtrmm_iunncopyTS, dtrmm_iutucopyTS, dtrmm_iutncopyTS,
154 dtrmm_ilnucopyTS, dtrmm_ilnncopyTS, dtrmm_iltucopyTS, dtrmm_iltncopyTS,
156 dtrmm_ounucopyTS, dtrmm_ounncopyTS, dtrmm_outucopyTS, dtrmm_outncopyTS,
157 dtrmm_olnucopyTS, dtrmm_olnncopyTS, dtrmm_oltucopyTS, dtrmm_oltncopyTS,
159 dtrmm_ounucopyTS, dtrmm_ounncopyTS, dtrmm_outucopyTS, dtrmm_outncopyTS,
160 dtrmm_olnucopyTS, dtrmm_olnncopyTS, dtrmm_oltucopyTS, dtrmm_oltncopyTS,
161 #if DGEMM_DEFAULT_UNROLL_M != DGEMM_DEFAULT_UNROLL_N
162 dsymm_iutcopyTS, dsymm_iltcopyTS,
164 dsymm_outcopyTS, dsymm_oltcopyTS,
166 dsymm_outcopyTS, dsymm_oltcopyTS,
169 dneg_tcopyTS, dlaswp_ncopyTS,
/* ---- q-prefixed entries (QGEMM_* constants, q* kernels) ---- */
177 QGEMM_DEFAULT_UNROLL_M, QGEMM_DEFAULT_UNROLL_N, MAX(QGEMM_DEFAULT_UNROLL_M, QGEMM_DEFAULT_UNROLL_N),
179 qamax_kTS, qamin_kTS, qmax_kTS, qmin_kTS,
180 iqamax_kTS, iqamin_kTS, iqmax_kTS, iqmin_kTS,
181 qnrm2_kTS, qasum_kTS, qsum_kTS, qcopy_kTS, qdot_kTS,
182 qrot_kTS, qaxpy_kTS, qscal_kTS, qswap_kTS,
183 qgemv_nTS, qgemv_tTS, qger_kTS,
184 qsymv_LTS, qsymv_UTS,
186 qgemm_kernelTS, qgemm_betaTS,
187 #if QGEMM_DEFAULT_UNROLL_M != QGEMM_DEFAULT_UNROLL_N
188 qgemm_incopyTS, qgemm_itcopyTS,
190 qgemm_oncopyTS, qgemm_otcopyTS,
192 qgemm_oncopyTS, qgemm_otcopyTS,
193 qtrsm_kernel_LNTS, qtrsm_kernel_LTTS, qtrsm_kernel_RNTS, qtrsm_kernel_RTTS,
194 #if QGEMM_DEFAULT_UNROLL_M != QGEMM_DEFAULT_UNROLL_N
195 qtrsm_iunucopyTS, qtrsm_iunncopyTS, qtrsm_iutucopyTS, qtrsm_iutncopyTS,
196 qtrsm_ilnucopyTS, qtrsm_ilnncopyTS, qtrsm_iltucopyTS, qtrsm_iltncopyTS,
198 qtrsm_ounucopyTS, qtrsm_ounncopyTS, qtrsm_outucopyTS, qtrsm_outncopyTS,
199 qtrsm_olnucopyTS, qtrsm_olnncopyTS, qtrsm_oltucopyTS, qtrsm_oltncopyTS,
201 qtrsm_ounucopyTS, qtrsm_ounncopyTS, qtrsm_outucopyTS, qtrsm_outncopyTS,
202 qtrsm_olnucopyTS, qtrsm_olnncopyTS, qtrsm_oltucopyTS, qtrsm_oltncopyTS,
203 qtrmm_kernel_RNTS, qtrmm_kernel_RTTS, qtrmm_kernel_LNTS, qtrmm_kernel_LTTS,
204 #if QGEMM_DEFAULT_UNROLL_M != QGEMM_DEFAULT_UNROLL_N
205 qtrmm_iunucopyTS, qtrmm_iunncopyTS, qtrmm_iutucopyTS, qtrmm_iutncopyTS,
206 qtrmm_ilnucopyTS, qtrmm_ilnncopyTS, qtrmm_iltucopyTS, qtrmm_iltncopyTS,
208 qtrmm_ounucopyTS, qtrmm_ounncopyTS, qtrmm_outucopyTS, qtrmm_outncopyTS,
209 qtrmm_olnucopyTS, qtrmm_olnncopyTS, qtrmm_oltucopyTS, qtrmm_oltncopyTS,
211 qtrmm_ounucopyTS, qtrmm_ounncopyTS, qtrmm_outucopyTS, qtrmm_outncopyTS,
212 qtrmm_olnucopyTS, qtrmm_olnncopyTS, qtrmm_oltucopyTS, qtrmm_oltncopyTS,
213 #if QGEMM_DEFAULT_UNROLL_M != QGEMM_DEFAULT_UNROLL_N
214 qsymm_iutcopyTS, qsymm_iltcopyTS,
216 qsymm_outcopyTS, qsymm_oltcopyTS,
218 qsymm_outcopyTS, qsymm_oltcopyTS,
221 qneg_tcopyTS, qlaswp_ncopyTS,
/* ---- c-prefixed entries (CGEMM_* constants, c* kernels) ---- */
229 CGEMM_DEFAULT_UNROLL_M, CGEMM_DEFAULT_UNROLL_N,
230 #ifdef CGEMM_DEFAULT_UNROLL_MN
231 CGEMM_DEFAULT_UNROLL_MN,
233 MAX(CGEMM_DEFAULT_UNROLL_M, CGEMM_DEFAULT_UNROLL_N),
236 camax_kTS, camin_kTS, icamax_kTS, icamin_kTS,
237 cnrm2_kTS, casum_kTS, csum_kTS, ccopy_kTS,
238 cdotu_kTS, cdotc_kTS, csrot_kTS,
239 caxpy_kTS, caxpyc_kTS, cscal_kTS, cswap_kTS,
241 cgemv_nTS, cgemv_tTS, cgemv_rTS, cgemv_cTS,
242 cgemv_oTS, cgemv_uTS, cgemv_sTS, cgemv_dTS,
243 cgeru_kTS, cgerc_kTS, cgerv_kTS, cgerd_kTS,
244 csymv_LTS, csymv_UTS,
245 chemv_LTS, chemv_UTS, chemv_MTS, chemv_VTS,
247 cgemm_kernel_nTS, cgemm_kernel_lTS, cgemm_kernel_rTS, cgemm_kernel_bTS,
250 #if CGEMM_DEFAULT_UNROLL_M != CGEMM_DEFAULT_UNROLL_N
251 cgemm_incopyTS, cgemm_itcopyTS,
253 cgemm_oncopyTS, cgemm_otcopyTS,
255 cgemm_oncopyTS, cgemm_otcopyTS,
257 ctrsm_kernel_LNTS, ctrsm_kernel_LTTS, ctrsm_kernel_LRTS, ctrsm_kernel_LCTS,
258 ctrsm_kernel_RNTS, ctrsm_kernel_RTTS, ctrsm_kernel_RRTS, ctrsm_kernel_RCTS,
260 #if CGEMM_DEFAULT_UNROLL_M != CGEMM_DEFAULT_UNROLL_N
261 ctrsm_iunucopyTS, ctrsm_iunncopyTS, ctrsm_iutucopyTS, ctrsm_iutncopyTS,
262 ctrsm_ilnucopyTS, ctrsm_ilnncopyTS, ctrsm_iltucopyTS, ctrsm_iltncopyTS,
264 ctrsm_ounucopyTS, ctrsm_ounncopyTS, ctrsm_outucopyTS, ctrsm_outncopyTS,
265 ctrsm_olnucopyTS, ctrsm_olnncopyTS, ctrsm_oltucopyTS, ctrsm_oltncopyTS,
267 ctrsm_ounucopyTS, ctrsm_ounncopyTS, ctrsm_outucopyTS, ctrsm_outncopyTS,
268 ctrsm_olnucopyTS, ctrsm_olnncopyTS, ctrsm_oltucopyTS, ctrsm_oltncopyTS,
270 ctrmm_kernel_RNTS, ctrmm_kernel_RTTS, ctrmm_kernel_RRTS, ctrmm_kernel_RCTS,
271 ctrmm_kernel_LNTS, ctrmm_kernel_LTTS, ctrmm_kernel_LRTS, ctrmm_kernel_LCTS,
273 #if CGEMM_DEFAULT_UNROLL_M != CGEMM_DEFAULT_UNROLL_N
274 ctrmm_iunucopyTS, ctrmm_iunncopyTS, ctrmm_iutucopyTS, ctrmm_iutncopyTS,
275 ctrmm_ilnucopyTS, ctrmm_ilnncopyTS, ctrmm_iltucopyTS, ctrmm_iltncopyTS,
277 ctrmm_ounucopyTS, ctrmm_ounncopyTS, ctrmm_outucopyTS, ctrmm_outncopyTS,
278 ctrmm_olnucopyTS, ctrmm_olnncopyTS, ctrmm_oltucopyTS, ctrmm_oltncopyTS,
280 ctrmm_ounucopyTS, ctrmm_ounncopyTS, ctrmm_outucopyTS, ctrmm_outncopyTS,
281 ctrmm_olnucopyTS, ctrmm_olnncopyTS, ctrmm_oltucopyTS, ctrmm_oltncopyTS,
283 #if CGEMM_DEFAULT_UNROLL_M != CGEMM_DEFAULT_UNROLL_N
284 csymm_iutcopyTS, csymm_iltcopyTS,
286 csymm_outcopyTS, csymm_oltcopyTS,
288 csymm_outcopyTS, csymm_oltcopyTS,
289 #if CGEMM_DEFAULT_UNROLL_M != CGEMM_DEFAULT_UNROLL_N
290 chemm_iutcopyTS, chemm_iltcopyTS,
292 chemm_outcopyTS, chemm_oltcopyTS,
294 chemm_outcopyTS, chemm_oltcopyTS,
/* cgemm3m entries, compiled only when USE_GEMM3M is defined.
   NOTE(review): the SGEMM_* unroll line presumably stands in when
   CGEMM3M_DEFAULT_UNROLL_M is not defined -- confirm the #else pairing. */
298 #if defined(USE_GEMM3M)
299 #ifdef CGEMM3M_DEFAULT_UNROLL_M
300 CGEMM3M_DEFAULT_UNROLL_M, CGEMM3M_DEFAULT_UNROLL_N, MAX(CGEMM3M_DEFAULT_UNROLL_M, CGEMM3M_DEFAULT_UNROLL_N),
302 SGEMM_DEFAULT_UNROLL_M, SGEMM_DEFAULT_UNROLL_N, MAX(SGEMM_DEFAULT_UNROLL_M, SGEMM_DEFAULT_UNROLL_N),
308 cgemm3m_incopybTS, cgemm3m_incopyrTS,
309 cgemm3m_incopyiTS, cgemm3m_itcopybTS,
310 cgemm3m_itcopyrTS, cgemm3m_itcopyiTS,
311 cgemm3m_oncopybTS, cgemm3m_oncopyrTS,
312 cgemm3m_oncopyiTS, cgemm3m_otcopybTS,
313 cgemm3m_otcopyrTS, cgemm3m_otcopyiTS,
315 csymm3m_iucopybTS, csymm3m_ilcopybTS,
316 csymm3m_iucopyrTS, csymm3m_ilcopyrTS,
317 csymm3m_iucopyiTS, csymm3m_ilcopyiTS,
318 csymm3m_oucopybTS, csymm3m_olcopybTS,
319 csymm3m_oucopyrTS, csymm3m_olcopyrTS,
320 csymm3m_oucopyiTS, csymm3m_olcopyiTS,
322 chemm3m_iucopybTS, chemm3m_ilcopybTS,
323 chemm3m_iucopyrTS, chemm3m_ilcopyrTS,
324 chemm3m_iucopyiTS, chemm3m_ilcopyiTS,
326 chemm3m_oucopybTS, chemm3m_olcopybTS,
327 chemm3m_oucopyrTS, chemm3m_olcopyrTS,
328 chemm3m_oucopyiTS, chemm3m_olcopyiTS,
358 cneg_tcopyTS, claswp_ncopyTS,
/* ---- z-prefixed entries (ZGEMM_* constants, z* kernels) ---- */
364 ZGEMM_DEFAULT_UNROLL_M, ZGEMM_DEFAULT_UNROLL_N,
365 #ifdef ZGEMM_DEFAULT_UNROLL_MN
366 ZGEMM_DEFAULT_UNROLL_MN,
368 MAX(ZGEMM_DEFAULT_UNROLL_M, ZGEMM_DEFAULT_UNROLL_N),
371 zamax_kTS, zamin_kTS, izamax_kTS, izamin_kTS,
372 znrm2_kTS, zasum_kTS, zsum_kTS, zcopy_kTS,
373 zdotu_kTS, zdotc_kTS, zdrot_kTS,
374 zaxpy_kTS, zaxpyc_kTS, zscal_kTS, zswap_kTS,
376 zgemv_nTS, zgemv_tTS, zgemv_rTS, zgemv_cTS,
377 zgemv_oTS, zgemv_uTS, zgemv_sTS, zgemv_dTS,
378 zgeru_kTS, zgerc_kTS, zgerv_kTS, zgerd_kTS,
379 zsymv_LTS, zsymv_UTS,
380 zhemv_LTS, zhemv_UTS, zhemv_MTS, zhemv_VTS,
382 zgemm_kernel_nTS, zgemm_kernel_lTS, zgemm_kernel_rTS, zgemm_kernel_bTS,
385 #if ZGEMM_DEFAULT_UNROLL_M != ZGEMM_DEFAULT_UNROLL_N
386 zgemm_incopyTS, zgemm_itcopyTS,
388 zgemm_oncopyTS, zgemm_otcopyTS,
390 zgemm_oncopyTS, zgemm_otcopyTS,
392 ztrsm_kernel_LNTS, ztrsm_kernel_LTTS, ztrsm_kernel_LRTS, ztrsm_kernel_LCTS,
393 ztrsm_kernel_RNTS, ztrsm_kernel_RTTS, ztrsm_kernel_RRTS, ztrsm_kernel_RCTS,
395 #if ZGEMM_DEFAULT_UNROLL_M != ZGEMM_DEFAULT_UNROLL_N
396 ztrsm_iunucopyTS, ztrsm_iunncopyTS, ztrsm_iutucopyTS, ztrsm_iutncopyTS,
397 ztrsm_ilnucopyTS, ztrsm_ilnncopyTS, ztrsm_iltucopyTS, ztrsm_iltncopyTS,
399 ztrsm_ounucopyTS, ztrsm_ounncopyTS, ztrsm_outucopyTS, ztrsm_outncopyTS,
400 ztrsm_olnucopyTS, ztrsm_olnncopyTS, ztrsm_oltucopyTS, ztrsm_oltncopyTS,
402 ztrsm_ounucopyTS, ztrsm_ounncopyTS, ztrsm_outucopyTS, ztrsm_outncopyTS,
403 ztrsm_olnucopyTS, ztrsm_olnncopyTS, ztrsm_oltucopyTS, ztrsm_oltncopyTS,
405 ztrmm_kernel_RNTS, ztrmm_kernel_RTTS, ztrmm_kernel_RRTS, ztrmm_kernel_RCTS,
406 ztrmm_kernel_LNTS, ztrmm_kernel_LTTS, ztrmm_kernel_LRTS, ztrmm_kernel_LCTS,
408 #if ZGEMM_DEFAULT_UNROLL_M != ZGEMM_DEFAULT_UNROLL_N
409 ztrmm_iunucopyTS, ztrmm_iunncopyTS, ztrmm_iutucopyTS, ztrmm_iutncopyTS,
410 ztrmm_ilnucopyTS, ztrmm_ilnncopyTS, ztrmm_iltucopyTS, ztrmm_iltncopyTS,
412 ztrmm_ounucopyTS, ztrmm_ounncopyTS, ztrmm_outucopyTS, ztrmm_outncopyTS,
413 ztrmm_olnucopyTS, ztrmm_olnncopyTS, ztrmm_oltucopyTS, ztrmm_oltncopyTS,
415 ztrmm_ounucopyTS, ztrmm_ounncopyTS, ztrmm_outucopyTS, ztrmm_outncopyTS,
416 ztrmm_olnucopyTS, ztrmm_olnncopyTS, ztrmm_oltucopyTS, ztrmm_oltncopyTS,
418 #if ZGEMM_DEFAULT_UNROLL_M != ZGEMM_DEFAULT_UNROLL_N
419 zsymm_iutcopyTS, zsymm_iltcopyTS,
421 zsymm_outcopyTS, zsymm_oltcopyTS,
423 zsymm_outcopyTS, zsymm_oltcopyTS,
424 #if ZGEMM_DEFAULT_UNROLL_M != ZGEMM_DEFAULT_UNROLL_N
425 zhemm_iutcopyTS, zhemm_iltcopyTS,
427 zhemm_outcopyTS, zhemm_oltcopyTS,
429 zhemm_outcopyTS, zhemm_oltcopyTS,
/* zgemm3m entries, compiled only when USE_GEMM3M is defined.
   NOTE(review): the DGEMM_* unroll line presumably stands in when
   ZGEMM3M_DEFAULT_UNROLL_M is not defined -- confirm the #else pairing. */
432 #if defined(USE_GEMM3M)
433 #ifdef ZGEMM3M_DEFAULT_UNROLL_M
434 ZGEMM3M_DEFAULT_UNROLL_M, ZGEMM3M_DEFAULT_UNROLL_N, MAX(ZGEMM3M_DEFAULT_UNROLL_M, ZGEMM3M_DEFAULT_UNROLL_N),
436 DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N, MAX(DGEMM_DEFAULT_UNROLL_M, DGEMM_DEFAULT_UNROLL_N),
442 zgemm3m_incopybTS, zgemm3m_incopyrTS,
443 zgemm3m_incopyiTS, zgemm3m_itcopybTS,
444 zgemm3m_itcopyrTS, zgemm3m_itcopyiTS,
445 zgemm3m_oncopybTS, zgemm3m_oncopyrTS,
446 zgemm3m_oncopyiTS, zgemm3m_otcopybTS,
447 zgemm3m_otcopyrTS, zgemm3m_otcopyiTS,
449 zsymm3m_iucopybTS, zsymm3m_ilcopybTS,
450 zsymm3m_iucopyrTS, zsymm3m_ilcopyrTS,
451 zsymm3m_iucopyiTS, zsymm3m_ilcopyiTS,
452 zsymm3m_oucopybTS, zsymm3m_olcopybTS,
453 zsymm3m_oucopyrTS, zsymm3m_olcopyrTS,
454 zsymm3m_oucopyiTS, zsymm3m_olcopyiTS,
456 zhemm3m_iucopybTS, zhemm3m_ilcopybTS,
457 zhemm3m_iucopyrTS, zhemm3m_ilcopyrTS,
458 zhemm3m_iucopyiTS, zhemm3m_ilcopyiTS,
460 zhemm3m_oucopybTS, zhemm3m_olcopybTS,
461 zhemm3m_oucopyrTS, zhemm3m_olcopyrTS,
462 zhemm3m_oucopyiTS, zhemm3m_olcopyiTS,
492 zneg_tcopyTS, zlaswp_ncopyTS,
/* ---- x-prefixed entries (XGEMM_* constants, x* kernels) ---- */
500 XGEMM_DEFAULT_UNROLL_M, XGEMM_DEFAULT_UNROLL_N, MAX(XGEMM_DEFAULT_UNROLL_M, XGEMM_DEFAULT_UNROLL_N),
502 xamax_kTS, xamin_kTS, ixamax_kTS, ixamin_kTS,
503 xnrm2_kTS, xasum_kTS, xsum_kTS, xcopy_kTS,
504 xdotu_kTS, xdotc_kTS, xqrot_kTS,
505 xaxpy_kTS, xaxpyc_kTS, xscal_kTS, xswap_kTS,
507 xgemv_nTS, xgemv_tTS, xgemv_rTS, xgemv_cTS,
508 xgemv_oTS, xgemv_uTS, xgemv_sTS, xgemv_dTS,
509 xgeru_kTS, xgerc_kTS, xgerv_kTS, xgerd_kTS,
510 xsymv_LTS, xsymv_UTS,
511 xhemv_LTS, xhemv_UTS, xhemv_MTS, xhemv_VTS,
513 xgemm_kernel_nTS, xgemm_kernel_lTS, xgemm_kernel_rTS, xgemm_kernel_bTS,
516 #if XGEMM_DEFAULT_UNROLL_M != XGEMM_DEFAULT_UNROLL_N
517 xgemm_incopyTS, xgemm_itcopyTS,
519 xgemm_oncopyTS, xgemm_otcopyTS,
521 xgemm_oncopyTS, xgemm_otcopyTS,
523 xtrsm_kernel_LNTS, xtrsm_kernel_LTTS, xtrsm_kernel_LRTS, xtrsm_kernel_LCTS,
524 xtrsm_kernel_RNTS, xtrsm_kernel_RTTS, xtrsm_kernel_RRTS, xtrsm_kernel_RCTS,
526 #if XGEMM_DEFAULT_UNROLL_M != XGEMM_DEFAULT_UNROLL_N
527 xtrsm_iunucopyTS, xtrsm_iunncopyTS, xtrsm_iutucopyTS, xtrsm_iutncopyTS,
528 xtrsm_ilnucopyTS, xtrsm_ilnncopyTS, xtrsm_iltucopyTS, xtrsm_iltncopyTS,
530 xtrsm_ounucopyTS, xtrsm_ounncopyTS, xtrsm_outucopyTS, xtrsm_outncopyTS,
531 xtrsm_olnucopyTS, xtrsm_olnncopyTS, xtrsm_oltucopyTS, xtrsm_oltncopyTS,
533 xtrsm_ounucopyTS, xtrsm_ounncopyTS, xtrsm_outucopyTS, xtrsm_outncopyTS,
534 xtrsm_olnucopyTS, xtrsm_olnncopyTS, xtrsm_oltucopyTS, xtrsm_oltncopyTS,
536 xtrmm_kernel_RNTS, xtrmm_kernel_RTTS, xtrmm_kernel_RRTS, xtrmm_kernel_RCTS,
537 xtrmm_kernel_LNTS, xtrmm_kernel_LTTS, xtrmm_kernel_LRTS, xtrmm_kernel_LCTS,
539 #if XGEMM_DEFAULT_UNROLL_M != XGEMM_DEFAULT_UNROLL_N
540 xtrmm_iunucopyTS, xtrmm_iunncopyTS, xtrmm_iutucopyTS, xtrmm_iutncopyTS,
541 xtrmm_ilnucopyTS, xtrmm_ilnncopyTS, xtrmm_iltucopyTS, xtrmm_iltncopyTS,
543 xtrmm_ounucopyTS, xtrmm_ounncopyTS, xtrmm_outucopyTS, xtrmm_outncopyTS,
544 xtrmm_olnucopyTS, xtrmm_olnncopyTS, xtrmm_oltucopyTS, xtrmm_oltncopyTS,
546 xtrmm_ounucopyTS, xtrmm_ounncopyTS, xtrmm_outucopyTS, xtrmm_outncopyTS,
547 xtrmm_olnucopyTS, xtrmm_olnncopyTS, xtrmm_oltucopyTS, xtrmm_oltncopyTS,
549 #if XGEMM_DEFAULT_UNROLL_M != XGEMM_DEFAULT_UNROLL_N
550 xsymm_iutcopyTS, xsymm_iltcopyTS,
552 xsymm_outcopyTS, xsymm_oltcopyTS,
554 xsymm_outcopyTS, xsymm_oltcopyTS,
555 #if XGEMM_DEFAULT_UNROLL_M != XGEMM_DEFAULT_UNROLL_N
556 xhemm_iutcopyTS, xhemm_iltcopyTS,
558 xhemm_outcopyTS, xhemm_oltcopyTS,
560 xhemm_outcopyTS, xhemm_oltcopyTS,
/* xgemm3m entries, compiled only when USE_GEMM3M is defined; the unroll
   slots reuse the QGEMM_* constants. */
563 #if defined(USE_GEMM3M)
564 QGEMM_DEFAULT_UNROLL_M, QGEMM_DEFAULT_UNROLL_N, MAX(QGEMM_DEFAULT_UNROLL_M, QGEMM_DEFAULT_UNROLL_N),
568 xgemm3m_incopybTS, xgemm3m_incopyrTS,
569 xgemm3m_incopyiTS, xgemm3m_itcopybTS,
570 xgemm3m_itcopyrTS, xgemm3m_itcopyiTS,
571 xgemm3m_oncopybTS, xgemm3m_oncopyrTS,
572 xgemm3m_oncopyiTS, xgemm3m_otcopybTS,
573 xgemm3m_otcopyrTS, xgemm3m_otcopyiTS,
575 xsymm3m_iucopybTS, xsymm3m_ilcopybTS,
576 xsymm3m_iucopyrTS, xsymm3m_ilcopyrTS,
577 xsymm3m_iucopyiTS, xsymm3m_ilcopyiTS,
578 xsymm3m_oucopybTS, xsymm3m_olcopybTS,
579 xsymm3m_oucopyrTS, xsymm3m_olcopyrTS,
580 xsymm3m_oucopyiTS, xsymm3m_olcopyiTS,
582 xhemm3m_iucopybTS, xhemm3m_ilcopybTS,
583 xhemm3m_iucopyrTS, xhemm3m_ilcopyrTS,
584 xhemm3m_iucopyiTS, xhemm3m_ilcopyiTS,
586 xhemm3m_oucopybTS, xhemm3m_olcopybTS,
587 xhemm3m_oucopyrTS, xhemm3m_olcopyrTS,
588 xhemm3m_oucopyiTS, xhemm3m_olcopyiTS,
618 xneg_tcopyTS, xlaswp_ncopyTS,
/* ---- trailing entries: *NUMOPT constants, then axpby, omatcopy,
   imatcopy and geadd kernels for the s/d/c/z prefixes ---- */
627 SNUMOPT, DNUMOPT, QNUMOPT,
629 saxpby_kTS, daxpby_kTS, caxpby_kTS, zaxpby_kTS,
631 somatcopy_k_cnTS, somatcopy_k_ctTS, somatcopy_k_rnTS, somatcopy_k_rtTS,
632 domatcopy_k_cnTS, domatcopy_k_ctTS, domatcopy_k_rnTS, domatcopy_k_rtTS,
633 comatcopy_k_cnTS, comatcopy_k_ctTS, comatcopy_k_rnTS, comatcopy_k_rtTS,
634 comatcopy_k_cncTS, comatcopy_k_ctcTS, comatcopy_k_rncTS, comatcopy_k_rtcTS,
635 zomatcopy_k_cnTS, zomatcopy_k_ctTS, zomatcopy_k_rnTS, zomatcopy_k_rtTS,
636 zomatcopy_k_cncTS, zomatcopy_k_ctcTS, zomatcopy_k_rncTS, zomatcopy_k_rtcTS,
638 simatcopy_k_cnTS, simatcopy_k_ctTS, simatcopy_k_rnTS, simatcopy_k_rtTS,
639 dimatcopy_k_cnTS, dimatcopy_k_ctTS, dimatcopy_k_rnTS, dimatcopy_k_rtTS,
640 cimatcopy_k_cnTS, cimatcopy_k_ctTS, cimatcopy_k_rnTS, cimatcopy_k_rtTS,
641 cimatcopy_k_cncTS, cimatcopy_k_ctcTS, cimatcopy_k_rncTS, cimatcopy_k_rtcTS,
642 zimatcopy_k_cnTS, zimatcopy_k_ctTS, zimatcopy_k_rnTS, zimatcopy_k_rtTS,
643 zimatcopy_k_cncTS, zimatcopy_k_ctcTS, zimatcopy_k_rncTS, zimatcopy_k_rtcTS,
645 sgeadd_kTS, dgeadd_kTS, cgeadd_kTS, zgeadd_kTS
649 #if defined(ARCH_ARM64)
/*
 * init_parameter() variant compiled when ARCH_ARM64 is defined.
 * Fills the P, Q and R fields of TABLE_NAME directly from the compile-time
 * *GEMM_DEFAULT_{P,Q,R} constants; nothing is probed at run time in the
 * statements below.
 */
650 static void init_parameter(void) {
651 TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
652 TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
653 TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
654 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
656 TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q;
657 TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q;
658 TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q;
659 TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q;
661 TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R;
662 TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R;
663 TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R;
664 TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R;
/* q/x fields, also taken straight from their *_DEFAULT_* constants. */
667 TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
668 TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
669 TABLE_NAME.qgemm_q = QGEMM_DEFAULT_Q;
670 TABLE_NAME.xgemm_q = XGEMM_DEFAULT_Q;
671 TABLE_NAME.qgemm_r = QGEMM_DEFAULT_R;
672 TABLE_NAME.xgemm_r = XGEMM_DEFAULT_R;
/* gemm3m fields (only with USE_GEMM3M): each uses its own *GEMM3M_DEFAULT_*
   constant when that macro is defined.  NOTE(review): the assignment from
   the already-set sgemm_* / dgemm_* field that follows each one is
   presumably the #else fallback -- confirm the #else/#endif pairing. */
675 #if defined(USE_GEMM3M)
676 #ifdef CGEMM3M_DEFAULT_P
677 TABLE_NAME.cgemm3m_p = CGEMM3M_DEFAULT_P;
679 TABLE_NAME.cgemm3m_p = TABLE_NAME.sgemm_p;
682 #ifdef ZGEMM3M_DEFAULT_P
683 TABLE_NAME.zgemm3m_p = ZGEMM3M_DEFAULT_P;
685 TABLE_NAME.zgemm3m_p = TABLE_NAME.dgemm_p;
688 #ifdef CGEMM3M_DEFAULT_Q
689 TABLE_NAME.cgemm3m_q = CGEMM3M_DEFAULT_Q;
691 TABLE_NAME.cgemm3m_q = TABLE_NAME.sgemm_q;
694 #ifdef ZGEMM3M_DEFAULT_Q
695 TABLE_NAME.zgemm3m_q = ZGEMM3M_DEFAULT_Q;
697 TABLE_NAME.zgemm3m_q = TABLE_NAME.dgemm_q;
700 #ifdef CGEMM3M_DEFAULT_R
701 TABLE_NAME.cgemm3m_r = CGEMM3M_DEFAULT_R;
703 TABLE_NAME.cgemm3m_r = TABLE_NAME.sgemm_r;
706 #ifdef ZGEMM3M_DEFAULT_R
707 TABLE_NAME.zgemm3m_r = ZGEMM3M_DEFAULT_R;
709 TABLE_NAME.zgemm3m_r = TABLE_NAME.dgemm_r;
/* xgemm3m reuses the qgemm values assigned above. */
713 TABLE_NAME.xgemm3m_p = TABLE_NAME.qgemm_p;
714 TABLE_NAME.xgemm3m_q = TABLE_NAME.qgemm_q;
715 TABLE_NAME.xgemm3m_r = TABLE_NAME.qgemm_r;
720 #else // defined(ARCH_ARM64)
721 #if defined(ARCH_POWER)
/*
 * init_parameter() variant compiled when ARCH_POWER is defined (and
 * ARCH_ARM64 is not).  Copies the compile-time s/d/c/z *GEMM_DEFAULT_{P,R,Q}
 * constants into the matching TABLE_NAME fields.
 */
722 static void init_parameter(void) {
724 TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
725 TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
726 TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
727 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
729 TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R;
730 TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R;
731 TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R;
732 TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R;
735 TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q;
736 TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q;
737 TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q;
738 TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q;
/*
 * L2 cache size probe based on CPUID leaf 2; used by get_l2_size() as a
 * fallback.  Splits the four result registers into byte-wide values
 * info[0..14], scans them, and prints a warning on stderr when the size
 * cannot be determined.
 * NOTE(review): the warning text says 256k is assumed in that case --
 * confirm the value actually returned.
 * NOTE(review): assumes BITMASK(v, shift, mask) is ((v) >> (shift)) & (mask)
 * -- confirm against its definition.
 */
743 static int get_l2_size_old(void){
744 int i, eax, ebx, ecx, edx, cpuid_level;
747 cpuid(2, &eax, &ebx, &ecx, &edx);
/* eax: only shifts 8, 16 and 24 are stored; the byte at shift 0 is skipped. */
749 info[ 0] = BITMASK(eax, 8, 0xff);
750 info[ 1] = BITMASK(eax, 16, 0xff);
751 info[ 2] = BITMASK(eax, 24, 0xff);
/* ebx, ecx, edx: all four byte positions of each register are stored. */
753 info[ 3] = BITMASK(ebx, 0, 0xff);
754 info[ 4] = BITMASK(ebx, 8, 0xff);
755 info[ 5] = BITMASK(ebx, 16, 0xff);
756 info[ 6] = BITMASK(ebx, 24, 0xff);
758 info[ 7] = BITMASK(ecx, 0, 0xff);
759 info[ 8] = BITMASK(ecx, 8, 0xff);
760 info[ 9] = BITMASK(ecx, 16, 0xff);
761 info[10] = BITMASK(ecx, 24, 0xff);
763 info[11] = BITMASK(edx, 0, 0xff);
764 info[12] = BITMASK(edx, 8, 0xff);
765 info[13] = BITMASK(edx, 16, 0xff);
766 info[14] = BITMASK(edx, 24, 0xff);
/* Visit each of the 15 stored bytes in turn. */
768 for (i = 0; i < 15; i++){
772 /* This table is from http://www.sandpile.org/ia32/cpuid.htm */
829 fprintf (stderr,"OpenBLAS WARNING - could not determine the L2 cache size on this system, assuming 256k\n");
/*
 * Returns the L2 cache size, read from CPUID leaf 0x80000006 as the 16-bit
 * field of ecx taken at shift 16.  A positive value is returned as is;
 * otherwise the result of get_l2_size_old() is returned.
 * NOTE(review): assumes BITMASK(v, shift, mask) is ((v) >> (shift)) & (mask)
 * -- confirm.
 * NOTE(review): the warning below is presumably guarded by a failed-probe
 * check on some targets -- confirm which preprocessor branch owns it.
 */
834 static __inline__ int get_l2_size(void){
836 int eax, ebx, ecx, edx, l2;
838 cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
840 l2 = BITMASK(ecx, 16, 0xffff);
844 fprintf (stderr,"OpenBLAS WARNING - could not determine the L2 cache size on this system, assuming 256k\n");
851 if (l2 > 0) return l2;
853 return get_l2_size_old();
/*
 * Returns an L3 cache size figure computed from CPUID leaf 0x80000006:
 * the 14-bit field of edx taken at shift 18, multiplied by 512.
 * NOTE(review): the field presumably counts 512 KB units, which would make
 * the result kilobytes -- confirm against the CPUID documentation.
 */
857 static __inline__ int get_l3_size(void){
859 int eax, ebx, ecx, edx;
861 cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
863 return BITMASK(edx, 18, 0x3fff) * 512;
/*
 * init_parameter() variant that probes the L2 size through get_l2_size().
 *
 * Steps, in order:
 *   1. Q fields are set from the compile-time *GEMM_DEFAULT_Q constants.
 *   2. P fields are set per CPU core, either from *GEMM_DEFAULT_P or from a
 *      linear formula in l2 (shifted right by 7 or by 9).
 *   3. Each P is rounded up to a multiple of the matching UNROLL_M constant.
 *   4. Each R is derived from BUFFER_SIZE, P, Q, offsetA and align.
 *
 * NOTE(review): the per-core groups are presumably each wrapped in a CORE_*
 * preprocessor guard so that only one of them is compiled, and the
 * fprintf(stderr, ...) lines presumably sit under a debug guard -- confirm.
 */
867 static void init_parameter(void) {
869 int l2 = get_l2_size();
871 (void) l2; /* dirty trick to suppress unused variable warning for targets */
872 /* where the GEMM unrolling parameters do not depend on l2 */
/* Step 1: Q values. */
874 TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q;
875 TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q;
876 TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q;
877 TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q;
/* NOTE(review): for cgemm3m_q / zgemm3m_q the SGEMM/DGEMM constant is
   presumably the fallback when the GEMM3M constant is undefined -- confirm. */
879 #ifdef CGEMM3M_DEFAULT_Q
880 TABLE_NAME.cgemm3m_q = CGEMM3M_DEFAULT_Q;
882 TABLE_NAME.cgemm3m_q = SGEMM_DEFAULT_Q;
885 #ifdef ZGEMM3M_DEFAULT_Q
886 TABLE_NAME.zgemm3m_q = ZGEMM3M_DEFAULT_Q;
888 TABLE_NAME.zgemm3m_q = DGEMM_DEFAULT_Q;
892 TABLE_NAME.qgemm_q = QGEMM_DEFAULT_Q;
893 TABLE_NAME.xgemm_q = XGEMM_DEFAULT_Q;
894 TABLE_NAME.xgemm3m_q = QGEMM_DEFAULT_Q;
/* Step 2: P values, one group per CPU core. */
897 #if defined(CORE_KATMAI) || defined(CORE_COPPERMINE) || defined(CORE_BANIAS) || defined(CORE_YONAH) || defined(CORE_ATHLON)
/* NOTE(review): the guard above also covers CORE_YONAH, which the message
   below does not mention. */
900 fprintf(stderr, "Katmai, Coppermine, Banias, Athlon\n");
903 TABLE_NAME.sgemm_p = 64 * (l2 >> 7);
904 TABLE_NAME.dgemm_p = 32 * (l2 >> 7);
905 TABLE_NAME.cgemm_p = 32 * (l2 >> 7);
906 TABLE_NAME.zgemm_p = 16 * (l2 >> 7);
908 TABLE_NAME.qgemm_p = 16 * (l2 >> 7);
909 TABLE_NAME.xgemm_p = 8 * (l2 >> 7);
913 #ifdef CORE_NORTHWOOD
916 fprintf(stderr, "Northwood\n");
919 TABLE_NAME.sgemm_p = 96 * (l2 >> 7);
920 TABLE_NAME.dgemm_p = 48 * (l2 >> 7);
921 TABLE_NAME.cgemm_p = 48 * (l2 >> 7);
922 TABLE_NAME.zgemm_p = 24 * (l2 >> 7);
924 TABLE_NAME.qgemm_p = 24 * (l2 >> 7);
925 TABLE_NAME.xgemm_p = 12 * (l2 >> 7);
932 fprintf(stderr, "Atom\n");
935 TABLE_NAME.sgemm_p = 256;
936 TABLE_NAME.dgemm_p = 128;
937 TABLE_NAME.cgemm_p = 128;
938 TABLE_NAME.zgemm_p = 64;
940 TABLE_NAME.qgemm_p = 64;
941 TABLE_NAME.xgemm_p = 32;
948 fprintf(stderr, "Prescott\n");
951 TABLE_NAME.sgemm_p = 56 * (l2 >> 7);
952 TABLE_NAME.dgemm_p = 28 * (l2 >> 7);
953 TABLE_NAME.cgemm_p = 28 * (l2 >> 7);
954 TABLE_NAME.zgemm_p = 14 * (l2 >> 7);
956 TABLE_NAME.qgemm_p = 14 * (l2 >> 7);
957 TABLE_NAME.xgemm_p = 7 * (l2 >> 7);
964 fprintf(stderr, "Core2\n");
967 TABLE_NAME.sgemm_p = 92 * (l2 >> 9) + 8;
968 TABLE_NAME.dgemm_p = 46 * (l2 >> 9) + 8;
969 TABLE_NAME.cgemm_p = 46 * (l2 >> 9) + 4;
970 TABLE_NAME.zgemm_p = 23 * (l2 >> 9) + 4;
972 TABLE_NAME.qgemm_p = 92 * (l2 >> 9) + 8;
973 TABLE_NAME.xgemm_p = 46 * (l2 >> 9) + 4;
980 fprintf(stderr, "Penryn\n");
983 TABLE_NAME.sgemm_p = 42 * (l2 >> 9) + 8;
984 TABLE_NAME.dgemm_p = 42 * (l2 >> 9) + 8;
985 TABLE_NAME.cgemm_p = 21 * (l2 >> 9) + 4;
986 TABLE_NAME.zgemm_p = 21 * (l2 >> 9) + 4;
988 TABLE_NAME.qgemm_p = 42 * (l2 >> 9) + 8;
989 TABLE_NAME.xgemm_p = 21 * (l2 >> 9) + 4;
996 fprintf(stderr, "Dunnington\n");
999 TABLE_NAME.sgemm_p = 42 * (l2 >> 9) + 8;
1000 TABLE_NAME.dgemm_p = 42 * (l2 >> 9) + 8;
1001 TABLE_NAME.cgemm_p = 21 * (l2 >> 9) + 4;
1002 TABLE_NAME.zgemm_p = 21 * (l2 >> 9) + 4;
1004 TABLE_NAME.qgemm_p = 42 * (l2 >> 9) + 8;
1005 TABLE_NAME.xgemm_p = 21 * (l2 >> 9) + 4;
1013 fprintf(stderr, "Nehalem\n");
1016 TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
1017 TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
1018 TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
1019 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
1021 TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
1022 TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
1029 fprintf(stderr, "Sandybridge\n");
1032 TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
1033 TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
1034 TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
1035 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
1037 TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
1038 TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
1045 fprintf(stderr, "Haswell\n");
1048 TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
1049 TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
1050 TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
1051 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
1053 TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
1054 TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
1061 fprintf(stderr, "SkylakeX\n");
1064 TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
1065 TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
1066 TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
1067 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
1069 TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
1070 TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
1078 fprintf(stderr, "Opteron\n");
1081 TABLE_NAME.sgemm_p = 224 + 56 * (l2 >> 7);
1082 TABLE_NAME.dgemm_p = 112 + 28 * (l2 >> 7);
1083 TABLE_NAME.cgemm_p = 112 + 28 * (l2 >> 7);
1084 TABLE_NAME.zgemm_p = 56 + 14 * (l2 >> 7);
1086 TABLE_NAME.qgemm_p = 56 + 14 * (l2 >> 7);
1087 TABLE_NAME.xgemm_p = 28 + 7 * (l2 >> 7);
1094 fprintf(stderr, "Barcelona\n");
1097 TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
1098 TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
1099 TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
1100 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
1102 TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
1103 TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
/* NOTE(review): "Bobcate" below is presumably a misspelling of "Bobcat". */
1110 fprintf(stderr, "Bobcate\n");
1113 TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
1114 TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
1115 TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
1116 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
1118 TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
1119 TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
1126 fprintf(stderr, "Bulldozer\n");
1129 TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
1130 TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
1131 TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
1132 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
1134 TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
1135 TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
1142 fprintf(stderr, "Excavator\n");
1145 TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
1146 TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
1147 TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
1148 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
1150 TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
1151 TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
1159 fprintf(stderr, "Piledriver\n");
1162 TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
1163 TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
1164 TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
1165 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
1167 TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
1168 TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
1175 fprintf(stderr, "Steamroller\n");
1178 TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
1179 TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
1180 TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
1181 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
1183 TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
1184 TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
1191 fprintf(stderr, "Zen\n");
1194 TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
1195 TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
1196 TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
1197 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
1199 TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
1200 TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
1208 fprintf(stderr, "NANO\n");
1211 TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
1212 TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
1213 TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
1214 TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
1219 TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
1220 TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
/* gemm3m P values: the dedicated constant when defined.
   NOTE(review): the copy of the sgemm_p / dgemm_p chosen above is
   presumably the #else fallback -- confirm the #else/#endif pairing. */
1226 #ifdef CGEMM3M_DEFAULT_P
1227 TABLE_NAME.cgemm3m_p = CGEMM3M_DEFAULT_P;
1229 TABLE_NAME.cgemm3m_p = TABLE_NAME.sgemm_p;
1232 #ifdef ZGEMM3M_DEFAULT_P
1233 TABLE_NAME.zgemm3m_p = ZGEMM3M_DEFAULT_P;
1235 TABLE_NAME.zgemm3m_p = TABLE_NAME.dgemm_p;
1239 TABLE_NAME.xgemm3m_p = TABLE_NAME.qgemm_p;
/* Step 3: round every P up to the next multiple of its UNROLL_M constant,
   using ((p + u - 1) / u) * u with integer division. */
1244 TABLE_NAME.sgemm_p = ((TABLE_NAME.sgemm_p + SGEMM_DEFAULT_UNROLL_M - 1)/SGEMM_DEFAULT_UNROLL_M) * SGEMM_DEFAULT_UNROLL_M;
1245 TABLE_NAME.dgemm_p = ((TABLE_NAME.dgemm_p + DGEMM_DEFAULT_UNROLL_M - 1)/DGEMM_DEFAULT_UNROLL_M) * DGEMM_DEFAULT_UNROLL_M;
1246 TABLE_NAME.cgemm_p = ((TABLE_NAME.cgemm_p + CGEMM_DEFAULT_UNROLL_M - 1)/CGEMM_DEFAULT_UNROLL_M) * CGEMM_DEFAULT_UNROLL_M;
1247 TABLE_NAME.zgemm_p = ((TABLE_NAME.zgemm_p + ZGEMM_DEFAULT_UNROLL_M - 1)/ZGEMM_DEFAULT_UNROLL_M) * ZGEMM_DEFAULT_UNROLL_M;
1249 #ifdef CGEMM3M_DEFAULT_UNROLL_M
1250 TABLE_NAME.cgemm3m_p = ((TABLE_NAME.cgemm3m_p + CGEMM3M_DEFAULT_UNROLL_M - 1)/CGEMM3M_DEFAULT_UNROLL_M) * CGEMM3M_DEFAULT_UNROLL_M;
1252 TABLE_NAME.cgemm3m_p = ((TABLE_NAME.cgemm3m_p + SGEMM_DEFAULT_UNROLL_M - 1)/SGEMM_DEFAULT_UNROLL_M) * SGEMM_DEFAULT_UNROLL_M;
1255 #ifdef ZGEMM3M_DEFAULT_UNROLL_M
1256 TABLE_NAME.zgemm3m_p = ((TABLE_NAME.zgemm3m_p + ZGEMM3M_DEFAULT_UNROLL_M - 1)/ZGEMM3M_DEFAULT_UNROLL_M) * ZGEMM3M_DEFAULT_UNROLL_M;
1258 TABLE_NAME.zgemm3m_p = ((TABLE_NAME.zgemm3m_p + DGEMM_DEFAULT_UNROLL_M - 1)/DGEMM_DEFAULT_UNROLL_M) * DGEMM_DEFAULT_UNROLL_M;
1261 #ifdef QUAD_PRECISION
1262 TABLE_NAME.qgemm_p = ((TABLE_NAME.qgemm_p + QGEMM_DEFAULT_UNROLL_M - 1)/QGEMM_DEFAULT_UNROLL_M) * QGEMM_DEFAULT_UNROLL_M;
1263 TABLE_NAME.xgemm_p = ((TABLE_NAME.xgemm_p + XGEMM_DEFAULT_UNROLL_M - 1)/XGEMM_DEFAULT_UNROLL_M) * XGEMM_DEFAULT_UNROLL_M;
1264 TABLE_NAME.xgemm3m_p = ((TABLE_NAME.xgemm3m_p + QGEMM_DEFAULT_UNROLL_M - 1)/QGEMM_DEFAULT_UNROLL_M) * QGEMM_DEFAULT_UNROLL_M;
1268 fprintf(stderr, "L2 = %8d DGEMM_P .. %d\n", l2, TABLE_NAME.dgemm_p);
/*
 * Step 4: every R below has the same shape,
 *     a = (P * Q * k + offsetA + align) & ~align
 *     R = ((BUFFER_SIZE - a) / (Q * k) - 15) & ~15
 * where k is the literal multiplier written in each statement (4, 8, 16
 * or 32).  The final "- 15) & ~15" leaves R with its low four bits clear.
 * NOTE(review): "+ align) & ~align" presumably rounds up to an alignment
 * boundary, which requires align to be a (power of two - 1) mask -- confirm.
 */
1271 TABLE_NAME.sgemm_r = (((BUFFER_SIZE -
1272 ((TABLE_NAME.sgemm_p * TABLE_NAME.sgemm_q * 4 + TABLE_NAME.offsetA
1273 + TABLE_NAME.align) & ~TABLE_NAME.align)
1274 ) / (TABLE_NAME.sgemm_q * 4) - 15) & ~15);
1276 TABLE_NAME.dgemm_r = (((BUFFER_SIZE -
1277 ((TABLE_NAME.dgemm_p * TABLE_NAME.dgemm_q * 8 + TABLE_NAME.offsetA
1278 + TABLE_NAME.align) & ~TABLE_NAME.align)
1279 ) / (TABLE_NAME.dgemm_q * 8) - 15) & ~15);
1282 TABLE_NAME.qgemm_r = (((BUFFER_SIZE -
1283 ((TABLE_NAME.qgemm_p * TABLE_NAME.qgemm_q * 16 + TABLE_NAME.offsetA
1284 + TABLE_NAME.align) & ~TABLE_NAME.align)
1285 ) / (TABLE_NAME.qgemm_q * 16) - 15) & ~15);
1288 TABLE_NAME.cgemm_r = (((BUFFER_SIZE -
1289 ((TABLE_NAME.cgemm_p * TABLE_NAME.cgemm_q * 8 + TABLE_NAME.offsetA
1290 + TABLE_NAME.align) & ~TABLE_NAME.align)
1291 ) / (TABLE_NAME.cgemm_q * 8) - 15) & ~15);
1293 TABLE_NAME.zgemm_r = (((BUFFER_SIZE -
1294 ((TABLE_NAME.zgemm_p * TABLE_NAME.zgemm_q * 16 + TABLE_NAME.offsetA
1295 + TABLE_NAME.align) & ~TABLE_NAME.align)
1296 ) / (TABLE_NAME.zgemm_q * 16) - 15) & ~15);
1298 TABLE_NAME.cgemm3m_r = (((BUFFER_SIZE -
1299 ((TABLE_NAME.cgemm3m_p * TABLE_NAME.cgemm3m_q * 8 + TABLE_NAME.offsetA
1300 + TABLE_NAME.align) & ~TABLE_NAME.align)
1301 ) / (TABLE_NAME.cgemm3m_q * 8) - 15) & ~15);
1303 TABLE_NAME.zgemm3m_r = (((BUFFER_SIZE -
1304 ((TABLE_NAME.zgemm3m_p * TABLE_NAME.zgemm3m_q * 16 + TABLE_NAME.offsetA
1305 + TABLE_NAME.align) & ~TABLE_NAME.align)
1306 ) / (TABLE_NAME.zgemm3m_q * 16) - 15) & ~15);
1312 TABLE_NAME.xgemm_r = (((BUFFER_SIZE -
1313 ((TABLE_NAME.xgemm_p * TABLE_NAME.xgemm_q * 32 + TABLE_NAME.offsetA
1314 + TABLE_NAME.align) & ~TABLE_NAME.align)
1315 ) / (TABLE_NAME.xgemm_q * 32) - 15) & ~15);
1317 TABLE_NAME.xgemm3m_r = (((BUFFER_SIZE -
1318 ((TABLE_NAME.xgemm3m_p * TABLE_NAME.xgemm3m_q * 32 + TABLE_NAME.offsetA
1319 + TABLE_NAME.align) & ~TABLE_NAME.align)
1320 ) / (TABLE_NAME.xgemm3m_q * 32) - 15) & ~15);
1328 #endif //defined(ARCH_ARM64)