1 /*********************************************************************/
2 /* Copyright 2009, 2010 The University of Texas at Austin. */
3 /* All rights reserved. */
5 /* Redistribution and use in source and binary forms, with or */
6 /* without modification, are permitted provided that the following */
7 /* conditions are met: */
9 /* 1. Redistributions of source code must retain the above */
10 /* copyright notice, this list of conditions and the following */
13 /* 2. Redistributions in binary form must reproduce the above */
14 /* copyright notice, this list of conditions and the following */
15 /* disclaimer in the documentation and/or other materials */
16 /* provided with the distribution. */
18 /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
19 /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
20 /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
21 /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
22 /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
23 /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
24 /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
25 /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
26 /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
27 /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
28 /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
29 /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
30 /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
31 /* POSSIBILITY OF SUCH DAMAGE. */
33 /* The views and conclusions contained in the software and */
34 /* documentation are those of the authors and should not be */
35 /* interpreted as representing official policies, either expressed */
36 /* or implied, of The University of Texas at Austin. */
37 /*********************************************************************/
/* Defined outside this file. Its return value is used by the x86
 * blas_set_parameter() below as a scaling factor expressed in hundredths
 * (it is clamped to [10, 200] and multiplied by 1.e-2 there). */
43 extern int openblas_block_factor();
/* Forward declaration; a definition appears below under the
 * ARCH_X86 / ARCH_X86_64 guard. */
44 int get_L2_size(void);
/* Fallback initial values for the global P/Q/R blocking parameters and the
 * A/B buffer offsets defined below. */
46 #define DEFAULT_GEMM_P 128
47 #define DEFAULT_GEMM_Q 128
48 #define DEFAULT_GEMM_R 128
49 #define DEFAULT_GEMM_OFFSET_A 0
50 #define DEFAULT_GEMM_OFFSET_B 0
52 /* Global Parameter */
/* Each tunable below is defined as a global BLASLONG, with two alternative
 * initialisers selected by a preprocessor test of the form `#if X == x`.
 * In a #if expression an identifier that is not a macro evaluates to 0, so
 * the test is true when the upper-case macro X expands to the bare
 * lower-case variable name x; in that case the DEFAULT_* value is used,
 * otherwise the variable is initialised from X itself.
 * NOTE(review): the upper-case macros are defined outside this view;
 * presumably they are either aliases for these variables or numeric
 * constants -- confirm. A macro that expands to literal 0 would also take
 * the DEFAULT_* branch.
 * NOTE(review): the #else/#endif lines separating the two initialisers are
 * not visible in this listing (the embedded line numbers have gaps) --
 * confirm against the full file. */
/* Buffer offsets for A and B. */
53 #if GEMM_OFFSET_A == gemm_offset_a
54 BLASLONG gemm_offset_a = DEFAULT_GEMM_OFFSET_A;
56 BLASLONG gemm_offset_a = GEMM_OFFSET_A;
59 #if GEMM_OFFSET_B == gemm_offset_b
60 BLASLONG gemm_offset_b = DEFAULT_GEMM_OFFSET_B;
62 BLASLONG gemm_offset_b = GEMM_OFFSET_B;
/* P parameters for the s/d/c/z variants. */
65 #if SGEMM_P == sgemm_p
66 BLASLONG sgemm_p = DEFAULT_GEMM_P;
68 BLASLONG sgemm_p = SGEMM_P;
70 #if DGEMM_P == dgemm_p
71 BLASLONG dgemm_p = DEFAULT_GEMM_P;
73 BLASLONG dgemm_p = DGEMM_P;
75 #if CGEMM_P == cgemm_p
76 BLASLONG cgemm_p = DEFAULT_GEMM_P;
78 BLASLONG cgemm_p = CGEMM_P;
80 #if ZGEMM_P == zgemm_p
81 BLASLONG zgemm_p = DEFAULT_GEMM_P;
83 BLASLONG zgemm_p = ZGEMM_P;
/* Q parameters for the s/d/c/z variants. */
86 #if SGEMM_Q == sgemm_q
87 BLASLONG sgemm_q = DEFAULT_GEMM_Q;
89 BLASLONG sgemm_q = SGEMM_Q;
91 #if DGEMM_Q == dgemm_q
92 BLASLONG dgemm_q = DEFAULT_GEMM_Q;
94 BLASLONG dgemm_q = DGEMM_Q;
96 #if CGEMM_Q == cgemm_q
97 BLASLONG cgemm_q = DEFAULT_GEMM_Q;
99 BLASLONG cgemm_q = CGEMM_Q;
101 #if ZGEMM_Q == zgemm_q
102 BLASLONG zgemm_q = DEFAULT_GEMM_Q;
104 BLASLONG zgemm_q = ZGEMM_Q;
/* R parameters for the s/d/c/z variants. */
107 #if SGEMM_R == sgemm_r
108 BLASLONG sgemm_r = DEFAULT_GEMM_R;
110 BLASLONG sgemm_r = SGEMM_R;
112 #if DGEMM_R == dgemm_r
113 BLASLONG dgemm_r = DEFAULT_GEMM_R;
115 BLASLONG dgemm_r = DGEMM_R;
117 #if CGEMM_R == cgemm_r
118 BLASLONG cgemm_r = DEFAULT_GEMM_R;
120 BLASLONG cgemm_r = CGEMM_R;
122 #if ZGEMM_R == zgemm_r
123 BLASLONG zgemm_r = DEFAULT_GEMM_R;
125 BLASLONG zgemm_r = ZGEMM_R;
/* The q* and x* variants of P/Q/R are only defined when EXPRECISION or
 * QUAD_PRECISION is enabled. */
128 #if defined(EXPRECISION) || defined(QUAD_PRECISION)
129 #if QGEMM_P == qgemm_p
130 BLASLONG qgemm_p = DEFAULT_GEMM_P;
132 BLASLONG qgemm_p = QGEMM_P;
134 #if XGEMM_P == xgemm_p
135 BLASLONG xgemm_p = DEFAULT_GEMM_P;
137 BLASLONG xgemm_p = XGEMM_P;
139 #if QGEMM_Q == qgemm_q
140 BLASLONG qgemm_q = DEFAULT_GEMM_Q;
142 BLASLONG qgemm_q = QGEMM_Q;
144 #if XGEMM_Q == xgemm_q
145 BLASLONG xgemm_q = DEFAULT_GEMM_Q;
147 BLASLONG xgemm_q = XGEMM_Q;
149 #if QGEMM_R == qgemm_r
150 BLASLONG qgemm_r = DEFAULT_GEMM_R;
152 BLASLONG qgemm_r = QGEMM_R;
154 #if XGEMM_R == xgemm_r
155 BLASLONG xgemm_r = DEFAULT_GEMM_R;
157 BLASLONG xgemm_r = XGEMM_R;
/* x86 / x86-64 only. */
161 #if defined(ARCH_X86) || defined(ARCH_X86_64)
/* get_L2_size: derives a cache-size figure from CPUID and returns it as an
 * int. Two strategies are visible, selected at compile time by the target
 * core macro.
 * NOTE(review): BITMASK and cpuid are defined outside this view; BITMASK(reg,
 * shift, mask) presumably yields (reg >> shift) & mask -- confirm. The unit
 * of the returned value is not established by the visible code.
 * NOTE(review): this listing omits lines (declaration of `info` and `i`, the
 * loop body, the #else/#endif and the closing brace) -- confirm against the
 * full file. */
163 int get_L2_size(void){
165 int eax, ebx, ecx, edx;
167 #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \
168 defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
169 defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \
170 defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || defined(SKYLAKEX)
/* Strategy 1: query CPUID leaf 0x80000006 and return the field selected by
 * shift 16 / mask 0xffff of ECX. */
172 cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
174 return BITMASK(ecx, 16, 0xffff);
/* Strategy 2: query CPUID leaf 2 and split the four result registers into
 * 15 one-byte entries. The lowest byte of EAX (shift 0) is deliberately not
 * stored; EBX, ECX and EDX contribute all four bytes each. */
181 cpuid(2, &eax, &ebx, &ecx, &edx);
183 info[ 0] = BITMASK(eax, 8, 0xff);
184 info[ 1] = BITMASK(eax, 16, 0xff);
185 info[ 2] = BITMASK(eax, 24, 0xff);
187 info[ 3] = BITMASK(ebx, 0, 0xff);
188 info[ 4] = BITMASK(ebx, 8, 0xff);
189 info[ 5] = BITMASK(ebx, 16, 0xff);
190 info[ 6] = BITMASK(ebx, 24, 0xff);
192 info[ 7] = BITMASK(ecx, 0, 0xff);
193 info[ 8] = BITMASK(ecx, 8, 0xff);
194 info[ 9] = BITMASK(ecx, 16, 0xff);
195 info[10] = BITMASK(ecx, 24, 0xff);
197 info[11] = BITMASK(edx, 0, 0xff);
198 info[12] = BITMASK(edx, 8, 0xff);
199 info[13] = BITMASK(edx, 16, 0xff);
200 info[14] = BITMASK(edx, 24, 0xff);
/* Walk all 15 entries; what is done with each entry is not visible in this
 * listing. */
202 for (i = 0; i < 15; i++){
/* blas_set_parameter (x86 / x86-64): recomputes the global *gemm_p and
 * *gemm_r blocking parameters at run time.
 * Visible stages, in order:
 *   1. per-core formulas that set *gemm_p, most of them from `size`
 *      (initialised from get_L2_size());
 *   2. scaling of every *gemm_p by openblas_block_factor() / 100;
 *   3. fallback to 64 for any *gemm_p that ended up 0;
 *   4. rounding of *gemm_p up to a multiple of the matching *_UNROLL_M;
 *   5. derivation of *gemm_r from BUFFER_SIZE, P and Q;
 *   6. a diagnostic dump to stderr.
 * NOTE(review): this listing omits many lines (most formulas, every
 * #else/#endif, local declarations such as `factor`, and closing braces), so
 * the exact conditions guarding each statement cannot be determined here --
 * confirm against the full file. */
251 void blas_set_parameter(void){
/* NOTE(review): this list tests SANDYBRIDGE and NEHALEM, whereas
 * get_L2_size() above tests CORE_SANDYBRIDGE and CORE_NEHALEM -- confirm
 * which spelling the build system actually defines. */
254 #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || defined(SKYLAKEX)
257 int size = get_L2_size();
/* ---- Stage 1: per-core formulas for *gemm_p ---- */
260 #if defined(CORE_KATMAI) || defined(CORE_COPPERMINE) || defined(CORE_BANIAS)
/* With HAVE_HIT > 1 on CORE_BANIAS the per-type multiplier is divided by
 * HAVE_HIT (integer division) before being multiplied by size. */
263 #if defined(CORE_BANIAS) && (HAVE_HIT > 1)
264 sgemm_p = 64 / HAVE_HIT * size;
265 dgemm_p = 32 / HAVE_HIT * size;
266 cgemm_p = 32 / HAVE_HIT * size;
267 zgemm_p = 16 / HAVE_HIT * size;
269 qgemm_p = 16 / HAVE_HIT * size;
270 xgemm_p = 8 / HAVE_HIT * size;
272 #ifdef QUAD_PRECISION
273 qgemm_p = 8 / HAVE_HIT * size;
274 xgemm_p = 4 / HAVE_HIT * size;
285 #ifdef QUAD_PRECISION
292 #if defined(CORE_NORTHWOOD)
296 sgemm_p = 128 * size;
304 #ifdef QUAD_PRECISION
317 #ifdef QUAD_PRECISION
324 #if defined(CORE_CORE2)
337 #ifdef QUAD_PRECISION
356 #ifdef QUAD_PRECISION
357 qgemm_p = 21 * size + 4;
358 xgemm_p = 10 * size + 2;
362 #if defined(DUNNINGTON)
375 #ifdef QUAD_PRECISION
376 qgemm_p = 21 * size + 4;
377 xgemm_p = 10 * size + 2;
392 #if defined(SANDYBRIDGE)
403 #if defined(CORE_PRESCOTT) || defined(GENERIC)
/* Cap size at 16 before it is used by the formulas of this section. */
406 if (size > 16) size = 16;
416 #ifdef QUAD_PRECISION
/* CORE_OPTERON: a fixed base plus 14 per step of size, where the step
 * doubles (shift grows by one) each time the base halves. */
422 #if defined(CORE_OPTERON)
423 sgemm_p = 224 + 14 * (size >> 5);
424 dgemm_p = 112 + 14 * (size >> 6);
425 cgemm_p = 116 + 14 * (size >> 6);
426 zgemm_p = 58 + 14 * (size >> 7);
428 qgemm_p = 58 + 14 * (size >> 7);
429 xgemm_p = 29 + 14 * (size >> 8);
431 #ifdef QUAD_PRECISION
432 qgemm_p = 29 + 14 * (size >> 8);
433 xgemm_p = 15 + 14 * (size >> 9);
448 #ifdef QUAD_PRECISION
454 #if defined(CORE_BARCELONA) || defined(CORE_BOBCAT)
457 sgemm_p = 232 * size;
458 dgemm_p = 116 * size;
459 cgemm_p = 116 * size;
465 #ifdef QUAD_PRECISION
/* ---- Stage 2: user scaling factor ----
 * factor is clamped to [10, 200] and applied as factor * 1.e-2, i.e. as a
 * percentage. `& ~7L` clears the low three bits, rounding the scaled value
 * down to a multiple of 8 (which can produce 0; see stage 3). */
471 factor=openblas_block_factor();
473 if (factor < 10) factor = 10;
474 if (factor > 200) factor = 200;
476 sgemm_p = ((long)((double)sgemm_p * (double)factor * 1.e-2)) & ~7L;
477 dgemm_p = ((long)((double)dgemm_p * (double)factor * 1.e-2)) & ~7L;
478 cgemm_p = ((long)((double)cgemm_p * (double)factor * 1.e-2)) & ~7L;
479 zgemm_p = ((long)((double)zgemm_p * (double)factor * 1.e-2)) & ~7L;
481 qgemm_p = ((long)((double)qgemm_p * (double)factor * 1.e-2)) & ~7L;
482 xgemm_p = ((long)((double)xgemm_p * (double)factor * 1.e-2)) & ~7L;
/* ---- Stage 3: never leave a P value at 0 ---- */
486 if (sgemm_p == 0) sgemm_p = 64;
487 if (dgemm_p == 0) dgemm_p = 64;
488 if (cgemm_p == 0) cgemm_p = 64;
489 if (zgemm_p == 0) zgemm_p = 64;
491 if (qgemm_p == 0) qgemm_p = 64;
492 if (xgemm_p == 0) xgemm_p = 64;
495 #ifdef QUAD_PRECISION
496 if (qgemm_p == 0) qgemm_p = 64;
497 if (xgemm_p == 0) xgemm_p = 64;
/* ---- Stage 4: round P up to a whole number of *_UNROLL_M units ---- */
500 sgemm_p = ((sgemm_p + SGEMM_UNROLL_M - 1)/SGEMM_UNROLL_M) * SGEMM_UNROLL_M;
501 dgemm_p = ((dgemm_p + DGEMM_UNROLL_M - 1)/DGEMM_UNROLL_M) * DGEMM_UNROLL_M;
502 cgemm_p = ((cgemm_p + CGEMM_UNROLL_M - 1)/CGEMM_UNROLL_M) * CGEMM_UNROLL_M;
503 zgemm_p = ((zgemm_p + ZGEMM_UNROLL_M - 1)/ZGEMM_UNROLL_M) * ZGEMM_UNROLL_M;
504 #ifdef QUAD_PRECISION
505 qgemm_p = ((qgemm_p + QGEMM_UNROLL_M - 1)/QGEMM_UNROLL_M) * QGEMM_UNROLL_M;
506 xgemm_p = ((xgemm_p + XGEMM_UNROLL_M - 1)/XGEMM_UNROLL_M) * XGEMM_UNROLL_M;
/* ---- Stage 5: derive R from the buffer budget ----
 * R = ((BUFFER_SIZE - align(P * Q * k + GEMM_OFFSET_A)) / (Q * k) - 15),
 * with the low four bits cleared, where k is 4, 8, 8, 16, 16, 32 for
 * s, d, c, z, q, x respectively.
 * NOTE(review): k is presumably the element size in bytes, and
 * `(v + GEMM_ALIGN) & ~GEMM_ALIGN` presumably rounds v up assuming
 * GEMM_ALIGN is a 2^n - 1 mask -- confirm where those macros are defined.
 * NOTE(review): these formulas read the upper-case *_P / *_Q macros rather
 * than the lower-case variables updated above; this only picks up the new
 * P values if the macros alias the variables -- confirm. */
509 sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15;
510 dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15;
511 cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15;
512 zgemm_r = (((BUFFER_SIZE - ((ZGEMM_P * ZGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (ZGEMM_Q * 16)) - 15) & ~15;
513 #if defined(EXPRECISION) || defined(QUAD_PRECISION)
514 qgemm_r = (((BUFFER_SIZE - ((QGEMM_P * QGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (QGEMM_Q * 16)) - 15) & ~15;
515 xgemm_r = (((BUFFER_SIZE - ((XGEMM_P * XGEMM_Q * 32 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (XGEMM_Q * 32)) - 15) & ~15;
/* ---- Stage 6: diagnostic dump of P, Q, R per type to stderr ----
 * NOTE(review): the arguments are printed with %3d; if the *_P/_Q/_R macros
 * expand to the BLASLONG globals and BLASLONG is wider than int, the
 * conversion specifier does not match the argument type -- confirm and use a
 * matching specifier or a cast. The guard enabling this dump is not visible
 * here. */
519 fprintf(stderr, "SGEMM ... %3d, %3d, %3d\n", SGEMM_P, SGEMM_Q, SGEMM_R);
520 fprintf(stderr, "DGEMM ... %3d, %3d, %3d\n", DGEMM_P, DGEMM_Q, DGEMM_R);
521 fprintf(stderr, "CGEMM ... %3d, %3d, %3d\n", CGEMM_P, CGEMM_Q, CGEMM_R);
522 fprintf(stderr, "ZGEMM ... %3d, %3d, %3d\n", ZGEMM_P, ZGEMM_Q, ZGEMM_R);
/* get_current_cpu_info: reads several CPUID leaves to collect the logical
 * processor count, core count, APIC id and related flags, and prints the
 * APIC id and core count to stderr.
 * NOTE(review): BITMASK(reg, shift, mask) is defined elsewhere; presumably
 * (reg >> shift) & mask -- confirm.
 * NOTE(review): the declarations of `apicid` and `htt`, the use made of
 * `cmplegacy`, the return statement and all #endif lines are not visible in
 * this listing -- confirm against the full file. */
530 int get_current_cpu_info(void){
532 int nlprocs, ncores, cmplegacy;
536 #if defined(CORE_PRESCOTT) || defined(CORE_OPTERON)
537 int eax, ebx, ecx, edx;
/* CPUID leaf 1: EBX shift 16 -> nlprocs, EBX shift 24 -> apicid,
 * EDX bit 28 -> htt. */
539 cpuid(1, &eax, &ebx, &ecx, &edx);
540 nlprocs = BITMASK(ebx, 16, 0xff);
541 apicid = BITMASK(ebx, 24, 0xff);
542 htt = BITMASK(edx, 28, 0x01);
545 #if defined(CORE_PRESCOTT)
/* CPUID leaf 4: 6-bit field at EAX shift 26 -> ncores. */
546 cpuid(4, &eax, &ebx, &ecx, &edx);
547 ncores = BITMASK(eax, 26, 0x3f);
/* When the htt flag is clear, the logical-processor count is forced to 0. */
549 if (htt == 0) nlprocs = 0;
552 #if defined(CORE_OPTERON)
/* CPUID leaf 0x80000008: low byte of ECX -> ncores. */
553 cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
554 ncores = BITMASK(ecx, 0, 0xff);
/* CPUID leaf 0x80000001: ECX bit 1 -> cmplegacy. */
556 cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
557 cmplegacy = BITMASK(ecx, 1, 0x01);
568 fprintf(stderr, "APICID = %d Number of core = %d\n", apicid, ncores);
/* IA-64 only. */
576 #if defined(ARCH_IA64)
/* cpuid: reads the CPUID register selected by regnum into `value`.
 * Two alternative implementations are visible: an inline-asm form using
 * `mov %0=cpuid[%r1]`, and an intrinsic form using __getIndReg.
 * NOTE(review): the conditional that selects between them, the declaration
 * of `value` and the return statement are not visible in this listing --
 * confirm against the full file. */
578 static inline BLASULONG cpuid(BLASULONG regnum){
582 asm ("mov %0=cpuid[%r1]" : "=r"(value) : "rO"(regnum));
584 value = __getIndReg(_IA64_REG_INDR_CPUID, regnum);
/* blas_set_parameter (IA-64, CPUID-based variant): sets every *gemm_p to a
 * per-type constant multiplied by (size + 1), where size is the 8-bit field
 * at shift 16 of cpuid3, then derives *gemm_r from BUFFER_SIZE, P and Q.
 * NOTE(review): the assignment of cpuid3, the guards around the two q/x
 * assignment pairs and the closing lines are not visible in this listing --
 * confirm against the full file. */
592 void blas_set_parameter(void){
594 BLASULONG cpuid3, size;
598 size = BITMASK(cpuid3, 16, 0xff);
600 sgemm_p = 192 * (size + 1);
601 dgemm_p = 96 * (size + 1);
602 cgemm_p = 96 * (size + 1);
603 zgemm_p = 48 * (size + 1);
605 qgemm_p = 64 * (size + 1);
606 xgemm_p = 32 * (size + 1);
608 #ifdef QUAD_PRECISION
609 qgemm_p = 32 * (size + 1);
610 xgemm_p = 16 * (size + 1);
/* R = ((BUFFER_SIZE - align(P * Q * k + GEMM_OFFSET_A)) / (Q * k) - 15) with
 * the low four bits cleared; k is 4, 8, 8, 16, 16, 32 for s, d, c, z, q, x.
 * NOTE(review): k is presumably the element size in bytes -- confirm. */
613 sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15;
614 dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15;
615 cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15;
616 zgemm_r = (((BUFFER_SIZE - ((ZGEMM_P * ZGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (ZGEMM_Q * 16)) - 15) & ~15;
617 #if defined(EXPRECISION) || defined(QUAD_PRECISION)
618 qgemm_r = (((BUFFER_SIZE - ((QGEMM_P * QGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (QGEMM_Q * 16)) - 15) & ~15;
619 xgemm_r = (((BUFFER_SIZE - ((XGEMM_P * XGEMM_Q * 32 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (XGEMM_Q * 32)) - 15) & ~15;
/* Files consulted by the variant of blas_set_parameter() below. */
627 #define IA64_SYS_NAME "/sys/devices/system/cpu/cpu0/cache/index3/size"
628 #define IA64_PROC_NAME "/proc/pal/cpu0/cache_info"
/* blas_set_parameter (IA-64, file-based variant): derives `size` from one of
 * three sources and then sets *gemm_p = constant * size and *gemm_r from
 * BUFFER_SIZE, P and Q.
 * Sources visible here:
 *   a. IA64_SYS_NAME: first line parsed with atoi and divided by 1536;
 *   b. IA64_PROC_NAME: the line following the one that starts with
 *      "Data/Instruction Cache level 3" is cut at "bytes", and the number
 *      after ':' is divided by 1572864 (== 1536 * 1024);
 *   c. cpuid3 field at shift 16, plus 1.
 * NOTE(review): the conditions chaining a -> b -> c (e.g. "only if size is
 * still unset"), the declarations of infile/buffer/size/cpuid3, the fclose
 * calls and the closing braces are not visible in this listing -- confirm
 * against the full file. */
630 void blas_set_parameter(void){
639 if ((infile = fopen(IA64_SYS_NAME, "r")) != NULL) {
/* NOTE(review): fgets' return value is ignored; on a read failure `buffer`
 * would be parsed with whatever it previously held -- confirm. */
641 fgets(buffer, sizeof(buffer), infile);
644 size = atoi(buffer) / 1536;
648 if ((infile = fopen(IA64_PROC_NAME, "r")) != NULL) {
/* Scan forward to the level-3 cache header line (first 30 characters are
 * compared). */
650 while(fgets(buffer, sizeof(buffer), infile) != NULL) {
651 if ((!strncmp("Data/Instruction Cache level 3", buffer, 30))) break;
/* Read the line after the header; it is expected to carry the size. */
654 fgets(buffer, sizeof(buffer), infile);
/* Terminate the string where "bytes" begins so atoi sees only the number.
 * NOTE(review): strstr returns NULL when "bytes" is absent, which would make
 * this store a NULL dereference unless guarded on lines not visible here.
 * `(char)NULL` converts a null pointer constant to char; '\0' is the
 * conventional spelling. */
658 *strstr(buffer, "bytes") = (char)NULL;
/* NOTE(review): strchr may likewise return NULL if no ':' is present, and
 * atoi cannot report a parse error -- confirm the input format is
 * guaranteed. */
660 size = atoi(strchr(buffer, ':') + 1) / 1572864;
665 /* The last resort */
670 size = BITMASK(cpuid3, 16, 0xff) + 1;
673 sgemm_p = 320 * size;
674 dgemm_p = 160 * size;
675 cgemm_p = 160 * size;
/* R = ((BUFFER_SIZE - align(P * Q * k + GEMM_OFFSET_A)) / (Q * k) - 15) with
 * the low four bits cleared; k is 4, 8, 8, 16, 16, 32 for s, d, c, z, q, x. */
682 sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15;
683 dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15;
684 cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15;
685 zgemm_r = (((BUFFER_SIZE - ((ZGEMM_P * ZGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (ZGEMM_Q * 16)) - 15) & ~15;
687 qgemm_r = (((BUFFER_SIZE - ((QGEMM_P * QGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (QGEMM_Q * 16)) - 15) & ~15;
688 xgemm_r = (((BUFFER_SIZE - ((XGEMM_P * XGEMM_Q * 32 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (XGEMM_Q * 32)) - 15) & ~15;
/* MIPS64 only. */
698 #if defined(ARCH_MIPS64)
/* blas_set_parameter (MIPS64): branches on the global blas_num_threads, with
 * a separate test for LOONGSON3A (exactly 1 thread) and LOONGSON3B (1 or 2
 * threads).
 * NOTE(review): the bodies of both branches, any else-arms and the closing
 * lines are not visible in this listing, so which parameters are set cannot
 * be stated here -- confirm against the full file. */
699 void blas_set_parameter(void){
700 #if defined(LOONGSON3A)
702 if(blas_num_threads == 1){
714 #if defined(LOONGSON3B)
716 if(blas_num_threads == 1 || blas_num_threads == 2){
/* ARM64 only. */
731 #if defined(ARCH_ARM64)
/* Globals holding three DGEMM prefetch sizes (a, b, c); only defined for
 * the VULCAN, THUNDERX2T99 and ARMV8 targets. */
733 #if defined(VULCAN) || defined(THUNDERX2T99) || defined(ARMV8)
734 unsigned long dgemm_prefetch_size_a;
735 unsigned long dgemm_prefetch_size_b;
736 unsigned long dgemm_prefetch_size_c;
/* blas_set_parameter (ARM64): for the targets listed above, assigns the
 * three prefetch-size globals (3584, 512 and 128 are the values visible
 * here).
 * NOTE(review): the lines between the #if and these assignments are not
 * visible in this listing, so it is unknown whether the assignments are
 * unconditional or one case among several -- confirm against the full file.
 * The unit of the values is not established by the visible code. */
739 void blas_set_parameter(void)
741 #if defined(VULCAN) || defined(THUNDERX2T99) || defined(ARMV8)
758 dgemm_prefetch_size_a = 3584;
759 dgemm_prefetch_size_b = 512;
760 dgemm_prefetch_size_c = 128;