Makefile.rule

   1 #
   2 #  Beginning of user configuration
   3 #
   4
   5 # This library's version
   6 VERSION = 0.3.20
   7
   8 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
   9 # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
  10 # is libopenblas_$(LIBNAMESUFFIX).so.0.
  11 # LIBNAMESUFFIX = omp
  12
  13 # You can specify the target architecture, otherwise it's
  14 # automatically detected.
  15 # TARGET = PENRYN
  16
  17 # If you want to support multiple architectures in one binary
  18 # DYNAMIC_ARCH = 1
  19
  20 # If you want the full list of x86_64 architectures supported in DYNAMIC_ARCH
  21 # mode (including individual optimized codes for PENRYN, DUNNINGTON, OPTERON,
  22 # OPTERON_SSE3, ATOM and NANO rather than fallbacks to older architectures)
  23 # DYNAMIC_OLDER = 1
  24
  25 # C compiler including binary type(32bit / 64bit). Default is gcc.
  26 # Don't use the Intel Compiler or PGI; they won't generate correct code as I expect.
  27 # CC = gcc
  28
  29 # Fortran compiler. Default is g77.
  30 # FC = gfortran
  31
  32 # You can even specify a cross compiler. In that case, please also set HOSTCC.
  33
  34 # cross compiler for Windows
  35 # CC = x86_64-w64-mingw32-gcc
  36 # FC = x86_64-w64-mingw32-gfortran
  37
  38 # cross compiler for 32bit ARM
  39 # CC = arm-linux-gnueabihf-gcc
  40 # FC = arm-linux-gnueabihf-gfortran
  41
  42 # cross compiler for 64bit ARM
  43 # CC = aarch64-linux-gnu-gcc
  44 # FC = aarch64-linux-gnu-gfortran
  45
  46
  47 # If you use the cross compiler, please set this host compiler.
  48 # HOSTCC = gcc
  49
  50 # If you need 32bit binary, define BINARY=32, otherwise define BINARY=64
  51 # Please note that AVX is not available on 32-bit.
  52 # Setting BINARY=32 disables AVX/AVX2/AVX-512.
  53 # BINARY=64
  54
  55 # About threaded BLAS. It will be automatically detected if you don't
  56 # specify it.
  57 # To force single-threaded operation, specify USE_THREAD = 0
  58 # To force multi-threaded operation,  specify USE_THREAD = 1
  59 # USE_THREAD = 0
  60
  61 # If you want to build a single-threaded OpenBLAS, but expect to call this
  62 # from several concurrent threads in some other program, comment this in for
  63 # thread safety. (This is done automatically for USE_THREAD=1, and should not
  64 # be necessary when USE_OPENMP=1)
  65 # USE_LOCKING = 1
  66
  67 # If you're going to use this library with OpenMP, please comment it in.
  68 # This flag is always set for POWER8. Don't set USE_OPENMP = 0 if you're targeting POWER8.
  69 # USE_OPENMP = 1
  70
  71 # The OpenMP scheduler to use - by default this is "static" and you
  72 # will normally not want to change this unless you know that your main
  73 # workload will involve tasks that have highly unbalanced running times
  74 # for individual threads. Changing away from "static" may also adversely
  75 # affect memory access locality in NUMA systems. Setting to "runtime" will
  76 # allow you to select the scheduler from the environment variable OMP_SCHEDULE
  77 # CCOMMON_OPT += -DOMP_SCHED=dynamic
  78
  79 # You can define the maximum number of threads. Basically it should be less
  80 # than or equal to the number of CPU threads. If you don't specify one, it's
  81 # automatically detected by the build system.
  82 # If SMT (aka. HT) is enabled on the system, it may or may not be beneficial to
  83 # restrict NUM_THREADS to the number of physical cores. By default, the automatic
  84 # detection includes logical CPUs, thus allowing the use of SMT.
  85 # Users may opt at runtime to use less than NUM_THREADS threads.
  86 #
  87 # Note for package maintainers: you can build OpenBLAS with a large NUM_THREADS
  88 # value (eg. 32-256) if you expect your users to use that many threads. Due to the way
  89 # some internal structures are allocated, using a large NUM_THREADS value has a RAM
  90 # footprint penalty, even if users reduce the actual number of threads at runtime.
  91 # NUM_THREADS = 24
  92
  93 # If you have enabled USE_OPENMP and your application would call
  94 # OpenBLAS's calculation API from multiple threads, please comment this in.
  95 # This flag defines how many instances of OpenBLAS's calculation API can actually
  96 # run in parallel. If more than NUM_PARALLEL threads call OpenBLAS's calculation API,
  97 # they need to wait for the preceding API calls to finish or risk data corruption.
  98 # NUM_PARALLEL = 2
  99
 100 # When multithreading, OpenBLAS needs to use a memory buffer for communicating
 101 # and collating results for individual subranges of the original matrix. Since
 102 # the original GotoBLAS of the early 2000s, the default size of this buffer has
 103 # been set at a value of 32<<20 (which is 32MB) on x86_64, twice that on PPC.
 104 # If you expect to handle large problem sizes (beyond about 30000x30000) uncomment
 105 # this line and adjust the (32<<n) factor if necessary. Usually an insufficient value
 106 # manifests itself as a crash in the relevant scal kernel (sscal_k, dscal_k etc)
 107 # BUFFERSIZE = 25
 108
 109 # If you don't need to install the static library, please comment this in.
 110 # NO_STATIC = 1
 111
 112 # If you don't need to generate the shared library, please comment this in.
 113 # NO_SHARED = 1
 114
 115 # If you don't need the CBLAS interface, please comment this in.
 116 # NO_CBLAS = 1
 117
 118 # If you only want the CBLAS interface without installing a Fortran compiler,
 119 # please comment this in.
 120 # ONLY_CBLAS = 1
 121
 122 # If you don't need LAPACK, please comment this in.
 123 # If you set NO_LAPACK=1, the build system automatically sets NO_LAPACKE=1.
 124 # NO_LAPACK = 1
 125
 126 # If you don't need LAPACKE (C Interface to LAPACK), please comment this in.
 127 # NO_LAPACKE = 1
 128
 129 # Build LAPACK Deprecated functions since LAPACK 3.6.0
 130 BUILD_LAPACK_DEPRECATED = 1
 131
 132 # Build RecursiveLAPACK on top of LAPACK
 133 # BUILD_RELAPACK = 1
 134
 135 # If you want to use the legacy threaded Level 3 implementation.
 136 # USE_SIMPLE_THREADED_LEVEL3 = 1
 137
 138 # If you want to use the new, still somewhat experimental code that uses
 139 # thread-local storage instead of a central memory buffer in memory.c
 140 # Note that if your system uses GLIBC, it needs to have at least glibc 2.21
 141 # for this to work.
 142 # USE_TLS = 1
 143
 144 # If you want to drive whole 64bit region by BLAS. Not all Fortran
 145 # compilers support this. It's safe to keep this commented out if you
 146 # are not sure. (This is equivalent to the "-i8" ifort option).
 147 # INTERFACE64 = 1
 148
 149 # Unfortunately, most kernels won't give us a high quality buffer.
 150 # BLAS tries to find the best region before entering the main function,
 151 # but this takes time. If you don't like it, you can disable it.
 152 NO_WARMUP = 1
 153
 154 # Comment this in if you want to disable OpenBLAS's CPU/Memory affinity handling.
 155 # This feature is only implemented on Linux, and is always disabled on other platforms.
 156 # Enabling affinity handling may improve performance, especially on NUMA systems, but
 157 # it may conflict with certain applications that also try to manage affinity.
 158 # This conflict can result in threads of the application calling OpenBLAS ending up locked
 159 # to the same core(s) as OpenBLAS, possibly binding all threads to a single core.
 160 # For this reason, affinity handling is disabled by default. Can be safely enabled if nothing
 161 # else modifies affinity settings.
 162 # Note: enabling affinity has been known to cause problems with NumPy and R
 163 NO_AFFINITY = 1
 164
 165 # Comment this in if you compile for Linux with more than 16 NUMA nodes or more than 256 CPUs.
 166 # BIGNUMA = 1
 167
 168 # Don't use AVX kernel on Sandy Bridge. It is compatible with old compilers
 169 # and OS. However, the performance is low.
 170 # NO_AVX = 1
 171
 172 # Don't use Haswell optimizations if binutils is too old (e.g. RHEL6)
 173 # NO_AVX2 = 1
 174
 175 # Don't use SkylakeX optimizations if binutils or compiler are too old (the build
 176 # system will try to determine this automatically)
 177 # NO_AVX512 = 1
 178
 179 # Don't use parallel make.
 180 # NO_PARALLEL_MAKE = 1
 181
 182 # Force number of make jobs. The default is the number of logical CPU of the host.
 183 # This is particularly useful when using distcc.
 184 # A negative value will disable adding a -j flag to make, allowing the use of a parent
 185 # make -j value. This is useful when calling the OpenBLAS make from another project's
 186 # makefile.
 187 # MAKE_NB_JOBS = 2
 188
 189 # If you would like to get a detailed performance report of GotoBLAS.
 190 # FUNCTION_PROFILE = 1
 191
 192 # Support for IEEE quad precision (it's *real* REAL*16) (under testing)
 193 # This option should not be used - it is a holdover from unfinished code present
 194 # in the original GotoBLAS2 library that may be usable as a starting point but
 195 # is not even expected to compile in its present form.
 196 # QUAD_PRECISION = 1
 197
 198 # Threads keep running for a while after finishing a BLAS operation
 199 # to reduce thread activate/deactivate overhead. You can set this
 200 # timeout to improve performance. The number should be from 4 to 30,
 201 # which corresponds to (1 << n) cycles. For example, if you set it to 26,
 202 # a thread will keep running for (1 << 26) cycles (about 25ms on a 3.0GHz
 203 # system). You can also control this number via THREAD_TIMEOUT
 204 # CCOMMON_OPT   += -DTHREAD_TIMEOUT=26
 205
 206 # Using special device driver for mapping physically contiguous memory
 207 # to the user space. If bigphysarea is enabled, it will use it.
 208 # DEVICEDRIVER_ALLOCATION = 1
 209
 210 # If you need to synchronize FP CSR between threads (for x86/x86_64 only).
 211 # CONSISTENT_FPCSR = 1
 212
 213 # If any gemm argument m, n or k is less than or equal to this threshold, gemm will be executed
 214 # with a single thread. (Actually in recent versions this is a factor proportional to the
 215 # number of floating point operations necessary for the given problem size, no longer
 216 # an individual dimension). You can use this setting to avoid the overhead of multi-
 217 # threading in small matrix sizes. The default value is 4, but values as high as 50 have
 218 # been reported to be optimal for certain workloads (50 is the recommended value for Julia).
 219 # GEMM_MULTITHREAD_THRESHOLD = 4
 220
 221 # If you need sanity check by comparing results to reference BLAS. It'll be very
 222 # slow (Not implemented yet).
 223 # SANITY_CHECK = 1
 224
 225 # The installation directory.
 226 # PREFIX = /opt/OpenBLAS
 227
 228 # Common Optimization Flag;
 229 # The default -O2 is enough.
 230 # Flags for POWER8 are defined in Makefile.power. Don't modify COMMON_OPT
 231 # COMMON_OPT = -O2
 232
 233 # gfortran option for LAPACK to improve thread-safety
 234 # It is enabled by default in Makefile.system for gfortran
 235 # Flags for POWER8 are defined in Makefile.power. Don't modify FCOMMON_OPT
 236 # FCOMMON_OPT = -frecursive
 237
 238 # Profiling flags
 239 COMMON_PROF = -pg
 240
 241 # Build Debug version
 242 # DEBUG = 1
 243
 244 # Set maximum stack allocation.
 245 # The default value is 2048. 0 disables stack allocation and may reduce GER and GEMV
 246 # performance. For details, https://github.com/xianyi/OpenBLAS/pull/482
 247 #
 248 # MAX_STACK_ALLOC = 0
 249
 250 # Add a prefix or suffix to all exported symbol names in the shared library.
 251 # Avoid conflicts with other BLAS libraries, especially when using
 252 # 64 bit integer interfaces in OpenBLAS.
 253 # For details, https://github.com/xianyi/OpenBLAS/pull/459
 254 #
 255 # The same prefix and suffix are also added to the library name,
 256 # i.e. you get lib$(SYMBOLPREFIX)openblas$(SYMBOLSUFFIX) rather than libopenblas
 257 #
 258 # SYMBOLPREFIX=
 259 # SYMBOLSUFFIX=
 260
 261 # Run a C++ based thread safety tester after the build is done.
 262 # This is mostly intended as a developer feature to spot regressions, but users and
 263 # package maintainers can enable this if they have doubts about the thread safety of
 264 # the library, given the configuration in this file.
 265 # By default, the thread safety tester launches 52 concurrent calculations at the same
 266 # time.
 267 #
 268 # Please note that the test uses ~1300 MiB of RAM for the DGEMM test.
 269 #
 270 # The test requires CBLAS to be built, a C++11 capable compiler and the presence of
 271 # an OpenMP implementation. If you are cross-compiling this test will probably not
 272 # work at all.
 273 #
 274 # CPP_THREAD_SAFETY_TEST = 1
 275 #
 276 # use this to run only the less memory-hungry GEMV test
 277 # CPP_THREAD_SAFETY_GEMV = 1
 278
 279
 280 # If you want to enable the experimental BFLOAT16 support
 281 # BUILD_BFLOAT16 = 1
 282
 283
 284 # Set the thread number threshold beyond which the job array for the threaded level3 BLAS
 285 # will be allocated on the heap rather than the stack. (This array alone requires
 286 # NUM_THREADS*NUM_THREADS*128 bytes of memory so should not pose a problem at low cpu
 287 # counts, but obviously it is not the only item that ends up on the stack.)
 288 # The default value of 32 ensures that the overall requirement is compatible
 289 # with the default 1MB stacksize imposed by having the Java VM loaded without use
 290 # of its -Xss parameter.
 291 # The value of 160 formerly used from about version 0.2.7 until 0.3.10 is easily compatible
 292 # with the common Linux stacksize of 8MB but will cause crashes with unwary use of the java
 293 # VM e.g. in Octave or with the java-based libhdfs in numpy or scipy code
 294 # BLAS3_MEM_ALLOC_THRESHOLD = 160
 295
 296
 297
 298 # By default the library contains BLAS functions (and LAPACK if selected) for all input types.
 299 # To build a smaller library supporting e.g. only single precision real (SGEMM etc.) or only
 300 # the functions for complex numbers, uncomment the desired type(s) below
 301 # BUILD_SINGLE = 1
 302 # BUILD_DOUBLE = 1
 303 # BUILD_COMPLEX = 1
 304 # BUILD_COMPLEX16 = 1
 305 #
 306 #  End of user configuration
 307 #