nntrainer_conf.set('CAPI_ML_COMMON_DEP', '')
extra_defines += '-DML_API_COMMON=0'
endif
-
-
blas_dep = dummy_dep
# Dependencies
if get_option('enable-cublas')
# BLAS backend selection, active only when the 'enable-blas' option is set.
# NOTE(review): lines beginning with '-' / '+' appear to be diff markers
# (removed / added by the patch); unmarked lines are unchanged context.
if get_option('enable-blas')
# Expose BLAS availability to the sources as the USE_BLAS preprocessor define.
extra_defines += '-DUSE_BLAS=1'
# Removed by the patch: a dedicated tizen/yocto branch that resolved 'openblas'.
- if get_option('platform') == 'tizen' or get_option('platform') == 'yocto'
- blas_dep = dependency('openblas')
- elif get_option('platform') == 'android'
+
+ if get_option('platform') == 'android'
# Android: run jni/prepare_openblas.sh against the build root at configure
# time; check: true makes a failing script abort configuration.
message('preparing blas')
run_command(meson.source_root() / 'jni' / 'prepare_openblas.sh', meson.build_root(), check: true)
# found_dummy_dep is defined outside this view; presumably a placeholder
# dependency that reports found() == true -- TODO confirm.
blas_dep = found_dummy_dep
blas_root = meson.build_root() / 'openblas'
else
# Removed by the patch: an optional 'blas-openblas' lookup with a fallback
# to 'openblas' when it was not found.
- blas_dep = dependency('blas-openblas', required:false)
- # for Ubuntu 20.04
- if not blas_dep.found()
- blas_dep = dependency('openblas')
# Added by the patch: every non-android platform now resolves 'openblas' directly.
+ blas_dep = dependency('openblas')
+ endif
+
# Added by the patch: when a BLAS dependency was found and the
# 'openblas-num-threads' option is greater than 0, pass its value to the
# sources as the BLAS_NUM_THREADS define and log the chosen value.
+ if blas_dep.found()
+ if get_option('openblas-num-threads') > 0
+ extra_defines += '-DBLAS_NUM_THREADS=@0@'.format(get_option('openblas-num-threads'))
+ message('set openblas num threads=@0@'.format(get_option('openblas-num-threads')))
endif
endif
endif
void saxpy(const unsigned int N, const float alpha, const float *X,
const int incX, float *Y, const int incY) {
#ifdef USE_BLAS
+#ifdef BLAS_NUM_THREADS
+ openblas_set_num_threads(BLAS_NUM_THREADS);
+#endif
cblas_saxpy(N, alpha, X, incX, Y, incY);
#else
saxpy_raw(N, alpha, X, incX, Y, incY);
cudaMemcpy(C, d_C, size_C, cudaMemcpyDeviceToHost);
cublasDestroy(handle);
#elif defined USE_BLAS
+#ifdef BLAS_NUM_THREADS
+ openblas_set_num_threads(BLAS_NUM_THREADS);
+#endif
cblas_sgemm(order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C,
ldc);
#else
void scopy(const unsigned int N, const float *X, const int incX, float *Y,
const int incY) {
#ifdef USE_BLAS
+#ifdef BLAS_NUM_THREADS
+ openblas_set_num_threads(BLAS_NUM_THREADS);
+#endif
cblas_scopy(N, X, incX, Y, incY);
#else
scopy_raw(N, X, incX, Y, incY);
void sscal(const int N, const float alpha, float *X, const int incX) {
#ifdef USE_BLAS
+#ifdef BLAS_NUM_THREADS
+ openblas_set_num_threads(BLAS_NUM_THREADS);
+#endif
cblas_sscal(N, alpha, X, incX);
#else
sscal_raw(N, alpha, X, incX);
float snrm2(const int N, const float *X, const int incX) {
#ifdef USE_BLAS
+#ifdef BLAS_NUM_THREADS
+ openblas_set_num_threads(BLAS_NUM_THREADS);
+#endif
return cblas_snrm2(N, X, incX);
#else
return snrm2_raw(N, X, incX);
float sdot(const unsigned int N, const float *X, const unsigned int incX,
const float *Y, const unsigned int incY) {
#ifdef USE_BLAS
+#ifdef BLAS_NUM_THREADS
+ openblas_set_num_threads(BLAS_NUM_THREADS);
+#endif
return cblas_sdot(N, X, incX, Y, incY);
#else
return sdot_raw(N, X, incX, Y, incY);
const unsigned int lda, const float *X, const int incX,
const float beta, float *Y, const int incY) {
#ifdef USE_BLAS
+#ifdef BLAS_NUM_THREADS
+ openblas_set_num_threads(BLAS_NUM_THREADS);
+#endif
return cblas_sgemv(order, TransA, M, N, alpha, A, lda, X, incX, beta, Y,
incY);
#else
unsigned int isamax(const unsigned int N, const float *X, const int incX) {
#ifdef USE_BLAS
+#ifdef BLAS_NUM_THREADS
+ openblas_set_num_threads(BLAS_NUM_THREADS);
+#endif
return cblas_isamax(N, X, incX);
#else
return isamax_raw(N, X, incX);