// SPDX-License-Identifier: Apache-2.0
/**
 * Copyright (C) 2020 Jijoong Moon <jijoong.moon@samsung.com>
 *
 * @file blas_interface.cpp
 * @see https://github.com/nnstreamer/nntrainer
 * @author Jijoong Moon <jijoong.moon@samsung.com>
 * @bug No known bugs except for NYI items
 * @brief BLAS interface: cblas / cuBLAS / NEON fp16 / raw-loop fallbacks
 */

#include <blas_interface.h>
#include <nntrainer_error.h>

#include <cmath>   // std::sqrt, std::fabs
#include <cstdlib> // abs

// The NEON kernels exist only on fp16-enabled Arm builds; this guard is an
// assumption consistent with the USE__FP16 checks below.
#if (defined USE__FP16 && USE_NEON)
#include <blas_neon.h>
#endif
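
/**
 * The macros below implement the scalar fallback kernels.
 * sgemv_loop / sgemv_loop_fp16 intentionally index A as A[i + j * lda]
 * (row j, column i of row-major A) rather than via the macro parameters:
 * call sites pass (i, j) for the transposed case and (j, i) for the
 * non-transposed case, so one body serves both. The inner product is scaled
 * by alpha so the scalar path matches cblas_sgemv semantics.
 */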
#define sgemv_loop(ci, cj, cM, cN)                   \
  do {                                               \
    double y0;                                       \
    unsigned int i, j;                               \
    for (ci = 0; ci != cM; ci++) {                   \
      y0 = Y[ci * incy] * beta;                      \
      for (cj = 0; cj != cN; cj++)                   \
        y0 += alpha * A[i + j * lda] * X[cj * incx]; \
      Y[ci * incy] = y0;                             \
    }                                                \
  } while (0);
#define sgemv_loop_fp16(ci, cj, cM, cN)                            \
  do {                                                             \
    _FP16 y0;                                                      \
    unsigned int i, j;                                             \
    for (ci = 0; ci != cM; ci++) {                                 \
      y0 = Y[ci * incy] * static_cast<_FP16>(beta);                \
      for (cj = 0; cj != cN; cj++)                                 \
        y0 += static_cast<_FP16>(alpha) * A[i + j * lda] *         \
              X[cj * incx];                                        \
      Y[ci * incy] = y0;                                           \
    }                                                              \
  } while (0);
#define saxpy_loop_fp16()                                                  \
  do {                                                                     \
    unsigned int i;                                                        \
    for (i = 0; i < N; ++i)                                                \
      Y[i * incY] = Y[i * incY] + static_cast<_FP16>(alpha) * X[i * incX]; \
  } while (0);
#define sgemm_loop_fp16()                                                 \
  do {                                                                    \
    for (unsigned int m = 0; m < M; ++m) {                                \
      for (unsigned int n = 0; n < N; ++n) {                              \
        _FP16 c = 0;                                                      \
        _FP16 c_old = C[m * ldc + n];                                     \
        for (unsigned int k = 0; k < K; ++k) {                            \
          _FP16 a, b;                                                     \
          a = ((TransA == CblasTrans) ? A[k * lda + m] : A[m * lda + k]); \
          b = ((TransB == CblasTrans) ? B[n * ldb + k] : B[k * ldb + n]); \
          c += a * b;                                                     \
        }                                                                 \
        C[m * ldc + n] = static_cast<_FP16>(alpha) * c;                   \
        if (beta != 0.0)                                                  \
          C[m * ldc + n] += static_cast<_FP16>(beta) * c_old;             \
      }                                                                   \
    }                                                                     \
  } while (0);

namespace nntrainer {

#ifdef ENABLE_FP16
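/**
 * @brief Half-precision AXPY: Y := alpha * X + Y.
 *        Uses the NEON kernel for contiguous vectors on fp16-enabled builds
 *        and the scalar loop otherwise.
 */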
static void saxpy_FP16(const unsigned int N, const float alpha, const _FP16 *X,
                       const int incX, _FP16 *Y, const int incY) {
  if (incX < 0 or incY < 0)
    throw std::invalid_argument(
      "Error: negative inc not supported without cblas");

#ifdef USE__FP16
  // USE__FP16 is defined when the platform is Android (fp16 NEON available).
  if (incX == 1 && incY == 1) {
    nntrainer::neon::saxpy_neon_fp16(N, alpha, X, Y);
  } else {
    saxpy_loop_fp16();
  }
#else
  saxpy_loop_fp16();
#endif
}
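
/**
 * @brief Half-precision GEMV: Y := alpha * op(A) * X + beta * Y.
 *        The NEON path requires contiguous vectors and N to be a multiple
 *        of 8; anything else falls back to sgemv_loop_fp16.
 */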
static void sgemv_FP16(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA,
                       const unsigned int M, const unsigned int N,
                       const float alpha, const _FP16 *A,
                       const unsigned int lda, const _FP16 *X, const int incX,
                       const float beta, _FP16 *Y, const int incY) {

  unsigned int incy = abs(incY);
  unsigned int incx = abs(incX);

  if (TransA == CblasTrans) {
#ifdef USE__FP16
    // N % 8 == 0 also covers N % 16 == 0.
    if (incX == 1 && incY == 1 && N % 8 == 0) {
      nntrainer::neon::sgemv_transpose_neon_fp16(A, X, Y, M, N, alpha, beta);
    } else {
      sgemv_loop_fp16(i, j, N, M);
    }
#else
    sgemv_loop_fp16(i, j, N, M);
#endif
  } else {
#ifdef USE__FP16
    if (incX == 1 && incY == 1 && N % 8 == 0) {
      nntrainer::neon::sgemv_neon_fp16(A, X, Y, M, N, alpha, beta);
    } else {
      sgemv_loop_fp16(j, i, M, N);
    }
#else
    sgemv_loop_fp16(j, i, M, N);
#endif
  }
}
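
/**
 * @brief Half-precision dot product: returns sum_i X[i * incX] * Y[i * incY].
 */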
static _FP16 sdot_FP16(const unsigned int N, const _FP16 *X,
                       const unsigned int incX, const _FP16 *Y,
                       const unsigned int incY) {

  if (incX < 0 or incY < 0)
    throw std::invalid_argument("Error: negative inc not supported");

  _FP16 ret = 0;

#ifdef USE__FP16
  if (incX == 1 && incY == 1) {
    ret = nntrainer::neon::sdot_neon_fp16(N, X, Y);
  } else {
    for (unsigned int i = 0; i < N; ++i) {
      ret += X[i * incX] * Y[i * incY];
    }
  }
#else
  for (unsigned int i = 0; i < N; ++i) {
    ret += X[i * incX] * Y[i * incY];
  }
#endif

  return ret;
}
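
/**
 * @brief Half-precision vector copy: Y := X, honoring the absolute strides.
 */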
static void scopy_FP16(const unsigned int N, const _FP16 *X, const int incX,
                       _FP16 *Y, const int incY) {
  unsigned int incy = abs(incY);
  unsigned int incx = abs(incX);

#ifdef USE__FP16
  if (incX == 1 && incY == 1) {
    nntrainer::neon::scopy_neon_fp16(N, X, Y);
  } else {
    for (unsigned int i = 0; i < N; ++i)
      Y[i * incy] = X[i * incx];
  }
#else
  for (unsigned int i = 0; i < N; ++i)
    Y[i * incy] = X[i * incx];
#endif
}
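
/**
 * @brief Half-precision SCAL: X := alpha * X.
 */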
void sscal(const unsigned int N, const float alpha, _FP16 *X, const int incX) {
  unsigned int incx = abs(incX);

#ifdef USE__FP16
  if (incX == 1) { /* contiguous fast path; the NEON kernel has no stride */
    nntrainer::neon::sscal_neon_fp16(N, X, alpha);
  } else {
    for (unsigned int i = 0; i < N; ++i)
      X[i * incx] = static_cast<_FP16>(alpha) * X[i * incx];
  }
#else
  for (unsigned int i = 0; i < N; ++i)
    X[i * incx] = static_cast<_FP16>(alpha) * X[i * incx];
#endif
}
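
/**
 * @brief Half-precision Euclidean norm: returns sqrt(sum_i X[i]^2).
 *        Both paths accumulate the sum of squares (the NEON kernel is
 *        assumed to return it unrooted, since sqrt is applied once at the
 *        end).
 */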
static _FP16 snrm2_FP16(const unsigned int N, const _FP16 *X, const int incX) {
  unsigned int incx = abs(incX);

  _FP16 sum = 0;

#ifdef USE__FP16
  if (incX == 1) {
    sum = nntrainer::neon::snrm2_neon_fp16(N, X);
  } else {
    for (unsigned int i = 0; i < N; i++) {
      _FP16 t = X[i * incx];
      sum += t * t;
    }
  }
#else
  for (unsigned int i = 0; i < N; i++) {
    _FP16 t = X[i * incx];
    sum += t * t;
  }
#endif

  return static_cast<_FP16>(std::sqrt(sum));
}
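
/**
 * @brief Half-precision GEMM: C := alpha * op(A) * op(B) + beta * C.
 *        The NEON kernel is used only when both N and K are multiples of 8.
 */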
static void sgemm_FP16(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA,
                       CBLAS_TRANSPOSE TransB, const unsigned int M,
                       const unsigned int N, const unsigned int K,
                       const float alpha, const _FP16 *A,
                       const unsigned int lda, const _FP16 *B,
                       const unsigned int ldb, const float beta, _FP16 *C,
                       const unsigned int ldc) {

#ifdef USE__FP16
  if ((N % 8 == 0) && (K % 8 == 0)) {
    nntrainer::neon::sgemm_neon_fp16(A, B, C, M, N, K, alpha, beta,
                                     TransA == CblasTrans,
                                     TransB == CblasTrans);
  } else {
    sgemm_loop_fp16();
  }
#else
  sgemm_loop_fp16();
#endif
}
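
/**
 * @brief Index of the element with the largest absolute value.
 */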
static unsigned int isamax_FP16(const unsigned int N, const _FP16 *X,
                                const int incX) {

  unsigned int max_idx = 0;

#ifdef USE__FP16
  if (incX == 1 && N >= 8) {
    max_idx = nntrainer::neon::isamax_neon_fp16(N, X);
  } else {
    _FP16 max_val = (X[0] >= 0) ? X[0] : -1 * X[0];
    for (unsigned int n = 1; n < N; n += incX) {
      _FP16 cur_val = (X[n] >= 0) ? X[n] : -1 * X[n];
      if (cur_val > max_val) {
        max_val = cur_val;
        max_idx = n;
      }
    }
  }
#else
  _FP16 max_val = (X[0] >= 0) ? X[0] : -1 * X[0];
  for (unsigned int n = 1; n < N; n += incX) {
    _FP16 cur_val = (X[n] >= 0) ? X[n] : -1 * X[n];
    if (cur_val > max_val) {
      max_val = cur_val;
      max_idx = n;
    }
  }
#endif

  return max_idx;
}
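
/* Public fp16 entry points: thin wrappers over the static helpers above. */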
void saxpy(const unsigned int N, const float alpha, const _FP16 *X,
           const int incX, _FP16 *Y, const int incY) {
  saxpy_FP16(N, alpha, X, incX, Y, incY);
}

void sgemm(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA, CBLAS_TRANSPOSE TransB,
           const unsigned int M, const unsigned int N, const unsigned int K,
           const float alpha, const _FP16 *A, const unsigned int lda,
           const _FP16 *B, const unsigned int ldb, const float beta, _FP16 *C,
           const unsigned int ldc) {
  sgemm_FP16(order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C,
             ldc);
}

void scopy(const unsigned int N, const _FP16 *X, const int incX, _FP16 *Y,
           const int incY) {
  scopy_FP16(N, X, incX, Y, incY);
}

_FP16 snrm2(const int N, const _FP16 *X, const int incX) {
  return snrm2_FP16(N, X, incX);
}

_FP16 sdot(const unsigned int N, const _FP16 *X, const unsigned int incX,
           const _FP16 *Y, const unsigned int incY) {
  return sdot_FP16(N, X, incX, Y, incY);
}

void sgemv(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA, const unsigned int M,
           const unsigned int N, const float alpha, const _FP16 *A,
           const unsigned int lda, const _FP16 *X, const int incX,
           const float beta, _FP16 *Y, const int incY) {
  sgemv_FP16(order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY);
}

unsigned int isamax(const unsigned int N, const _FP16 *X, const int incX) {
  /// @todo isamax_FP16 for BLAS_NUM_THREADS
  return isamax_FP16(N, X, incX);
}
#endif // ENABLE_FP16
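
/*
 * Raw (plain C++ loop) fp32 fallbacks, used when no cblas backend is
 * available.
 */

/**
 * @brief FP32 AXPY fallback: Y := alpha * X + Y.
 */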
static void saxpy_raw(const unsigned int N, const float alpha, const float *X,
                      const int incX, float *Y, const int incY) {
  if (incX < 0 or incY < 0)
    throw std::invalid_argument(
      "Error: negative inc not supported without cblas");
  for (unsigned int i = 0; i < N; ++i)
    Y[i * incY] = Y[i * incY] + X[i * incX] * alpha;
}
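
/**
 * @brief FP32 GEMV fallback: Y := alpha * op(A) * X + beta * Y, implemented
 *        with the shared sgemv_loop macro.
 */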
static void sgemv_raw(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA,
                      const unsigned int M, const unsigned int N,
                      const float alpha, const float *A, const unsigned int lda,
                      const float *X, const int incX, const float beta,
                      float *Y, const int incY) {

  unsigned int incy = abs(incY);
  unsigned int incx = abs(incX);

  if (TransA == CblasTrans) {
    sgemv_loop(i, j, N, M);
  } else {
    sgemv_loop(j, i, M, N);
  }
}
static float sdot_raw(const unsigned int N, const float *X,
                      const unsigned int incX, const float *Y,
                      const unsigned int incY) {
  float ret = 0;
  for (unsigned int i = 0; i < N; ++i) {
    ret += X[i * incX] * Y[i * incY];
  }
  return ret;
}
static void scopy_raw(const unsigned int N, const float *X, const int incX,
                      float *Y, const int incY) {
  unsigned int incy = abs(incY);
  unsigned int incx = abs(incX);

  for (unsigned int i = 0; i < N; ++i)
    Y[i * incy] = X[i * incx];
}
static void sscal_raw(const unsigned int N, const float alpha, float *X,
                      const int incX) {
  unsigned int incx = abs(incX);

  for (unsigned int i = 0; i < N; ++i)
    X[i * incx] = alpha * X[i * incx];
}
static float snrm2_raw(const unsigned int N, const float *X, const int incX) {
  unsigned int incx = abs(incX);

  float sum = 0.0f;
  for (unsigned int i = 0; i < N; i++) {
    float t = X[i * incx];
    sum += t * t;
  }
  return std::sqrt(sum);
}
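
/**
 * @brief FP32 GEMM fallback: C := alpha * op(A) * op(B) + beta * C, where
 *        op(M) is M or M^T according to TransA / TransB. Each output element
 *        is accumulated in double before scaling.
 */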
static void sgemm_raw(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA,
                      CBLAS_TRANSPOSE TransB, const unsigned int M,
                      const unsigned int N, const unsigned int K,
                      const float alpha, const float *A, const unsigned int lda,
                      const float *B, const unsigned int ldb, const float beta,
                      float *C, const unsigned int ldc) {

  for (unsigned int m = 0; m < M; ++m) {
    for (unsigned int n = 0; n < N; ++n) {
      double c = 0.0;
      float c_old = C[m * ldc + n];
      for (unsigned int k = 0; k < K; ++k) {
        float a, b;
        a = ((TransA == CblasTrans) ? A[k * lda + m] : A[m * lda + k]);
        b = ((TransB == CblasTrans) ? B[n * ldb + k] : B[k * ldb + n]);
        c += a * b;
      }
      C[m * ldc + n] = alpha * c;
      if (beta != 0.0)
        C[m * ldc + n] += beta * c_old;
    }
  }
}
static unsigned int isamax_raw(const unsigned int N, const float *X,
                               const int incX) {

  unsigned int max_idx = 0;
  float max_val = std::fabs(X[0]);
  for (unsigned int n = 1; n < N; n += incX) {
    float cur_val = std::fabs(X[n]);
    if (cur_val > max_val) {
      max_val = cur_val;
      max_idx = n;
    }
  }
  return max_idx;
}
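
/*
 * Public dispatch layer. Each entry point routes to cuBLAS, cblas, or the
 * raw loops according to build flags; the void* overloads additionally
 * dispatch on the runtime tensor data type.
 */

/**
 * @brief SCAL with runtime data-type dispatch: X := alpha * X.
 */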
void sscal(const unsigned int N, const float alpha, void *X, const int incX,
           ml::train::TensorDim::DataType d_type) {

  if (d_type == ml::train::TensorDim::DataType::FP32) {
#ifdef USE_BLAS
#ifdef BLAS_NUM_THREADS
    openblas_set_num_threads(BLAS_NUM_THREADS);
#endif // BLAS_NUM_THREADS
    cblas_sscal(N, alpha, static_cast<float *>(X), incX);
#else  // USE_BLAS else
    sscal_raw(N, alpha, static_cast<float *>(X), incX);
#endif // USE_BLAS
  } else if (d_type == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
    sscal(N, alpha, static_cast<_FP16 *>(X), incX);
#else
    throw std::invalid_argument("Error: enable-fp16 is not enabled");
#endif
  }
}
void sscal(const unsigned int N, const float alpha, float *X, const int incX) {
#ifdef USE_BLAS
#ifdef BLAS_NUM_THREADS
  openblas_set_num_threads(BLAS_NUM_THREADS);
#endif
  cblas_sscal(N, alpha, X, incX);
#else
  sscal_raw(N, alpha, X, incX);
#endif
}
void saxpy(const unsigned int N, const float alpha, const void *X,
           const int incX, void *Y, const int incY,
           ml::train::TensorDim::DataType d_type) {
  if (d_type == ml::train::TensorDim::DataType::FP32) {
#ifdef USE_BLAS
#ifdef BLAS_NUM_THREADS
    openblas_set_num_threads(BLAS_NUM_THREADS);
#endif
    cblas_saxpy(N, alpha, static_cast<const float *>(X), incX,
                static_cast<float *>(Y), incY);
#else
    saxpy_raw(N, alpha, static_cast<const float *>(X), incX,
              static_cast<float *>(Y), incY);
#endif
  } else if (d_type == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
    saxpy_FP16(N, alpha, static_cast<const _FP16 *>(X), incX,
               static_cast<_FP16 *>(Y), incY);
#else
    throw std::invalid_argument("Error: enable-fp16 is not enabled");
#endif
  }
}
void saxpy(const unsigned int N, const float alpha, const float *X,
           const int incX, float *Y, const int incY) {
#ifdef USE_BLAS
#ifdef BLAS_NUM_THREADS
  openblas_set_num_threads(BLAS_NUM_THREADS);
#endif
  cblas_saxpy(N, alpha, X, incX, Y, incY);
#else
  saxpy_raw(N, alpha, X, incX, Y, incY);
#endif
}
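
/**
 * @brief GEMM with runtime data-type dispatch:
 *        C := alpha * op(A) * op(B) + beta * C.
 *
 * A minimal usage sketch (row-major, no transpose; the 2x3 / 3x2 shapes and
 * values are illustrative only):
 * @code
 *   float A[6] = {1, 2, 3, 4, 5, 6}; // 2x3
 *   float B[6] = {1, 0, 0, 1, 1, 1}; // 3x2
 *   float C[4] = {0, 0, 0, 0};       // 2x2 result
 *   nntrainer::sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, 2, 2, 3,
 *                    1.0f, A, 3, B, 2, 0.0f, C, 2,
 *                    ml::train::TensorDim::DataType::FP32);
 * @endcode
 */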
void sgemm(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA, CBLAS_TRANSPOSE TransB,
           const unsigned int M, const unsigned int N, const unsigned int K,
           const float alpha, const void *A, const unsigned int lda,
           const void *B, const unsigned int ldb, const float beta, void *C,
           const unsigned int ldc, ml::train::TensorDim::DataType d_type) {

  if (d_type == ml::train::TensorDim::DataType::FP32) {
#ifdef USE_CUBLAS
    int devID = 0;
    cudaDeviceProp deviceProp;
    cudaGetDeviceProperties(&deviceProp, devID);
    float *d_A, *d_B, *d_C;

    unsigned int size_A = M * K * sizeof(float);
    unsigned int size_B = K * N * sizeof(float);
    unsigned int size_C = M * N * sizeof(float);

    cudaMalloc((void **)&d_A, size_A);
    cudaMalloc((void **)&d_B, size_B);
    cudaMemcpy(d_A, A, size_A, cudaMemcpyHostToDevice);
    cudaMemcpy(d_B, B, size_B, cudaMemcpyHostToDevice);
    cudaMalloc((void **)&d_C, size_C);

    cublasHandle_t handle;
    cublasCreate(&handle);

    // Row-major GEMM is mapped onto column-major cuBLAS by computing
    // C^T = B^T * A^T.
    cublasOperation_t transA =
      (TransA == CblasTrans) ? CUBLAS_OP_T : CUBLAS_OP_N;
    cublasOperation_t transB =
      (TransB == CblasTrans) ? CUBLAS_OP_T : CUBLAS_OP_N;
    cublasSgemm(handle, transA, transB, N, M, K, &alpha, d_B, N, d_A, K, &beta,
                d_C, N);

    cudaMemcpy(C, d_C, size_C, cudaMemcpyDeviceToHost);
    cublasDestroy(handle);
    // Release device buffers.
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
#elif defined USE_BLAS
#ifdef BLAS_NUM_THREADS
    openblas_set_num_threads(BLAS_NUM_THREADS);
#endif
    cblas_sgemm(
      order, TransA, TransB, M, N, K, alpha, static_cast<const float *>(A), lda,
      static_cast<const float *>(B), ldb, beta, static_cast<float *>(C), ldc);
#else
    sgemm_raw(order, TransA, TransB, M, N, K, alpha,
              static_cast<const float *>(A), lda, static_cast<const float *>(B),
              ldb, beta, static_cast<float *>(C), ldc);
#endif
  } else if (d_type == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
    sgemm_FP16(
      order, TransA, TransB, M, N, K, alpha, static_cast<const _FP16 *>(A), lda,
      static_cast<const _FP16 *>(B), ldb, beta, static_cast<_FP16 *>(C), ldc);
#else
    throw std::invalid_argument("Error: enable-fp16 is not enabled");
#endif
  }
}
void sgemm(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA, CBLAS_TRANSPOSE TransB,
           const unsigned int M, const unsigned int N, const unsigned int K,
           const float alpha, const float *A, const unsigned int lda,
           const float *B, const unsigned int ldb, const float beta, float *C,
           const unsigned int ldc) {

#ifdef USE_CUBLAS
  int devID = 0;
  cudaDeviceProp deviceProp;
  cudaGetDeviceProperties(&deviceProp, devID);
  float *d_A, *d_B, *d_C;

  unsigned int size_A = M * K * sizeof(float);
  unsigned int size_B = K * N * sizeof(float);
  unsigned int size_C = M * N * sizeof(float);

  cudaMalloc((void **)&d_A, size_A);
  cudaMalloc((void **)&d_B, size_B);
  cudaMemcpy(d_A, A, size_A, cudaMemcpyHostToDevice);
  cudaMemcpy(d_B, B, size_B, cudaMemcpyHostToDevice);
  cudaMalloc((void **)&d_C, size_C);

  cublasHandle_t handle;
  cublasCreate(&handle);

  cublasOperation_t transA = (TransA == CblasTrans) ? CUBLAS_OP_T : CUBLAS_OP_N;
  cublasOperation_t transB = (TransB == CblasTrans) ? CUBLAS_OP_T : CUBLAS_OP_N;
  cublasSgemm(handle, transA, transB, N, M, K, &alpha, d_B, N, d_A, K, &beta,
              d_C, N);

  cudaMemcpy(C, d_C, size_C, cudaMemcpyDeviceToHost);
  cublasDestroy(handle);
  // Release device buffers.
  cudaFree(d_A);
  cudaFree(d_B);
  cudaFree(d_C);
#elif defined USE_BLAS
#ifdef BLAS_NUM_THREADS
  openblas_set_num_threads(BLAS_NUM_THREADS);
#endif
  cblas_sgemm(order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C,
              ldc);
#else
  sgemm_raw(order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C,
            ldc);
#endif
}
void scopy(const unsigned int N, const void *X, const int incX, void *Y,
           const int incY, ml::train::TensorDim::DataType d_type) {

  if (d_type == ml::train::TensorDim::DataType::FP32) {
#ifdef USE_BLAS
#ifdef BLAS_NUM_THREADS
    openblas_set_num_threads(BLAS_NUM_THREADS);
#endif
    cblas_scopy(N, static_cast<const float *>(X), incX,
                static_cast<float *>(Y), incY);
#else
    scopy_raw(N, static_cast<const float *>(X), incX,
              static_cast<float *>(Y), incY);
#endif
  } else if (d_type == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
    scopy_FP16(N, static_cast<const _FP16 *>(X), incX,
               static_cast<_FP16 *>(Y), incY);
#else
    throw std::invalid_argument("Error: enable-fp16 is not enabled");
#endif
  }
}
void scopy(const unsigned int N, const float *X, const int incX, float *Y,
           const int incY) {
#ifdef USE_BLAS
#ifdef BLAS_NUM_THREADS
  openblas_set_num_threads(BLAS_NUM_THREADS);
#endif
  cblas_scopy(N, X, incX, Y, incY);
#else
  scopy_raw(N, X, incX, Y, incY);
#endif
}
float snrm2(const int N, const float *X, const int incX) {
#ifdef USE_BLAS
#ifdef BLAS_NUM_THREADS
  openblas_set_num_threads(BLAS_NUM_THREADS);
#endif
  return cblas_snrm2(N, X, incX);
#else
  return snrm2_raw(N, X, incX);
#endif
}
float sdot(const unsigned int N, const float *X, const unsigned int incX,
           const float *Y, const unsigned int incY) {
#ifdef USE_BLAS
#ifdef BLAS_NUM_THREADS
  openblas_set_num_threads(BLAS_NUM_THREADS);
#endif
  return cblas_sdot(N, X, incX, Y, incY);
#else
  return sdot_raw(N, X, incX, Y, incY);
#endif
}
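
/**
 * @brief GEMV with runtime data-type dispatch:
 *        Y := alpha * op(A) * X + beta * Y.
 */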
void sgemv(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA, const unsigned int M,
           const unsigned int N, const float alpha, const void *A,
           const unsigned int lda, const void *X, const int incX,
           const float beta, void *Y, const int incY,
           ml::train::TensorDim::DataType d_type) {
  if (d_type == ml::train::TensorDim::DataType::FP32) {
#ifdef USE_BLAS
#ifdef BLAS_NUM_THREADS
    openblas_set_num_threads(BLAS_NUM_THREADS);
#endif
    return cblas_sgemv(
      order, TransA, M, N, alpha, static_cast<const float *>(A), lda,
      static_cast<const float *>(X), incX, beta, static_cast<float *>(Y), incY);
#else
    return sgemv_raw(order, TransA, M, N, alpha, static_cast<const float *>(A),
                     lda, static_cast<const float *>(X), incX, beta,
                     static_cast<float *>(Y), incY);
#endif
  } else if (d_type == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
    return sgemv_FP16(order, TransA, M, N, alpha, static_cast<const _FP16 *>(A),
                      lda, static_cast<const _FP16 *>(X), incX, beta,
                      static_cast<_FP16 *>(Y), incY);
#else
    throw std::invalid_argument("Error: enable-fp16 is not enabled");
#endif
  }
}
void sgemv(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA, const unsigned int M,
           const unsigned int N, const float alpha, const float *A,
           const unsigned int lda, const float *X, const int incX,
           const float beta, float *Y, const int incY) {
#ifdef USE_BLAS
#ifdef BLAS_NUM_THREADS
  openblas_set_num_threads(BLAS_NUM_THREADS);
#endif
  return cblas_sgemv(order, TransA, M, N, alpha, A, lda, X, incX, beta, Y,
                     incY);
#else
  return sgemv_raw(order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY);
#endif
}
unsigned int isamax(const unsigned int N, const float *X, const int incX) {
#ifdef USE_BLAS
#ifdef BLAS_NUM_THREADS
  openblas_set_num_threads(BLAS_NUM_THREADS);
#endif
  return cblas_isamax(N, X, incX);
#else
  return isamax_raw(N, X, incX);
#endif
}

} // namespace nntrainer