// SPDX-License-Identifier: Apache-2.0
/**
 * Copyright (C) 2020 Jijoong Moon <jijoong.moon@samsung.com>
 *
 * @file   blas_interface.cpp
 * @see    https://github.com/nnstreamer/nntrainer
 * @author Jijoong Moon <jijoong.moon@samsung.com>
 * @bug    No known bugs except for NYI items
 * @brief  BLAS interface with raw fallback implementations for builds without
 *         an external BLAS library
 */
#include <blas_interface.h>

#include <cmath>
#include <cstdlib>
#include <stdexcept>

#include <nntrainer_error.h>

namespace nntrainer {
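/**
 * @brief Common inner loop of the raw GEMV fallbacks below. ci/cj select
 * which index runs over the output dimension vs. the reduction dimension
 * (they are swapped for the transposed case), and cM/cN are the matching trip
 * counts. The macro deliberately reads A, X, Y, lda, incx, incy, alpha and
 * beta from the enclosing scope.
 */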
#define sgemv_loop(ci, cj, cM, cN)                   \
  do {                                               \
    float y0;                                        \
    unsigned int i, j;                               \
    for (ci = 0; ci != cM; ci++) {                   \
      y0 = Y[ci * incy] * beta;                      \
      for (cj = 0; cj != cN; cj++)                   \
        y0 += alpha * A[i + j * lda] * X[cj * incx]; \
      Y[ci * incy] = y0;                             \
    }                                                \
  } while (0)

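/**
 * @brief Naive fallback for cblas_saxpy: computes Y = alpha * X + Y element
 * by element. Unlike cblas, negative strides are rejected rather than
 * handled.
 */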
static void saxpy_raw(const unsigned int N, const float alpha, const float *X,
                      const int incX, float *Y, const int incY) {
  if (incX < 0 or incY < 0)
    throw std::invalid_argument(
      "Error: negative inc not supported without cblas");
  for (unsigned int i = 0; i < N; ++i)
    Y[i * incY] = Y[i * incY] + X[i * incX] * alpha;
}

static void saxpy_FP16(const unsigned int N, const float alpha, const __fp16 *X,
                       const int incX, __fp16 *Y, const int incY) {
  if (incX < 0 or incY < 0)
    throw std::invalid_argument(
      "Error: negative inc not supported without cblas");
  for (unsigned int i = 0; i < N; ++i)
    Y[i * incY] = Y[i * incY] + X[i * incX] * alpha;
}

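/**
 * @brief Raw GEMV fallback: Y = alpha * op(A) * X + beta * Y for a row-major
 * M x N matrix A, where op is identity or transpose depending on TransA. The
 * order argument is accepted for signature compatibility but only row-major
 * layout is handled here.
 */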
static void sgemv_raw(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA,
                      const unsigned int M, const unsigned int N,
                      const float alpha, const float *A, const unsigned int lda,
                      const float *X, const int incX, const float beta,
                      float *Y, const int incY) {
  unsigned int incy = abs(incY);
  unsigned int incx = abs(incX);

  if (TransA == CblasTrans) {
    sgemv_loop(i, j, N, M);
  } else {
    sgemv_loop(j, i, M, N);
  }
}

static void sgemv_FP16(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA,
                       const unsigned int M, const unsigned int N,
                       const float alpha, const __fp16 *A,
                       const unsigned int lda, const __fp16 *X, const int incX,
                       const float beta, __fp16 *Y, const int incY) {
  unsigned int incy = abs(incY);
  unsigned int incx = abs(incX);

  if (TransA == CblasTrans) {
    sgemv_loop(i, j, N, M);
  } else {
    sgemv_loop(j, i, M, N);
  }
}

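/**
 * @brief Raw fallback for cblas_sdot: returns the dot product
 * sum_i X[i * incX] * Y[i * incY] over N elements.
 */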
static float sdot_raw(const unsigned int N, const float *X,
                      const unsigned int incX, const float *Y,
                      const unsigned int incY) {
  float ret = 0;
  for (unsigned int i = 0; i < N; ++i) {
    ret += X[i * incX] * Y[i * incY];
  }
  return ret;
}

static __fp16 sdot_FP16(const unsigned int N, const __fp16 *X,
                        const unsigned int incX, const __fp16 *Y,
                        const unsigned int incY) {
  __fp16 ret = 0;
  for (unsigned int i = 0; i < N; ++i) {
    ret += X[i * incX] * Y[i * incY];
  }
  return ret;
}

static void scopy_raw(const unsigned int N, const float *X, const int incX,
                      float *Y, const int incY) {
  unsigned int incy = abs(incY);
  unsigned int incx = abs(incX);

  for (unsigned int i = 0; i < N; ++i)
    Y[i * incy] = X[i * incx];
}

static void scopy_FP16(const unsigned int N, const __fp16 *X, const int incX,
                       __fp16 *Y, const int incY) {
  unsigned int incy = abs(incY);
  unsigned int incx = abs(incX);

  for (unsigned int i = 0; i < N; ++i)
    Y[i * incy] = X[i * incx];
}

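/**
 * @brief Raw fallback for cblas_sscal: scales X in place, X = alpha * X,
 * walking the buffer with stride |incX|.
 */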
static void sscal_raw(const unsigned int N, const float alpha, float *X,
                      const int incX) {
  unsigned int incx = abs(incX);

  for (unsigned int i = 0; i < N; ++i)
    X[i * incx] = alpha * X[i * incx];
}

void sscal(const unsigned int N, const float alpha, __fp16 *X, const int incX) {
  unsigned int incx = abs(incX);

  for (unsigned int i = 0; i < N; ++i)
    X[i * incx] = alpha * X[i * incx];
}

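/**
 * @brief Type-erased entry point: the void pointer is reinterpreted according
 * to d_type and dispatched to the FP32 or FP16 implementation. A minimal
 * usage sketch, assuming an FP32 buffer:
 * @code
 * float buf[4] = {1.0f, 2.0f, 3.0f, 4.0f};
 * // halve every element in place
 * nntrainer::sscal(4, 0.5f, (void *)buf, 1,
 *                  ml::train::TensorDim::DataType::FP32);
 * @endcode
 */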
void sscal(const unsigned int N, const float alpha, void *X, const int incX,
           ml::train::TensorDim::DataType d_type) {
#ifdef USE_BLAS
#ifdef BLAS_NUM_THREADS
  openblas_set_num_threads(BLAS_NUM_THREADS);
#endif
  if (d_type == ml::train::TensorDim::DataType::FP32)
    cblas_sscal(N, alpha, (float *)X, incX);
#else
  if (d_type == ml::train::TensorDim::DataType::FP32) {
    sscal_raw(N, alpha, (float *)X, incX);
  } else if (d_type == ml::train::TensorDim::DataType::FP16) {
    sscal(N, alpha, (__fp16 *)X, incX);
  }
#endif
}

void sscal(const unsigned int N, const float alpha, float *X, const int incX) {
#ifdef USE_BLAS
#ifdef BLAS_NUM_THREADS
  openblas_set_num_threads(BLAS_NUM_THREADS);
#endif
  cblas_sscal(N, alpha, X, incX);
#else
  sscal_raw(N, alpha, X, incX);
#endif
}

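/**
 * @brief Raw fallback for cblas_snrm2: the Euclidean norm
 * sqrt(sum_i X[i]^2), accumulated with an OpenMP parallel reduction.
 */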
static float snrm2_raw(const unsigned int N, const float *X, const int incX) {
  unsigned int incx = abs(incX);
  float sum = 0.0f;
  float tmp;
#pragma omp parallel for private(tmp) reduction(+ : sum)
  for (unsigned int i = 0; i < N; i++) {
    tmp = X[i * incx];
    sum += tmp * tmp;
  }
  return sqrt(sum);
}

static float snrm2_FP16(const unsigned int N, const __fp16 *X, const int incX) {
  unsigned int incx = abs(incX);
  float sum = 0.0f;
  float tmp;
#pragma omp parallel for private(tmp) reduction(+ : sum)
  for (unsigned int i = 0; i < N; i++) {
    tmp = X[i * incx];
    sum += tmp * tmp;
  }
  return sqrt(sum);
}

static void sgemm_raw(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA,
                      CBLAS_TRANSPOSE TransB, const unsigned int M,
                      const unsigned int N, const unsigned int K,
                      const float alpha, const float *A, const unsigned int lda,
                      const float *B, const unsigned int ldb, const float beta,
                      float *C, const unsigned int ldc) {
  for (unsigned int m = 0; m < M; ++m) {
    for (unsigned int n = 0; n < N; ++n) {
      float c = 0.0f;
      float c_old = C[m * ldc + n];
      for (unsigned int k = 0; k < K; ++k) {
        float a, b;
        a = ((TransA == CblasTrans) ? A[k * lda + m] : A[m * lda + k]);
        b = ((TransB == CblasTrans) ? B[n * ldb + k] : B[k * ldb + n]);
        c += a * b;
      }
      C[m * ldc + n] = alpha * c;
      if (beta != 0.0f)
        C[m * ldc + n] += beta * c_old;
    }
  }
}

static void sgemm_FP16(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA,
                       CBLAS_TRANSPOSE TransB, const unsigned int M,
                       const unsigned int N, const unsigned int K,
                       const float alpha, const __fp16 *A,
                       const unsigned int lda, const __fp16 *B,
                       const unsigned int ldb, const float beta, __fp16 *C,
                       const unsigned int ldc) {
  for (unsigned int m = 0; m < M; ++m) {
    for (unsigned int n = 0; n < N; ++n) {
      float c = 0.0f;
      __fp16 c_old = C[m * ldc + n];
      for (unsigned int k = 0; k < K; ++k) {
        __fp16 a, b;
        a = ((TransA == CblasTrans) ? A[k * lda + m] : A[m * lda + k]);
        b = ((TransB == CblasTrans) ? B[n * ldb + k] : B[k * ldb + n]);
        c += a * b;
      }
      C[m * ldc + n] = alpha * c;
      if (beta != 0.0f)
        C[m * ldc + n] += beta * c_old;
    }
  }
}

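/**
 * @brief Raw fallback for cblas_isamax: returns the index of the first
 * element with the largest absolute value among X[0], X[incX], ...,
 * X[(N - 1) * incX].
 */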
static unsigned int isamax_raw(const unsigned int N, const float *X,
                               const int incX) {
  unsigned int max_idx = 0;
  float max_val = fabsf(X[0]);
  for (unsigned int n = 1; n < N; ++n) {
    float cur_val = fabsf(X[n * incX]);
    if (cur_val > max_val) {
      max_val = cur_val;
      max_idx = n;
    }
  }
  return max_idx;
}

static unsigned int isamax_FP16(const unsigned int N, const __fp16 *X,
                                const int incX) {
  unsigned int max_idx = 0;
  __fp16 max_val = fabsf(X[0]);
  for (unsigned int n = 1; n < N; ++n) {
    __fp16 cur_val = fabsf(X[n * incX]);
    if (cur_val > max_val) {
      max_val = cur_val;
      max_idx = n;
    }
  }
  return max_idx;
}

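/**
 * @brief Public BLAS-style wrappers start here. Each one picks its backend at
 * compile time: cblas when built with USE_BLAS (optionally capping the
 * OpenBLAS thread count via BLAS_NUM_THREADS), otherwise the raw loops above.
 * A minimal saxpy call, as a sketch assuming FP32 buffers:
 * @code
 * float x[3] = {1.0f, 1.0f, 1.0f}, y[3] = {0.0f, 0.0f, 0.0f};
 * nntrainer::saxpy(3, 2.0f, x, 1, y, 1); // y becomes {2, 2, 2}
 * @endcode
 */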
void saxpy(const unsigned int N, const float alpha, const void *X,
           const int incX, void *Y, const int incY,
           ml::train::TensorDim::DataType d_type) {
#ifdef USE_BLAS
#ifdef BLAS_NUM_THREADS
  openblas_set_num_threads(BLAS_NUM_THREADS);
#endif
  if (d_type == ml::train::TensorDim::DataType::FP32)
    cblas_saxpy(N, alpha, (float *)X, incX, (float *)Y, incY);
#else
  if (d_type == ml::train::TensorDim::DataType::FP32) {
    saxpy_raw(N, alpha, (float *)X, incX, (float *)Y, incY);
  } else if (d_type == ml::train::TensorDim::DataType::FP16) {
    saxpy_FP16(N, alpha, (__fp16 *)X, incX, (__fp16 *)Y, incY);
  }
#endif
}

void saxpy(const unsigned int N, const float alpha, const float *X,
           const int incX, float *Y, const int incY) {
#ifdef USE_BLAS
#ifdef BLAS_NUM_THREADS
  openblas_set_num_threads(BLAS_NUM_THREADS);
#endif
  cblas_saxpy(N, alpha, X, incX, Y, incY);
#else
  saxpy_raw(N, alpha, X, incX, Y, incY);
#endif
}

void saxpy(const unsigned int N, const float alpha, const __fp16 *X,
           const int incX, __fp16 *Y, const int incY) {
  saxpy_FP16(N, alpha, X, incX, Y, incY);
}

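/**
 * @brief GEMM: C = alpha * op(A) * op(B) + beta * C. When built with
 * USE_CUBLAS the operands are staged through device memory; since cuBLAS is
 * column-major while this interface is row-major, the call computes
 * C^T = op(B)^T * op(A)^T by passing B first, which yields row-major C
 * without an explicit transpose. The fixed leading dimensions (N, K, N)
 * assume densely packed operands.
 */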
void sgemm(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA, CBLAS_TRANSPOSE TransB,
           const unsigned int M, const unsigned int N, const unsigned int K,
           const float alpha, const void *A, const unsigned int lda,
           const void *B, const unsigned int ldb, const float beta, void *C,
           const unsigned int ldc, ml::train::TensorDim::DataType d_type) {
#ifdef USE_CUBLAS
  // this path assumes FP32 operands (sizes are computed with sizeof(float))
  int devID = 0;
  cudaDeviceProp deviceProp;
  cudaGetDeviceProperties(&deviceProp, devID);
  float *d_A, *d_B, *d_C;

  unsigned int size_A = M * K * sizeof(float);
  unsigned int size_B = K * N * sizeof(float);
  unsigned int size_C = M * N * sizeof(float);

  cudaMalloc((void **)&d_A, size_A);
  cudaMalloc((void **)&d_B, size_B);
  cudaMemcpy(d_A, A, size_A, cudaMemcpyHostToDevice);
  cudaMemcpy(d_B, B, size_B, cudaMemcpyHostToDevice);
  cudaMalloc((void **)&d_C, size_C);

  cublasHandle_t handle;
  cublasCreate(&handle);

  cublasOperation_t transA = (TransA == CblasTrans) ? CUBLAS_OP_T : CUBLAS_OP_N;
  cublasOperation_t transB = (TransB == CblasTrans) ? CUBLAS_OP_T : CUBLAS_OP_N;
  // column-major trick: B is passed first, so each op flag must follow the
  // operand it applies to (transB with d_B, transA with d_A)
  cublasSgemm(handle, transB, transA, N, M, K, &alpha, d_B, N, d_A, K, &beta,
              d_C, N);

  cudaMemcpy(C, d_C, size_C, cudaMemcpyDeviceToHost);
  cublasDestroy(handle);
  cudaFree(d_A);
  cudaFree(d_B);
  cudaFree(d_C);
#elif defined USE_BLAS
#ifdef BLAS_NUM_THREADS
  openblas_set_num_threads(BLAS_NUM_THREADS);
#endif
  if (d_type == ml::train::TensorDim::DataType::FP32)
    cblas_sgemm(order, TransA, TransB, M, N, K, alpha, (float *)A, lda,
                (float *)B, ldb, beta, (float *)C, ldc);
#else
  if (d_type == ml::train::TensorDim::DataType::FP32) {
    sgemm_raw(order, TransA, TransB, M, N, K, alpha, (float *)A, lda,
              (float *)B, ldb, beta, (float *)C, ldc);
  } else if (d_type == ml::train::TensorDim::DataType::FP16) {
    sgemm_FP16(order, TransA, TransB, M, N, K, alpha, (__fp16 *)A, lda,
               (__fp16 *)B, ldb, beta, (__fp16 *)C, ldc);
  }
#endif
}

void sgemm(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA, CBLAS_TRANSPOSE TransB,
           const unsigned int M, const unsigned int N, const unsigned int K,
           const float alpha, const float *A, const unsigned int lda,
           const float *B, const unsigned int ldb, const float beta, float *C,
           const unsigned int ldc) {
#ifdef USE_CUBLAS
  int devID = 0;
  cudaDeviceProp deviceProp;
  cudaGetDeviceProperties(&deviceProp, devID);
  float *d_A, *d_B, *d_C;

  unsigned int size_A = M * K * sizeof(float);
  unsigned int size_B = K * N * sizeof(float);
  unsigned int size_C = M * N * sizeof(float);

  cudaMalloc((void **)&d_A, size_A);
  cudaMalloc((void **)&d_B, size_B);
  cudaMemcpy(d_A, A, size_A, cudaMemcpyHostToDevice);
  cudaMemcpy(d_B, B, size_B, cudaMemcpyHostToDevice);
  cudaMalloc((void **)&d_C, size_C);

  cublasHandle_t handle;
  cublasCreate(&handle);

  cublasOperation_t transA = (TransA == CblasTrans) ? CUBLAS_OP_T : CUBLAS_OP_N;
  cublasOperation_t transB = (TransB == CblasTrans) ? CUBLAS_OP_T : CUBLAS_OP_N;
  // column-major trick: op flags follow the operand order (d_B, then d_A)
  cublasSgemm(handle, transB, transA, N, M, K, &alpha, d_B, N, d_A, K, &beta,
              d_C, N);

  cudaMemcpy(C, d_C, size_C, cudaMemcpyDeviceToHost);
  cublasDestroy(handle);
  cudaFree(d_A);
  cudaFree(d_B);
  cudaFree(d_C);
#elif defined USE_BLAS
#ifdef BLAS_NUM_THREADS
  openblas_set_num_threads(BLAS_NUM_THREADS);
#endif
  cblas_sgemm(order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C,
              ldc);
#else
  sgemm_raw(order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C,
            ldc);
#endif
}

void sgemm(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA, CBLAS_TRANSPOSE TransB,
           const unsigned int M, const unsigned int N, const unsigned int K,
           const float alpha, const __fp16 *A, const unsigned int lda,
           const __fp16 *B, const unsigned int ldb, const float beta,
           __fp16 *C, const unsigned int ldc) {
  sgemm_FP16(order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C,
             ldc);
}

void scopy(const unsigned int N, const void *X, const int incX, void *Y,
           const int incY, ml::train::TensorDim::DataType d_type) {
#ifdef USE_BLAS
#ifdef BLAS_NUM_THREADS
  openblas_set_num_threads(BLAS_NUM_THREADS);
#endif
  if (d_type == ml::train::TensorDim::DataType::FP32) {
    cblas_scopy(N, (float *)X, incX, (float *)Y, incY);
  }
#else
  if (d_type == ml::train::TensorDim::DataType::FP32) {
    scopy_raw(N, (float *)X, incX, (float *)Y, incY);
  } else if (d_type == ml::train::TensorDim::DataType::FP16) {
    scopy_FP16(N, (__fp16 *)X, incX, (__fp16 *)Y, incY);
  }
#endif
}

void scopy(const unsigned int N, const float *X, const int incX, float *Y,
           const int incY) {
#ifdef USE_BLAS
#ifdef BLAS_NUM_THREADS
  openblas_set_num_threads(BLAS_NUM_THREADS);
#endif
  cblas_scopy(N, X, incX, Y, incY);
#else
  scopy_raw(N, X, incX, Y, incY);
#endif
}

void scopy(const unsigned int N, const __fp16 *X, const int incX, __fp16 *Y,
           const int incY) {
  scopy_FP16(N, X, incX, Y, incY);
}

float snrm2(const int N, const float *X, const int incX) {
#ifdef USE_BLAS
#ifdef BLAS_NUM_THREADS
  openblas_set_num_threads(BLAS_NUM_THREADS);
#endif
  return cblas_snrm2(N, X, incX);
#else
  return snrm2_raw(N, X, incX);
#endif
}

__fp16 snrm2(const int N, const __fp16 *X, const int incX) {
  return snrm2_FP16(N, X, incX);
}

float sdot(const unsigned int N, const float *X, const unsigned int incX,
           const float *Y, const unsigned int incY) {
#ifdef USE_BLAS
#ifdef BLAS_NUM_THREADS
  openblas_set_num_threads(BLAS_NUM_THREADS);
#endif
  return cblas_sdot(N, X, incX, Y, incY);
#else
  return sdot_raw(N, X, incX, Y, incY);
#endif
}

__fp16 sdot(const unsigned int N, const __fp16 *X, const unsigned int incX,
            const __fp16 *Y, const unsigned int incY) {
  return sdot_FP16(N, X, incX, Y, incY);
}

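/**
 * @brief GEMV wrappers: Y = alpha * op(A) * X + beta * Y. A minimal usage
 * sketch for a row-major 2 x 3 FP32 matrix (values assumed for illustration):
 * @code
 * float A[6] = {1, 2, 3, 4, 5, 6};
 * float x[3] = {1, 1, 1};
 * float y[2] = {0, 0};
 * nntrainer::sgemv(CblasRowMajor, CblasNoTrans, 2, 3, 1.0f, A, 3, x, 1, 0.0f,
 *                  y, 1); // y becomes {6, 15}
 * @endcode
 */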
void sgemv(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA, const unsigned int M,
           const unsigned int N, const float alpha, const void *A,
           const unsigned int lda, const void *X, const int incX,
           const float beta, void *Y, const int incY,
           ml::train::TensorDim::DataType d_type) {
#ifdef USE_BLAS
#ifdef BLAS_NUM_THREADS
  openblas_set_num_threads(BLAS_NUM_THREADS);
#endif
  if (d_type == ml::train::TensorDim::DataType::FP32)
    return cblas_sgemv(order, TransA, M, N, alpha, (float *)A, lda, (float *)X,
                       incX, beta, (float *)Y, incY);
#else
  if (d_type == ml::train::TensorDim::DataType::FP32) {
    return sgemv_raw(order, TransA, M, N, alpha, (float *)A, lda, (float *)X,
                     incX, beta, (float *)Y, incY);
  } else if (d_type == ml::train::TensorDim::DataType::FP16) {
    return sgemv_FP16(order, TransA, M, N, alpha, (__fp16 *)A, lda, (__fp16 *)X,
                      incX, beta, (__fp16 *)Y, incY);
  }
#endif
}

void sgemv(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA, const unsigned int M,
           const unsigned int N, const float alpha, const float *A,
           const unsigned int lda, const float *X, const int incX,
           const float beta, float *Y, const int incY) {
#ifdef USE_BLAS
#ifdef BLAS_NUM_THREADS
  openblas_set_num_threads(BLAS_NUM_THREADS);
#endif
  return cblas_sgemv(order, TransA, M, N, alpha, A, lda, X, incX, beta, Y,
                     incY);
#else
  return sgemv_raw(order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY);
#endif
}

void sgemv(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA, const unsigned int M,
           const unsigned int N, const float alpha, const __fp16 *A,
           const unsigned int lda, const __fp16 *X, const int incX,
           const float beta, __fp16 *Y, const int incY) {
  sgemv_FP16(order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY);
}

unsigned int isamax(const unsigned int N, const float *X, const int incX) {
#ifdef USE_BLAS
#ifdef BLAS_NUM_THREADS
  openblas_set_num_threads(BLAS_NUM_THREADS);
#endif
  return cblas_isamax(N, X, incX);
#else
  return isamax_raw(N, X, incX);
#endif
}

unsigned int isamax(const unsigned int N, const __fp16 *X, const int incX) {
  /// @todo isamax_FP16 for BLAS_NUM_THREADS
  return isamax_FP16(N, X, incX);
}

} // namespace nntrainer