// SPDX-License-Identifier: Apache-2.0
/**
 * Copyright (C) 2020 Jijoong Moon <jijoong.moon@samsung.com>
 *
 * @file blas_interface.cpp
 * @see https://github.com/nnstreamer/nntrainer
 * @author Jijoong Moon <jijoong.moon@samsung.com>
 * @bug No known bugs except for NYI items
 * @brief This is the interface source file for blas support
 */
#include <blas_interface.h>
#include <nntrainer_error.h>

#include <cmath>
#include <cstdlib>
#include <stdexcept>
/**
 * @brief Shared loop body for the raw sgemv fallback.
 *
 * Expands to the GEMV inner loops; `ci`/`cj` name the outer/inner loop
 * variables so the same body serves both the transposed (ci=i, cj=j) and
 * non-transposed (ci=j, cj=i) cases — the element expression A[i + j * lda]
 * is textually identical for both orderings. Relies on A, X, Y, lda, incx,
 * incy and beta being in scope at the expansion site.
 * NOTE(review): alpha is not applied here — confirm against cblas semantics.
 */
#define sgemv_loop(ci, cj, cM, cN)           \
  do {                                       \
    float y0;                                \
    unsigned int i, j;                       \
    for (ci = 0; ci != cM; ci++) {           \
      y0 = Y[ci * incy] * beta;              \
      for (cj = 0; cj != cN; cj++)           \
        y0 += A[i + j * lda] * X[cj * incx]; \
      Y[ci * incy] = y0;                     \
    }                                        \
  } while (0)
/**
 * @brief Fallback SAXPY: Y[i] += alpha * X[i], used when cblas is unavailable.
 *
 * @param N number of elements to process
 * @param alpha scalar applied to X
 * @param X input vector
 * @param incX stride of X (must be non-negative)
 * @param Y vector updated in place
 * @param incY stride of Y (must be non-negative)
 * @throws std::invalid_argument if either stride is negative
 */
static void saxpy_raw(const unsigned int N, const float alpha, const float *X,
                      const int incX, float *Y, const int incY) {
  if (incX < 0 or incY < 0)
    throw std::invalid_argument(
      "Error: negative inc not supported without cblas");
  for (unsigned int i = 0; i < N; ++i)
    Y[i * incY] = Y[i * incY] + X[i * incX] * alpha;
}
45 static void sgemv_raw(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA,
46 const unsigned int M, const unsigned int N,
47 const float alpha, const float *A, const unsigned int lda,
48 const float *X, const int incX, const float beta,
49 float *Y, const int incY) {
51 unsigned int incy = abs(incY);
52 unsigned int incx = abs(incX);
54 if (TransA == CblasTrans) {
55 sgemv_loop(i, j, N, M);
57 sgemv_loop(j, i, M, N);
/**
 * @brief Fallback SDOT: returns the dot product of X and Y.
 *
 * @param N number of elements
 * @param X first vector, @param incX its stride
 * @param Y second vector, @param incY its stride
 * @return sum over i of X[i*incX] * Y[i*incY]
 */
static float sdot_raw(const unsigned int N, const float *X,
                      const unsigned int incX, const float *Y,
                      const unsigned int incY) {
  float ret = 0.0f;
  for (unsigned int i = 0; i < N; ++i) {
    ret += X[i * incX] * Y[i * incY];
  }
  return ret;
}
/**
 * @brief Fallback SCOPY: copies N elements from X to Y.
 *
 * Negative strides are folded to their absolute value, so this walks both
 * vectors forward (unlike cblas, which walks a negative-stride vector
 * backward — NOTE(review): confirm no caller passes negative strides here).
 *
 * @param N number of elements to copy
 * @param X source vector, @param incX its stride
 * @param Y destination vector, @param incY its stride
 */
static void scopy_raw(const unsigned int N, const float *X, const int incX,
                      float *Y, const int incY) {
  unsigned int incy = abs(incY);
  unsigned int incx = abs(incX);
  for (unsigned int i = 0; i < N; ++i)
    Y[i * incy] = X[i * incx];
}
80 static void scopy_FP16(const unsigned int N, const __fp16 *X, const int incX,
81 __fp16 *Y, const int incY) {
82 unsigned int incy = abs(incY);
83 unsigned int incx = abs(incX);
85 for (unsigned int i = 0; i < N; ++i)
86 Y[i * incy] = X[i * incx];
/**
 * @brief Fallback SSCAL: scales X in place by alpha.
 *
 * @param N number of elements to scale
 * @param alpha scale factor
 * @param X vector updated in place
 * @param incX stride of X (negative values folded to absolute value)
 */
static void sscal_raw(const unsigned int N, const float alpha, float *X,
                      const int incX) {
  unsigned int incx = abs(incX);
  for (unsigned int i = 0; i < N; ++i)
    X[i * incx] = alpha * X[i * incx];
}
97 void sscal(const unsigned int N, const float alpha, __fp16 *X, const int incX) {
98 unsigned int incx = abs(incX);
100 for (unsigned int i = 0; i < N; ++i)
101 X[i * incx] = alpha * X[i * incx];
104 void sscal(const unsigned int N, const float alpha, void *X, const int incX,
105 ml::train::TensorDim::DataType d_type) {
107 #ifdef BLAS_NUM_THREADS
108 openblas_set_num_threads(BLAS_NUM_THREADS);
110 if (d_type == ml::train::TensorDim::DataType::FP32)
111 cblas_sscal(N, alpha, (float *)X, incX);
113 if (d_type == ml::train::TensorDim::DataType::FP32) {
114 sscal_raw(N, alpha, (float *)X, incX);
115 } else if (d_type == ml::train::TensorDim::DataType::FP16) {
116 sscal(N, alpha, (__fp16 *)X, incX);
121 void sscal(const unsigned int N, const float alpha, float *X, const int incX) {
123 #ifdef BLAS_NUM_THREADS
124 openblas_set_num_threads(BLAS_NUM_THREADS);
126 cblas_sscal(N, alpha, (float *)X, incX);
128 sscal_raw(N, alpha, (float *)X, incX);
/**
 * @brief Fallback SNRM2: Euclidean (L2) norm of X.
 *
 * Accumulates the sum of squares (OpenMP-parallel when compiled with
 * -fopenmp; the pragma is ignored otherwise) and returns its square root.
 * Negative strides folded to absolute value.
 *
 * @param N number of elements
 * @param X input vector, @param incX its stride
 * @return sqrt(sum of X[i*|incX|]^2)
 */
static float snrm2_raw(const unsigned int N, const float *X, const int incX) {
  unsigned int incx = abs(incX);
  float sum = 0.0f;
  float tmp;
#pragma omp parallel for private(tmp) reduction(+ : sum)
  for (unsigned int i = 0; i < N; i++) {
    tmp = X[i * incx];
    sum += tmp * tmp;
  }
  return sqrtf(sum);
}
144 static void sgemm_raw(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA,
145 CBLAS_TRANSPOSE TransB, const unsigned int M,
146 const unsigned int N, const unsigned int K,
147 const float alpha, const float *A, const unsigned int lda,
148 const float *B, const unsigned int ldb, const float beta,
149 float *C, const unsigned int ldc) {
151 for (unsigned int m = 0; m < M; ++m) {
152 for (unsigned int n = 0; n < N; ++n) {
154 float c_old = C[m * ldc + n];
155 for (unsigned int k = 0; k < K; ++k) {
157 a = ((TransA == CblasTrans) ? A[k * lda + m] : A[m * lda + k]);
158 b = ((TransB == CblasTrans) ? B[n * ldb + k] : B[k * ldb + n]);
161 C[m * ldc + n] = alpha * c;
163 C[m * ldc + n] += beta * c_old;
/**
 * @brief Fallback ISAMAX: index of the element with the largest absolute
 * value.
 *
 * Fixes in this revision: uses fabsf instead of the integer abs overload
 * (which truncated float arguments), initializes the running maximum with
 * |X[0]| (a negative first element previously produced wrong answers), and
 * indexes X[n * incx] instead of stepping the logical index by incX.
 *
 * @param N number of logical elements
 * @param X input vector
 * @param incX stride of X (negative values folded to absolute value)
 * @return zero-based logical index of the max-|value| element; 0 when N == 0
 */
static unsigned int isamax_raw(const unsigned int N, const float *X,
                               const int incX) {
  if (N == 0)
    return 0;
  unsigned int incx = abs(incX);
  unsigned int max_idx = 0;
  float max_val = fabsf(X[0]);
  for (unsigned int n = 1; n < N; ++n) {
    float cur_val = fabsf(X[n * incx]);
    if (cur_val > max_val) {
      max_idx = n;
      max_val = cur_val;
    }
  }
  return max_idx;
}
186 void saxpy(const unsigned int N, const float alpha, const float *X,
187 const int incX, float *Y, const int incY) {
189 #ifdef BLAS_NUM_THREADS
190 openblas_set_num_threads(BLAS_NUM_THREADS);
192 cblas_saxpy(N, alpha, X, incX, Y, incY);
194 saxpy_raw(N, alpha, X, incX, Y, incY);
198 void sgemm(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA, CBLAS_TRANSPOSE TransB,
199 const unsigned int M, const unsigned int N, const unsigned int K,
200 const float alpha, const float *A, const unsigned int lda,
201 const float *B, const unsigned int ldb, const float beta, float *C,
202 const unsigned int ldc) {
206 cudaDeviceProp deviceProp;
207 cudaGetDeviceProperties(&deviceProp, devID);
208 float *d_A, *d_B, *d_C;
210 unsigned int size_A = M * K * sizeof(float);
211 unsigned int size_B = K * N * sizeof(float);
212 unsigned int size_C = M * N * sizeof(float);
214 cudaMalloc((void **)&d_A, size_A);
215 cudaMalloc((void **)&d_B, size_B);
216 cudaMemcpy(d_A, A, size_A, cudaMemcpyHostToDevice);
217 cudaMemcpy(d_B, B, size_B, cudaMemcpyHostToDevice);
218 cudaMalloc((void **)&d_C, size_C);
220 cublasHandle_t handle;
221 cublasCreate(&handle);
223 cublasOperation_t transA = (TransA == CblasTrans) ? CUBLAS_OP_T : CUBLAS_OP_N;
224 cublasOperation_t transB = (TransB == CblasTrans) ? CUBLAS_OP_T : CUBLAS_OP_N;
225 cublasSgemm(handle, transA, transB, N, M, K, &alpha, d_B, N, d_A, K, &beta,
228 cudaMemcpy(C, d_C, size_C, cudaMemcpyDeviceToHost);
229 cublasDestroy(handle);
230 #elif defined USE_BLAS
231 #ifdef BLAS_NUM_THREADS
232 openblas_set_num_threads(BLAS_NUM_THREADS);
234 cblas_sgemm(order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C,
237 sgemm_raw(order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C,
242 void scopy(const unsigned int N, const void *X, const int incX, void *Y,
243 const int incY, ml::train::TensorDim::DataType d_type) {
245 #ifdef BLAS_NUM_THREADS
246 openblas_set_num_threads(BLAS_NUM_THREADS);
248 if (d_type == ml::train::TensorDim::DataType::FP32) {
249 cblas_scopy(N, (float *)X, incX, (float *)Y, incY);
252 if (d_type == ml::train::TensorDim::DataType::FP32) {
253 scopy_raw(N, (float *)X, incX, (float *)Y, incY);
254 } else if (d_type == ml::train::TensorDim::DataType::FP16) {
255 scopy_FP16(N, (__fp16 *)X, incX, (__fp16 *)Y, incY);
258 } // namespace nntrainer
260 void scopy(const unsigned int N, const float *X, const int incX, float *Y,
263 #ifdef BLAS_NUM_THREADS
264 openblas_set_num_threads(BLAS_NUM_THREADS);
266 cblas_scopy(N, X, incX, Y, incY);
268 scopy_raw(N, X, incX, Y, incY);
270 } // namespace nntrainer
272 void scopy(const unsigned int N, const __fp16 *X, const int incX, __fp16 *Y,
274 scopy_FP16(N, X, incX, Y, incY);
276 } // namespace nntrainer
278 float snrm2(const int N, const float *X, const int incX) {
280 #ifdef BLAS_NUM_THREADS
281 openblas_set_num_threads(BLAS_NUM_THREADS);
283 return cblas_snrm2(N, X, incX);
285 return snrm2_raw(N, X, incX);
289 float sdot(const unsigned int N, const float *X, const unsigned int incX,
290 const float *Y, const unsigned int incY) {
292 #ifdef BLAS_NUM_THREADS
293 openblas_set_num_threads(BLAS_NUM_THREADS);
295 return cblas_sdot(N, X, incX, Y, incY);
297 return sdot_raw(N, X, incX, Y, incY);
301 void sgemv(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA, const unsigned int M,
302 const unsigned int N, const float alpha, const float *A,
303 const unsigned int lda, const float *X, const int incX,
304 const float beta, float *Y, const int incY) {
306 #ifdef BLAS_NUM_THREADS
307 openblas_set_num_threads(BLAS_NUM_THREADS);
309 return cblas_sgemv(order, TransA, M, N, alpha, A, lda, X, incX, beta, Y,
312 return sgemv_raw(order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY);
316 unsigned int isamax(const unsigned int N, const float *X, const int incX) {
318 #ifdef BLAS_NUM_THREADS
319 openblas_set_num_threads(BLAS_NUM_THREADS);
321 return cblas_isamax(N, X, incX);
323 return isamax_raw(N, X, incX);
327 } // namespace nntrainer