// SPDX-License-Identifier: Apache-2.0
/**
 * Copyright (C) 2020 Jijoong Moon <jijoong.moon@samsung.com>
 *
 * @file blas_interface.cpp
 * @see https://github.com/nnstreamer/nntrainer
 * @author Jijoong Moon <jijoong.moon@samsung.com>
 * @bug No known bugs except for NYI items
 * @brief This is the interface source file for blas support
 */
#include <blas_interface.h>
#include <nntrainer_error.h>

#include <cmath>
#include <cstdlib>
#include <stdexcept>
/**
 * @brief Shared loop body for the raw sgemv fallback.
 *
 * Expands to the GEMV inner loops; `ci`/`cj` name the outer/inner loop
 * variables so the same body serves both the transposed (ci=i, cj=j) and
 * non-transposed (ci=j, cj=i) cases — the element expression A[i + j * lda]
 * is textually identical for both orderings. Relies on A, X, Y, lda, incx,
 * incy and beta being in scope at the expansion site.
 * NOTE(review): alpha is not applied here — confirm against cblas semantics.
 */
#define sgemv_loop(ci, cj, cM, cN)           \
  do {                                       \
    float y0;                                \
    unsigned int i, j;                       \
    for (ci = 0; ci != cM; ci++) {           \
      y0 = Y[ci * incy] * beta;              \
      for (cj = 0; cj != cN; cj++)           \
        y0 += A[i + j * lda] * X[cj * incx]; \
      Y[ci * incy] = y0;                     \
    }                                        \
  } while (0)
/**
 * @brief Fallback SAXPY: Y[i] += alpha * X[i], used when cblas is unavailable.
 *
 * @param N number of elements to process
 * @param alpha scalar applied to X
 * @param X input vector
 * @param incX stride of X (must be non-negative)
 * @param Y vector updated in place
 * @param incY stride of Y (must be non-negative)
 * @throws std::invalid_argument if either stride is negative
 */
static void saxpy_raw(const unsigned int N, const float alpha, const float *X,
                      const int incX, float *Y, const int incY) {
  if (incX < 0 or incY < 0)
    throw std::invalid_argument(
      "Error: negative inc not supported without cblas");
  for (unsigned int i = 0; i < N; ++i)
    Y[i * incY] = Y[i * incY] + X[i * incX] * alpha;
}
45 static void sgemv_raw(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA,
46 const unsigned int M, const unsigned int N,
47 const float alpha, const float *A, const unsigned int lda,
48 const float *X, const int incX, const float beta,
49 float *Y, const int incY) {
51 unsigned int incy = abs(incY);
52 unsigned int incx = abs(incX);
54 if (TransA == CblasTrans) {
55 sgemv_loop(i, j, N, M);
57 sgemv_loop(j, i, M, N);
/**
 * @brief Fallback SDOT: returns the dot product of X and Y.
 *
 * @param N number of elements
 * @param X first vector, @param incX its stride
 * @param Y second vector, @param incY its stride
 * @return sum over i of X[i*incX] * Y[i*incY]
 */
static float sdot_raw(const unsigned int N, const float *X,
                      const unsigned int incX, const float *Y,
                      const unsigned int incY) {
  float ret = 0.0f;
  for (unsigned int i = 0; i < N; ++i) {
    ret += X[i * incX] * Y[i * incY];
  }
  return ret;
}
/**
 * @brief Fallback SCOPY: copies N elements from X to Y.
 *
 * Negative strides are folded to their absolute value, so this walks both
 * vectors forward (unlike cblas, which walks a negative-stride vector
 * backward — NOTE(review): confirm no caller passes negative strides here).
 *
 * @param N number of elements to copy
 * @param X source vector, @param incX its stride
 * @param Y destination vector, @param incY its stride
 */
static void scopy_raw(const unsigned int N, const float *X, const int incX,
                      float *Y, const int incY) {
  unsigned int incy = abs(incY);
  unsigned int incx = abs(incX);
  for (unsigned int i = 0; i < N; ++i)
    Y[i * incy] = X[i * incx];
}
80 static void scopy_FP16(const unsigned int N, const __fp16 *X, const int incX,
81 __fp16 *Y, const int incY) {
82 unsigned int incy = abs(incY);
83 unsigned int incx = abs(incX);
85 for (unsigned int i = 0; i < N; ++i)
86 Y[i * incy] = X[i * incx];
/**
 * @brief Fallback SSCAL: scales X in place by alpha.
 *
 * @param N number of elements to scale
 * @param alpha scale factor
 * @param X vector updated in place
 * @param incX stride of X (negative values folded to absolute value)
 */
static void sscal_raw(const unsigned int N, const float alpha, float *X,
                      const int incX) {
  unsigned int incx = abs(incX);
  for (unsigned int i = 0; i < N; ++i)
    X[i * incx] = alpha * X[i * incx];
}
97 void sscal(const unsigned int N, const float alpha, __fp16 *X, const int incX) {
98 unsigned int incx = abs(incX);
100 for (unsigned int i = 0; i < N; ++i)
101 X[i * incx] = alpha * X[i * incx];
104 void sscal(const unsigned int N, const float alpha, void *X, const int incX,
105 ml::train::TensorDim::DataType d_type) {
107 #ifdef BLAS_NUM_THREADS
108 openblas_set_num_threads(BLAS_NUM_THREADS);
110 if (d_type == ml::train::TensorDim::DataType::FP32)
111 cblas_sscal(N, alpha, (float *)X, incX);
113 if (d_type == ml::train::TensorDim::DataType::FP32) {
114 sscal_raw(N, alpha, (float *)X, incX);
115 } else if (d_type == ml::train::TensorDim::DataType::FP16) {
116 sscal(N, alpha, (__fp16 *)X, incX);
121 void sscal(const unsigned int N, const float alpha, float *X, const int incX) {
123 #ifdef BLAS_NUM_THREADS
124 openblas_set_num_threads(BLAS_NUM_THREADS);
126 cblas_sscal(N, alpha, (float *)X, incX);
128 sscal_raw(N, alpha, (float *)X, incX);
/**
 * @brief Fallback SNRM2: Euclidean (L2) norm of X.
 *
 * Accumulates the sum of squares (OpenMP-parallel when compiled with
 * -fopenmp; the pragma is ignored otherwise) and returns its square root.
 * Negative strides folded to absolute value.
 *
 * @param N number of elements
 * @param X input vector, @param incX its stride
 * @return sqrt(sum of X[i*|incX|]^2)
 */
static float snrm2_raw(const unsigned int N, const float *X, const int incX) {
  unsigned int incx = abs(incX);
  float sum = 0.0f;
  float tmp;
#pragma omp parallel for private(tmp) reduction(+ : sum)
  for (unsigned int i = 0; i < N; i++) {
    tmp = X[i * incx];
    sum += tmp * tmp;
  }
  return sqrtf(sum);
}
144 static void sgemm_raw(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA,
145 CBLAS_TRANSPOSE TransB, const unsigned int M,
146 const unsigned int N, const unsigned int K,
147 const float alpha, const float *A, const unsigned int lda,
148 const float *B, const unsigned int ldb, const float beta,
149 float *C, const unsigned int ldc) {
151 for (unsigned int m = 0; m < M; ++m) {
152 for (unsigned int n = 0; n < N; ++n) {
154 float c_old = C[m * ldc + n];
155 for (unsigned int k = 0; k < K; ++k) {
157 a = ((TransA == CblasTrans) ? A[k * lda + m] : A[m * lda + k]);
158 b = ((TransB == CblasTrans) ? B[n * ldb + k] : B[k * ldb + n]);
161 C[m * ldc + n] = alpha * c;
163 C[m * ldc + n] += beta * c_old;
/**
 * @brief Fallback ISAMAX: index of the element with the largest absolute
 * value.
 *
 * Fixes in this revision: uses fabsf instead of the integer abs overload
 * (which truncated float arguments), initializes the running maximum with
 * |X[0]| (a negative first element previously produced wrong answers), and
 * indexes X[n * incx] instead of stepping the logical index by incX.
 *
 * @param N number of logical elements
 * @param X input vector
 * @param incX stride of X (negative values folded to absolute value)
 * @return zero-based logical index of the max-|value| element; 0 when N == 0
 */
static unsigned int isamax_raw(const unsigned int N, const float *X,
                               const int incX) {
  if (N == 0)
    return 0;
  unsigned int incx = abs(incX);
  unsigned int max_idx = 0;
  float max_val = fabsf(X[0]);
  for (unsigned int n = 1; n < N; ++n) {
    float cur_val = fabsf(X[n * incx]);
    if (cur_val > max_val) {
      max_idx = n;
      max_val = cur_val;
    }
  }
  return max_idx;
}
186 void saxpy(const unsigned int N, const float alpha, const float *X,
187 const int incX, float *Y, const int incY) {
189 #ifdef BLAS_NUM_THREADS
190 openblas_set_num_threads(BLAS_NUM_THREADS);
192 cblas_saxpy(N, alpha, X, incX, Y, incY);
194 saxpy_raw(N, alpha, X, incX, Y, incY);
198 void sgemm(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA, CBLAS_TRANSPOSE TransB,
199 const unsigned int M, const unsigned int N, const unsigned int K,
200 const float alpha, const float *A, const unsigned int lda,
201 const float *B, const unsigned int ldb, const float beta, float *C,
202 const unsigned int ldc) {
206 cudaDeviceProp deviceProp;
207 cudaGetDeviceProperties(&deviceProp, devID);
208 float *d_A, *d_B, *d_C;
210 unsigned int size_A = M * K * sizeof(float);
211 unsigned int size_B = K * N * sizeof(float);
212 unsigned int size_C = M * N * sizeof(float);
214 cudaMalloc((void **)&d_A, size_A);
215 cudaMalloc((void **)&d_B, size_B);
216 cudaMemcpy(d_A, A, size_A, cudaMemcpyHostToDevice);
217 cudaMemcpy(d_B, B, size_B, cudaMemcpyHostToDevice);
218 cudaMalloc((void **)&d_C, size_C);
220 cublasHandle_t handle;
221 cublasCreate(&handle);
223 cublasOperation_t transA = (TransA == CblasTrans) ? CUBLAS_OP_T : CUBLAS_OP_N;
224 cublasOperation_t transB = (TransB == CblasTrans) ? CUBLAS_OP_T : CUBLAS_OP_N;
225 cublasSgemm(handle, transA, transB, N, M, K, &alpha, d_B, N, d_A, K, &beta,
228 cudaMemcpy(C, d_C, size_C, cudaMemcpyDeviceToHost);
229 cublasDestroy(handle);
230 #elif defined USE_BLAS
231 #ifdef BLAS_NUM_THREADS
232 openblas_set_num_threads(BLAS_NUM_THREADS);
234 cblas_sgemm(order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C,
237 sgemm_raw(order, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C,
242 void scopy(const unsigned int N, const void *X, const int incX, void *Y,
243 const int incY, ml::train::TensorDim::DataType d_type) {
245 #ifdef BLAS_NUM_THREADS
246 openblas_set_num_threads(BLAS_NUM_THREADS);
248 if (d_type == ml::train::TensorDim::DataType::FP32) {
249 cblas_scopy(N, (float *)X, incX, (float *)Y, incY);
252 if (d_type == ml::train::TensorDim::DataType::FP32) {
253 scopy_raw(N, (float *)X, incX, (float *)Y, incY);
254 } else if (d_type == ml::train::TensorDim::DataType::FP16) {
255 scopy_FP16(N, (__fp16 *)X, incX, (__fp16 *)Y, incY);
258 } // namespace nntrainer
260 void scopy(const unsigned int N, const float *X, const int incX, float *Y,
263 #ifdef BLAS_NUM_THREADS
264 openblas_set_num_threads(BLAS_NUM_THREADS);
266 cblas_scopy(N, X, incX, Y, incY);
268 scopy_raw(N, X, incX, Y, incY);
270 } // namespace nntrainer
272 void scopy(const unsigned int N, const __fp16 *X, const int incX, __fp16 *Y,
274 scopy_FP16(N, X, incX, Y, incY);
276 } // namespace nntrainer
278 float snrm2(const int N, const float *X, const int incX) {
280 #ifdef BLAS_NUM_THREADS
281 openblas_set_num_threads(BLAS_NUM_THREADS);
283 return cblas_snrm2(N, X, incX);
285 return snrm2_raw(N, X, incX);
289 float sdot(const unsigned int N, const float *X, const unsigned int incX,
290 const float *Y, const unsigned int incY) {
292 #ifdef BLAS_NUM_THREADS
293 openblas_set_num_threads(BLAS_NUM_THREADS);
295 return cblas_sdot(N, X, incX, Y, incY);
297 return sdot_raw(N, X, incX, Y, incY);
301 void sgemv(CBLAS_ORDER order, CBLAS_TRANSPOSE TransA, const unsigned int M,
302 const unsigned int N, const float alpha, const float *A,
303 const unsigned int lda, const float *X, const int incX,
304 const float beta, float *Y, const int incY) {
306 #ifdef BLAS_NUM_THREADS
307 openblas_set_num_threads(BLAS_NUM_THREADS);
309 return cblas_sgemv(order, TransA, M, N, alpha, A, lda, X, incX, beta, Y,
312 return sgemv_raw(order, TransA, M, N, alpha, A, lda, X, incX, beta, Y, incY);
316 unsigned int isamax(const unsigned int N, const float *X, const int incX) {
318 #ifdef BLAS_NUM_THREADS
319 openblas_set_num_threads(BLAS_NUM_THREADS);
321 return cblas_isamax(N, X, incX);
323 return isamax_raw(N, X, incX);
327 } // namespace nntrainer