// Copyright (C) 2018-2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#include "gna_plugin_log.hpp"
11 void CNNFilter32(intel_dnn_component_t *component) {
12 float *ptr_filters = reinterpret_cast<float *>(component->op.conv1D.ptr_filters);
13 float *ptr_biases = reinterpret_cast<float *>(component->op.conv1D.ptr_biases);
14 float *ptr_inputs = reinterpret_cast<float *>(component->ptr_inputs);
15 float *ptr_outputs = reinterpret_cast<float *>(component->ptr_outputs);
16 uint32_t num_group = component->num_rows_in;
17 uint32_t num_filter_outputs = component->op.conv1D.num_feature_map_rows - component->op.conv1D.num_filter_rows + 1;
19 num_inputs_band_stride = component->op.conv1D.num_feature_maps * component->op.conv1D.num_feature_map_columns;
20 uint32_t num_filter_coefficients = component->op.conv1D.num_filter_coefficients;
22 if ((component->num_rows_in != 1) || (component->num_rows_out != 1)
23 || (component->num_columns_out != num_filter_outputs * component->op.conv1D.num_filters)) {
24 THROW_GNA_EXCEPTION << "Bad problem dimensions in CNNFilter32!";
27 for (uint32_t j = 0; j < num_filter_outputs; j++) {
28 float *ptr_in = ptr_inputs + j * num_inputs_band_stride;
29 for (uint32_t i = 0; i < component->op.conv1D.num_filters; i++) {
30 float *ptr_coef = ptr_filters + i * num_filter_coefficients;
31 float sum = ptr_biases[i];
32 for (uint32_t k = 0; k < num_filter_coefficients; k++) {
33 sum += ptr_in[k] * ptr_coef[k];
35 ptr_outputs[j * component->op.conv1D.num_filters + i] = sum;
40 void CNNMaxPool(intel_dnn_component_t *component, intel_dnn_number_type_t number_type) {
41 if (number_type == kDnnInt) {
42 int32_t *ptr_inputs = reinterpret_cast<int32_t *>(component->ptr_inputs);
43 int32_t *ptr_outputs = reinterpret_cast<int32_t *>(component->ptr_outputs);
44 uint32_t num_inputs = component->num_columns_in;
45 uint32_t num_columns = component->op.maxpool.num_inputs_stride;
46 uint32_t num_pool_size = component->op.maxpool.num_inputs;
47 uint32_t num_pool_step = component->op.maxpool.num_inputs_step;
48 uint32_t num_rows_in = num_inputs / component->op.maxpool.num_inputs_stride;
49 uint32_t num_rows_out = num_rows_in / num_pool_step;
51 for (uint32_t i = 0; i < num_columns; i++) {
53 if (component->op.maxpool.do_sum_not_max) {
54 uint32_t num_saturate = 0;
55 for (uint32_t j = 0; j < num_rows_in; j += num_pool_step) {
57 uint32_t num_end = (j + num_pool_size > num_rows_in) ? num_rows_in : j + num_pool_size;
58 for (uint32_t k = j; k < num_end; k++) {
59 sum += ptr_inputs[k * num_columns + i];
61 if (sum > 2147483647.0) {
62 ptr_outputs[m * num_columns + i] = 2147483647L;
64 } else if (sum < -2147483648.0) {
65 ptr_outputs[m * num_columns + i] = -2147483648L;
68 ptr_outputs[m * num_columns + i] = (int32_t) sum;
72 if (num_saturate > 0) {
73 fprintf(stderr, "Warning: %d saturations in CNNMaxPool()\n", num_saturate);
76 for (uint32_t j = 0; j < num_rows_in; j += num_pool_step) {
77 int32_t max = INT32_MIN;
78 uint32_t num_end = (j + num_pool_size > num_rows_in) ? num_rows_in : j + num_pool_size;
79 for (uint32_t k = j; k < num_end; k++) {
80 if (ptr_inputs[k * num_columns + i] > max) max = ptr_inputs[k * num_columns + i];
82 ptr_outputs[m * num_columns + i] = max;
88 float *ptr_inputs = reinterpret_cast<float *>(component->ptr_inputs);
89 float *ptr_outputs = reinterpret_cast<float *>(component->ptr_outputs);
90 uint32_t num_inputs = component->num_columns_in;
91 uint32_t num_columns = component->op.maxpool.num_inputs_stride;
92 uint32_t num_pool_size = component->op.maxpool.num_inputs;
93 uint32_t num_pool_step = component->op.maxpool.num_inputs_step;
94 uint32_t num_rows_in = num_inputs / component->op.maxpool.num_inputs_stride;
95 uint32_t num_rows_out = num_rows_in / num_pool_step;
97 for (uint32_t i = 0; i < num_columns; i++) {
99 if (component->op.maxpool.do_sum_not_max) {
100 for (uint32_t j = 0; j < num_rows_in; j += num_pool_step) {
102 uint32_t num_end = (j + num_pool_size > num_rows_in) ? num_rows_in : j + num_pool_size;
103 for (uint32_t k = j; k < num_end; k++) {
104 sum += ptr_inputs[k * num_columns + i];
106 ptr_outputs[m * num_columns + i] = sum;
110 for (uint32_t j = 0; j < num_rows_in; j += num_pool_step) {
112 uint32_t num_end = (j + num_pool_size > num_rows_in) ? num_rows_in : j + num_pool_size;
113 for (uint32_t k = j; k < num_end; k++) {
114 if (ptr_inputs[k * num_columns + i] > max) max = ptr_inputs[k * num_columns + i];
116 ptr_outputs[m * num_columns + i] = max;
124 void PwlApply16(intel_dnn_component_t *component, uint32_t num_subset_size) {
125 if (component->orientation_in == kDnnInterleavedOrientation) { // subsets only supported in interleaved orientation
126 PwlApply16(component, 0, num_subset_size - 1, 0, component->num_columns_in - 1);
128 PwlApply16(component, 0, component->num_rows_in - 1, 0, component->num_columns_in - 1);
132 void PwlApply16(intel_dnn_component_t *component,
133 uint32_t num_row_start,
134 uint32_t num_row_end,
135 uint32_t num_col_start,
136 uint32_t num_col_end) {
137 uint32_t num_saturate = 0;
138 uint32_t num_segments = component->op.pwl.num_segments;
139 if (num_segments > 0) {
140 intel_pwl_segment_t *ptr_segment = component->op.pwl.ptr_segments;
141 for (int i = num_row_start; i <= num_row_end; i++) {
142 int32_t *ptr_input = reinterpret_cast<int32_t *>(component->ptr_inputs) + i * component->num_columns_in;
143 int16_t *ptr_output = reinterpret_cast<int16_t *>(component->ptr_outputs) + i * component->num_columns_in;
144 for (int j = num_col_start; j <= num_col_end; j++) {
145 int32_t xbase = (int32_t) (ptr_segment[0].xBase & XBASEMASK);
146 int32_t input = ptr_input[j];
147 if (input <= xbase) {
148 ptr_output[j] = ptr_segment[0].yBase;
150 uint32_t slope_shift;
151 int16_t slope, ybase;
152 int64_t diff, prod, prod_shift, sum;
153 uint32_t k = num_segments / 2;
154 uint32_t k_upper = num_segments;
155 uint32_t k_lower = 0;
156 while (k_upper > k_lower + 1) {
157 xbase = (int32_t) (ptr_segment[k].xBase & XBASEMASK);
160 k = (k + k_lower) / 2;
163 k = (k_upper + k) / 2;
166 xbase = (int32_t) (ptr_segment[k].xBase & XBASEMASK);
167 slope_shift = ((ptr_segment[k].xBase & ~XBASEMASK) + 1) * 8;
168 slope = ptr_segment[k].slope;
169 ybase = ptr_segment[k].yBase;
170 diff = (int64_t) input - (int64_t) xbase;
172 prod_shift = prod >> slope_shift;
173 sum = prod_shift + (int64_t) ybase;
175 ptr_output[j] = 32767;
177 } else if (sum < -32768LL) {
178 ptr_output[j] = -32768;
181 ptr_output[j] = (int16_t) sum;
188 if (num_saturate > 0) {
189 fprintf(stderr, "Warning: %d saturations in PwlApply16!\n", num_saturate);
193 void PwlApply32(intel_dnn_component_t *component, uint32_t num_subset_size) {
194 if (component->orientation_in == kDnnInterleavedOrientation) { // subsets only supported in interleaved orientation
195 PwlApply32(component, 0, num_subset_size - 1, 0, component->num_columns_in - 1);
197 PwlApply32(component, 0, component->num_rows_in - 1, 0, component->num_columns_in - 1);
201 void PwlApply32(intel_dnn_component_t *component,
202 uint32_t num_row_start,
203 uint32_t num_row_end,
204 uint32_t num_col_start,
205 uint32_t num_col_end) {
206 intel_piecewiselinear_t *transform = reinterpret_cast<intel_piecewiselinear_t *>(&component->op.pwl);
207 float *ptr_in = reinterpret_cast<float *>(component->ptr_inputs);
208 float *ptr_out = reinterpret_cast<float *>(component->ptr_outputs);
209 uint32_t num_columns = component->num_columns_in;
210 switch (transform->func_id.type) {
212 for (uint32_t i = num_row_start; i <= num_row_end; i++) {
213 for (uint32_t j = num_col_start; j <= num_col_end; j++) {
214 ptr_out[i * num_columns + j] = 0.5 * (1.0 + tanh(0.5 * ptr_in[i * num_columns + j]));
219 for (uint32_t i = num_row_start; i <= num_row_end; i++) {
220 for (uint32_t j = num_col_start; j <= num_col_end; j++) {
221 ptr_out[i * num_columns + j] = tanh(ptr_in[i * num_columns + j]);
226 for (uint32_t i = num_row_start; i <= num_row_end; i++) {
227 for (uint32_t j = num_col_start; j <= num_col_end; j++) {
228 ptr_out[i * num_columns + j] =
229 (ptr_in[i * num_columns + j] < 0.0f) ? ptr_in[i * num_columns + j] * transform->func_id.negative_slope : ptr_in[i * num_columns + j];
234 for (uint32_t i = num_row_start; i <= num_row_end; i++) {
235 for (uint32_t j = num_col_start; j <= num_col_end; j++) {
236 ptr_out[i * num_columns + j] = ptr_in[i * num_columns + j];
240 case kActKaldiLstmClipping:
241 for (uint32_t i = num_row_start; i <= num_row_end; i++) {
242 for (uint32_t j = num_col_start; j <= num_col_end; j++) {
243 float val = ptr_in[i * num_columns + j];
244 if (val > KALDI_LSTM_CLIP_UPPER) {
245 ptr_out[i * num_columns + j] = KALDI_LSTM_CLIP_UPPER;
246 } else if (val < KALDI_LSTM_CLIP_LOWER) {
247 ptr_out[i * num_columns + j] = KALDI_LSTM_CLIP_LOWER;
249 ptr_out[i * num_columns + j] = val;
256 default:fprintf(stderr, "Unknown piecewise linear function type!\n");
262 extern "C" { // API uses C linkage so that it can be used by C and C++ applications
266 void cblas_sgemm1(const CBLAS_LAYOUT Layout, const CBLAS_TRANSPOSE TransA,
267 const CBLAS_TRANSPOSE TransB, const MKL_INT M, const MKL_INT N,
268 const MKL_INT K, const float alpha, const float *A,
269 const MKL_INT lda, const float *B, const MKL_INT ldb,
270 const float beta, float *C, const MKL_INT ldc) {
273 if (Layout != CblasRowMajor) {
274 fprintf(stderr, "Only row major is supported in cblas_sgemm!\n");
278 if ((TransA == CblasNoTrans) && (TransB == CblasNoTrans)) {
279 for (i = 0; i < M; i++) {
280 for (j = 0; j < N; j++) {
281 float sum = (beta == 1.0) ? C[i * ldc + j] : 0;
282 for (k = 0; k < K; k++) {
283 sum += A[i * lda + k] * B[k * ldb + j];
285 C[i * ldc + j] = sum;
288 } else if ((TransA == CblasNoTrans) && (TransB == CblasTrans)) {
289 for (i = 0; i < M; i++) {
290 for (j = 0; j < N; j++) {
292 sum = beta * C[i * ldc + j];
293 for (k = 0; k < K; k++) {
294 sum += alpha * A[i * lda + k] * B[j * ldb + k];
296 C[i * ldc + j] = sum;
299 } else if ((TransA == CblasTrans) && (TransB == CblasNoTrans)) {
300 for (i = 0; i < M; i++) {
301 for (j = 0; j < N; j++) {
302 float sum = (beta == 1.0) ? C[i * ldc + j] : 0;
303 for (k = 0; k < K; k++) {
304 sum += A[k * lda + i] * B[k * ldb + j];
306 C[i * ldc + j] = sum;
310 fprintf(stderr, "Expected A not transposed in cblas_sgemm!\n");
314 void cblas_ssbmv1(const CBLAS_LAYOUT Layout, const CBLAS_UPLO Uplo,
315 const MKL_INT N, const MKL_INT K, const float alpha, const float *A,
316 const MKL_INT lda, const float *X, const MKL_INT incX,
317 const float beta, float *Y, const MKL_INT incY) {
320 if (Layout != CblasRowMajor) {
321 fprintf(stderr, "Only row major is supported in cblas_ssbmv!\n");
324 if (Uplo != CblasLower) {
325 fprintf(stderr, "Only lower format is supported in cblas_ssbmv!\n");
329 fprintf(stderr, "Only diagonal matrices supported in cblas_ssbmv at this time!\n");
332 if ((alpha == 1.0) && (beta == 1.0) && (incX == 1) && (incY == 1)) {
333 for (i = 0; i < N; i++) {
337 fprintf(stderr, "Only alpha=1, beta=1, incX=1, incY=1, LDA=1 supported in cblas_ssbmv at this time!\n");
341 #endif // #ifdef _NO_MKL_
343 void cblas_sgemm_subset(const CBLAS_LAYOUT Layout, const CBLAS_TRANSPOSE TransA,
344 const CBLAS_TRANSPOSE TransB, const MKL_INT M, const MKL_INT N,
345 const MKL_INT K, const float alpha, const float *A,
346 const MKL_INT lda, const float *B, const MKL_INT ldb,
347 const float beta, float *C, const MKL_INT ldc,
348 const uint32_t *OutputList, const MKL_INT L) {
351 if (Layout != CblasRowMajor) {
352 fprintf(stderr, "Only row major is supported in cblas_sgemm_subset!\n");
356 if ((TransA == CblasNoTrans) && (TransB == CblasNoTrans)) {
357 for (l = 0; l < L; l++) {
359 for (j = 0; j < N; j++) {
360 float sum = (beta == 1.0) ? C[l * ldc + j] : 0;
361 for (k = 0; k < K; k++) {
362 sum += A[i * lda + k] * B[k * ldb + j];
364 C[l * ldc + j] = sum;
367 } else if ((TransA == CblasNoTrans) && (TransB == CblasTrans)) {
368 for (i = 0; i < M; i++) {
369 for (l = 0; l < L; l++) {
372 sum = beta * C[i * ldc + l];
373 for (k = 0; k < K; k++) {
374 sum += alpha * A[i * lda + k] * B[j * ldb + k];
376 C[i * ldc + l] = sum;
379 } else if ((TransA == CblasTrans) && (TransB == CblasNoTrans)) {
380 for (l = 0; l < L; l++) {
382 for (j = 0; j < N; j++) {
383 float sum = (beta == 1.0) ? C[l * ldc + j] : 0;
384 for (k = 0; k < K; k++) {
385 sum += A[k * lda + i] * B[k * ldb + j];
387 C[l * ldc + j] = sum;
391 fprintf(stderr, "Expected A not transposed in cblas_sgemm_subset!\n");
396 // C = [ A1 A2 ] * X + B
397 void sgemv_split(const uint32_t N,
405 uint32_t num_columns = K1 + K2;
406 uint32_t num_rows = N;
409 for (i = 0; i < num_rows; i++) {
411 for (j = 0; j < K1; j++) {
412 sum += A1[j] * X[i * num_columns + j];
414 for (j = K1; j < num_columns; j++) {
415 sum += A2[j - K1] * X[i * num_columns + j];