inference-engine/src/gna_plugin/floatmath.cpp

   1 // Copyright (C) 2018-2019 Intel Corporation
   2 // SPDX-License-Identifier: Apache-2.0
   3 //
   4 // floatmath.cpp : unoptimized floating point math routines (for reference)
   5 //
   6
   7 #include "floatmath.h"
   8 #include "pwl.h"
   9 #include "gna_plugin_log.hpp"
  10 #include <cmath>
  11
  12
  13 void CNNFilter32(intel_dnn_component_t *component) {
  14     float *ptr_filters = reinterpret_cast<float *>(component->op.conv1D.ptr_filters);
  15     float *ptr_biases = reinterpret_cast<float *>(component->op.conv1D.ptr_biases);
  16     float *ptr_inputs = reinterpret_cast<float *>(component->ptr_inputs);
  17     float *ptr_outputs = reinterpret_cast<float *>(component->ptr_outputs);
  18     uint32_t num_group = component->num_rows_in;
  19     uint32_t num_filter_outputs = component->op.conv1D.num_feature_map_rows - component->op.conv1D.num_filter_rows + 1;
  20     uint32_t
  21         num_inputs_band_stride = component->op.conv1D.num_feature_maps * component->op.conv1D.num_feature_map_columns;
  22     uint32_t num_filter_coefficients = component->op.conv1D.num_filter_coefficients;
  23
  24     if ((component->num_rows_in != 1) || (component->num_rows_out != 1)
  25         || (component->num_columns_out != num_filter_outputs * component->op.conv1D.num_filters)) {
  26         THROW_GNA_EXCEPTION << "Bad problem dimensions in CNNFilter32!";
  27     }
  28
  29     for (uint32_t j = 0; j < num_filter_outputs; j++) {
  30         float *ptr_in = ptr_inputs + j * num_inputs_band_stride;
  31         for (uint32_t i = 0; i < component->op.conv1D.num_filters; i++) {
  32             float *ptr_coef = ptr_filters + i * num_filter_coefficients;
  33             float sum = ptr_biases[i];
  34             for (uint32_t k = 0; k < num_filter_coefficients; k++) {
  35                 sum += ptr_in[k] * ptr_coef[k];
  36             }
  37             ptr_outputs[j * component->op.conv1D.num_filters + i] = sum;
  38         }
  39     }
  40 }
  41
  42 void CNNMaxPool(intel_dnn_component_t *component, intel_dnn_number_type_t number_type) {
  43     if (number_type == kDnnInt) {
  44         int32_t *ptr_inputs = reinterpret_cast<int32_t *>(component->ptr_inputs);
  45         int32_t *ptr_outputs = reinterpret_cast<int32_t *>(component->ptr_outputs);
  46         uint32_t num_inputs = component->num_columns_in;
  47         uint32_t num_columns = component->op.maxpool.num_inputs_stride;
  48         uint32_t num_pool_size = component->op.maxpool.num_inputs;
  49         uint32_t num_pool_step = component->op.maxpool.num_inputs_step;
  50         uint32_t num_rows_in = num_inputs / component->op.maxpool.num_inputs_stride;
  51         uint32_t num_rows_out = num_rows_in / num_pool_step;
  52
  53         for (uint32_t i = 0; i < num_columns; i++) {
  54             int32_t m = 0;
  55             if (component->op.maxpool.do_sum_not_max) {
  56                 uint32_t num_saturate = 0;
  57                 for (uint32_t j = 0; j < num_rows_in; j += num_pool_step) {
  58                     int64_t sum = 0;
  59                     uint32_t num_end = (j + num_pool_size > num_rows_in) ? num_rows_in : j + num_pool_size;
  60                     for (uint32_t k = j; k < num_end; k++) {
  61                         sum += ptr_inputs[k * num_columns + i];
  62                     }
  63                     if (sum > 2147483647.0) {
  64                         ptr_outputs[m * num_columns + i] = 2147483647L;
  65                         num_saturate++;
  66                     } else if (sum < -2147483648.0) {
  67                         ptr_outputs[m * num_columns + i] = -2147483648L;
  68                         num_saturate++;
  69                     } else {
  70                         ptr_outputs[m * num_columns + i] = (int32_t) sum;
  71                     }
  72                     m++;
  73                 }
  74                 if (num_saturate > 0) {
  75                     fprintf(stderr, "Warning:  %d saturations in CNNMaxPool()\n", num_saturate);
  76                 }
  77             } else {
  78                 for (uint32_t j = 0; j < num_rows_in; j += num_pool_step) {
  79                     int32_t max = INT32_MIN;
  80                     uint32_t num_end = (j + num_pool_size > num_rows_in) ? num_rows_in : j + num_pool_size;
  81                     for (uint32_t k = j; k < num_end; k++) {
  82                         if (ptr_inputs[k * num_columns + i] > max) max = ptr_inputs[k * num_columns + i];
  83                     }
  84                     ptr_outputs[m * num_columns + i] = max;
  85                     m++;
  86                 }
  87             }
  88         }
  89     } else {
  90         float *ptr_inputs = reinterpret_cast<float *>(component->ptr_inputs);
  91         float *ptr_outputs = reinterpret_cast<float *>(component->ptr_outputs);
  92         uint32_t num_inputs = component->num_columns_in;
  93         uint32_t num_columns = component->op.maxpool.num_inputs_stride;
  94         uint32_t num_pool_size = component->op.maxpool.num_inputs;
  95         uint32_t num_pool_step = component->op.maxpool.num_inputs_step;
  96         uint32_t num_rows_in = num_inputs / component->op.maxpool.num_inputs_stride;
  97         uint32_t num_rows_out = num_rows_in / num_pool_step;
  98
  99         for (uint32_t i = 0; i < num_columns; i++) {
 100             int32_t m = 0;
 101             if (component->op.maxpool.do_sum_not_max) {
 102                 for (uint32_t j = 0; j < num_rows_in; j += num_pool_step) {
 103                     float sum = 0.0;
 104                     uint32_t num_end = (j + num_pool_size > num_rows_in) ? num_rows_in : j + num_pool_size;
 105                     for (uint32_t k = j; k < num_end; k++) {
 106                         sum += ptr_inputs[k * num_columns + i];
 107                     }
 108                     ptr_outputs[m * num_columns + i] = sum;
 109                     m++;
 110                 }
 111             } else {
 112                 for (uint32_t j = 0; j < num_rows_in; j += num_pool_step) {
 113                     float max = -1e20f;
 114                     uint32_t num_end = (j + num_pool_size > num_rows_in) ? num_rows_in : j + num_pool_size;
 115                     for (uint32_t k = j; k < num_end; k++) {
 116                         if (ptr_inputs[k * num_columns + i] > max) max = ptr_inputs[k * num_columns + i];
 117                     }
 118                     ptr_outputs[m * num_columns + i] = max;
 119                     m++;
 120                 }
 121             }
 122         }
 123     }
 124 }
 125
 126 void PwlApply16(intel_dnn_component_t *component, uint32_t num_subset_size) {
 127     if (component->orientation_in == kDnnInterleavedOrientation) {  // subsets only supported in interleaved orientation
 128         PwlApply16(component, 0, num_subset_size - 1, 0, component->num_columns_in - 1);
 129     } else {
 130         PwlApply16(component, 0, component->num_rows_in - 1, 0, component->num_columns_in - 1);
 131     }
 132 }
 133
 134 void PwlApply16(intel_dnn_component_t *component,
 135                 uint32_t num_row_start,
 136                 uint32_t num_row_end,
 137                 uint32_t num_col_start,
 138                 uint32_t num_col_end) {
 139     uint32_t num_saturate = 0;
 140     uint32_t num_segments = component->op.pwl.num_segments;
 141     if (num_segments > 0) {
 142         intel_pwl_segment_t *ptr_segment = component->op.pwl.ptr_segments;
 143         for (int i = num_row_start; i <= num_row_end; i++) {
 144             int32_t *ptr_input = reinterpret_cast<int32_t *>(component->ptr_inputs) + i * component->num_columns_in;
 145             int16_t *ptr_output = reinterpret_cast<int16_t *>(component->ptr_outputs) + i * component->num_columns_in;
 146             for (int j = num_col_start; j <= num_col_end; j++) {
 147                 int32_t xbase = (int32_t) (ptr_segment[0].xBase & XBASEMASK);
 148                 int32_t input = ptr_input[j];
 149                 if (input <= xbase) {
 150                     ptr_output[j] = ptr_segment[0].yBase;
 151                 } else {
 152                     uint32_t slope_shift;
 153                     int16_t slope, ybase;
 154                     int64_t diff, prod, prod_shift, sum;
 155                     uint32_t k = num_segments / 2;
 156                     uint32_t k_upper = num_segments;
 157                     uint32_t k_lower = 0;
 158                     while (k_upper > k_lower + 1) {
 159                         xbase = (int32_t) (ptr_segment[k].xBase & XBASEMASK);
 160                         if (xbase > input) {
 161                             k_upper = k;
 162                             k = (k + k_lower) / 2;
 163                         } else {
 164                             k_lower = k;
 165                             k = (k_upper + k) / 2;
 166                         }
 167                     }
 168                     xbase = (int32_t) (ptr_segment[k].xBase & XBASEMASK);
 169                     slope_shift = ((ptr_segment[k].xBase & ~XBASEMASK) + 1) * 8;
 170                     slope = ptr_segment[k].slope;
 171                     ybase = ptr_segment[k].yBase;
 172                     diff = (int64_t) input - (int64_t) xbase;
 173                     prod = diff * slope;
 174                     prod_shift = prod >> slope_shift;
 175                     sum = prod_shift + (int64_t) ybase;
 176                     if (sum > 32767LL) {
 177                         ptr_output[j] = 32767;
 178                         num_saturate++;
 179                     } else if (sum < -32768LL) {
 180                         ptr_output[j] = -32768;
 181                         num_saturate++;
 182                     } else {
 183                         ptr_output[j] = (int16_t) sum;
 184                     }
 185                 }
 186             }
 187         }
 188     }
 189
 190     if (num_saturate > 0) {
 191         fprintf(stderr, "Warning:  %d saturations in PwlApply16!\n", num_saturate);
 192     }
 193 }
 194
 195 void PwlApply32(intel_dnn_component_t *component, uint32_t num_subset_size) {
 196     if (component->orientation_in == kDnnInterleavedOrientation) {  // subsets only supported in interleaved orientation
 197         PwlApply32(component, 0, num_subset_size - 1, 0, component->num_columns_in - 1);
 198     } else {
 199         PwlApply32(component, 0, component->num_rows_in - 1, 0, component->num_columns_in - 1);
 200     }
 201 }
 202
 203 void PwlApply32(intel_dnn_component_t *component,
 204                 uint32_t num_row_start,
 205                 uint32_t num_row_end,
 206                 uint32_t num_col_start,
 207                 uint32_t num_col_end) {
 208     intel_piecewiselinear_t *transform = reinterpret_cast<intel_piecewiselinear_t *>(&component->op.pwl);
 209     float *ptr_in = reinterpret_cast<float *>(component->ptr_inputs);
 210     float *ptr_out = reinterpret_cast<float *>(component->ptr_outputs);
 211     uint32_t num_columns = component->num_columns_in;
 212     switch (transform->func_id.type) {
 213         case kActSigmoid:
 214             for (uint32_t i = num_row_start; i <= num_row_end; i++) {
 215                 for (uint32_t j = num_col_start; j <= num_col_end; j++) {
 216                     ptr_out[i * num_columns + j] = 0.5 * (1.0 + tanh(0.5 * ptr_in[i * num_columns + j]));
 217                 }
 218             }
 219             break;
 220         case kActTanh:
 221             for (uint32_t i = num_row_start; i <= num_row_end; i++) {
 222                 for (uint32_t j = num_col_start; j <= num_col_end; j++) {
 223                     ptr_out[i * num_columns + j] = tanh(ptr_in[i * num_columns + j]);
 224                 }
 225             }
 226             break;
 227         case kActRelu:
 228             for (uint32_t i = num_row_start; i <= num_row_end; i++) {
 229                 for (uint32_t j = num_col_start; j <= num_col_end; j++) {
 230                     ptr_out[i * num_columns + j] =
 231                         (ptr_in[i * num_columns + j] < 0.0f) ? ptr_in[i * num_columns + j] * transform->func_id.negative_slope : ptr_in[i * num_columns + j];
 232                 }
 233             }
 234             break;
 235         case kActIdentity:
 236             for (uint32_t i = num_row_start; i <= num_row_end; i++) {
 237                 for (uint32_t j = num_col_start; j <= num_col_end; j++) {
 238                     ptr_out[i * num_columns + j] = ptr_in[i * num_columns + j];
 239                 }
 240             }
 241             break;
 242         case kActKaldiLstmClipping:
 243             for (uint32_t i = num_row_start; i <= num_row_end; i++) {
 244                 for (uint32_t j = num_col_start; j <= num_col_end; j++) {
 245                     float val = ptr_in[i * num_columns + j];
 246                     if (val > KALDI_LSTM_CLIP_UPPER) {
 247                         ptr_out[i * num_columns + j] = KALDI_LSTM_CLIP_UPPER;
 248                     } else if (val < KALDI_LSTM_CLIP_LOWER) {
 249                         ptr_out[i * num_columns + j] = KALDI_LSTM_CLIP_LOWER;
 250                     } else {
 251                         ptr_out[i * num_columns + j] = val;
 252                     }
 253                 }
 254             }
 255             break;
 256         case kActCustom:
 257             // break;
 258         default:fprintf(stderr, "Unknown piecewise linear function type!\n");
 259             throw -1;
 260     }
 261 }
 262
 263 #ifdef __cplusplus
 264 extern "C" {  // API uses C linkage so that it can be used by C and C++ applications
 265 #endif
 266
 267 #ifdef _NO_MKL_
 268 void cblas_sgemm1(const CBLAS_LAYOUT Layout, const CBLAS_TRANSPOSE TransA,
 269                   const CBLAS_TRANSPOSE TransB, const MKL_INT M, const MKL_INT N,
 270                   const MKL_INT K, const float alpha, const float *A,
 271                   const MKL_INT lda, const float *B, const MKL_INT ldb,
 272                   const float beta, float *C, const MKL_INT ldc) {
 273     int i, j, k;
 274
 275     if (Layout != CblasRowMajor) {
 276         fprintf(stderr, "Only row major is supported in cblas_sgemm!\n");
 277         throw -1;
 278     }
 279
 280     if ((TransA == CblasNoTrans) && (TransB == CblasNoTrans)) {
 281         for (i = 0; i < M; i++) {
 282             for (j = 0; j < N; j++) {
 283                 float sum = (beta == 1.0) ? C[i * ldc + j] : 0;
 284                 for (k = 0; k < K; k++) {
 285                     sum += A[i * lda + k] * B[k * ldb + j];
 286                 }
 287                 C[i * ldc + j] = sum;
 288             }
 289         }
 290     } else if ((TransA == CblasNoTrans) && (TransB == CblasTrans)) {
 291         for (i = 0; i < M; i++) {
 292             for (j = 0; j < N; j++) {
 293                 float sum;
 294                 sum = beta * C[i * ldc + j];
 295                 for (k = 0; k < K; k++) {
 296                     sum += alpha * A[i * lda + k] * B[j * ldb + k];
 297                 }
 298                 C[i * ldc + j] = sum;
 299             }
 300         }
 301     } else if ((TransA == CblasTrans) && (TransB == CblasNoTrans)) {
 302         for (i = 0; i < M; i++) {
 303             for (j = 0; j < N; j++) {
 304                 float sum = (beta == 1.0) ? C[i * ldc + j] : 0;
 305                 for (k = 0; k < K; k++) {
 306                     sum += A[k * lda + i] * B[k * ldb + j];
 307                 }
 308                 C[i * ldc + j] = sum;
 309             }
 310         }
 311     } else {
 312         fprintf(stderr, "Expected A not transposed in cblas_sgemm!\n");
 313         throw -1;
 314     }
 315 }
 316 void cblas_ssbmv1(const CBLAS_LAYOUT Layout, const CBLAS_UPLO Uplo,
 317                   const MKL_INT N, const MKL_INT K, const float alpha, const float *A,
 318                   const MKL_INT lda, const float *X, const MKL_INT incX,
 319                   const float beta, float *Y, const MKL_INT incY) {
 320     int i, j, k;
 321
 322     if (Layout != CblasRowMajor) {
 323         fprintf(stderr, "Only row major is supported in cblas_ssbmv!\n");
 324         throw -1;
 325     }
 326     if (Uplo != CblasLower) {
 327         fprintf(stderr, "Only lower format is supported in cblas_ssbmv!\n");
 328         throw -1;
 329     }
 330     if (K != 0) {
 331         fprintf(stderr, "Only diagonal matrices supported in cblas_ssbmv at this time!\n");
 332         throw -1;
 333     }
 334     if ((alpha == 1.0) && (beta == 1.0) && (incX == 1) && (incY == 1)) {
 335         for (i = 0; i < N; i++) {
 336             Y[i] += A[i] * X[i];
 337         }
 338     } else {
 339         fprintf(stderr, "Only alpha=1, beta=1, incX=1, incY=1, LDA=1 supported in cblas_ssbmv at this time!\n");
 340         throw -1;
 341     }
 342 }
 343 #endif  // #ifdef _NO_MKL_
 344
 345 void cblas_sgemm_subset(const CBLAS_LAYOUT Layout, const CBLAS_TRANSPOSE TransA,
 346                         const CBLAS_TRANSPOSE TransB, const MKL_INT M, const MKL_INT N,
 347                         const MKL_INT K, const float alpha, const float *A,
 348                         const MKL_INT lda, const float *B, const MKL_INT ldb,
 349                         const float beta, float *C, const MKL_INT ldc,
 350                         const uint32_t *OutputList, const MKL_INT L) {
 351     int i, j, k, l;
 352
 353     if (Layout != CblasRowMajor) {
 354         fprintf(stderr, "Only row major is supported in cblas_sgemm_subset!\n");
 355         throw -1;
 356     }
 357
 358     if ((TransA == CblasNoTrans) && (TransB == CblasNoTrans)) {
 359         for (l = 0; l < L; l++) {
 360             i = OutputList[l];
 361             for (j = 0; j < N; j++) {
 362                 float sum = (beta == 1.0) ? C[l * ldc + j] : 0;
 363                 for (k = 0; k < K; k++) {
 364                     sum += A[i * lda + k] * B[k * ldb + j];
 365                 }
 366                 C[l * ldc + j] = sum;
 367             }
 368         }
 369     } else if ((TransA == CblasNoTrans) && (TransB == CblasTrans)) {
 370         for (i = 0; i < M; i++) {
 371             for (l = 0; l < L; l++) {
 372                 float sum;
 373                 j = OutputList[l];
 374                 sum = beta * C[i * ldc + l];
 375                 for (k = 0; k < K; k++) {
 376                     sum += alpha * A[i * lda + k] * B[j * ldb + k];
 377                 }
 378                 C[i * ldc + l] = sum;
 379             }
 380         }
 381     } else if ((TransA == CblasTrans) && (TransB == CblasNoTrans)) {
 382         for (l = 0; l < L; l++) {
 383             i = OutputList[l];
 384             for (j = 0; j < N; j++) {
 385                 float sum = (beta == 1.0) ? C[l * ldc + j] : 0;
 386                 for (k = 0; k < K; k++) {
 387                     sum += A[k * lda + i] * B[k * ldb + j];
 388                 }
 389                 C[l * ldc + j] = sum;
 390             }
 391         }
 392     } else {
 393         fprintf(stderr, "Expected A not transposed in cblas_sgemm_subset!\n");
 394         throw -1;
 395     }
 396 }
 397
 398 // C = [ A1 A2 ] * X + B
 399 void sgemv_split(const uint32_t N,
 400                  const uint32_t K1,
 401                  const uint32_t K2,
 402                  const float *A1,
 403                  const float *A2,
 404                  const float *X,
 405                  const float *B,
 406                  float *C) {
 407     uint32_t num_columns = K1 + K2;
 408     uint32_t num_rows = N;
 409     uint32_t i, j;
 410
 411     for (i = 0; i < num_rows; i++) {
 412         float sum = B[i];
 413         for (j = 0; j < K1; j++) {
 414             sum += A1[j] * X[i * num_columns + j];
 415         }
 416         for (j = K1; j < num_columns; j++) {
 417             sum += A2[j - K1] * X[i * num_columns + j];
 418         }
 419         C[i] = sum;
 420     }
 421 }
 422
 423 #ifdef __cplusplus
 424 }  // end extern "C"
 425 #endif