1 // Copyright (C) 2018-2019 Intel Corporation
2 // SPDX-License-Identifier: Apache-2.0
4 // floatmath.cpp : unoptimized floating point math routines (for reference)
9 #include "gna_plugin_log.hpp"
13 void CNNFilter32(intel_dnn_component_t *component) {
14 float *ptr_filters = reinterpret_cast<float *>(component->op.conv1D.ptr_filters);
15 float *ptr_biases = reinterpret_cast<float *>(component->op.conv1D.ptr_biases);
16 float *ptr_inputs = reinterpret_cast<float *>(component->ptr_inputs);
17 float *ptr_outputs = reinterpret_cast<float *>(component->ptr_outputs);
18 uint32_t num_group = component->num_rows_in;
19 uint32_t num_filter_outputs = component->op.conv1D.num_feature_map_rows - component->op.conv1D.num_filter_rows + 1;
21 num_inputs_band_stride = component->op.conv1D.num_feature_maps * component->op.conv1D.num_feature_map_columns;
22 uint32_t num_filter_coefficients = component->op.conv1D.num_filter_coefficients;
24 if ((component->num_rows_in != 1) || (component->num_rows_out != 1)
25 || (component->num_columns_out != num_filter_outputs * component->op.conv1D.num_filters)) {
26 THROW_GNA_EXCEPTION << "Bad problem dimensions in CNNFilter32!";
29 for (uint32_t j = 0; j < num_filter_outputs; j++) {
30 float *ptr_in = ptr_inputs + j * num_inputs_band_stride;
31 for (uint32_t i = 0; i < component->op.conv1D.num_filters; i++) {
32 float *ptr_coef = ptr_filters + i * num_filter_coefficients;
33 float sum = ptr_biases[i];
34 for (uint32_t k = 0; k < num_filter_coefficients; k++) {
35 sum += ptr_in[k] * ptr_coef[k];
37 ptr_outputs[j * component->op.conv1D.num_filters + i] = sum;
42 void CNNMaxPool(intel_dnn_component_t *component, intel_dnn_number_type_t number_type) {
43 if (number_type == kDnnInt) {
44 int32_t *ptr_inputs = reinterpret_cast<int32_t *>(component->ptr_inputs);
45 int32_t *ptr_outputs = reinterpret_cast<int32_t *>(component->ptr_outputs);
46 uint32_t num_inputs = component->num_columns_in;
47 uint32_t num_columns = component->op.maxpool.num_inputs_stride;
48 uint32_t num_pool_size = component->op.maxpool.num_inputs;
49 uint32_t num_pool_step = component->op.maxpool.num_inputs_step;
50 uint32_t num_rows_in = num_inputs / component->op.maxpool.num_inputs_stride;
51 uint32_t num_rows_out = num_rows_in / num_pool_step;
53 for (uint32_t i = 0; i < num_columns; i++) {
55 if (component->op.maxpool.do_sum_not_max) {
56 uint32_t num_saturate = 0;
57 for (uint32_t j = 0; j < num_rows_in; j += num_pool_step) {
59 uint32_t num_end = (j + num_pool_size > num_rows_in) ? num_rows_in : j + num_pool_size;
60 for (uint32_t k = j; k < num_end; k++) {
61 sum += ptr_inputs[k * num_columns + i];
63 if (sum > 2147483647.0) {
64 ptr_outputs[m * num_columns + i] = 2147483647L;
66 } else if (sum < -2147483648.0) {
67 ptr_outputs[m * num_columns + i] = -2147483648L;
70 ptr_outputs[m * num_columns + i] = (int32_t) sum;
74 if (num_saturate > 0) {
75 fprintf(stderr, "Warning: %d saturations in CNNMaxPool()\n", num_saturate);
78 for (uint32_t j = 0; j < num_rows_in; j += num_pool_step) {
79 int32_t max = INT32_MIN;
80 uint32_t num_end = (j + num_pool_size > num_rows_in) ? num_rows_in : j + num_pool_size;
81 for (uint32_t k = j; k < num_end; k++) {
82 if (ptr_inputs[k * num_columns + i] > max) max = ptr_inputs[k * num_columns + i];
84 ptr_outputs[m * num_columns + i] = max;
90 float *ptr_inputs = reinterpret_cast<float *>(component->ptr_inputs);
91 float *ptr_outputs = reinterpret_cast<float *>(component->ptr_outputs);
92 uint32_t num_inputs = component->num_columns_in;
93 uint32_t num_columns = component->op.maxpool.num_inputs_stride;
94 uint32_t num_pool_size = component->op.maxpool.num_inputs;
95 uint32_t num_pool_step = component->op.maxpool.num_inputs_step;
96 uint32_t num_rows_in = num_inputs / component->op.maxpool.num_inputs_stride;
97 uint32_t num_rows_out = num_rows_in / num_pool_step;
99 for (uint32_t i = 0; i < num_columns; i++) {
101 if (component->op.maxpool.do_sum_not_max) {
102 for (uint32_t j = 0; j < num_rows_in; j += num_pool_step) {
104 uint32_t num_end = (j + num_pool_size > num_rows_in) ? num_rows_in : j + num_pool_size;
105 for (uint32_t k = j; k < num_end; k++) {
106 sum += ptr_inputs[k * num_columns + i];
108 ptr_outputs[m * num_columns + i] = sum;
112 for (uint32_t j = 0; j < num_rows_in; j += num_pool_step) {
114 uint32_t num_end = (j + num_pool_size > num_rows_in) ? num_rows_in : j + num_pool_size;
115 for (uint32_t k = j; k < num_end; k++) {
116 if (ptr_inputs[k * num_columns + i] > max) max = ptr_inputs[k * num_columns + i];
118 ptr_outputs[m * num_columns + i] = max;
126 void PwlApply16(intel_dnn_component_t *component, uint32_t num_subset_size) {
127 if (component->orientation_in == kDnnInterleavedOrientation) { // subsets only supported in interleaved orientation
128 PwlApply16(component, 0, num_subset_size - 1, 0, component->num_columns_in - 1);
130 PwlApply16(component, 0, component->num_rows_in - 1, 0, component->num_columns_in - 1);
134 void PwlApply16(intel_dnn_component_t *component,
135 uint32_t num_row_start,
136 uint32_t num_row_end,
137 uint32_t num_col_start,
138 uint32_t num_col_end) {
139 uint32_t num_saturate = 0;
140 uint32_t num_segments = component->op.pwl.num_segments;
141 if (num_segments > 0) {
142 intel_pwl_segment_t *ptr_segment = component->op.pwl.ptr_segments;
143 for (int i = num_row_start; i <= num_row_end; i++) {
144 int32_t *ptr_input = reinterpret_cast<int32_t *>(component->ptr_inputs) + i * component->num_columns_in;
145 int16_t *ptr_output = reinterpret_cast<int16_t *>(component->ptr_outputs) + i * component->num_columns_in;
146 for (int j = num_col_start; j <= num_col_end; j++) {
147 int32_t xbase = (int32_t) (ptr_segment[0].xBase & XBASEMASK);
148 int32_t input = ptr_input[j];
149 if (input <= xbase) {
150 ptr_output[j] = ptr_segment[0].yBase;
152 uint32_t slope_shift;
153 int16_t slope, ybase;
154 int64_t diff, prod, prod_shift, sum;
155 uint32_t k = num_segments / 2;
156 uint32_t k_upper = num_segments;
157 uint32_t k_lower = 0;
158 while (k_upper > k_lower + 1) {
159 xbase = (int32_t) (ptr_segment[k].xBase & XBASEMASK);
162 k = (k + k_lower) / 2;
165 k = (k_upper + k) / 2;
168 xbase = (int32_t) (ptr_segment[k].xBase & XBASEMASK);
169 slope_shift = ((ptr_segment[k].xBase & ~XBASEMASK) + 1) * 8;
170 slope = ptr_segment[k].slope;
171 ybase = ptr_segment[k].yBase;
172 diff = (int64_t) input - (int64_t) xbase;
174 prod_shift = prod >> slope_shift;
175 sum = prod_shift + (int64_t) ybase;
177 ptr_output[j] = 32767;
179 } else if (sum < -32768LL) {
180 ptr_output[j] = -32768;
183 ptr_output[j] = (int16_t) sum;
190 if (num_saturate > 0) {
191 fprintf(stderr, "Warning: %d saturations in PwlApply16!\n", num_saturate);
195 void PwlApply32(intel_dnn_component_t *component, uint32_t num_subset_size) {
196 if (component->orientation_in == kDnnInterleavedOrientation) { // subsets only supported in interleaved orientation
197 PwlApply32(component, 0, num_subset_size - 1, 0, component->num_columns_in - 1);
199 PwlApply32(component, 0, component->num_rows_in - 1, 0, component->num_columns_in - 1);
203 void PwlApply32(intel_dnn_component_t *component,
204 uint32_t num_row_start,
205 uint32_t num_row_end,
206 uint32_t num_col_start,
207 uint32_t num_col_end) {
208 intel_piecewiselinear_t *transform = reinterpret_cast<intel_piecewiselinear_t *>(&component->op.pwl);
209 float *ptr_in = reinterpret_cast<float *>(component->ptr_inputs);
210 float *ptr_out = reinterpret_cast<float *>(component->ptr_outputs);
211 uint32_t num_columns = component->num_columns_in;
212 switch (transform->func_id.type) {
214 for (uint32_t i = num_row_start; i <= num_row_end; i++) {
215 for (uint32_t j = num_col_start; j <= num_col_end; j++) {
216 ptr_out[i * num_columns + j] = 0.5 * (1.0 + tanh(0.5 * ptr_in[i * num_columns + j]));
221 for (uint32_t i = num_row_start; i <= num_row_end; i++) {
222 for (uint32_t j = num_col_start; j <= num_col_end; j++) {
223 ptr_out[i * num_columns + j] = tanh(ptr_in[i * num_columns + j]);
228 for (uint32_t i = num_row_start; i <= num_row_end; i++) {
229 for (uint32_t j = num_col_start; j <= num_col_end; j++) {
230 ptr_out[i * num_columns + j] =
231 (ptr_in[i * num_columns + j] < 0.0f) ? ptr_in[i * num_columns + j] * transform->func_id.negative_slope : ptr_in[i * num_columns + j];
236 for (uint32_t i = num_row_start; i <= num_row_end; i++) {
237 for (uint32_t j = num_col_start; j <= num_col_end; j++) {
238 ptr_out[i * num_columns + j] = ptr_in[i * num_columns + j];
242 case kActKaldiLstmClipping:
243 for (uint32_t i = num_row_start; i <= num_row_end; i++) {
244 for (uint32_t j = num_col_start; j <= num_col_end; j++) {
245 float val = ptr_in[i * num_columns + j];
246 if (val > KALDI_LSTM_CLIP_UPPER) {
247 ptr_out[i * num_columns + j] = KALDI_LSTM_CLIP_UPPER;
248 } else if (val < KALDI_LSTM_CLIP_LOWER) {
249 ptr_out[i * num_columns + j] = KALDI_LSTM_CLIP_LOWER;
251 ptr_out[i * num_columns + j] = val;
258 default:fprintf(stderr, "Unknown piecewise linear function type!\n");
264 extern "C" { // API uses C linkage so that it can be used by C and C++ applications
268 void cblas_sgemm1(const CBLAS_LAYOUT Layout, const CBLAS_TRANSPOSE TransA,
269 const CBLAS_TRANSPOSE TransB, const MKL_INT M, const MKL_INT N,
270 const MKL_INT K, const float alpha, const float *A,
271 const MKL_INT lda, const float *B, const MKL_INT ldb,
272 const float beta, float *C, const MKL_INT ldc) {
275 if (Layout != CblasRowMajor) {
276 fprintf(stderr, "Only row major is supported in cblas_sgemm!\n");
280 if ((TransA == CblasNoTrans) && (TransB == CblasNoTrans)) {
281 for (i = 0; i < M; i++) {
282 for (j = 0; j < N; j++) {
283 float sum = (beta == 1.0) ? C[i * ldc + j] : 0;
284 for (k = 0; k < K; k++) {
285 sum += A[i * lda + k] * B[k * ldb + j];
287 C[i * ldc + j] = sum;
290 } else if ((TransA == CblasNoTrans) && (TransB == CblasTrans)) {
291 for (i = 0; i < M; i++) {
292 for (j = 0; j < N; j++) {
294 sum = beta * C[i * ldc + j];
295 for (k = 0; k < K; k++) {
296 sum += alpha * A[i * lda + k] * B[j * ldb + k];
298 C[i * ldc + j] = sum;
301 } else if ((TransA == CblasTrans) && (TransB == CblasNoTrans)) {
302 for (i = 0; i < M; i++) {
303 for (j = 0; j < N; j++) {
304 float sum = (beta == 1.0) ? C[i * ldc + j] : 0;
305 for (k = 0; k < K; k++) {
306 sum += A[k * lda + i] * B[k * ldb + j];
308 C[i * ldc + j] = sum;
312 fprintf(stderr, "Expected A not transposed in cblas_sgemm!\n");
316 void cblas_ssbmv1(const CBLAS_LAYOUT Layout, const CBLAS_UPLO Uplo,
317 const MKL_INT N, const MKL_INT K, const float alpha, const float *A,
318 const MKL_INT lda, const float *X, const MKL_INT incX,
319 const float beta, float *Y, const MKL_INT incY) {
322 if (Layout != CblasRowMajor) {
323 fprintf(stderr, "Only row major is supported in cblas_ssbmv!\n");
326 if (Uplo != CblasLower) {
327 fprintf(stderr, "Only lower format is supported in cblas_ssbmv!\n");
331 fprintf(stderr, "Only diagonal matrices supported in cblas_ssbmv at this time!\n");
334 if ((alpha == 1.0) && (beta == 1.0) && (incX == 1) && (incY == 1)) {
335 for (i = 0; i < N; i++) {
339 fprintf(stderr, "Only alpha=1, beta=1, incX=1, incY=1, LDA=1 supported in cblas_ssbmv at this time!\n");
343 #endif // #ifdef _NO_MKL_
345 void cblas_sgemm_subset(const CBLAS_LAYOUT Layout, const CBLAS_TRANSPOSE TransA,
346 const CBLAS_TRANSPOSE TransB, const MKL_INT M, const MKL_INT N,
347 const MKL_INT K, const float alpha, const float *A,
348 const MKL_INT lda, const float *B, const MKL_INT ldb,
349 const float beta, float *C, const MKL_INT ldc,
350 const uint32_t *OutputList, const MKL_INT L) {
353 if (Layout != CblasRowMajor) {
354 fprintf(stderr, "Only row major is supported in cblas_sgemm_subset!\n");
358 if ((TransA == CblasNoTrans) && (TransB == CblasNoTrans)) {
359 for (l = 0; l < L; l++) {
361 for (j = 0; j < N; j++) {
362 float sum = (beta == 1.0) ? C[l * ldc + j] : 0;
363 for (k = 0; k < K; k++) {
364 sum += A[i * lda + k] * B[k * ldb + j];
366 C[l * ldc + j] = sum;
369 } else if ((TransA == CblasNoTrans) && (TransB == CblasTrans)) {
370 for (i = 0; i < M; i++) {
371 for (l = 0; l < L; l++) {
374 sum = beta * C[i * ldc + l];
375 for (k = 0; k < K; k++) {
376 sum += alpha * A[i * lda + k] * B[j * ldb + k];
378 C[i * ldc + l] = sum;
381 } else if ((TransA == CblasTrans) && (TransB == CblasNoTrans)) {
382 for (l = 0; l < L; l++) {
384 for (j = 0; j < N; j++) {
385 float sum = (beta == 1.0) ? C[l * ldc + j] : 0;
386 for (k = 0; k < K; k++) {
387 sum += A[k * lda + i] * B[k * ldb + j];
389 C[l * ldc + j] = sum;
393 fprintf(stderr, "Expected A not transposed in cblas_sgemm_subset!\n");
398 // C = [ A1 A2 ] * X + B
399 void sgemv_split(const uint32_t N,
407 uint32_t num_columns = K1 + K2;
408 uint32_t num_rows = N;
411 for (i = 0; i < num_rows; i++) {
413 for (j = 0; j < K1; j++) {
414 sum += A1[j] * X[i * num_columns + j];
416 for (j = K1; j < num_columns; j++) {
417 sum += A2[j - K1] * X[i * num_columns + j];