// Copyright (C) 2018-2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#include "gna_plugin_log.hpp"
11 void CNNFilter32(intel_dnn_component_t *component) {
12 float *ptr_filters = reinterpret_cast<float *>(component->op.conv1D.ptr_filters);
13 float *ptr_biases = reinterpret_cast<float *>(component->op.conv1D.ptr_biases);
14 float *ptr_inputs = reinterpret_cast<float *>(component->ptr_inputs);
15 float *ptr_outputs = reinterpret_cast<float *>(component->ptr_outputs);
16 uint32_t num_group = component->num_rows_in;
17 uint32_t num_filter_outputs = component->op.conv1D.num_feature_map_rows - component->op.conv1D.num_filter_rows + 1;
19 num_inputs_band_stride = component->op.conv1D.num_feature_maps * component->op.conv1D.num_feature_map_columns;
20 uint32_t num_filter_coefficients = component->op.conv1D.num_filter_coefficients;
22 if ((component->num_rows_in != 1) || (component->num_rows_out != 1)
23 || (component->num_columns_out != num_filter_outputs * component->op.conv1D.num_filters)) {
24 THROW_GNA_EXCEPTION << "Bad problem dimensions in CNNFilter32!";
27 for (uint32_t j = 0; j < num_filter_outputs; j++) {
28 float *ptr_in = ptr_inputs + j * num_inputs_band_stride;
29 for (uint32_t i = 0; i < component->op.conv1D.num_filters; i++) {
30 float *ptr_coef = ptr_filters + i * num_filter_coefficients;
31 float sum = ptr_biases[i];
32 for (uint32_t k = 0; k < num_filter_coefficients; k++) {
33 sum += ptr_in[k] * ptr_coef[k];
35 ptr_outputs[j * component->op.conv1D.num_filters + i] = sum;
40 void CNNMaxPool(intel_dnn_component_t *component, intel_dnn_number_type_t number_type) {
41 if (number_type == kDnnInt) {
42 int32_t *ptr_inputs = reinterpret_cast<int32_t *>(component->ptr_inputs);
43 int32_t *ptr_outputs = reinterpret_cast<int32_t *>(component->ptr_outputs);
44 uint32_t num_inputs = component->num_columns_in;
45 uint32_t num_columns = component->op.maxpool.num_inputs_stride;
46 uint32_t num_pool_size = component->op.maxpool.num_inputs;
47 uint32_t num_pool_step = component->op.maxpool.num_inputs_step;
48 uint32_t num_rows_in = num_inputs / component->op.maxpool.num_inputs_stride;
49 uint32_t num_rows_out = num_rows_in / num_pool_step;
51 for (uint32_t i = 0; i < num_columns; i++) {
53 if (component->op.maxpool.do_sum_not_max) {
54 uint32_t num_saturate = 0;
55 for (uint32_t j = 0; j < num_rows_in; j += num_pool_step) {
57 uint32_t num_end = (j + num_pool_size > num_rows_in) ? num_rows_in : j + num_pool_size;
58 for (uint32_t k = j; k < num_end; k++) {
59 sum += ptr_inputs[k * num_columns + i];
61 if (sum > 2147483647.0) {
62 ptr_outputs[m * num_columns + i] = 2147483647L;
64 } else if (sum < -2147483648.0) {
65 ptr_outputs[m * num_columns + i] = -2147483648L;
68 ptr_outputs[m * num_columns + i] = (int32_t) sum;
72 if (num_saturate > 0) {
73 fprintf(stderr, "Warning: %d saturations in CNNMaxPool()\n", num_saturate);
76 for (uint32_t j = 0; j < num_rows_in; j += num_pool_step) {
77 int32_t max = INT32_MIN;
78 uint32_t num_end = (j + num_pool_size > num_rows_in) ? num_rows_in : j + num_pool_size;
79 for (uint32_t k = j; k < num_end; k++) {
80 if (ptr_inputs[k * num_columns + i] > max) max = ptr_inputs[k * num_columns + i];
82 ptr_outputs[m * num_columns + i] = max;
88 float *ptr_inputs = reinterpret_cast<float *>(component->ptr_inputs);
89 float *ptr_outputs = reinterpret_cast<float *>(component->ptr_outputs);
90 uint32_t num_inputs = component->num_columns_in;
91 uint32_t num_columns = component->op.maxpool.num_inputs_stride;
92 uint32_t num_pool_size = component->op.maxpool.num_inputs;
93 uint32_t num_pool_step = component->op.maxpool.num_inputs_step;
94 uint32_t num_rows_in = num_inputs / component->op.maxpool.num_inputs_stride;
95 uint32_t num_rows_out = num_rows_in / num_pool_step;
97 for (uint32_t i = 0; i < num_columns; i++) {
99 if (component->op.maxpool.do_sum_not_max) {
100 for (uint32_t j = 0; j < num_rows_in; j += num_pool_step) {
102 uint32_t num_end = (j + num_pool_size > num_rows_in) ? num_rows_in : j + num_pool_size;
103 for (uint32_t k = j; k < num_end; k++) {
104 sum += ptr_inputs[k * num_columns + i];
106 ptr_outputs[m * num_columns + i] = sum;
110 for (uint32_t j = 0; j < num_rows_in; j += num_pool_step) {
112 uint32_t num_end = (j + num_pool_size > num_rows_in) ? num_rows_in : j + num_pool_size;
113 for (uint32_t k = j; k < num_end; k++) {
114 if (ptr_inputs[k * num_columns + i] > max) max = ptr_inputs[k * num_columns + i];
116 ptr_outputs[m * num_columns + i] = max;
124 void PwlApply16(intel_dnn_component_t *component, uint32_t num_subset_size) {
125 if (component->orientation_in == kDnnInterleavedOrientation) { // subsets only supported in interleaved orientation
126 PwlApply16(component, 0, num_subset_size - 1, 0, component->num_columns_in - 1);
128 PwlApply16(component, 0, component->num_rows_in - 1, 0, component->num_columns_in - 1);
132 void PwlApply16(intel_dnn_component_t *component,
133 uint32_t num_row_start,
134 uint32_t num_row_end,
135 uint32_t num_col_start,
136 uint32_t num_col_end) {
137 uint32_t num_saturate = 0;
138 uint32_t num_segments = component->op.pwl.num_segments;
139 if (num_segments > 0) {
140 intel_pwl_segment_t *ptr_segment = component->op.pwl.ptr_segments;
141 for (int i = num_row_start; i <= num_row_end; i++) {
142 int32_t *ptr_input = reinterpret_cast<int32_t *>(component->ptr_inputs) + i * component->num_columns_in;
143 int16_t *ptr_output = reinterpret_cast<int16_t *>(component->ptr_outputs) + i * component->num_columns_in;
144 for (int j = num_col_start; j <= num_col_end; j++) {
145 int32_t xbase = (int32_t) (ptr_segment[0].xBase & XBASEMASK);
146 int32_t input = ptr_input[j];
147 if (input <= xbase) {
148 ptr_output[j] = ptr_segment[0].yBase;
150 uint32_t slope_shift;
151 int16_t slope, ybase;
152 int64_t diff, prod, prod_shift, sum;
153 uint32_t k = num_segments / 2;
154 uint32_t k_upper = num_segments;
155 uint32_t k_lower = 0;
156 while (k_upper > k_lower + 1) {
157 xbase = (int32_t) (ptr_segment[k].xBase & XBASEMASK);
160 k = (k + k_lower) / 2;
163 k = (k_upper + k) / 2;
166 xbase = (int32_t) (ptr_segment[k].xBase & XBASEMASK);
167 slope_shift = ((ptr_segment[k].xBase & ~XBASEMASK) + 1) * 8;
168 slope = ptr_segment[k].slope;
169 ybase = ptr_segment[k].yBase;
170 diff = (int64_t) input - (int64_t) xbase;
172 prod_shift = prod >> slope_shift;
173 sum = prod_shift + (int64_t) ybase;
175 ptr_output[j] = 32767;
177 } else if (sum < -32768LL) {
178 ptr_output[j] = -32768;
181 ptr_output[j] = (int16_t) sum;
188 if (num_saturate > 0) {
189 fprintf(stderr, "Warning: %d saturations in PwlApply16!\n", num_saturate);
193 void PwlApply32(intel_dnn_component_t *component, uint32_t num_subset_size) {
194 if (component->orientation_in == kDnnInterleavedOrientation) { // subsets only supported in interleaved orientation
195 PwlApply32(component, 0, num_subset_size - 1, 0, component->num_columns_in - 1);
197 PwlApply32(component, 0, component->num_rows_in - 1, 0, component->num_columns_in - 1);
201 void PwlApply32(intel_dnn_component_t *component,
202 uint32_t num_row_start,
203 uint32_t num_row_end,
204 uint32_t num_col_start,
205 uint32_t num_col_end) {
206 intel_piecewiselinear_t *transform = reinterpret_cast<intel_piecewiselinear_t *>(&component->op.pwl);
207 float *ptr_in = reinterpret_cast<float *>(component->ptr_inputs);
208 float *ptr_out = reinterpret_cast<float *>(component->ptr_outputs);
209 uint32_t num_columns = component->num_columns_in;
210 switch (transform->func_id.type) {
212 for (uint32_t i = num_row_start; i <= num_row_end; i++) {
213 for (uint32_t j = num_col_start; j <= num_col_end; j++) {
214 ptr_out[i * num_columns + j] = 0.5 * (1.0 + tanh(0.5 * ptr_in[i * num_columns + j]));
219 for (uint32_t i = num_row_start; i <= num_row_end; i++) {
220 for (uint32_t j = num_col_start; j <= num_col_end; j++) {
221 ptr_out[i * num_columns + j] = tanh(ptr_in[i * num_columns + j]);
226 for (uint32_t i = num_row_start; i <= num_row_end; i++) {
227 for (uint32_t j = num_col_start; j <= num_col_end; j++) {
228 ptr_out[i * num_columns + j] =
229 (ptr_in[i * num_columns + j] < 0.0f) ? ptr_in[i * num_columns + j] * transform->func_id.negative_slope : ptr_in[i * num_columns + j];
234 for (uint32_t i = num_row_start; i <= num_row_end; i++) {
235 for (uint32_t j = num_col_start; j <= num_col_end; j++) {
236 ptr_out[i * num_columns + j] = ptr_in[i * num_columns + j];
240 case kActKaldiLstmClipping:
241 for (uint32_t i = num_row_start; i <= num_row_end; i++) {
242 for (uint32_t j = num_col_start; j <= num_col_end; j++) {
243 float val = ptr_in[i * num_columns + j];
244 if (val > KALDI_LSTM_CLIP_UPPER) {
245 ptr_out[i * num_columns + j] = KALDI_LSTM_CLIP_UPPER;
246 } else if (val < KALDI_LSTM_CLIP_LOWER) {
247 ptr_out[i * num_columns + j] = KALDI_LSTM_CLIP_LOWER;
249 ptr_out[i * num_columns + j] = val;
256 default:fprintf(stderr, "Unknown piecewise linear function type!\n");
262 extern "C" { // API uses C linkage so that it can be used by C and C++ applications
266 void cblas_sgemm1(const CBLAS_LAYOUT Layout, const CBLAS_TRANSPOSE TransA,
267 const CBLAS_TRANSPOSE TransB, const MKL_INT M, const MKL_INT N,
268 const MKL_INT K, const float alpha, const float *A,
269 const MKL_INT lda, const float *B, const MKL_INT ldb,
270 const float beta, float *C, const MKL_INT ldc) {
273 if (Layout != CblasRowMajor) {
274 fprintf(stderr, "Only row major is supported in cblas_sgemm!\n");
278 if ((TransA == CblasNoTrans) && (TransB == CblasNoTrans)) {
279 for (i = 0; i < M; i++) {
280 for (j = 0; j < N; j++) {
281 float sum = (beta == 1.0) ? C[i * ldc + j] : 0;
282 for (k = 0; k < K; k++) {
283 sum += A[i * lda + k] * B[k * ldb + j];
285 C[i * ldc + j] = sum;
288 } else if ((TransA == CblasNoTrans) && (TransB == CblasTrans)) {
289 for (i = 0; i < M; i++) {
290 for (j = 0; j < N; j++) {
292 sum = beta * C[i * ldc + j];
293 for (k = 0; k < K; k++) {
294 sum += alpha * A[i * lda + k] * B[j * ldb + k];
296 C[i * ldc + j] = sum;
299 } else if ((TransA == CblasTrans) && (TransB == CblasNoTrans)) {
300 for (i = 0; i < M; i++) {
301 for (j = 0; j < N; j++) {
302 float sum = (beta == 1.0) ? C[i * ldc + j] : 0;
303 for (k = 0; k < K; k++) {
304 sum += A[k * lda + i] * B[k * ldb + j];
306 C[i * ldc + j] = sum;
310 fprintf(stderr, "Expected A not transposed in cblas_sgemm!\n");
314 void cblas_ssbmv1(const CBLAS_LAYOUT Layout, const CBLAS_UPLO Uplo,
315 const MKL_INT N, const MKL_INT K, const float alpha, const float *A,
316 const MKL_INT lda, const float *X, const MKL_INT incX,
317 const float beta, float *Y, const MKL_INT incY) {
320 if (Layout != CblasRowMajor) {
321 fprintf(stderr, "Only row major is supported in cblas_ssbmv!\n");
324 if (Uplo != CblasLower) {
325 fprintf(stderr, "Only lower format is supported in cblas_ssbmv!\n");
329 fprintf(stderr, "Only diagonal matrices supported in cblas_ssbmv at this time!\n");
332 if ((alpha == 1.0) && (beta == 1.0) && (incX == 1) && (incY == 1)) {
333 for (i = 0; i < N; i++) {
337 fprintf(stderr, "Only alpha=1, beta=1, incX=1, incY=1, LDA=1 supported in cblas_ssbmv at this time!\n");
341 #endif // #ifdef _NO_MKL_
343 void cblas_sgemm_subset(const CBLAS_LAYOUT Layout, const CBLAS_TRANSPOSE TransA,
344 const CBLAS_TRANSPOSE TransB, const MKL_INT M, const MKL_INT N,
345 const MKL_INT K, const float alpha, const float *A,
346 const MKL_INT lda, const float *B, const MKL_INT ldb,
347 const float beta, float *C, const MKL_INT ldc,
348 const uint32_t *OutputList, const MKL_INT L) {
351 if (Layout != CblasRowMajor) {
352 fprintf(stderr, "Only row major is supported in cblas_sgemm_subset!\n");
356 if ((TransA == CblasNoTrans) && (TransB == CblasNoTrans)) {
357 for (l = 0; l < L; l++) {
359 for (j = 0; j < N; j++) {
360 float sum = (beta == 1.0) ? C[l * ldc + j] : 0;
361 for (k = 0; k < K; k++) {
362 sum += A[i * lda + k] * B[k * ldb + j];
364 C[l * ldc + j] = sum;
367 } else if ((TransA == CblasNoTrans) && (TransB == CblasTrans)) {
368 for (i = 0; i < M; i++) {
369 for (l = 0; l < L; l++) {
372 sum = beta * C[i * ldc + l];
373 for (k = 0; k < K; k++) {
374 sum += alpha * A[i * lda + k] * B[j * ldb + k];
376 C[i * ldc + l] = sum;
379 } else if ((TransA == CblasTrans) && (TransB == CblasNoTrans)) {
380 for (l = 0; l < L; l++) {
382 for (j = 0; j < N; j++) {
383 float sum = (beta == 1.0) ? C[l * ldc + j] : 0;
384 for (k = 0; k < K; k++) {
385 sum += A[k * lda + i] * B[k * ldb + j];
387 C[l * ldc + j] = sum;
391 fprintf(stderr, "Expected A not transposed in cblas_sgemm_subset!\n");
396 // C = [ A1 A2 ] * X + B
397 void sgemv_split(const uint32_t N,
405 uint32_t num_columns = K1 + K2;
406 uint32_t num_rows = N;
409 for (i = 0; i < num_rows; i++) {
411 for (j = 0; j < K1; j++) {
412 sum += A1[j] * X[i * num_columns + j];
414 for (j = K1; j < num_columns; j++) {
415 sum += A2[j - K1] * X[i * num_columns + j];