1 // Copyright (C) 2018-2019 Intel Corporation
2 // SPDX-License-Identifier: Apache-2.0
4 // dnn.cpp : component based neural network class for ease of use
6 extern bool global_debug;
12 #include <details/ie_exception.hpp>
14 #include <gna-api-types-xnn.h>
26 #include "floatmath.h"
30 #include "gna_plugin_log.hpp"
33 # define rand_r(X) rand()
37 * whether to dump weights and biases
41 * in light mode only layer names are dumped
// Per-run dump-folder counter; returns a mutable reference so callers can
// bump it. NOTE(review): function body not visible in this view — presumably
// a function-local static counter; confirm against the full file.
static int & getDumpFolderId() {
// Path of the GNA-hardware layer-dump folder for the current dump session
// (getDumpFolderId() - 1 == the id most recently handed out).
static std::string getDumpFolderNameGNA() {
    return std::string("./gna_layers/")+std::to_string(getDumpFolderId() - 1)+"/";
// Path of the software (reference-model) layer-dump folder for the current
// dump session.
static std::string getDumpFolderName() {
    return std::string("./layers/")+std::to_string(getDumpFolderId() - 1)+"/";
// Path of the reference-output layer-dump folder for the current dump session.
static std::string getRefFolderName() {
    return std::string("./ref_layers/")+std::to_string(getDumpFolderId() - 1)+"/";
// Start a new dump session. NOTE(review): body not visible in this view —
// presumably advances getDumpFolderId(); confirm against the full file.
void AmIntelDnn::BeginNewWrite() {
// (Re)initialize the network descriptor: record the externally allocated
// scratch memory region, numeric precision and input scale factor, and reset
// all per-run state (active list, context, rotation, softmax, sum-groups).
void AmIntelDnn::Init(void *ptr_memory,
                      uint32_t num_memory_bytes,
                      intel_dnn_number_type_t number_type,
                      // NOTE(review): trailing parameter(s) — the scale factor
                      // and the opening brace — are not visible in this view.
    ptr_dnn_memory_ = ptr_memory;        // externally owned scratch buffer, not freed here
    num_bytes_dnn_memory_ = num_memory_bytes;
    number_type_ = number_type;          // float vs. integer execution mode
    input_scale_factor_ = scale_factor;
    ptr_active_outputs_ = nullptr;       // no active-output subset installed yet
    num_active_outputs_ = 0;
    num_right_context = 0;
    do_rotate_input = false;
    softmax_type = kSoftmaxNone;
    ptr_sumgroup_sizes = nullptr;
    num_sumgroup_sizes = 0;
// Install an optional active-output list applied to the final component.
// nullptr means "all outputs active": the count is taken from the last
// component — rows for interleaved output orientation, columns otherwise.
void AmIntelDnn::InitActiveList(uint32_t *ptr_active_list) {
    ptr_active_outputs_ = ptr_active_list;
    if (ptr_active_list == nullptr) {
        if (component[component.size() - 1].orientation_out == kDnnInterleavedOrientation) {
            num_active_outputs_ = component[component.size() - 1].num_rows_out;
            // NOTE(review): the else branch separating the two assignments is
            // not visible in this view.
            num_active_outputs_ = component[component.size() - 1].num_columns_out;
            // Explicit list supplied: count is reset (filled in later, e.g. by
            // CopyActiveList).
            num_active_outputs_ = 0;
// Grow the component table by num_components_to_add and null-initialize each
// newly appended descriptor (iterating backwards from the end of the vector).
void AmIntelDnn::AddComponents(uint32_t num_components_to_add) {
    component.resize(component.size() + num_components_to_add);
    for (uint32_t i = 0; i < num_components_to_add; i++) {
        ClearComponent(component.size() - i - 1);
113 void AmIntelDnn::ClearComponent(uint32_t component_index) {
114 if (component_index > component.size() - 1) {
115 fprintf(stderr, "Error: attempt to clear non-existent component!\n");
118 component[component_index].num_rows_in = 0;
119 component[component_index].num_columns_in = 0;
120 component[component_index].num_rows_out = 0;
121 component[component_index].num_columns_out = 0;
122 component[component_index].num_bytes_per_input = 0;
123 component[component_index].num_bytes_per_output = 0;
124 component[component_index].operation = kDnnNullOp;
125 component[component_index].macro_operation = kDnnMacroOpNone;
126 component[component_index].orientation_in = kDnnUnknownOrientation;
127 component[component_index].orientation_out = kDnnUnknownOrientation;
128 component[component_index].ptr_inputs = nullptr;
129 component[component_index].ptr_outputs = nullptr;
130 memset(&component[component_index].op, 0, sizeof(component[component_index].op));
// Clear persistent run-time state between inputs (e.g. between utterances).
void AmIntelDnn::ClearState() {
    // To support recurrent networks, provide mechanism to clear persistent state
    // (e.g., between utterances for speech recognition). For recurrent component,
    // this means clearing the feedback buffer. For other components, just clear the
    // output buffer since any feedback will come from some component's output.
    for (uint32_t i = 0; i < component.size(); i++) {
        if (component[i].operation == kDnnRecurrentOp) {
            // Recurrent: zero the feedback buffer (num_vector_delay delayed
            // output vectors).
            memset(component[i].op.recurrent.ptr_feedbacks,
                   // NOTE(review): the fill-value argument (0) is not visible
                   // in this view.
                   component[i].op.recurrent.num_vector_delay * component[i].num_columns_out
                       * component[i].num_bytes_per_input);
            // Non-recurrent: zero the whole output buffer.
            memset(component[i].ptr_outputs,
                   component[i].num_bytes_per_output * component[i].num_rows_out * component[i].num_columns_out);
// Fill one component descriptor as an affine (or diagonal, if isDiag) layer:
// record dimensions, byte widths, scale factors, and wire up the weight/bias
// and I/O pointers. Affine components use interleaved orientation on both
// sides.
void AmIntelDnn::InitAffineComponentPrivate(intel_dnn_component_t &comp,
                                            uint32_t num_rows_in,
                                            uint32_t num_columns,
                                            uint32_t num_rows_out,
                                            uint32_t num_bytes_per_input,
                                            uint32_t num_bytes_per_output,
                                            uint32_t num_bytes_per_weight,
                                            uint32_t num_bytes_per_bias,
                                            float weight_scale_factor,
                                            float output_scale_factor,
                                            // NOTE(review): the pointer
                                            // parameters and opening brace are
                                            // not visible in this view.
    comp.num_rows_in = num_rows_in;
    comp.num_columns_in = num_columns;     // inputs and outputs share the column count
    comp.num_rows_out = num_rows_out;
    comp.num_columns_out = num_columns;
    comp.num_bytes_per_input = num_bytes_per_input;
    comp.num_bytes_per_output = num_bytes_per_output;
    comp.operation = isDiag ? kDnnDiagonalOp : kDnnAffineOp;
    comp.macro_operation = kDnnMacroOpNone;
    comp.orientation_in = kDnnInterleavedOrientation;
    comp.orientation_out = kDnnInterleavedOrientation;
    comp.op.affine.num_bytes_per_weight = num_bytes_per_weight;
    comp.op.affine.num_bytes_per_bias = num_bytes_per_bias;
    comp.op.affine.weight_scale_factor = weight_scale_factor;
    comp.output_scale_factor = output_scale_factor;
    comp.op.affine.ptr_weights = ptr_weights;
    comp.op.affine.ptr_biases = ptr_biases;
    comp.ptr_inputs = ptr_inputs;
    comp.ptr_outputs = ptr_outputs;
    // Write back the addresses of the component's pointer slots into the
    // caller-supplied pointer parameters — presumably pointer references so a
    // later memory-assignment pass can patch them; confirm the parameter
    // declarations in the full file.
    ptr_weights = &comp.op.affine.ptr_weights;
    ptr_biases = &comp.op.affine.ptr_biases;
    ptr_inputs = &comp.ptr_inputs;
    ptr_outputs = &comp.ptr_outputs;
// Fill the component at component_index as a diagonal (element-wise scaling)
// layer. Same field layout as the affine initializer but always kDnnDiagonalOp
// and no pointer write-back.
void AmIntelDnn::InitDiagonalComponent(uint32_t component_index,
                                       uint32_t num_rows_in,
                                       uint32_t num_columns,
                                       uint32_t num_rows_out,
                                       uint32_t num_bytes_per_input,
                                       uint32_t num_bytes_per_output,
                                       uint32_t num_bytes_per_weight,
                                       uint32_t num_bytes_per_bias,
                                       float weight_scale_factor,
                                       float output_scale_factor,
                                       // NOTE(review): pointer parameters and
                                       // opening brace not visible in this view.
    component[component_index].num_rows_in = num_rows_in;
    component[component_index].num_columns_in = num_columns;
    component[component_index].num_rows_out = num_rows_out;
    component[component_index].num_columns_out = num_columns;
    component[component_index].num_bytes_per_input = num_bytes_per_input;
    component[component_index].num_bytes_per_output = num_bytes_per_output;
    component[component_index].operation = kDnnDiagonalOp;
    component[component_index].macro_operation = kDnnMacroOpNone;
    component[component_index].orientation_in = kDnnInterleavedOrientation;
    component[component_index].orientation_out = kDnnInterleavedOrientation;
    component[component_index].ptr_inputs = ptr_inputs;
    component[component_index].ptr_outputs = ptr_outputs;
    component[component_index].op.affine.num_bytes_per_weight = num_bytes_per_weight;
    component[component_index].op.affine.num_bytes_per_bias = num_bytes_per_bias;
    component[component_index].op.affine.weight_scale_factor = weight_scale_factor;
    component[component_index].output_scale_factor = output_scale_factor;
    component[component_index].op.affine.ptr_weights = ptr_weights;
    component[component_index].op.affine.ptr_biases = ptr_biases;
// Fill one component descriptor as a 1-D convolution layer: I/O dimensions,
// filter bank geometry (filters, filter rows, coefficients, feature maps),
// scale factors, and filter/bias/I-O pointer wiring. Convolutions use
// non-interleaved orientation on both sides.
void AmIntelDnn::InitConvolutional1DComponentPrivate(intel_dnn_component_t &comp,
                                                     uint32_t num_rows_in,
                                                     uint32_t num_columns_in,
                                                     uint32_t num_rows_out,
                                                     uint32_t num_columns_out,
                                                     uint32_t num_bytes_per_input,
                                                     uint32_t num_bytes_per_output,
                                                     uint32_t num_bytes_per_weight,
                                                     uint32_t num_bytes_per_bias,
                                                     uint32_t num_filters,
                                                     uint32_t num_filter_rows,
                                                     uint32_t num_filter_coefficients,
                                                     uint32_t num_feature_maps,
                                                     uint32_t num_feature_map_rows,
                                                     uint32_t num_feature_map_columns,
                                                     float weight_scale_factor,
                                                     float output_scale_factor,
                                                     // NOTE(review): pointer
                                                     // parameters and opening
                                                     // brace not visible here.
    comp.num_rows_in = num_rows_in;
    comp.num_columns_in = num_columns_in;
    comp.num_rows_out = num_rows_out;
    comp.num_columns_out = num_columns_out;
    comp.num_bytes_per_input = num_bytes_per_input;
    comp.num_bytes_per_output = num_bytes_per_output;
    comp.operation = kDnnConvolutional1dOp;
    comp.macro_operation = kDnnMacroOpNone;
    comp.orientation_in = kDnnNonInterleavedOrientation;
    comp.orientation_out = kDnnNonInterleavedOrientation;
    comp.ptr_inputs = ptr_inputs;
    comp.ptr_outputs = ptr_outputs;
    comp.op.conv1D.num_bytes_per_weight = num_bytes_per_weight;
    comp.op.conv1D.num_bytes_per_bias = num_bytes_per_bias;
    comp.op.conv1D.num_filters = num_filters;
    comp.op.conv1D.num_filter_rows = num_filter_rows;
    comp.op.conv1D.num_filter_coefficients = num_filter_coefficients;
    comp.op.conv1D.num_feature_maps = num_feature_maps;
    comp.op.conv1D.num_feature_map_rows = num_feature_map_rows;
    comp.op.conv1D.num_feature_map_columns = num_feature_map_columns;
    comp.op.conv1D.weight_scale_factor = weight_scale_factor;
    comp.output_scale_factor = output_scale_factor;
    comp.op.conv1D.ptr_filters = ptr_filters;
    comp.op.conv1D.ptr_biases = ptr_biases;
    comp.ptr_inputs = ptr_inputs;
    comp.ptr_outputs = ptr_outputs;
    // Write back the component's pointer-slot addresses to the caller
    // (presumably pointer-reference parameters for a later patch pass —
    // confirm declarations in the full file).
    ptr_filters = &comp.op.conv1D.ptr_filters;
    ptr_biases = &comp.op.conv1D.ptr_biases;
    ptr_inputs = &comp.ptr_inputs;
    ptr_outputs = &comp.ptr_outputs;
// Fill one component descriptor as a max-pooling layer (or sum-pooling when
// do_sum_not_max is set): pool window size/step/stride plus I/O wiring.
void AmIntelDnn::InitMaxpoolComponentPrivate(intel_dnn_component_t &comp,
                                             uint32_t num_rows_in,
                                             uint32_t num_columns_in,
                                             uint32_t num_rows_out,
                                             uint32_t num_columns_out,
                                             uint32_t num_bytes_per_input,
                                             uint32_t num_bytes_per_output,
                                             uint32_t num_pool_size,
                                             uint32_t num_pool_step,
                                             uint32_t num_pool_stride,
                                             // NOTE(review): do_sum_not_max /
                                             // pointer parameters partially
                                             // not visible in this view.
                                             float output_scale_factor,
    comp.num_rows_in = num_rows_in;
    comp.num_columns_in = num_columns_in;
    comp.num_rows_out = num_rows_out;
    comp.num_columns_out = num_columns_out;
    comp.num_bytes_per_input = num_bytes_per_input;
    comp.num_bytes_per_output = num_bytes_per_output;
    comp.operation = kDnnMaxPoolOp;
    comp.macro_operation = kDnnMacroOpNone;
    comp.orientation_in = kDnnNonInterleavedOrientation;
    comp.orientation_out = kDnnNonInterleavedOrientation;
    comp.op.maxpool.num_inputs = num_pool_size;
    comp.op.maxpool.num_inputs_step = num_pool_step;
    comp.op.maxpool.num_inputs_stride = num_pool_stride;
    comp.op.maxpool.do_sum_not_max = do_sum_not_max;   // true => sum-pool instead of max-pool
    comp.output_scale_factor = output_scale_factor;
    comp.ptr_inputs = ptr_inputs;
    comp.ptr_outputs = ptr_outputs;
    // Write back pointer-slot addresses for the later memory-patch pass.
    ptr_inputs = &comp.ptr_inputs;
    ptr_outputs = &comp.ptr_outputs;
// Fill one component descriptor as a copy layer: copies a
// num_copy_rows x num_copy_columns sub-matrix from input to output, with the
// caller-specified orientation on both sides.
void AmIntelDnn::InitCopyComponentPrivate(intel_dnn_component_t &comp,
                                          intel_dnn_orientation_t orientation,
                                          uint32_t num_rows_in,
                                          uint32_t num_columns_in,
                                          uint32_t num_rows_out,
                                          uint32_t num_columns_out,
                                          uint32_t num_bytes_per_input,
                                          uint32_t num_bytes_per_output,
                                          float output_scale_factor,
                                          uint32_t num_copy_rows,
                                          uint32_t num_copy_columns,
                                          // NOTE(review): pointer parameters
                                          // and opening brace not visible here.
    comp.num_rows_in = num_rows_in;
    comp.num_columns_in = num_columns_in;
    comp.num_rows_out = num_rows_out;
    comp.num_columns_out = num_columns_out;
    comp.num_bytes_per_input = num_bytes_per_input;
    comp.num_bytes_per_output = num_bytes_per_output;
    comp.operation = kDnnCopyOp;
    comp.macro_operation = kDnnMacroOpNone;
    comp.orientation_in = orientation;
    comp.orientation_out = orientation;
    comp.ptr_inputs = ptr_inputs;
    comp.ptr_outputs = ptr_outputs;
    comp.output_scale_factor = output_scale_factor;
    comp.op.copy.num_copy_rows = num_copy_rows;
    comp.op.copy.num_copy_columns = num_copy_columns;
    // NOTE(review): the two assignments below repeat the ones above — likely
    // separated by a conditional (#ifdef or if) removed from this view.
    comp.ptr_inputs = ptr_inputs;
    comp.ptr_outputs = ptr_outputs;
    // Write back pointer-slot addresses for the later memory-patch pass.
    ptr_inputs = &comp.ptr_inputs;
    ptr_outputs = &comp.ptr_outputs;
// Fill one component descriptor as a piecewise-linear activation layer:
// activation function id, PWL segment table, and I/O wiring. Input and output
// share dimensions and orientation.
void AmIntelDnn::InitPiecewiseLinearComponentPrivate(intel_dnn_component_t &comp,
                                                     DnnActivation function_id,
                                                     intel_dnn_orientation_t orientation,
                                                     // NOTE(review): num_rows
                                                     // parameter not visible
                                                     // in this view.
                                                     uint32_t num_columns,
                                                     uint32_t num_bytes_per_input,
                                                     uint32_t num_bytes_per_output,
                                                     uint32_t num_segments,
                                                     float output_scale_factor,
                                                     intel_pwl_segment_t *ptr_segments,
    comp.num_rows_in = num_rows;
    comp.num_columns_in = num_columns;
    comp.num_rows_out = num_rows;          // activation preserves shape
    comp.num_columns_out = num_columns;
    comp.num_bytes_per_input = num_bytes_per_input;
    comp.num_bytes_per_output = num_bytes_per_output;
    comp.operation = kDnnPiecewiselinearOp;
    comp.macro_operation = kDnnMacroOpNone;
    comp.orientation_in = orientation;
    comp.orientation_out = orientation;
    comp.op.pwl.func_id = function_id;
    comp.op.pwl.num_segments = num_segments;
    comp.output_scale_factor = output_scale_factor;
    comp.ptr_inputs = ptr_inputs;
    comp.ptr_outputs = ptr_outputs;
    comp.op.pwl.ptr_segments = ptr_segments;
    ptr_inputs = &comp.ptr_inputs;
    ptr_outputs = &comp.ptr_outputs;
    if (ptr_segments != nullptr) {
        // NOTE(review): this stores the ADDRESS of the component's segment
        // pointer through the caller's ptr_segments — i.e. a write-back slot
        // for a later memory-assignment pass, not a copy of the segment data.
        // The double reinterpret_cast is fragile; confirm intent in full file.
        *reinterpret_cast<intel_pwl_segment_t **>(ptr_segments) =
            reinterpret_cast<intel_pwl_segment_t *>(& comp.op.pwl.ptr_segments);
// Fill the component at component_index as a recurrent layer: vector delay,
// weight/bias widths and scale factors, feedback buffer, and I/O wiring.
// Recurrent components use non-interleaved orientation on both sides.
void AmIntelDnn::InitRecurrentComponent(uint32_t component_index,
                                        // NOTE(review): num_rows parameter not
                                        // visible in this view.
                                        uint32_t num_columns_in,
                                        uint32_t num_columns_out,
                                        uint32_t num_bytes_per_input,
                                        uint32_t num_bytes_per_output,
                                        uint32_t num_vector_delay,
                                        uint32_t num_bytes_per_weight,
                                        uint32_t num_bytes_per_bias,
                                        float weight_scale_factor,
                                        float output_scale_factor,
                                        // NOTE(review): pointer parameters and
                                        // opening brace not visible here.
    component[component_index].num_rows_in = num_rows;
    component[component_index].num_columns_in = num_columns_in;
    component[component_index].num_rows_out = num_rows;
    component[component_index].num_columns_out = num_columns_out;
    component[component_index].num_bytes_per_input = num_bytes_per_input;
    component[component_index].num_bytes_per_output = num_bytes_per_output;
    component[component_index].operation = kDnnRecurrentOp;
    component[component_index].macro_operation = kDnnMacroOpNone;
    component[component_index].orientation_in = kDnnNonInterleavedOrientation;
    component[component_index].orientation_out = kDnnNonInterleavedOrientation;
    component[component_index].ptr_inputs = ptr_inputs;
    component[component_index].ptr_outputs = ptr_outputs;
    component[component_index].op.recurrent.num_vector_delay = num_vector_delay;
    component[component_index].op.recurrent.num_bytes_per_weight = num_bytes_per_weight;
    component[component_index].op.recurrent.num_bytes_per_bias = num_bytes_per_bias;
    component[component_index].op.recurrent.weight_scale_factor = weight_scale_factor;
    component[component_index].output_scale_factor = output_scale_factor;
    component[component_index].op.recurrent.ptr_feedbacks = ptr_feedbacks;
    component[component_index].op.recurrent.ptr_weights = ptr_weights;
    component[component_index].op.recurrent.ptr_biases = ptr_biases;
// Fill the component at component_index as an interleave (transpose) layer:
// output dims are the swap of input dims, orientation goes
// non-interleaved -> interleaved.
void AmIntelDnn::InitInterleaveComponent(uint32_t component_index, uint32_t num_rows, uint32_t num_columns,
                                         uint32_t num_bytes_per_input, uint32_t num_bytes_per_output,
                                         float output_scale_factor, void *ptr_inputs, void *ptr_outputs) {
    component[component_index].num_rows_in = num_rows;
    component[component_index].num_columns_in = num_columns;
    component[component_index].num_rows_out = num_columns;   // transposed shape
    component[component_index].num_columns_out = num_rows;
    component[component_index].num_bytes_per_input = num_bytes_per_input;
    component[component_index].num_bytes_per_output = num_bytes_per_output;
    component[component_index].operation = kDnnInterleaveOp;
    component[component_index].macro_operation = kDnnMacroOpNone;
    component[component_index].orientation_in = kDnnNonInterleavedOrientation;
    component[component_index].orientation_out = kDnnInterleavedOrientation;
    component[component_index].ptr_inputs = ptr_inputs;
    component[component_index].ptr_outputs = ptr_outputs;
    component[component_index].output_scale_factor = output_scale_factor;
// Fill the component at component_index as a deinterleave (transpose) layer:
// the inverse of InitInterleaveComponent — orientation goes
// interleaved -> non-interleaved, output dims are the swap of input dims.
void AmIntelDnn::InitDeinterleaveComponent(uint32_t component_index, uint32_t num_rows, uint32_t num_columns,
                                           uint32_t num_bytes_per_input, uint32_t num_bytes_per_output,
                                           float output_scale_factor, void *ptr_inputs, void *ptr_outputs) {
    component[component_index].num_rows_in = num_rows;
    component[component_index].num_columns_in = num_columns;
    component[component_index].num_rows_out = num_columns;   // transposed shape
    component[component_index].num_columns_out = num_rows;
    component[component_index].num_bytes_per_input = num_bytes_per_input;
    component[component_index].num_bytes_per_output = num_bytes_per_output;
    component[component_index].operation = kDnnDeinterleaveOp;
    component[component_index].macro_operation = kDnnMacroOpNone;
    component[component_index].orientation_in = kDnnInterleavedOrientation;
    component[component_index].orientation_out = kDnnNonInterleavedOrientation;
    component[component_index].ptr_inputs = ptr_inputs;
    component[component_index].ptr_outputs = ptr_outputs;
    component[component_index].output_scale_factor = output_scale_factor;
// Reference affine transform: C = weights * inputs + bias, dispatched on input
// byte width; list/listsize optionally restrict computation to a subset of
// output rows (the active-output list).
__inline void ApplyAffineTransform(intel_dnn_component_t *component, uint32_t *list, uint32_t listsize) {
    auto transform = &component->op.affine;
    int m = component->num_rows_out;       // output rows
    int n = component->num_columns_in;     // vectors (batch)
    int k = component->num_rows_in;        // inner (input) dimension
    int lda = component->num_rows_in;
    int ldb = component->num_columns_in;
    int ldc = component->num_columns_out;
    switch (component->num_bytes_per_input) {
        // NOTE(review): integer-reference branch — its case label and #ifdef
        // INTEGER_REF guard are not visible in this view.
        if (component->op.affine.num_bytes_per_weight == 1) {
            // 8-bit weights use compound biases (bias + per-row multiplier).
            int8_t *A = reinterpret_cast<int8_t*>(transform->ptr_weights);
            int16_t *B = reinterpret_cast<int16_t*>(component->ptr_inputs);
            int32_t *C = reinterpret_cast<int32_t*>(component->ptr_outputs);
            intel_compound_bias_t *bias = reinterpret_cast<intel_compound_bias_t*>(transform->ptr_biases);
            if (list == nullptr) {
                // PrintMatrixInt8("W int8", W, k, m, ldw, component->op.affine.weight_scale_factor);
                // PrintMatrixInt16("X int16", X, k, n, ldx, component->op.affine.weight_scale_factor);
                // PrintMatrixInt32("Y int32", Y, m, n, ldy, component->output_scale_factor);
                igemm8_gna(m, n, k, A, lda, B, ldb, bias, C, ldc);
                // PrintMatrixInt8("W int8", W, k, m, ldw, component->op.affine.weight_scale_factor);
                // PrintMatrixInt16("X int16", X, k, n, ldx, component->op.affine.weight_scale_factor);
                // PrintMatrixInt32("Y int32", Y, m, n, ldy, component->output_scale_factor);
                igemm8_gna_subset(m, n, k, A, lda, B, ldb, bias, C, ldc, list, listsize);
            // PrintMatrixInt32("C int32", C, m, n, ldc, component->output_scale_factor);
        } else if (component->op.affine.num_bytes_per_weight == 2) {
            // 16-bit weights use plain int32 biases.
            int16_t *A = reinterpret_cast<int16_t*>(transform->ptr_weights);
            int16_t *B = reinterpret_cast<int16_t*>(component->ptr_inputs);
            int32_t *C = reinterpret_cast<int32_t*>(component->ptr_outputs);
            int32_t *bias = reinterpret_cast<int32_t*>(transform->ptr_biases);
            if (list == nullptr) {
                // Seed C with the bias before the GEMM accumulates into it.
                // NOTE(review): uint32_t i compared against int m (signed/unsigned mix).
                for (uint32_t i = 0; i < m; i++) {
                    for (uint32_t j = 0; j < n; j++) {
                        C[i*ldc+j] = bias[i];
                // PrintMatrixInt16("A int16", A, k, m, lda, component->op.affine.weight_scale_factor);
                // PrintMatrixInt16("trans(B) int16", B, k, n, ldb, component->op.affine.weight_scale_factor);
                // PrintMatrixInt32("C int32", C, m, n, ldc, component->output_scale_factor);
                cblas_igemm16(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, A, lda, B, ldb, 1.0, C, ldc);
                // Subset path: seed only the listed rows.
                // NOTE(review): bias[i] — the line defining i (presumably
                // i = list[l]) is not visible in this view; confirm.
                for (int l = 0; l < listsize; l++) {
                    for (uint32_t j = 0; j < n; j++) {
                        C[l*ldc+j] = bias[i];
                // PrintMatrixInt16("A int16", A, k, m, lda, component->op.affine.scale_factor);
                // PrintMatrixInt16("trans(B) int16", B, k, n, ldb, component->op.affine.scale_factor);
                // PrintMatrixInt32("C int32", C, m, n, ldc, component->op.affine.scale_factor * component->op.affine.scale_factor);
                cblas_igemm16_subset(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, A, lda, B, ldb, 1.0, C, ldc, list, listsize);
            // PrintMatrixInt32("C int32", C, m, n, ldc, component->output_scale_factor);
            fprintf(stderr, "Bad weight width in ApplyAffineTransform!\n");
#endif  // #ifdef INTEGER_REF
        // Float branch: seed C with the bias then run single-precision GEMM.
        auto A = reinterpret_cast<float *>(transform->ptr_weights);
        auto B = reinterpret_cast<float *>(component->ptr_inputs);
        auto C = reinterpret_cast<float *>(component->ptr_outputs);
        auto bias = reinterpret_cast<float *>(transform->ptr_biases);
        if (list == nullptr) {
            for (uint32_t i = 0; i < m; i++) {
                for (uint32_t j = 0; j < n; j++) {
                    C[i * ldc + j] = bias[i];
            // if (global_debug) PrintMatrixFloat32("A float", A, m, k, lda);
            // if (global_debug) PrintMatrixFloat32("B float", B, k, n, ldb);
            // if (global_debug) PrintMatrixFloat32("C float before", C, m, n, ldc);
            cblas_sgemm1(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, A, lda, B, ldb, 1.0, C, ldc);
            // if (global_debug) PrintMatrixFloat32("C float after", C, m, n, ldc);
            // Subset path: seed listed rows, then subset GEMM.
            // NOTE(review): bias[i] — defining line for i not visible; confirm.
            for (int l = 0; l < listsize; l++) {
                for (uint32_t j = 0; j < n; j++) {
                    C[l * ldc + j] = bias[i];
            // PrintMatrixFloat32("A float", A, k, m, lda);
            // PrintMatrixFloat32("trans(B) float", B, k, n, ldb);
            // PrintMatrixFloat32("C float before", C, listsize, n, ldc);
            cblas_sgemm_subset(CblasRowMajor,
            // PrintMatrixFloat32("C float after", C, listsize, n, ldc);
        default:fprintf(stderr, "Bad data width in ApplyAffineTransform!\n");
// Reference diagonal transform: C = diag(weights) * B + bias, i.e. each output
// row is the input row scaled by one weight plus its bias. Dispatches on input
// byte width.
__inline void ApplyDiagonalTransform(intel_dnn_component_t *component) {
    auto transform = &component->op.affine;
    int m = component->num_rows_out;       // rows (one weight per row)
    int n = component->num_columns_in;     // vectors (batch)
    int ldb = component->num_columns_in;
    int ldc = component->num_columns_out;
    switch (component->num_bytes_per_input) {
        // NOTE(review): integer-reference branch — case label and #ifdef
        // INTEGER_REF guard not visible; `lda` used below has no visible
        // declaration in this view (confirm in full file).
        if (component->op.affine.num_bytes_per_weight == 1) {
            int8_t *A = reinterpret_cast<int8_t*>(transform->ptr_weights);
            int16_t *B = reinterpret_cast<int16_t*>(component->ptr_inputs);
            int32_t *C = reinterpret_cast<int32_t*>(component->ptr_outputs);
            intel_compound_bias_t *bias = reinterpret_cast<intel_compound_bias_t*>(transform->ptr_biases);
            // PrintMatrixInt8("W int8", W, k, m, ldw, component->op.affine.weight_scale_factor);
            // PrintMatrixInt16("X int16", X, k, n, ldx, component->op.affine.weight_scale_factor);
            // PrintMatrixInt32("Y int32", Y, m, n, ldy, component->output_scale_factor);
            isbmm8_gna(m, n, A, lda, B, ldb, bias, C, ldc);
            // PrintMatrixInt32("C int32", C, m, n, ldc, component->output_scale_factor);
        } else if (component->op.affine.num_bytes_per_weight == 2) {
            int16_t *A = reinterpret_cast<int16_t*>(transform->ptr_weights);
            int16_t *B = reinterpret_cast<int16_t*>(component->ptr_inputs);
            int32_t *C = reinterpret_cast<int32_t*>(component->ptr_outputs);
            int32_t *bias = reinterpret_cast<int32_t*>(transform->ptr_biases);
            // Seed C with the bias before the banded multiply accumulates.
            for (uint32_t i = 0; i < m; i++) {
                for (uint32_t j = 0; j < n; j++) {
                    C[i*ldc+j] = bias[i];
            // PrintMatrixInt16("A int16", A, 1, m, lda, component->op.affine.weight_scale_factor);
            // PrintMatrixInt16("trans(B) int16", B, k, n, ldb, component->op.affine.weight_scale_factor);
            // PrintMatrixInt32("C int32", C, m, n, ldc, component->output_scale_factor);
            cblas_isbmm16(m, n, A, lda, B, ldb, C, ldc);
            // PrintMatrixInt32("C int32", C, m, n, ldc, component->output_scale_factor);
            fprintf(stderr, "Bad weight width in ApplyDiagonalTransform!\n");
#endif  // #ifdef INTEGER_REF
        // Float branch: seed with bias then per-column banded multiply
        // (diagonal matrix as 0-bandwidth symmetric band).
        auto A = reinterpret_cast<float *>(transform->ptr_weights);
        auto B = reinterpret_cast<float *>(component->ptr_inputs);
        auto C = reinterpret_cast<float *>(component->ptr_outputs);
        auto bias = reinterpret_cast<float *>(transform->ptr_biases);
        for (uint32_t i = 0; i < m; i++) {
            for (uint32_t j = 0; j < n; j++) {
                C[i * ldc + j] = bias[i];
        // PrintMatrixFloat32("A float", A, 1, m, lda);
        // PrintMatrixFloat32("B float", B, k, n, ldb);
        // PrintMatrixFloat32("C float before", C, m, n, ldc);
        for (uint32_t j = 0; j < n; j++) {
            float *Bcol = B + j * ldb;
            float *Ccol = C + j * ldc;
            cblas_ssbmv1(CblasRowMajor, CblasLower, m, 0, 1.0, A, 1, Bcol, 1, 1.0, Ccol, 1);
        // PrintMatrixFloat32("C float after", C, m, n, ldc);
        default:fprintf(stderr, "Bad data width in ApplyDiagonalTransform!\n");
// Reference recurrent transform for one row of the input: combines the current
// input vector (length k1) and the feedback vector (length k2) through a split
// weight matrix plus bias. Dispatches on input byte width.
__inline void ApplyRecurrentTransform(intel_dnn_component_t *component, uint32_t row, void *ptr_feedbacks) {
    intel_recurrent_t *transform = &component->op.recurrent;
    int k1 = component->num_columns_in;    // input vector length
    int k2 = component->num_columns_out;   // feedback vector length
    // Guard against an unwired feedback buffer.
    if (component->op.recurrent.ptr_feedbacks == nullptr) {
        fprintf(stderr, "nullptr feedback pointer in ApplyRecurrentTransform()!\n");
    switch (component->num_bytes_per_input) {
        // NOTE(review): integer-reference branch — case label and #ifdef
        // INTEGER_REF guard not visible; `n` used below has no visible
        // declaration in this view (confirm in full file).
        if (component->op.recurrent.num_bytes_per_weight == 1) {
            int16_t *A1 = reinterpret_cast<int16_t*>(component->ptr_inputs) + row * component->num_columns_in;
            int16_t *A2 = reinterpret_cast<int16_t*>(ptr_feedbacks);
            int8_t *X = reinterpret_cast<int8_t*>(transform->ptr_weights);
            intel_compound_bias_t *B = reinterpret_cast<intel_compound_bias_t*>(transform->ptr_biases);
            int32_t *C = reinterpret_cast<int32_t*>(component->ptr_outputs) + row * component->num_columns_out;
            // PrintMatrixInt16("A1 int", A1, 1, k1, k1, component->op.recurrent.weight_scale_factor);
            // PrintMatrixInt16("A2 int", A2, 1, k2, k2);
            // PrintMatrixInt8("X int", X, k, n, n, component->op.recurrent.weight_scale_factor);
            // PrintMatrixInt32("B int", B, 1, 2*n, 2*n, component->output_scale_factor);
            igemv8_gna_split(n, k1, k2, A1, A2, X, B, C);
            // PrintMatrixInt32("C int", C, 1, n, n, component->output_scale_factor);
        } else if (component->op.recurrent.num_bytes_per_weight == 2) {
            int16_t *A1 = reinterpret_cast<int16_t*>(component->ptr_inputs) + row * component->num_columns_in;
            int16_t *A2 = reinterpret_cast<int16_t*>(ptr_feedbacks);
            int16_t *X = reinterpret_cast<int16_t*>(transform->ptr_weights);
            int32_t *B = reinterpret_cast<int32_t*>(transform->ptr_biases);
            int32_t *C = reinterpret_cast<int32_t*>(component->ptr_outputs) + row * component->num_columns_out;
            // PrintMatrixInt16("A1 int", A1, 1, k1, k1, component->op.recurrent.weight_scale_factor);
            // PrintMatrixInt16("A2 int", A2, 1, k2, k2, component->op.recurrent.weight_scale_factor);
            // PrintMatrixInt16("X int", X, k, n, n, component->op.recurrent.weight_scale_factor);
            // PrintMatrixInt32("B int", B, 1, n, n, component->output_scale_factor);
            igemv16_split(n, k1, k2, A1, A2, X, B, C);
            // PrintMatrixInt32("C int", C, 1, n, n, component->output_scale_factor);
            fprintf(stderr, "Weight width not supported in ApplyRecurrentTransform!\n");
#endif  // #ifdef INTEGER_REF
        // Float branch: split GEMV over input and feedback halves.
        auto A1 = reinterpret_cast<float *>(component->ptr_inputs) + row * component->num_columns_in;
        auto A2 = reinterpret_cast<float *>(ptr_feedbacks);
        auto X = reinterpret_cast<float *>(transform->ptr_weights);
        auto B = reinterpret_cast<float *>(transform->ptr_biases);
        auto C = reinterpret_cast<float *>(component->ptr_outputs) + row * component->num_columns_out;
        // PrintMatrixFloat32("A1 float", A1, 1, k1, k1);
        // PrintMatrixFloat32("A2 float", A2, 1, k2, k2);
        // PrintMatrixFloat32("X float", X, k, n, n);
        // PrintMatrixFloat32("B float", B, 1, n, n);
        sgemv_split(n, k1, k2, A1, A2, X, B, C);
        // PrintMatrixFloat32("C float", C, 1, n, n);
        default:fprintf(stderr, "Bad data width in ApplyRecurrentTransform!\n");
// Reference 1-D convolution: dispatch on input byte width to the 16-bit
// (integer-reference) or 32-bit-float CNN filter kernels.
__inline void ApplyConvolutional1DTransform(intel_dnn_component_t *component) {
    switch (component->num_bytes_per_input) {
            // NOTE(review): case labels and #ifdef INTEGER_REF guard are not
            // visible in this view.
            CNNFilter16(component);
            // PrintMatrixFloat32("Input float", reinterpret_cast<float*>(component->ptr_inputs),
            //                    component->num_rows_in, component->num_columns_in, component->num_columns_in);
            // PrintMatrixFloat32("Filt float", reinterpret_cast<float*>(component->op.conv1D.ptr_filters),
            //                    component->op.conv1D.num_filters,
            //                    component->op.conv1D.num_filter_rows*component->op.conv1D.num_feature_map_columns*component->op.conv1D.num_feature_maps,
            //                    component->op.conv1D.num_filter_rows*component->op.conv1D.num_feature_map_columns*component->op.conv1D.num_feature_maps);
            // PrintMatrixFloat32("Bias float", reinterpret_cast<float*>(component->op.conv1D.ptr_biases), 1,
            //                    component->op.conv1D.num_filters, component->op.conv1D.num_filters);
            CNNFilter32(component);
            // PrintMatrixFloat32("Output float", reinterpret_cast<float*>(component->ptr_outputs, component->num_rows_out,
            //                    component->num_columns_out, component->num_columns_out);
        default:fprintf(stderr, "Bad data width in ApplyConvolutionalTransform!\n");
// Apply the component's piecewise-linear activation over the whole output
// (up to listsize rows): 32-bit float path or 16-bit integer-reference path.
__inline void ApplyPiecewiseLinearTransform(intel_dnn_component_t *component,
                                            intel_dnn_number_type_t number_type,
                                            // NOTE(review): listsize parameter
                                            // line not visible in this view.
    if (number_type == kDnnFloat) {
        // PrintMatrixFloat32("PWL Input float", reinterpret_cast<float*>(component->ptr_inputs), component->num_rows_in,
        //                    component->num_columns_in, component->num_columns_in);
        PwlApply32(component, listsize);
        // PrintMatrixFloat32("PWL Output float", reinterpret_cast<float*>(component->ptr_outputs), component->num_rows_out,
        //                    component->num_columns_out, component->num_columns_out);
    } else if (component->num_bytes_per_output == 2) {
        PwlApply16(component, listsize);
#endif  // #ifdef INTEGER_REF
        fprintf(stderr, "Bad data width in ApplyPiecewiseLinearTransform!\n");
// Overload: apply the piecewise-linear activation to a single row (num_row),
// restricted to columns [0, listsize - 1].
__inline void ApplyPiecewiseLinearTransform(intel_dnn_component_t *component,
                                            intel_dnn_number_type_t number_type,
                                            // NOTE(review): num_row/listsize
                                            // parameter lines not visible here.
    if (number_type == kDnnFloat) {
        PwlApply32(component, num_row, num_row, 0, listsize - 1);
    } else if (component->num_bytes_per_output == 2) {
        PwlApply16(component, num_row, num_row, 0, listsize-1);
#endif  // #ifdef INTEGER_REF
        fprintf(stderr, "Bad data width in ApplyPiecewiseLinearTransform!\n");
// Apply max pooling; only 4-byte (float/int32) inputs are supported here —
// anything else is reported as an error.
__inline void ApplyMaxPoolTransform(intel_dnn_component_t *component, intel_dnn_number_type_t number_type) {
    if (component->num_bytes_per_input == 4) {
        // PrintMatrixFloat32("Input float", reinterpret_cast<float*>(component->ptr_inputs), component->num_rows_in,
        //                    component->num_columns_in, component->num_columns_in);
        CNNMaxPool(component, number_type);
        // PrintMatrixFloat32("Output float", reinterpret_cast<float*>(component->ptr_outputs), component->num_rows_out,
        //                    component->num_columns_out, component->num_columns_out);
        fprintf(stderr, "Bad data width in ApplyMaxPoolTransform!\n");
// Transpose the component's input matrix into its output buffer:
// B = Transpose(A) where A is m x n and B is n x m, element width chosen by
// num_bytes_per_input.
__inline void ApplyTranspose(intel_dnn_component_t *component) {
    int m = component->num_rows_in;
    int n = component->num_columns_in;
    int lda = component->num_columns_in;
    int ldb = component->num_columns_out;
    // B = Transpose(A) where A is mxn and B is nxm
    switch (component->num_bytes_per_input) {
        // NOTE(review): case labels and #ifdef INTEGER_REF guard not visible;
        // uint32_t loop counters are compared against signed m/n throughout.
            int8_t *A = reinterpret_cast<int8_t*>(component->ptr_inputs);
            int8_t *B = reinterpret_cast<int8_t*>(component->ptr_outputs);
            for (uint32_t row = 0; row < m; row++) {
                for (uint32_t col = 0; col < n; col++) {
                    B[col*ldb+row] = A[row*lda+col];
            int16_t *A = reinterpret_cast<int16_t*>(component->ptr_inputs);
            int16_t *B = reinterpret_cast<int16_t*>(component->ptr_outputs);
            for (uint32_t row = 0; row < m; row++) {
                for (uint32_t col = 0; col < n; col++) {
                    B[col*ldb+row] = A[row*lda+col];
#endif  // #ifdef INTEGER_REF
            auto A = reinterpret_cast<float *>(component->ptr_inputs);
            auto B = reinterpret_cast<float *>(component->ptr_outputs);
            for (uint32_t row = 0; row < m; row++) {
                for (uint32_t col = 0; col < n; col++) {
                    B[col * ldb + row] = A[row * lda + col];
        default:fprintf(stderr, "Bad data width in ApplyInterleave!\n");
// Copy an m x n (num_copy_rows x num_copy_columns) sub-matrix from the
// component's input buffer to its output buffer, honoring the differing
// leading dimensions of source and destination.
__inline void ApplyCopy(intel_dnn_component_t *component) {
    auto src = reinterpret_cast<uint8_t *>(component->ptr_inputs);
    auto dst = reinterpret_cast<uint8_t *>(component->ptr_outputs);
    int32_t m = component->op.copy.num_copy_rows;
    int32_t n = component->op.copy.num_copy_columns;
    int32_t lda = component->num_columns_in;
    int32_t ldb = component->num_columns_out;
    // NOTE(review): this compares ROW counts, but the message says "columns" —
    // the message (or the check) looks wrong; confirm and fix consistently.
    if (m > component->num_rows_in) {
        fprintf(stderr, "Error: attempt to copy more columns than matrix has!\n");
    switch (component->num_bytes_per_input) {
            // NOTE(review): case labels and #ifdef INTEGER_REF guard are not
            // visible in this view.
            int16_t *A = reinterpret_cast<int16_t*>(src);
            int16_t *B = reinterpret_cast<int16_t*>(dst);
            for (uint32_t row = 0; row < m; row++) {
                for (uint32_t col = 0; col < n; col++) {
                    B[row*ldb + col] = A[row*lda + col];
#endif  // #ifdef INTEGER_REF
            auto A = reinterpret_cast<float *>(src);
            auto B = reinterpret_cast<float *>(dst);
            for (uint32_t row = 0; row < m; row++) {
                for (uint32_t col = 0; col < n; col++) {
                    B[row * ldb + col] = A[row * lda + col];
        default:fprintf(stderr, "Bad data width in ApplyCopy!\n");
// Installs active-output list number |list_index| for the final component and
// returns the resulting number of active outputs.  With an empty list the
// count falls back to the last component's full output dimension (rows for
// interleaved orientation, columns otherwise).
// NOTE(review): sparse extraction — else branches / early returns for the
// error paths fall in the numbering gaps.
885 uint32_t AmIntelDnn::CopyActiveList(std::vector<std::vector<uint32_t> > &active_list, uint32_t list_index) {
886 if (component[component.size() - 1].orientation_out == kDnnInterleavedOrientation) {
887 num_active_outputs_ = component[component.size() - 1].num_rows_out;
889 num_active_outputs_ = component[component.size() - 1].num_columns_out;
892 if (!active_list.empty()) {
// Validate the requested list index before touching the data.
893 if (list_index >= active_list.size()) {
// NOTE(review): %d with a uint32_t argument — %u would be the exact match.
894 fprintf(stderr, "Index %d beyond end of active list in CopyActiveList()\n", list_index);
// The active list cannot name more outputs than the last layer produces.
897 if (active_list[list_index].size() > component[component.size() - 1].num_rows_out) {
898 fprintf(stderr, "Active list too large in CopyActiveList()\n");
// Copy the indices into the preallocated buffer, if one was installed.
902 if (ptr_active_outputs_ != nullptr) {
903 num_active_outputs_ = active_list[list_index].size();
904 memcpy(ptr_active_outputs_, active_list[list_index].data(), num_active_outputs_ * sizeof(uint32_t));
908 return (num_active_outputs_);
// Runs the whole network on the CPU reference path: walks the component list
// in order and dispatches each one to its Apply*Transform.  The active-output
// list is applied only to the last component (or last-but-one when the last
// is a PWL fused onto an affine).  A recurrent component is executed row by
// row, interleaved with its following PWL, which is then skipped (i++).
// NOTE(review): sparse extraction — break statements between the case labels
// fall in the numbering gaps.
911 void AmIntelDnn::Propagate() {
912 for (uint32_t i = 0; i < component.size(); i++) {
913 intel_dnn_component_t *comp = &component[i];
// Default: all outputs active; count depends on output orientation.
914 uint32_t *ptr_active_outputs = nullptr;
915 uint32_t num_active_outputs = (comp->orientation_out == kDnnInterleavedOrientation)
916 ? comp->num_rows_out : comp->num_columns_out;
918 if (i == component.size() - 1) { // active list applies to last component
919 ptr_active_outputs = ptr_active_outputs_;
920 num_active_outputs = num_active_outputs_;
921 } else if (i == component.size() - 2) { // also applies to last two components when last is PWL
922 if ((component[i].operation == kDnnAffineOp) && (component[i + 1].operation == kDnnPiecewiselinearOp)) {
923 ptr_active_outputs = ptr_active_outputs_;
924 num_active_outputs = num_active_outputs_;
928 switch (comp->operation) {
929 case kDnnAffineOp :ApplyAffineTransform(comp, ptr_active_outputs, num_active_outputs);
931 case kDnnDiagonalOp:ApplyDiagonalTransform(comp);
933 case kDnnRecurrentOp:
// Recurrent must be immediately followed by a PWL component; each row's
// recurrent step feeds its PWL before the next row is processed.
934 if ((i < component.size() - 1) && (component[i + 1].operation == kDnnPiecewiselinearOp)) {
935 intel_dnn_component_t *comp_pwl = &component[i + 1];
936 for (uint32_t j = 0; j < comp->num_rows_in; j++) {
// Feedback pointer advances one PWL output row per iteration.
937 void *ptr_feedbacks =
938 reinterpret_cast<void *>(reinterpret_cast<int32_t *>(comp->op.recurrent.ptr_feedbacks) + j * comp_pwl->num_columns_out);
939 ApplyRecurrentTransform(comp, j, ptr_feedbacks);
941 ApplyPiecewiseLinearTransform(comp_pwl, number_type_, num_active_outputs, j);
943 i++; // skip next component
945 fprintf(stderr, "Missing PiecewiseLinear component after Recurrent component in Propagate!\n");
949 case kDnnConvolutional1dOp:ApplyConvolutional1DTransform(comp);
951 case kDnnPiecewiselinearOp:ApplyPiecewiseLinearTransform(comp, number_type_, num_active_outputs);
953 case kDnnMaxPoolOp:ApplyMaxPoolTransform(comp, number_type_);
// Interleave and deinterleave are both plain transposes on this path.
955 case kDnnInterleaveOp:ApplyTranspose(comp);
957 case kDnnDeinterleaveOp:ApplyTranspose(comp);
959 case kDnnCopyOp:ApplyCopy(comp);
961 default:fprintf(stderr, "Bad operation in Propagate!\n");
965 // PrintOutputs(i); fflush(stdout);
969 intel_dnn_macro_operation_t AmIntelDnn::MacroOperation(uint32_t component_index) {
970 return (component[component_index].macro_operation);
973 void AmIntelDnn::SetMacroOperation(uint32_t component_index, intel_dnn_macro_operation_t macro_operation) {
974 component[component_index].macro_operation = macro_operation;
977 float AmIntelDnn::InputScaleFactor(uint32_t component_index) {
978 float scale_factor = 1.0;
980 if (component_index == 0) {
981 scale_factor = input_scale_factor_;
983 if (component[component_index - 1].operation == kDnnAffineOp) {
984 scale_factor = component[component_index - 1].output_scale_factor;
985 } else if (component[component_index - 1].operation == kDnnDiagonalOp) {
986 scale_factor = component[component_index - 1].output_scale_factor;
987 } else if (component[component_index - 1].operation == kDnnConvolutional1dOp) {
988 scale_factor = component[component_index - 1].output_scale_factor;
989 } else if (component[component_index - 1].operation == kDnnRecurrentOp) {
990 scale_factor = component[component_index - 1].output_scale_factor;
991 } else if (component[component_index - 1].operation == kDnnInterleaveOp) {
992 scale_factor = component[component_index - 1].output_scale_factor;
993 } else if (component[component_index - 1].operation == kDnnDeinterleaveOp) {
994 scale_factor = component[component_index - 1].output_scale_factor;
995 } else if (component[component_index - 1].operation == kDnnCopyOp) {
996 scale_factor = component[component_index - 1].output_scale_factor;
1000 return (scale_factor);
1003 float AmIntelDnn::WeightScaleFactor(uint32_t component_index) {
1004 float scale_factor = 1.0;
1006 if (component[component_index].operation == kDnnAffineOp) {
1007 scale_factor = component[component_index].op.affine.weight_scale_factor;
1008 } else if (component[component_index].operation == kDnnDiagonalOp) {
1009 scale_factor = component[component_index].op.affine.weight_scale_factor;
1010 } else if (component[component_index].operation == kDnnConvolutional1dOp) {
1011 scale_factor = component[component_index].op.conv1D.weight_scale_factor;
1012 } else if (component[component_index].operation == kDnnRecurrentOp) {
1013 scale_factor = component[component_index].op.recurrent.weight_scale_factor;
1016 return (scale_factor);
1019 float AmIntelDnn::OutputScaleFactor(intel_dnn_component_t &comp) {
1020 return comp.output_scale_factor;
1023 void AmIntelDnn::SetOutputScaleFactor(uint32_t component_index, float scale_factor) {
1024 component[component_index].output_scale_factor = scale_factor;
// Debug helper: prints every output value of component |component_index| to
// stdout, divided by the component's output scale factor.  Float networks
// print directly; integer networks dispatch on num_bytes_per_output (1/2/4).
// NOTE(review): sparse extraction — case labels/breaks fall in the gaps.
1027 void AmIntelDnn::PrintOutputs(uint32_t component_index) {
// Presumably an index-based overload of OutputScaleFactor — declared
// outside this chunk; TODO confirm.
1028 float scale_factor = OutputScaleFactor(component_index);
1029 uint32_t num_rows = component[component_index].num_rows_out;
1030 uint32_t num_columns = component[component_index].num_columns_out;
1032 printf("component %d : %s\n", component_index, intel_dnn_operation_name[component[component_index].operation]);
1033 if (number_type_ == kDnnFloat) {
1034 auto ptr_output = reinterpret_cast<float *>(component[component_index].ptr_outputs);
// NOTE(review): signed loop indices against uint32_t bounds throughout.
1035 for (int i = 0; i < num_rows; i++) {
1036 for (int j = 0; j < num_columns; j++) {
1037 printf("%d %d : %e\n", i, j, ptr_output[i * num_columns + j] / scale_factor);
1041 switch (component[component_index].num_bytes_per_output) {
// 1-byte integer outputs.
1043 auto ptr_output = reinterpret_cast<int8_t *>(component[component_index].ptr_outputs);
1044 for (int i = 0; i < num_rows; i++) {
1045 for (int j = 0; j < num_columns; j++) {
1046 printf("%d %d : %e\n", i, j, static_cast<float>(ptr_output[i * num_columns + j]) / scale_factor);
// 2-byte integer outputs.
1052 auto ptr_output = reinterpret_cast<int16_t *>(component[component_index].ptr_outputs);
1053 for (int i = 0; i < num_rows; i++) {
1054 for (int j = 0; j < num_columns; j++) {
1055 printf("%d %d : %e\n", i, j, static_cast<float>(ptr_output[i * num_columns + j]) / scale_factor);
// 4-byte integer outputs.
1061 auto ptr_output = reinterpret_cast<int32_t *>(component[component_index].ptr_outputs);
1062 for (int i = 0; i < num_rows; i++) {
1063 for (int j = 0; j < num_columns; j++) {
1064 printf("%d %d : %e\n", i, j, static_cast<float>(ptr_output[i * num_columns + j]) / scale_factor);
1071 "Bad num_bytes_per_output in component %d in AmIntelDnn::PrintOutputs()\n",
// Compares the last component's (scaled) outputs against a float reference
// score array, accumulating absolute/relative/squared error statistics into
// |score_error| and counting scores whose absolute error exceeds
// score_error->threshold.  Indexing adapts to the output orientation:
// interleaved outputs are [row=output, col=frame], deinterleaved are
// [row=frame, col=output].  Returns the number of over-threshold errors.
// NOTE(review): sparse extraction — the num_errors++ on the threshold paths
// and some closing braces fall in the numbering gaps.
1078 uint32_t AmIntelDnn::CompareScores(void *ptr_refscorearray, intel_score_error_t *score_error, uint32_t num_frames) {
1079 intel_dnn_component_t *ptr_component = &component[component.size() - 1];
1080 intel_dnn_orientation_t orientation = ptr_component->orientation_out;
1081 float scale_factor = OutputScaleFactor(component.size() - 1);
1082 uint32_t num_errors = 0;
// Dimensions of the comparison region, orientation-dependent.
1083 uint32_t num_rows = (orientation == kDnnInterleavedOrientation) ? ptr_component->num_rows_out : num_frames;
1084 uint32_t num_columns = (orientation == kDnnInterleavedOrientation) ? num_frames : ptr_component->num_columns_out;
1085 uint32_t num_row_step_ref =
1086 (orientation == kDnnInterleavedOrientation) ? ptr_component->num_rows_out : ptr_component->num_columns_out;
1087 uint32_t num_row_step = ptr_component->num_columns_out;
// With an active list on a final affine layer, only the active rows count.
1089 if (ptr_component->operation == kDnnAffineOp) {
1090 num_rows = num_active_outputs_;
1093 ClearScoreError(score_error);
1095 if (number_type_ == kDnnFloat) {
1096 auto A = reinterpret_cast<float *>(ptr_component->ptr_outputs);
1097 auto B = reinterpret_cast<float *>(ptr_refscorearray);
1098 for (int i = 0; i < num_rows; i++) {
1099 for (int j = 0; j < num_columns; j++) {
1100 float score = A[i * num_row_step + j];
// Reference is indexed transposed when outputs are interleaved.
1102 (orientation == kDnnInterleavedOrientation) ? B[j * num_row_step_ref + i] : B[i * num_row_step_ref
1104 float scaled_score = score / scale_factor;
1105 float error = fabs(refscore - scaled_score);
// 1e-20 guards against division by zero for zero reference scores.
1106 float rel_error = error / (fabs(refscore) + 1e-20);
1107 float squared_error = error * error;
1108 float squared_rel_error = rel_error * rel_error;
1109 score_error->num_scores++;
1110 score_error->sum_error += error;
1111 score_error->sum_squared_error += squared_error;
1112 if (error > score_error->max_error) {
1113 score_error->max_error = error;
1115 score_error->sum_rel_error += rel_error;
1116 score_error->sum_squared_rel_error += squared_rel_error;
1117 if (rel_error > score_error->max_rel_error) {
1118 score_error->max_rel_error = rel_error;
1120 if (error > score_error->threshold) {
1125 } else if (number_type_ == kDnnInt) {
1126 auto B = reinterpret_cast<float *>(ptr_refscorearray);
1127 for (int i = 0; i < num_rows; i++) {
1128 for (int j = 0; j < num_columns; j++) {
// Widen the integer score (16- or 32-bit output) to float for comparison.
1130 if (ptr_component->num_bytes_per_output == 4) {
1131 auto A = reinterpret_cast<int32_t *>(ptr_component->ptr_outputs);
1132 score = static_cast<float>(A[i * num_row_step + j]);
1133 } else if (ptr_component->num_bytes_per_output == 2) {
1134 auto A = reinterpret_cast<int16_t *>(ptr_component->ptr_outputs);
1135 score = static_cast<float>(A[i * num_row_step + j]);
1138 "Unsupported output width (%d) in AmIntelDnn::CompareScores()!\n",
1139 ptr_component->num_bytes_per_output);
1143 (orientation == kDnnInterleavedOrientation) ? B[j * num_row_step_ref + i] : B[i * num_row_step_ref
// Same error accumulation as the float branch above.
1145 float scaled_score = score / scale_factor;
1146 float error = fabs(refscore - scaled_score);
1147 float rel_error = error / (fabs(refscore) + 1e-20);
1148 float squared_error = error * error;
1149 float squared_rel_error = rel_error * rel_error;
1150 score_error->num_scores++;
1151 score_error->sum_error += error;
1152 score_error->sum_squared_error += squared_error;
1153 if (error > score_error->max_error) {
1154 score_error->max_error = error;
1156 score_error->sum_rel_error += rel_error;
1157 score_error->sum_squared_rel_error += squared_rel_error;
1158 if (rel_error > score_error->max_rel_error) {
1159 score_error->max_rel_error = rel_error;
1161 if (error > score_error->threshold) {
1167 fprintf(stderr, "Unknown number type in AmIntelDnn::CompareScores()!\n");
1171 score_error->num_errors = num_errors;
1173 return (num_errors);
// Dumps the component graph as Graphviz DOT to "graph.dot": one box node per
// component (with an HTML-like table of its dimensions and scale factors),
// plus nodes for weight/bias buffers, with edges inferred from overlapping
// memory regions (output->input, output->bias, weight->input, etc.).
// NOTE(review): sparse extraction — several macro header lines (e.g. the
// "#define IS_CONV(k)\" / "#define IS_RELU(k)\" heads) and closing braces
// fall in the numbering gaps; comments below are placed only at statement
// boundaries to avoid splitting backslash-continued macro definitions.
1176 void AmIntelDnn::WriteGraphWizModel(const char *filename) {
1177 auto & components = component;
1179 #define IS_AFFINE(k)\
1180 (components[k].operation == kDnnAffineOp ||\
1181 components[k].operation == kDnnDiagonalOp)
1184 (components[k].operation == kDnnConvolutional1dOp)
1187 (components[k].operation == kDnnPiecewiselinearOp &&\
1188 components[k].op.pwl.func_id == kActRelu)
1192 (components[k].operation == kDnnDiagonalOp)
1194 #define OUTPUTS(idx)\
1195 components[idx].ptr_outputs, components[idx].num_rows_out*components[idx].num_columns_out * components[idx].num_bytes_per_output
1197 #define INPUTS(idx)\
1198 components[idx].ptr_inputs, components[idx].num_rows_in*components[idx].num_columns_in * components[idx].num_bytes_per_input
1200 #define BIASES(idx)\
1201 components[idx].op.affine.ptr_biases, components[idx].num_rows_in*components[idx].num_columns_in * components[idx].op.affine.num_bytes_per_bias
1203 #define WEIGHTS(idx)\
1204 components[idx].op.affine.ptr_weights, components[idx].op.affine.num_bytes_per_weight * components[idx].num_rows_in*components[idx].num_columns_in * \
1205 (IS_DIAG(idx) ? 1 : components[idx].num_rows_out*components[idx].num_columns_out)
// True when the [ptra, ptra+asize) and [ptrb, ptrb+bsize) ranges overlap.
1207 auto intersected = [](void * ptra, size_t asize, void * ptrb, size_t bsize) {
1208 return !(((reinterpret_cast<char*>(ptra) + asize) <= ptrb) || ((reinterpret_cast<char*>(ptrb) + bsize) <= ptra));
// True when ptra lies inside [ptrb, ptrb+bsize); asize is unused here.
1211 auto equals = [](void * ptra, size_t asize, void * ptrb, size_t bsize) {
1212 // return !((((char*)ptra + asize) < ptrb) || (((char*)ptrb + bsize) < ptra));
1213 return ptra >= ptrb && ptra < reinterpret_cast<char*>(ptrb) + bsize;
// NOTE(review): output path is hard-coded to "graph.dot"; the |filename|
// parameter is not used in the visible lines.
1216 std::fstream graph("graph.dot", std::ios::out);
1217 graph << "strict digraph {";
// Sets deduplicate buffer pointers so each gets a single DOT node; the
// set's iteration-order distance doubles as a stable node id.
1218 std::set<void*> weights;
1219 std::set<void*> biases;
1220 std::set<void*> outputs;
1221 std::set<std::string> layersNames;
// Emits the DOT node (with its attribute table) for component k and
// returns the node name used for edges.
1223 auto generate_layer_name = [&](int k) {
1225 if (components[k].operation == kDnnPiecewiselinearOp) {
1226 l += intel_dnn_activation_name[components[k].op.pwl.func_id];
1228 l += intel_dnn_operation_name[components[k].operation];
1230 l += "_" + std::to_string(k);
1231 if (components[k].operation == kDnnPiecewiselinearOp) {
1232 graph << l << " [shape=box, style=filled, fillcolor=yellow";
1234 graph << l << " [shape=box";
1237 graph << ", label=<<TABLE BORDER=\"0\" CELLBORDER=\"1\" CELLSPACING=\"0\">\n"
1238 "  <TR><TD  colspan=\"2\">" << l << "</TD></TR>\n"
1239 "  <TR><TD  colspan=\"2\">" << components[k].num_rows_in << "x" << components[k].num_rows_out<< "</TD></TR>\n";
1241 graph << "  <TR><TD> wscale</TD><TD>" << components[k].op.affine.weight_scale_factor<< "</TD></TR>\n";
1242 graph << "  <TR><TD> wbit</TD><TD>" << components[k].op.affine.num_bytes_per_weight<< "</TD></TR>\n";
1243 graph << "  <TR><TD> bbit</TD><TD>" << components[k].op.affine.num_bytes_per_bias<< "</TD></TR>\n";
1246 graph << "  <TR><TD> negative_slope</TD><TD>" << components[k].op.pwl.func_id.negative_slope<< "</TD></TR>\n";
1249 auto &conv = components[k].op.conv1D;
1250 graph << "  <TR><TD> num_filters</TD><TD>" << conv.num_filters<< "</TD></TR>\n";
1251 graph << "  <TR><TD> num_filter_rows</TD><TD>" << conv.num_filter_rows<< "</TD></TR>\n";
1252 graph << "  <TR><TD> num_filter_coefficients</TD><TD>" << conv.num_filter_coefficients<< "</TD></TR>\n";
1253 graph << "  <TR><TD> num_feature_maps</TD><TD>" << conv.num_feature_maps<< "</TD></TR>\n";
1254 graph << "  <TR><TD> num_feature_map_rows</TD><TD>" << conv.num_feature_map_rows<< "</TD></TR>\n";
1255 graph << "  <TR><TD> num_feature_map_columns</TD><TD>" << conv.num_feature_map_columns<< "</TD></TR>\n";
1256 graph << "  <TR><TD> wscale</TD><TD>" << conv.weight_scale_factor<< "</TD></TR>\n";
1257 graph << "  <TR><TD> wbit</TD><TD>" << conv.num_bytes_per_weight<< "</TD></TR>\n";
1258 graph << "  <TR><TD> bbit</TD><TD>" << conv.num_bytes_per_bias<< "</TD></TR>\n";
1260 graph<<   "  <TR><TD> num_rows_in</TD><TD>" << components[k].num_rows_in<< "</TD></TR>\n"
1261 "  <TR><TD> num_columns_in</TD><TD>" << components[k].num_columns_in<< "</TD></TR>\n"
1262 "  <TR><TD> num_rows_out</TD><TD>" << components[k].num_rows_out<< "</TD></TR>\n"
1263 "  <TR><TD> num_columns_out</TD><TD>" << components[k].num_columns_out<< "</TD></TR>\n"
1264 "  <TR><TD> oscale</TD><TD>" << components[k].output_scale_factor<< "</TD></TR>\n"
1265 "  <TR><TD> ibit</TD><TD>" << components[k].num_bytes_per_input<< "</TD></TR>\n"
1266 "  <TR><TD> obit</TD><TD>" << components[k].num_bytes_per_output<< "</TD></TR>\n"
// First pass: emit every layer node plus its weight/bias nodes and edges.
1273 for (int k = 0; k < components.size(); ++k) {
1274 std::string l = generate_layer_name(k);
1275 layersNames.insert(l);
1276 int lidx = std::distance(layersNames.begin(), layersNames.find(l));
1281 weights.insert(components[k].op.affine.ptr_weights);
1282 biases.insert(components[k].op.affine.ptr_biases);
1284 widx = std::distance(weights.begin(), weights.find(components[k].op.affine.ptr_weights));
1285 bidx = std::distance(biases.begin(), biases.find(components[k].op.affine.ptr_biases));
1289 auto lw =  "weights_" +  std::to_string(lidx) + "_" + std::to_string(widx);;
1290 auto lb =  "biases_" + std::to_string(lidx) + "_" + std::to_string(bidx);
1293 graph << lw << " -> " << l << "[style=bold];";
1294 graph << lb << " -> " << l << "[style=bold];";
1299 bool inputConnected = false;
// Second (inner) pass: connect k's inputs to every other component k2
// whose buffers overlap them in memory.
1301 for (int k2 = 0; k2 < components.size(); ++k2) {
1302 if (k2 == k) continue;
1305 std::string r = generate_layer_name(k2);
1310 if (IS_AFFINE(k2)) {
1311 weights.insert(components[k2].op.affine.ptr_weights);
1312 biases.insert(components[k2].op.affine.ptr_biases);
1314 w2idx = std::distance(weights.begin(), weights.find(components[k2].op.affine.ptr_weights));
1315 b2idx = std::distance(biases.begin(), biases.find(components[k2].op.affine.ptr_biases));
1318 auto rw =  "weights_" + std::to_string(w2idx);
1319 auto rb =  "biases_" + std::to_string(b2idx);
1321 // ----------------------------------------------------------
1322 // output to input connections
1323 if (intersected(OUTPUTS(k2), INPUTS(k))) {
1324 graph << r <<" -> "<< l << ";";
1325 inputConnected = true;
1328 // ----------------------------------------------------------
1329 // output to biases connections
1330 if (IS_AFFINE(k) && intersected(OUTPUTS(k2), BIASES(k))) {
1331 graph << r << " -> " << lb << " [label=\"OB\", fontcolor=blue, color=blue, style=dashed];";
1334 // ----------------------------------------------------------
1335 // output to weights connections
1336 if (IS_AFFINE(k) && equals(OUTPUTS(k2), WEIGHTS(k))) {
1337 graph << r << " -> " << lw << " [label=\"OW\", fontcolor=magenta, color=magenta, style=dashed];";
1340 // ----------------------------------------------------------
1341 // weights to input connections
1342 if (IS_AFFINE(k2) && equals(WEIGHTS(k2), INPUTS(k))) {
1343 graph << rw << " -> " << l << " [label=\"WI\", fontcolor=red, color=red, style=dashed];";
1344 inputConnected = true;
1347 // ----------------------------------------------------------
1348 // weights to bias connections
1349 if (IS_AFFINE(k2) && IS_AFFINE(k) && equals(WEIGHTS(k2), BIASES(k))) {
1350 graph << rw << " -> " << lb << " [label=\"WB\", fontcolor=darkgreen,color=darkgreen, style=dashed];";
1353 if (!inputConnected) {
1354 // drawing tmp connection
1355 outputs.insert(components[k].ptr_inputs);
1356 auto tidx = std::distance(outputs.begin(), outputs.find(components[k].ptr_inputs));
1357 graph << tidx << " -> " << l
1358 << " [label=\"FROM_TMP\", fontcolor=darkgreen,color=orange, style=dashed];";
// Final pass: connect layer outputs back to any recorded temporary buffers.
1362 for (int k = 0; k < components.size(); ++k) {
1363 std::string l = generate_layer_name(k);
1366 for (auto tmpOutPtrs : outputs) {
1367 if (components[k].ptr_outputs == tmpOutPtrs) {
1368 graph << l << " -> " << tidx << " [label=\"TO_TMP\", fontcolor=darkgreen,color=orange, style=dashed];";
1377 void AmIntelDnn::WriteDnnText(const char *filename, intel_dnn_number_type_t number_type) {
1378 if ((number_type_ == kDnnFloat) && (number_type == kDnnInt)) {
1379 fprintf(stderr, "Error trying to write floating point DNN as integer in AmIntelDnn::WriteDnnText().\n");
1380 fprintf(stderr, " Please convert to integer first.\n");
1384 std::ofstream out_file1(filename, std::ios::out);
1385 std::ofstream &out_file = out_file1;
1387 std::ofstream out_file((std::string(filename) + ".light").c_str(), std::ios::out);
1389 if (out_file.good()) {
1390 uint32_t num_inputs = component[0].num_rows_in;
1391 uint32_t num_outputs =
1392 (component[component.size() - 1].orientation_out == kDnnInterleavedOrientation) ? component[component.size()
1393 - 1].num_rows_out : component[component.size() - 1].num_columns_out;
1394 uint32_t num_layers = num_gna_layers();
1395 uint32_t num_group = this->num_group_in();
1398 out_file << "<intel_dnn_file>\n";
1399 out_file << "<number_type> " << intel_dnn_number_type_name[number_type] << "\n";
1400 out_file << "<softmax_type> " << intel_dnn_softmax_name[softmax_type] << "\n";
1401 out_file << "<num_memory_bytes> " << std::dec << num_bytes_dnn_memory_ << "\n";
1402 out_file << "<num_group> " << std::dec << num_group << "\n";
1403 out_file << "<number_inputs> " << std::dec << num_inputs << "\n";
1404 out_file << "<num_outputs> " << std::dec << num_outputs << "\n";
1405 out_file << "<num_layers> " << std::dec << num_layers << "\n";
1406 for (uint32_t i = 0; i < component.size(); i++) {
1408 std::stringstream out_file_name;
1409 out_file_name << getDumpFolderName() << std::setfill('0') << std::setw(2) << i << "_"
1410 << intel_dnn_operation_name[component[i].operation]
1411 << "-" << component[i].num_rows_in
1412 << "-" << component[i].num_rows_out;
1413 if (component[i].operation == kDnnPiecewiselinearOp) {
1414 out_file_name << "-" << intel_dnn_activation_name[component[i].op.pwl.func_id.type];
1416 std::ofstream out_file((out_file_name.str() + ".txt").c_str(), std::ios::out);
1419 uint32_t num_rows_in = component[i].num_rows_in;
1420 uint32_t num_columns_in = component[i].num_columns_in;
1421 uint32_t num_rows_out = component[i].num_rows_out;
1422 uint32_t num_columns_out = component[i].num_columns_out;
1423 uint32_t num_bytes_per_input = component[i].num_bytes_per_input;
1424 uint32_t num_bytes_per_output = component[i].num_bytes_per_output;
1425 if ((component[i].operation == kDnnAffineOp)
1426 || (component[i].operation == kDnnDiagonalOp)
1427 || (component[i].operation == kDnnRecurrentOp)
1428 || (component[i].operation == kDnnConvolutional1dOp)
1429 || (component[i].operation == kDnnInterleaveOp)
1430 || (component[i].operation == kDnnDeinterleaveOp)
1431 || (component[i].operation == kDnnCopyOp)) {
1432 out_file << "<layer_index> " << std::dec << layer << "\n";
1435 out_file << "<component_operation> " << intel_dnn_operation_name[component[i].operation] << "\n";
1436 out_file << "<macro_operation> " << intel_dnn_macro_operation_name[component[i].macro_operation] << "\n";
1437 out_file << "<num_rows_in> " << std::dec << num_rows_in << "\n";
1438 out_file << "<num_columns_in> " << std::dec << num_columns_in << "\n";
1439 out_file << "<num_rows_out> " << std::dec << num_rows_out << "\n";
1440 out_file << "<num_columns_out> " << std::dec << num_columns_out << "\n";
1441 out_file << "<orientation_in> " << std::dec << (component[i].orientation_in == kDnnInterleavedOrientation ?
1442 "interleaved" : "deinterleaved") << "\n";
1443 out_file << "<orientation_out> " << std::dec << (component[i].orientation_out == kDnnInterleavedOrientation ?
1444 "interleaved" : "deinterleaved") << "\n";
1446 if ((number_type_ == kDnnInt) && (number_type == kDnnFloat)) {
1447 out_file << "<num_bytes_per_input> " << std::dec << sizeof(float) << "\n";
1448 out_file << "<num_bytes_per_output> " << std::dec << sizeof(float) << "\n";
1450 out_file << "<num_bytes_per_input> " << std::dec << num_bytes_per_input << "\n";
1451 out_file << "<num_bytes_per_output> " << std::dec << num_bytes_per_output << "\n";
1453 out_file << "<input_address> " << "0x" << std::setfill('0') << std::setw(8) << std::hex
1454 << MemoryOffset(component[i].ptr_inputs, ptr_dnn_memory_) << "\n";
1455 out_file << "<output_address> " << "0x" << std::setfill('0') << std::setw(8) << std::hex
1456 << MemoryOffset(component[i].ptr_outputs, ptr_dnn_memory_) << "\n";
1457 switch (component[i].operation) {
1459 case kDnnDiagonalOp: {
1460 uint32_t num_bytes_per_weight = component[i].op.affine.num_bytes_per_weight;
1461 uint32_t num_bytes_per_bias = component[i].op.affine.num_bytes_per_bias;
1462 float weight_scale_factor = component[i].op.affine.weight_scale_factor;
1463 float output_scale_factor = component[i].output_scale_factor;
1464 uint32_t num_weight_rows = (component[i].operation == kDnnDiagonalOp) ? 1 : num_rows_out;
1465 uint32_t num_weight_columns = num_rows_in;
1466 if ((number_type_ == kDnnInt) && (number_type == kDnnFloat)) {
1467 out_file << "<num_bytes_per_weight> " << std::dec << 4 << "\n";
1468 out_file << "<num_bytes_per_bias> " << std::dec << 4 << "\n";
1470 out_file << "<num_bytes_per_weight> " << std::dec << num_bytes_per_weight << "\n";
1471 out_file << "<num_bytes_per_bias> " << std::dec << num_bytes_per_bias << "\n";
1473 if ((number_type_ == kDnnInt) && (number_type == kDnnFloat)) {
1474 out_file << std::setprecision(12) << std::scientific << "<weight_scale_factor> " << 1.0 << "\n";
1475 out_file << std::setprecision(12) << std::scientific << "<output_scale_factor> " << 1.0 << "\n";
1477 out_file << std::setprecision(12) << std::scientific << "<weight_scale_factor> "
1478 << weight_scale_factor << "\n";
1479 out_file << std::setprecision(12) << std::scientific << "<output_scale_factor> "
1480 << output_scale_factor << "\n";
1482 out_file << "<weight_address> " << "0x" << std::setfill('0') << std::setw(8) << std::hex
1483 << MemoryOffset(component[i].op.affine.ptr_weights, ptr_dnn_memory_) << "\n";
1484 out_file << "<bias_address> " << "0x" << std::setfill('0') << std::setw(8) << std::hex
1485 << MemoryOffset(component[i].op.affine.ptr_biases, ptr_dnn_memory_) << "\n";
1487 std::ofstream out_wfile((out_file_name.str() + "_weights.txt").c_str(), std::ios::out);
1488 std::ofstream out_bfile((out_file_name.str() + "_biases.txt").c_str(), std::ios::out);
1490 if (num_bytes_per_weight == 1) {
1491 int8_t *ptr_weight = reinterpret_cast<int8_t *>(component[i].op.affine.ptr_weights);
1492 intel_compound_bias_t *ptr_bias = reinterpret_cast<intel_compound_bias_t *>(component[i].op.affine.ptr_biases);
1494 for (uint32_t row = 0; row < num_weight_rows; row++) {
1495 for (uint32_t col = 0; col < num_weight_columns; col++) {
1496 if (number_type == kDnnFloat) {
1498 static_cast<float>(ptr_weight[row * num_weight_columns + col]) * ptr_bias[row].multiplier
1499 / weight_scale_factor;
1500 out_wfile << std::setprecision(4) << val << " ";
1502 out_wfile << int((int8_t) ptr_weight[row * num_weight_columns + col]) << " ";
1508 } else if (num_bytes_per_weight == 2) {
1509 int16_t *ptr_weight = reinterpret_cast<int16_t *>(component[i].op.affine.ptr_weights);
1511 for (uint32_t row = 0; row < num_weight_rows; row++) {
1512 for (uint32_t col = 0; col < num_weight_columns; col++) {
1513 if (number_type == kDnnFloat) {
1514 out_wfile << std::setprecision(12)
1515 << ptr_weight[row * num_weight_columns + col] / weight_scale_factor << " ";
1517 out_wfile << ptr_weight[row * num_weight_columns + col] << " ";
1523 } else if (number_type_ == kDnnFloat) {
1524 float *ptr_weight = reinterpret_cast<float *>(component[i].op.affine.ptr_weights);
1526 for (uint32_t row = 0; row < num_weight_rows; row++) {
1527 for (uint32_t col = 0; col < num_weight_columns; col++) {
1528 out_wfile << std::setprecision(5)
1529 << ptr_weight[row * num_weight_columns + col] << " ";
1535 fprintf(stderr, "Unsupported weight type in WriteDnnText!\n");
1538 if (number_type_ == kDnnInt) {
1539 if (num_bytes_per_weight == 1) {
1540 intel_compound_bias_t
1541 *ptr_biases = reinterpret_cast<intel_compound_bias_t *>(component[i].op.affine.ptr_biases);
1543 for (uint32_t row = 0; row < num_rows_out; row++) {
1544 out_bfile << std::setw(8) << ptr_biases[row].bias << ", ";
1545 out_bfile << std::setw(8) << int(ptr_biases[row].multiplier) << "\n";
1549 int32_t *ptr_biases = reinterpret_cast<int32_t *>(component[i].op.affine.ptr_biases);
1551 for (uint32_t row = 0; row < num_rows_out; row++) {
1552 if (number_type == kDnnInt) {
1553 out_bfile << std::setw(8) << ptr_biases[row] << "\n";
1555 out_bfile << std::setw(8) << ptr_biases[row] / output_scale_factor << "\n";
1562 float *ptr_biases = reinterpret_cast<float *>(component[i].op.affine.ptr_biases);
1565 for (uint32_t row = 0; row < num_rows_out; row++) {
1566 out_bfile << std::setprecision(5) << ptr_biases[row] << "\n";
1572 case kDnnConvolutional1dOp: {
1573 uint32_t num_filters = component[i].op.conv1D.num_filters;
1574 uint32_t num_filter_rows = component[i].op.conv1D.num_filter_rows;
1575 uint32_t num_filter_coefficients = component[i].op.conv1D.num_filter_coefficients;
1576 uint32_t num_feature_maps = component[i].op.conv1D.num_feature_maps;
1577 uint32_t num_feature_map_rows = component[i].op.conv1D.num_feature_map_rows;
1578 uint32_t num_feature_map_columns = component[i].op.conv1D.num_feature_map_columns;
1579 uint32_t num_filter_outputs =
1580 component[i].op.conv1D.num_feature_map_rows - component[i].op.conv1D.num_filter_rows + 1;
1581 uint32_t num_bytes_per_weight = component[i].op.conv1D.num_bytes_per_weight;
1582 uint32_t num_bytes_per_bias = component[i].op.conv1D.num_bytes_per_bias;
1583 float weight_scale_factor = component[i].op.conv1D.weight_scale_factor;
1584 float output_scale_factor = component[i].output_scale_factor;
1585 out_file << "<num_filters> " << std::dec << num_filters << "\n";
1586 out_file << "<num_filter_coefficients> " << std::dec << num_filter_coefficients << "\n";
1587 out_file << "<num_filter_rows> " << std::dec << num_filter_rows << "\n";
1588 out_file << "<num_feature_maps> " << std::dec << num_feature_maps << "\n";
1589 out_file << "<num_feature_map_rows> " << std::dec << num_feature_map_rows << "\n";
1590 out_file << "<num_feature_map_columns> " << std::dec << num_feature_map_columns << "\n";
1591 if ((number_type_ == kDnnInt) && (number_type == kDnnFloat)) {
1592 out_file << "<num_bytes_per_weight> " << std::dec << 4 << "\n";
1593 out_file << "<num_bytes_per_bias> " << std::dec << 4 << "\n";
1595 out_file << "<num_bytes_per_weight> " << std::dec << num_bytes_per_weight << "\n";
1596 out_file << "<num_bytes_per_bias> " << std::dec << num_bytes_per_bias << "\n";
1598 if ((number_type_ == kDnnInt) && (number_type == kDnnFloat)) {
1599 out_file << std::setprecision(12) << std::scientific << "<weight_scale_factor> " << 1.0 << "\n";
1600 out_file << std::setprecision(12) << std::scientific << "<output_scale_factor> " << 1.0 << "\n";
1602 out_file << std::setprecision(12) << std::scientific << "<weight_scale_factor> "
1603 << weight_scale_factor << "\n";
1604 out_file << std::setprecision(12) << std::scientific << "<output_scale_factor> "
1605 << output_scale_factor << "\n";
1607 out_file << "<filter_address> " << "0x" << std::setfill('0') << std::setw(8) << std::hex
1608 << MemoryOffset(component[i].op.conv1D.ptr_filters, ptr_dnn_memory_) << "\n";
1609 out_file << "<bias_address> " << "0x" << std::setfill('0') << std::setw(8) << std::hex
1610 << MemoryOffset(component[i].op.conv1D.ptr_biases, ptr_dnn_memory_) << "\n";
1613 std::ofstream out_wfile((out_file_name.str() + "_weights.txt").c_str(), std::ios::out);
1614 std::ofstream out_bfile((out_file_name.str() + "_biases.txt").c_str(), std::ios::out);
1617 if (num_bytes_per_weight == 1) {
1618 int8_t *ptr_weight = reinterpret_cast<int8_t *>(component[i].op.conv1D.ptr_filters);
1619 intel_compound_bias_t *ptr_bias = reinterpret_cast<intel_compound_bias_t *>(component[i].op.conv1D.ptr_biases);
1621 for (uint32_t row = 0; row < num_filters; row++) {
1622 for (uint32_t col = 0; col < num_filter_coefficients; col++) {
1623 if (number_type == kDnnFloat) {
1624 float val = static_cast<float>(ptr_weight[row * num_filter_coefficients + col])
1625 * ptr_bias[row].multiplier / weight_scale_factor;
1626 out_wfile << std::setprecision(12) <<val << "\n";
1628 out_wfile << "0x" << std::setfill('0') << std::setw(2) << std::hex
1629 << int((uint8_t) ptr_weight[row * num_filter_coefficients + col]) << "\n";
1634 } else if (num_bytes_per_weight == 2) {
1635 int16_t *ptr_weight = reinterpret_cast<int16_t *>(component[i].op.conv1D.ptr_filters);
1637 for (uint32_t row = 0; row < num_filters; row++) {
1638 for (uint32_t col = 0; col < num_filter_coefficients; col++) {
1639 if (number_type == kDnnFloat) {
1640 out_wfile << std::setprecision(12)
1641 << ptr_weight[row * num_filter_coefficients + col] / weight_scale_factor
1644 out_wfile << "0x" << std::setfill('0') << std::setw(4) << std::hex
1645 << ptr_weight[row * num_filter_coefficients + col] << "\n";
1650 } else if (number_type_ == kDnnFloat) {
1651 float *ptr_weight = reinterpret_cast<float *>(component[i].op.conv1D.ptr_filters);
1653 for (uint32_t row = 0; row < num_filters; row++) {
1654 for (uint32_t col = 0; col < num_filter_coefficients; col++) {
1655 out_wfile << std::setprecision(12)
1656 << ptr_weight[row * num_filter_coefficients + col] << "\n";
1662 fprintf(stderr, "Unsupported filter weight type in WriteDnnText!\n");
1666 if (number_type_ == kDnnInt) {
1667 if (number_type == kDnnInt) {
1668 if (num_bytes_per_weight == 1) {
1669 intel_compound_bias_t
1670 *ptr_biases = reinterpret_cast<intel_compound_bias_t *>(component[i].op.conv1D.ptr_biases);
1672 for (uint32_t row = 0; row < num_filters; row++) {
1673 out_bfile << "0x" << std::setfill('0') << std::setw(8) << std::hex
1674 << ptr_biases[row].bias << " ";
1675 out_bfile << "0x" << std::setfill('0') << std::setw(8) << std::hex
1676 << int(ptr_biases[row].multiplier) << "\n";
1680 int32_t *ptr_biases = reinterpret_cast<int32_t *>(component[i].op.conv1D.ptr_biases);
1682 for (uint32_t row = 0; row < num_filters; row++) {
1683 out_bfile << "0x" << std::setfill('0') << std::setw(8) << std::hex << ptr_biases[row]
1689 int32_t *ptr_biases = reinterpret_cast<int32_t *>(component[i].op.conv1D.ptr_biases);
1691 for (uint32_t row = 0; row < num_filters; row++) {
1692 out_bfile << std::setprecision(12)
1693 << ptr_biases[row] / output_scale_factor << "\n";
1698 float *ptr_biases = reinterpret_cast<float *>(component[i].op.conv1D.ptr_biases);
1700 for (uint32_t row = 0; row < num_filters; row++) {
1701 out_bfile << std::setprecision(12) << ptr_biases[row] << "\n";
1708 case kDnnRecurrentOp: {
1709 float weight_scale_factor = component[i].op.recurrent.weight_scale_factor;
1710 float output_scale_factor = component[i].output_scale_factor;
1711 uint32_t num_vector_delay = component[i].op.recurrent.num_vector_delay;
1712 uint32_t num_bytes_per_weight = component[i].op.recurrent.num_bytes_per_weight;
1713 uint32_t num_bytes_per_bias = component[i].op.recurrent.num_bytes_per_bias;
1714 uint32_t num_weight_rows = num_columns_out;
1715 uint32_t num_weight_columns = num_columns_in + num_columns_out;
1716 out_file << "<num_vector_delay> " << std::dec << num_vector_delay << "\n";
1717 if ((number_type_ == kDnnInt) && (number_type == kDnnFloat)) {
1718 out_file << "<num_bytes_per_weight> " << std::dec << 4 << "\n";
1719 out_file << "<num_bytes_per_bias> " << std::dec << 4 << "\n";
1721 out_file << "<num_bytes_per_weight> " << std::dec << num_bytes_per_weight << "\n";
1722 out_file << "<num_bytes_per_bias> " << std::dec << num_bytes_per_bias << "\n";
1724 if ((number_type_ == kDnnInt) && (number_type == kDnnFloat)) {
1725 out_file << std::setprecision(12) << std::scientific << "<weight_scale_factor> " << 1.0 << "\n";
1726 out_file << std::setprecision(12) << std::scientific << "<output_scale_factor> " << 1.0 << "\n";
1728 out_file << std::setprecision(12) << std::scientific << "<weight_scale_factor> "
1729 << weight_scale_factor << "\n";
1730 out_file << std::setprecision(12) << std::scientific << "<output_scale_factor> "
1731 << output_scale_factor << "\n";
1733 out_file << "<weight_address> " << "0x" << std::setfill('0') << std::setw(8) << std::hex
1734 << MemoryOffset(component[i].op.recurrent.ptr_weights, ptr_dnn_memory_) << "\n";
1735 out_file << "<bias_address> " << "0x" << std::setfill('0') << std::setw(8) << std::hex
1736 << MemoryOffset(component[i].op.recurrent.ptr_biases, ptr_dnn_memory_) << "\n";
1737 out_file << "<feedback_address> " << "0x" << std::setfill('0') << std::setw(8) << std::hex
1738 << MemoryOffset(component[i].op.recurrent.ptr_feedbacks, ptr_dnn_memory_) << "\n";
1739 if (num_bytes_per_weight == 1) {
1740 int8_t *ptr_weight = reinterpret_cast<int8_t *>(component[i].op.recurrent.ptr_weights);
1741 intel_compound_bias_t
1742 *ptr_bias = reinterpret_cast<intel_compound_bias_t *>(component[i].op.recurrent.ptr_biases);
1744 for (uint32_t row = 0; row < num_weight_rows; row++) {
1745 out_file << "<weight_row> ";
1746 for (uint32_t col = 0; col < num_weight_columns; col++) {
1747 if (number_type == kDnnFloat) {
1749 static_cast<float>(ptr_weight[row * num_weight_columns + col]) * ptr_bias[col].multiplier
1750 / weight_scale_factor;
1751 out_file << std::setprecision(12) << std::scientific << val << " ";
1753 out_file << "0x" << std::setfill('0') << std::setw(2) << std::hex
1754 << int((uint8_t) ptr_weight[row * num_weight_columns + col]) << " ";
1760 } else if (num_bytes_per_weight == 2) {
1761 int16_t *ptr_weight = reinterpret_cast<int16_t *>(component[i].op.recurrent.ptr_weights);
1763 for (uint32_t row = 0; row < num_weight_rows; row++) {
1764 out_file << "<weight_row> ";
1765 for (uint32_t col = 0; col < num_weight_columns; col++) {
1766 if (number_type == kDnnFloat) {
1767 out_file << std::setprecision(12) << std::scientific
1768 << ptr_weight[row * num_weight_columns + col] / weight_scale_factor << " ";
1770 out_file << "0x" << std::setfill('0') << std::setw(4) << std::hex
1771 << ptr_weight[row * num_weight_columns + col] << " ";
1777 } else if (number_type_ == kDnnFloat) {
1778 float *ptr_weight = reinterpret_cast<float *>(component[i].op.recurrent.ptr_weights);
1780 for (uint32_t row = 0; row < num_weight_rows; row++) {
1781 out_file << "<weight_row> ";
1782 for (uint32_t col = 0; col < num_weight_columns; col++) {
1783 out_file << std::setprecision(12) << std::scientific
1784 << ptr_weight[row * num_weight_columns + col] << " ";
1790 fprintf(stderr, "Unsupported weight type in WriteDnnText!\n");
1793 if (number_type_ == kDnnInt) {
1794 if (number_type == kDnnInt) {
1795 if (num_bytes_per_weight == 1) {
1796 intel_compound_bias_t
1797 *ptr_biases = reinterpret_cast<intel_compound_bias_t *>(component[i].op.recurrent.ptr_biases);
1798 out_file << "<compound_bias>" << " ";
1800 for (uint32_t col = 0; col < num_columns_out; col++) {
1801 out_file << "0x" << std::setfill('0') << std::setw(8) << std::hex
1802 << ptr_biases[col].bias << " ";
1803 out_file << "0x" << std::setfill('0') << std::setw(8) << std::hex
1804 << ptr_biases[col].multiplier << " ";
1808 int32_t *ptr_biases = reinterpret_cast<int32_t *>(component[i].op.recurrent.ptr_biases);
1809 out_file << "<bias>" << " ";
1811 for (uint32_t col = 0; col < num_columns_out; col++) {
1812 out_file << "0x" << std::setfill('0') << std::setw(8) << std::hex << ptr_biases[col]
1818 int32_t *ptr_biases = reinterpret_cast<int32_t *>(component[i].op.recurrent.ptr_biases);
1819 out_file << "<bias>" << " ";
1821 for (uint32_t col = 0; col < num_columns_out; col++) {
1822 out_file << std::setprecision(12) << std::scientific
1823 << ptr_biases[col] / output_scale_factor << " ";
1828 float *ptr_biases = reinterpret_cast<float *>(component[i].op.recurrent.ptr_biases);
1829 out_file << "<bias>" << " ";
1831 for (uint32_t col = 0; col < num_columns_out; col++) {
1832 out_file << std::setprecision(12) << std::scientific << ptr_biases[col] << " ";
1839 case kDnnMaxPoolOp: {
1840 uint32_t num_pool_type = (component[i].op.maxpool.do_sum_not_max) ? 2 : 1;
1841 out_file << "<pool_type> " << std::dec << num_pool_type << "\n";
1842 out_file << "<pool_size> " << std::dec << component[i].op.maxpool.num_inputs << "\n";
1843 out_file << "<pool_step> " << std::dec << component[i].op.maxpool.num_inputs_step << "\n";
1844 out_file << "<pool_num_rows> " << std::dec << component[i].op.maxpool.num_inputs_stride << "\n";
1845 out_file << std::setprecision(12) << std::scientific << "<output_scale_factor> "
1846 << component[i].output_scale_factor << "\n";
1849 case kDnnPiecewiselinearOp: {
1850 intel_pwl_segment_t *ptr_segment = component[i].op.pwl.ptr_segments;
1851 DnnActivationType func_id = component[i].op.pwl.func_id.type;
1852 uint32_t num_segments = component[i].op.pwl.num_segments;
1853 float output_scale_factor = component[i].output_scale_factor;
1854 out_file << "<func_id> " << intel_dnn_activation_name[func_id] << "\n";
1855 out_file << "<num_bytes_per_slope> " << std::dec << sizeof(int16_t) << "\n";
1856 out_file << "<num_bytes_per_intercept> " << std::dec << sizeof(int16_t) << "\n";
1857 out_file << "<num_bytes_per_offset> " << std::dec << sizeof(int32_t) << "\n";
1858 if (number_type == kDnnFloat) {
1859 out_file << std::setprecision(12) << std::scientific << "<output_scale_factor> " << 1.0 << "\n";
1860 out_file << "<num_segments> " << std::dec << 0 << "\n";
1861 out_file << "<segment_address> " << "0x" << std::setfill('0') << std::setw(8) << std::hex
1862 << MemoryOffset(component[i].op.pwl.ptr_segments, ptr_dnn_memory_) << "\n";
1864 out_file << std::setprecision(12) << std::scientific << "<output_scale_factor> "
1865 << output_scale_factor << "\n";
1866 out_file << "<num_segments> " << std::dec << num_segments << "\n";
1867 out_file << "<segment_address> " << "0x" << std::setfill('0') << std::setw(8) << std::hex
1868 << MemoryOffset(component[i].op.pwl.ptr_segments, ptr_dnn_memory_) << "\n";
1869 if (number_type_ == kDnnInt) {
1870 out_file << "<slope> ";
1871 for (int segment = 0; segment < num_segments; segment++) {
1872 out_file << "0x" << std::setfill('0') << std::setw(4) << std::hex
1873 << ptr_segment[segment].slope << " ";
1876 out_file << "<intercept> ";
1877 for (int segment = 0; segment < component[i].op.pwl.num_segments; segment++) {
1878 out_file << "0x" << std::setfill('0') << std::setw(4) << std::hex
1879 << ptr_segment[segment].yBase << " ";
1882 out_file << "<offset> ";
1883 for (int segment = 0; segment < component[i].op.pwl.num_segments; segment++) {
1884 out_file << "0x" << std::setfill('0') << std::setw(8) << std::hex
1885 << ptr_segment[segment].xBase << " ";
1888 } else if (num_segments > 0) {
1890 "Number of segments must be zero in floating point model in WriteDnnText!\n");
1896 case kDnnInterleaveOp:
1897 out_file << std::setprecision(12) << std::scientific << "<output_scale_factor> "
1898 << component[i].output_scale_factor << "\n";
1900 case kDnnDeinterleaveOp:
1901 out_file << std::setprecision(12) << std::scientific << "<output_scale_factor> "
1902 << component[i].output_scale_factor << "\n";
1905 out_file << std::setprecision(12) << std::scientific << "<output_scale_factor> "
1906 << component[i].output_scale_factor << "\n";
1907 out_file << "<num_copy_rows> " << std::dec << component[i].op.copy.num_copy_rows << "\n";
1908 out_file << "<num_copy_columns> " << std::dec << component[i].op.copy.num_copy_columns << "\n";
1911 out_file << "<Error!!!> Unsupported Component : "
1912 << intel_dnn_operation_name[component[i].operation] << "\n";
1913 // fprintf(stderr, "Component type %s not yet supported in AmIntelDnn::WriteDnnText()!\n",
1914 // intel_dnn_operation_name[component[i].operation]);
1919 if (ptr_active_outputs() != nullptr) {
1920 out_file << "<activelist_address> " << "0x" << std::setfill('0') << std::setw(8) << std::hex
1921 << MemoryOffset(ptr_active_outputs(), ptr_dnn_memory_) << "\n";
1923 out_file << "<end_of_file>\n";
1926 fprintf(stderr, "Failed to open %s for writing!\n", filename);
1931 void AmIntelDnn::InitGNAStruct(intel_nnet_type_t *ptr_nnet) {
// Builds the GNA-API network descriptor from the internal component list.
// Only "real" GNA primitives (affine/diagonal/conv1d/recurrent/interleave/
// deinterleave/copy) get a layer of their own; piecewise-linear activation
// and max-pool components are folded into the layer created for the
// component that precedes them.  Per-layer structures are allocated with
// _mm_malloc and must be released via DestroyGNAStruct().
1932 intel_nnet_layer_t *pLayer;
// --- argument validation: the descriptor must come in empty ---------------
1934 if (ptr_nnet == nullptr)
1935 THROW_GNA_EXCEPTION << "Invalid input parameter";
1936 if (ptr_nnet->pLayers != nullptr)
1937 THROW_GNA_EXCEPTION << "InitGNAStruct can't work on prellocated layers array";
1938 if (component.empty())
1939 THROW_GNA_EXCEPTION << "empty model in AmIntelDnn::FillGNAStruct()";
// --- first pass: count how many components map to standalone GNA layers ---
// (PWL and max-pool components are intentionally absent from this list:
// they do not increment nLayers because they are fused below.)
1941 ptr_nnet->nLayers = 0;
1942 for (auto && c : component) {
1943 if (c.operation == kDnnAffineOp
1944 || (c.operation == kDnnDiagonalOp)
1945 || (c.operation == kDnnConvolutional1dOp)
1946 || (c.operation == kDnnDeinterleaveOp)
1947 || (c.operation == kDnnInterleaveOp)
1948 || (c.operation == kDnnRecurrentOp)
1949 || (c.operation == kDnnCopyOp)
1951 ptr_nnet->nLayers++;
1954 ptr_nnet->nGroup = num_group_in();
// 64-byte alignment as required for GNA buffers.
1955 ptr_nnet->pLayers = reinterpret_cast<intel_nnet_layer_t *>(_mm_malloc(ptr_nnet->nLayers * sizeof(intel_nnet_layer_t), 64));
1956 if (ptr_nnet->pLayers == nullptr)
1957 THROW_GNA_EXCEPTION << "out of memory in AmIntelDnn::FillGNAStruct()";
1958 pLayer = ptr_nnet->pLayers;
// --- second pass: fill one intel_nnet_layer_t per primitive ---------------
// pLayer is advanced (on lines elided from this view) only when a component
// produces its own layer; fused components keep patching the current layer.
1960 for (int i = 0; i < component.size(); i++) {
1961 // std::cout << "Component + " << i <<"=GNA_" << std::distance(ptr_nnet->pLayers, pLayer) << "\n";
1962 switch (component[i].operation) {
// kDnnAffineOp -> INTEL_AFFINE.
1964 pLayer->nInputRows = component[i].num_rows_in;
1965 pLayer->nInputColumns = component[i].num_columns_in;
1966 pLayer->nOutputRows = component[i].num_rows_out;
1967 pLayer->nOutputColumns = component[i].num_columns_out;
1968 pLayer->nBytesPerInput = component[i].num_bytes_per_input;
1969 pLayer->nBytesPerOutput = component[i].num_bytes_per_output; // will be overwritten if PWL op is needed
1970 pLayer->nBytesPerIntermediateOutput = sizeof(int32_t);
1971 pLayer->pInputs = component[i].ptr_inputs;
1972 pLayer->pOutputsIntermediate = component[i].ptr_outputs;
1973 pLayer->pOutputs = component[i].ptr_outputs; // will be overwritten if PWL op is needed
1974 pLayer->nLayerKind = INTEL_AFFINE;
1976 pLayer->pLayerStruct = _mm_malloc(sizeof(intel_affine_layer_t), 64);
1977 if (pLayer->pLayerStruct == nullptr) {
1978 THROW_GNA_EXCEPTION << "could not allocate memory for INTEL_AFFINE layer structure.";
1980 auto pAffineLayer = reinterpret_cast<intel_affine_layer_t *>(pLayer->pLayerStruct);
// PWL segments stay empty here; a following kDnnPiecewiselinearOp
// component fills them in (see that case below).
1981 pAffineLayer->pwl.pSegments = nullptr;
1982 pAffineLayer->pwl.nSegments = 0;
1984 pAffineLayer->affine.nBytesPerBias = component[i].op.affine.num_bytes_per_bias;
1985 pAffineLayer->affine.nBytesPerWeight = component[i].op.affine.num_bytes_per_weight;
1986 pAffineLayer->affine.pBiases = component[i].op.affine.ptr_biases;
1987 pAffineLayer->affine.pWeights = component[i].op.affine.ptr_weights;
// If no activation follows, this component's output buffer is final.
1989 if (i == component.size() - 1 || component[i + 1].operation != kDnnPiecewiselinearOp) {
// kDnnDiagonalOp -> INTEL_AFFINE_DIAGONAL (same affine payload, diagonal kind).
1993 case kDnnDiagonalOp:
1994 pLayer->nInputRows = component[i].num_rows_in;
1995 pLayer->nInputColumns = component[i].num_columns_in;
1996 pLayer->nOutputRows = component[i].num_rows_out;
1997 pLayer->nOutputColumns = component[i].num_columns_out;
1998 pLayer->nBytesPerInput = component[i].num_bytes_per_input;
1999 pLayer->nBytesPerOutput = component[i].num_bytes_per_output; // will be overwritten if PWL op is needed
2000 pLayer->nBytesPerIntermediateOutput = sizeof(int32_t);
2001 pLayer->pInputs = component[i].ptr_inputs;
2002 pLayer->pOutputsIntermediate = component[i].ptr_outputs;
2003 pLayer->pOutputs = component[i].ptr_outputs; // will be overwritten if PWL op is needed
2004 pLayer->nLayerKind = INTEL_AFFINE_DIAGONAL;
2006 pLayer->pLayerStruct = _mm_malloc(sizeof(intel_affine_layer_t), 64);
2007 if (pLayer->pLayerStruct == nullptr) {
2008 THROW_GNA_EXCEPTION << "could not allocate memory for INTEL_AFFINE_DIAGONAL layer structure.";
2010 auto pDiagonalLayer = reinterpret_cast<intel_affine_layer_t *>(pLayer->pLayerStruct);
2011 pDiagonalLayer->pwl.pSegments = nullptr;
2012 pDiagonalLayer->pwl.nSegments = 0;
2014 pDiagonalLayer->affine.nBytesPerBias = component[i].op.affine.num_bytes_per_bias;
2015 pDiagonalLayer->affine.nBytesPerWeight = component[i].op.affine.num_bytes_per_weight;
2016 pDiagonalLayer->affine.pBiases = component[i].op.affine.ptr_biases;
2017 pDiagonalLayer->affine.pWeights = component[i].op.affine.ptr_weights;
2019 if (i == component.size() - 1 || component[i + 1].operation != kDnnPiecewiselinearOp) {
// kDnnRecurrentOp -> INTEL_RECURRENT (adds the feedback buffer pointer).
2023 case kDnnRecurrentOp:
2024 pLayer->nInputRows = component[i].num_rows_in;
2025 pLayer->nInputColumns = component[i].num_columns_in;
2026 pLayer->nOutputRows = component[i].num_rows_out;
2027 pLayer->nOutputColumns = component[i].num_columns_out;
2028 pLayer->nBytesPerInput = component[i].num_bytes_per_input;
2029 pLayer->nBytesPerOutput = component[i].num_bytes_per_output; // will be overwritten if PWL op is needed
2030 pLayer->nBytesPerIntermediateOutput = sizeof(int32_t);
2031 pLayer->pInputs = component[i].ptr_inputs;
2032 pLayer->pOutputsIntermediate = component[i].ptr_outputs;
2033 pLayer->pOutputs = component[i].ptr_outputs; // will be overwritten if PWL op is needed
2034 pLayer->nLayerKind = INTEL_RECURRENT;
2036 pLayer->pLayerStruct = _mm_malloc(sizeof(intel_recurrent_layer_t), 64);
2037 if (pLayer->pLayerStruct == nullptr) {
2038 THROW_GNA_EXCEPTION << "could not allocate memory for INTEL_RECURRENT layer structure.";
2040 auto pRecurrentLayer = reinterpret_cast<intel_recurrent_layer_t *>(pLayer->pLayerStruct);
2041 pRecurrentLayer->pFeedbackBuffer = component[i].op.recurrent.ptr_feedbacks;
2042 pRecurrentLayer->pwl.pSegments = nullptr;
2043 pRecurrentLayer->pwl.nSegments = 0;
2045 pRecurrentLayer->affine.nBytesPerBias = component[i].op.recurrent.num_bytes_per_bias;
2046 pRecurrentLayer->affine.nBytesPerWeight = component[i].op.recurrent.num_bytes_per_weight;
2047 pRecurrentLayer->affine.pBiases = component[i].op.recurrent.ptr_biases;
2048 pRecurrentLayer->affine.pWeights = component[i].op.recurrent.ptr_weights;
2050 if (i == component.size() - 1 || component[i + 1].operation != kDnnPiecewiselinearOp) {
// kDnnConvolutional1dOp -> INTEL_CONVOLUTIONAL.  Pooling fields start in a
// "no pooling" state and are patched by a following kDnnMaxPoolOp component.
2054 case kDnnConvolutional1dOp:
2055 pLayer->nInputRows = component[i].num_rows_in;
2056 pLayer->nInputColumns = component[i].num_columns_in;
2057 pLayer->nOutputRows = component[i].num_rows_out;
2058 pLayer->nOutputColumns = component[i].num_columns_out;
2059 pLayer->nBytesPerInput = component[i].num_bytes_per_input;
2060 pLayer->nBytesPerOutput = component[i].num_bytes_per_output; // will be overwritten
2061 pLayer->nBytesPerIntermediateOutput = sizeof(int32_t);
2062 pLayer->pInputs = component[i].ptr_inputs;
2063 pLayer->pOutputsIntermediate = component[i].ptr_outputs;
2064 pLayer->pOutputs = component[i].ptr_outputs; // will be overwritten
2065 pLayer->nLayerKind = INTEL_CONVOLUTIONAL;
2067 pLayer->pLayerStruct = _mm_malloc(sizeof(intel_convolutional_layer_t), 64);
2068 if (pLayer->pLayerStruct == nullptr) {
2069 THROW_GNA_EXCEPTION << "could not allocate memory for INTEL_CONVOLUTIONAL layer structure.";
2071 auto pConvolutionalLayer = reinterpret_cast<intel_convolutional_layer_t *>(pLayer->pLayerStruct);
2072 pConvolutionalLayer->nBytesBias = component[i].op.conv1D.num_bytes_per_bias;
2073 pConvolutionalLayer->nBytesFilterCoefficient = component[i].op.conv1D.num_bytes_per_weight;
2074 pConvolutionalLayer->nFilters = component[i].op.conv1D.num_filters;
2075 pConvolutionalLayer->nFilterRows = component[i].op.conv1D.num_filter_rows;
2076 pConvolutionalLayer->nFilterCoefficients = component[i].op.conv1D.num_filter_coefficients;
2077 pConvolutionalLayer->nFeatureMaps = component[i].op.conv1D.num_feature_maps;
2078 pConvolutionalLayer->nFeatureMapRows = component[i].op.conv1D.num_feature_map_rows;
2079 pConvolutionalLayer->nFeatureMapColumns = component[i].op.conv1D.num_feature_map_columns;
2080 pConvolutionalLayer->poolType = INTEL_NO_POOLING; // will be overwritten
2081 pConvolutionalLayer->nPoolSize = 0; // will be overwritten
2082 pConvolutionalLayer->nPoolStride = 0; // will be overwritten
2083 pConvolutionalLayer->pwl.nSegments = 0; // will be overwritten
2084 pConvolutionalLayer->pwl.pSegments = nullptr; // will be overwritten
2085 pConvolutionalLayer->pBiases = component[i].op.conv1D.ptr_biases;
2086 pConvolutionalLayer->pFilters = component[i].op.conv1D.ptr_filters;
2088 if (i == component.size() - 1 || ((component[i + 1].operation != kDnnMaxPoolOp)
2089 && (component[i + 1].operation != kDnnPiecewiselinearOp))) {
// kDnnMaxPoolOp (case label elided from this view): fuse pooling into the
// preceding INTEL_CONVOLUTIONAL layer; pooling cannot stand alone.
2095 THROW_GNA_EXCEPTION << "Pooling component with no preceeding component";
2096 } else if (pLayer->nLayerKind == INTEL_CONVOLUTIONAL) {
2097 if (pLayer->pLayerStruct == nullptr) {
2098 THROW_GNA_EXCEPTION "INTEL_CONVOLUTIONAL layer structure was not initialized.";
2100 auto pConvolutionalLayer = reinterpret_cast<intel_convolutional_layer_t *>(pLayer->pLayerStruct);
2101 // it is possible to have activation preceding to maxpool
2102 if (pConvolutionalLayer->pwl.nSegments != 0) {
2103 THROW_GNA_EXCEPTION << "Encountered activation component before pooling component at." << i;
2105 pConvolutionalLayer->poolType =
2106 (component[i].op.maxpool.do_sum_not_max) ? INTEL_SUM_POOLING : INTEL_MAX_POOLING;
2107 pConvolutionalLayer->nPoolSize = component[i].op.maxpool.num_inputs;
2108 pConvolutionalLayer->nPoolStride = component[i].op.maxpool.num_inputs_step;
2111 // number of output columns correction - based on GNA-library expectations
2112 auto nFltSize = pConvolutionalLayer->nFilterCoefficients;
2113 auto fltStrideSz = pConvolutionalLayer->nFeatureMaps * pConvolutionalLayer->nFeatureMapColumns; // always move 1 "row"
2114 auto maxNCOE = (pLayer->nInputColumns - nFltSize) / fltStrideSz + 1;
2115 // FLAT input matrix, pooled outputs per filter
2116 pLayer->nOutputColumns = pConvolutionalLayer->nFilters * ((maxNCOE - 1) / pConvolutionalLayer->nPoolStride + 1);
2119 // pLayer->nOutputColumns /= pConvolutionalLayer->nPoolStride;
2122 THROW_GNA_EXCEPTION << "Pooling component applied to non-convolutional layer";
// kDnnPiecewiselinearOp: fuse the activation's segments into whichever layer
// kind precedes it (affine/diagonal, recurrent, or conv — possibly with a
// max-pool component sandwiched in between).
2125 case kDnnPiecewiselinearOp:
2126 pLayer->pOutputs = component[i].ptr_outputs;
2127 pLayer->nBytesPerOutput = component[i].num_bytes_per_output;
2128 if (pLayer->pLayerStruct == nullptr) {
2129 THROW_GNA_EXCEPTION << pLayer->nLayerKind << " layer structure was not initialized.";
2132 THROW_GNA_EXCEPTION << "PWL component with no preceding component.";
2133 } else if ((component[i - 1].operation == kDnnAffineOp)
2134 || (component[i - 1].operation == kDnnDiagonalOp)) {
2135 auto pAffineLayer = reinterpret_cast<intel_affine_layer_t *>(pLayer->pLayerStruct);
2136 pAffineLayer->pwl.nSegments = component[i].op.pwl.num_segments;
2137 pAffineLayer->pwl.pSegments = component[i].op.pwl.ptr_segments;
2138 } else if (component[i - 1].operation == kDnnRecurrentOp) {
2139 auto pRecurrentLayer = reinterpret_cast<intel_recurrent_layer_t *>(pLayer->pLayerStruct);
2140 pRecurrentLayer->pwl.nSegments = component[i].op.pwl.num_segments;
2141 pRecurrentLayer->pwl.pSegments = component[i].op.pwl.ptr_segments;
2142 } else if ((component[i - 1].operation == kDnnConvolutional1dOp)
2143 || ((component[i - 1].operation == kDnnMaxPoolOp)
2144 && (component[i - 2].operation == kDnnConvolutional1dOp))) {
2145 auto pConvolutionalLayer = reinterpret_cast<intel_convolutional_layer_t *>(pLayer->pLayerStruct);
2146 pConvolutionalLayer->pwl.nSegments = component[i].op.pwl.num_segments;
2147 pConvolutionalLayer->pwl.pSegments = component[i].op.pwl.ptr_segments;
// Without pooling the activation dictates the final column count;
// with pooling the corrected count from the maxpool case stands.
2148 if (component[i - 1].operation != kDnnMaxPoolOp) {
2149 pLayer->nOutputColumns = component[i].num_columns_out;
// kDnnInterleaveOp -> INTEL_INTERLEAVE (pure data-movement, no pLayerStruct).
2155 case kDnnInterleaveOp:
2156 pLayer->nInputRows = component[i].num_rows_in;
2157 pLayer->nInputColumns = component[i].num_columns_in;
2158 pLayer->nOutputRows = component[i].num_rows_out;
2159 pLayer->nOutputColumns = component[i].num_columns_out;
2160 pLayer->nBytesPerInput = component[i].num_bytes_per_input;
2161 pLayer->nBytesPerOutput = component[i].num_bytes_per_output;
2162 pLayer->nBytesPerIntermediateOutput = sizeof(int32_t);
2163 pLayer->pInputs = component[i].ptr_inputs;
2164 pLayer->pOutputsIntermediate = nullptr;
2165 pLayer->pOutputs = component[i].ptr_outputs;
2166 pLayer->nLayerKind = INTEL_INTERLEAVE;
2167 pLayer->pLayerStruct = nullptr;
// kDnnDeinterleaveOp -> INTEL_DEINTERLEAVE (pure data-movement, no pLayerStruct).
2170 case kDnnDeinterleaveOp:
2171 pLayer->nInputRows = component[i].num_rows_in;
2172 pLayer->nInputColumns = component[i].num_columns_in;
2173 pLayer->nOutputRows = component[i].num_rows_out;
2174 pLayer->nOutputColumns = component[i].num_columns_out;
2175 pLayer->nBytesPerInput = component[i].num_bytes_per_input;
2176 pLayer->nBytesPerOutput = component[i].num_bytes_per_output;
2177 pLayer->nBytesPerIntermediateOutput = sizeof(int32_t);
2178 pLayer->pInputs = component[i].ptr_inputs;
2179 pLayer->pOutputsIntermediate = nullptr;
2180 pLayer->pOutputs = component[i].ptr_outputs;
2181 pLayer->nLayerKind = INTEL_DEINTERLEAVE;
2182 pLayer->pLayerStruct = nullptr;
// kDnnCopyOp (case label elided from this view) -> INTEL_COPY.
// NOTE(review): rows/columns are deliberately swapped here relative to the
// component (copy works on interleaved data) — confirm against the GNA API.
2186 pLayer->nInputRows = component[i].num_columns_in;
2187 pLayer->nInputColumns = component[i].num_rows_in;
2188 pLayer->nOutputRows = component[i].num_columns_out;
2189 pLayer->nOutputColumns = component[i].num_rows_out;
2190 pLayer->nBytesPerInput = component[i].num_bytes_per_input;
2191 pLayer->nBytesPerOutput = component[i].num_bytes_per_output;
2192 pLayer->nBytesPerIntermediateOutput = sizeof(int32_t);
2193 pLayer->pInputs = component[i].ptr_inputs;
2194 pLayer->pOutputsIntermediate = nullptr;
2195 pLayer->pOutputs = component[i].ptr_outputs;
2196 pLayer->nLayerKind = INTEL_COPY;
2197 pLayer->pLayerStruct = nullptr;
2199 pLayer->pLayerStruct = _mm_malloc(sizeof(intel_copy_layer_t), 64);
2200 if (pLayer->pLayerStruct == nullptr) {
2201 THROW_GNA_EXCEPTION << pLayer->nLayerKind << " could not allocate memory for INTEL_COPY layer structure.";
2203 auto *pCopyLayer = reinterpret_cast<intel_copy_layer_t *>(pLayer->pLayerStruct);
2204 pCopyLayer->nCopyRows = component[i].op.copy.num_copy_columns;
2205 pCopyLayer->nCopyCols = component[i].op.copy.num_copy_rows;
2210 THROW_GNA_EXCEPTION << "GNA does yet not support " << intel_dnn_operation_name[component[i].operation];
2214 // enable debugging of partial array of components
// Recompute nLayers from how far pLayer actually advanced, so a partially
// converted component list still yields a consistent descriptor.
2215 ptr_nnet->nLayers = std::distance(ptr_nnet->pLayers, pLayer);
2218 void AmIntelDnn::DestroyGNAStruct(intel_nnet_type_t *ptr_nnet) {
2219 ptr_nnet->nGroup = 0;
2220 if (ptr_nnet->pLayers != nullptr) {
2221 for (int i = 0; i < ptr_nnet->nLayers; i++) {
2222 switch (ptr_nnet->pLayers[i].nLayerKind) {
2223 case INTEL_AFFINE:break;
2224 case INTEL_AFFINE_DIAGONAL:break;
2225 case INTEL_RECURRENT:break;
2226 case INTEL_CONVOLUTIONAL:break;
2227 case INTEL_INTERLEAVE:break;
2228 case INTEL_DEINTERLEAVE:break;
2229 case INTEL_COPY:break;
2232 if (ptr_nnet->pLayers[i].pLayerStruct != nullptr) {
2233 _mm_free(ptr_nnet->pLayers[i].pLayerStruct);
2236 if (ptr_nnet->pLayers != nullptr) {
2237 _mm_free(ptr_nnet->pLayers);
2240 ptr_nnet->nLayers = 0;
2243 void AmIntelDnn::GetScaledOutput(float *ptr_output, uint32_t component_index) {
2244 if (component_index > num_components()) {
2245 fprintf(stderr, "Illegal component index %d in GetScaledOutput\n", component_index);
2248 if (ptr_output != nullptr) {
2249 float scale_factor = OutputScaleFactor(component_index);
2250 uint32_t num_elements = component[component_index].num_rows_out * component[component_index].num_columns_out;
2251 if (number_type_ == kDnnFloat) {
2252 float *ptr_input = reinterpret_cast<float *>(component[component_index].ptr_outputs);
2253 for (uint32_t i = 0; i < num_elements; i++) {
2254 ptr_output[i] = ptr_input[i] / scale_factor;
2256 } else if (component[component_index].num_bytes_per_output == 2) {
2257 int16_t *ptr_input = reinterpret_cast<int16_t *>(component[component_index].ptr_outputs);
2258 for (uint32_t i = 0; i < num_elements; i++) {
2259 ptr_output[i] = static_cast<float>(ptr_input[i]) / scale_factor;
2262 int32_t *ptr_input = reinterpret_cast<int32_t *>(component[component_index].ptr_outputs);
2263 for (uint32_t i = 0; i < num_elements; i++) {
2264 ptr_output[i] = static_cast<float>(ptr_input[i]) / scale_factor;
2268 fprintf(stderr, "Output pointer is nullptr in GetScaledOutput\n");
2273 void AmIntelDnn::WriteInputAndOutputTextGNA(intel_nnet_type_t * nnet) {
// Debug dump: for every GNA layer writes its raw inputs, outputs and PWL
// segments to per-layer text files under getDumpFolderNameGNA(), and — when a
// matching reference output file exists under getRefFolderName() — prints
// RMSE / average / max absolute difference against it to stdout.
// (Parts of the body, including the declarations of maxD/numItems and the
// dump-guard preprocessor conditionals, are elided from this view.)
2276 for (int i = 0; i < nnet->nLayers; i++) {
// NOTE(review): this local 'component' (the GNA layer array) shadows the
// AmIntelDnn::component member — it is indexed with [i] below just like
// nnet->pLayers.
2277 auto component = nnet->pLayers;
2278 std::stringstream out_file_name;
// Map a GNA layer kind to a short tag used in the dump file names.
2279 auto getLayerType = [](intel_layer_kind_t kind){
2281 case INTEL_AFFINE : return "affine";
2282 case INTEL_AFFINE_DIAGONAL : return "diag";
2283 case INTEL_RECURRENT : return "recurrent";
2284 case INTEL_CONVOLUTIONAL : return "convolution";
2285 case INTEL_INTERLEAVE : return "interleave";
2286 case INTEL_DEINTERLEAVE : return "deinterleave";
2287 case INTEL_COPY : return "copy";
2288 default: return "unknown";
// File-name stem: "<index>_<kind>-<inputRows>-<outputRows>".
2291 out_file_name << std::setfill('0') << std::setw(2) << i << "_"
2292 << getLayerType(component[i].nLayerKind)
2293 << "-" << nnet->pLayers[i].nInputRows
2294 << "-" << nnet->pLayers[i].nOutputRows;
2296 auto inputfileName = getDumpFolderNameGNA() + out_file_name.str() + "_input.txt";
2297 auto outFileName = getDumpFolderNameGNA() + out_file_name.str() + "_output.txt";
2298 auto pwlFileName = getDumpFolderNameGNA() + out_file_name.str() + "_pwl.txt";
2299 auto refOutputFileName = getRefFolderName() + out_file_name.str() + "_output.txt";
2301 std::ofstream out_file(outFileName.c_str(), std::ios::out);
2302 std::ofstream pwl_file(pwlFileName.c_str(), std::ios::out);
2303 std::ifstream ref_out_file(refOutputFileName.c_str(), std::ios::in);
2304 std::ofstream in_file(inputfileName.c_str(), std::ios::out);
// Accumulators for the reference comparison below.
2306 float summOfDiff = 0.f;
2307 float summOfSqDiff = 0.f;
// Dump PWL segments as "slope, xBase, yBase" lines.
2311 auto write_pwl = [&pwl_file](intel_pwl_func_t & pwl) {
2312 for (int k =0; k < pwl.nSegments; k++) {
2313 pwl_file << pwl.pSegments[k].slope << ", " << pwl.pSegments[k].xBase << ", " << pwl.pSegments[k].yBase << "\n";
2316 if (nnet->pLayers[i].nLayerKind == INTEL_AFFINE || nnet->pLayers[i].nLayerKind == INTEL_AFFINE_DIAGONAL) {
2317 auto affine = reinterpret_cast<intel_affine_layer_t*>(nnet->pLayers[i].pLayerStruct);
2318 write_pwl(affine->pwl);
2320 if (nnet->pLayers[i].nLayerKind == INTEL_CONVOLUTIONAL) {
2321 auto conv = reinterpret_cast<intel_convolutional_layer_t*>(nnet->pLayers[i].pLayerStruct);
2322 write_pwl(conv->pwl);
// Dump outputs, interpreting the buffer as int32 or int16 by nBytesPerOutput.
// Values are written raw (the "/ 1.0" applies no scaling).
2325 for (int k = 0; k < component[i].nOutputRows; k++) {
2326 for (int j = 0; j < component[i].nOutputColumns; j++) {
2327 float floatValue = 0.f;
2328 if (component[i].nBytesPerOutput == 4) {
2329 auto value = (reinterpret_cast<int32_t *>(component[i].pOutputs)[k * component[i].nOutputColumns + j]);
2330 floatValue = (static_cast<float>(value) / 1.0);
2332 auto value = reinterpret_cast<int16_t *>(component[i].pOutputs)[k * component[i].nOutputColumns + j];
2333 floatValue = (static_cast<float>(value) / 1.0);
2335 out_file << std::setw(8) << floatValue << "\n";
// Compare against the reference dump, element by element.
2337 float ref_value = 0.f;
2338 ref_out_file >> ref_value;
2339 float diff = (ref_value - floatValue);
2340 diff = diff < 0 ? -diff : diff;
2342 summOfSqDiff += diff * diff;
2343 maxD = std::max(maxD, diff);
// Per-layer error summary on stdout (maxD/numItems declared on elided lines).
2349 auto rmse = sqrt(summOfSqDiff / numItems);
2350 auto avg = summOfDiff / numItems;
2351 std :: cout << std::left << std::setw(55) << out_file_name.str()
2352 << " RMSE="<< std::fixed << std::setprecision(5) << std::right << std::setw(8) << rmse
2353 << " avg=" << std::fixed << std::setprecision(5) << std::right << std::setw(8) << avg
2354 << " maxD="<< std::fixed << std::setprecision(5) << std::right << std::setw(8) << maxD << std::endl;
// Dump inputs, again switching on the per-element byte width.
2358 for (int k = 0; k < component[i].nInputRows; k++) {
2359 for (int j = 0; j < component[i].nInputColumns; j++) {
2360 if (component[i].nBytesPerInput == 4) {
2361 in_file << std::setw(8)
2362 << (reinterpret_cast<int32_t *>(component[i].pInputs)[k * component[i].nInputColumns + j]);
2364 in_file << std::setw(8)
2365 << (reinterpret_cast<int16_t *>(component[i].pInputs)[k * component[i].nInputColumns + j]);
void AmIntelDnn::WriteInputAndOutputText() {
    // Dump every component's output and input buffers as text files under
    // getDumpFolderName(). If a matching reference dump exists under
    // getRefFolderName(), also read it back and print RMSE / average /
    // max absolute difference statistics to stdout.
    for (int i = 0; i < num_components(); i++) {
        std::stringstream out_file_name;
        // File-name pattern: NN_<operation>-<rows_in>-<rows_out>[-<activation>]
        out_file_name << std::setfill('0') << std::setw(2) << i << "_"
                      << intel_dnn_operation_name[component[i].operation]
                      << "-" << component[i].num_rows_in
                      << "-" << component[i].num_rows_out;
        if (component[i].operation == kDnnPiecewiselinearOp) {
            // PWL layers additionally encode the activation function name.
            out_file_name << "-" << intel_dnn_activation_name[component[i].op.pwl.func_id];
        auto inputfileName = getDumpFolderName() + out_file_name.str() + "_input.txt";
        auto outFileName = getDumpFolderName() + out_file_name.str() + "_output.txt";
        auto refOutputFileName = getRefFolderName() + out_file_name.str() + "_output.txt";

        std::ofstream out_file(outFileName.c_str(), std::ios::out);
        std::ifstream ref_out_file(refOutputFileName.c_str(), std::ios::in);
        std::ofstream in_file(inputfileName.c_str(), std::ios::out);

        // Accumulators for the comparison against the reference dump.
        float summOfDiff = 0.f;
        float summOfSqDiff = 0.f;
        // --- write outputs (row-major scan of the output matrix) ---
        for (int k = 0; k < component[i].num_rows_out; k++) {
            for (int j = 0; j < component[i].num_columns_out; j++) {
                float floatValue = 0.f;
                if (component[i].num_bytes_per_output == 4) {
                    if (number_type_ == kDnnInt) {
                        // 32-bit quantized integer output.
                        auto value = reinterpret_cast<int32_t *>(component[i].ptr_outputs)[k * component[i].num_columns_out+ j];
                        floatValue = static_cast<float>(value);
                        // Native 32-bit float output.
                        floatValue = reinterpret_cast<float*>(component[i].ptr_outputs)[k * component[i].num_columns_out+ j];
                    // 16-bit quantized integer output.
                    auto value = reinterpret_cast<int16_t *>(component[i].ptr_outputs)[k * component[i].num_columns_out+ j];
                    floatValue = static_cast<float>(value);
                // De-quantize before writing to the dump file.
                out_file << std::setw(8) << floatValue / component[i].output_scale_factor << "\n";

                float ref_value = 0.f;
                ref_out_file >> ref_value;
                float diff = (ref_value - floatValue);
                // Manual absolute value; accumulate squared error and track max.
                diff = diff < 0.f ? -diff : diff;
                summOfSqDiff += diff * diff;
                maxD = std::max(maxD, diff);
        auto rmse = sqrt(summOfSqDiff / numItems);
        auto avg = summOfDiff / numItems;
        std :: cout << std::left << std::setw(55) << out_file_name.str()
                    << " RMSE="<< std::fixed << std::setprecision(5) << std::right << std::setw(8) << rmse
                    << " avg=" << std::fixed << std::setprecision(5) << std::right << std::setw(8) << avg
                    << " maxD="<< std::fixed << std::setprecision(5) << std::right << std::setw(8) << maxD << std::endl;

        // --- write inputs ---
        // Recover the input scale factor from the output scale factor:
        // for weighted layers divide out the weight scale; PWL inputs are
        // already in the integer domain of the preceding layer, so use 1.
        float input_scale_factor = component[i].output_scale_factor;
        if (component[i].operation == kDnnAffineOp ||
            component[i].operation == kDnnDiagonalOp) {
            input_scale_factor /= component[i].op.affine.weight_scale_factor;
        } else if (component[i].operation == kDnnConvolutional1dOp) {
            input_scale_factor /= component[i].op.conv1D.weight_scale_factor;
        } else if (component[i].operation == kDnnPiecewiselinearOp) {
            input_scale_factor = 1.f;

        for (int k = 0; k < component[i].num_rows_in; k++) {
            for (int j = 0; j < component[i].num_columns_in; j++) {
                float floatValue = 0.f;
                if (component[i].num_bytes_per_input == 4) {
                    if (number_type_ == kDnnInt) {
                        // 32-bit quantized integer input.
                        auto value = reinterpret_cast<int32_t *>(component[i].ptr_inputs)[k * component[i].num_columns_in + j];
                        floatValue = static_cast<float>(value);
                        // Native 32-bit float input.
                        floatValue = reinterpret_cast<float *>(component[i].ptr_inputs)[k * component[i].num_columns_in + j];
                    // 16-bit quantized integer input.
                    auto value = reinterpret_cast<int16_t *>(component[i].ptr_inputs)[k * component[i].num_columns_in+ j];
                    floatValue = static_cast<float>(value);
                // De-quantize before writing to the dump file.
                in_file << std::setw(8) << floatValue / input_scale_factor << "\n";
2468 bool isCompatibleDnn(AmIntelDnn dnn1, AmIntelDnn dnn2) {
2469 bool isCompatible = true;
2471 // compare basic structures to see if they are compatible
2472 if (dnn1.num_components() != dnn2.num_components()) isCompatible = false;
2473 for (int i = 0; i < dnn1.num_components(); i++) {
2474 if (dnn1.component[i].num_rows_in != dnn2.component[i].num_rows_in) isCompatible = false;
2475 if (dnn1.component[i].num_columns_in != dnn2.component[i].num_columns_in) isCompatible = false;
2476 if (dnn1.component[i].num_rows_out != dnn2.component[i].num_rows_out) isCompatible = false;
2477 if (dnn1.component[i].num_columns_out != dnn2.component[i].num_columns_out) isCompatible = false;
2478 if (dnn1.component[i].operation != dnn2.component[i].operation) isCompatible = false;
2481 return (isCompatible);
2484 void ClearScoreError(intel_score_error_t *error) {
2485 error->num_scores = 0;
2486 error->num_errors = 0;
2487 error->max_error = 0.0;
2488 error->sum_error = 0.0;
2489 error->sum_squared_error = 0.0;
2490 error->max_rel_error = 0.0;
2491 error->sum_rel_error = 0.0;
2492 error->sum_squared_rel_error = 0.0;
2495 void UpdateScoreError(intel_score_error_t *error, intel_score_error_t *total_error) {
2496 total_error->num_errors += error->num_errors;
2497 total_error->num_scores += error->num_scores;
2498 total_error->sum_error += error->sum_error;
2499 total_error->sum_squared_error += error->sum_squared_error;
2500 if (error->max_error > total_error->max_error) {
2501 total_error->max_error = error->max_error;
2503 total_error->sum_rel_error += error->sum_rel_error;
2504 total_error->sum_squared_rel_error += error->sum_squared_rel_error;
2505 if (error->max_rel_error > total_error->max_rel_error) {
2506 total_error->max_rel_error = error->max_rel_error;
2510 void SoftmaxGoogle(float *ptr_output, float *ptr_input, const uint32_t num_outputs, const uint32_t num_inputs) {
2511 // Assumes input vector contains log likelihoods
2512 // The computes x[i] = x[i] - log(sum_j exp(x[j]))
2513 // This normalizes the likelihoods by the sum of likelihoods but stores them as log likelihoods
2515 float max_score = ptr_input[0];
2518 // find max score for normalization to [0,1]
2519 for (uint32_t i = 0; i < num_inputs; i++) {
2520 if (ptr_input[i] > max_score) {
2521 max_score = ptr_input[i];
2524 for (uint32_t i = 0; i < num_inputs; i++) {
2525 sum += exp(ptr_input[i] - max_score);
2527 if (sum < 1.0e-20) {
2528 fprintf(stderr, "Warning: attempt to take log(0) in SoftmaxGoogle()!\n");
2531 diff = max_score + log(sum);
2532 for (uint32_t i = 0; i < num_outputs; i++) {
2533 ptr_output[i] = ptr_input[i] - diff;