Publishing 2019 R1 content
[platform/upstream/dldt.git] / inference-engine / src / gna_plugin / dnn.cpp
1 // Copyright (C) 2018-2019 Intel Corporation
2 // SPDX-License-Identifier: Apache-2.0
3 //
4 // dnn.cpp : component based neural network class for ease of use
5 //
6 extern bool global_debug;
7
8 #include <cstdlib>
9 #include <cstdio>
10 #include <cmath>
11 #include <set>
12 #include <details/ie_exception.hpp>
13 #include <algorithm>
14 #include <gna-api-types-xnn.h>
15
16 #ifndef _NO_MKL_
17 #include <mkl_dnn.h>
18 #endif
19 #include "dnn.h"
20 #ifdef INTEGER_REF
21 #include "convnet.h"
22 #include "igemv16.h"
23 #include "igemv8.h"
24 #include "sgemm.h"
25 #else
26 #include "floatmath.h"
27 #endif
28 #include "pwl.h"
29 #include "util.h"
30 #include "gna_plugin_log.hpp"
31
32 #ifdef WIN32
33 # define rand_r(X) rand()
34 #endif
35
36 /**
37  * whether to dump weights and biases
38  */
39 #define DUMP_WB
40 /**
41  * in light mode only layer names are dumped
42  * @param filename
43  * @param number_type
44  * @return
45  */
46 #define LIGHT_DUMP
47
/**
 * Shared counter identifying the current dump session.
 * @return mutable reference to a function-local static counter (starts at 0)
 */
static int & getDumpFolderId() {
    static int dump_folder_id = 0;
    return dump_folder_id;
}
52
53 static std::string getDumpFolderNameGNA() {
54     return std::string("./gna_layers/")+std::to_string(getDumpFolderId() - 1)+"/";
55 }
56
57 static std::string getDumpFolderName() {
58     return std::string("./layers/")+std::to_string(getDumpFolderId() - 1)+"/";
59 }
60
61 static std::string getRefFolderName() {
62     return std::string("./ref_layers/")+std::to_string(getDumpFolderId() - 1)+"/";
63 }
64
65 void AmIntelDnn::BeginNewWrite() {
66     getDumpFolderId()++;
67 }
68
69
/**
 * Binds the network to an externally allocated memory region and resets
 * all per-network bookkeeping fields to their empty defaults.
 * @param ptr_memory       base address of the caller-owned model memory
 * @param num_memory_bytes size of that region in bytes
 * @param number_type      numeric representation of the model data
 * @param scale_factor     quantization scale factor applied to the input
 */
void AmIntelDnn::Init(void *ptr_memory,
                      uint32_t num_memory_bytes,
                      intel_dnn_number_type_t number_type,
                      float scale_factor) {
    ptr_dnn_memory_ = ptr_memory;
    num_bytes_dnn_memory_ = num_memory_bytes;
    number_type_ = number_type;
    input_scale_factor_ = scale_factor;

    // No active-output restriction, no left/right context frames, no input
    // rotation, no softmax, no sum-groups and no priors until configured.
    ptr_active_outputs_ = nullptr;
    num_active_outputs_ = 0;
    num_left_context = 0;
    num_right_context = 0;
    do_rotate_input = false;
    softmax_type = kSoftmaxNone;
    ptr_sumgroup_sizes = nullptr;
    num_sumgroup_sizes = 0;
    ptr_priors = nullptr;


    // NOTE(review): the component vector is intentionally left untouched
    // here — the clear() below was already commented out upstream.
    //  component.clear();
}
92
93 void AmIntelDnn::InitActiveList(uint32_t *ptr_active_list) {
94     ptr_active_outputs_ = ptr_active_list;
95     if (ptr_active_list == nullptr) {
96         if (component[component.size() - 1].orientation_out == kDnnInterleavedOrientation) {
97             num_active_outputs_ = component[component.size() - 1].num_rows_out;
98         } else {
99             num_active_outputs_ = component[component.size() - 1].num_columns_out;
100         }
101     } else {
102         num_active_outputs_ = 0;
103     }
104 }
105
106 void AmIntelDnn::AddComponents(uint32_t num_components_to_add) {
107     component.resize(component.size() + num_components_to_add);
108     for (uint32_t i = 0; i < num_components_to_add; i++) {
109         ClearComponent(component.size() - i - 1);
110     }
111 }
112
113 void AmIntelDnn::ClearComponent(uint32_t component_index) {
114     if (component_index > component.size() - 1) {
115         fprintf(stderr, "Error:  attempt to clear non-existent component!\n");
116         throw -1;
117     }
118     component[component_index].num_rows_in = 0;
119     component[component_index].num_columns_in = 0;
120     component[component_index].num_rows_out = 0;
121     component[component_index].num_columns_out = 0;
122     component[component_index].num_bytes_per_input = 0;
123     component[component_index].num_bytes_per_output = 0;
124     component[component_index].operation = kDnnNullOp;
125     component[component_index].macro_operation = kDnnMacroOpNone;
126     component[component_index].orientation_in = kDnnUnknownOrientation;
127     component[component_index].orientation_out = kDnnUnknownOrientation;
128     component[component_index].ptr_inputs = nullptr;
129     component[component_index].ptr_outputs = nullptr;
130     memset(&component[component_index].op, 0, sizeof(component[component_index].op));
131 }
132
133 void AmIntelDnn::ClearState() {
134     // To support recurrent networks, provide mechanism to clear persistent state
135     // (e.g., between utterances for speech recognition).  For recurrent component,
136     // this means clearing the feedback buffer.  For other components, just clear the
137     // output buffer since any feedback will come from some component's output.
138     for (uint32_t i = 0; i < component.size(); i++) {
139         if (component[i].operation == kDnnRecurrentOp) {
140             memset(component[i].op.recurrent.ptr_feedbacks,
141                    0,
142                    component[i].op.recurrent.num_vector_delay * component[i].num_columns_out
143                        * component[i].num_bytes_per_input);
144         } else {
145             memset(component[i].ptr_outputs,
146                    0,
147                    component[i].num_bytes_per_output * component[i].num_rows_out * component[i].num_columns_out);
148         }
149     }
150 }
151
152 void AmIntelDnn::InitAffineComponentPrivate(intel_dnn_component_t &comp,
153                                             uint32_t num_rows_in,
154                                             uint32_t num_columns,
155                                             uint32_t num_rows_out,
156                                             uint32_t num_bytes_per_input,
157                                             uint32_t num_bytes_per_output,
158                                             uint32_t num_bytes_per_weight,
159                                             uint32_t num_bytes_per_bias,
160                                             float weight_scale_factor,
161                                             float output_scale_factor,
162                                             void *&ptr_inputs,
163                                             void *&ptr_outputs,
164                                             void *&ptr_weights,
165                                             void *&ptr_biases,
166                                             bool isDiag,
167                                             bool postInitMem) {
168     comp.num_rows_in = num_rows_in;
169     comp.num_columns_in = num_columns;
170     comp.num_rows_out = num_rows_out;
171     comp.num_columns_out = num_columns;
172     comp.num_bytes_per_input = num_bytes_per_input;
173     comp.num_bytes_per_output = num_bytes_per_output;
174     comp.operation = isDiag ? kDnnDiagonalOp : kDnnAffineOp;
175     comp.macro_operation = kDnnMacroOpNone;
176     comp.orientation_in = kDnnInterleavedOrientation;
177     comp.orientation_out = kDnnInterleavedOrientation;
178     comp.op.affine.num_bytes_per_weight = num_bytes_per_weight;
179     comp.op.affine.num_bytes_per_bias = num_bytes_per_bias;
180     comp.op.affine.weight_scale_factor = weight_scale_factor;
181     comp.output_scale_factor = output_scale_factor;
182     if (!postInitMem) {
183         comp.op.affine.ptr_weights = ptr_weights;
184         comp.op.affine.ptr_biases = ptr_biases;
185         comp.ptr_inputs = ptr_inputs;
186         comp.ptr_outputs = ptr_outputs;
187     } else {
188         ptr_weights = &comp.op.affine.ptr_weights;
189         ptr_biases = &comp.op.affine.ptr_biases;
190         ptr_inputs = &comp.ptr_inputs;
191         ptr_outputs = &comp.ptr_outputs;
192     }
193 }
194
195 void AmIntelDnn::InitDiagonalComponent(uint32_t component_index,
196                                        uint32_t num_rows_in,
197                                        uint32_t num_columns,
198                                        uint32_t num_rows_out,
199                                        uint32_t num_bytes_per_input,
200                                        uint32_t num_bytes_per_output,
201                                        uint32_t num_bytes_per_weight,
202                                        uint32_t num_bytes_per_bias,
203                                        float weight_scale_factor,
204                                        float output_scale_factor,
205                                        void *ptr_inputs,
206                                        void *ptr_outputs,
207                                        void *ptr_weights,
208                                        void *ptr_biases) {
209     component[component_index].num_rows_in = num_rows_in;
210     component[component_index].num_columns_in = num_columns;
211     component[component_index].num_rows_out = num_rows_out;
212     component[component_index].num_columns_out = num_columns;
213     component[component_index].num_bytes_per_input = num_bytes_per_input;
214     component[component_index].num_bytes_per_output = num_bytes_per_output;
215     component[component_index].operation = kDnnDiagonalOp;
216     component[component_index].macro_operation = kDnnMacroOpNone;
217     component[component_index].orientation_in = kDnnInterleavedOrientation;
218     component[component_index].orientation_out = kDnnInterleavedOrientation;
219     component[component_index].ptr_inputs = ptr_inputs;
220     component[component_index].ptr_outputs = ptr_outputs;
221     component[component_index].op.affine.num_bytes_per_weight = num_bytes_per_weight;
222     component[component_index].op.affine.num_bytes_per_bias = num_bytes_per_bias;
223     component[component_index].op.affine.weight_scale_factor = weight_scale_factor;
224     component[component_index].output_scale_factor = output_scale_factor;
225     component[component_index].op.affine.ptr_weights = ptr_weights;
226     component[component_index].op.affine.ptr_biases = ptr_biases;
227 }
228
229 void AmIntelDnn::InitConvolutional1DComponentPrivate(intel_dnn_component_t &comp,
230                                               uint32_t num_rows_in,
231                                               uint32_t num_columns_in,
232                                               uint32_t num_rows_out,
233                                               uint32_t num_columns_out,
234                                               uint32_t num_bytes_per_input,
235                                               uint32_t num_bytes_per_output,
236                                               uint32_t num_bytes_per_weight,
237                                               uint32_t num_bytes_per_bias,
238                                               uint32_t num_filters,
239                                               uint32_t num_filter_rows,
240                                               uint32_t num_filter_coefficients,
241                                               uint32_t num_feature_maps,
242                                               uint32_t num_feature_map_rows,
243                                               uint32_t num_feature_map_columns,
244                                               float weight_scale_factor,
245                                               float output_scale_factor,
246                                               void *&ptr_inputs,
247                                               void *&ptr_outputs,
248                                               void *&ptr_filters,
249                                               void *&ptr_biases,
250                                               bool postInitMem) {
251     comp.num_rows_in = num_rows_in;
252     comp.num_columns_in = num_columns_in;
253     comp.num_rows_out = num_rows_out;
254     comp.num_columns_out = num_columns_out;
255     comp.num_bytes_per_input = num_bytes_per_input;
256     comp.num_bytes_per_output = num_bytes_per_output;
257     comp.operation = kDnnConvolutional1dOp;
258     comp.macro_operation = kDnnMacroOpNone;
259     comp.orientation_in = kDnnNonInterleavedOrientation;
260     comp.orientation_out = kDnnNonInterleavedOrientation;
261     comp.ptr_inputs = ptr_inputs;
262     comp.ptr_outputs = ptr_outputs;
263     comp.op.conv1D.num_bytes_per_weight = num_bytes_per_weight;
264     comp.op.conv1D.num_bytes_per_bias = num_bytes_per_bias;
265     comp.op.conv1D.num_filters = num_filters;
266     comp.op.conv1D.num_filter_rows = num_filter_rows;
267     comp.op.conv1D.num_filter_coefficients = num_filter_coefficients;
268     comp.op.conv1D.num_feature_maps = num_feature_maps;
269     comp.op.conv1D.num_feature_map_rows = num_feature_map_rows;
270     comp.op.conv1D.num_feature_map_columns = num_feature_map_columns;
271     comp.op.conv1D.weight_scale_factor = weight_scale_factor;
272     comp.output_scale_factor = output_scale_factor;
273
274     if (!postInitMem) {
275         comp.op.conv1D.ptr_filters = ptr_filters;
276         comp.op.conv1D.ptr_biases  = ptr_biases;
277         comp.ptr_inputs = ptr_inputs;
278         comp.ptr_outputs = ptr_outputs;
279     } else {
280         ptr_filters = &comp.op.conv1D.ptr_filters;
281         ptr_biases  = &comp.op.conv1D.ptr_biases;
282         ptr_inputs  = &comp.ptr_inputs;
283         ptr_outputs = &comp.ptr_outputs;
284     }
285 }
286
287 void AmIntelDnn::InitMaxpoolComponentPrivate(intel_dnn_component_t &comp,
288                                       uint32_t num_rows_in,
289                                       uint32_t num_columns_in,
290                                       uint32_t num_rows_out,
291                                       uint32_t num_columns_out,
292                                       uint32_t num_bytes_per_input,
293                                       uint32_t num_bytes_per_output,
294                                       uint32_t num_pool_size,
295                                       uint32_t num_pool_step,
296                                       uint32_t num_pool_stride,
297                                       bool do_sum_not_max,
298                                       float output_scale_factor,
299                                       void *&ptr_inputs,
300                                       void *&ptr_outputs,
301                                       bool postInitMem) {
302     comp.num_rows_in = num_rows_in;
303     comp.num_columns_in = num_columns_in;
304     comp.num_rows_out = num_rows_out;
305     comp.num_columns_out = num_columns_out;
306     comp.num_bytes_per_input = num_bytes_per_input;
307     comp.num_bytes_per_output = num_bytes_per_output;
308     comp.operation = kDnnMaxPoolOp;
309     comp.macro_operation = kDnnMacroOpNone;
310     comp.orientation_in = kDnnNonInterleavedOrientation;
311     comp.orientation_out = kDnnNonInterleavedOrientation;
312     comp.op.maxpool.num_inputs = num_pool_size;
313     comp.op.maxpool.num_inputs_step = num_pool_step;
314     comp.op.maxpool.num_inputs_stride = num_pool_stride;
315     comp.op.maxpool.do_sum_not_max = do_sum_not_max;
316     comp.output_scale_factor = output_scale_factor;
317
318     if (!postInitMem) {
319         comp.ptr_inputs = ptr_inputs;
320         comp.ptr_outputs = ptr_outputs;
321     } else {
322         ptr_inputs  = &comp.ptr_inputs;
323         ptr_outputs = &comp.ptr_outputs;
324     }
325 }
326
327 void AmIntelDnn::InitCopyComponentPrivate(intel_dnn_component_t &comp,
328                                           intel_dnn_orientation_t orientation,
329                                           uint32_t num_rows_in,
330                                           uint32_t num_columns_in,
331                                           uint32_t num_rows_out,
332                                           uint32_t num_columns_out,
333                                           uint32_t num_bytes_per_input,
334                                           uint32_t num_bytes_per_output,
335                                           float output_scale_factor,
336                                           uint32_t num_copy_rows,
337                                           uint32_t num_copy_columns,
338                                           void *&ptr_inputs,
339                                           void *&ptr_outputs,
340                                           bool postInitMem) {
341     comp.num_rows_in = num_rows_in;
342     comp.num_columns_in = num_columns_in;
343     comp.num_rows_out = num_rows_out;
344     comp.num_columns_out = num_columns_out;
345     comp.num_bytes_per_input = num_bytes_per_input;
346     comp.num_bytes_per_output = num_bytes_per_output;
347     comp.operation = kDnnCopyOp;
348     comp.macro_operation = kDnnMacroOpNone;
349     comp.orientation_in = orientation;
350     comp.orientation_out = orientation;
351     comp.ptr_inputs = ptr_inputs;
352     comp.ptr_outputs = ptr_outputs;
353     comp.output_scale_factor = output_scale_factor;
354     comp.op.copy.num_copy_rows = num_copy_rows;
355     comp.op.copy.num_copy_columns = num_copy_columns;
356
357     if (!postInitMem) {
358         comp.ptr_inputs = ptr_inputs;
359         comp.ptr_outputs = ptr_outputs;
360     } else {
361         ptr_inputs  = &comp.ptr_inputs;
362         ptr_outputs = &comp.ptr_outputs;
363     }
364 }
365
/**
 * Fills in a component descriptor for a piecewise-linear (PWL) activation.
 * Input and output share the same num_rows x num_columns shape and the
 * caller-supplied orientation; the activation curve is described by
 * num_segments PWL segments.
 * When postInitMem is true the input/output memory (and optionally the
 * segment table) is not yet allocated: the caller's pointer variables are
 * redirected to the component's own fields so they can be patched later.
 */
void AmIntelDnn::InitPiecewiseLinearComponentPrivate(intel_dnn_component_t &comp,
                                                     DnnActivation function_id,
                                                     intel_dnn_orientation_t orientation,
                                                     uint32_t num_rows,
                                                     uint32_t num_columns,
                                                     uint32_t num_bytes_per_input,
                                                     uint32_t num_bytes_per_output,
                                                     uint32_t num_segments,
                                                     float output_scale_factor,
                                                     void *&ptr_inputs,
                                                     void *&ptr_outputs,
                                                     intel_pwl_segment_t *ptr_segments,
                                                     bool postInitMem) {
    comp.num_rows_in = num_rows;
    comp.num_columns_in = num_columns;
    comp.num_rows_out = num_rows;
    comp.num_columns_out = num_columns;
    comp.num_bytes_per_input = num_bytes_per_input;
    comp.num_bytes_per_output = num_bytes_per_output;
    comp.operation = kDnnPiecewiselinearOp;
    comp.macro_operation = kDnnMacroOpNone;
    comp.orientation_in = orientation;
    comp.orientation_out = orientation;
    comp.op.pwl.func_id = function_id;
    comp.op.pwl.num_segments = num_segments;
    comp.output_scale_factor = output_scale_factor;

    if (!postInitMem) {
        comp.ptr_inputs = ptr_inputs;
        comp.ptr_outputs = ptr_outputs;
        comp.op.pwl.ptr_segments = ptr_segments;
    } else {
        ptr_inputs = &comp.ptr_inputs;
        ptr_outputs = &comp.ptr_outputs;
        if (ptr_segments != nullptr) {
            // NOTE(review): ptr_segments acts as an out-parameter here —
            // the caller apparently passes the address of its own
            // intel_pwl_segment_t* cast to intel_pwl_segment_t*, and the
            // address of comp.op.pwl.ptr_segments is written through it.
            // Confirm against callers before touching these casts.
            *reinterpret_cast<intel_pwl_segment_t **>(ptr_segments) =
                reinterpret_cast<intel_pwl_segment_t *>(& comp.op.pwl.ptr_segments);
        }
    }
}
406
407 void AmIntelDnn::InitRecurrentComponent(uint32_t component_index,
408                                         uint32_t num_rows,
409                                         uint32_t num_columns_in,
410                                         uint32_t num_columns_out,
411                                         uint32_t num_bytes_per_input,
412                                         uint32_t num_bytes_per_output,
413                                         uint32_t num_vector_delay,
414                                         uint32_t num_bytes_per_weight,
415                                         uint32_t num_bytes_per_bias,
416                                         float weight_scale_factor,
417                                         float output_scale_factor,
418                                         void *ptr_inputs,
419                                         void *ptr_feedbacks,
420                                         void *ptr_outputs,
421                                         void *ptr_weights,
422                                         void *ptr_biases) {
423     component[component_index].num_rows_in = num_rows;
424     component[component_index].num_columns_in = num_columns_in;
425     component[component_index].num_rows_out = num_rows;
426     component[component_index].num_columns_out = num_columns_out;
427     component[component_index].num_bytes_per_input = num_bytes_per_input;
428     component[component_index].num_bytes_per_output = num_bytes_per_output;
429     component[component_index].operation = kDnnRecurrentOp;
430     component[component_index].macro_operation = kDnnMacroOpNone;
431     component[component_index].orientation_in = kDnnNonInterleavedOrientation;
432     component[component_index].orientation_out = kDnnNonInterleavedOrientation;
433     component[component_index].ptr_inputs = ptr_inputs;
434     component[component_index].ptr_outputs = ptr_outputs;
435     component[component_index].op.recurrent.num_vector_delay = num_vector_delay;
436     component[component_index].op.recurrent.num_bytes_per_weight = num_bytes_per_weight;
437     component[component_index].op.recurrent.num_bytes_per_bias = num_bytes_per_bias;
438     component[component_index].op.recurrent.weight_scale_factor = weight_scale_factor;
439     component[component_index].output_scale_factor = output_scale_factor;
440     component[component_index].op.recurrent.ptr_feedbacks = ptr_feedbacks;
441     component[component_index].op.recurrent.ptr_weights = ptr_weights;
442     component[component_index].op.recurrent.ptr_biases = ptr_biases;
443 }
444
445 void AmIntelDnn::InitInterleaveComponent(uint32_t component_index, uint32_t num_rows, uint32_t num_columns,
446                                          uint32_t num_bytes_per_input, uint32_t num_bytes_per_output,
447                                          float output_scale_factor, void *ptr_inputs, void *ptr_outputs) {
448     component[component_index].num_rows_in = num_rows;
449     component[component_index].num_columns_in = num_columns;
450     component[component_index].num_rows_out = num_columns;
451     component[component_index].num_columns_out = num_rows;
452     component[component_index].num_bytes_per_input = num_bytes_per_input;
453     component[component_index].num_bytes_per_output = num_bytes_per_output;
454     component[component_index].operation = kDnnInterleaveOp;
455     component[component_index].macro_operation = kDnnMacroOpNone;
456     component[component_index].orientation_in = kDnnNonInterleavedOrientation;
457     component[component_index].orientation_out = kDnnInterleavedOrientation;
458     component[component_index].ptr_inputs = ptr_inputs;
459     component[component_index].ptr_outputs = ptr_outputs;
460     component[component_index].output_scale_factor = output_scale_factor;
461 }
462
463 void AmIntelDnn::InitDeinterleaveComponent(uint32_t component_index, uint32_t num_rows, uint32_t num_columns,
464                                            uint32_t num_bytes_per_input, uint32_t num_bytes_per_output,
465                                            float output_scale_factor, void *ptr_inputs, void *ptr_outputs) {
466     component[component_index].num_rows_in = num_rows;
467     component[component_index].num_columns_in = num_columns;
468     component[component_index].num_rows_out = num_columns;
469     component[component_index].num_columns_out = num_rows;
470     component[component_index].num_bytes_per_input = num_bytes_per_input;
471     component[component_index].num_bytes_per_output = num_bytes_per_output;
472     component[component_index].operation = kDnnDeinterleaveOp;
473     component[component_index].macro_operation = kDnnMacroOpNone;
474     component[component_index].orientation_in = kDnnInterleavedOrientation;
475     component[component_index].orientation_out = kDnnNonInterleavedOrientation;
476     component[component_index].ptr_inputs = ptr_inputs;
477     component[component_index].ptr_outputs = ptr_outputs;
478     component[component_index].output_scale_factor = output_scale_factor;
479 }
480
/**
 * Reference (CPU) implementation of the affine layer: each (active) output
 * row of C is first set to its bias value, then weights x inputs is
 * accumulated on top via GEMM (C = A*B + bias).
 * When `list` is non-null, only the `listsize` output rows named in `list`
 * are computed and the results are packed into the first `listsize` rows
 * of C (active-list mode).
 * Dispatches on input precision: 2-byte integer paths (INTEGER_REF builds
 * only) and the 4-byte float path via cblas_sgemm1 / cblas_sgemm_subset.
 */
__inline void ApplyAffineTransform(intel_dnn_component_t *component, uint32_t *list, uint32_t listsize) {
    auto transform = &component->op.affine;
    // GEMM dimensions: C(m x n) = A(m x k) * B(k x n); leading dimensions
    // come straight from the component geometry.
    int m = component->num_rows_out;
    int n = component->num_columns_in;
    int k = component->num_rows_in;
    int lda = component->num_rows_in;
    int ldb = component->num_columns_in;
    int ldc = component->num_columns_out;

    switch (component->num_bytes_per_input) {
#ifdef INTEGER_REF
        case 2:
            // int16 inputs with either int8 (compound-bias) or int16 weights.
            if (component->op.affine.num_bytes_per_weight == 1) {
                int8_t *A = reinterpret_cast<int8_t*>(transform->ptr_weights);
                int16_t *B = reinterpret_cast<int16_t*>(component->ptr_inputs);
                int32_t *C = reinterpret_cast<int32_t*>(component->ptr_outputs);
                intel_compound_bias_t *bias = reinterpret_cast<intel_compound_bias_t*>(transform->ptr_biases);
                if (list == nullptr) {
                    //  PrintMatrixInt8("W int8", W, k, m, ldw, component->op.affine.weight_scale_factor);
                    //  PrintMatrixInt16("X int16", X, k, n, ldx, component->op.affine.weight_scale_factor);
                    //  PrintMatrixInt32("Y int32", Y, m, n, ldy, component->output_scale_factor);
                    igemm8_gna(m, n, k, A, lda, B, ldb, bias, C, ldc);
                } else {
                    //  PrintMatrixInt8("W int8", W, k, m, ldw, component->op.affine.weight_scale_factor);
                    //  PrintMatrixInt16("X int16", X, k, n, ldx, component->op.affine.weight_scale_factor);
                    //  PrintMatrixInt32("Y int32", Y, m, n, ldy, component->output_scale_factor);
                    igemm8_gna_subset(m, n, k, A, lda, B, ldb, bias, C, ldc, list, listsize);
                }
                //  PrintMatrixInt32("C int32", C, m, n, ldc, component->output_scale_factor);
            } else if (component->op.affine.num_bytes_per_weight == 2) {
                int16_t *A = reinterpret_cast<int16_t*>(transform->ptr_weights);
                int16_t *B = reinterpret_cast<int16_t*>(component->ptr_inputs);
                int32_t *C = reinterpret_cast<int32_t*>(component->ptr_outputs);
                int32_t *bias = reinterpret_cast<int32_t*>(transform->ptr_biases);
                if (list == nullptr) {
                    // Seed every output row with its bias before the GEMM
                    // accumulation (beta == 1.0 below).
                    for (uint32_t i = 0; i < m; i++) {
                        for (uint32_t j = 0; j < n; j++) {
                            C[i*ldc+j] = bias[i];
                        }
                    }
                    //  PrintMatrixInt16("A int16", A, k, m, lda, component->op.affine.weight_scale_factor);
                    //  PrintMatrixInt16("trans(B) int16", B, k, n, ldb, component->op.affine.weight_scale_factor);
                    //  PrintMatrixInt32("C int32", C, m, n, ldc, component->output_scale_factor);
                    cblas_igemm16(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, A, lda, B, ldb, 1.0, C, ldc);
                } else {
                    // Active-list mode: row l of C corresponds to output
                    // index list[l]; seed it with that row's bias.
                    for (int l = 0; l < listsize; l++) {
                        int i = list[l];
                        for (uint32_t j = 0; j < n; j++) {
                            C[l*ldc+j] = bias[i];
                        }
                    }
                    //  PrintMatrixInt16("A int16", A, k, m, lda, component->op.affine.scale_factor);
                    //  PrintMatrixInt16("trans(B) int16", B, k, n, ldb, component->op.affine.scale_factor);
                    //  PrintMatrixInt32("C int32", C, m, n, ldc, component->op.affine.scale_factor * component->op.affine.scale_factor);
                    cblas_igemm16_subset(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, A, lda, B, ldb, 1.0, C, ldc, list, listsize);
                }
                //  PrintMatrixInt32("C int32", C, m, n, ldc, component->output_scale_factor);
            } else {
                fprintf(stderr, "Bad weight width in ApplyAffineTransform!\n");
                throw -1;
            }
            break;
#endif  // #ifdef INTEGER_REF
        case 4: {
            // float32 path.
            auto A = reinterpret_cast<float *>(transform->ptr_weights);
            auto B = reinterpret_cast<float *>(component->ptr_inputs);
            auto C = reinterpret_cast<float *>(component->ptr_outputs);
            auto bias = reinterpret_cast<float *>(transform->ptr_biases);
            if (list == nullptr) {
                // Seed every output row with its bias before the GEMM
                // accumulation (beta == 1.0 below).
                // NOTE(review): loop counters are uint32_t while m/n are
                // int — relies on implicit conversion; fine for the
                // non-negative dimensions used here.
                for (uint32_t i = 0; i < m; i++) {
                    for (uint32_t j = 0; j < n; j++) {
                        C[i * ldc + j] = bias[i];
                    }
                }
                //  if (global_debug) PrintMatrixFloat32("A float", A, m, k, lda);
                //  if (global_debug) PrintMatrixFloat32("B float", B, k, n, ldb);
                //  if (global_debug) PrintMatrixFloat32("C float before", C, m, n, ldc);
                cblas_sgemm1(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, A, lda, B, ldb, 1.0, C, ldc);
                //  if (global_debug) PrintMatrixFloat32("C float after", C, m, n, ldc);
            } else {
                // Active-list mode: row l of C corresponds to output index
                // list[l]; seed it with that row's bias.
                for (int l = 0; l < listsize; l++) {
                    int i = list[l];
                    for (uint32_t j = 0; j < n; j++) {
                        C[l * ldc + j] = bias[i];
                    }
                }
                //  PrintMatrixFloat32("A float", A, k, m, lda);
                //  PrintMatrixFloat32("trans(B) float", B, k, n, ldb);
                //  PrintMatrixFloat32("C float before", C, listsize, n, ldc);
                cblas_sgemm_subset(CblasRowMajor,
                                   CblasNoTrans,
                                   CblasNoTrans,
                                   m,
                                   n,
                                   k,
                                   1.0,
                                   A,
                                   lda,
                                   B,
                                   ldb,
                                   1.0,
                                   C,
                                   ldc,
                                   list,
                                   listsize);
                //  PrintMatrixFloat32("C float after", C, listsize, n, ldc);
            }
        }
            break;
        default:fprintf(stderr, "Bad data width in ApplyAffineTransform!\n");
            throw -1;
    }
}
594
// Applies y = diag(w) * x + b, i.e. an affine transform whose weight matrix
// is diagonal (one weight per output row), column-major over frames:
// each of the n input columns is scaled element-wise and biased.
// Throws -1 on an unsupported data or weight width.
__inline void ApplyDiagonalTransform(intel_dnn_component_t *component) {
    auto transform = &component->op.affine;
    int m = component->num_rows_out;       // diagonal length / rows per column
    int n = component->num_columns_in;     // number of input columns (frames)
    int ldb = component->num_columns_in;   // input row stride
    int ldc = component->num_columns_out;  // output row stride

    switch (component->num_bytes_per_input) {
#ifdef INTEGER_REF
        // NOTE(review): this branch references `lda`, which is not declared in
        // this function - verify it still compiles with INTEGER_REF defined.
        case 2:
            if (component->op.affine.num_bytes_per_weight == 1) {
                // 8-bit weights carry a compound bias (bias + per-row multiplier)
                int8_t *A = reinterpret_cast<int8_t*>(transform->ptr_weights);
                int16_t *B = reinterpret_cast<int16_t*>(component->ptr_inputs);
                int32_t *C = reinterpret_cast<int32_t*>(component->ptr_outputs);
                intel_compound_bias_t *bias = reinterpret_cast<intel_compound_bias_t*>(transform->ptr_biases);
                //  PrintMatrixInt8("W int8", W, k, m, ldw, component->op.affine.weight_scale_factor);
                //  PrintMatrixInt16("X int16", X, k, n, ldx, component->op.affine.weight_scale_factor);
                //  PrintMatrixInt32("Y int32", Y, m, n, ldy, component->output_scale_factor);
                isbmm8_gna(m, n, A, lda, B, ldb, bias, C, ldc);
                //  PrintMatrixInt32("C int32", C, m, n, ldc, component->output_scale_factor);
            } else if (component->op.affine.num_bytes_per_weight == 2) {
                int16_t *A = reinterpret_cast<int16_t*>(transform->ptr_weights);
                int16_t *B = reinterpret_cast<int16_t*>(component->ptr_inputs);
                int32_t *C = reinterpret_cast<int32_t*>(component->ptr_outputs);
                int32_t *bias = reinterpret_cast<int32_t*>(transform->ptr_biases);
                // pre-fill every output column with the bias; the multiply
                // below then accumulates on top of it
                for (uint32_t i = 0; i < m; i++) {
                    for (uint32_t j = 0; j < n; j++) {
                        C[i*ldc+j] = bias[i];
                    }
                }
                //  PrintMatrixInt16("A int16", A, 1, m, lda, component->op.affine.weight_scale_factor);
                //  PrintMatrixInt16("trans(B) int16", B, k, n, ldb, component->op.affine.weight_scale_factor);
                //  PrintMatrixInt32("C int32", C, m, n, ldc, component->output_scale_factor);
                cblas_isbmm16(m, n, A, lda, B, ldb, C, ldc);
                //  PrintMatrixInt32("C int32", C, m, n, ldc, component->output_scale_factor);
            } else {
                fprintf(stderr, "Bad weight width in ApplyDiagonalTransform!\n");
                throw -1;
            }
            break;
#endif  // #ifdef INTEGER_REF
        case 4: {  // 32-bit float path
            auto A = reinterpret_cast<float *>(transform->ptr_weights);
            auto B = reinterpret_cast<float *>(component->ptr_inputs);
            auto C = reinterpret_cast<float *>(component->ptr_outputs);
            auto bias = reinterpret_cast<float *>(transform->ptr_biases);
            // seed outputs with the bias so the banded multiply (beta = 1.0)
            // accumulates onto it
            for (uint32_t i = 0; i < m; i++) {
                for (uint32_t j = 0; j < n; j++) {
                    C[i * ldc + j] = bias[i];
                }
            }
            //  PrintMatrixFloat32("A float", A, 1, m, lda);
            //  PrintMatrixFloat32("B float", B, k, n, ldb);
            //  PrintMatrixFloat32("C float before", C, m, n, ldc);
            // per column: C_col += diag(A) * B_col via a bandwidth-0 (diagonal)
            // symmetric banded matrix-vector multiply
            for (uint32_t j = 0; j < n; j++) {
                float *Bcol = B + j * ldb;
                float *Ccol = C + j * ldc;
                cblas_ssbmv1(CblasRowMajor, CblasLower, m, 0, 1.0, A, 1, Bcol, 1, 1.0, Ccol, 1);
            }
            //  PrintMatrixFloat32("C float after", C, m, n, ldc);
        }
            break;
        default:fprintf(stderr, "Bad data width in ApplyDiagonalTransform!\n");
            throw -1;
    }
}
661
// Applies one step of a recurrent affine transform for a single input row:
// output = W * [input_row ; feedback] + bias, where the feedback vector
// (length k2) is supplied by the caller via ptr_feedbacks.
// Throws -1 if the component's feedback pointer is unset or the data/weight
// width is unsupported.
__inline void ApplyRecurrentTransform(intel_dnn_component_t *component, uint32_t row, void *ptr_feedbacks) {
    intel_recurrent_t *transform = &component->op.recurrent;
    int k1 = component->num_columns_in;   // length of the current input row
    int k2 = component->num_columns_out;  // length of the feedback (previous output)
    int n = k2;                           // output length equals feedback length

    if (component->op.recurrent.ptr_feedbacks == nullptr) {
        fprintf(stderr, "nullptr feedback pointer in ApplyRecurrentTransform()!\n");
        throw -1;
    }

    switch (component->num_bytes_per_input) {
#ifdef INTEGER_REF
        case 2:
            if (component->op.recurrent.num_bytes_per_weight == 1) {
                // 8-bit weights use a compound bias (bias + per-row multiplier)
                int16_t *A1 = reinterpret_cast<int16_t*>(component->ptr_inputs) + row * component->num_columns_in;
                int16_t *A2 = reinterpret_cast<int16_t*>(ptr_feedbacks);
                int8_t *X = reinterpret_cast<int8_t*>(transform->ptr_weights);
                intel_compound_bias_t *B = reinterpret_cast<intel_compound_bias_t*>(transform->ptr_biases);
                int32_t *C = reinterpret_cast<int32_t*>(component->ptr_outputs) + row * component->num_columns_out;
                //  PrintMatrixInt16("A1 int", A1, 1, k1, k1, component->op.recurrent.weight_scale_factor);
                //  PrintMatrixInt16("A2 int", A2, 1, k2, k2);
                //  PrintMatrixInt8("X int", X, k, n, n, component->op.recurrent.weight_scale_factor);
                //  PrintMatrixInt32("B int", B, 1, 2*n, 2*n, component->output_scale_factor);
                igemv8_gna_split(n, k1, k2, A1, A2, X, B, C);
                //  PrintMatrixInt32("C int", C, 1, n, n, component->output_scale_factor);
            } else if (component->op.recurrent.num_bytes_per_weight == 2) {
                int16_t *A1 = reinterpret_cast<int16_t*>(component->ptr_inputs) + row * component->num_columns_in;
                int16_t *A2 = reinterpret_cast<int16_t*>(ptr_feedbacks);
                int16_t *X = reinterpret_cast<int16_t*>(transform->ptr_weights);
                int32_t *B = reinterpret_cast<int32_t*>(transform->ptr_biases);
                int32_t *C = reinterpret_cast<int32_t*>(component->ptr_outputs) + row * component->num_columns_out;
                //  PrintMatrixInt16("A1 int", A1, 1, k1, k1, component->op.recurrent.weight_scale_factor);
                //  PrintMatrixInt16("A2 int", A2, 1, k2, k2, component->op.recurrent.weight_scale_factor);
                //  PrintMatrixInt16("X int", X, k, n, n, component->op.recurrent.weight_scale_factor);
                //  PrintMatrixInt32("B int", B, 1, n, n, component->output_scale_factor);
                igemv16_split(n, k1, k2, A1, A2, X, B, C);
                //  PrintMatrixInt32("C int", C, 1, n, n, component->output_scale_factor);
            } else {
                fprintf(stderr, "Weight width not supported in ApplyRecurrentTransform!\n");
                throw -1;
            }
            break;
#endif  // #ifdef INTEGER_REF
        case 4: {  // 32-bit float path: A1 = current row, A2 = feedback
            auto A1 = reinterpret_cast<float *>(component->ptr_inputs) + row * component->num_columns_in;
            auto A2 = reinterpret_cast<float *>(ptr_feedbacks);
            auto X = reinterpret_cast<float *>(transform->ptr_weights);
            auto B = reinterpret_cast<float *>(transform->ptr_biases);
            auto C = reinterpret_cast<float *>(component->ptr_outputs) + row * component->num_columns_out;
            //  PrintMatrixFloat32("A1 float", A1, 1, k1, k1);
            //  PrintMatrixFloat32("A2 float", A2, 1, k2, k2);
            //  PrintMatrixFloat32("X float", X, k, n, n);
            //  PrintMatrixFloat32("B float", B, 1, n, n);
            sgemv_split(n, k1, k2, A1, A2, X, B, C);
            //  PrintMatrixFloat32("C float", C, 1, n, n);
        }
            break;
        default:fprintf(stderr, "Bad data width in ApplyRecurrentTransform!\n");
            throw -1;
    }
}
724
725 __inline void ApplyConvolutional1DTransform(intel_dnn_component_t *component) {
726     switch (component->num_bytes_per_input) {
727 #ifdef INTEGER_REF
728         case 2:
729             CNNFilter16(component);
730             break;
731 #endif  // #ifdef INTEGER_REF
732         case 4:
733             //  PrintMatrixFloat32("Input float", reinterpret_cast<float*>(component->ptr_inputs),
734             //  component->num_rows_in, component->num_columns_in, component->num_columns_in);
735             //  PrintMatrixFloat32("Filt float", reinterpret_cast<float*>(component->op.conv1D.ptr_filters),
736             //  component->op.conv1D.num_filters,
737             //  component->op.conv1D.num_filter_rows*component->op.conv1D.num_feature_map_columns*component->op.conv1D.num_feature_maps,
738             //  component->op.conv1D.num_filter_rows*component->op.conv1D.num_feature_map_columns*component->op.conv1D.num_feature_maps);
739             //  PrintMatrixFloat32("Bias float", reinterpret_cast<float*>(component->op.conv1D.ptr_biases), 1,
740             // component->op.conv1D.num_filters, component->op.conv1D.num_filters);
741             CNNFilter32(component);
742             //  PrintMatrixFloat32("Output float", reinterpret_cast<float*>(component->ptr_outputs, component->num_rows_out,
743             // component->num_columns_out, component->num_columns_out);
744             break;
745         default:fprintf(stderr, "Bad data width in ApplyConvolutionalTransform!\n");
746             throw -1;
747     }
748 }
749
// Applies the component's piecewise-linear activation to the first
// `listsize` outputs.  Dispatches on the network number type: float uses
// PwlApply32, 16-bit integer output uses PwlApply16 (INTEGER_REF builds
// only).  Throws -1 for unsupported widths.
// NOTE(review): the braces of the if/else chain are deliberately interleaved
// with the #ifdef so both preprocessor variants parse - edit with care.
__inline void ApplyPiecewiseLinearTransform(intel_dnn_component_t *component,
                                            intel_dnn_number_type_t number_type,
                                            uint32_t listsize) {
    if (number_type == kDnnFloat) {
        // PrintMatrixFloat32("PWL Input float", reinterpret_cast<float*>(component->ptr_inputs), component->num_rows_in,
        // component->num_columns_in, component->num_columns_in);
        PwlApply32(component, listsize);
        // PrintMatrixFloat32("PWL Output float", reinterpret_cast<float*>(component->ptr_outputs), component->num_rows_out,
        // component->num_columns_out, component->num_columns_out);
#ifdef INTEGER_REF
        } else if (component->num_bytes_per_output == 2) {
            PwlApply16(component, listsize);
#endif  // #ifdef INTEGER_REF
    } else {
        fprintf(stderr, "Bad data width in ApplyPiecewiseLinearTransform!\n");
        throw -1;
    }
}
768
// Overload used by the recurrent path: applies the piecewise-linear
// activation to a single output row (`num_row`), over columns
// [0, listsize - 1].  Dispatch mirrors the three-argument overload above.
// NOTE(review): braces are interleaved with the #ifdef on purpose so both
// preprocessor variants parse - edit with care.
__inline void ApplyPiecewiseLinearTransform(intel_dnn_component_t *component,
                                            intel_dnn_number_type_t number_type,
                                            uint32_t listsize,
                                            uint32_t num_row) {
    if (number_type == kDnnFloat) {
        PwlApply32(component, num_row, num_row, 0, listsize - 1);
#ifdef INTEGER_REF
        } else if (component->num_bytes_per_output == 2) {
            PwlApply16(component, num_row, num_row, 0, listsize-1);
#endif  // #ifdef INTEGER_REF
    } else {
        fprintf(stderr, "Bad data width in ApplyPiecewiseLinearTransform!\n");
        throw -1;
    }
}
784
785 __inline void ApplyMaxPoolTransform(intel_dnn_component_t *component, intel_dnn_number_type_t number_type) {
786     if (component->num_bytes_per_input == 4) {
787         // PrintMatrixFloat32("Input float", reinterpret_cast<float*>(component->ptr_inputs), component->num_rows_in,
788         // component->num_columns_in, component->num_columns_in);
789         CNNMaxPool(component, number_type);
790         // PrintMatrixFloat32("Output float", reinterpret_cast<float*>(component->ptr_outputs), component->num_rows_out,
791         // component->num_columns_out, component->num_columns_out);
792     } else {
793         fprintf(stderr, "Bad data width in ApplyMaxPoolTransform!\n");
794         throw -1;
795     }
796 }
797
// Transposes the component's input matrix into its output buffer:
// B = transpose(A), where A is m x n and B is n x m.  Used for both the
// interleave and deinterleave operations.  Throws -1 on an unsupported
// element width.
__inline void ApplyTranspose(intel_dnn_component_t *component) {
    int m = component->num_rows_in;
    int n = component->num_columns_in;
    int lda = component->num_columns_in;   // input row stride
    int ldb = component->num_columns_out;  // output row stride
    // B = Transpose(A) where A is mxn and B is nxm
    switch (component->num_bytes_per_input) {
#ifdef INTEGER_REF
        case 1:
            {
                int8_t *A = reinterpret_cast<int8_t*>(component->ptr_inputs);
                int8_t *B = reinterpret_cast<int8_t*>(component->ptr_outputs);
                for (uint32_t row = 0; row < m; row++) {
                    for (uint32_t col = 0; col < n; col++) {
                        B[col*ldb+row] = A[row*lda+col];
                    }
                }
            }
            break;
        case 2:
            {
                int16_t *A = reinterpret_cast<int16_t*>(component->ptr_inputs);
                int16_t *B = reinterpret_cast<int16_t*>(component->ptr_outputs);
                for (uint32_t row = 0; row < m; row++) {
                    for (uint32_t col = 0; col < n; col++) {
                        B[col*ldb+row] = A[row*lda+col];
                    }
                }
            }
            break;
#endif  // #ifdef INTEGER_REF
        case 4: {  // 32-bit float path
            auto A = reinterpret_cast<float *>(component->ptr_inputs);
            auto B = reinterpret_cast<float *>(component->ptr_outputs);
            for (uint32_t row = 0; row < m; row++) {
                for (uint32_t col = 0; col < n; col++) {
                    B[col * ldb + row] = A[row * lda + col];
                }
            }
        }
            break;
        default:fprintf(stderr, "Bad data width in ApplyInterleave!\n");
            throw -1;
    }
}
843
844 __inline void ApplyCopy(intel_dnn_component_t *component) {
845     auto src = reinterpret_cast<uint8_t *>(component->ptr_inputs);
846     auto dst = reinterpret_cast<uint8_t *>(component->ptr_outputs);
847     int32_t m = component->op.copy.num_copy_rows;
848     int32_t n = component->op.copy.num_copy_columns;
849     int32_t lda = component->num_columns_in;
850     int32_t ldb = component->num_columns_out;
851     if (m > component->num_rows_in) {
852         fprintf(stderr, "Error:  attempt to copy more columns than matrix has!\n");
853         throw -1;
854     } else {
855         switch (component->num_bytes_per_input) {
856 #ifdef INTEGER_REF
857             case 2:
858                 {
859                     int16_t *A = reinterpret_cast<int16_t*>(src);
860                     int16_t *B = reinterpret_cast<int16_t*>(dst);
861                     for (uint32_t row = 0; row < m; row++) {
862                         for (uint32_t col = 0; col < n; col++) {
863                             B[row*ldb + col] = A[row*lda + col];
864                         }
865                     }
866                 }
867                 break;
868 #endif  // #ifdef INTEGER_REF
869             case 4: {
870                 auto A = reinterpret_cast<float *>(src);
871                 auto B = reinterpret_cast<float *>(dst);
872                 for (uint32_t row = 0; row < m; row++) {
873                     for (uint32_t col = 0; col < n; col++) {
874                         B[row * ldb + col] = A[row * lda + col];
875                     }
876                 }
877             }
878                 break;
879             default:fprintf(stderr, "Bad data width in ApplyCopy!\n");
880                 throw -1;
881         }
882     }
883 }
884
885 uint32_t AmIntelDnn::CopyActiveList(std::vector<std::vector<uint32_t> > &active_list, uint32_t list_index) {
886     if (component[component.size() - 1].orientation_out == kDnnInterleavedOrientation) {
887         num_active_outputs_ = component[component.size() - 1].num_rows_out;
888     } else {
889         num_active_outputs_ = component[component.size() - 1].num_columns_out;
890     }
891
892     if (!active_list.empty()) {
893         if (list_index >= active_list.size()) {
894             fprintf(stderr, "Index %d beyond end of active list in CopyActiveList()\n", list_index);
895             throw -1;
896         }
897         if (active_list[list_index].size() > component[component.size() - 1].num_rows_out) {
898             fprintf(stderr, "Active list too large in CopyActiveList()\n");
899             throw -1;
900         }
901
902         if (ptr_active_outputs_ != nullptr) {
903             num_active_outputs_ = active_list[list_index].size();
904             memcpy(ptr_active_outputs_, active_list[list_index].data(), num_active_outputs_ * sizeof(uint32_t));
905         }
906     }
907
908     return (num_active_outputs_);
909 }
910
// Runs a forward pass: executes every component in order, dispatching each
// to its Apply* kernel.  The active-output list is applied only to the last
// component (or the last affine when it is followed by a PWL activation).
// A recurrent component must be immediately followed by a PWL component;
// the pair is executed row-by-row and the loop index skips the PWL.
// Throws -1 on an unknown operation.
void AmIntelDnn::Propagate() {
    for (uint32_t i = 0; i < component.size(); i++) {
        intel_dnn_component_t *comp = &component[i];
        uint32_t *ptr_active_outputs = nullptr;
        // default output count depends on orientation: interleaved outputs
        // are counted by rows, otherwise by columns
        uint32_t num_active_outputs = (comp->orientation_out == kDnnInterleavedOrientation)
                                      ? comp->num_rows_out : comp->num_columns_out;

        if (i == component.size() - 1) {  // active list applies to last component
            ptr_active_outputs = ptr_active_outputs_;
            num_active_outputs = num_active_outputs_;
        } else if (i == component.size() - 2) {  // also applies to last two components when last is PWL
            if ((component[i].operation == kDnnAffineOp) && (component[i + 1].operation == kDnnPiecewiselinearOp)) {
                ptr_active_outputs = ptr_active_outputs_;
                num_active_outputs = num_active_outputs_;
            }
        }

        switch (comp->operation) {
            case kDnnAffineOp :ApplyAffineTransform(comp, ptr_active_outputs, num_active_outputs);
                break;
            case kDnnDiagonalOp:ApplyDiagonalTransform(comp);
                break;
            case kDnnRecurrentOp:
                // recurrent + PWL are fused: each row's recurrent step is
                // immediately activated so the next row can consume the
                // activated feedback
                if ((i < component.size() - 1) && (component[i + 1].operation == kDnnPiecewiselinearOp)) {
                    intel_dnn_component_t *comp_pwl = &component[i + 1];
                    for (uint32_t j = 0; j < comp->num_rows_in; j++) {
                        // feedback for row j lives j rows into the feedback buffer
                        void *ptr_feedbacks =
                            reinterpret_cast<void *>(reinterpret_cast<int32_t *>(comp->op.recurrent.ptr_feedbacks) + j * comp_pwl->num_columns_out);
                        ApplyRecurrentTransform(comp, j, ptr_feedbacks);
                        //  PrintOutputs(i);
                        ApplyPiecewiseLinearTransform(comp_pwl, number_type_, num_active_outputs, j);
                    }
                    i++;  // skip next component
                } else {
                    fprintf(stderr, "Missing PiecewiseLinear component after Recurrent component in Propagate!\n");
                    throw -1;
                }
                break;
            case kDnnConvolutional1dOp:ApplyConvolutional1DTransform(comp);
                break;
            case kDnnPiecewiselinearOp:ApplyPiecewiseLinearTransform(comp, number_type_, num_active_outputs);
                break;
            case kDnnMaxPoolOp:ApplyMaxPoolTransform(comp, number_type_);
                break;
            case kDnnInterleaveOp:ApplyTranspose(comp);
                break;
            case kDnnDeinterleaveOp:ApplyTranspose(comp);
                break;
            case kDnnCopyOp:ApplyCopy(comp);
                break;
            default:fprintf(stderr, "Bad operation in Propagate!\n");
                throw -1;
                break;
        }
        //  PrintOutputs(i); fflush(stdout);
    }
}
968
969 intel_dnn_macro_operation_t AmIntelDnn::MacroOperation(uint32_t component_index) {
970     return (component[component_index].macro_operation);
971 }
972
973 void AmIntelDnn::SetMacroOperation(uint32_t component_index, intel_dnn_macro_operation_t macro_operation) {
974     component[component_index].macro_operation = macro_operation;
975 }
976
977 float AmIntelDnn::InputScaleFactor(uint32_t component_index) {
978     float scale_factor = 1.0;
979
980     if (component_index == 0) {
981         scale_factor = input_scale_factor_;
982     } else {
983         if (component[component_index - 1].operation == kDnnAffineOp) {
984             scale_factor = component[component_index - 1].output_scale_factor;
985         } else if (component[component_index - 1].operation == kDnnDiagonalOp) {
986             scale_factor = component[component_index - 1].output_scale_factor;
987         } else if (component[component_index - 1].operation == kDnnConvolutional1dOp) {
988             scale_factor = component[component_index - 1].output_scale_factor;
989         } else if (component[component_index - 1].operation == kDnnRecurrentOp) {
990             scale_factor = component[component_index - 1].output_scale_factor;
991         } else if (component[component_index - 1].operation == kDnnInterleaveOp) {
992             scale_factor = component[component_index - 1].output_scale_factor;
993         } else if (component[component_index - 1].operation == kDnnDeinterleaveOp) {
994             scale_factor = component[component_index - 1].output_scale_factor;
995         } else if (component[component_index - 1].operation == kDnnCopyOp) {
996             scale_factor = component[component_index - 1].output_scale_factor;
997         }
998     }
999
1000     return (scale_factor);
1001 }
1002
1003 float AmIntelDnn::WeightScaleFactor(uint32_t component_index) {
1004     float scale_factor = 1.0;
1005
1006     if (component[component_index].operation == kDnnAffineOp) {
1007         scale_factor = component[component_index].op.affine.weight_scale_factor;
1008     } else if (component[component_index].operation == kDnnDiagonalOp) {
1009         scale_factor = component[component_index].op.affine.weight_scale_factor;
1010     } else if (component[component_index].operation == kDnnConvolutional1dOp) {
1011         scale_factor = component[component_index].op.conv1D.weight_scale_factor;
1012     } else if (component[component_index].operation == kDnnRecurrentOp) {
1013         scale_factor = component[component_index].op.recurrent.weight_scale_factor;
1014     }
1015
1016     return (scale_factor);
1017 }
1018
1019 float AmIntelDnn::OutputScaleFactor(intel_dnn_component_t &comp) {
1020     return comp.output_scale_factor;
1021 }
1022
1023 void AmIntelDnn::SetOutputScaleFactor(uint32_t component_index, float scale_factor) {
1024     component[component_index].output_scale_factor = scale_factor;
1025 }
1026
1027 void AmIntelDnn::PrintOutputs(uint32_t component_index) {
1028     float scale_factor = OutputScaleFactor(component_index);
1029     uint32_t num_rows = component[component_index].num_rows_out;
1030     uint32_t num_columns = component[component_index].num_columns_out;
1031
1032     printf("component %d : %s\n", component_index, intel_dnn_operation_name[component[component_index].operation]);
1033     if (number_type_ == kDnnFloat) {
1034         auto ptr_output = reinterpret_cast<float *>(component[component_index].ptr_outputs);
1035         for (int i = 0; i < num_rows; i++) {
1036             for (int j = 0; j < num_columns; j++) {
1037                 printf("%d %d : %e\n", i, j, ptr_output[i * num_columns + j] / scale_factor);
1038             }
1039         }
1040     } else {
1041         switch (component[component_index].num_bytes_per_output) {
1042             case 1: {
1043                 auto ptr_output = reinterpret_cast<int8_t *>(component[component_index].ptr_outputs);
1044                 for (int i = 0; i < num_rows; i++) {
1045                     for (int j = 0; j < num_columns; j++) {
1046                         printf("%d %d : %e\n", i, j, static_cast<float>(ptr_output[i * num_columns + j]) / scale_factor);
1047                     }
1048                 }
1049             }
1050                 break;
1051             case 2: {
1052                 auto ptr_output = reinterpret_cast<int16_t *>(component[component_index].ptr_outputs);
1053                 for (int i = 0; i < num_rows; i++) {
1054                     for (int j = 0; j < num_columns; j++) {
1055                         printf("%d %d : %e\n", i, j, static_cast<float>(ptr_output[i * num_columns + j]) / scale_factor);
1056                     }
1057                 }
1058             }
1059                 break;
1060             case 4: {
1061                 auto ptr_output = reinterpret_cast<int32_t *>(component[component_index].ptr_outputs);
1062                 for (int i = 0; i < num_rows; i++) {
1063                     for (int j = 0; j < num_columns; j++) {
1064                         printf("%d %d : %e\n", i, j, static_cast<float>(ptr_output[i * num_columns + j]) / scale_factor);
1065                     }
1066                 }
1067             }
1068                 break;
1069             default:
1070                 fprintf(stderr,
1071                         "Bad num_bytes_per_output in component %d in AmIntelDnn::PrintOutputs()\n",
1072                         component_index);
1073                 throw -1;
1074         }
1075     }
1076 }
1077
// Compares the final component's outputs against a float reference score
// array and accumulates error statistics into score_error.
// @param ptr_refscorearray  reference scores (float), laid out frame-major
//                           when the output orientation is non-interleaved.
// @param score_error        accumulator for abs/relative error statistics;
//                           cleared before accumulation.
// @param num_frames         number of frames being compared.
// @return number of scores whose absolute error exceeds score_error->threshold.
// @throws int (-1) on an unsupported output width or number type.
uint32_t AmIntelDnn::CompareScores(void *ptr_refscorearray, intel_score_error_t *score_error, uint32_t num_frames) {
    intel_dnn_component_t *ptr_component = &component[component.size() - 1];
    intel_dnn_orientation_t orientation = ptr_component->orientation_out;
    float scale_factor = OutputScaleFactor(component.size() - 1);
    uint32_t num_errors = 0;
    // interleaved: rows are outputs, columns are frames; otherwise transposed
    uint32_t num_rows = (orientation == kDnnInterleavedOrientation) ? ptr_component->num_rows_out : num_frames;
    uint32_t num_columns = (orientation == kDnnInterleavedOrientation) ? num_frames : ptr_component->num_columns_out;
    // the reference array is indexed with the opposite row/column roles when
    // orientations differ, hence the separate stride
    uint32_t num_row_step_ref =
        (orientation == kDnnInterleavedOrientation) ? ptr_component->num_rows_out : ptr_component->num_columns_out;
    uint32_t num_row_step = ptr_component->num_columns_out;

    // an affine last layer may use an active-output list: only compare those
    if (ptr_component->operation == kDnnAffineOp) {
        num_rows = num_active_outputs_;
    }

    ClearScoreError(score_error);

    if (number_type_ == kDnnFloat) {
        auto A = reinterpret_cast<float *>(ptr_component->ptr_outputs);
        auto B = reinterpret_cast<float *>(ptr_refscorearray);
        for (int i = 0; i < num_rows; i++) {
            for (int j = 0; j < num_columns; j++) {
                float score = A[i * num_row_step + j];
                // reference is transposed relative to output when interleaved
                float refscore =
                    (orientation == kDnnInterleavedOrientation) ? B[j * num_row_step_ref + i] : B[i * num_row_step_ref
                        + j];
                float scaled_score = score / scale_factor;
                float error = fabs(refscore - scaled_score);
                // +1e-20 guards against division by zero for zero references
                float rel_error = error / (fabs(refscore) + 1e-20);
                float squared_error = error * error;
                float squared_rel_error = rel_error * rel_error;
                score_error->num_scores++;
                score_error->sum_error += error;
                score_error->sum_squared_error += squared_error;
                if (error > score_error->max_error) {
                    score_error->max_error = error;
                }
                score_error->sum_rel_error += rel_error;
                score_error->sum_squared_rel_error += squared_rel_error;
                if (rel_error > score_error->max_rel_error) {
                    score_error->max_rel_error = rel_error;
                }
                if (error > score_error->threshold) {
                    num_errors++;
                }
            }
        }
    } else if (number_type_ == kDnnInt) {
        auto B = reinterpret_cast<float *>(ptr_refscorearray);
        for (int i = 0; i < num_rows; i++) {
            for (int j = 0; j < num_columns; j++) {
                float score;
                // integer outputs are widened to float before de-scaling
                if (ptr_component->num_bytes_per_output == 4) {
                    auto A = reinterpret_cast<int32_t *>(ptr_component->ptr_outputs);
                    score = static_cast<float>(A[i * num_row_step + j]);
                } else if (ptr_component->num_bytes_per_output == 2) {
                    auto A = reinterpret_cast<int16_t *>(ptr_component->ptr_outputs);
                    score = static_cast<float>(A[i * num_row_step + j]);
                } else {
                    fprintf(stderr,
                            "Unsupported output width (%d) in AmIntelDnn::CompareScores()!\n",
                            ptr_component->num_bytes_per_output);
                    throw -1;
                }
                float refscore =
                    (orientation == kDnnInterleavedOrientation) ? B[j * num_row_step_ref + i] : B[i * num_row_step_ref
                        + j];
                float scaled_score = score / scale_factor;
                float error = fabs(refscore - scaled_score);
                // +1e-20 guards against division by zero for zero references
                float rel_error = error / (fabs(refscore) + 1e-20);
                float squared_error = error * error;
                float squared_rel_error = rel_error * rel_error;
                score_error->num_scores++;
                score_error->sum_error += error;
                score_error->sum_squared_error += squared_error;
                if (error > score_error->max_error) {
                    score_error->max_error = error;
                }
                score_error->sum_rel_error += rel_error;
                score_error->sum_squared_rel_error += squared_rel_error;
                if (rel_error > score_error->max_rel_error) {
                    score_error->max_rel_error = rel_error;
                }
                if (error > score_error->threshold) {
                    num_errors++;
                }
            }
        }
    } else {
        fprintf(stderr, "Unknown number type in AmIntelDnn::CompareScores()!\n");
        throw -1;
    }

    score_error->num_errors = num_errors;

    return (num_errors);
}
1175
1176 void AmIntelDnn::WriteGraphWizModel(const char *filename) {
1177     auto & components = component;
1178
1179 #define IS_AFFINE(k)\
1180     (components[k].operation == kDnnAffineOp ||\
1181      components[k].operation == kDnnDiagonalOp)
1182
1183 #define IS_CONV(k)\
1184     (components[k].operation == kDnnConvolutional1dOp)
1185
1186 #define IS_RELU(k)\
1187     (components[k].operation == kDnnPiecewiselinearOp &&\
1188      components[k].op.pwl.func_id == kActRelu)
1189
1190
1191 #define IS_DIAG(k)\
1192     (components[k].operation == kDnnDiagonalOp)
1193
1194 #define OUTPUTS(idx)\
1195     components[idx].ptr_outputs, components[idx].num_rows_out*components[idx].num_columns_out * components[idx].num_bytes_per_output
1196
1197 #define INPUTS(idx)\
1198     components[idx].ptr_inputs, components[idx].num_rows_in*components[idx].num_columns_in * components[idx].num_bytes_per_input
1199
1200 #define BIASES(idx)\
1201     components[idx].op.affine.ptr_biases,  components[idx].num_rows_in*components[idx].num_columns_in * components[idx].op.affine.num_bytes_per_bias
1202
1203 #define WEIGHTS(idx)\
1204     components[idx].op.affine.ptr_weights, components[idx].op.affine.num_bytes_per_weight * components[idx].num_rows_in*components[idx].num_columns_in * \
1205             (IS_DIAG(idx) ? 1 : components[idx].num_rows_out*components[idx].num_columns_out)
1206
1207     auto intersected = [](void * ptra, size_t asize, void * ptrb, size_t bsize) {
1208         return !(((reinterpret_cast<char*>(ptra) + asize) <= ptrb) || ((reinterpret_cast<char*>(ptrb) + bsize) <= ptra));
1209     };
1210
1211     auto equals = [](void * ptra, size_t asize, void * ptrb, size_t bsize) {
1212         // return !((((char*)ptra + asize) < ptrb) || (((char*)ptrb + bsize) < ptra));
1213         return ptra >= ptrb  && ptra < reinterpret_cast<char*>(ptrb) + bsize;
1214     };
1215
1216     std::fstream graph("graph.dot", std::ios::out);
1217     graph << "strict digraph {";
1218     std::set<void*> weights;
1219     std::set<void*> biases;
1220     std::set<void*> outputs;
1221     std::set<std::string> layersNames;
1222
1223     auto generate_layer_name = [&](int k) {
1224         std::string l;
1225         if (components[k].operation == kDnnPiecewiselinearOp) {
1226             l += intel_dnn_activation_name[components[k].op.pwl.func_id];
1227         } else {
1228             l += intel_dnn_operation_name[components[k].operation];
1229         }
1230         l += "_" + std::to_string(k);
1231         if (components[k].operation == kDnnPiecewiselinearOp) {
1232             graph << l << " [shape=box, style=filled, fillcolor=yellow";
1233         } else {
1234             graph << l << " [shape=box";
1235         }
1236
1237         graph << ", label=<<TABLE BORDER=\"0\" CELLBORDER=\"1\" CELLSPACING=\"0\">\n"
1238             "  <TR><TD  colspan=\"2\">" <<  l << "</TD></TR>\n"
1239             "  <TR><TD  colspan=\"2\">" <<  components[k].num_rows_in << "x" <<  components[k].num_rows_out<< "</TD></TR>\n";
1240         if (IS_AFFINE(k)) {
1241             graph << "  <TR><TD> wscale</TD><TD>" <<  components[k].op.affine.weight_scale_factor<< "</TD></TR>\n";
1242             graph << "  <TR><TD> wbit</TD><TD>" <<  components[k].op.affine.num_bytes_per_weight<< "</TD></TR>\n";
1243             graph << "  <TR><TD> bbit</TD><TD>" <<  components[k].op.affine.num_bytes_per_bias<< "</TD></TR>\n";
1244         }
1245         if (IS_RELU(k)) {
1246             graph << "  <TR><TD> negative_slope</TD><TD>" <<  components[k].op.pwl.func_id.negative_slope<< "</TD></TR>\n";
1247         }
1248         if (IS_CONV(k)) {
1249             auto &conv = components[k].op.conv1D;
1250             graph << "  <TR><TD> num_filters</TD><TD>" <<  conv.num_filters<< "</TD></TR>\n";
1251             graph << "  <TR><TD> num_filter_rows</TD><TD>" <<  conv.num_filter_rows<< "</TD></TR>\n";
1252             graph << "  <TR><TD> num_filter_coefficients</TD><TD>" <<  conv.num_filter_coefficients<< "</TD></TR>\n";
1253             graph << "  <TR><TD> num_feature_maps</TD><TD>" <<  conv.num_feature_maps<< "</TD></TR>\n";
1254             graph << "  <TR><TD> num_feature_map_rows</TD><TD>" <<  conv.num_feature_map_rows<< "</TD></TR>\n";
1255             graph << "  <TR><TD> num_feature_map_columns</TD><TD>" <<  conv.num_feature_map_columns<< "</TD></TR>\n";
1256             graph << "  <TR><TD> wscale</TD><TD>" <<  conv.weight_scale_factor<< "</TD></TR>\n";
1257             graph << "  <TR><TD> wbit</TD><TD>" <<  conv.num_bytes_per_weight<< "</TD></TR>\n";
1258             graph << "  <TR><TD> bbit</TD><TD>" <<  conv.num_bytes_per_bias<< "</TD></TR>\n";
1259         }
1260         graph<<   "  <TR><TD> num_rows_in</TD><TD>" <<  components[k].num_rows_in<< "</TD></TR>\n"
1261                   "  <TR><TD> num_columns_in</TD><TD>" <<  components[k].num_columns_in<< "</TD></TR>\n"
1262                   "  <TR><TD> num_rows_out</TD><TD>" <<  components[k].num_rows_out<< "</TD></TR>\n"
1263                   "  <TR><TD> num_columns_out</TD><TD>" <<  components[k].num_columns_out<< "</TD></TR>\n"
1264                   "  <TR><TD> oscale</TD><TD>" <<  components[k].output_scale_factor<< "</TD></TR>\n"
1265                   "  <TR><TD> ibit</TD><TD>" <<  components[k].num_bytes_per_input<< "</TD></TR>\n"
1266                   "  <TR><TD> obit</TD><TD>" <<  components[k].num_bytes_per_output<< "</TD></TR>\n"
1267             "</TABLE>>];\n";
1268
1269         return l;
1270     };
1271
1272
1273     for (int k = 0; k < components.size(); ++k) {
1274         std::string l = generate_layer_name(k);
1275         layersNames.insert(l);
1276         int lidx = std::distance(layersNames.begin(), layersNames.find(l));
1277         int widx = 0;
1278         int bidx = 0;
1279
1280         if (IS_AFFINE(k)) {
1281             weights.insert(components[k].op.affine.ptr_weights);
1282             biases.insert(components[k].op.affine.ptr_biases);
1283
1284             widx = std::distance(weights.begin(), weights.find(components[k].op.affine.ptr_weights));
1285             bidx = std::distance(biases.begin(), biases.find(components[k].op.affine.ptr_biases));
1286         }
1287
1288
1289         auto lw =  "weights_" +  std::to_string(lidx) + "_" + std::to_string(widx);;
1290         auto lb =  "biases_" +  std::to_string(lidx) + "_" + std::to_string(bidx);
1291
1292         if (IS_AFFINE(k)) {
1293             graph << lw << " -> " << l << "[style=bold];";
1294             graph << lb << " -> " << l << "[style=bold];";
1295         }
1296
1297         graph << "\n";
1298
1299         bool inputConnected = false;
1300
1301         for (int k2 = 0; k2 < components.size(); ++k2) {
1302             if (k2 == k) continue;
1303
1304
1305             std::string r = generate_layer_name(k2);
1306
1307             int w2idx = 0;
1308             int b2idx = 0;
1309
1310             if (IS_AFFINE(k2)) {
1311                 weights.insert(components[k2].op.affine.ptr_weights);
1312                 biases.insert(components[k2].op.affine.ptr_biases);
1313
1314                 w2idx = std::distance(weights.begin(), weights.find(components[k2].op.affine.ptr_weights));
1315                 b2idx = std::distance(biases.begin(), biases.find(components[k2].op.affine.ptr_biases));
1316             }
1317
1318             auto rw =  "weights_" + std::to_string(w2idx);
1319             auto rb =  "biases_" + std::to_string(b2idx);
1320
1321             // ----------------------------------------------------------
1322             // output to input connections
1323             if (intersected(OUTPUTS(k2), INPUTS(k))) {
1324                 graph << r <<" -> "<< l << ";";
1325                 inputConnected = true;
1326             }
1327
1328             // ----------------------------------------------------------
1329             // output to biases connections
1330             if (IS_AFFINE(k) && intersected(OUTPUTS(k2), BIASES(k))) {
1331                 graph << r << " -> " << lb << " [label=\"OB\", fontcolor=blue, color=blue, style=dashed];";
1332             }
1333
1334             // ----------------------------------------------------------
1335             // output to weights connections
1336             if (IS_AFFINE(k) && equals(OUTPUTS(k2), WEIGHTS(k))) {
1337                 graph << r << " -> " << lw << " [label=\"OW\", fontcolor=magenta, color=magenta, style=dashed];";
1338             }
1339
1340             // ----------------------------------------------------------
1341             // weights to input connections
1342             if (IS_AFFINE(k2) && equals(WEIGHTS(k2), INPUTS(k))) {
1343                 graph << rw << " -> " << l << " [label=\"WI\", fontcolor=red, color=red, style=dashed];";
1344                 inputConnected = true;
1345             }
1346
1347             // ----------------------------------------------------------
1348             // weights to bias connections
1349             if (IS_AFFINE(k2) && IS_AFFINE(k) && equals(WEIGHTS(k2), BIASES(k))) {
1350                 graph << rw << " -> " << lb << " [label=\"WB\", fontcolor=darkgreen,color=darkgreen, style=dashed];";
1351             }
1352         }
1353         if (!inputConnected) {
1354             // drawing tmp connection
1355             outputs.insert(components[k].ptr_inputs);
1356             auto tidx = std::distance(outputs.begin(), outputs.find(components[k].ptr_inputs));
1357             graph << tidx << " -> " << l
1358                   << " [label=\"FROM_TMP\", fontcolor=darkgreen,color=orange, style=dashed];";
1359         }
1360     }
1361
1362     for (int k = 0; k < components.size(); ++k) {
1363         std::string l = generate_layer_name(k);
1364
1365         int tidx = 0;
1366         for (auto tmpOutPtrs : outputs) {
1367             if (components[k].ptr_outputs == tmpOutPtrs) {
1368                 graph << l << " -> " << tidx << " [label=\"TO_TMP\", fontcolor=darkgreen,color=orange, style=dashed];";
1369             }
1370             tidx++;
1371         }
1372     }
1373
1374     graph << "}";
1375 }
1376
1377 void AmIntelDnn::WriteDnnText(const char *filename, intel_dnn_number_type_t number_type) {
1378     if ((number_type_ == kDnnFloat) && (number_type == kDnnInt)) {
1379         fprintf(stderr, "Error trying to write floating point DNN as integer in AmIntelDnn::WriteDnnText().\n");
1380         fprintf(stderr, "  Please convert to integer first.\n");
1381         throw -1;
1382     }
1383 #ifndef LIGHT_DUMP
1384     std::ofstream out_file1(filename, std::ios::out);
1385     std::ofstream &out_file = out_file1;
1386 #else
1387     std::ofstream out_file((std::string(filename) + ".light").c_str(), std::ios::out);
1388 #endif
1389     if (out_file.good()) {
1390         uint32_t num_inputs = component[0].num_rows_in;
1391         uint32_t num_outputs =
1392             (component[component.size() - 1].orientation_out == kDnnInterleavedOrientation) ? component[component.size()
1393                 - 1].num_rows_out : component[component.size() - 1].num_columns_out;
1394         uint32_t num_layers = num_gna_layers();
1395         uint32_t num_group = this->num_group_in();
1396         uint32_t layer = 0;
1397
1398         out_file << "<intel_dnn_file>\n";
1399         out_file << "<number_type> " << intel_dnn_number_type_name[number_type] << "\n";
1400         out_file << "<softmax_type> " << intel_dnn_softmax_name[softmax_type] << "\n";
1401         out_file << "<num_memory_bytes> " << std::dec << num_bytes_dnn_memory_ << "\n";
1402         out_file << "<num_group> " << std::dec << num_group << "\n";
1403         out_file << "<number_inputs> " << std::dec << num_inputs << "\n";
1404         out_file << "<num_outputs> " << std::dec << num_outputs << "\n";
1405         out_file << "<num_layers> " << std::dec << num_layers << "\n";
1406         for (uint32_t i = 0; i < component.size(); i++) {
1407 #ifdef LIGHT_DUMP
1408             std::stringstream out_file_name;
1409             out_file_name << getDumpFolderName() << std::setfill('0') << std::setw(2) << i << "_"
1410                           << intel_dnn_operation_name[component[i].operation]
1411                           << "-" << component[i].num_rows_in
1412                           << "-" << component[i].num_rows_out;
1413             if (component[i].operation == kDnnPiecewiselinearOp) {
1414                 out_file_name << "-" << intel_dnn_activation_name[component[i].op.pwl.func_id.type];
1415             }
1416             std::ofstream out_file((out_file_name.str() + ".txt").c_str(), std::ios::out);
1417 #endif
1418
1419             uint32_t num_rows_in = component[i].num_rows_in;
1420             uint32_t num_columns_in = component[i].num_columns_in;
1421             uint32_t num_rows_out = component[i].num_rows_out;
1422             uint32_t num_columns_out = component[i].num_columns_out;
1423             uint32_t num_bytes_per_input = component[i].num_bytes_per_input;
1424             uint32_t num_bytes_per_output = component[i].num_bytes_per_output;
1425             if ((component[i].operation == kDnnAffineOp)
1426                 || (component[i].operation == kDnnDiagonalOp)
1427                 || (component[i].operation == kDnnRecurrentOp)
1428                 || (component[i].operation == kDnnConvolutional1dOp)
1429                 || (component[i].operation == kDnnInterleaveOp)
1430                 || (component[i].operation == kDnnDeinterleaveOp)
1431                 || (component[i].operation == kDnnCopyOp)) {
1432                 out_file << "<layer_index> " << std::dec << layer << "\n";
1433                 layer++;
1434             }
1435             out_file << "<component_operation> " << intel_dnn_operation_name[component[i].operation] << "\n";
1436             out_file << "<macro_operation> " << intel_dnn_macro_operation_name[component[i].macro_operation] << "\n";
1437             out_file << "<num_rows_in> " << std::dec << num_rows_in << "\n";
1438             out_file << "<num_columns_in> " << std::dec << num_columns_in << "\n";
1439             out_file << "<num_rows_out> " << std::dec << num_rows_out << "\n";
1440             out_file << "<num_columns_out> " << std::dec << num_columns_out << "\n";
1441             out_file << "<orientation_in> " << std::dec << (component[i].orientation_in == kDnnInterleavedOrientation ?
1442             "interleaved" : "deinterleaved") << "\n";
1443             out_file << "<orientation_out> " << std::dec << (component[i].orientation_out == kDnnInterleavedOrientation ?
1444                                                             "interleaved" : "deinterleaved") << "\n";
1445
1446             if ((number_type_ == kDnnInt) && (number_type == kDnnFloat)) {
1447                 out_file << "<num_bytes_per_input> " << std::dec << sizeof(float) << "\n";
1448                 out_file << "<num_bytes_per_output> " << std::dec << sizeof(float) << "\n";
1449             } else {
1450                 out_file << "<num_bytes_per_input> " << std::dec << num_bytes_per_input << "\n";
1451                 out_file << "<num_bytes_per_output> " << std::dec << num_bytes_per_output << "\n";
1452             }
1453             out_file << "<input_address> " << "0x" << std::setfill('0') << std::setw(8) << std::hex
1454                      << MemoryOffset(component[i].ptr_inputs, ptr_dnn_memory_) << "\n";
1455             out_file << "<output_address> " << "0x" << std::setfill('0') << std::setw(8) << std::hex
1456                      << MemoryOffset(component[i].ptr_outputs, ptr_dnn_memory_) << "\n";
1457             switch (component[i].operation) {
1458                 case kDnnAffineOp:
1459                 case kDnnDiagonalOp: {
1460                     uint32_t num_bytes_per_weight = component[i].op.affine.num_bytes_per_weight;
1461                     uint32_t num_bytes_per_bias = component[i].op.affine.num_bytes_per_bias;
1462                     float weight_scale_factor = component[i].op.affine.weight_scale_factor;
1463                     float output_scale_factor = component[i].output_scale_factor;
1464                     uint32_t num_weight_rows = (component[i].operation == kDnnDiagonalOp) ? 1 : num_rows_out;
1465                     uint32_t num_weight_columns = num_rows_in;
1466                     if ((number_type_ == kDnnInt) && (number_type == kDnnFloat)) {
1467                         out_file << "<num_bytes_per_weight> " << std::dec << 4 << "\n";
1468                         out_file << "<num_bytes_per_bias> " << std::dec << 4 << "\n";
1469                     } else {
1470                         out_file << "<num_bytes_per_weight> " << std::dec << num_bytes_per_weight << "\n";
1471                         out_file << "<num_bytes_per_bias> " << std::dec << num_bytes_per_bias << "\n";
1472                     }
1473                     if ((number_type_ == kDnnInt) && (number_type == kDnnFloat)) {
1474                         out_file << std::setprecision(12) << std::scientific << "<weight_scale_factor> " << 1.0 << "\n";
1475                         out_file << std::setprecision(12) << std::scientific << "<output_scale_factor> " << 1.0 << "\n";
1476                     } else {
1477                         out_file << std::setprecision(12) << std::scientific << "<weight_scale_factor> "
1478                                  << weight_scale_factor << "\n";
1479                         out_file << std::setprecision(12) << std::scientific << "<output_scale_factor> "
1480                                  << output_scale_factor << "\n";
1481                     }
1482                     out_file << "<weight_address> " << "0x" << std::setfill('0') << std::setw(8) << std::hex
1483                              << MemoryOffset(component[i].op.affine.ptr_weights, ptr_dnn_memory_) << "\n";
1484                     out_file << "<bias_address> " << "0x" << std::setfill('0') << std::setw(8) << std::hex
1485                              << MemoryOffset(component[i].op.affine.ptr_biases, ptr_dnn_memory_) << "\n";
1486
1487                     std::ofstream out_wfile((out_file_name.str() + "_weights.txt").c_str(), std::ios::out);
1488                     std::ofstream out_bfile((out_file_name.str() + "_biases.txt").c_str(), std::ios::out);
1489
1490                     if (num_bytes_per_weight == 1) {
1491                         int8_t *ptr_weight = reinterpret_cast<int8_t *>(component[i].op.affine.ptr_weights);
1492                         intel_compound_bias_t *ptr_bias = reinterpret_cast<intel_compound_bias_t *>(component[i].op.affine.ptr_biases);
1493 #ifdef DUMP_WB
1494                         for (uint32_t row = 0; row < num_weight_rows; row++) {
1495                             for (uint32_t col = 0; col < num_weight_columns; col++) {
1496                                 if (number_type == kDnnFloat) {
1497                                     float val =
1498                                         static_cast<float>(ptr_weight[row * num_weight_columns + col]) * ptr_bias[row].multiplier
1499                                             / weight_scale_factor;
1500                                     out_wfile << std::setprecision(4) << val << " ";
1501                                 } else {
1502                                     out_wfile <<  int((int8_t) ptr_weight[row * num_weight_columns + col]) << " ";
1503                                 }
1504                                 out_wfile << "\n";
1505                             }
1506                         }
1507 #endif
1508                     } else if (num_bytes_per_weight == 2) {
1509                         int16_t *ptr_weight = reinterpret_cast<int16_t *>(component[i].op.affine.ptr_weights);
1510 #ifdef DUMP_WB
1511                         for (uint32_t row = 0; row < num_weight_rows; row++) {
1512                             for (uint32_t col = 0; col < num_weight_columns; col++) {
1513                                 if (number_type == kDnnFloat) {
1514                                     out_wfile << std::setprecision(12)
1515                                               << ptr_weight[row * num_weight_columns + col] / weight_scale_factor << " ";
1516                                 } else {
1517                                     out_wfile << ptr_weight[row * num_weight_columns + col] << " ";
1518                                 }
1519                                 out_wfile << "\n";
1520                             }
1521                         }
1522 #endif
1523                     } else if (number_type_ == kDnnFloat) {
1524                         float *ptr_weight = reinterpret_cast<float *>(component[i].op.affine.ptr_weights);
1525 #ifdef DUMP_WB
1526                         for (uint32_t row = 0; row < num_weight_rows; row++) {
1527                             for (uint32_t col = 0; col < num_weight_columns; col++) {
1528                                 out_wfile << std::setprecision(5)
1529                                           << ptr_weight[row * num_weight_columns + col] << " ";
1530                                 out_wfile << "\n";
1531                             }
1532                         }
1533 #endif
1534                     } else {
1535                         fprintf(stderr, "Unsupported weight type in WriteDnnText!\n");
1536                         throw -1;
1537                     }
1538                     if (number_type_ == kDnnInt) {
1539                         if (num_bytes_per_weight == 1) {
1540                             intel_compound_bias_t
1541                                 *ptr_biases = reinterpret_cast<intel_compound_bias_t *>(component[i].op.affine.ptr_biases);
1542 #ifdef DUMP_WB
1543                             for (uint32_t row = 0; row < num_rows_out; row++) {
1544                                 out_bfile << std::setw(8) << ptr_biases[row].bias << ", ";
1545                                 out_bfile << std::setw(8) << int(ptr_biases[row].multiplier) << "\n";
1546                             }
1547 #endif
1548                         } else {
1549                             int32_t *ptr_biases = reinterpret_cast<int32_t *>(component[i].op.affine.ptr_biases);
1550 #ifdef DUMP_WB
1551                             for (uint32_t row = 0; row < num_rows_out; row++) {
1552                                 if (number_type == kDnnInt) {
1553                                     out_bfile << std::setw(8) << ptr_biases[row] << "\n";
1554                                 } else {
1555                                     out_bfile << std::setw(8) << ptr_biases[row] / output_scale_factor << "\n";
1556                                 }
1557                             }
1558 #endif
1559                         }
1560
1561                     } else {
1562                         float *ptr_biases = reinterpret_cast<float *>(component[i].op.affine.ptr_biases);
1563 #ifdef DUMP_WB
1564
1565                         for (uint32_t row = 0; row < num_rows_out; row++) {
1566                             out_bfile << std::setprecision(5) << ptr_biases[row] << "\n";
1567                         }
1568 #endif
1569                     }
1570                 }
1571                 break;
1572                 case kDnnConvolutional1dOp: {
1573                     uint32_t num_filters = component[i].op.conv1D.num_filters;
1574                     uint32_t num_filter_rows = component[i].op.conv1D.num_filter_rows;
1575                     uint32_t num_filter_coefficients = component[i].op.conv1D.num_filter_coefficients;
1576                     uint32_t num_feature_maps = component[i].op.conv1D.num_feature_maps;
1577                     uint32_t num_feature_map_rows = component[i].op.conv1D.num_feature_map_rows;
1578                     uint32_t num_feature_map_columns = component[i].op.conv1D.num_feature_map_columns;
1579                     uint32_t num_filter_outputs =
1580                         component[i].op.conv1D.num_feature_map_rows - component[i].op.conv1D.num_filter_rows + 1;
1581                     uint32_t num_bytes_per_weight = component[i].op.conv1D.num_bytes_per_weight;
1582                     uint32_t num_bytes_per_bias = component[i].op.conv1D.num_bytes_per_bias;
1583                     float weight_scale_factor = component[i].op.conv1D.weight_scale_factor;
1584                     float output_scale_factor = component[i].output_scale_factor;
1585                     out_file << "<num_filters> " << std::dec << num_filters << "\n";
1586                     out_file << "<num_filter_coefficients> " << std::dec << num_filter_coefficients << "\n";
1587                     out_file << "<num_filter_rows> " << std::dec << num_filter_rows << "\n";
1588                     out_file << "<num_feature_maps> " << std::dec << num_feature_maps << "\n";
1589                     out_file << "<num_feature_map_rows> " << std::dec << num_feature_map_rows << "\n";
1590                     out_file << "<num_feature_map_columns> " << std::dec << num_feature_map_columns << "\n";
1591                     if ((number_type_ == kDnnInt) && (number_type == kDnnFloat)) {
1592                         out_file << "<num_bytes_per_weight> " << std::dec << 4 << "\n";
1593                         out_file << "<num_bytes_per_bias> " << std::dec << 4 << "\n";
1594                     } else {
1595                         out_file << "<num_bytes_per_weight> " << std::dec << num_bytes_per_weight << "\n";
1596                         out_file << "<num_bytes_per_bias> " << std::dec << num_bytes_per_bias << "\n";
1597                     }
1598                     if ((number_type_ == kDnnInt) && (number_type == kDnnFloat)) {
1599                         out_file << std::setprecision(12) << std::scientific << "<weight_scale_factor> " << 1.0 << "\n";
1600                         out_file << std::setprecision(12) << std::scientific << "<output_scale_factor> " << 1.0 << "\n";
1601                     } else {
1602                         out_file << std::setprecision(12) << std::scientific << "<weight_scale_factor> "
1603                                  << weight_scale_factor << "\n";
1604                         out_file << std::setprecision(12) << std::scientific << "<output_scale_factor> "
1605                                  << output_scale_factor << "\n";
1606                     }
1607                     out_file << "<filter_address> " << "0x" << std::setfill('0') << std::setw(8) << std::hex
1608                              << MemoryOffset(component[i].op.conv1D.ptr_filters, ptr_dnn_memory_) << "\n";
1609                     out_file << "<bias_address> " << "0x" << std::setfill('0') << std::setw(8) << std::hex
1610                              << MemoryOffset(component[i].op.conv1D.ptr_biases, ptr_dnn_memory_) << "\n";
1611
1612
1613                     std::ofstream out_wfile((out_file_name.str() + "_weights.txt").c_str(), std::ios::out);
1614                     std::ofstream out_bfile((out_file_name.str() + "_biases.txt").c_str(), std::ios::out);
1615
1616
1617                     if (num_bytes_per_weight == 1) {
1618                         int8_t *ptr_weight = reinterpret_cast<int8_t *>(component[i].op.conv1D.ptr_filters);
1619                         intel_compound_bias_t *ptr_bias = reinterpret_cast<intel_compound_bias_t *>(component[i].op.conv1D.ptr_biases);
1620 #ifdef DUMP_WB
1621                         for (uint32_t row = 0; row < num_filters; row++) {
1622                             for (uint32_t col = 0; col < num_filter_coefficients; col++) {
1623                                 if (number_type == kDnnFloat) {
1624                                     float val = static_cast<float>(ptr_weight[row * num_filter_coefficients + col])
1625                                         * ptr_bias[row].multiplier / weight_scale_factor;
1626                                     out_wfile << std::setprecision(12) <<val << "\n";
1627                                 } else {
1628                                     out_wfile << "0x" << std::setfill('0') << std::setw(2) << std::hex
1629                                              << int((uint8_t) ptr_weight[row * num_filter_coefficients + col]) << "\n";
1630                                 }
1631                             }
1632                         }
1633 #endif
1634                     } else if (num_bytes_per_weight == 2) {
1635                         int16_t *ptr_weight = reinterpret_cast<int16_t *>(component[i].op.conv1D.ptr_filters);
1636 #ifdef DUMP_WB
1637                         for (uint32_t row = 0; row < num_filters; row++) {
1638                             for (uint32_t col = 0; col < num_filter_coefficients; col++) {
1639                                 if (number_type == kDnnFloat) {
1640                                     out_wfile << std::setprecision(12)
1641                                              << ptr_weight[row * num_filter_coefficients + col] / weight_scale_factor
1642                                              << "\n";
1643                                 } else {
1644                                     out_wfile << "0x" << std::setfill('0') << std::setw(4) << std::hex
1645                                              << ptr_weight[row * num_filter_coefficients + col] << "\n";
1646                                 }
1647                             }
1648                         }
1649 #endif
1650                     } else if (number_type_ == kDnnFloat) {
1651                         float *ptr_weight = reinterpret_cast<float *>(component[i].op.conv1D.ptr_filters);
1652 #ifdef DUMP_WB
1653                         for (uint32_t row = 0; row < num_filters; row++) {
1654                             for (uint32_t col = 0; col < num_filter_coefficients; col++) {
1655                                 out_wfile << std::setprecision(12)
1656                                          << ptr_weight[row * num_filter_coefficients + col] << "\n";
1657                             }
1658                             out_wfile << "\n";
1659                         }
1660 #endif
1661                     } else {
1662                         fprintf(stderr, "Unsupported filter weight type in WriteDnnText!\n");
1663                         throw -1;
1664                     }
1665
1666                     if (number_type_ == kDnnInt) {
1667                         if (number_type == kDnnInt) {
1668                             if (num_bytes_per_weight == 1) {
1669                                 intel_compound_bias_t
1670                                     *ptr_biases = reinterpret_cast<intel_compound_bias_t *>(component[i].op.conv1D.ptr_biases);
1671 #ifdef DUMP_WB
1672                                 for (uint32_t row = 0; row < num_filters; row++) {
1673                                     out_bfile << "0x" << std::setfill('0') << std::setw(8) << std::hex
1674                                              << ptr_biases[row].bias << " ";
1675                                     out_bfile << "0x" << std::setfill('0') << std::setw(8) << std::hex
1676                                              << int(ptr_biases[row].multiplier) << "\n";
1677                                 }
1678 #endif
1679                             } else {
1680                                 int32_t *ptr_biases = reinterpret_cast<int32_t *>(component[i].op.conv1D.ptr_biases);
1681 #ifdef DUMP_WB
1682                                 for (uint32_t row = 0; row < num_filters; row++) {
1683                                     out_bfile << "0x" << std::setfill('0') << std::setw(8) << std::hex << ptr_biases[row]
1684                                              << "\n";
1685                                 }
1686 #endif
1687                             }
1688                         } else {
1689                             int32_t *ptr_biases = reinterpret_cast<int32_t *>(component[i].op.conv1D.ptr_biases);
1690 #ifdef DUMP_WB
1691                             for (uint32_t row = 0; row < num_filters; row++) {
1692                                 out_bfile << std::setprecision(12)
1693                                          << ptr_biases[row] / output_scale_factor << "\n";
1694                             }
1695 #endif
1696                         }
1697                     } else {
1698                         float *ptr_biases = reinterpret_cast<float *>(component[i].op.conv1D.ptr_biases);
1699 #ifdef DUMP_WB
1700                         for (uint32_t row = 0; row < num_filters; row++) {
1701                             out_bfile << std::setprecision(12) << ptr_biases[row] << "\n";
1702                         }
1703 #endif
1704                     }
1705                     out_file << "\n";
1706                 }
1707                     break;
1708                 case kDnnRecurrentOp: {
1709                     float weight_scale_factor = component[i].op.recurrent.weight_scale_factor;
1710                     float output_scale_factor = component[i].output_scale_factor;
1711                     uint32_t num_vector_delay = component[i].op.recurrent.num_vector_delay;
1712                     uint32_t num_bytes_per_weight = component[i].op.recurrent.num_bytes_per_weight;
1713                     uint32_t num_bytes_per_bias = component[i].op.recurrent.num_bytes_per_bias;
1714                     uint32_t num_weight_rows = num_columns_out;
1715                     uint32_t num_weight_columns = num_columns_in + num_columns_out;
1716                     out_file << "<num_vector_delay> " << std::dec << num_vector_delay << "\n";
1717                     if ((number_type_ == kDnnInt) && (number_type == kDnnFloat)) {
1718                         out_file << "<num_bytes_per_weight> " << std::dec << 4 << "\n";
1719                         out_file << "<num_bytes_per_bias> " << std::dec << 4 << "\n";
1720                     } else {
1721                         out_file << "<num_bytes_per_weight> " << std::dec << num_bytes_per_weight << "\n";
1722                         out_file << "<num_bytes_per_bias> " << std::dec << num_bytes_per_bias << "\n";
1723                     }
1724                     if ((number_type_ == kDnnInt) && (number_type == kDnnFloat)) {
1725                         out_file << std::setprecision(12) << std::scientific << "<weight_scale_factor> " << 1.0 << "\n";
1726                         out_file << std::setprecision(12) << std::scientific << "<output_scale_factor> " << 1.0 << "\n";
1727                     } else {
1728                         out_file << std::setprecision(12) << std::scientific << "<weight_scale_factor> "
1729                                  << weight_scale_factor << "\n";
1730                         out_file << std::setprecision(12) << std::scientific << "<output_scale_factor> "
1731                                  << output_scale_factor << "\n";
1732                     }
1733                     out_file << "<weight_address> " << "0x" << std::setfill('0') << std::setw(8) << std::hex
1734                              << MemoryOffset(component[i].op.recurrent.ptr_weights, ptr_dnn_memory_) << "\n";
1735                     out_file << "<bias_address> " << "0x" << std::setfill('0') << std::setw(8) << std::hex
1736                              << MemoryOffset(component[i].op.recurrent.ptr_biases, ptr_dnn_memory_) << "\n";
1737                     out_file << "<feedback_address> " << "0x" << std::setfill('0') << std::setw(8) << std::hex
1738                              << MemoryOffset(component[i].op.recurrent.ptr_feedbacks, ptr_dnn_memory_) << "\n";
1739                     if (num_bytes_per_weight == 1) {
1740                         int8_t *ptr_weight = reinterpret_cast<int8_t *>(component[i].op.recurrent.ptr_weights);
1741                         intel_compound_bias_t
1742                             *ptr_bias = reinterpret_cast<intel_compound_bias_t *>(component[i].op.recurrent.ptr_biases);
1743 #ifdef DUMP_WB
1744                         for (uint32_t row = 0; row < num_weight_rows; row++) {
1745                             out_file << "<weight_row> ";
1746                             for (uint32_t col = 0; col < num_weight_columns; col++) {
1747                                 if (number_type == kDnnFloat) {
1748                                     float val =
1749                                         static_cast<float>(ptr_weight[row * num_weight_columns + col]) * ptr_bias[col].multiplier
1750                                             / weight_scale_factor;
1751                                     out_file << std::setprecision(12) << std::scientific << val << " ";
1752                                 } else {
1753                                     out_file << "0x" << std::setfill('0') << std::setw(2) << std::hex
1754                                              << int((uint8_t) ptr_weight[row * num_weight_columns + col]) << " ";
1755                                 }
1756                             }
1757                             out_file << "\n";
1758                         }
1759 #endif
1760                     } else if (num_bytes_per_weight == 2) {
1761                         int16_t *ptr_weight = reinterpret_cast<int16_t *>(component[i].op.recurrent.ptr_weights);
1762 #ifdef DUMP_WB
1763                         for (uint32_t row = 0; row < num_weight_rows; row++) {
1764                             out_file << "<weight_row> ";
1765                             for (uint32_t col = 0; col < num_weight_columns; col++) {
1766                                 if (number_type == kDnnFloat) {
1767                                     out_file << std::setprecision(12) << std::scientific
1768                                              << ptr_weight[row * num_weight_columns + col] / weight_scale_factor << " ";
1769                                 } else {
1770                                     out_file << "0x" << std::setfill('0') << std::setw(4) << std::hex
1771                                              << ptr_weight[row * num_weight_columns + col] << " ";
1772                                 }
1773                             }
1774                             out_file << "\n";
1775                         }
1776 #endif
1777                     } else if (number_type_ == kDnnFloat) {
1778                         float *ptr_weight = reinterpret_cast<float *>(component[i].op.recurrent.ptr_weights);
1779 #ifdef DUMP_WB
1780                         for (uint32_t row = 0; row < num_weight_rows; row++) {
1781                             out_file << "<weight_row> ";
1782                             for (uint32_t col = 0; col < num_weight_columns; col++) {
1783                                 out_file << std::setprecision(12) << std::scientific
1784                                          << ptr_weight[row * num_weight_columns + col] << " ";
1785                             }
1786                             out_file << "\n";
1787                         }
1788 #endif
1789                     } else {
1790                         fprintf(stderr, "Unsupported weight type in WriteDnnText!\n");
1791                         throw -1;
1792                     }
1793                     if (number_type_ == kDnnInt) {
1794                         if (number_type == kDnnInt) {
1795                             if (num_bytes_per_weight == 1) {
1796                                 intel_compound_bias_t
1797                                     *ptr_biases = reinterpret_cast<intel_compound_bias_t *>(component[i].op.recurrent.ptr_biases);
1798                                 out_file << "<compound_bias>" << " ";
1799 #ifdef DUMP_WB
1800                                 for (uint32_t col = 0; col < num_columns_out; col++) {
1801                                     out_file << "0x" << std::setfill('0') << std::setw(8) << std::hex
1802                                              << ptr_biases[col].bias << " ";
1803                                     out_file << "0x" << std::setfill('0') << std::setw(8) << std::hex
1804                                              << ptr_biases[col].multiplier << " ";
1805                                 }
1806 #endif
1807                             } else {
1808                                 int32_t *ptr_biases = reinterpret_cast<int32_t *>(component[i].op.recurrent.ptr_biases);
1809                                 out_file << "<bias>" << " ";
1810 #ifdef DUMP_WB
1811                                 for (uint32_t col = 0; col < num_columns_out; col++) {
1812                                     out_file << "0x" << std::setfill('0') << std::setw(8) << std::hex << ptr_biases[col]
1813                                              << " ";
1814                                 }
1815 #endif
1816                             }
1817                         } else {
1818                             int32_t *ptr_biases = reinterpret_cast<int32_t *>(component[i].op.recurrent.ptr_biases);
1819                             out_file << "<bias>" << " ";
1820 #ifdef DUMP_WB
1821                             for (uint32_t col = 0; col < num_columns_out; col++) {
1822                                 out_file << std::setprecision(12) << std::scientific
1823                                          << ptr_biases[col] / output_scale_factor << " ";
1824                             }
1825 #endif
1826                         }
1827                     } else {
1828                         float *ptr_biases = reinterpret_cast<float *>(component[i].op.recurrent.ptr_biases);
1829                         out_file << "<bias>" << " ";
1830 #ifdef DUMP_WB
1831                         for (uint32_t col = 0; col < num_columns_out; col++) {
1832                             out_file << std::setprecision(12) << std::scientific << ptr_biases[col] << " ";
1833                         }
1834 #endif
1835                     }
1836                     out_file << "\n";
1837                 }
1838                     break;
1839                 case kDnnMaxPoolOp: {
1840                     uint32_t num_pool_type = (component[i].op.maxpool.do_sum_not_max) ? 2 : 1;
1841                     out_file << "<pool_type> " << std::dec << num_pool_type << "\n";
1842                     out_file << "<pool_size> " << std::dec << component[i].op.maxpool.num_inputs << "\n";
1843                     out_file << "<pool_step> " << std::dec << component[i].op.maxpool.num_inputs_step << "\n";
1844                     out_file << "<pool_num_rows> " << std::dec << component[i].op.maxpool.num_inputs_stride << "\n";
1845                     out_file << std::setprecision(12) << std::scientific << "<output_scale_factor> "
1846                              << component[i].output_scale_factor << "\n";
1847                 }
1848                     break;
1849                 case kDnnPiecewiselinearOp: {
1850                     intel_pwl_segment_t *ptr_segment = component[i].op.pwl.ptr_segments;
1851                     DnnActivationType func_id = component[i].op.pwl.func_id.type;
1852                     uint32_t num_segments = component[i].op.pwl.num_segments;
1853                     float output_scale_factor = component[i].output_scale_factor;
1854                     out_file << "<func_id> " << intel_dnn_activation_name[func_id] << "\n";
1855                     out_file << "<num_bytes_per_slope> " << std::dec << sizeof(int16_t) << "\n";
1856                     out_file << "<num_bytes_per_intercept> " << std::dec << sizeof(int16_t) << "\n";
1857                     out_file << "<num_bytes_per_offset> " << std::dec << sizeof(int32_t) << "\n";
1858                     if (number_type == kDnnFloat) {
1859                         out_file << std::setprecision(12) << std::scientific << "<output_scale_factor> " << 1.0 << "\n";
1860                         out_file << "<num_segments> " << std::dec << 0 << "\n";
1861                         out_file << "<segment_address> " << "0x" << std::setfill('0') << std::setw(8) << std::hex
1862                                  << MemoryOffset(component[i].op.pwl.ptr_segments, ptr_dnn_memory_) << "\n";
1863                     } else {
1864                         out_file << std::setprecision(12) << std::scientific << "<output_scale_factor> "
1865                                  << output_scale_factor << "\n";
1866                         out_file << "<num_segments> " << std::dec << num_segments << "\n";
1867                         out_file << "<segment_address> " << "0x" << std::setfill('0') << std::setw(8) << std::hex
1868                                  << MemoryOffset(component[i].op.pwl.ptr_segments, ptr_dnn_memory_) << "\n";
1869                         if (number_type_ == kDnnInt) {
1870                             out_file << "<slope> ";
1871                             for (int segment = 0; segment < num_segments; segment++) {
1872                                 out_file << "0x" << std::setfill('0') << std::setw(4) << std::hex
1873                                          << ptr_segment[segment].slope << " ";
1874                             }
1875                             out_file << "\n";
1876                             out_file << "<intercept> ";
1877                             for (int segment = 0; segment < component[i].op.pwl.num_segments; segment++) {
1878                                 out_file << "0x" << std::setfill('0') << std::setw(4) << std::hex
1879                                          << ptr_segment[segment].yBase << " ";
1880                             }
1881                             out_file << "\n";
1882                             out_file << "<offset> ";
1883                             for (int segment = 0; segment < component[i].op.pwl.num_segments; segment++) {
1884                                 out_file << "0x" << std::setfill('0') << std::setw(8) << std::hex
1885                                          << ptr_segment[segment].xBase << " ";
1886                             }
1887                             out_file << "\n";
1888                         } else if (num_segments > 0) {
1889                             fprintf(stderr,
1890                                     "Number of segments must be zero in floating point model in WriteDnnText!\n");
1891                             throw -1;
1892                         }
1893                     }
1894                 }
1895                     break;
1896                 case kDnnInterleaveOp:
1897                     out_file << std::setprecision(12) << std::scientific << "<output_scale_factor> "
1898                              << component[i].output_scale_factor << "\n";
1899                     break;
1900                 case kDnnDeinterleaveOp:
1901                     out_file << std::setprecision(12) << std::scientific << "<output_scale_factor> "
1902                              << component[i].output_scale_factor << "\n";
1903                     break;
1904                 case kDnnCopyOp:
1905                     out_file << std::setprecision(12) << std::scientific << "<output_scale_factor> "
1906                              << component[i].output_scale_factor << "\n";
1907                     out_file << "<num_copy_rows> " << std::dec << component[i].op.copy.num_copy_rows << "\n";
1908                     out_file << "<num_copy_columns> " << std::dec << component[i].op.copy.num_copy_columns << "\n";
1909                     break;
1910                 default:
1911                     out_file << "<Error!!!> Unsupported Component :  "
1912                              << intel_dnn_operation_name[component[i].operation] << "\n";
1913                     //  fprintf(stderr, "Component type %s not yet supported in AmIntelDnn::WriteDnnText()!\n",
1914                     //    intel_dnn_operation_name[component[i].operation]);
1915                     //  throw -1;
1916                     break;
1917             }
1918         }
1919         if (ptr_active_outputs() != nullptr) {
1920             out_file << "<activelist_address> " << "0x" << std::setfill('0') << std::setw(8) << std::hex
1921                      << MemoryOffset(ptr_active_outputs(), ptr_dnn_memory_) << "\n";
1922         }
1923         out_file << "<end_of_file>\n";
1924         out_file.close();
1925     } else {
1926         fprintf(stderr, "Failed to open %s for writing!\n", filename);
1927         throw -1;
1928     }
1929 }
1930
1931 void AmIntelDnn::InitGNAStruct(intel_nnet_type_t *ptr_nnet) {
1932     intel_nnet_layer_t *pLayer;
1933
1934     if (ptr_nnet == nullptr)
1935         THROW_GNA_EXCEPTION << "Invalid input parameter";
1936     if (ptr_nnet->pLayers != nullptr)
1937         THROW_GNA_EXCEPTION << "InitGNAStruct can't work on prellocated layers array";
1938     if (component.empty())
1939         THROW_GNA_EXCEPTION << "empty model in AmIntelDnn::FillGNAStruct()";
1940
1941     ptr_nnet->nLayers = 0;
1942     for (auto && c : component) {
1943         if (c.operation == kDnnAffineOp
1944             || (c.operation == kDnnDiagonalOp)
1945             || (c.operation == kDnnConvolutional1dOp)
1946             || (c.operation == kDnnDeinterleaveOp)
1947             || (c.operation == kDnnInterleaveOp)
1948             || (c.operation == kDnnRecurrentOp)
1949             || (c.operation == kDnnCopyOp)
1950             ) {
1951             ptr_nnet->nLayers++;
1952         }
1953     }
1954     ptr_nnet->nGroup = num_group_in();
1955     ptr_nnet->pLayers = reinterpret_cast<intel_nnet_layer_t *>(_mm_malloc(ptr_nnet->nLayers * sizeof(intel_nnet_layer_t), 64));
1956     if (ptr_nnet->pLayers == nullptr)
1957         THROW_GNA_EXCEPTION << "out of memory in AmIntelDnn::FillGNAStruct()";
1958     pLayer = ptr_nnet->pLayers;
1959
1960     for (int i = 0; i < component.size(); i++) {
1961         // std::cout << "Component + " << i <<"=GNA_" << std::distance(ptr_nnet->pLayers, pLayer) << "\n";
1962         switch (component[i].operation) {
1963             case kDnnAffineOp:
1964                 pLayer->nInputRows = component[i].num_rows_in;
1965                 pLayer->nInputColumns = component[i].num_columns_in;
1966                 pLayer->nOutputRows = component[i].num_rows_out;
1967                 pLayer->nOutputColumns = component[i].num_columns_out;
1968                 pLayer->nBytesPerInput = component[i].num_bytes_per_input;
1969                 pLayer->nBytesPerOutput = component[i].num_bytes_per_output;  //  will be overwritten if PWL op is needed
1970                 pLayer->nBytesPerIntermediateOutput = sizeof(int32_t);
1971                 pLayer->pInputs = component[i].ptr_inputs;
1972                 pLayer->pOutputsIntermediate = component[i].ptr_outputs;
1973                 pLayer->pOutputs = component[i].ptr_outputs;  //  will be overwritten if PWL op is needed
1974                 pLayer->nLayerKind = INTEL_AFFINE;
1975                 {
1976                     pLayer->pLayerStruct = _mm_malloc(sizeof(intel_affine_layer_t), 64);
1977                     if (pLayer->pLayerStruct == nullptr) {
1978                         THROW_GNA_EXCEPTION << "could not allocate memory for INTEL_AFFINE layer structure.";
1979                     }
1980                     auto pAffineLayer = reinterpret_cast<intel_affine_layer_t *>(pLayer->pLayerStruct);
1981                     pAffineLayer->pwl.pSegments = nullptr;
1982                     pAffineLayer->pwl.nSegments = 0;
1983
1984                     pAffineLayer->affine.nBytesPerBias = component[i].op.affine.num_bytes_per_bias;
1985                     pAffineLayer->affine.nBytesPerWeight = component[i].op.affine.num_bytes_per_weight;
1986                     pAffineLayer->affine.pBiases = component[i].op.affine.ptr_biases;
1987                     pAffineLayer->affine.pWeights = component[i].op.affine.ptr_weights;
1988                 }
1989                 if (i == component.size() - 1 || component[i + 1].operation != kDnnPiecewiselinearOp) {
1990                     pLayer++;
1991                 }
1992                 break;
1993             case kDnnDiagonalOp:
1994                 pLayer->nInputRows = component[i].num_rows_in;
1995                 pLayer->nInputColumns = component[i].num_columns_in;
1996                 pLayer->nOutputRows = component[i].num_rows_out;
1997                 pLayer->nOutputColumns = component[i].num_columns_out;
1998                 pLayer->nBytesPerInput = component[i].num_bytes_per_input;
1999                 pLayer->nBytesPerOutput = component[i].num_bytes_per_output;  //  will be overwritten if PWL op is needed
2000                 pLayer->nBytesPerIntermediateOutput = sizeof(int32_t);
2001                 pLayer->pInputs = component[i].ptr_inputs;
2002                 pLayer->pOutputsIntermediate = component[i].ptr_outputs;
2003                 pLayer->pOutputs = component[i].ptr_outputs;  //  will be overwritten if PWL op is needed
2004                 pLayer->nLayerKind = INTEL_AFFINE_DIAGONAL;
2005                 {
2006                     pLayer->pLayerStruct = _mm_malloc(sizeof(intel_affine_layer_t), 64);
2007                     if (pLayer->pLayerStruct == nullptr) {
2008                         THROW_GNA_EXCEPTION << "could not allocate memory for INTEL_AFFINE_DIAGONAL layer structure.";
2009                     }
2010                     auto pDiagonalLayer = reinterpret_cast<intel_affine_layer_t *>(pLayer->pLayerStruct);
2011                     pDiagonalLayer->pwl.pSegments = nullptr;
2012                     pDiagonalLayer->pwl.nSegments = 0;
2013
2014                     pDiagonalLayer->affine.nBytesPerBias = component[i].op.affine.num_bytes_per_bias;
2015                     pDiagonalLayer->affine.nBytesPerWeight = component[i].op.affine.num_bytes_per_weight;
2016                     pDiagonalLayer->affine.pBiases = component[i].op.affine.ptr_biases;
2017                     pDiagonalLayer->affine.pWeights = component[i].op.affine.ptr_weights;
2018                 }
2019                 if (i == component.size() - 1 || component[i + 1].operation != kDnnPiecewiselinearOp) {
2020                     pLayer++;
2021                 }
2022                 break;
2023             case kDnnRecurrentOp:
2024                 pLayer->nInputRows = component[i].num_rows_in;
2025                 pLayer->nInputColumns = component[i].num_columns_in;
2026                 pLayer->nOutputRows = component[i].num_rows_out;
2027                 pLayer->nOutputColumns = component[i].num_columns_out;
2028                 pLayer->nBytesPerInput = component[i].num_bytes_per_input;
2029                 pLayer->nBytesPerOutput = component[i].num_bytes_per_output;  //  will be overwritten if PWL op is needed
2030                 pLayer->nBytesPerIntermediateOutput = sizeof(int32_t);
2031                 pLayer->pInputs = component[i].ptr_inputs;
2032                 pLayer->pOutputsIntermediate = component[i].ptr_outputs;
2033                 pLayer->pOutputs = component[i].ptr_outputs;  //  will be overwritten if PWL op is needed
2034                 pLayer->nLayerKind = INTEL_RECURRENT;
2035                 {
2036                     pLayer->pLayerStruct = _mm_malloc(sizeof(intel_recurrent_layer_t), 64);
2037                     if (pLayer->pLayerStruct == nullptr) {
2038                         THROW_GNA_EXCEPTION << "could not allocate memory for INTEL_RECURRENT layer structure.";
2039                     }
2040                     auto pRecurrentLayer = reinterpret_cast<intel_recurrent_layer_t *>(pLayer->pLayerStruct);
2041                     pRecurrentLayer->pFeedbackBuffer = component[i].op.recurrent.ptr_feedbacks;
2042                     pRecurrentLayer->pwl.pSegments = nullptr;
2043                     pRecurrentLayer->pwl.nSegments = 0;
2044
2045                     pRecurrentLayer->affine.nBytesPerBias = component[i].op.recurrent.num_bytes_per_bias;
2046                     pRecurrentLayer->affine.nBytesPerWeight = component[i].op.recurrent.num_bytes_per_weight;
2047                     pRecurrentLayer->affine.pBiases = component[i].op.recurrent.ptr_biases;
2048                     pRecurrentLayer->affine.pWeights = component[i].op.recurrent.ptr_weights;
2049                 }
2050                 if (i == component.size() - 1 || component[i + 1].operation != kDnnPiecewiselinearOp) {
2051                     pLayer++;
2052                 }
2053                 break;
2054             case kDnnConvolutional1dOp:
2055                 pLayer->nInputRows = component[i].num_rows_in;
2056                 pLayer->nInputColumns = component[i].num_columns_in;
2057                 pLayer->nOutputRows = component[i].num_rows_out;
2058                 pLayer->nOutputColumns = component[i].num_columns_out;
2059                 pLayer->nBytesPerInput = component[i].num_bytes_per_input;
2060                 pLayer->nBytesPerOutput = component[i].num_bytes_per_output;  //  will be overwritten
2061                 pLayer->nBytesPerIntermediateOutput = sizeof(int32_t);
2062                 pLayer->pInputs = component[i].ptr_inputs;
2063                 pLayer->pOutputsIntermediate = component[i].ptr_outputs;
2064                 pLayer->pOutputs = component[i].ptr_outputs;  //  will be overwritten
2065                 pLayer->nLayerKind = INTEL_CONVOLUTIONAL;
2066                 {
2067                     pLayer->pLayerStruct = _mm_malloc(sizeof(intel_convolutional_layer_t), 64);
2068                     if (pLayer->pLayerStruct == nullptr) {
2069                         THROW_GNA_EXCEPTION << "could not allocate memory for INTEL_CONVOLUTIONAL layer structure.";
2070                     }
2071                     auto pConvolutionalLayer = reinterpret_cast<intel_convolutional_layer_t *>(pLayer->pLayerStruct);
2072                     pConvolutionalLayer->nBytesBias = component[i].op.conv1D.num_bytes_per_bias;
2073                     pConvolutionalLayer->nBytesFilterCoefficient = component[i].op.conv1D.num_bytes_per_weight;
2074                     pConvolutionalLayer->nFilters = component[i].op.conv1D.num_filters;
2075                     pConvolutionalLayer->nFilterRows = component[i].op.conv1D.num_filter_rows;
2076                     pConvolutionalLayer->nFilterCoefficients = component[i].op.conv1D.num_filter_coefficients;
2077                     pConvolutionalLayer->nFeatureMaps = component[i].op.conv1D.num_feature_maps;
2078                     pConvolutionalLayer->nFeatureMapRows = component[i].op.conv1D.num_feature_map_rows;
2079                     pConvolutionalLayer->nFeatureMapColumns = component[i].op.conv1D.num_feature_map_columns;
2080                     pConvolutionalLayer->poolType = INTEL_NO_POOLING;  //  will be overwritten
2081                     pConvolutionalLayer->nPoolSize = 0;  //  will be overwritten
2082                     pConvolutionalLayer->nPoolStride = 0;  //  will be overwritten
2083                     pConvolutionalLayer->pwl.nSegments = 0;  //  will be overwritten
2084                     pConvolutionalLayer->pwl.pSegments = nullptr;  //  will be overwritten
2085                     pConvolutionalLayer->pBiases = component[i].op.conv1D.ptr_biases;
2086                     pConvolutionalLayer->pFilters = component[i].op.conv1D.ptr_filters;
2087                 }
2088                 if (i == component.size() - 1 || ((component[i + 1].operation != kDnnMaxPoolOp)
2089                         && (component[i + 1].operation != kDnnPiecewiselinearOp))) {
2090                     pLayer++;
2091                 }
2092                 break;
2093             case kDnnMaxPoolOp:
2094                 if (i == 0) {
2095                     THROW_GNA_EXCEPTION << "Pooling component with no preceeding component";
2096                 } else if (pLayer->nLayerKind == INTEL_CONVOLUTIONAL) {
2097                     if (pLayer->pLayerStruct == nullptr) {
2098                         THROW_GNA_EXCEPTION "INTEL_CONVOLUTIONAL layer structure was not initialized.";
2099                     }
2100                     auto pConvolutionalLayer = reinterpret_cast<intel_convolutional_layer_t *>(pLayer->pLayerStruct);
2101                     // it is possible to have activation preceding to maxpool
2102                     if (pConvolutionalLayer->pwl.nSegments != 0) {
2103                         THROW_GNA_EXCEPTION << "Encountered activation component before pooling component at." << i;
2104                     } else {
2105                         pConvolutionalLayer->poolType =
2106                             (component[i].op.maxpool.do_sum_not_max) ? INTEL_SUM_POOLING : INTEL_MAX_POOLING;
2107                         pConvolutionalLayer->nPoolSize = component[i].op.maxpool.num_inputs;
2108                         pConvolutionalLayer->nPoolStride = component[i].op.maxpool.num_inputs_step;
2109
2110
2111                         // number of output columns correction - based on GNA-library expectations
2112                         auto nFltSize = pConvolutionalLayer->nFilterCoefficients;
2113                         auto fltStrideSz = pConvolutionalLayer->nFeatureMaps * pConvolutionalLayer->nFeatureMapColumns;  // always move 1 "row"
2114                         auto maxNCOE = (pLayer->nInputColumns - nFltSize) / fltStrideSz + 1;
2115                         // FLAT input matrix, pooled outputs per filter
2116                         pLayer->nOutputColumns = pConvolutionalLayer->nFilters * ((maxNCOE - 1) / pConvolutionalLayer->nPoolStride + 1);
2117
2118                         // old code
2119                         // pLayer->nOutputColumns /= pConvolutionalLayer->nPoolStride;
2120                     }
2121                 } else {
2122                     THROW_GNA_EXCEPTION << "Pooling component applied to non-convolutional layer";
2123                 }
2124                 break;
2125             case kDnnPiecewiselinearOp:
2126                 pLayer->pOutputs = component[i].ptr_outputs;
2127                 pLayer->nBytesPerOutput = component[i].num_bytes_per_output;
2128                 if (pLayer->pLayerStruct == nullptr) {
2129                     THROW_GNA_EXCEPTION << pLayer->nLayerKind << " layer structure was not initialized.";
2130                 }
2131                 if (i == 0) {
2132                     THROW_GNA_EXCEPTION << "PWL component with no preceding component.";
2133                 } else if ((component[i - 1].operation == kDnnAffineOp)
2134                     || (component[i - 1].operation == kDnnDiagonalOp)) {
2135                     auto pAffineLayer = reinterpret_cast<intel_affine_layer_t *>(pLayer->pLayerStruct);
2136                     pAffineLayer->pwl.nSegments = component[i].op.pwl.num_segments;
2137                     pAffineLayer->pwl.pSegments = component[i].op.pwl.ptr_segments;
2138                 } else if (component[i - 1].operation == kDnnRecurrentOp) {
2139                     auto pRecurrentLayer = reinterpret_cast<intel_recurrent_layer_t *>(pLayer->pLayerStruct);
2140                     pRecurrentLayer->pwl.nSegments = component[i].op.pwl.num_segments;
2141                     pRecurrentLayer->pwl.pSegments = component[i].op.pwl.ptr_segments;
2142                 } else if ((component[i - 1].operation == kDnnConvolutional1dOp)
2143                     || ((component[i - 1].operation == kDnnMaxPoolOp)
2144                         && (component[i - 2].operation == kDnnConvolutional1dOp))) {
2145                     auto pConvolutionalLayer = reinterpret_cast<intel_convolutional_layer_t *>(pLayer->pLayerStruct);
2146                     pConvolutionalLayer->pwl.nSegments = component[i].op.pwl.num_segments;
2147                     pConvolutionalLayer->pwl.pSegments = component[i].op.pwl.ptr_segments;
2148                     if (component[i - 1].operation != kDnnMaxPoolOp) {
2149                         pLayer->nOutputColumns = component[i].num_columns_out;
2150                     }
2151                 }
2152                 pLayer++;
2153
2154                 break;
2155             case kDnnInterleaveOp:
2156                 pLayer->nInputRows = component[i].num_rows_in;
2157                 pLayer->nInputColumns = component[i].num_columns_in;
2158                 pLayer->nOutputRows = component[i].num_rows_out;
2159                 pLayer->nOutputColumns = component[i].num_columns_out;
2160                 pLayer->nBytesPerInput = component[i].num_bytes_per_input;
2161                 pLayer->nBytesPerOutput = component[i].num_bytes_per_output;
2162                 pLayer->nBytesPerIntermediateOutput = sizeof(int32_t);
2163                 pLayer->pInputs = component[i].ptr_inputs;
2164                 pLayer->pOutputsIntermediate = nullptr;
2165                 pLayer->pOutputs = component[i].ptr_outputs;
2166                 pLayer->nLayerKind = INTEL_INTERLEAVE;
2167                 pLayer->pLayerStruct = nullptr;
2168                 pLayer++;
2169                 break;
2170             case kDnnDeinterleaveOp:
2171                 pLayer->nInputRows = component[i].num_rows_in;
2172                 pLayer->nInputColumns = component[i].num_columns_in;
2173                 pLayer->nOutputRows = component[i].num_rows_out;
2174                 pLayer->nOutputColumns = component[i].num_columns_out;
2175                 pLayer->nBytesPerInput = component[i].num_bytes_per_input;
2176                 pLayer->nBytesPerOutput = component[i].num_bytes_per_output;
2177                 pLayer->nBytesPerIntermediateOutput = sizeof(int32_t);
2178                 pLayer->pInputs = component[i].ptr_inputs;
2179                 pLayer->pOutputsIntermediate = nullptr;
2180                 pLayer->pOutputs = component[i].ptr_outputs;
2181                 pLayer->nLayerKind = INTEL_DEINTERLEAVE;
2182                 pLayer->pLayerStruct = nullptr;
2183                 pLayer++;
2184                 break;
2185             case kDnnCopyOp:
2186                 pLayer->nInputRows = component[i].num_columns_in;
2187                 pLayer->nInputColumns = component[i].num_rows_in;
2188                 pLayer->nOutputRows = component[i].num_columns_out;
2189                 pLayer->nOutputColumns = component[i].num_rows_out;
2190                 pLayer->nBytesPerInput = component[i].num_bytes_per_input;
2191                 pLayer->nBytesPerOutput = component[i].num_bytes_per_output;
2192                 pLayer->nBytesPerIntermediateOutput = sizeof(int32_t);
2193                 pLayer->pInputs = component[i].ptr_inputs;
2194                 pLayer->pOutputsIntermediate = nullptr;
2195                 pLayer->pOutputs = component[i].ptr_outputs;
2196                 pLayer->nLayerKind = INTEL_COPY;
2197                 pLayer->pLayerStruct = nullptr;
2198                 {
2199                     pLayer->pLayerStruct = _mm_malloc(sizeof(intel_copy_layer_t), 64);
2200                     if (pLayer->pLayerStruct == nullptr) {
2201                         THROW_GNA_EXCEPTION << pLayer->nLayerKind << " could not allocate memory for INTEL_COPY layer structure.";
2202                     }
2203                     auto *pCopyLayer = reinterpret_cast<intel_copy_layer_t *>(pLayer->pLayerStruct);
2204                     pCopyLayer->nCopyRows = component[i].op.copy.num_copy_columns;
2205                     pCopyLayer->nCopyCols = component[i].op.copy.num_copy_rows;
2206                 }
2207                 pLayer++;
2208                 break;
2209             default: {
2210                 THROW_GNA_EXCEPTION << "GNA does yet not support " << intel_dnn_operation_name[component[i].operation];
2211             }
2212         }
2213     }
2214     // enable debugging of partial array of components
2215     ptr_nnet->nLayers = std::distance(ptr_nnet->pLayers, pLayer);
2216 }
2217
2218 void AmIntelDnn::DestroyGNAStruct(intel_nnet_type_t *ptr_nnet) {
2219     ptr_nnet->nGroup = 0;
2220     if (ptr_nnet->pLayers != nullptr) {
2221         for (int i = 0; i < ptr_nnet->nLayers; i++) {
2222             switch (ptr_nnet->pLayers[i].nLayerKind) {
2223                 case INTEL_AFFINE:break;
2224                 case INTEL_AFFINE_DIAGONAL:break;
2225                 case INTEL_RECURRENT:break;
2226                 case INTEL_CONVOLUTIONAL:break;
2227                 case INTEL_INTERLEAVE:break;
2228                 case INTEL_DEINTERLEAVE:break;
2229                 case INTEL_COPY:break;
2230                 default:break;
2231             }
2232             if (ptr_nnet->pLayers[i].pLayerStruct != nullptr) {
2233                 _mm_free(ptr_nnet->pLayers[i].pLayerStruct);
2234             }
2235         }
2236         if (ptr_nnet->pLayers != nullptr) {
2237             _mm_free(ptr_nnet->pLayers);
2238         }
2239     }
2240     ptr_nnet->nLayers = 0;
2241 }
2242
2243 void AmIntelDnn::GetScaledOutput(float *ptr_output, uint32_t component_index) {
2244     if (component_index > num_components()) {
2245         fprintf(stderr, "Illegal component index %d in GetScaledOutput\n", component_index);
2246         throw -1;
2247     }
2248     if (ptr_output != nullptr) {
2249         float scale_factor = OutputScaleFactor(component_index);
2250         uint32_t num_elements = component[component_index].num_rows_out * component[component_index].num_columns_out;
2251         if (number_type_ == kDnnFloat) {
2252             float *ptr_input = reinterpret_cast<float *>(component[component_index].ptr_outputs);
2253             for (uint32_t i = 0; i < num_elements; i++) {
2254                 ptr_output[i] = ptr_input[i] / scale_factor;
2255             }
2256         } else if (component[component_index].num_bytes_per_output == 2) {
2257             int16_t *ptr_input = reinterpret_cast<int16_t *>(component[component_index].ptr_outputs);
2258             for (uint32_t i = 0; i < num_elements; i++) {
2259                 ptr_output[i] = static_cast<float>(ptr_input[i]) / scale_factor;
2260             }
2261         } else {
2262             int32_t *ptr_input = reinterpret_cast<int32_t *>(component[component_index].ptr_outputs);
2263             for (uint32_t i = 0; i < num_elements; i++) {
2264                 ptr_output[i] = static_cast<float>(ptr_input[i]) / scale_factor;
2265             }
2266         }
2267     } else {
2268         fprintf(stderr, "Output pointer is nullptr in GetScaledOutput\n");
2269         throw -1;
2270     }
2271 }
2272
void AmIntelDnn::WriteInputAndOutputTextGNA(intel_nnet_type_t * nnet) {
#ifdef LIGHT_DUMP
    // Debug helper: for every GNA layer, dump its raw inputs, outputs and PWL
    // activation segments to per-layer text files under the current dump
    // folder (./gna_layers/<N>/), and - when a matching reference output file
    // exists under ./ref_layers/ - print RMSE / average / max absolute
    // difference between this run and the reference.
    if (nnet) {
        for (int i = 0; i < nnet->nLayers; i++) {
            auto component = nnet->pLayers;
            std::stringstream out_file_name;
            // Human-readable layer-kind token used in the dump file names.
            auto getLayerType = [](intel_layer_kind_t kind){
                switch (kind){
                    case INTEL_AFFINE : return "affine";
                    case INTEL_AFFINE_DIAGONAL : return "diag";
                    case INTEL_RECURRENT : return "recurrent";
                    case INTEL_CONVOLUTIONAL : return "convolution";
                    case INTEL_INTERLEAVE : return "interleave";
                    case INTEL_DEINTERLEAVE : return "deinterleave";
                    case INTEL_COPY : return "copy";
                    default: return "unknown";
                }
            };
            // File-name pattern: <2-digit index>_<kind>-<inputRows>-<outputRows>
            out_file_name << std::setfill('0') << std::setw(2) << i << "_"
                          << getLayerType(component[i].nLayerKind)
                          << "-" << nnet->pLayers[i].nInputRows
                          << "-" << nnet->pLayers[i].nOutputRows;

            auto inputfileName = getDumpFolderNameGNA() + out_file_name.str() + "_input.txt";
            auto outFileName = getDumpFolderNameGNA() + out_file_name.str() + "_output.txt";
            auto pwlFileName = getDumpFolderNameGNA() + out_file_name.str() + "_pwl.txt";
            auto refOutputFileName = getRefFolderName() + out_file_name.str() + "_output.txt";

            std::ofstream out_file(outFileName.c_str(), std::ios::out);
            std::ofstream pwl_file(pwlFileName.c_str(), std::ios::out);
            std::ifstream ref_out_file(refOutputFileName.c_str(), std::ios::in);
            std::ofstream in_file(inputfileName.c_str(), std::ios::out);

            // Accumulators for the comparison against the reference dump.
            float  summOfDiff = 0.f;
            float  summOfSqDiff = 0.f;
            float  maxD = 0.0f;
            int    numItems = 0;

            // Dump the piecewise-linear activation segments, one
            // "slope, xBase, yBase" triple per line.
            auto write_pwl = [&pwl_file](intel_pwl_func_t & pwl) {
                for (int k =0; k < pwl.nSegments; k++) {
                    pwl_file << pwl.pSegments[k].slope << ", " << pwl.pSegments[k].xBase << ", " << pwl.pSegments[k].yBase << "\n";
                }
            };
            // Only affine/diagonal and convolutional layers carry a PWL in
            // their pLayerStruct.
            if (nnet->pLayers[i].nLayerKind == INTEL_AFFINE || nnet->pLayers[i].nLayerKind == INTEL_AFFINE_DIAGONAL) {
                auto affine = reinterpret_cast<intel_affine_layer_t*>(nnet->pLayers[i].pLayerStruct);
                write_pwl(affine->pwl);
            }
            if (nnet->pLayers[i].nLayerKind == INTEL_CONVOLUTIONAL) {
                auto conv = reinterpret_cast<intel_convolutional_layer_t*>(nnet->pLayers[i].pLayerStruct);
                write_pwl(conv->pwl);
            }

            // Outputs: raw 16/32-bit integer values, written row-major,
            // one value per line.
            for (int k = 0; k < component[i].nOutputRows; k++) {
                for (int j = 0; j < component[i].nOutputColumns; j++) {
                    float floatValue = 0.f;
                    if (component[i].nBytesPerOutput == 4) {
                        auto value = (reinterpret_cast<int32_t *>(component[i].pOutputs)[k * component[i].nOutputColumns + j]);
                        floatValue = (static_cast<float>(value) / 1.0);
                    } else {
                        auto value = reinterpret_cast<int16_t *>(component[i].pOutputs)[k * component[i].nOutputColumns + j];
                        floatValue = (static_cast<float>(value) / 1.0);
                    }
                    out_file << std::setw(8) << floatValue << "\n";
                    // When a reference dump exists, accumulate absolute-error
                    // statistics against it, value by value.
                    if (ref_out_file) {
                        float ref_value = 0.f;
                        ref_out_file >> ref_value;
                        float diff = (ref_value - floatValue);
                        diff = diff  < 0 ? -diff : diff;
                        summOfDiff += diff;
                        summOfSqDiff += diff * diff;
                        maxD = std::max(maxD, diff);
                        numItems++;
                    }
                }
            }
            // Per-layer summary line on stdout when a reference was compared.
            if (numItems) {
                auto rmse = sqrt(summOfSqDiff / numItems);
                auto avg = summOfDiff / numItems;
                std :: cout << std::left << std::setw(55) << out_file_name.str()
                            << " RMSE="<< std::fixed << std::setprecision(5) << std::right << std::setw(8) << rmse
                            << " avg=" << std::fixed << std::setprecision(5) << std::right << std::setw(8) << avg
                            << " maxD="<< std::fixed << std::setprecision(5) << std::right << std::setw(8) << maxD << std::endl;
            }


            // Inputs: raw 16/32-bit integer values, written row-major,
            // one value per line.
            for (int k = 0; k < component[i].nInputRows; k++) {
                for (int j = 0; j < component[i].nInputColumns; j++) {
                    if (component[i].nBytesPerInput == 4) {
                        in_file << std::setw(8)
                                << (reinterpret_cast<int32_t *>(component[i].pInputs)[k * component[i].nInputColumns + j]);
                    } else {
                        in_file << std::setw(8)
                                << (reinterpret_cast<int16_t *>(component[i].pInputs)[k * component[i].nInputColumns + j]);
                    }
                    in_file << "\n";
                }
            }
        }
    }
#endif
}
2374
2375 void AmIntelDnn::WriteInputAndOutputText() {
2376 #ifdef LIGHT_DUMP
2377     for (int i = 0; i < num_components(); i++) {
2378         std::stringstream out_file_name;
2379         out_file_name << std::setfill('0') << std::setw(2) << i << "_"
2380                       << intel_dnn_operation_name[component[i].operation]
2381                       << "-" << component[i].num_rows_in
2382                       << "-" << component[i].num_rows_out;
2383         if (component[i].operation == kDnnPiecewiselinearOp) {
2384             out_file_name << "-" << intel_dnn_activation_name[component[i].op.pwl.func_id];
2385         }
2386         auto inputfileName = getDumpFolderName() + out_file_name.str() + "_input.txt";
2387         auto outFileName = getDumpFolderName() + out_file_name.str() + "_output.txt";
2388         auto refOutputFileName = getRefFolderName() + out_file_name.str() + "_output.txt";
2389
2390         std::ofstream out_file(outFileName.c_str(), std::ios::out);
2391         std::ifstream ref_out_file(refOutputFileName.c_str(), std::ios::in);
2392         std::ofstream in_file(inputfileName.c_str(), std::ios::out);
2393
2394         float  summOfDiff = 0.f;
2395         float  summOfSqDiff = 0.f;
2396         float  maxD = 0.0f;
2397         int    numItems = 0;
2398
2399         for (int k = 0; k < component[i].num_rows_out; k++) {
2400             for (int j = 0; j < component[i].num_columns_out; j++) {
2401                 float floatValue = 0.f;
2402                 if (component[i].num_bytes_per_output == 4) {
2403                     if (number_type_ == kDnnInt) {
2404                         auto value = reinterpret_cast<int32_t *>(component[i].ptr_outputs)[k * component[i].num_columns_out+ j];
2405                         floatValue = static_cast<float>(value);
2406
2407                     } else {
2408                         floatValue = reinterpret_cast<float*>(component[i].ptr_outputs)[k * component[i].num_columns_out+ j];
2409                     }
2410                 } else {
2411                     auto value = reinterpret_cast<int16_t *>(component[i].ptr_outputs)[k * component[i].num_columns_out+ j];
2412                     floatValue = static_cast<float>(value);
2413                 }
2414                 out_file << std::setw(8) << floatValue / component[i].output_scale_factor << "\n";
2415
2416                 if (ref_out_file) {
2417                     float ref_value = 0.f;
2418                     ref_out_file >> ref_value;
2419                     float diff = (ref_value - floatValue);
2420                     diff = diff < 0.f ? -diff : diff;
2421                     summOfDiff += diff;
2422                     summOfSqDiff += diff * diff;
2423                     maxD = std::max(maxD, diff);
2424                     numItems++;
2425                 }
2426             }
2427         }
2428         if (numItems) {
2429             auto rmse = sqrt(summOfSqDiff / numItems);
2430             auto avg = summOfDiff / numItems;
2431             std :: cout << std::left << std::setw(55) << out_file_name.str()
2432                         << " RMSE="<< std::fixed << std::setprecision(5) << std::right << std::setw(8) << rmse
2433                         << " avg=" << std::fixed << std::setprecision(5) << std::right << std::setw(8) << avg
2434                         << " maxD="<< std::fixed << std::setprecision(5) << std::right << std::setw(8) << maxD << std::endl;
2435         }
2436
2437         float input_scale_factor = component[i].output_scale_factor;
2438         if (component[i].operation == kDnnAffineOp ||
2439             component[i].operation == kDnnDiagonalOp) {
2440             input_scale_factor /= component[i].op.affine.weight_scale_factor;
2441         } else if (component[i].operation == kDnnConvolutional1dOp) {
2442             input_scale_factor /= component[i].op.conv1D.weight_scale_factor;
2443         } else if (component[i].operation == kDnnPiecewiselinearOp) {
2444             input_scale_factor = 1.f;
2445         }
2446
2447         for (int k = 0; k < component[i].num_rows_in; k++) {
2448             for (int j = 0; j < component[i].num_columns_in; j++) {
2449                 float floatValue = 0.f;
2450                 if (component[i].num_bytes_per_input == 4) {
2451                     if (number_type_ == kDnnInt) {
2452                         auto value = reinterpret_cast<int32_t *>(component[i].ptr_inputs)[k * component[i].num_columns_in + j];
2453                         floatValue = static_cast<float>(value);
2454                     } else {
2455                         floatValue = reinterpret_cast<float *>(component[i].ptr_inputs)[k * component[i].num_columns_in + j];
2456                     }
2457                 } else {
2458                     auto value = reinterpret_cast<int16_t *>(component[i].ptr_inputs)[k * component[i].num_columns_in+ j];
2459                     floatValue = static_cast<float>(value);
2460                 }
2461                 in_file << std::setw(8) << floatValue / input_scale_factor << "\n";
2462             }
2463         }
2464 #endif
2465     }
2466 }
2467
2468 bool isCompatibleDnn(AmIntelDnn dnn1, AmIntelDnn dnn2) {
2469     bool isCompatible = true;
2470
2471     // compare basic structures to see if they are compatible
2472     if (dnn1.num_components() != dnn2.num_components()) isCompatible = false;
2473     for (int i = 0; i < dnn1.num_components(); i++) {
2474         if (dnn1.component[i].num_rows_in != dnn2.component[i].num_rows_in) isCompatible = false;
2475         if (dnn1.component[i].num_columns_in != dnn2.component[i].num_columns_in) isCompatible = false;
2476         if (dnn1.component[i].num_rows_out != dnn2.component[i].num_rows_out) isCompatible = false;
2477         if (dnn1.component[i].num_columns_out != dnn2.component[i].num_columns_out) isCompatible = false;
2478         if (dnn1.component[i].operation != dnn2.component[i].operation) isCompatible = false;
2479     }
2480
2481     return (isCompatible);
2482 }
2483
2484 void ClearScoreError(intel_score_error_t *error) {
2485     error->num_scores = 0;
2486     error->num_errors = 0;
2487     error->max_error = 0.0;
2488     error->sum_error = 0.0;
2489     error->sum_squared_error = 0.0;
2490     error->max_rel_error = 0.0;
2491     error->sum_rel_error = 0.0;
2492     error->sum_squared_rel_error = 0.0;
2493 }
2494
2495 void UpdateScoreError(intel_score_error_t *error, intel_score_error_t *total_error) {
2496     total_error->num_errors += error->num_errors;
2497     total_error->num_scores += error->num_scores;
2498     total_error->sum_error += error->sum_error;
2499     total_error->sum_squared_error += error->sum_squared_error;
2500     if (error->max_error > total_error->max_error) {
2501         total_error->max_error = error->max_error;
2502     }
2503     total_error->sum_rel_error += error->sum_rel_error;
2504     total_error->sum_squared_rel_error += error->sum_squared_rel_error;
2505     if (error->max_rel_error > total_error->max_rel_error) {
2506         total_error->max_rel_error = error->max_rel_error;
2507     }
2508 }
2509
void SoftmaxGoogle(float *ptr_output, float *ptr_input, const uint32_t num_outputs, const uint32_t num_inputs) {
    // Log-domain softmax: assumes the inputs are log likelihoods and writes
    //   output[i] = input[i] - log(sum_j exp(input[j]))
    // so the outputs stay log likelihoods that are normalized (their
    // exponentials sum to 1). The maximum input is subtracted inside the
    // exponential for numerical stability.

    // Locate the largest input score.
    float peak = ptr_input[0];
    for (uint32_t j = 1; j < num_inputs; j++) {
        peak = std::max(peak, ptr_input[j]);
    }

    // Accumulate exp(x - peak); accumulation order matches element order.
    float sum = 0.0;
    for (uint32_t j = 0; j < num_inputs; j++) {
        sum += exp(ptr_input[j] - sum * 0.f - peak);
    }

    // Guard against log(0) when every shifted exponential underflowed.
    if (sum < 1.0e-20) {
        fprintf(stderr, "Warning:  attempt to take log(0) in SoftmaxGoogle()!\n");
        sum = 1.0e-20;
    }

    // Subtract the log of the normalizer (shifted back by the peak).
    float log_denominator = peak + log(sum);
    for (uint32_t j = 0; j < num_outputs; j++) {
        ptr_output[j] = ptr_input[j] - log_denominator;
    }
}