Publishing 2019 R1 content
[platform/upstream/dldt.git] / inference-engine / src / gna_plugin / dnn.h
1 // Copyright (C) 2018-2019 Intel Corporation
2 // SPDX-License-Identifier: Apache-2.0
3 //
4
5 #pragma once
6
7 #include <stdlib.h>
8 #include <stdio.h>
9 #include <malloc.h>
10 #include <memory.h>
11 #include <xmmintrin.h>
12 #include <iostream>
13 #include <fstream>
14 #include <sstream>
15 #include <string>
16 #include <iomanip>
17 #include <type_traits>
18 #include <vector>
19 #include "gna-api.h"
20
21 #define DNN_MAX_BATCH_SIZE 8
22 #define DNN_MAX_INPUTS 3072
23 #define DNN_MAX_OUTPUTS 8192
24 #define DNN_MAX_ERROR 1.0e-4f
25 #define DNN_NUM_BYTES_INT_BIAS 4
26 #define DNN_NUM_BYTES_INT_AFFINE_OUT 4
27 #define DNN_RAND_INT8_AMPLITUDE 127.0f
28 #define DNN_RAND_INT16_AMPLITUDE 16384.0f
29 #define DNN_RAND_INT32_AMPLITUDE 1048576.0f
30 #define DNN_RAND_FLOAT32_AMPLITUDE 8.0f
31
// Kinds of activation a DnnActivation can carry. kActNumType is not an
// activation itself: it counts the entries before it and sizes
// intel_dnn_activation_name, so new kinds must be inserted ahead of it.
enum DnnActivationType {
    kActNone = 0,
    kActSigmoid,
    kActTanh,
    kActRelu,
    kActLeakyRelu,
    kActIdentity,
    kActKaldiLstmClipping,
    kActCustom,
    kActNumType
};
43 struct DnnActivation {
44     // for prelu
45     DnnActivationType type;
46     float negative_slope;
47     operator DnnActivationType () const noexcept {
48         return type;
49     }
50     static DnnActivation fromType(DnnActivationType type) {
51         DnnActivation activation;
52         activation.type = type;
53         activation.negative_slope = 0.0f;
54         return activation;
55     }
56 };
57
58 static_assert(std::is_trivial<DnnActivation>::value, "DnnActivation is not trival type");
59
60 static const char *intel_dnn_activation_name[kActNumType] = {
61     "kActNone",
62     "kActSigmoid",
63     "kActTanh",
64     "kActRelu",
65     "kActLeakyRelu",
66     "kActIdentity",
67     "kActKaldiLstmClipping",
68     "kActCustom"
69 };
70
// Softmax variants. kSoftmaxNumType is the entry count and sizes
// intel_dnn_softmax_name; keep it last.
typedef enum DnnSoftmaxType {
    kSoftmaxNone = 0,
    kSoftmaxKaldiSumgroup,
    kSoftmaxEesen,
    kSoftmaxGoogle,
    kSoftmaxNumType
} intel_dnn_softmax_type_t;
78
79 static const char *intel_dnn_softmax_name[kSoftmaxNumType] = {
80     "kSoftmaxNone",
81     "kSoftmaxKaldiSumGroup",
82     "kSoftmaxKaldiApplyLog",
83     "kSoftmaxGoogle"
84 };
85
// Data orientation tag stored per component for its input and its output
// (orientation_in / orientation_out in intel_dnn_component_t).
// kDnnNumOrientation is the entry count, not an orientation.
typedef enum {
    kDnnUnknownOrientation = 0,
    kDnnInterleavedOrientation,
    kDnnNonInterleavedOrientation,
    kDnnNumOrientation
} intel_dnn_orientation_t;
92
// Operation performed by a component. kDnnNumOp is the entry count and sizes
// intel_dnn_operation_name; keep it last and keep both lists in step.
typedef enum {
    kDnnNullOp = 0,
    kDnnAffineOp,
    kDnnDiagonalOp,
    kDnnConvolutional1dOp,
    kDnnPiecewiselinearOp,
    kDnnMaxPoolOp,
    kDnnRecurrentOp,
    kDnnInterleaveOp,
    kDnnDeinterleaveOp,
    kDnnCopyOp,
    kDnnNumOp
} intel_dnn_operation_t;
106
107 static const char *intel_dnn_operation_name[kDnnNumOp] = {
108     "kDnnNullOp",
109     "kDnnAffineOp",
110     "kDnnDiagonalOp",
111     "kDnnConvolutional1dOp",
112     "kDnnPiecewiselinearOp",
113     "kDnnMaxPoolOp",
114     "kDnnRecurrentOp",
115     "kDnnInterleaveOp",
116     "kDnnDeinterleaveOp",
117     "kDnnCopyOp"
118 };
119
// Macro-operation tag attached to a component (see the macro_operation field
// of intel_dnn_component_t). kDnnNumMacroOp is the entry count.
typedef enum {
    kDnnMacroOpNone = 0,
    kDnnMacroOpLstm,
    kDnnMacroOpBiLstm,
    kDnnNumMacroOp
} intel_dnn_macro_operation_t;
126
127 static const char *intel_dnn_macro_operation_name[kDnnNumMacroOp] = {
128     "kDnnMacroOpNone",
129     "kDnnMacroOpLstm",
130     "kDnnMacroOpBiLstm"
131 };
132
// Numeric representation selector. kDnnNumNumberType is the entry count and
// sizes intel_dnn_number_type_name.
typedef enum {
    kDnnFloat = 0,
    kDnnInt,
    kDnnNumNumberType
} intel_dnn_number_type_t;
138
139 static const char *intel_dnn_number_type_name[kDnnNumNumberType] = {
140     "kDnnFloat",
141     "kDnnInt"
142 };
143
// Parameters of an affine component; stored as `op.affine` inside
// intel_dnn_component_t. The weight and bias buffers are referenced through
// untyped pointers, with the element widths given by the num_bytes_* fields.
typedef struct {
    uint32_t num_bytes_per_weight;
    uint32_t num_bytes_per_bias;
    float weight_scale_factor;
    void *ptr_weights;
    void *ptr_biases;
} intel_affine_t;
151
// Parameters of a 1-D convolutional component; stored as `op.conv1D` inside
// intel_dnn_component_t.
typedef struct {
    uint32_t num_bytes_per_weight;
    uint32_t num_bytes_per_bias;
    uint32_t num_filters;
    uint32_t num_filter_rows;
    uint32_t num_filter_coefficients;
    uint32_t num_feature_maps;
    uint32_t num_feature_map_rows;
    uint32_t num_feature_map_columns;
    float weight_scale_factor;
    void *ptr_filters;  // filters stored one after the other
    void *ptr_biases;
} intel_convolutionalD_t;
165
// Parameters of a pooling component; stored as `op.maxpool` inside
// intel_dnn_component_t.
typedef struct {
    uint32_t num_inputs;         // pool size
    uint32_t num_inputs_step;    // pool step
    uint32_t num_inputs_stride;  // pool stride (number of convolution filters)
    bool do_sum_not_max;         // presumably selects sum pooling over max pooling; confirm in the implementation
} intel_maxpool_t;
172
173 typedef struct {
174     DnnActivation func_id;       // identifies function being approximated
175     uint32_t num_segments;
176     intel_pwl_segment_t *ptr_segments;
177 } intel_piecewiselinear_t;
178
// Parameters of a recurrent component; stored as `op.recurrent` inside
// intel_dnn_component_t. In addition to weights and biases it carries a
// feedback buffer pointer and a vector delay count.
typedef struct {
    uint32_t num_vector_delay;
    uint32_t num_bytes_per_weight;
    uint32_t num_bytes_per_bias;
    float weight_scale_factor;
    void *ptr_feedbacks;
    void *ptr_weights;
    void *ptr_biases;
} intel_recurrent_t;
188
// Interleave components carry no operation-specific parameters; this empty
// type only gives the `op` union of intel_dnn_component_t a matching member.
typedef struct {
} intel_interleave_t;
191
// Deinterleave components carry no operation-specific parameters; this empty
// type only gives the `op` union of intel_dnn_component_t a matching member.
typedef struct {
} intel_deinterleave_t;
194
// Parameters of a copy component; stored as `op.copy` inside
// intel_dnn_component_t.
typedef struct {
    uint32_t num_copy_columns;  // number of columns to copy
    uint32_t num_copy_rows;     // number of rows to copy
} intel_copy_t;
199
200 typedef struct {
201     uint32_t num_rows_in;
202     uint32_t num_columns_in;
203     uint32_t num_rows_out;
204     uint32_t num_columns_out;
205     uint32_t num_bytes_per_input;
206     uint32_t num_bytes_per_output;
207     intel_dnn_operation_t operation;
208     intel_dnn_macro_operation_t macro_operation;
209     intel_dnn_orientation_t orientation_in;
210     intel_dnn_orientation_t orientation_out;
211     union operation_struct_t {
212         intel_affine_t affine;
213         intel_convolutionalD_t conv1D;
214         intel_maxpool_t maxpool;
215         intel_piecewiselinear_t pwl;
216         intel_recurrent_t recurrent;
217         intel_interleave_t interleave;
218         intel_deinterleave_t deinterleave;
219         intel_copy_t copy;
220     } op;
221     void *ptr_inputs;
222     void *ptr_outputs;
223     float output_scale_factor;
224 } intel_dnn_component_t;
225
// Error statistics record; AmIntelDnn::CompareScores takes a pointer to one.
// How each field is accumulated is defined by that function's implementation,
// which is not part of this header.
typedef struct {
    uint32_t num_scores;
    uint32_t num_errors;
    float threshold;
    // Absolute-error statistics.
    float max_error;
    float rms_error;
    float sum_error;
    float sum_rms_error;
    float sum_squared_error;
    // Relative-error statistics.
    float max_rel_error;
    float sum_rel_error;
    float sum_squared_rel_error;
} intel_score_error_t;
239
240 class AmIntelDnn {
241  public:
242     AmIntelDnn()
243         : ptr_active_outputs_(NULL),
244           num_active_outputs_(0),
245           input_scale_factor_(1.0),
246           num_left_context(0),
247           num_right_context(0),
248           do_rotate_input(false),
249           num_rotate_rows(0),
250           num_rotate_columns(0),
251           softmax_type(kSoftmaxNone),
252           ptr_sumgroup_sizes(NULL),
253           num_sumgroup_sizes(0),
254           ptr_priors(NULL),
255           ptr_dnn_memory_(NULL) {
256     }
257
258     ~AmIntelDnn() {
259         component.clear();
260         if (ptr_sumgroup_sizes != NULL) {
261             _mm_free(ptr_sumgroup_sizes);
262         }
263         if (ptr_priors != NULL) {
264             _mm_free(ptr_priors);
265         }
266     }
267
268     uint32_t num_components() { return (uint32_t) component.size(); }
269
270     void Init(void *ptr_memory, uint32_t num_memory_bytes, intel_dnn_number_type_t number_type, float scale_factor);
271     void InitActiveList(uint32_t *ptr_active_list);
272
273     template<class A, class B, class C, class D>
274     static void InitAffineComponent(intel_dnn_component_t &comp,
275                              uint32_t num_rows_in,
276                              uint32_t num_columns,
277                              uint32_t num_rows_out,
278                              uint32_t num_bytes_per_input,
279                              uint32_t num_bytes_per_output,
280                              uint32_t num_bytes_per_weight,
281                              uint32_t num_bytes_per_bias,
282                              float weight_scale_factor,
283                              float output_scale_factor,
284                              A *&ptr_inputs,
285                              B *&ptr_outputs,
286                              C *&ptr_weights,
287                              D *&ptr_biases,
288                              bool isDiag = false) {
289         InitAffineComponentPrivate(comp,
290                                    num_rows_in,
291                                    num_columns,
292                                    num_rows_out,
293                                    num_bytes_per_input,
294                                    num_bytes_per_output,
295                                    num_bytes_per_weight,
296                                    num_bytes_per_bias,
297                                    weight_scale_factor,
298                                    output_scale_factor,
299                                    (void *&) ptr_inputs,
300                                    (void *&) ptr_outputs,
301                                    (void *&) ptr_weights,
302                                    (void *&) ptr_biases,
303                                    isDiag,
304                                    true);
305     }
306
307     template<class A, class B, class C, class D>
308     void InitAffineComponent(uint32_t component_index,
309                              uint32_t num_rows_in,
310                              uint32_t num_columns,
311                              uint32_t num_rows_out,
312                              uint32_t num_bytes_per_input,
313                              uint32_t num_bytes_per_output,
314                              uint32_t num_bytes_per_weight,
315                              uint32_t num_bytes_per_bias,
316                              float weight_scale_factor,
317                              float output_scale_factor,
318                              A *&ptr_inputs,
319                              B *&ptr_outputs,
320                              C *&ptr_weights,
321                              D *&ptr_biases,
322                              bool isDiag = false) {
323         InitAffineComponentPrivate(component[component_index],
324                                    num_rows_in,
325                                    num_columns,
326                                    num_rows_out,
327                                    num_bytes_per_input,
328                                    num_bytes_per_output,
329                                    num_bytes_per_weight,
330                                    num_bytes_per_bias,
331                                    weight_scale_factor,
332                                    output_scale_factor,
333                                    (void *&) ptr_inputs,
334                                    (void *&) ptr_outputs,
335                                    (void *&) ptr_weights,
336                                    (void *&) ptr_biases,
337                                    isDiag,
338                                    false);
339     }
340
341     void InitDiagonalComponent(uint32_t component_index,
342                                uint32_t num_rows_in,
343                                uint32_t num_columns,
344                                uint32_t num_rows_out,
345                                uint32_t num_bytes_per_input,
346                                uint32_t num_bytes_per_output,
347                                uint32_t num_bytes_per_weight,
348                                uint32_t num_bytes_per_bias,
349                                float weight_scale_factor,
350                                float output_scale_factor,
351                                void *ptr_inputs,
352                                void *ptr_outputs,
353                                void *ptr_weights,
354                                void *ptr_biases);
355
356     template<class A, class B, class C, class D>
357     void InitConvolutional1DComponent(uint32_t component_index,
358                                       uint32_t num_rows_in,
359                                       uint32_t num_columns_in,
360                                       uint32_t num_rows_out,
361                                       uint32_t num_columns_out,
362                                       uint32_t num_bytes_per_input,
363                                       uint32_t num_bytes_per_output,
364                                       uint32_t num_bytes_per_weight,
365                                       uint32_t num_bytes_per_bias,
366                                       uint32_t num_filters,
367                                       uint32_t num_filter_rows,
368                                       uint32_t num_filter_coefficients,
369                                       uint32_t num_feature_maps,
370                                       uint32_t num_feature_map_rows,
371                                       uint32_t num_feature_map_columns,
372                                       float weight_scale_factor,
373                                       float output_scale_factor,
374                                       A *& ptr_inputs,
375                                       B *& ptr_outputs,
376                                       C *& ptr_filters,
377                                       D *& ptr_biases) {
378         InitConvolutional1DComponentPrivate(component[component_index],
379                                             num_rows_in,
380                                             num_columns_in,
381                                             num_rows_out,
382                                             num_columns_out,
383                                             num_bytes_per_input,
384                                             num_bytes_per_output,
385                                             num_bytes_per_weight,
386                                             num_bytes_per_bias,
387                                             num_filters,
388                                             num_filter_rows,
389                                             num_filter_coefficients,
390                                             num_feature_maps,
391                                             num_feature_map_rows,
392                                             num_feature_map_columns,
393                                             weight_scale_factor,
394                                             output_scale_factor,
395                                             (void *&) ptr_inputs,
396                                             (void *&) ptr_outputs,
397                                             (void *&) ptr_filters,
398                                             (void *&) ptr_biases,
399                                             false);
400     }
401
402     template<class A, class B, class C, class D>
403     static void InitConvolutional1DComponent(intel_dnn_component_t &comp,
404                                       uint32_t num_rows_in,
405                                       uint32_t num_columns_in,
406                                       uint32_t num_rows_out,
407                                       uint32_t num_columns_out,
408                                       uint32_t num_bytes_per_input,
409                                       uint32_t num_bytes_per_output,
410                                       uint32_t num_bytes_per_weight,
411                                       uint32_t num_bytes_per_bias,
412                                       uint32_t num_filters,
413                                       uint32_t num_filter_rows,
414                                       uint32_t num_filter_coefficients,
415                                       uint32_t num_feature_maps,
416                                       uint32_t num_feature_map_rows,
417                                       uint32_t num_feature_map_columns,
418                                       float weight_scale_factor,
419                                       float output_scale_factor,
420                                       A *& ptr_inputs,
421                                       B *& ptr_outputs,
422                                       C *& ptr_filters,
423                                       D *& ptr_biases) {
424         InitConvolutional1DComponentPrivate(comp,
425                                             num_rows_in,
426                                             num_columns_in,
427                                             num_rows_out,
428                                             num_columns_out,
429                                             num_bytes_per_input,
430                                             num_bytes_per_output,
431                                             num_bytes_per_weight,
432                                             num_bytes_per_bias,
433                                             num_filters,
434                                             num_filter_rows,
435                                             num_filter_coefficients,
436                                             num_feature_maps,
437                                             num_feature_map_rows,
438                                             num_feature_map_columns,
439                                             weight_scale_factor,
440                                             output_scale_factor,
441                                             (void *&) ptr_inputs,
442                                             (void *&) ptr_outputs,
443                                             (void *&) ptr_filters,
444                                             (void *&) ptr_biases,
445                                             true);
446     }
447
448
449
450     // TODO: this functions accepted component_index only used in legacy code
451     void InitMaxpoolComponent(uint32_t component_index,
452                               uint32_t num_rows_in,
453                               uint32_t num_columns_in,
454                               uint32_t num_rows_out,
455                               uint32_t num_columns_out,
456                               uint32_t num_bytes_per_input,
457                               uint32_t num_bytes_per_output,
458                               uint32_t num_pool_size,
459                               uint32_t num_pool_step,
460                               uint32_t num_pool_stride,
461                               bool do_sum_not_max,
462                               float output_scale_factor,
463                               void * ptr_inputs,
464                               void * ptr_outputs) {
465         InitMaxpoolComponentPrivate(component[component_index],
466             num_rows_in,
467             num_columns_in,
468             num_rows_out,
469             num_columns_out,
470             num_bytes_per_input,
471             num_bytes_per_output,
472             num_pool_size,
473             num_pool_step,
474             num_pool_stride,
475             do_sum_not_max,
476             output_scale_factor,
477             (void *&) ptr_inputs,
478             (void *&) ptr_outputs,
479             false);
480     }
481
482     template<class A, class B>
483     static void InitMaxpoolComponent(intel_dnn_component_t &cmp,
484                               uint32_t num_rows_in,
485                               uint32_t num_columns_in,
486                               uint32_t num_rows_out,
487                               uint32_t num_columns_out,
488                               uint32_t num_bytes_per_input,
489                               uint32_t num_bytes_per_output,
490                               uint32_t num_pool_size,
491                               uint32_t num_pool_step,
492                               uint32_t num_pool_stride,
493                               bool do_sum_not_max,
494                               float output_scale_factor,
495                               A *&ptr_inputs,
496                               B *&ptr_outputs) {
497         InitMaxpoolComponentPrivate(cmp,
498                                     num_rows_in,
499                                     num_columns_in,
500                                     num_rows_out,
501                                     num_columns_out,
502                                     num_bytes_per_input,
503                                     num_bytes_per_output,
504                                     num_pool_size,
505                                     num_pool_step,
506                                     num_pool_stride,
507                                     do_sum_not_max,
508                                     output_scale_factor,
509                                     (void *&) ptr_inputs,
510                                     (void *&) ptr_outputs,
511                                     true);
512     }
513
514
515
516
517     void InitPiecewiseLinearComponent(uint32_t component_index,
518                                       DnnActivation function_id,
519                                       intel_dnn_orientation_t orientation,
520                                       uint32_t num_rows,
521                                       uint32_t num_columns,
522                                       uint32_t num_bytes_per_input,
523                                       uint32_t num_bytes_per_output,
524                                       uint32_t num_segments,
525                                       float output_scale_factor,
526                                       void * ptr_inputs,
527                                       void * ptr_outputs,
528                                       intel_pwl_segment_t *ptr_segments) {
529         InitPiecewiseLinearComponentPrivate(component[component_index],
530                                             function_id,
531                                             orientation,
532                                             num_rows,
533                                             num_columns,
534                                             num_bytes_per_input,
535                                             num_bytes_per_output,
536                                             num_segments,
537                                             output_scale_factor,
538                                             ptr_inputs,
539                                             ptr_outputs,
540                                             ptr_segments,
541                                             false);
542     }
543     template<class A, class B>
544     static void InitPiecewiseLinearComponent(intel_dnn_component_t &cmp,
545                                       DnnActivation function_id,
546                                       intel_dnn_orientation_t orientation,
547                                       uint32_t num_rows,
548                                       uint32_t num_columns,
549                                       uint32_t num_bytes_per_input,
550                                       uint32_t num_bytes_per_output,
551                                       uint32_t num_segments,
552                                       float output_scale_factor,
553                                       A *&ptr_inputs,
554                                       B *&ptr_outputs,
555                                       intel_pwl_segment_t *ptr_segments) {
556         InitPiecewiseLinearComponentPrivate(cmp,
557                                             function_id,
558                                             orientation,
559                                             num_rows,
560                                             num_columns,
561                                             num_bytes_per_input,
562                                             num_bytes_per_output,
563                                             num_segments,
564                                             output_scale_factor,
565                                             (void *&) ptr_inputs,
566                                             (void *&) ptr_outputs,
567                                             ptr_segments,
568                                             true);
569     }
570
571
572     void InitRecurrentComponent(uint32_t component_index,
573                                 uint32_t num_rows,
574                                 uint32_t num_columns_in,
575                                 uint32_t num_columns_out,
576                                 uint32_t num_bytes_per_input,
577                                 uint32_t num_bytes_per_output,
578                                 uint32_t num_vector_delay,
579                                 uint32_t num_bytes_per_weight,
580                                 uint32_t num_bytes_per_bias,
581                                 float weight_scale_factor,
582                                 float output_scale_factor,
583                                 void *ptr_inputs,
584                                 void *ptr_feedbacks,
585                                 void *ptr_outputs,
586                                 void *ptr_weights,
587                                 void *ptr_biases);
588     void InitInterleaveComponent(uint32_t component_index,
589                                  uint32_t num_rows,
590                                  uint32_t num_columns,
591                                  uint32_t num_bytes_per_input,
592                                  uint32_t num_bytes_per_output,
593                                  float output_scale_factor,
594                                  void *ptr_inputs,
595                                  void *ptr_outputs);
596     void InitDeinterleaveComponent(uint32_t component_index,
597                                    uint32_t num_rows,
598                                    uint32_t num_columns,
599                                    uint32_t num_bytes_per_input,
600                                    uint32_t num_bytes_per_output,
601                                    float output_scale_factor,
602                                    void *ptr_inputs,
603                                    void *ptr_outputs);
604     void InitCopyComponent(uint32_t component_index,
605                            intel_dnn_orientation_t orientation,
606                            uint32_t num_rows_in,
607                            uint32_t num_columns_in,
608                            uint32_t num_rows_out,
609                            uint32_t num_columns_out,
610                            uint32_t num_bytes_per_input,
611                            uint32_t num_bytes_per_output,
612                            float output_scale_factor,
613                            uint32_t num_copy_rows,
614                            uint32_t num_copy_columns,
615                            void *ptr_inputs,
616                            void *ptr_outputs) {
617         InitCopyComponentPrivate(component[component_index],
618                                  orientation,
619                                  num_rows_in,
620                                  num_columns_in,
621                                  num_rows_out,
622                                  num_columns_out,
623                                  num_bytes_per_input,
624                                  num_bytes_per_output,
625                                  output_scale_factor,
626                                  num_copy_rows,
627                                  num_copy_columns,
628                                  ptr_inputs,
629                                  ptr_outputs,
630                                  false);
631     }
632
633     template<class A, class B>
634     static  void InitCopyComponent(intel_dnn_component_t &cmp,
635                                    intel_dnn_orientation_t orientation,
636                                    uint32_t num_rows_in,
637                                    uint32_t num_columns_in,
638                                    uint32_t num_rows_out,
639                                    uint32_t num_columns_out,
640                                    uint32_t num_bytes_per_input,
641                                    uint32_t num_bytes_per_output,
642                                    float output_scale_factor,
643                                    uint32_t num_copy_rows,
644                                    uint32_t num_copy_columns,
645                                    A *&ptr_inputs,
646                                    B *&ptr_outputs) {
647         InitCopyComponentPrivate(cmp,
648                                  orientation,
649                                  num_rows_in,
650                                  num_columns_in,
651                                  num_rows_out,
652                                  num_columns_out,
653                                  num_bytes_per_input,
654                                  num_bytes_per_output,
655                                  output_scale_factor,
656                                  num_copy_rows,
657                                  num_copy_columns,
658                                  (void *&) ptr_inputs,
659                                  (void *&) ptr_outputs,
660                                  true);
661     }
662     void AddComponents(uint32_t num_components_to_add);
663     void ClearComponent(uint32_t component_index);
664     void ClearState();
665     uint32_t CopyActiveList(std::vector<std::vector<uint32_t> > &active_list, uint32_t list_index);
666     void Propagate();
667     intel_dnn_macro_operation_t MacroOperation(uint32_t component_index);
668     void SetMacroOperation(uint32_t component_index, intel_dnn_macro_operation_t macro_operation);
669     float InputScaleFactor(uint32_t component_index);
670     float WeightScaleFactor(uint32_t component_index);
671     float OutputScaleFactor(uint32_t component_index) {
672         return OutputScaleFactor(component[component_index]);
673     }
674     float OutputScaleFactor(intel_dnn_component_t &comp);
675     void SetInputScaleFactor(float scale_factor) { input_scale_factor_ = scale_factor; }
676     void SetOutputScaleFactor(uint32_t component_index, float scale_factor);
677     void PrintOutputs(uint32_t component_index);
678     uint32_t CompareScores(void *ptr_scores, intel_score_error_t *score_error, uint32_t num_frames);
679     void WriteGraphWizModel(const char *filename);
680     void WriteDnnText(const char *filename, intel_dnn_number_type_t number_type);
681     uint32_t MemoryRequiredToReadDnnText(const char *filename);
682     void ReadDnnText(const char *filename, void *ptr_memory, uint32_t num_memory_bytes, float *ptr_scale_in);
683
684     void InitGNAStruct(intel_nnet_type_t *ptr_nnet);
685     void DestroyGNAStruct(intel_nnet_type_t *ptr_nnet);
686     void GetScaledOutput(float *ptr_output, uint32_t component_index);
687     uint32_t *ptr_active_outputs() { return (ptr_active_outputs_); }
688     uint32_t num_active_outputs() { return (num_active_outputs_); }
689     uint32_t num_gna_layers() {
690         uint32_t num_layers = 0;
691         for (uint32_t i = 0; i < component.size(); i++) {
692             if ((component[i].operation == kDnnAffineOp) || (component[i].operation == kDnnDiagonalOp)
693                 || (component[i].operation == kDnnConvolutional1dOp) || (component[i].operation == kDnnCopyOp)
694                 || (component[i].operation == kDnnDeinterleaveOp) || (component[i].operation == kDnnInterleaveOp)
695                 || (component[i].operation == kDnnRecurrentOp)) {
696                 num_layers++;
697             }
698         }
699         return (num_layers);
700     }
// Returns 0 when `component` is empty. Otherwise reads the first component: num_columns_in if its
// orientation_in equals kDnnInterleavedOrientation, num_rows_in for any other orientation.
701     uint32_t num_group_in() {
702         const bool interleaved = !component.empty() && (component[0].orientation_in == kDnnInterleavedOrientation);
703         return component.empty() ? 0 : (interleaved ? component[0].num_columns_in : component[0].num_rows_in);
704     }
705     uint32_t num_group_out() {
706         return ((component.size() > 0) ? ((component[component.size() - 1].orientation_out
707             == kDnnInterleavedOrientation) ? component[component.size() - 1].num_columns_out : component[
708                                               component.size() - 1].num_rows_out) : 0);
709     }
710
// Data members declared ahead of the `private:` section.
// Only num_rotate_rows and num_rotate_columns have in-class initializers (0); the other members have none here.
// NOTE(review): where the uninitialized members get their values is not visible in this excerpt - confirm.
711     std::vector<intel_dnn_component_t> component;  // indexed and scanned by the inline accessors of this class
712     uint32_t num_left_context;
713     uint32_t num_right_context;
714     bool do_rotate_input;
715     uint32_t num_rotate_rows = 0;
716     uint32_t num_rotate_columns = 0;
717     DnnSoftmaxType softmax_type;
718     uint32_t *ptr_sumgroup_sizes;  // NOTE(review): presumably num_sumgroup_sizes entries; ownership not visible here
719     uint32_t num_sumgroup_sizes;
720     float *ptr_priors;  // NOTE(review): element count and ownership are not visible here - confirm
721
// Declarations only; the definitions are not part of this header excerpt.
// WriteInputAndOutputTextGNA is static and works on the intel_nnet_type_t passed in, so it needs no instance.
// NOTE(review): output destinations and the role of BeginNewWrite are not visible here - confirm.
722     void WriteInputAndOutputText();
723     static void WriteInputAndOutputTextGNA(intel_nnet_type_t * nnet);
724     void BeginNewWrite();
725
// Private state. None of these members has an in-class initializer in this header.
726  private:
727     void *ptr_dnn_memory_;  // NOTE(review): presumably a buffer of num_bytes_dnn_memory_ bytes - confirm
728     uint32_t num_bytes_dnn_memory_;
729     uint32_t *ptr_active_outputs_;  // returned unchanged by ptr_active_outputs()
730     uint32_t num_active_outputs_;  // returned by num_active_outputs()
731     intel_dnn_number_type_t number_type_;
732     float input_scale_factor_;  // written by SetInputScaleFactor()
733
// Static helper taking the component to work on (cmp) plus the settings of a copy operation.
// Declaration only; the definition is not part of this header excerpt.
// ptr_inputs / ptr_outputs are references to void pointers, so the callee is able to rebind the caller's pointers.
// NOTE(review): presumably fills in cmp; the meaning of postInitMem is not visible here - confirm.
734     static void InitCopyComponentPrivate(intel_dnn_component_t &cmp,
735                                          intel_dnn_orientation_t orientation,
736                                          uint32_t num_rows_in,
737                                          uint32_t num_columns_in,
738                                          uint32_t num_rows_out,
739                                          uint32_t num_columns_out,
740                                          uint32_t num_bytes_per_input,
741                                          uint32_t num_bytes_per_output,
742                                          float output_scale_factor,
743                                          uint32_t num_copy_rows,
744                                          uint32_t num_copy_columns,
745                                          void *&ptr_inputs,
746                                          void *&ptr_outputs,
747                                          bool postInitMem);
748
// Static helper taking the component to work on (cmp) plus the settings of a pooling operation.
// Declaration only; the definition is not part of this header excerpt.
// ptr_inputs / ptr_outputs are references to void pointers, so the callee is able to rebind the caller's pointers.
// NOTE(review): do_sum_not_max presumably selects sum pooling instead of max pooling; the meaning of
// postInitMem is not visible here - confirm both against the definition.
749     static void InitMaxpoolComponentPrivate(intel_dnn_component_t &cmp,
750                                      uint32_t num_rows_in,
751                                      uint32_t num_columns_in,
752                                      uint32_t num_rows_out,
753                                      uint32_t num_columns_out,
754                                      uint32_t num_bytes_per_input,
755                                      uint32_t num_bytes_per_output,
756                                      uint32_t num_pool_size,
757                                      uint32_t num_pool_step,
758                                      uint32_t num_pool_stride,
759                                      bool do_sum_not_max,
760                                      float output_scale_factor,
761                                      void *&ptr_inputs,
762                                      void *&ptr_outputs,
763                                      bool   postInitMem);
764
// Static helper taking the component to work on (cmp), an activation descriptor (function_id) and a
// segment table (ptr_segments, num_segments). Declaration only; the definition is not in this excerpt.
// ptr_inputs / ptr_outputs are references to void pointers, so the callee is able to rebind the caller's pointers.
// NOTE(review): who owns ptr_segments and what postInitMem controls are not visible here - confirm.
765     static void InitPiecewiseLinearComponentPrivate(intel_dnn_component_t &cmp,
766                                              DnnActivation function_id,
767                                              intel_dnn_orientation_t orientation,
768                                              uint32_t num_rows,
769                                              uint32_t num_columns,
770                                              uint32_t num_bytes_per_input,
771                                              uint32_t num_bytes_per_output,
772                                              uint32_t num_segments,
773                                              float   output_scale_factor,
774                                              void *& ptr_inputs,
775                                              void *& ptr_outputs,
776                                              intel_pwl_segment_t *ptr_segments,
777                                              bool    postInitMem);
778
// Static helper taking the component to work on (comp) plus the settings of a 1-D convolution:
// tensor dimensions, element widths in bytes, filter and feature-map geometry, and two scale factors.
// Declaration only; the definition is not part of this header excerpt.
// The four buffer arguments are references to void pointers, so the callee is able to rebind the caller's pointers.
// NOTE(review): the meaning of postInitMem is not visible here - confirm against the definition.
779     static void InitConvolutional1DComponentPrivate(intel_dnn_component_t &comp,
780                                              uint32_t num_rows_in,
781                                              uint32_t num_columns_in,
782                                              uint32_t num_rows_out,
783                                              uint32_t num_columns_out,
784                                              uint32_t num_bytes_per_input,
785                                              uint32_t num_bytes_per_output,
786                                              uint32_t num_bytes_per_weight,
787                                              uint32_t num_bytes_per_bias,
788                                              uint32_t num_filters,
789                                              uint32_t num_filter_rows,
790                                              uint32_t num_filter_coefficients,
791                                              uint32_t num_feature_maps,
792                                              uint32_t num_feature_map_rows,
793                                              uint32_t num_feature_map_columns,
794                                              float   weight_scale_factor,
795                                              float   output_scale_factor,
796                                              void *& ptr_inputs,
797                                              void *& ptr_outputs,
798                                              void *& ptr_filters,
799                                              void *& ptr_biases,
800                                              bool    postInitMem);
801
// Static helper taking the component to work on (comp) plus the settings of an affine operation:
// dimensions, element widths in bytes, weight/output scale factors and four buffer pointers.
// Declaration only; the definition is not part of this header excerpt.
// The four buffer arguments are references to void pointers, so the callee is able to rebind the caller's pointers.
// NOTE(review): isDiag presumably selects the diagonal variant (cf. kDnnDiagonalOp); the meaning of
// postInitMem is not visible here - confirm both against the definition.
802     static void InitAffineComponentPrivate(intel_dnn_component_t &comp,
803                                            uint32_t num_rows_in,
804                                            uint32_t num_columns,
805                                            uint32_t num_rows_out,
806                                            uint32_t num_bytes_per_input,
807                                            uint32_t num_bytes_per_output,
808                                            uint32_t num_bytes_per_weight,
809                                            uint32_t num_bytes_per_bias,
810                                            float  weight_scale_factor,
811                                            float  output_scale_factor,
812                                            void *&ptr_inputs,
813                                            void *&ptr_outputs,
814                                            void *&ptr_weights,
815                                            void *&ptr_biases,
816                                            bool   isDiag,
817                                            bool   postInitMem);
818 };
819
// Free-function declarations; the definitions are not part of this header excerpt.
// NOTE(review): purposes are inferred from names and signatures only - confirm against the definitions.
820 void PlotFloatIntDnn(AmIntelDnn *dnn, AmIntelDnn *dnn_int);
// NOTE(review): both AmIntelDnn arguments are taken by value, so every call copies (or moves) two whole objects.
821 bool isCompatibleDnn(AmIntelDnn dnn1, AmIntelDnn dnn2);
822 void ClearScoreError(intel_score_error_t *error);
823 void UpdateScoreError(intel_score_error_t *error, intel_score_error_t *total_error);
// Takes separate output and input float buffers plus their element counts.
// NOTE(review): whether the two buffers may alias is not visible here - confirm.
824 void SoftmaxGoogle(float *ptr_output, float *ptr_input, const uint32_t num_outputs, const uint32_t num_inputs);