Publishing 2019 R1 content
[platform/upstream/dldt.git] / inference-engine / src / gna_plugin / lstm.hpp
1 // Copyright (C) 2018-2019 Intel Corporation
2 // SPDX-License-Identifier: Apache-2.0
3 //
4
5 #pragma once
6
// Component slot indices for one (unidirectional) LSTM macro layer.
// Each macro expands to `component_index` plus a fixed offset in 0..18.
// `component_index` is NOT a macro parameter: a variable with exactly that
// name must be in scope at every place one of these macros is used.
// NOTE(review): the names suggest gate / activation / projection stages of the
// LSTM cell; the exact meaning of each slot is defined by the code that uses
// these macros — confirm there.
7 #define LSTM_GIFO_X_C (component_index)
8 #define LSTM_GIFO_R_C (component_index+1)
9 #define LSTM_INPUT_GATE_C (component_index+2)
10 #define LSTM_INPUT_SIGMOID_C (component_index+3)
11 #define LSTM_FORGET_GATE_C (component_index+4)
12 #define LSTM_FORGET_SIGMOID_C (component_index+5)
13 #define LSTM_CELL_INPUT1_C (component_index+6)
14 #define LSTM_CELL_INPUT1_TANH_C (component_index+7)
15 #define LSTM_CELL_INPUT2_C (component_index+8)
16 #define LSTM_CELL_OUTPUT1_C (component_index+9)
17 #define LSTM_CELL_TANH_C (component_index+10)
18 #define LSTM_CELL_OUTPUT2_C (component_index+11)
19 #define LSTM_CELL_CLIPPING_C (component_index+12)
20 #define LSTM_OUTPUT_GATE_C (component_index+13)
21 #define LSTM_OUTPUT_SIGMOID_C (component_index+14)
22 #define LSTM_HIDDEN_C (component_index+15)
23 #define LSTM_HIDDEN_IDENTITY_C (component_index+16)
24 #define LSTM_PROJECTED_C (component_index+17)
25 #define LSTM_PROJECTED_IDENTITY_C (component_index+18)
// Total number of slots above (offsets 0..18). Keep in sync when a slot is
// added or removed.
26 #define NUM_LSTM_COMPONENTS 19
27
// Component slot indices for one bidirectional LSTM macro layer.
// Each macro expands to `component_index` plus a fixed offset in 0..33.
// Offsets 0..16 carry the `_FW_` names and offsets 17..33 repeat the same
// 17 slot names with `_BW_`.
// `component_index` is NOT a macro parameter: a variable with exactly that
// name must be in scope at every place one of these macros is used.
// NOTE(review): FW / BW presumably stand for the forward and backward
// direction of the bidirectional LSTM — confirm against the users.
28 #define BILSTM_GIFO_X_FW_C (component_index)
29 #define BILSTM_GIFO_R_FW_C (component_index+1)
30 #define BILSTM_INPUT_GATE_FW_C (component_index+2)
31 #define BILSTM_INPUT_SIGMOID_FW_C (component_index+3)
32 #define BILSTM_FORGET_GATE_FW_C (component_index+4)
33 #define BILSTM_FORGET_SIGMOID_FW_C (component_index+5)
34 #define BILSTM_CELL_INPUT1_FW_C (component_index+6)
35 #define BILSTM_CELL_INPUT1_TANH_FW_C (component_index+7)
36 #define BILSTM_CELL_INPUT2_FW_C (component_index+8)
37 #define BILSTM_CELL_GATE_FW_C (component_index+9)
38 #define BILSTM_CELL_OUTPUT1_FW_C (component_index+10)
39 #define BILSTM_CELL_TANH_FW_C (component_index+11)
40 #define BILSTM_CELL_COPY_FW_C (component_index+12)
41 #define BILSTM_OUTPUT_GATE_FW_C (component_index+13)
42 #define BILSTM_OUTPUT_SIGMOID_FW_C (component_index+14)
43 #define BILSTM_HIDDEN_FW_C (component_index+15)
44 #define BILSTM_HIDDEN_IDENTITY_FW_C (component_index+16)
45 #define BILSTM_GIFO_X_BW_C (component_index+17)
46 #define BILSTM_GIFO_R_BW_C (component_index+18)
47 #define BILSTM_INPUT_GATE_BW_C (component_index+19)
48 #define BILSTM_INPUT_SIGMOID_BW_C (component_index+20)
49 #define BILSTM_FORGET_GATE_BW_C (component_index+21)
50 #define BILSTM_FORGET_SIGMOID_BW_C (component_index+22)
51 #define BILSTM_CELL_INPUT1_BW_C (component_index+23)
52 #define BILSTM_CELL_INPUT1_TANH_BW_C (component_index+24)
53 #define BILSTM_CELL_INPUT2_BW_C (component_index+25)
54 #define BILSTM_CELL_GATE_BW_C (component_index+26)
55 #define BILSTM_CELL_OUTPUT1_BW_C (component_index+27)
56 #define BILSTM_CELL_TANH_BW_C (component_index+28)
57 #define BILSTM_CELL_COPY_BW_C (component_index+29)
58 #define BILSTM_OUTPUT_GATE_BW_C (component_index+30)
59 #define BILSTM_OUTPUT_SIGMOID_BW_C (component_index+31)
60 #define BILSTM_HIDDEN_BW_C (component_index+32)
61 #define BILSTM_HIDDEN_IDENTITY_BW_C (component_index+33)
// Total number of slots above (17 FW + 17 BW, offsets 0..33). Keep in sync
// when a slot is added or removed.
62 #define NUM_BILSTM_COMPONENTS 34
63
64 #include "gna-api.h"
65
// Fixed numeric constants for the LSTM macro layer.
// The IG / CI1 / CO1 / OG / HID suffixes match the member names (ig, ci1,
// co1, og, hid) of the LSTM layer structs declared later in this header.
// NOTE(review): presumably each ACTIVATION_SCALE_* is the quantization scale
// factor applied at that stage, MAX_WEIGHT_IFO_GATE a bound on input/forget/
// output gate weights, and NUM_WEIGHT_BYTES_* the weight width in bytes —
// none of this is visible in this header; confirm against the users.
66 #define ACTIVATION_SCALE_IG  1024.0f
67 #define ACTIVATION_SCALE_CI1 1024.0f
68 #define ACTIVATION_SCALE_CO1 2048.0f
69 #define ACTIVATION_SCALE_OG  2048.0f
70 #define ACTIVATION_SCALE_HID 2048.0f
71 #define MAX_WEIGHT_IFO_GATE  1024.0f
72 #define NUM_WEIGHT_BYTES_IN        2
73 #define NUM_WEIGHT_BYTES_PROJ    2
74
// Plain aggregate of summary statistics for one buffer: four float fields
// (min, max, sum, sum_squared) and two unsigned 32-bit counters
// (num_saturations, num_elements).
// NOTE(review): presumably sum / sum_squared are running totals from which
// mean and variance can be derived, and num_saturations counts values that
// hit the quantized range limits — confirm against the code that fills it.
75 typedef struct {
76     float min;
77     float max;
78     float sum;
79     float sum_squared;
80     uint32_t num_saturations;
81     uint32_t num_elements;
82 } intel_buffer_stats_t;
83
// One intel_nnet_layer_t per stage of a projected LSTM macro layer.
// The struct has 11 members, which equals NUM_LSTM_LAYERS defined in this
// header. The per-member comments name the stage each layer belongs to.
// NOTE(review): intel_nnet_layer_t is not declared in this file; presumably it
// comes from "gna-api.h" — confirm.
84 typedef struct {
85     intel_nnet_layer_t in;        // combined input transform
86     intel_nnet_layer_t rec;        // combined recurrent transform
87     intel_nnet_layer_t ig;        // input gate
88     intel_nnet_layer_t fg;        // forget gate
89     intel_nnet_layer_t ci1;        // cell gate input part 1
90     intel_nnet_layer_t ci2;        // cell gate input part 2
91     intel_nnet_layer_t co1;        // cell gate output part 1
92     intel_nnet_layer_t co2;        // cell gate output part 2
93     intel_nnet_layer_t og;        // output gate
94     intel_nnet_layer_t hid;        // hidden gated output
95     intel_nnet_layer_t proj;    // projected output
96 } intel_lstm_projected_layer_t;
97
// One intel_affine_layer_t pointer per stage of a projected LSTM macro layer.
// Member names and order are the same as in intel_lstm_projected_layer_t.
// NOTE(review): who owns the pointed-to objects, and whether any pointer may
// be null, is not visible in this header — confirm against the users.
98 typedef struct {
99     intel_affine_layer_t *in;        // combined input transform
100     intel_affine_layer_t *rec;        // combined recurrent transform
101     intel_affine_layer_t *ig;        // input gate
102     intel_affine_layer_t *fg;        // forget gate
103     intel_affine_layer_t *ci1;        // cell gate input part 1
104     intel_affine_layer_t *ci2;        // cell gate input part 2
105     intel_affine_layer_t *co1;        // cell gate output part 1
106     intel_affine_layer_t *co2;        // cell gate output part 2
107     intel_affine_layer_t *og;        // output gate
108     intel_affine_layer_t *hid;        // hidden gated output
109     intel_affine_layer_t *proj;        // projected output
110 } intel_lstm_projected_transform_t;
111
// One intel_buffer_stats_t record per stage of a projected LSTM macro layer.
// Member names and order are the same as in intel_lstm_projected_layer_t.
// NOTE(review): presumably each record describes the output buffer of the
// corresponding stage — confirm against the code that fills it.
112 typedef struct {
113     intel_buffer_stats_t in;        // combined input transform
114     intel_buffer_stats_t rec;        // combined recurrent transform
115     intel_buffer_stats_t ig;        // input gate
116     intel_buffer_stats_t fg;        // forget gate
117     intel_buffer_stats_t ci1;        // cell gate input part 1
118     intel_buffer_stats_t ci2;        // cell gate input part 2
119     intel_buffer_stats_t co1;        // cell gate output part 1
120     intel_buffer_stats_t co2;        // cell gate output part 2
121     intel_buffer_stats_t og;        // output gate
122     intel_buffer_stats_t hid;        // hidden gated output
123     intel_buffer_stats_t proj;    // projected output
124 } intel_lstm_projected_stats_t;
125
// Same stages as intel_lstm_projected_layer_t but without the `in` member
// (10 intel_nnet_layer_t members, starting at `rec`).
// Used as the element type of `part[4]` in intel_lstm_projected_layer_g4_t,
// which keeps a single `in` layer outside the array.
126 typedef struct {
127     intel_nnet_layer_t rec;        // combined recurrent transform
128     intel_nnet_layer_t ig;        // input gate
129     intel_nnet_layer_t fg;        // forget gate
130     intel_nnet_layer_t ci1;        // cell gate input part 1
131     intel_nnet_layer_t ci2;        // cell gate input part 2
132     intel_nnet_layer_t co1;        // cell gate output part 1
133     intel_nnet_layer_t co2;        // cell gate output part 2
134     intel_nnet_layer_t og;        // output gate
135     intel_nnet_layer_t hid;        // hidden gated output
136     intel_nnet_layer_t proj;    // projected output
137 } intel_lstm_partial_layer_t;
138
// Same stages as intel_lstm_projected_transform_t but without the `in`
// pointer (10 intel_affine_layer_t pointers, starting at `rec`).
// Used as the element type of `part[4]` in
// intel_lstm_projected_transform_g4_t.
// NOTE(review): pointer ownership and nullability are not visible here.
139 typedef struct {
140     intel_affine_layer_t *rec;        // combined recurrent transform
141     intel_affine_layer_t *ig;        // input gate
142     intel_affine_layer_t *fg;        // forget gate
143     intel_affine_layer_t *ci1;        // cell gate input part 1
144     intel_affine_layer_t *ci2;        // cell gate input part 2
145     intel_affine_layer_t *co1;        // cell gate output part 1
146     intel_affine_layer_t *co2;        // cell gate output part 2
147     intel_affine_layer_t *og;        // output gate
148     intel_affine_layer_t *hid;        // hidden gated output
149     intel_affine_layer_t *proj;        // projected output
150 } intel_lstm_partial_transform_t;
151
// Same stages as intel_lstm_projected_stats_t but without the `in` record
// (10 intel_buffer_stats_t members, starting at `rec`).
// Used as the element type of `part[4]` in intel_lstm_projected_stats_g4_t.
152 typedef struct {
153     intel_buffer_stats_t rec;        // combined recurrent transform
154     intel_buffer_stats_t ig;        // input gate
155     intel_buffer_stats_t fg;        // forget gate
156     intel_buffer_stats_t ci1;        // cell gate input part 1
157     intel_buffer_stats_t ci2;        // cell gate input part 2
158     intel_buffer_stats_t co1;        // cell gate output part 1
159     intel_buffer_stats_t co2;        // cell gate output part 2
160     intel_buffer_stats_t og;        // output gate
161     intel_buffer_stats_t hid;        // hidden gated output
162     intel_buffer_stats_t proj;    // projected output
163 } intel_lstm_partial_stats_t;
164
// Layer set for the "g4" variant of the projected LSTM macro layer.
// In declaration order: one `in` layer, `dintl`, `intl1`..`intl4`, four
// intel_lstm_partial_layer_t entries in `part`, and a final `intl` layer.
// That is 1 + 1 + 4 + 4*10 + 1 = 47 intel_nnet_layer_t objects, which equals
// NUM_LSTM_G4_LAYERS defined in this header.
// NOTE(review): "g4" presumably means a group size of 4 — confirm.
// NOTE(review): the member comments describe `dintl` as "interleave" and
// `intl1`..`intl4` as "deinterleave", while the member names suggest the
// opposite. One of the two is likely wrong — confirm against the code that
// initializes these layers before relying on either.
165 typedef struct {
166     intel_nnet_layer_t in;                // combined input transform
167     intel_nnet_layer_t dintl;            // interleave x8
168     intel_nnet_layer_t intl1;            // deinterleave x2
169     intel_nnet_layer_t intl2;            // deinterleave x2
170     intel_nnet_layer_t intl3;            // deinterleave x2
171     intel_nnet_layer_t intl4;            // deinterleave x2
172     intel_lstm_partial_layer_t part[4];    // unrolled part
173     intel_nnet_layer_t intl;            // interleave x4
174 } intel_lstm_projected_layer_g4_t;
175
// Affine-transform pointers for the "g4" variant: one `in` pointer plus four
// intel_lstm_partial_transform_t entries in `part`.
// Unlike intel_lstm_projected_layer_g4_t, this struct has no members for the
// interleave / deinterleave layers.
176 typedef struct {
177     intel_affine_layer_t *in;                // combined input transform
178     intel_lstm_partial_transform_t part[4];  // unrolled part
179 } intel_lstm_projected_transform_g4_t;
180
// Buffer statistics for the "g4" variant: one `in` record plus four
// intel_lstm_partial_stats_t entries in `part`.
// Unlike intel_lstm_projected_layer_g4_t, this struct has no members for the
// interleave / deinterleave layers.
181 typedef struct {
182     intel_buffer_stats_t in;            // combined input transform
183     intel_lstm_partial_stats_t part[4];    // unrolled part
184 } intel_lstm_projected_stats_g4_t;
185
// Sizes of the two name tables declared below.
// 11 equals the member count of intel_lstm_projected_layer_t; 47 equals the
// number of intel_nnet_layer_t objects in intel_lstm_projected_layer_g4_t
// (1 + 1 + 4 + 4*10 + 1). Update these if either struct changes.
186 #define NUM_LSTM_LAYERS 11
187 #define NUM_LSTM_G4_LAYERS 47
188
// Tables of C strings, declared here and defined in another translation unit.
// NOTE(review): presumably entry i is the printable name of the i-th layer of
// the corresponding struct, in member order — confirm against the definition.
189 extern const char *intel_lstm_projected_layer_name[NUM_LSTM_LAYERS];
190 extern const char *intel_lstm_projected_layer_g4_name[NUM_LSTM_G4_LAYERS];
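191 /* NOTE: every declaration from here to the closing comment marker is commented out and is therefore not part of the compiled interface of this header.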
192 void GetLstmBufferStats(intel_lstm_projected_layer_t *ptr_layer, std::vector<intel_lstm_projected_stats_t> &stats);
193 void UpdateLstmBufferStats(std::vector<intel_lstm_projected_stats_t> &accum, std::vector<intel_lstm_projected_stats_t> stats);
194 void ClearLstmBufferStats(std::vector<intel_lstm_projected_stats_t> &stats);
195 void PrintLstmBufferStats(std::string preamble, std::vector<intel_lstm_projected_stats_t> stats);
196 uint32_t NumBytesLstmMacroLayer(uint32_t num_inputs, uint32_t num_outputs, uint32_t num_cells, uint32_t num_group_size, uint32_t layer_num, bool is_compact);
197 void InitLstmMacroLayerG1(intel_lstm_projected_layer_t *ptr_layer, intel_lstm_projected_transform_t *ptr_transform, uint32_t num_inputs, uint32_t num_outputs, uint32_t num_cells);
198 void InitLstmMacroLayerG4(intel_lstm_projected_layer_g4_t *ptr_layer, intel_lstm_projected_transform_g4_t *ptr_transform, uint32_t num_inputs, uint32_t num_outputs, uint32_t num_cells);
199 void AllocateLstmMacroLayerG1(intel_lstm_projected_layer_t *ptr_layer, intel_lstm_projected_transform_t *ptr_transform, intel_shared_outputs scratch, uint8_t **ptr_memory, uint32_t *ptr_num_bytes_used, uint32_t num_memory_bytes, bool is_compact);
200 void AllocateLstmMacroLayerG4(intel_lstm_projected_layer_g4_t *ptr_layer, intel_lstm_projected_transform_g4_t *ptr_transform, intel_shared_outputs scratch, uint8_t **ptr_memory, uint32_t *ptr_num_bytes_used, uint32_t num_memory_bytes, bool is_compact);
201 void ConnectLstmMacroLayerG1(intel_lstm_projected_layer_t *ptr_layer, intel_lstm_projected_transform_t *ptr_transform);
202 void ConnectLstmMacroLayerG4(intel_lstm_projected_layer_g4_t *ptr_layer, intel_lstm_projected_transform_g4_t *ptr_transform);
203 void QuantizeLstmMacroLayerG1(std::vector<intel_dnn_component_t> *ptr_component, uint32_t component_index, intel_lstm_projected_transform_t *ptr_transform, float input_scale, gna_scale_factor_t *scale, uint32_t j);
204 void QuantizeLstmMacroLayerG4(std::vector<intel_dnn_component_t> *ptr_component, uint32_t component_index, intel_lstm_projected_transform_g4_t *ptr_transform, float input_scale, gna_scale_factor_t *scale, uint32_t j);
205 void ReQuantizeLstmMacroLayerG1(std::vector<intel_dnn_component_t> *ptr_component, uint32_t component_index, intel_lstm_projected_layer_t *ptr_layer, float input_scale, gna_scale_factor_t *scale, uint32_t j);
206 void ReQuantizeLstmMacroLayerG4(std::vector<intel_dnn_component_t> *ptr_component, uint32_t component_index, intel_lstm_projected_layer_g4_t *ptr_layer, float input_scale, gna_scale_factor_t *scale, uint32_t j);
207 void IntegrityCheckLstmMacroLayer(std::vector<intel_dnn_component_t> *ptr_component, uint32_t component_index, intel_lstm_projected_layer_t *ptr_layer, gna_scale_factor_t *scale, uint32_t j);
208
209 */