1 // Copyright (C) 2018-2019 Intel Corporation
2 // SPDX-License-Identifier: Apache-2.0
// Component slots of a unidirectional LSTM macro layer, written as offsets
// from `component_index`. Each macro expands to an expression that mentions
// `component_index`, so a variable or parameter with exactly that name must be
// in scope wherever one of these macros is used.
// The offsets run 0..18 (one per slot), which is the 19 in NUM_LSTM_COMPONENTS;
// keep that count in sync when adding or removing a slot.
7 #define LSTM_GIFO_X_C (component_index)
8 #define LSTM_GIFO_R_C (component_index+1)
9 #define LSTM_INPUT_GATE_C (component_index+2)
10 #define LSTM_INPUT_SIGMOID_C (component_index+3)
11 #define LSTM_FORGET_GATE_C (component_index+4)
12 #define LSTM_FORGET_SIGMOID_C (component_index+5)
13 #define LSTM_CELL_INPUT1_C (component_index+6)
14 #define LSTM_CELL_INPUT1_TANH_C (component_index+7)
15 #define LSTM_CELL_INPUT2_C (component_index+8)
16 #define LSTM_CELL_OUTPUT1_C (component_index+9)
17 #define LSTM_CELL_TANH_C (component_index+10)
18 #define LSTM_CELL_OUTPUT2_C (component_index+11)
19 #define LSTM_CELL_CLIPPING_C (component_index+12)
20 #define LSTM_OUTPUT_GATE_C (component_index+13)
21 #define LSTM_OUTPUT_SIGMOID_C (component_index+14)
22 #define LSTM_HIDDEN_C (component_index+15)
23 #define LSTM_HIDDEN_IDENTITY_C (component_index+16)
24 #define LSTM_PROJECTED_C (component_index+17)
25 #define LSTM_PROJECTED_IDENTITY_C (component_index+18)
// Total number of LSTM_*_C slots defined above.
26 #define NUM_LSTM_COMPONENTS 19
// Component slots of a bidirectional LSTM macro layer, written as offsets
// from `component_index` (which must be in scope at the point of use, since
// every macro expands to an expression that mentions it).
// The *_FW_C slots occupy offsets 0..16 and the *_BW_C slots repeat the same
// 17-entry sequence at offsets 17..33, giving the 34 in NUM_BILSTM_COMPONENTS.
// presumably FW/BW stand for the forward and backward directions - confirm.
28 #define BILSTM_GIFO_X_FW_C (component_index)
29 #define BILSTM_GIFO_R_FW_C (component_index+1)
30 #define BILSTM_INPUT_GATE_FW_C (component_index+2)
31 #define BILSTM_INPUT_SIGMOID_FW_C (component_index+3)
32 #define BILSTM_FORGET_GATE_FW_C (component_index+4)
33 #define BILSTM_FORGET_SIGMOID_FW_C (component_index+5)
34 #define BILSTM_CELL_INPUT1_FW_C (component_index+6)
35 #define BILSTM_CELL_INPUT1_TANH_FW_C (component_index+7)
36 #define BILSTM_CELL_INPUT2_FW_C (component_index+8)
37 #define BILSTM_CELL_GATE_FW_C (component_index+9)
38 #define BILSTM_CELL_OUTPUT1_FW_C (component_index+10)
39 #define BILSTM_CELL_TANH_FW_C (component_index+11)
40 #define BILSTM_CELL_COPY_FW_C (component_index+12)
41 #define BILSTM_OUTPUT_GATE_FW_C (component_index+13)
42 #define BILSTM_OUTPUT_SIGMOID_FW_C (component_index+14)
43 #define BILSTM_HIDDEN_FW_C (component_index+15)
44 #define BILSTM_HIDDEN_IDENTITY_FW_C (component_index+16)
// Second half: same slot sequence as above, shifted by 17.
45 #define BILSTM_GIFO_X_BW_C (component_index+17)
46 #define BILSTM_GIFO_R_BW_C (component_index+18)
47 #define BILSTM_INPUT_GATE_BW_C (component_index+19)
48 #define BILSTM_INPUT_SIGMOID_BW_C (component_index+20)
49 #define BILSTM_FORGET_GATE_BW_C (component_index+21)
50 #define BILSTM_FORGET_SIGMOID_BW_C (component_index+22)
51 #define BILSTM_CELL_INPUT1_BW_C (component_index+23)
52 #define BILSTM_CELL_INPUT1_TANH_BW_C (component_index+24)
53 #define BILSTM_CELL_INPUT2_BW_C (component_index+25)
54 #define BILSTM_CELL_GATE_BW_C (component_index+26)
55 #define BILSTM_CELL_OUTPUT1_BW_C (component_index+27)
56 #define BILSTM_CELL_TANH_BW_C (component_index+28)
57 #define BILSTM_CELL_COPY_BW_C (component_index+29)
58 #define BILSTM_OUTPUT_GATE_BW_C (component_index+30)
59 #define BILSTM_OUTPUT_SIGMOID_BW_C (component_index+31)
60 #define BILSTM_HIDDEN_BW_C (component_index+32)
61 #define BILSTM_HIDDEN_IDENTITY_BW_C (component_index+33)
// Total number of BILSTM_*_C slots defined above (17 FW + 17 BW).
62 #define NUM_BILSTM_COMPONENTS 34
// Fixed float constants for the LSTM stages. The suffixes (IG, CI1, CO1, OG,
// HID) match the member names ig, ci1, co1, og and hid used by the structs in
// this file.
// NOTE(review): presumably these are the output scale factors used when
// quantizing each stage - their use is not visible here, confirm against the
// quantization code.
66 #define ACTIVATION_SCALE_IG 1024.0f
67 #define ACTIVATION_SCALE_CI1 1024.0f
68 #define ACTIVATION_SCALE_CO1 2048.0f
69 #define ACTIVATION_SCALE_OG 2048.0f
70 #define ACTIVATION_SCALE_HID 2048.0f
// presumably an upper bound on input/forget/output gate weights - TODO confirm.
71 #define MAX_WEIGHT_IFO_GATE 1024.0f
// presumably bytes per weight for the input and projection transforms
// (2 would mean 16-bit weights) - TODO confirm against the callers.
72 #define NUM_WEIGHT_BYTES_IN 2
73 #define NUM_WEIGHT_BYTES_PROJ 2
// intel_buffer_stats_t: a pair of unsigned 32-bit counters kept per buffer.
// presumably num_saturations counts saturated values among the num_elements
// values examined - the code that fills these is not visible here, confirm.
80 uint32_t num_saturations;
81 uint32_t num_elements;
82 } intel_buffer_stats_t;
// intel_lstm_projected_layer_t: one intel_nnet_layer_t, held by value, for each
// of the 11 stages of a projected LSTM macro layer. The member count matches
// NUM_LSTM_LAYERS, and the member names mirror those of
// intel_lstm_projected_transform_t and intel_lstm_projected_stats_t.
85 intel_nnet_layer_t in; // combined input transform
86 intel_nnet_layer_t rec; // combined recurrent transform
87 intel_nnet_layer_t ig; // input gate
88 intel_nnet_layer_t fg; // forget gate
89 intel_nnet_layer_t ci1; // cell gate input part 1
90 intel_nnet_layer_t ci2; // cell gate input part 2
91 intel_nnet_layer_t co1; // cell gate output part 1
92 intel_nnet_layer_t co2; // cell gate output part 2
93 intel_nnet_layer_t og; // output gate
94 intel_nnet_layer_t hid; // hidden gated output
95 intel_nnet_layer_t proj; // projected output
96 } intel_lstm_projected_layer_t;
// intel_lstm_projected_transform_t: one intel_affine_layer_t pointer per stage,
// using the same 11 member names as intel_lstm_projected_layer_t.
// NOTE(review): the members are raw pointers; who owns the pointed-to affine
// layers, and whether they point into the matching layer struct, is not
// visible here - confirm before freeing or copying.
99 intel_affine_layer_t *in; // combined input transform
100 intel_affine_layer_t *rec; // combined recurrent transform
101 intel_affine_layer_t *ig; // input gate
102 intel_affine_layer_t *fg; // forget gate
103 intel_affine_layer_t *ci1; // cell gate input part 1
104 intel_affine_layer_t *ci2; // cell gate input part 2
105 intel_affine_layer_t *co1; // cell gate output part 1
106 intel_affine_layer_t *co2; // cell gate output part 2
107 intel_affine_layer_t *og; // output gate
108 intel_affine_layer_t *hid; // hidden gated output
109 intel_affine_layer_t *proj; // projected output
110 } intel_lstm_projected_transform_t;
// intel_lstm_projected_stats_t: one intel_buffer_stats_t (saturation/element
// counters) per stage, using the same 11 member names as
// intel_lstm_projected_layer_t. This is the element type of the vectors taken
// by the *LstmBufferStats functions declared in this file.
113 intel_buffer_stats_t in; // combined input transform
114 intel_buffer_stats_t rec; // combined recurrent transform
115 intel_buffer_stats_t ig; // input gate
116 intel_buffer_stats_t fg; // forget gate
117 intel_buffer_stats_t ci1; // cell gate input part 1
118 intel_buffer_stats_t ci2; // cell gate input part 2
119 intel_buffer_stats_t co1; // cell gate output part 1
120 intel_buffer_stats_t co2; // cell gate output part 2
121 intel_buffer_stats_t og; // output gate
122 intel_buffer_stats_t hid; // hidden gated output
123 intel_buffer_stats_t proj; // projected output
124 } intel_lstm_projected_stats_t;
// intel_lstm_partial_layer_t: the same stages as intel_lstm_projected_layer_t
// except that there is no `in` member (10 layers instead of 11). It is used as
// the element type of `part[4]` in intel_lstm_projected_layer_g4_t, which
// carries a single `in` layer of its own.
127 intel_nnet_layer_t rec; // combined recurrent transform
128 intel_nnet_layer_t ig; // input gate
129 intel_nnet_layer_t fg; // forget gate
130 intel_nnet_layer_t ci1; // cell gate input part 1
131 intel_nnet_layer_t ci2; // cell gate input part 2
132 intel_nnet_layer_t co1; // cell gate output part 1
133 intel_nnet_layer_t co2; // cell gate output part 2
134 intel_nnet_layer_t og; // output gate
135 intel_nnet_layer_t hid; // hidden gated output
136 intel_nnet_layer_t proj; // projected output
137 } intel_lstm_partial_layer_t;
// intel_lstm_partial_transform_t: one intel_affine_layer_t pointer per stage,
// with the same 10 member names as intel_lstm_partial_layer_t (no `in`).
// Used as the element type of `part[4]` in intel_lstm_projected_transform_g4_t.
// NOTE(review): ownership of the pointed-to affine layers is not visible
// here - confirm before freeing or copying.
140 intel_affine_layer_t *rec; // combined recurrent transform
141 intel_affine_layer_t *ig; // input gate
142 intel_affine_layer_t *fg; // forget gate
143 intel_affine_layer_t *ci1; // cell gate input part 1
144 intel_affine_layer_t *ci2; // cell gate input part 2
145 intel_affine_layer_t *co1; // cell gate output part 1
146 intel_affine_layer_t *co2; // cell gate output part 2
147 intel_affine_layer_t *og; // output gate
148 intel_affine_layer_t *hid; // hidden gated output
149 intel_affine_layer_t *proj; // projected output
150 } intel_lstm_partial_transform_t;
// intel_lstm_partial_stats_t: one intel_buffer_stats_t per stage, with the
// same 10 member names as intel_lstm_partial_layer_t (no `in`).
// Used as the element type of `part[4]` in intel_lstm_projected_stats_g4_t.
153 intel_buffer_stats_t rec; // combined recurrent transform
154 intel_buffer_stats_t ig; // input gate
155 intel_buffer_stats_t fg; // forget gate
156 intel_buffer_stats_t ci1; // cell gate input part 1
157 intel_buffer_stats_t ci2; // cell gate input part 2
158 intel_buffer_stats_t co1; // cell gate output part 1
159 intel_buffer_stats_t co2; // cell gate output part 2
160 intel_buffer_stats_t og; // output gate
161 intel_buffer_stats_t hid; // hidden gated output
162 intel_buffer_stats_t proj; // projected output
163 } intel_lstm_partial_stats_t;
// intel_lstm_projected_layer_g4_t: layer set for the "G4" variant of the
// projected LSTM macro layer. It holds one shared `in` layer, five
// (de)interleave layers, four intel_lstm_partial_layer_t copies and a final
// interleave layer: 1 + 5 + 4 * 10 + 1 = 47 intel_nnet_layer_t in total,
// which equals NUM_LSTM_G4_LAYERS.
// presumably G4 means a group of 4 processed together - confirm.
// NOTE(review): the member names and trailing comments below look swapped
// (`dintl` is described as "interleave", `intl1..intl4` as "deinterleave") -
// confirm which is correct before relying on either.
166 intel_nnet_layer_t in; // combined input transform
167 intel_nnet_layer_t dintl; // interleave x8
168 intel_nnet_layer_t intl1; // deinterleave x2
169 intel_nnet_layer_t intl2; // deinterleave x2
170 intel_nnet_layer_t intl3; // deinterleave x2
171 intel_nnet_layer_t intl4; // deinterleave x2
172 intel_lstm_partial_layer_t part[4]; // unrolled part
173 intel_nnet_layer_t intl; // interleave x4
174 } intel_lstm_projected_layer_g4_t;
// intel_lstm_projected_transform_g4_t: affine-layer pointers for the G4
// variant - one pointer for the shared input transform plus four
// intel_lstm_partial_transform_t entries. Unlike
// intel_lstm_projected_layer_g4_t, it has no members for the
// dintl/intl1..intl4/intl layers.
177 intel_affine_layer_t *in; // combined input transform
178 intel_lstm_partial_transform_t part[4]; // unrolled part
179 } intel_lstm_projected_transform_g4_t;
// intel_lstm_projected_stats_g4_t: buffer statistics for the G4 variant - one
// intel_buffer_stats_t for the shared input transform plus four
// intel_lstm_partial_stats_t entries. It has no counters for the
// dintl/intl1..intl4/intl layers of intel_lstm_projected_layer_g4_t.
182 intel_buffer_stats_t in; // combined input transform
183 intel_lstm_partial_stats_t part[4]; // unrolled part
184 } intel_lstm_projected_stats_g4_t;
// Number of intel_nnet_layer_t members in intel_lstm_projected_layer_t.
186 #define NUM_LSTM_LAYERS 11
// Number of intel_nnet_layer_t in intel_lstm_projected_layer_g4_t:
// 1 (in) + 5 (dintl, intl1..intl4) + 4 * 10 (part[4]) + 1 (intl).
187 #define NUM_LSTM_G4_LAYERS 47

// Name tables sized by the two counts above; both are defined in another
// translation unit.
// presumably entry i names the i-th layer in struct member order - confirm
// against the definition.
189 extern const char *intel_lstm_projected_layer_name[NUM_LSTM_LAYERS];
190 extern const char *intel_lstm_projected_layer_g4_name[NUM_LSTM_G4_LAYERS];
// ---- Buffer statistics helpers -------------------------------------------
// All four operate on std::vector<intel_lstm_projected_stats_t>. Only the
// declarations are visible here; the behavior notes below are inferred from
// the signatures and should be confirmed against the definitions.
// GetLstmBufferStats takes the layer by pointer and `stats` by non-const
// reference, so `stats` is presumably the output.
192 void GetLstmBufferStats(intel_lstm_projected_layer_t *ptr_layer, std::vector<intel_lstm_projected_stats_t> &stats);
// `accum` is taken by non-const reference, `stats` by value.
// NOTE(review): `stats` is copied on every call; a const reference would avoid
// that, but the out-of-view definition must be changed together with this line.
193 void UpdateLstmBufferStats(std::vector<intel_lstm_projected_stats_t> &accum, std::vector<intel_lstm_projected_stats_t> stats);
194 void ClearLstmBufferStats(std::vector<intel_lstm_projected_stats_t> &stats);
// NOTE(review): both `preamble` and `stats` are taken by value (copied per call).
195 void PrintLstmBufferStats(std::string preamble, std::vector<intel_lstm_projected_stats_t> stats);
// ---- Macro layer sizing ----------------------------------------------------
// Returns a uint32_t; judging by the name it is a byte count for one macro
// layer with the given dimensions - confirm units and what is included.
196 uint32_t NumBytesLstmMacroLayer(uint32_t num_inputs, uint32_t num_outputs, uint32_t num_cells, uint32_t num_group_size, uint32_t layer_num, bool is_compact);
// ---- Init / Allocate / Connect ---------------------------------------------
// Each step is declared twice: the G1 overload works on
// intel_lstm_projected_layer_t / intel_lstm_projected_transform_t, the G4
// overload on the corresponding *_g4_t structs. Parameter lists are otherwise
// identical within each pair.
197 void InitLstmMacroLayerG1(intel_lstm_projected_layer_t *ptr_layer, intel_lstm_projected_transform_t *ptr_transform, uint32_t num_inputs, uint32_t num_outputs, uint32_t num_cells);
198 void InitLstmMacroLayerG4(intel_lstm_projected_layer_g4_t *ptr_layer, intel_lstm_projected_transform_g4_t *ptr_transform, uint32_t num_inputs, uint32_t num_outputs, uint32_t num_cells);
// `ptr_memory` and `ptr_num_bytes_used` are pointers to non-const data, so the
// callee can update them; presumably `*ptr_memory` is advanced through a
// caller-provided pool of `num_memory_bytes` bytes - TODO confirm.
199 void AllocateLstmMacroLayerG1(intel_lstm_projected_layer_t *ptr_layer, intel_lstm_projected_transform_t *ptr_transform, intel_shared_outputs scratch, uint8_t **ptr_memory, uint32_t *ptr_num_bytes_used, uint32_t num_memory_bytes, bool is_compact);
200 void AllocateLstmMacroLayerG4(intel_lstm_projected_layer_g4_t *ptr_layer, intel_lstm_projected_transform_g4_t *ptr_transform, intel_shared_outputs scratch, uint8_t **ptr_memory, uint32_t *ptr_num_bytes_used, uint32_t num_memory_bytes, bool is_compact);
201 void ConnectLstmMacroLayerG1(intel_lstm_projected_layer_t *ptr_layer, intel_lstm_projected_transform_t *ptr_transform);
202 void ConnectLstmMacroLayerG4(intel_lstm_projected_layer_g4_t *ptr_layer, intel_lstm_projected_transform_g4_t *ptr_transform);
// ---- Quantization ----------------------------------------------------------
// These take a parameter named `component_index`, the identifier that the
// LSTM_*_C / BILSTM_*_C macros in this file expand to; presumably the
// definitions index `*ptr_component` through those macros - confirm.
// Quantize* take the transform struct, ReQuantize* take the layer struct.
// The meaning of `j` is not visible from the declarations.
203 void QuantizeLstmMacroLayerG1(std::vector<intel_dnn_component_t> *ptr_component, uint32_t component_index, intel_lstm_projected_transform_t *ptr_transform, float input_scale, gna_scale_factor_t *scale, uint32_t j);
204 void QuantizeLstmMacroLayerG4(std::vector<intel_dnn_component_t> *ptr_component, uint32_t component_index, intel_lstm_projected_transform_g4_t *ptr_transform, float input_scale, gna_scale_factor_t *scale, uint32_t j);
205 void ReQuantizeLstmMacroLayerG1(std::vector<intel_dnn_component_t> *ptr_component, uint32_t component_index, intel_lstm_projected_layer_t *ptr_layer, float input_scale, gna_scale_factor_t *scale, uint32_t j);
206 void ReQuantizeLstmMacroLayerG4(std::vector<intel_dnn_component_t> *ptr_component, uint32_t component_index, intel_lstm_projected_layer_g4_t *ptr_layer, float input_scale, gna_scale_factor_t *scale, uint32_t j);
// Declared only for intel_lstm_projected_layer_t (the G1 layout); no G4
// counterpart is declared in the visible part of this file.
207 void IntegrityCheckLstmMacroLayer(std::vector<intel_dnn_component_t> *ptr_component, uint32_t component_index, intel_lstm_projected_layer_t *ptr_layer, gna_scale_factor_t *scale, uint32_t j);