Imported Upstream version 1.25.0
[platform/core/ml/nnfw.git] / onert-micro / luci-interpreter / src / kernels / UnidirectionalSequenceLSTM.cpp
1 /*
2  * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved
3  * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at
8  *
9  *    http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17
18 #include "Builders.h"
19 #include "kernels/Utils.h"
20
21 #include "PALUnidirectionalSequenceLSTM.h"
22
23 namespace luci_interpreter
24 {
25 namespace
26 {
27
28 #ifndef DIS_QUANT
29
// Returns true iff `x` is (within 1e-3) an exact power of two, writing
// the integer exponent to *log2_result. The exponent is written even when
// the function returns false.
bool checkedLog2(const float x, int *log2_result)
{
  // log2(x) computed via natural log: std::log2 and std::round were missing
  // from a toolchain used in some TensorFlow tests as of May 2018.
  const float inv_ln2 = 1.0f / std::log(2.0f);
  const float raw_log2 = std::log(x) * inv_ln2;
  const float rounded_log2 = std::round(raw_log2);

  *log2_result = static_cast<int>(rounded_log2);
  // Accept only values whose log2 is (nearly) integral.
  return std::abs(raw_log2 - rounded_log2) < 1e-3f;
}
42
43 // Create parameters for element wise multiplication that happens in a) cell
44 // state update ; b) hidden state update
45 // Note that all the output of gates are symmetrically quantized so only scales
46 // are required for input. However, during the hidden state update phase, the
47 // output is the updated hidden state, which is asymmetrically quantized. Thus
48 // output may require zero point
49 luci_interpreter_pal::ArithmeticParams
50 createInterGateParams(const float input1_scale, const float input2_scale, const float output_scale,
51                       const DataType output_type, const int output_zp)
52 {
53   luci_interpreter_pal::ArithmeticParams op_params;
54   if (output_type == DataType::S16)
55   {
56     op_params.quantized_activation_min = std::numeric_limits<int16_t>::min();
57     op_params.quantized_activation_max = std::numeric_limits<int16_t>::max();
58   }
59   else if (output_type == DataType::S8)
60   {
61     op_params.quantized_activation_min = std::numeric_limits<int8_t>::min();
62     op_params.quantized_activation_max = std::numeric_limits<int8_t>::max();
63   }
64
65   op_params.input1_offset = 0; // symmetric
66   op_params.input2_offset = 0; // symmetric
67   op_params.output_offset = output_zp;
68
69   const double input_product_scale =
70     static_cast<double>(input1_scale) * static_cast<double>(input2_scale);
71   double effective_scale = input_product_scale / static_cast<double>(output_scale);
72   auto output_shift = static_cast<int>(op_params.output_shift);
73   kernels::quantizeMultiplier(effective_scale, &op_params.output_multiplier, &output_shift);
74   op_params.output_shift = output_shift;
75   return op_params;
76 }
77
78 void createGateParams(const circle::Tensor *input, const circle::Tensor *input_weight,
79                       const circle::Tensor *input_bias, const circle::Tensor *hidden_state,
80                       const circle::Tensor *hidden_state_weight,
81                       const float nonlinear_activation_input_scale, const DataType cell_type,
82                       lstm::GateParameters *gate_params)
83 {
84   // Input CalculateOpDataFullyConnected
85   {
86     luci_interpreter_pal::FullyConnectedParams input_gate_params;
87     double real_multiplier = 0.0;
88     int output_shift;
89     int32_t output_activation_min;
90     int32_t output_activation_max;
91     int32_t output_multiplier;
92     real_multiplier = kernels::getQuantizedConvolutionMultipler(
93       Tensor::scale(input), Tensor::scale(input_weight), nonlinear_activation_input_scale);
94     kernels::quantizeMultiplier(real_multiplier, &output_multiplier, &output_shift);
95     kernels::calculateActivationRangeQuantized(FusedActFunc::NONE, 0,
96                                                nonlinear_activation_input_scale, cell_type,
97                                                &output_activation_min, &output_activation_max);
98
99     input_gate_params.output_shift = output_shift;
100     input_gate_params.output_multiplier = output_multiplier;
101     input_gate_params.quantized_activation_max = output_activation_max;
102     input_gate_params.quantized_activation_min = output_activation_min;
103     input_gate_params.input_offset = -Tensor::zero_point(input);
104     input_gate_params.weights_offset = -Tensor::zero_point(input_weight);
105     input_gate_params.output_offset = 0;
106
107     gate_params->input_fc_params = input_gate_params;
108   }
109
110   // Recurrent CalculateOpDataFullyConnected
111   {
112     luci_interpreter_pal::FullyConnectedParams recurrent_gate_params;
113     double real_multiplier = 0.0;
114     int output_shift;
115     int32_t output_activation_min;
116     int32_t output_activation_max;
117     int32_t output_multiplier;
118     real_multiplier = kernels::getQuantizedConvolutionMultipler(Tensor::scale(hidden_state),
119                                                                 Tensor::scale(hidden_state_weight),
120                                                                 nonlinear_activation_input_scale);
121     kernels::quantizeMultiplier(real_multiplier, &output_multiplier, &output_shift);
122     kernels::calculateActivationRangeQuantized(FusedActFunc::NONE, 0,
123                                                nonlinear_activation_input_scale, cell_type,
124                                                &output_activation_min, &output_activation_max);
125
126     recurrent_gate_params.output_shift = output_shift;
127     recurrent_gate_params.output_multiplier = output_multiplier;
128     recurrent_gate_params.quantized_activation_max = output_activation_max;
129     recurrent_gate_params.quantized_activation_min = output_activation_min;
130     recurrent_gate_params.input_offset = -Tensor::zero_point(hidden_state);
131     recurrent_gate_params.weights_offset = -Tensor::zero_point(hidden_state_weight);
132     recurrent_gate_params.output_offset = 0;
133
134     gate_params->recurrent_fc_params = recurrent_gate_params;
135   }
136 }
137
// Builds all integer-path LSTM parameters:
// - FC multipliers for the four gates (forget, input, cell, output), each
//   covering the input matmul and the recurrent matmul, and
// - inter-gate element-wise multiplication params for the cell-state and
//   hidden-state updates.
void prepareGateParamsInteger(lstm::LSTMStruct *lstm_struct,
                              lstm::LSTMParameters *quant_lstm_params)
{
  // Scale of the values fed into the gate nonlinearities (fixed Q3.12).
  float nonlinear_input_scale = 0.00024414062; // 2^-12 Q3.12 -> Q0.15

  createGateParams(lstm_struct->input(), lstm_struct->input_to_forget_weights(),
                   lstm_struct->forget_gate_bias(), lstm_struct->output_state(),
                   lstm_struct->recurrent_to_forget_weights(), nonlinear_input_scale, DataType::S16,
                   &quant_lstm_params->forget_gate_parameters);

  createGateParams(lstm_struct->input(), lstm_struct->input_to_input_weights(),
                   lstm_struct->input_gate_bias(), lstm_struct->output_state(),
                   lstm_struct->recurrent_to_input_weights(), nonlinear_input_scale, DataType::S16,
                   &quant_lstm_params->input_gate_parameters);

  // lstm::GateParameters cell_gate_parameters;
  createGateParams(lstm_struct->input(), lstm_struct->input_to_cell_weights(),
                   lstm_struct->cell_gate_bias(), lstm_struct->output_state(),
                   lstm_struct->recurrent_to_cell_weights(), nonlinear_input_scale, DataType::S16,
                   &quant_lstm_params->cell_gate_parameters);

  // lstm::GateParameters output_gate_parameters;
  createGateParams(lstm_struct->input(), lstm_struct->input_to_output_weights(),
                   lstm_struct->output_gate_bias(), lstm_struct->output_state(),
                   lstm_struct->recurrent_to_output_weights(), nonlinear_input_scale, DataType::S16,
                   &quant_lstm_params->output_gate_parameters);

  // Inter gate multiplication parameters
  // Scale of the gate nonlinearity outputs (fixed Q0.15).
  float nonlinear_output_scale = 0.00003051757; // 2^-15 Q3.12 -> Q0.15
  float cell_state_scale =
    Tensor::scale(lstm_struct->cell_state()); // lstm_tensors.CellStateTensor()->params.scale;
  // forget gate output (nonlinear output) x cell state -> cell state
  // Cell state is symmetrically quantized, so zero point is 0 here and below.
  quant_lstm_params->inter_gate_parameters.forget_cell_mul_params = createInterGateParams(
    nonlinear_output_scale, cell_state_scale, cell_state_scale, DataType::S16, 0);

  // input gate output x cell gate output -> cell state
  quant_lstm_params->inter_gate_parameters.input_mul_params = createInterGateParams(
    nonlinear_output_scale, nonlinear_output_scale, cell_state_scale, DataType::S16, 0);

  // tanh output x output gate output -> hidden state (potentially asymmetric)
  // Hence the output state's own type and zero point are forwarded here.
  quant_lstm_params->inter_gate_parameters.output_mul_params = createInterGateParams(
    nonlinear_output_scale, nonlinear_output_scale, Tensor::scale(lstm_struct->output_state()),
    Tensor::element_type(lstm_struct->output_state()),
    Tensor::zero_point(lstm_struct->output_state()));
}
183
184 // Create the additional information about the cell state, which include:
185 // cell_state_scale_power: used in integer nonlinear function (e.g., tanh)
186 // quantized_cell_clip: quantized cell clip range
187 lstm::CellStateInfo createLstmCellStateInfo(const float cell_state_scale, const float cell_clip)
188 {
189   lstm::CellStateInfo cell_state_info;
190   // cell_state_scale_power: 2^-cell_state_scale_power = cell state scale
191   int buffer;
192   checkedLog2(cell_state_scale, &buffer);
193   cell_state_info.cell_state_scale_power = buffer;
194   // Cell state specifics
195   cell_state_info.cell_clip = cell_clip;
196   cell_state_info.quantized_cell_clip = static_cast<int16_t>(std::min(
197     std::max(static_cast<double>(cell_clip) / static_cast<double>(cell_state_scale), -32768.0),
198     32767.0));
199   return cell_state_info;
200 }
201
// Executes the quantized (int8 activations, int16 cell state) LSTM for this
// operator. Derives all quantized gate/inter-gate parameters and the
// cell-state clip/scale info, then delegates to the PAL kernel.
// The trailing unnamed `bool` (is_inplace at the call site) is unused here.
void evalInt8(const circle::Operator *cur_op, BaseRuntimeGraph *runtime_graph, bool)
{
  lstm::LSTMStruct lstm_struct(cur_op, runtime_graph);

  lstm::LSTMParameters quant_lstm_params;
  prepareGateParamsInteger(&lstm_struct, &quant_lstm_params);

  lstm::CellStateInfo cell_state_info = createLstmCellStateInfo(
    luci_interpreter::Tensor::scale(lstm_struct.cell_state()), lstm_struct.options->cell_clip());

  // With time_major layout the batch is the second input dimension,
  // otherwise it is the first.
  const bool time_major = lstm_struct.options->time_major();
  const auto batch_size =
    time_major ? Tensor::dim(lstm_struct.input(), 1) : Tensor::dim(lstm_struct.input(), 0);
  const auto state_dimension = Tensor::dim(lstm_struct.output_state(), 1);
  const auto cell_state_type_size = getDataTypeSize(Tensor::element_type(lstm_struct.cell_state()));

  // Four scratch buffers for intermediate gate results, each sized
  // batch_size x state_dimension cell-state elements (bytes via type size).
  auto scratch_0_data =
    std::make_unique<uint8_t[]>(batch_size * state_dimension * cell_state_type_size);
  auto scratch_1_data =
    std::make_unique<uint8_t[]>(batch_size * state_dimension * cell_state_type_size);
  auto scratch_2_data =
    std::make_unique<uint8_t[]>(batch_size * state_dimension * cell_state_type_size);
  auto scratch_3_data =
    std::make_unique<uint8_t[]>(batch_size * state_dimension * cell_state_type_size);

  // Create and fill with 0 output state tensor
  auto output_state_data =
    std::make_unique<int8_t[]>(Tensor::num_elements(lstm_struct.output_state()));
  std::fill_n(output_state_data.get(), Tensor::num_elements(lstm_struct.output_state()), 0);

  // Create and fill with 0 cell state tensor
  auto cell_state_data =
    std::make_unique<int16_t[]>(Tensor::num_elements(lstm_struct.cell_state()));
  std::fill_n(cell_state_data.get(), Tensor::num_elements(lstm_struct.cell_state()), 0);

  // Template parameters: <activation, weight, cell, bias> element types.
  luci_interpreter_pal::evalLSTM<int8_t, int8_t, int16_t, int32_t>(
    &lstm_struct, &quant_lstm_params, &cell_state_info, output_state_data.get(),
    cell_state_data.get(), kernels::getTensorData<int16_t>(scratch_0_data.get()),
    kernels::getTensorData<int16_t>(scratch_1_data.get()),
    kernels::getTensorData<int16_t>(scratch_2_data.get()),
    kernels::getTensorData<int16_t>(scratch_3_data.get()), runtime_graph);
}
244
245 #endif // DIS_QUANT
246
247 #ifndef DIS_FLOAT
248 luci_interpreter_pal::FullyConnectedParams createFcParamsFloat()
249 {
250   luci_interpreter_pal::FullyConnectedParams op_params;
251   kernels::calculateActivationRange(FusedActFunc::NONE, &op_params.float_activation_min,
252                                     &op_params.float_activation_max);
253   op_params.quantized_activation_max = op_params.float_activation_max;
254   op_params.quantized_activation_min = op_params.float_activation_min;
255   return op_params;
256 }
257
258 lstm::GateParameters createGateParamsFloat()
259 {
260   lstm::GateParameters gate_params;
261
262   gate_params.input_fc_params = createFcParamsFloat();
263   gate_params.recurrent_fc_params = createFcParamsFloat();
264
265   return gate_params;
266 }
267
268 lstm::CellStateInfo createLstmCellStateInfoFloat(const float cell_clip)
269 {
270   lstm::CellStateInfo cell_state_info;
271   cell_state_info.cell_clip = cell_clip;
272   cell_state_info.cell_state_scale_power = 0; // no quantization
273   cell_state_info.quantized_cell_clip = 0;    // no quantization
274   return cell_state_info;
275 }
276
277 void prepareGateParamsFloat(lstm::LSTMParameters *float_lstm_params)
278 {
279   // Gate Parameters
280   float_lstm_params->forget_gate_parameters = createGateParamsFloat();
281   float_lstm_params->input_gate_parameters = createGateParamsFloat();
282   float_lstm_params->cell_gate_parameters = createGateParamsFloat();
283   float_lstm_params->output_gate_parameters = createGateParamsFloat();
284
285   // Inter gate multiplication parameters
286   luci_interpreter_pal::ArithmeticParams op_params;
287   kernels::calculateActivationRange(FusedActFunc::NONE, &op_params.float_activation_min,
288                                     &op_params.float_activation_max);
289   op_params.quantized_activation_max = op_params.float_activation_max;
290   op_params.quantized_activation_min = op_params.float_activation_min;
291   float_lstm_params->inter_gate_parameters.forget_cell_mul_params = op_params;
292   float_lstm_params->inter_gate_parameters.input_mul_params = op_params;
293   float_lstm_params->inter_gate_parameters.output_mul_params = op_params;
294 }
295
// Executes the float32 LSTM for this operator: builds float parameters and
// cell-state info, allocates scratch/state buffers, then delegates to the
// PAL kernel. The trailing unnamed `bool` (is_inplace at the call site) is
// unused here.
void evalFloat(const circle::Operator *cur_op, BaseRuntimeGraph *runtime_graph, bool)
{
  lstm::LSTMStruct lstm_struct(cur_op, runtime_graph);

  lstm::CellStateInfo cell_state_info =
    createLstmCellStateInfoFloat(lstm_struct.options->cell_clip());

  lstm::LSTMParameters lstm_params;
  prepareGateParamsFloat(&lstm_params);

  // With time_major layout the batch is the second input dimension,
  // otherwise it is the first.
  const bool time_major = lstm_struct.options->time_major();
  const auto batch_size =
    time_major ? Tensor::dim(lstm_struct.input(), 1) : Tensor::dim(lstm_struct.input(), 0);
  const auto state_dimension = Tensor::dim(lstm_struct.output_state(), 1);
  const auto cell_state_type_size = getDataTypeSize(Tensor::element_type(lstm_struct.cell_state()));

  // Four scratch buffers for intermediate gate results, each sized
  // batch_size x state_dimension cell-state elements (bytes via type size).
  auto scratch_0_data =
    std::make_unique<uint8_t[]>(batch_size * state_dimension * cell_state_type_size);
  auto scratch_1_data =
    std::make_unique<uint8_t[]>(batch_size * state_dimension * cell_state_type_size);
  auto scratch_2_data =
    std::make_unique<uint8_t[]>(batch_size * state_dimension * cell_state_type_size);
  auto scratch_3_data =
    std::make_unique<uint8_t[]>(batch_size * state_dimension * cell_state_type_size);

  // Create and fill with 0 output state tensor
  auto output_state_data =
    std::make_unique<float[]>(Tensor::num_elements(lstm_struct.output_state()));
  std::fill_n(output_state_data.get(), Tensor::num_elements(lstm_struct.output_state()), 0);

  // Create and fill with 0 cell state tensor
  auto cell_state_data = std::make_unique<float[]>(Tensor::num_elements(lstm_struct.cell_state()));
  std::fill_n(cell_state_data.get(), Tensor::num_elements(lstm_struct.cell_state()), 0);

  // Template parameters: <activation, weight, cell, bias> element types.
  luci_interpreter_pal::evalLSTM<float, float, float, float>(
    &lstm_struct, &lstm_params, &cell_state_info, output_state_data.get(), cell_state_data.get(),
    kernels::getTensorData<float>(scratch_0_data.get()),
    kernels::getTensorData<float>(scratch_1_data.get()),
    kernels::getTensorData<float>(scratch_2_data.get()),
    kernels::getTensorData<float>(scratch_3_data.get()), runtime_graph);
}
337 #endif // DIS_FLOAT
338
// Checks that a gate weight tensor is a 2-D matrix of shape
// [dim1_size, dim2_size]; fails the interpreter check otherwise.
void validateWeightTensorSize(const circle::Tensor *weight_tensor, int dim1_size, int dim2_size)
{
  LUCI_INTERPRETER_CHECK(Tensor::num_dims(weight_tensor) == 2);
  LUCI_INTERPRETER_CHECK(Tensor::dim(weight_tensor, 0) == dim1_size);
  LUCI_INTERPRETER_CHECK(Tensor::dim(weight_tensor, 1) == dim2_size);
}
345
// Validates the shapes of all LSTM operand tensors against each other.
// Internal tensor indices follow the operator's input layout used throughout
// this file: 1..4 input-to-gate weights, 5..8 recurrent weights, 12..15 gate
// biases (see also configure_kernel for the unsupported index ranges).
void validateTensorsSize(lstm::LSTMStruct *lstm_struct, const bool time_major)
{
  // With time_major layout the batch is the second input dimension,
  // otherwise it is the first.
  const auto batch_size =
    time_major ? Tensor::dim(lstm_struct->input(), 1) : Tensor::dim(lstm_struct->input(), 0);

  const auto input_dimension = Tensor::dim(lstm_struct->input(), 2);
  const auto state_dimension = Tensor::dim(lstm_struct->output_state(), 1);

  // Input FC weights: [state_dimension, input_dimension]
  for (int32_t i = 1; i < 5; i++)
  {
    validateWeightTensorSize(lstm_struct->get_internal_tensor(i), state_dimension, input_dimension);
  }

  // Recurrent FC weights: [state_dimension, state_dimension]
  for (int32_t i = 5; i < 9; i++)
  {
    validateWeightTensorSize(lstm_struct->get_internal_tensor(i), state_dimension, state_dimension);
  }

  // Biases: 1-D vectors of state_dimension elements
  for (int32_t i = 12; i < 16; i++)
  {
    LUCI_INTERPRETER_CHECK(Tensor::num_dims(lstm_struct->get_internal_tensor(i)) == 1);
    LUCI_INTERPRETER_CHECK(Tensor::dim(lstm_struct->get_internal_tensor(i), 0) == state_dimension);
  }

  // Check the shape of input state tensors.
  // These tensor may be 1D or 2D. It's fine as long as the total size is
  // correct.
  LUCI_INTERPRETER_CHECK(Tensor::num_elements(lstm_struct->output_state()) ==
                         batch_size * state_dimension);
  LUCI_INTERPRETER_CHECK(Tensor::num_elements(lstm_struct->cell_state()) ==
                         batch_size * state_dimension);

  // Check the shape of output tensor against that of input tensor
  LUCI_INTERPRETER_CHECK(Tensor::num_dims(lstm_struct->output()) == 3);
  LUCI_INTERPRETER_CHECK(Tensor::dim(lstm_struct->input(), 0) ==
                         Tensor::dim(lstm_struct->output(), 0));
  LUCI_INTERPRETER_CHECK(Tensor::dim(lstm_struct->input(), 1) ==
                         Tensor::dim(lstm_struct->output(), 1));
  LUCI_INTERPRETER_CHECK(Tensor::dim(lstm_struct->output(), 2) == state_dimension);
}
389
390 } // namespace
391
// Configure-time validation for the UnidirectionalSequenceLSTM kernel:
// element types, tensor shapes, and absence of the unsupported optional
// features (peephole connections, projection, layer normalization).
void configure_kernel_CircleUnidirectionalSequenceLSTM(const circle::Operator *cur_op,
                                                       BaseRuntimeGraph *runtime_graph)
{
  lstm::LSTMStruct lstm_struct(cur_op, runtime_graph);

  // Only float32 and int8 inputs are supported by this kernel.
  LUCI_INTERPRETER_CHECK(Tensor::element_type(lstm_struct.input()) == DataType::FLOAT32 or
                         Tensor::element_type(lstm_struct.input()) == DataType::S8);

  lstm_struct.validateTensorTypes();

  const bool time_major = lstm_struct.options->time_major();

  validateTensorsSize(&lstm_struct, time_major);

  // No peephole (internal tensors 9..11 must be absent)
  for (int32_t i = 9; i < 12; ++i)
    LUCI_INTERPRETER_CHECK(lstm_struct.get_internal_tensor(i) == nullptr);

  // No projection (internal tensors 16..17 must be absent)
  for (int32_t i = 16; i < 18; ++i)
    LUCI_INTERPRETER_CHECK(lstm_struct.get_internal_tensor(i) == nullptr);

  // No internal layer norm (internal tensors 20..23 must be absent)
  for (int32_t i = 20; i < 24; ++i)
    LUCI_INTERPRETER_CHECK(lstm_struct.get_internal_tensor(i) == nullptr);
}
418
// Execution entry point: dispatches to the float or int8 implementation
// based on the input tensor's element type.
void execute_kernel_CircleUnidirectionalSequenceLSTM(const circle::Operator *cur_op,
                                                     BaseRuntimeGraph *runtime_graph)
{
  const auto input_index = cur_op->inputs()->operator[](0);
  assert(input_index != -1);

  // Forwarded to the eval functions, which currently take it as an unnamed
  // (unused) parameter.
  bool is_inplace = runtime_graph->is_inplace_op(cur_op);

  const auto input = runtime_graph->getCircleTensorByIndex(input_index);

  switch (Tensor::element_type(input))
  {
#ifndef DIS_FLOAT
    case DataType::FLOAT32:
      evalFloat(cur_op, runtime_graph, is_inplace);
      break;
#endif // DIS_FLOAT
#ifndef DIS_QUANT
    case DataType::S8:
      evalInt8(cur_op, runtime_graph, is_inplace);
      break;
#endif // DIS_QUANT
    default:
      // Unreachable after configure-time type validation; assert is a no-op
      // in NDEBUG builds.
      assert(false && "Unsupported type.");
  }
}
445
446 } // namespace luci_interpreter