Imported Upstream version 1.25.0
[platform/core/ml/nnfw.git] / onert-micro / luci-interpreter / src / kernels / UnidirectionalSequenceLSTM.cpp
1 /*
2  * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved
3  * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
4  *
5  * Licensed under the Apache License, Version 2.0 (the "License");
6  * you may not use this file except in compliance with the License.
7  * You may obtain a copy of the License at
8  *
9  *    http://www.apache.org/licenses/LICENSE-2.0
10  *
11  * Unless required by applicable law or agreed to in writing, software
12  * distributed under the License is distributed on an "AS IS" BASIS,
13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14  * See the License for the specific language governing permissions and
15  * limitations under the License.
16  */
17
18 #include "Builders.h"
19 #include "kernels/Utils.h"
20
21 #include "PALUnidirectionalSequenceLSTM.h"
22
23 namespace luci_interpreter
24 {
25 namespace
26 {
27
28 #ifndef DIS_QUANT
29
// Returns true iff `x` is (within 1e-3) an exact power of two, writing
// the integer exponent to *log2_result. The exponent is written even when
// the function returns false.
bool checkedLog2(const float x, int *log2_result)
{
  // log2(x) computed via natural log: std::log2 and std::round were missing
  // from a toolchain used in some TensorFlow tests as of May 2018.
  const float inv_ln2 = 1.0f / std::log(2.0f);
  const float raw_log2 = std::log(x) * inv_ln2;
  const float rounded_log2 = std::round(raw_log2);

  *log2_result = static_cast<int>(rounded_log2);
  // Accept only values whose log2 is (nearly) integral.
  return std::abs(raw_log2 - rounded_log2) < 1e-3f;
}
42
43 // Create parameters for element wise multiplication that happens in a) cell
44 // state update ; b) hidden state update
45 // Note that all the output of gates are symmetrically quantized so only scales
46 // are required for input. However, during the hidden state update phase, the
47 // output is the updated hidden state, which is asymmetrically quantized. Thus
48 // output may require zero point
49 luci_interpreter_pal::ArithmeticParams
50 createInterGateParams(const float input1_scale, const float input2_scale, const float output_scale,
51                       const DataType output_type, const int output_zp)
52 {
53   luci_interpreter_pal::ArithmeticParams op_params;
54   if (output_type == DataType::S16)
55   {
56     op_params.quantized_activation_min = std::numeric_limits<int16_t>::min();
57     op_params.quantized_activation_max = std::numeric_limits<int16_t>::max();
58   }
59   else if (output_type == DataType::S8)
60   {
61     op_params.quantized_activation_min = std::numeric_limits<int8_t>::min();
62     op_params.quantized_activation_max = std::numeric_limits<int8_t>::max();
63   }
64
65   op_params.input1_offset = 0; // symmetric
66   op_params.input2_offset = 0; // symmetric
67   op_params.output_offset = output_zp;
68
69   const double input_product_scale =
70     static_cast<double>(input1_scale) * static_cast<double>(input2_scale);
71   double effective_scale = input_product_scale / static_cast<double>(output_scale);
72   auto output_shift = static_cast<int>(op_params.output_shift);
73   kernels::quantizeMultiplier(effective_scale, &op_params.output_multiplier, &output_shift);
74   op_params.output_shift = output_shift;
75   return op_params;
76 }
77
78 void createGateParams(const circle::Tensor *input, const circle::Tensor *input_weight,
79                       const circle::Tensor *input_bias, const circle::Tensor *hidden_state,
80                       const circle::Tensor *hidden_state_weight,
81                       const float nonlinear_activation_input_scale, const DataType cell_type,
82                       lstm::GateParameters *gate_params)
83 {
84   // Input CalculateOpDataFullyConnected
85   {
86     luci_interpreter_pal::FullyConnectedParams input_gate_params;
87     double real_multiplier = 0.0;
88     int output_shift;
89     int32_t output_activation_min;
90     int32_t output_activation_max;
91     int32_t output_multiplier;
92     real_multiplier = kernels::getQuantizedConvolutionMultipler(
93       Tensor::scale(input), Tensor::scale(input_weight), nonlinear_activation_input_scale);
94     kernels::quantizeMultiplier(real_multiplier, &output_multiplier, &output_shift);
95     kernels::calculateActivationRangeQuantized(FusedActFunc::NONE, 0,
96                                                nonlinear_activation_input_scale, cell_type,
97                                                &output_activation_min, &output_activation_max);
98
99     input_gate_params.output_shift = output_shift;
100     input_gate_params.output_multiplier = output_multiplier;
101     input_gate_params.quantized_activation_max = output_activation_max;
102     input_gate_params.quantized_activation_min = output_activation_min;
103     input_gate_params.input_offset = -Tensor::zero_point(input);
104     input_gate_params.weights_offset = -Tensor::zero_point(input_weight);
105     input_gate_params.output_offset = 0;
106
107     gate_params->input_fc_params = input_gate_params;
108   }
109
110   // Recurrent CalculateOpDataFullyConnected
111   {
112     luci_interpreter_pal::FullyConnectedParams recurrent_gate_params;
113     double real_multiplier = 0.0;
114     int output_shift;
115     int32_t output_activation_min;
116     int32_t output_activation_max;
117     int32_t output_multiplier;
118     real_multiplier = kernels::getQuantizedConvolutionMultipler(Tensor::scale(hidden_state),
119                                                                 Tensor::scale(hidden_state_weight),
120                                                                 nonlinear_activation_input_scale);
121     kernels::quantizeMultiplier(real_multiplier, &output_multiplier, &output_shift);
122     kernels::calculateActivationRangeQuantized(FusedActFunc::NONE, 0,
123                                                nonlinear_activation_input_scale, cell_type,
124                                                &output_activation_min, &output_activation_max);
125
126     recurrent_gate_params.output_shift = output_shift;
127     recurrent_gate_params.output_multiplier = output_multiplier;
128     recurrent_gate_params.quantized_activation_max = output_activation_max;
129     recurrent_gate_params.quantized_activation_min = output_activation_min;
130     recurrent_gate_params.input_offset = -Tensor::zero_point(hidden_state);
131     recurrent_gate_params.weights_offset = -Tensor::zero_point(hidden_state_weight);
132     recurrent_gate_params.output_offset = 0;
133
134     gate_params->recurrent_fc_params = recurrent_gate_params;
135   }
136 }
137
// Builds all integer-path LSTM parameters:
// - FC multipliers for the four gates (forget, input, cell, output), each
//   covering the input matmul and the recurrent matmul, and
// - inter-gate element-wise multiplication params for the cell-state and
//   hidden-state updates.
void prepareGateParamsInteger(lstm::LSTMStruct *lstm_struct,
                              lstm::LSTMParameters *quant_lstm_params)
{
  // Scale of the values fed into the gate nonlinearities (fixed Q3.12).
  float nonlinear_input_scale = 0.00024414062; // 2^-12 Q3.12 -> Q0.15

  createGateParams(lstm_struct->input(), lstm_struct->input_to_forget_weights(),
                   lstm_struct->forget_gate_bias(), lstm_struct->output_state(),
                   lstm_struct->recurrent_to_forget_weights(), nonlinear_input_scale, DataType::S16,
                   &quant_lstm_params->forget_gate_parameters);

  createGateParams(lstm_struct->input(), lstm_struct->input_to_input_weights(),
                   lstm_struct->input_gate_bias(), lstm_struct->output_state(),
                   lstm_struct->recurrent_to_input_weights(), nonlinear_input_scale, DataType::S16,
                   &quant_lstm_params->input_gate_parameters);

  // lstm::GateParameters cell_gate_parameters;
  createGateParams(lstm_struct->input(), lstm_struct->input_to_cell_weights(),
                   lstm_struct->cell_gate_bias(), lstm_struct->output_state(),
                   lstm_struct->recurrent_to_cell_weights(), nonlinear_input_scale, DataType::S16,
                   &quant_lstm_params->cell_gate_parameters);

  // lstm::GateParameters output_gate_parameters;
  createGateParams(lstm_struct->input(), lstm_struct->input_to_output_weights(),
                   lstm_struct->output_gate_bias(), lstm_struct->output_state(),
                   lstm_struct->recurrent_to_output_weights(), nonlinear_input_scale, DataType::S16,
                   &quant_lstm_params->output_gate_parameters);

  // Inter gate multiplication parameters
  // Scale of the gate nonlinearity outputs (fixed Q0.15).
  float nonlinear_output_scale = 0.00003051757; // 2^-15 Q3.12 -> Q0.15
  float cell_state_scale =
    Tensor::scale(lstm_struct->cell_state()); // lstm_tensors.CellStateTensor()->params.scale;
  // forget gate output (nonlinear output) x cell state -> cell state
  // Cell state is symmetrically quantized, so zero point is 0 here and below.
  quant_lstm_params->inter_gate_parameters.forget_cell_mul_params = createInterGateParams(
    nonlinear_output_scale, cell_state_scale, cell_state_scale, DataType::S16, 0);

  // input gate output x cell gate output -> cell state
  quant_lstm_params->inter_gate_parameters.input_mul_params = createInterGateParams(
    nonlinear_output_scale, nonlinear_output_scale, cell_state_scale, DataType::S16, 0);

  // tanh output x output gate output -> hidden state (potentially asymmetric)
  // Hence the output state's own type and zero point are forwarded here.
  quant_lstm_params->inter_gate_parameters.output_mul_params = createInterGateParams(
    nonlinear_output_scale, nonlinear_output_scale, Tensor::scale(lstm_struct->output_state()),
    Tensor::element_type(lstm_struct->output_state()),
    Tensor::zero_point(lstm_struct->output_state()));
}
183
184 // Create the additional information about the cell state, which include:
185 // cell_state_scale_power: used in integer nonlinear function (e.g., tanh)
186 // quantized_cell_clip: quantized cell clip range
187 lstm::CellStateInfo createLstmCellStateInfo(const float cell_state_scale, const float cell_clip)
188 {
189   lstm::CellStateInfo cell_state_info;
190   // cell_state_scale_power: 2^-cell_state_scale_power = cell state scale
191   int buffer;
192   checkedLog2(cell_state_scale, &buffer);
193   cell_state_info.cell_state_scale_power = buffer;
194   // Cell state specifics
195   cell_state_info.cell_clip = cell_clip;
196   cell_state_info.quantized_cell_clip = static_cast<int16_t>(std::min(
197     std::max(static_cast<double>(cell_clip) / static_cast<double>(cell_state_scale), -32768.0),
198     32767.0));
199   return cell_state_info;
200 }
201
// Executes the quantized (int8 activations, int16 cell state) LSTM for this
// operator. Derives all quantized gate/inter-gate parameters and the
// cell-state clip/scale info, then delegates to the PAL kernel.
// The trailing unnamed `bool` (is_inplace at the call site) is unused here.
void evalInt8(const circle::Operator *cur_op, BaseRuntimeGraph *runtime_graph, bool)
{
  lstm::LSTMStruct lstm_struct(cur_op, runtime_graph);

  lstm::LSTMParameters quant_lstm_params;
  prepareGateParamsInteger(&lstm_struct, &quant_lstm_params);

  lstm::CellStateInfo cell_state_info = createLstmCellStateInfo(
    luci_interpreter::Tensor::scale(lstm_struct.cell_state()), lstm_struct.options->cell_clip());

  // With time_major layout the batch is the second input dimension,
  // otherwise it is the first.
  const bool time_major = lstm_struct.options->time_major();
  const auto batch_size =
    time_major ? Tensor::dim(lstm_struct.input(), 1) : Tensor::dim(lstm_struct.input(), 0);
  const auto state_dimension = Tensor::dim(lstm_struct.output_state(), 1);
  const auto cell_state_type_size = getDataTypeSize(Tensor::element_type(lstm_struct.cell_state()));

  // Four scratch buffers for intermediate gate results, each sized
  // batch_size x state_dimension cell-state elements (bytes via type size).
  auto scratch_0_data =
    std::make_unique<uint8_t[]>(batch_size * state_dimension * cell_state_type_size);
  auto scratch_1_data =
    std::make_unique<uint8_t[]>(batch_size * state_dimension * cell_state_type_size);
  auto scratch_2_data =
    std::make_unique<uint8_t[]>(batch_size * state_dimension * cell_state_type_size);
  auto scratch_3_data =
    std::make_unique<uint8_t[]>(batch_size * state_dimension * cell_state_type_size);

  // Create and fill with 0 output state tensor
  auto output_state_data =
    std::make_unique<int8_t[]>(Tensor::num_elements(lstm_struct.output_state()));
  std::fill_n(output_state_data.get(), Tensor::num_elements(lstm_struct.output_state()), 0);

  // Create and fill with 0 cell state tensor
  auto cell_state_data =
    std::make_unique<int16_t[]>(Tensor::num_elements(lstm_struct.cell_state()));
  std::fill_n(cell_state_data.get(), Tensor::num_elements(lstm_struct.cell_state()), 0);

  // Template parameters: <activation, weight, cell, bias> element types.
  luci_interpreter_pal::evalLSTM<int8_t, int8_t, int16_t, int32_t>(
    &lstm_struct, &quant_lstm_params, &cell_state_info, output_state_data.get(),
    cell_state_data.get(), kernels::getTensorData<int16_t>(scratch_0_data.get()),
    kernels::getTensorData<int16_t>(scratch_1_data.get()),
    kernels::getTensorData<int16_t>(scratch_2_data.get()),
    kernels::getTensorData<int16_t>(scratch_3_data.get()), runtime_graph);
}
244
245 #endif // DIS_QUANT
246
247 #ifndef DIS_FLOAT
248 luci_interpreter_pal::FullyConnectedParams createFcParamsFloat()
249 {
250   luci_interpreter_pal::FullyConnectedParams op_params;
251   kernels::calculateActivationRange(FusedActFunc::NONE, &op_params.float_activation_min,
252                                     &op_params.float_activation_max);
253   op_params.quantized_activation_max = op_params.float_activation_max;
254   op_params.quantized_activation_min = op_params.float_activation_min;
255   return op_params;
256 }
257
258 lstm::GateParameters createGateParamsFloat()
259 {
260   lstm::GateParameters gate_params;
261
262   gate_params.input_fc_params = createFcParamsFloat();
263   gate_params.recurrent_fc_params = createFcParamsFloat();
264
265   return gate_params;
266 }
267
268 lstm::CellStateInfo createLstmCellStateInfoFloat(const float cell_clip)
269 {
270   lstm::CellStateInfo cell_state_info;
271   cell_state_info.cell_clip = cell_clip;
272   cell_state_info.cell_state_scale_power = 0; // no quantization
273   cell_state_info.quantized_cell_clip = 0;    // no quantization
274   return cell_state_info;
275 }
276
277 void prepareGateParamsFloat(lstm::LSTMParameters *float_lstm_params)
278 {
279   // Gate Parameters
280   float_lstm_params->forget_gate_parameters = createGateParamsFloat();
281   float_lstm_params->input_gate_parameters = createGateParamsFloat();
282   float_lstm_params->cell_gate_parameters = createGateParamsFloat();
283   float_lstm_params->output_gate_parameters = createGateParamsFloat();
284
285   // Inter gate multiplication parameters
286   luci_interpreter_pal::ArithmeticParams op_params;
287   kernels::calculateActivationRange(FusedActFunc::NONE, &op_params.float_activation_min,
288                                     &op_params.float_activation_max);
289   op_params.quantized_activation_max = op_params.float_activation_max;
290   op_params.quantized_activation_min = op_params.float_activation_min;
291   float_lstm_params->inter_gate_parameters.forget_cell_mul_params = op_params;
292   float_lstm_params->inter_gate_parameters.input_mul_params = op_params;
293   float_lstm_params->inter_gate_parameters.output_mul_params = op_params;
294 }
295
// Executes the float32 LSTM for this operator: builds float parameters and
// cell-state info, allocates scratch/state buffers, then delegates to the
// PAL kernel. The trailing unnamed `bool` (is_inplace at the call site) is
// unused here.
void evalFloat(const circle::Operator *cur_op, BaseRuntimeGraph *runtime_graph, bool)
{
  lstm::LSTMStruct lstm_struct(cur_op, runtime_graph);

  lstm::CellStateInfo cell_state_info =
    createLstmCellStateInfoFloat(lstm_struct.options->cell_clip());

  lstm::LSTMParameters lstm_params;
  prepareGateParamsFloat(&lstm_params);

  // With time_major layout the batch is the second input dimension,
  // otherwise it is the first.
  const bool time_major = lstm_struct.options->time_major();
  const auto batch_size =
    time_major ? Tensor::dim(lstm_struct.input(), 1) : Tensor::dim(lstm_struct.input(), 0);
  const auto state_dimension = Tensor::dim(lstm_struct.output_state(), 1);
  const auto cell_state_type_size = getDataTypeSize(Tensor::element_type(lstm_struct.cell_state()));

  // Four scratch buffers for intermediate gate results, each sized
  // batch_size x state_dimension cell-state elements (bytes via type size).
  auto scratch_0_data =
    std::make_unique<uint8_t[]>(batch_size * state_dimension * cell_state_type_size);
  auto scratch_1_data =
    std::make_unique<uint8_t[]>(batch_size * state_dimension * cell_state_type_size);
  auto scratch_2_data =
    std::make_unique<uint8_t[]>(batch_size * state_dimension * cell_state_type_size);
  auto scratch_3_data =
    std::make_unique<uint8_t[]>(batch_size * state_dimension * cell_state_type_size);

  // Create and fill with 0 output state tensor
  auto output_state_data =
    std::make_unique<float[]>(Tensor::num_elements(lstm_struct.output_state()));
  std::fill_n(output_state_data.get(), Tensor::num_elements(lstm_struct.output_state()), 0);

  // Create and fill with 0 cell state tensor
  auto cell_state_data = std::make_unique<float[]>(Tensor::num_elements(lstm_struct.cell_state()));
  std::fill_n(cell_state_data.get(), Tensor::num_elements(lstm_struct.cell_state()), 0);

  // Template parameters: <activation, weight, cell, bias> element types.
  luci_interpreter_pal::evalLSTM<float, float, float, float>(
    &lstm_struct, &lstm_params, &cell_state_info, output_state_data.get(), cell_state_data.get(),
    kernels::getTensorData<float>(scratch_0_data.get()),
    kernels::getTensorData<float>(scratch_1_data.get()),
    kernels::getTensorData<float>(scratch_2_data.get()),
    kernels::getTensorData<float>(scratch_3_data.get()), runtime_graph);
}
337 #endif // DIS_FLOAT
338
// Checks that a gate weight tensor is a 2-D matrix of shape
// [dim1_size, dim2_size]; fails the interpreter check otherwise.
void validateWeightTensorSize(const circle::Tensor *weight_tensor, int dim1_size, int dim2_size)
{
  LUCI_INTERPRETER_CHECK(Tensor::num_dims(weight_tensor) == 2);
  LUCI_INTERPRETER_CHECK(Tensor::dim(weight_tensor, 0) == dim1_size);
  LUCI_INTERPRETER_CHECK(Tensor::dim(weight_tensor, 1) == dim2_size);
}
345
// Validates the shapes of all LSTM operand tensors against each other.
// Internal tensor indices follow the operator's input layout used throughout
// this file: 1..4 input-to-gate weights, 5..8 recurrent weights, 12..15 gate
// biases (see also configure_kernel for the unsupported index ranges).
void validateTensorsSize(lstm::LSTMStruct *lstm_struct, const bool time_major)
{
  // With time_major layout the batch is the second input dimension,
  // otherwise it is the first.
  const auto batch_size =
    time_major ? Tensor::dim(lstm_struct->input(), 1) : Tensor::dim(lstm_struct->input(), 0);

  const auto input_dimension = Tensor::dim(lstm_struct->input(), 2);
  const auto state_dimension = Tensor::dim(lstm_struct->output_state(), 1);

  // Input FC weights: [state_dimension, input_dimension]
  for (int32_t i = 1; i < 5; i++)
  {
    validateWeightTensorSize(lstm_struct->get_internal_tensor(i), state_dimension, input_dimension);
  }

  // Recurrent FC weights: [state_dimension, state_dimension]
  for (int32_t i = 5; i < 9; i++)
  {
    validateWeightTensorSize(lstm_struct->get_internal_tensor(i), state_dimension, state_dimension);
  }

  // Biases: 1-D vectors of state_dimension elements
  for (int32_t i = 12; i < 16; i++)
  {
    LUCI_INTERPRETER_CHECK(Tensor::num_dims(lstm_struct->get_internal_tensor(i)) == 1);
    LUCI_INTERPRETER_CHECK(Tensor::dim(lstm_struct->get_internal_tensor(i), 0) == state_dimension);
  }

  // Check the shape of input state tensors.
  // These tensor may be 1D or 2D. It's fine as long as the total size is
  // correct.
  LUCI_INTERPRETER_CHECK(Tensor::num_elements(lstm_struct->output_state()) ==
                         batch_size * state_dimension);
  LUCI_INTERPRETER_CHECK(Tensor::num_elements(lstm_struct->cell_state()) ==
                         batch_size * state_dimension);

  // Check the shape of output tensor against that of input tensor
  LUCI_INTERPRETER_CHECK(Tensor::num_dims(lstm_struct->output()) == 3);
  LUCI_INTERPRETER_CHECK(Tensor::dim(lstm_struct->input(), 0) ==
                         Tensor::dim(lstm_struct->output(), 0));
  LUCI_INTERPRETER_CHECK(Tensor::dim(lstm_struct->input(), 1) ==
                         Tensor::dim(lstm_struct->output(), 1));
  LUCI_INTERPRETER_CHECK(Tensor::dim(lstm_struct->output(), 2) == state_dimension);
}
389
390 } // namespace
391
// Configure-time validation for the UnidirectionalSequenceLSTM kernel:
// element types, tensor shapes, and absence of the unsupported optional
// features (peephole connections, projection, layer normalization).
void configure_kernel_CircleUnidirectionalSequenceLSTM(const circle::Operator *cur_op,
                                                       BaseRuntimeGraph *runtime_graph)
{
  lstm::LSTMStruct lstm_struct(cur_op, runtime_graph);

  // Only float32 and int8 inputs are supported by this kernel.
  LUCI_INTERPRETER_CHECK(Tensor::element_type(lstm_struct.input()) == DataType::FLOAT32 or
                         Tensor::element_type(lstm_struct.input()) == DataType::S8);

  lstm_struct.validateTensorTypes();

  const bool time_major = lstm_struct.options->time_major();

  validateTensorsSize(&lstm_struct, time_major);

  // No peephole (internal tensors 9..11 must be absent)
  for (int32_t i = 9; i < 12; ++i)
    LUCI_INTERPRETER_CHECK(lstm_struct.get_internal_tensor(i) == nullptr);

  // No projection (internal tensors 16..17 must be absent)
  for (int32_t i = 16; i < 18; ++i)
    LUCI_INTERPRETER_CHECK(lstm_struct.get_internal_tensor(i) == nullptr);

  // No internal layer norm (internal tensors 20..23 must be absent)
  for (int32_t i = 20; i < 24; ++i)
    LUCI_INTERPRETER_CHECK(lstm_struct.get_internal_tensor(i) == nullptr);
}
418
// Execution entry point: dispatches to the float or int8 implementation
// based on the input tensor's element type.
void execute_kernel_CircleUnidirectionalSequenceLSTM(const circle::Operator *cur_op,
                                                     BaseRuntimeGraph *runtime_graph)
{
  const auto input_index = cur_op->inputs()->operator[](0);
  assert(input_index != -1);

  // Forwarded to the eval functions, which currently take it as an unnamed
  // (unused) parameter.
  bool is_inplace = runtime_graph->is_inplace_op(cur_op);

  const auto input = runtime_graph->getCircleTensorByIndex(input_index);

  switch (Tensor::element_type(input))
  {
#ifndef DIS_FLOAT
    case DataType::FLOAT32:
      evalFloat(cur_op, runtime_graph, is_inplace);
      break;
#endif // DIS_FLOAT
#ifndef DIS_QUANT
    case DataType::S8:
      evalInt8(cur_op, runtime_graph, is_inplace);
      break;
#endif // DIS_QUANT
    default:
      // Unreachable after configure-time type validation; assert is a no-op
      // in NDEBUG builds.
      assert(false && "Unsupported type.");
  }
}
445
446 } // namespace luci_interpreter