2 * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
17 #ifndef __ONERT_BACKEND_ACL_COMMON_ACL_KERNEL_GEN_H_
18 #define __ONERT_BACKEND_ACL_COMMON_ACL_KERNEL_GEN_H_
#include <cassert>
#include <memory>
#include <utility>

#include <arm_compute/runtime/CL/CLFunctions.h>

#include <exec/IFunction.h>
#include <ir/Operands.h>
#include <ir/operation/LSTM.h>
33 template <typename Layer, typename... Args>
34 std::unique_ptr<arm_compute::IFunction> generateLayer(Args &&... args)
36 auto l = std::make_unique<Layer>();
38 l->configure(std::forward<Args>(args)...);
43 template <typename Layer, typename... Args>
44 std::unique_ptr<arm_compute::IFunction>
45 generateLayer(std::shared_ptr<arm_compute::IMemoryManager> memory_manager, Args &&... args)
47 auto l = std::make_unique<Layer>(memory_manager);
49 l->configure(std::forward<Args>(args)...);
54 template <typename T_FunctionWrapper, typename T_Tensor, typename T_ACLLayer,
55 typename T_TensorRegistry>
56 std::unique_ptr<exec::IFunction> kernelGenLSTM(const ir::operation::LSTM &node,
57 const ir::Operands &operands,
58 const std::shared_ptr<T_TensorRegistry> &tensor_reg)
60 // TODO Support dynamic rnn
61 // TODO Fix subtle error in the case of non-CIFG, non-peephole and No Projection.
62 const auto scratch_buffer_index{
63 node.getOutputs().at(ir::operation::LSTM::Output::SCRATCH_BUFFER)};
64 const auto output_state_out_index{
65 node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT_STATE_OUT)};
66 const auto cell_state_out_index{
67 node.getOutputs().at(ir::operation::LSTM::Output::CELL_STATE_OUT)};
68 const auto output_index{node.getOutputs().at(ir::operation::LSTM::Output::OUTPUT)};
70 const auto input_index{node.getInputs().at(ir::operation::LSTM::Input::INPUT)};
71 const auto input_to_input_weights_index{
72 node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_INPUT_WEIGHTS)}; // optional
73 const auto input_to_forget_weights_index{
74 node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_FORGET_WEIGHTS)};
75 const auto input_to_cell_weights_index{
76 node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_CELL_WEIGHTS)};
77 const auto input_to_output_weights_index{
78 node.getInputs().at(ir::operation::LSTM::Input::INPUT_TO_OUTPUT_WEIGHTS)};
79 const auto recurrent_to_input_weights_index{
80 node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_INPUT_WEIGHTS)}; // optional
81 const auto recurrent_to_forget_weights_index{
82 node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_FORGET_WEIGHTS)};
83 const auto recurrent_to_cell_weights_index{
84 node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_CELL_WEIGHTS)};
85 const auto recurrent_to_output_weights_index{
86 node.getInputs().at(ir::operation::LSTM::Input::RECURRENT_TO_OUTPUT_WEIGHTS)};
87 const auto cell_to_input_weights_index{
88 node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_INPUT_WEIGHTS)}; // optional
89 const auto cell_to_forget_weights_index{
90 node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_FORGET_WEIGHTS)}; // optional
91 const auto cell_to_output_weights_index{
92 node.getInputs().at(ir::operation::LSTM::Input::CELL_TO_OUTPUT_WEIGHTS)}; // optional
93 const auto input_gate_bias_index{
94 node.getInputs().at(ir::operation::LSTM::Input::INPUT_GATE_BIAS)};
95 const auto forget_gate_bias_index{
96 node.getInputs().at(ir::operation::LSTM::Input::FORGET_GATE_BIAS)};
97 const auto cell_bias_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_BIAS)};
98 const auto output_gate_bias_index{
99 node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_GATE_BIAS)};
100 const auto projection_weights_index{
101 node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_WEIGHTS)}; // optional
102 const auto projection_bias_index{
103 node.getInputs().at(ir::operation::LSTM::Input::PROJECTION_BIAS)}; // optional
104 const auto output_state_in_index{
105 node.getInputs().at(ir::operation::LSTM::Input::OUTPUT_STATE_IN)};
106 const auto cell_state_in_index{node.getInputs().at(ir::operation::LSTM::Input::CELL_STATE_IN)};
107 const auto cell_threshold = node.param().cell_threshold;
108 const auto projection_threshold = node.param().projection_threshold;
110 bool has_input_to_input_weights = operands.at(input_to_input_weights_index).shape().dim(0) != 0 &&
111 operands.at(input_to_input_weights_index).shape().dim(1) != 0;
112 bool has_recurrent_to_input_weights =
113 operands.at(recurrent_to_input_weights_index).shape().dim(0) != 0 &&
114 operands.at(recurrent_to_input_weights_index).shape().dim(1) != 0;
115 bool has_cell_to_forget_weights = operands.at(cell_to_forget_weights_index).shape().dim(0) != 0;
116 bool has_cell_to_output_weights = operands.at(cell_to_output_weights_index).shape().dim(0) != 0;
117 bool has_projection_weights = operands.at(projection_weights_index).shape().dim(0) != 0 &&
118 operands.at(projection_weights_index).shape().dim(1) != 0;
119 bool has_projection_bias = operands.at(projection_bias_index).shape().dim(0);
121 // NOTE The input_to_input_weights and the recurrent_to_input_weights do not exist in CIFG.
124 // NOTE The cell_to_input_weights does not exist in non-peephole although regular LSTM(non-CIFG).
125 bool has_cifg_param = has_input_to_input_weights && has_recurrent_to_input_weights;
127 // NOTE The cell_to_forget_weights and the cell_to_output_weights exist in peephole.
128 // But the cell_to_input_weights does not exist in regular CIFG although peephole.
130 // false: no peephole
131 bool has_peephole_param = has_cell_to_forget_weights && has_cell_to_output_weights;
133 // NOTE Although the projection weights has data the projection bias may not have data.
134 bool has_projection_param = has_projection_weights;
136 const auto activation = node.param().activation;
137 const auto cell_clip = cell_threshold;
138 const auto projection_clip = projection_threshold;
139 assert(cell_clip >= 0.f && projection_clip >= 0.f);
141 auto scratch_buffer_tensor = tensor_reg->getAclTensor(scratch_buffer_index).get();
142 auto output_state_out_tensor = tensor_reg->getAclTensor(output_state_out_index).get();
143 auto cell_state_out_tensor = tensor_reg->getAclTensor(cell_state_out_index).get();
144 auto output_tensor = tensor_reg->getAclTensor(output_index).get();
146 auto input_tensor = tensor_reg->getAclTensor(input_index).get();
148 auto input_to_forget_weights_tensor =
149 tensor_reg->getAclTensor(input_to_forget_weights_index).get();
150 auto input_to_cell_weights_tensor = tensor_reg->getAclTensor(input_to_cell_weights_index).get();
151 auto input_to_output_weights_tensor =
152 tensor_reg->getAclTensor(input_to_output_weights_index).get();
153 auto recurrent_to_forget_weights_tensor =
154 tensor_reg->getAclTensor(recurrent_to_forget_weights_index).get();
155 auto recurrent_to_cell_weights_tensor =
156 tensor_reg->getAclTensor(recurrent_to_cell_weights_index).get();
157 auto recurrent_to_output_weights_tensor =
158 tensor_reg->getAclTensor(recurrent_to_output_weights_index).get();
160 auto forget_gate_bias_tensor = tensor_reg->getAclTensor(forget_gate_bias_index).get();
161 auto cell_bias_tensor = tensor_reg->getAclTensor(cell_bias_index).get();
162 auto output_gate_bias_tensor = tensor_reg->getAclTensor(output_gate_bias_index).get();
163 auto output_state_in_tensor = tensor_reg->getAclTensor(output_state_in_index).get();
164 auto cell_state_in_tensor = tensor_reg->getAclTensor(cell_state_in_index).get();
166 auto act_info = asActivationLayerInfo(activation);
168 ::arm_compute::LSTMParams<T_Tensor> lstm_params{};
171 auto input_to_input_weights_tensor =
172 tensor_reg->getAclTensor(input_to_input_weights_index).get(); // optional
173 auto recurrent_to_input_weights_tensor =
174 tensor_reg->getAclTensor(recurrent_to_input_weights_index).get(); // optional
175 auto cell_to_input_weights_handle =
176 has_peephole_param ? tensor_reg->getAclTensor(cell_to_input_weights_index).get()->handle()
177 : nullptr; // optional (non-cifg && peephole)
178 auto input_gate_bias_tensor = tensor_reg->getAclTensor(input_gate_bias_index).get(); // optional
179 lstm_params.set_cifg_params(input_to_input_weights_tensor->handle(),
180 recurrent_to_input_weights_tensor->handle(),
181 cell_to_input_weights_handle, input_gate_bias_tensor->handle());
183 if (has_peephole_param)
185 auto cell_to_forget_weights_tensor =
186 tensor_reg->getAclTensor(cell_to_forget_weights_index).get(); // optional
187 auto cell_to_output_weights_tensor =
188 tensor_reg->getAclTensor(cell_to_output_weights_index).get(); // optional
189 lstm_params.set_peephole_params(cell_to_forget_weights_tensor->handle(),
190 cell_to_output_weights_tensor->handle());
192 if (has_projection_param)
194 auto projection_weights_tensor =
195 tensor_reg->getAclTensor(projection_weights_index).get(); // optional
196 auto projection_bias_handle =
197 has_projection_bias ? tensor_reg->getAclTensor(projection_bias_index).get()->handle()
198 : nullptr; // optional
199 lstm_params.set_projection_params(projection_weights_tensor->handle(), projection_bias_handle);
202 auto fn = generateLayer<T_ACLLayer>(
203 input_tensor->handle(), input_to_forget_weights_tensor->handle(),
204 input_to_cell_weights_tensor->handle(), input_to_output_weights_tensor->handle(),
205 recurrent_to_forget_weights_tensor->handle(), recurrent_to_cell_weights_tensor->handle(),
206 recurrent_to_output_weights_tensor->handle(), forget_gate_bias_tensor->handle(),
207 cell_bias_tensor->handle(), output_gate_bias_tensor->handle(),
208 output_state_in_tensor->handle(), cell_state_in_tensor->handle(),
209 scratch_buffer_tensor->handle(), output_state_out_tensor->handle(),
210 cell_state_out_tensor->handle(), output_tensor->handle(), lstm_params, act_info, cell_clip,
213 return std::make_unique<T_FunctionWrapper>(std::move(fn));
216 template <typename T_FunctionWrapper, typename T_Tensor, typename T_ACLLayer,
217 typename T_TensorBuilder, typename T_TensorRegistry>
218 std::unique_ptr<exec::IFunction>
219 kernelGenFullyConnected(const ir::operation::FullyConnected &node, const ir::Operands &operands,
220 const std::shared_ptr<T_TensorBuilder> &tensor_builder,
221 const std::shared_ptr<T_TensorRegistry> &tensor_reg, ir::Layout layout)
223 using ir::operation::FullyConnected;
225 const auto output_index{node.getOutputs().at(0)};
226 const auto input_index{node.getInputs().at(FullyConnected::Input::INPUT)};
227 const auto weight_index{node.getInputs().at(FullyConnected::Input::WEIGHT)};
228 const auto bias_index{node.getInputs().at(FullyConnected::Input::BIAS)};
230 const auto input_rank = operands.at(input_index).shape().rank();
232 const auto output_size =
233 operands.at(output_index).shape().dim(operands.at(output_index).shape().rank() - 1);
234 UNUSED_RELEASE(output_size);
235 assert(operands.at(bias_index).shape().dim(0) == output_size);
236 assert(operands.at(weight_index).shape().dim(0) == output_size);
237 const auto batch_size =
238 operands.at(output_index).shape().dim(operands.at(output_index).shape().rank() - 2);
239 const auto input_size =
240 operands.at(weight_index).shape().dim(operands.at(weight_index).shape().rank() - 1);
242 // Check for reshaping input's shape into rank-2
243 bool needs_reshape = false;
244 ir::Shape reshape(2);
245 if (input_rank == 3 || input_rank == 4)
247 const auto &ifm_shape = operands.at(input_index).shape();
248 auto feature_size = 1;
249 for (int i = 0; i < ifm_shape.rank(); ++i)
251 feature_size *= ifm_shape.dim(i);
254 UNUSED_RELEASE(feature_size);
255 assert(feature_size == batch_size * input_size);
258 needs_reshape = true;
259 reshape.dim(0) = batch_size; /* H */
260 reshape.dim(1) = input_size; /* W */
263 auto output_tensor = tensor_reg->getAclTensor(output_index).get();
264 const auto input_tensor = tensor_reg->getAclTensor(input_index).get();
265 const auto weight_tensor = tensor_reg->getAclTensor(weight_index).get();
266 const auto bias_tensor = tensor_reg->getAclTensor(bias_index).get();
267 const auto frontend_layout = layout;
268 const auto acl_layout = output_tensor->handle()->info()->data_layout();
270 typename T_ACLLayer::KernelType kernel_type = T_ACLLayer::KernelType::GENERAL;
271 if (operands.at(weight_index).isConstant())
273 kernel_type = T_ACLLayer::KernelType::PREPROCESSED_WEIGHTS;
274 assert(operands.at(weight_index).data());
277 auto fn = generateLayer<T_ACLLayer>(
278 tensor_builder->acl_tensor_manager()->internal_buffer_manager(), input_tensor->handle(),
279 weight_tensor->handle(), bias_tensor->handle(), output_tensor->handle(), needs_reshape,
280 asTensorShape(reshape, frontend_layout, asRuntimeLayout(acl_layout)), kernel_type);
282 return std::make_unique<T_FunctionWrapper>(std::move(fn));
285 template <typename T_ACLLayer, typename T_PoolOp, typename T_AclTensorRegistry>
286 std::unique_ptr<::arm_compute::IFunction>
287 kernelGenPool2D(const T_PoolOp &node, const ir::Operands &operands,
288 const std::shared_ptr<T_AclTensorRegistry> &tensor_reg, ir::Layout layout,
289 ::arm_compute::PoolingType pooling_type)
291 const auto ofm_index{node.getOutputs().at(0)};
292 const auto ifm_index{node.getInputs().at(0)};
294 const auto ofm_shape = operands.at(ofm_index).shape().asFeature(layout);
295 const auto ifm_shape = operands.at(ifm_index).shape().asFeature(layout);
297 const auto kh = node.param().kh;
298 const auto kw = node.param().kw;
299 const auto stride = node.param().stride;
301 ir::calculatePadding(node.param().padding, ifm_shape, ofm_shape, stride, kw, kh);
303 VERBOSE(Pool2DParam) << "IFM_H: " << ifm_shape.H << std::endl;
304 VERBOSE(Pool2DParam) << "IFM_W: " << ifm_shape.W << std::endl;
305 VERBOSE(Pool2DParam) << "OFM_H: " << ofm_shape.H << std::endl;
306 VERBOSE(Pool2DParam) << "OFM_W: " << ofm_shape.W << std::endl;
307 VERBOSE(Pool2DParam) << "KER_H: " << kh << std::endl;
308 VERBOSE(Pool2DParam) << "KER_W: " << kw << std::endl;
309 VERBOSE(Pool2DParam) << "STRIDE_H: " << stride.vertical << std::endl;
310 VERBOSE(Pool2DParam) << "STRIDE_W: " << stride.horizontal << std::endl;
311 VERBOSE(Pool2DParam) << "PAD(T): " << padding.top << std::endl;
312 VERBOSE(Pool2DParam) << "PAD(B): " << padding.bottom << std::endl;
313 VERBOSE(Pool2DParam) << "PAD(L): " << padding.left << std::endl;
314 VERBOSE(Pool2DParam) << "PAD(R): " << padding.right << std::endl;
316 auto ofm_tensor = tensor_reg->getAclTensor(ofm_index).get();
317 auto ifm_tensor = tensor_reg->getAclTensor(ifm_index).get();
319 ::arm_compute::PoolingLayerInfo info{
320 pooling_type, ::arm_compute::Size2D{kw, kh}, ifm_tensor->info()->data_layout(),
321 asPadStrideInfo(padding, stride), true /* exclude_padding */};
323 auto fn = generateLayer<T_ACLLayer>(ifm_tensor->handle(), ofm_tensor->handle(), info);
328 } // namespace acl_common
329 } // namespace backend
332 #endif // __ONERT_BACKEND_ACL_COMMON_ACL_KERNEL_GEN_H_