/*
 * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved
 * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
18 #ifndef LUCI_INTERPRETER_PAL_ADD_COMMON_H
19 #define LUCI_INTERPRETER_PAL_ADD_COMMON_H
23 #include "ProcessBroadcastShapes.h"
25 namespace luci_interpreter_pal
28 // TODO: check if there real activation value
30 inline void Add(const ArithmeticParams ¶ms, const int flat_size, const T *input1_data,
31 const T *input2_data, T *output_data)
33 T activation_min, activation_max;
34 getActivationParams(params, &activation_min, &activation_max);
36 for (int i = 0; i < flat_size; ++i)
38 std::min(std::max(input1_data[i] + input2_data[i], activation_min), activation_max);
43 BroadcastAdd4DSlow(const ArithmeticParams ¶ms,
44 const luci_interpreter::RuntimeShape &input1_shape, const T *input1_data,
45 const luci_interpreter::RuntimeShape &input2_shape, const T *input2_data,
46 const luci_interpreter::RuntimeShape &output_shape, T *output_data)
50 NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);
51 const luci_interpreter::RuntimeShape extended_output_shape =
52 luci_interpreter::RuntimeShape::extendedShape(4, output_shape);
54 T activation_min, activation_max;
55 getActivationParams(params, &activation_min, &activation_max);
57 // In Tensorflow, the dimensions are canonically named (batch_number, row,
58 // col, channel), with extents (batches, height, width, depth), with the
59 // trailing dimension changing most rapidly (channels has the smallest stride,
60 // typically 1 element).
62 // In generated C code, we store arrays with the dimensions reversed. The
63 // first dimension has smallest stride.
65 // We name our variables by their Tensorflow convention, but generate C code
66 // nesting loops such that the innermost loop has the smallest stride for the
67 // best cache behavior.
68 for (int b = 0; b < extended_output_shape.dims(0); ++b)
70 for (int y = 0; y < extended_output_shape.dims(1); ++y)
72 for (int x = 0; x < extended_output_shape.dims(2); ++x)
74 for (int c = 0; c < extended_output_shape.dims(3); ++c)
76 const int output_data_offset =
77 ((b * extended_output_shape.dims(1) + y) * extended_output_shape.dims(2) + x) *
78 extended_output_shape.dims(3) +
81 output_data[output_data_offset] =
82 std::min(std::max(input1_data[subscriptToIndex(desc1, b, y, x, c)] +
83 input2_data[subscriptToIndex(desc2, b, y, x, c)],
92 } // namespace luci_interpreter_pal
94 #endif // LUCI_INTERPRETER_PAL_ADD_COMMON_H