onert-micro/luci-interpreter/pal/common/PALMulCommon.h

   1 /*
   2  * Copyright (c) 2021 Samsung Electronics Co., Ltd. All Rights Reserved
   3  * Copyright 2019 The TensorFlow Authors. All Rights Reserved.
   4  *
   5  * Licensed under the Apache License, Version 2.0 (the "License");
   6  * you may not use this file except in compliance with the License.
   7  * You may obtain a copy of the License at
   8  *
   9  *    http://www.apache.org/licenses/LICENSE-2.0
  10  *
  11  * Unless required by applicable law or agreed to in writing, software
  12  * distributed under the License is distributed on an "AS IS" BASIS,
  13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14  * See the License for the specific language governing permissions and
  15  * limitations under the License.
  16  */
  17
  18 #ifndef LUCI_INTERPRETER_PAL_MUL_COMMON_H
  19 #define LUCI_INTERPRETER_PAL_MUL_COMMON_H
  20
  21 #include "Params.h"
  22 #include "PALUtils.h"
  23 #include "ProcessBroadcastShapes.h"
  24
  25 namespace luci_interpreter_pal
  26 {
  27 template <typename T>
  28 inline void Mul(const ArithmeticParams &params, const int flat_size, const T *input1_data,
  29                 const T *input2_data, T *output_data)
  30 {
  31   T activation_min, activation_max;
  32   getActivationParams(params, &activation_min, &activation_max);
  33
  34   for (int i = 0; i < flat_size; ++i)
  35     output_data[i] =
  36       std::min(std::max(input1_data[i] * input2_data[i], activation_min), activation_max);
  37 }
  38
  39 template <typename T>
  40 inline void MulScalar(const ArithmeticParams &params, const int flat_size, const T *input_data,
  41                       const T scalar_value, T *output_data)
  42 {
  43   T activation_min, activation_max;
  44   getActivationParams(params, &activation_min, &activation_max);
  45
  46   for (int i = 0; i < flat_size; ++i)
  47     output_data[i] =
  48       std::min(std::max(input_data[i] * scalar_value, activation_min), activation_max);
  49 }
  50
  51 template <typename T>
  52 inline void
  53 BroadcastMul4DSlow(const ArithmeticParams &params,
  54                    const luci_interpreter::RuntimeShape &input1_shape, const T *input1_data,
  55                    const luci_interpreter::RuntimeShape &input2_shape, const T *input2_data,
  56                    const luci_interpreter::RuntimeShape &output_shape, T *output_data)
  57 {
  58   const int flat_size = input1_shape.flatSize();
  59
  60   if (params.broadcast_category == BroadcastableOpCategory::kScalarFirstBroadcast)
  61   {
  62     return MulScalar(params, flat_size, input2_data, input1_data[0], output_data);
  63   }
  64   else if (params.broadcast_category == BroadcastableOpCategory::kScalarSecondBroadcast)
  65   {
  66     return MulScalar(params, flat_size, input1_data, input2_data[0], output_data);
  67   }
  68
  69   NdArrayDesc<4> desc1;
  70   NdArrayDesc<4> desc2;
  71   NdArrayDescsForElementwiseBroadcast(input1_shape, input2_shape, &desc1, &desc2);
  72   const luci_interpreter::RuntimeShape extended_output_shape =
  73     luci_interpreter::RuntimeShape::extendedShape(4, output_shape);
  74
  75   T activation_min, activation_max;
  76   getActivationParams(params, &activation_min, &activation_max);
  77
  78   // In Tensorflow, the dimensions are canonically named (batch_number, row,
  79   // col, channel), with extents (batches, height, width, depth), with the
  80   // trailing dimension changing most rapidly (channels has the smallest stride,
  81   // typically 1 element).
  82   //
  83   // In generated C code, we store arrays with the dimensions reversed. The
  84   // first dimension has smallest stride.
  85   //
  86   // We name our variables by their Tensorflow convention, but generate C code
  87   // nesting loops such that the innermost loop has the smallest stride for the
  88   // best cache behavior.
  89   for (int b = 0; b < extended_output_shape.dims(0); ++b)
  90   {
  91     for (int y = 0; y < extended_output_shape.dims(1); ++y)
  92     {
  93       for (int x = 0; x < extended_output_shape.dims(2); ++x)
  94       {
  95         for (int c = 0; c < extended_output_shape.dims(3); ++c)
  96         {
  97           const int output_data_offset =
  98             ((b * extended_output_shape.dims(1) + y) * extended_output_shape.dims(2) + x) *
  99               extended_output_shape.dims(3) +
 100             c;
 101
 102           output_data[output_data_offset] =
 103             std::min(std::max(input1_data[subscriptToIndex(desc1, b, y, x, c)] *
 104                                 input2_data[subscriptToIndex(desc2, b, y, x, c)],
 105                               activation_min),
 106                      activation_max);
 107         }
 108       }
 109     }
 110   }
 111 }
 112
 113 } // namespace luci_interpreter_pal
 114
  115 #endif // LUCI_INTERPRETER_PAL_MUL_COMMON_H