onert-micro/luci-interpreter/pal/common/PALLogistic.h

   1 /*
   2  * Copyright (c) 2023 Samsung Electronics Co., Ltd. All Rights Reserved
   3  * Copyright 2020 The TensorFlow Authors. All Rights Reserved.
   4  *
   5  * Licensed under the Apache License, Version 2.0 (the "License");
   6  * you may not use this file except in compliance with the License.
   7  * You may obtain a copy of the License at
   8  *
   9  *    http://www.apache.org/licenses/LICENSE-2.0
  10  *
  11  * Unless required by applicable law or agreed to in writing, software
  12  * distributed under the License is distributed on an "AS IS" BASIS,
  13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14  * See the License for the specific language governing permissions and
  15  * limitations under the License.
  16  */
  17
  18 #ifndef LUCI_INTERPRETER_PAL_LOGISTIC_H
  19 #define LUCI_INTERPRETER_PAL_LOGISTIC_H
  20
  21 #include "Params.h"
  22 #include "PALUtils.h"
  23
  24 namespace luci_interpreter_pal
  25 {
  26
  /**
   * Element-wise logistic (sigmoid) over a flat float buffer.
   *
   * Rationale for the piecewise evaluation:
   *  - it gives enough precision for float;
   *  - it works around an embedded chipset whose exp() does not return inf on
   *    overflow as IEEE 754 expects;
   *  - it is faster and matches the optimized kernels
   *    (see scalar_logistic_op<float>).
   *
   * @param flat_size    Number of elements to process; nothing is written when
   *                     it is zero or negative.
   * @param input_data   Source buffer, read at indices [0, flat_size).
   * @param output_data  Destination buffer, written at indices [0, flat_size).
   */
  inline void Logistic(const int flat_size, const float *input_data, float *output_data)
  {
    // Inputs above the upper bound produce exactly 1; inputs below the lower
    // bound are evaluated as exp(x) instead of the full quotient.
    constexpr float kUpperCutoff = 16.619047164916992188f;
    constexpr float kLowerCutoff = -9.f;

    for (int idx = 0; idx < flat_size; ++idx)
    {
      const float x = input_data[idx];

      if (x > kUpperCutoff)
      {
        output_data[idx] = 1.0f;
        continue;
      }

      if (x < kLowerCutoff)
      {
        output_data[idx] = std::exp(x);
        continue;
      }

      // General case (also reached when neither comparison holds).
      output_data[idx] = 1.f / (1.f + std::exp(-x));
    }
  }
  59
  /**
   * Logistic (sigmoid) for int8 quantized data, evaluated in float.
   *
   * Each element is dequantized as (q - input_zero_point) * input_scale, passed
   * through a piecewise sigmoid, and requantized as
   * result / output_scale + output_zero_point.
   *
   * The requantized value is saturated to the int8 range before the narrowing
   * cast. Without saturation a sigmoid result of 1.0 with, for example,
   * output_scale = 1/256 and output_zero_point = -128 yields 128, which is not
   * representable in int8_t; converting such a float to int8_t is undefined
   * behaviour and in practice wraps the largest output to the smallest one.
   *
   * Rationale for the piecewise evaluation:
   *  - it gives enough precision for float;
   *  - it works around an embedded chipset whose exp() does not return inf on
   *    overflow as IEEE 754 expects;
   *  - it is faster and matches the optimized kernels
   *    (see scalar_logistic_op<float>).
   *
   * @param flat_size          Number of elements to process.
   * @param input_data         Quantized source buffer, read at [0, flat_size).
   * @param input_scale        Scale used to dequantize the input.
   * @param input_zero_point   Zero point subtracted from each input value.
   * @param output_data        Quantized destination buffer, written at
   *                           [0, flat_size).
   * @param output_scale       Scale the float result is divided by.
   * @param output_zero_point  Zero point added after scaling the result.
   */
  inline void Logistic(const int flat_size, const int8_t *input_data, float input_scale,
                       int input_zero_point, int8_t *output_data, float output_scale,
                       int output_zero_point)
  {
    const float cutoff_upper = 16.619047164916992188f;
    const float cutoff_lower = -9.f;

    // Bounds of the int8 output range, as floats, used for saturation.
    const float quantized_min = -128.f;
    const float quantized_max = 127.f;

    for (int i = 0; i < flat_size; i++)
    {
      // Dequantize.
      const float val = static_cast<float>((input_data[i] - input_zero_point) * input_scale);

      float result;
      if (val > cutoff_upper)
      {
        result = 1.0f;
      }
      else if (val < cutoff_lower)
      {
        result = std::exp(val);
      }
      else
      {
        result = 1.f / (1.f + std::exp(-val));
      }

      // Requantize, saturating so the cast below never sees an out-of-range
      // value. The cast itself still truncates toward zero, so in-range results
      // are unchanged.
      float requantized = result / output_scale + output_zero_point;
      if (requantized > quantized_max)
      {
        requantized = quantized_max;
      }
      else if (requantized < quantized_min)
      {
        requantized = quantized_min;
      }

      output_data[i] = static_cast<int8_t>(requantized);
    }
  }
  97
  98 inline void Logistic(int32_t input_multiplier, int32_t input_left_shift, int32_t input_size,
  99                      const int16_t *ptr_input_data, int16_t *ptr_output_data)
 100 {
 101   // We use the LUT for sigmoid and take into account, that
 102   // tanh(x) = 2*sigmoid(2*x) - 1
 103
 104   // We scale by 3/4 to expand range [-8,8]->[-10.7,10.7].
 105   // In case of general parameter scale, multiplier 3 is taken into account
 106   // in TanhPrepare function and it is included in
 107   // input_multiplier already.
 108   if (input_multiplier == 0)
 109   { // power of two case
 110     input_multiplier = 3 << input_left_shift;
 111     input_left_shift = 0;
 112   }
 113
 114   int32_t round = (input_left_shift > 0) ? 1 << (input_left_shift - 1) : 0;
 115
 116   for (int i = 0; i < input_size; ++i, ptr_input_data++, ptr_output_data++)
 117   {
 118     int32_t input_data = ((*ptr_input_data) * input_multiplier + round) >> input_left_shift;
 119
 120     // We do interpolation on unsigned values.
 121     uint32_t abs_input_data = abs(input_data);
 122
 123     // We divide by 2 power of 9, because
 124     // we need to divide by 2 in power of 7 for
 125     // the input conversion + 1/4 from the scale above.
 126
 127     // Define uh as uint32_t type not to make this function overflow.
 128     uint32_t uh = abs_input_data >> 9;
 129     uint32_t result;
 130
 131     if (uh >= 255)
 132     {
 133       // Saturate to maximum.
 134       result = 0x7FFF << 10;
 135     }
 136     else
 137     {
 138       uint32_t ua = sigmoid_table_uint16[uh];
 139       uint32_t ub = sigmoid_table_uint16[uh + 1];
 140       uint32_t ut = abs_input_data & 0x1ff;
 141       // Interpolation is done using the fractional bit.
 142       result = (ua << 9) + ut * (ub - ua);
 143     }
 144
 145     result = (input_data >= 0) ? (result + (1 << 9)) : ((1 << (16 + 9)) - result + (1 << 9) - 1);
 146
 147     // Back to 16-bit.
 148     result >>= 10;
 149
 150     *ptr_output_data = result;
 151   }
 152 }
 153
 154 } // namespace luci_interpreter_pal
 155
 156 #endif // LUCI_INTERPRETER_PAL_LOGISTIC_H