/*
 * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
 * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef __NNFW_CKER_SOFTMAX_H__
#define __NNFW_CKER_SOFTMAX_H__

#include "cker/Shape.h"
#include "cker/Utils.h"
#include "cker/Types.h"
#include "cker/eigen/Utils.h"

#include <Eigen/Core>
#include <fixedpoint/fixedpoint.h>
#include <algorithm>
#include <cassert>
#include <cmath>
#include <limits>

namespace nnfw
{
namespace cker
{

namespace reference
{

// Note: this Softmax function supports any number of dimensions.
inline void Softmax(const SoftmaxParams &params, const Shape &input_shape, const float *input_data,
                    const Shape &output_shape, float *output_data)
{
  const int trailing_dim = input_shape.DimensionsCount() - 1;
  const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
  const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);

  for (int i = 0; i < outer_size; ++i)
  {
    // Find the max element value, which we'll use to ensure numerical stability
    // by taking advantage of the following equality:
    // exp(x[i])/sum(exp(x[i])) == exp(x[i]+C)/sum(exp(x[i]+C))
    float max = std::numeric_limits<float>::lowest();
    for (int c = 0; c < depth; ++c)
    {
      max = std::max(max, input_data[i * depth + c]);
    }

    // Compute the denominator: the sum of exp(beta * (x - max)) over the depth axis.
    float sum = 0.f;
    for (int c = 0; c < depth; ++c)
    {
      sum += std::exp((input_data[i * depth + c] - max) * static_cast<float>(params.beta));
    }

    // Compute the normalized outputs.
    for (int c = 0; c < depth; ++c)
    {
      output_data[i * depth + c] =
        std::exp((input_data[i * depth + c] - max) * static_cast<float>(params.beta)) / sum;
    }
  }
}

} // namespace reference

// Performs softmax along the input of size (input_size * batch_size).
inline void Softmax(const float *in, const int input_size, const int batch_size, const float beta,
                    float *out)
{
  assert(input_size > 0);

  // For each batch.
  for (int b = 0; b < batch_size; b++)
  {
    // Find the max coeff.
    float max_coeff = in[0];
    for (int i = 1; i < input_size; i++)
    {
      if (in[i] > max_coeff)
      {
        max_coeff = in[i];
      }
    }

    // Compute the exp of the (max-shifted) inputs and accumulate their sum.
    float exp_sum = 0.f;
    for (int i = 0; i < input_size; i++)
    {
      out[i] = std::exp((in[i] - max_coeff) * beta);
      exp_sum += out[i];
    }

    // Divide by the sum of exps.
    const float reciprocal_sum_exp = 1.f / exp_sum;
    for (int i = 0; i < input_size; i++)
    {
      out[i] *= reciprocal_sum_exp;
    }

    // Advance in and out pointers for the next batch.
    in += input_size;
    out += input_size;
  }
}
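
// Illustrative usage of the overload above (a sketch, not part of the API):
// softmax over each row of a 2 x 3 buffer with beta = 1.
//
//   float in[6] = {1.f, 2.f, 3.f, 1.f, 1.f, 1.f};
//   float out[6];
//   nnfw::cker::Softmax(in, /*input_size=*/3, /*batch_size=*/2, /*beta=*/1.f, out);
//   // Each group of 3 outputs is now a probability distribution summing to 1.
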
inline void Softmax(const SoftmaxParams &params, const Shape &input_shape, const float *input_data,
                    const Shape &output_shape, float *output_data)
{
  // Validate that the shapes of the input and output are the same.
  MatchingFlatSize(input_shape, output_shape);

  const auto in_mat = MapAsMatrixWithLastDimAsRows(input_data, input_shape);
  auto out_mat = MapAsMatrixWithLastDimAsRows(output_data, output_shape);
  // Compute the exponential first, removing the max coefficient for numerical
  // stability.
  out_mat = (in_mat.rowwise() - in_mat.colwise().maxCoeff()).array() * params.beta;
  // We are separating out the exp function so that exp can be vectorized.
  out_mat = out_mat.array().exp();
  // Normalize to get the activations.
  Eigen::Array<float, 1, Eigen::Dynamic> scale = out_mat.array().colwise().sum().inverse();
  out_mat.array().rowwise() *= scale;
}
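
// Note on the overload above: MapAsMatrixWithLastDimAsRows lays each softmax
// instance (the tensor's last dimension) out as one column of the Eigen
// matrix, which is why the max and sum are taken colwise() and the scale is
// broadcast back rowwise().
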
inline void Softmax(const SoftmaxParams &params, const Shape &input_shape,
                    const uint8_t *input_data, const Shape &output_shape, uint8_t *output_data)
{
  const int32_t input_beta_multiplier = params.input_multiplier;
  const int32_t input_beta_left_shift = params.input_left_shift;
  const int diff_min = params.diff_min;
  // The representation chosen for the input to the exp() function is Q5.26.
  // We need to leave extra space since values that we skip might be as large
  // as -32 before multiplying by input_beta_multiplier, and therefore as
  // large as -16 afterwards. Note that exp(-8) is definitely not insignificant
  // to accumulation, but exp(-16) definitely is.
  static const int kScaledDiffIntegerBits = 5;
  static const int kAccumulationIntegerBits = 12;
  using FixedPointScaledDiff = gemmlowp::FixedPoint<int32_t, kScaledDiffIntegerBits>;
  using FixedPointAccum = gemmlowp::FixedPoint<int32_t, kAccumulationIntegerBits>;
  using FixedPoint0 = gemmlowp::FixedPoint<int32_t, 0>;

  const int trailing_dim = input_shape.DimensionsCount() - 1;
  const int outer_size = MatchingFlatSizeSkipDim(input_shape, trailing_dim, output_shape);
  const int depth = MatchingDim(input_shape, trailing_dim, output_shape, trailing_dim);
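
  // The tensor is processed as a flattened [outer_size, depth] array: one
  // independent softmax is computed over each depth-sized row.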
  for (int i = 0; i < outer_size; ++i)
  {
    uint8_t max_in_row = 0;
    for (int c = 0; c < depth; ++c)
    {
      max_in_row = std::max(max_in_row, input_data[i * depth + c]);
    }

    FixedPointAccum sum_of_exps = FixedPointAccum::Zero();
    for (int c = 0; c < depth; ++c)
    {
      int32_t input_diff = static_cast<int32_t>(input_data[i * depth + c]) - max_in_row;
      if (input_diff >= diff_min)
      {
        const int32_t input_diff_rescaled = MultiplyByQuantizedMultiplierGreaterThanOne(
          input_diff, input_beta_multiplier, input_beta_left_shift);
        const FixedPointScaledDiff scaled_diff_f8 =
          FixedPointScaledDiff::FromRaw(input_diff_rescaled);
        sum_of_exps = sum_of_exps + gemmlowp::Rescale<kAccumulationIntegerBits>(
                        exp_on_negative_values(scaled_diff_f8));
      }
    }
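
    // sum_of_exps now holds the softmax denominator in Q12.19 fixed point.
    // Each exp term lies in (0, 1], so the 12 integer bits leave headroom to
    // accumulate up to a few thousand terms along the depth axis.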
    int32_t fixed_sum_of_exps = sum_of_exps.raw();
    int headroom_plus_one = CountLeadingZeros(static_cast<uint32_t>(fixed_sum_of_exps));
    // This is the number of bits to the left of the binary point above 1.0.
    // Consider fixed_sum_of_exps=1.25. In that case shifted_scale=0.8, and
    // no later adjustment will be needed.
    int num_bits_over_unit = kAccumulationIntegerBits - headroom_plus_one;
    int32_t shifted_sum_minus_one =
      static_cast<int32_t>((static_cast<uint32_t>(fixed_sum_of_exps) << headroom_plus_one) -
                           (static_cast<uint32_t>(1) << 31));

    FixedPoint0 shifted_scale =
      one_over_one_plus_x_for_x_in_0_1(FixedPoint0::FromRaw(shifted_sum_minus_one));
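
    // In other words: the sum was normalized into [1, 2) by shifting out its
    // headroom and rewritten as 1 + x with x in [0, 1) (Q0.31), so
    // one_over_one_plus_x_for_x_in_0_1 yields 1/sum up to a power of two; that
    // power of two is reapplied through num_bits_over_unit below.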

    for (int c = 0; c < depth; ++c)
    {
      int32_t input_diff = static_cast<int32_t>(input_data[i * depth + c]) - max_in_row;
      if (input_diff >= diff_min)
      {
        const int32_t input_diff_rescaled = MultiplyByQuantizedMultiplierGreaterThanOne(
          input_diff, input_beta_multiplier, input_beta_left_shift);
        const FixedPointScaledDiff scaled_diff_f8 =
          FixedPointScaledDiff::FromRaw(input_diff_rescaled);

        FixedPoint0 exp_in_0 = exp_on_negative_values(scaled_diff_f8);
        int32_t unsat_output = gemmlowp::RoundingDivideByPOT((shifted_scale * exp_in_0).raw(),
                                                             num_bits_over_unit + 31 - 8);
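        // Note: the product above (shifted_scale * exp_in_0) is Q0.31;
        // dividing its raw value by 2^(num_bits_over_unit + 31 - 8) reapplies
        // the normalization shift and rescales to the 8-bit output range.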

        output_data[i * depth + c] = static_cast<uint8_t>(
          std::max(std::min(unsat_output, static_cast<int32_t>(255)), static_cast<int32_t>(0)));
      }
      else
      {
        // Inputs more than diff_min below the row max contribute negligibly
        // to the sum of exps, so their probability is flushed to zero.
        output_data[i * depth + c] = 0;
      }
    }
  }
}

} // namespace cker
} // namespace nnfw

#endif // __NNFW_CKER_SOFTMAX_H__