compute/cker/include/cker/PortableTensorUtils.h

   1 /*
   2  * Copyright (c) 2020 Samsung Electronics Co., Ltd. All Rights Reserved
   3  * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
   4  *
   5  * Licensed under the Apache License, Version 2.0 (the "License");
   6  * you may not use this file except in compliance with the License.
   7  * You may obtain a copy of the License at
   8  *
   9  *      http://www.apache.org/licenses/LICENSE-2.0
  10  *
  11  * Unless required by applicable law or agreed to in writing, software
  12  * distributed under the License is distributed on an "AS IS" BASIS,
  13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14  * See the License for the specific language governing permissions and
  15  * limitations under the License.
  16  */
  17
  18 #ifndef __NNFW_CKER_PORTABLE_TENSOR_UTILS_H__
  19 #define __NNFW_CKER_PORTABLE_TENSOR_UTILS_H__
  20
  21 #include "cker/Types.h"
  22 #include "cker/neon/neon_check.h"
  23 #include <ruy/context.h>
  24
  25 #include <cstring>
  26 #include <cmath>
  27
  28 namespace nnfw
  29 {
  30 namespace cker
  31 {
  32
  33 class ActivationFunctor
  34 {
  35 public:
  36   explicit ActivationFunctor(FusedActivationFunctionType act) : act_(act) {}
  37
  38   float operator()(float a) const
  39   {
  40     switch (act_)
  41     {
  42       case FusedActivationFunctionType::kNone:
  43         return a;
  44       case FusedActivationFunctionType::kRelu:
  45         return a < 0.f ? 0.f : a;
  46       case FusedActivationFunctionType::kRelu6:
  47         return std::max(0.f, std::min(a, 6.f));
  48       case FusedActivationFunctionType::kTanh:
  49         return std::tanh(a);
  50       case FusedActivationFunctionType::kSigmoid:
  51         return 1.0f / (1.0f + std::exp(-a));
  52       default:
  53         // TODO(aselle): More informative fatal error!
  54         exit(1);
  55     }
  56   }
  57
  58 private:
  59   FusedActivationFunctionType act_;
  60 };
  61
  62 template <typename T>
  63 void PortableCwiseClipping(T *vector, const int v_size, const T clipping_value)
  64 {
  65   for (int i = 0; i < v_size; i++)
  66   {
  67     vector[i] = std::max(std::min(clipping_value, vector[i]), static_cast<T>(-clipping_value));
  68   }
  69 }
  70
  71 inline void PortableVectorBatchVectorAssign(const float *vector, int v_size, int n_batch,
  72                                             float *batch_vector)
  73 {
  74   for (int b = 0; b < n_batch; b++)
  75   {
  76     memcpy(batch_vector + b * v_size, vector, v_size * sizeof(float));
  77   }
  78 }
  79
  80 inline void PortableVectorBatchVectorAdd(const float *vector, int v_size, int n_batch,
  81                                          float *batch_vector)
  82 {
  83   for (int b = 0; b < n_batch; b++)
  84   {
  85     for (int i = 0; i < v_size; ++i)
  86     {
  87       batch_vector[i] += vector[i];
  88     }
  89     batch_vector += v_size;
  90   }
  91 }
  92
  93 inline bool PortableIsZeroVector(const float *vector, int v_size)
  94 {
  95   for (int i = 0; i < v_size; ++i)
  96   {
  97     if (*vector++ != 0.0f)
  98       return false;
  99   }
 100   return true;
 101 }
 102
 103 inline void PortableApplyActivationToVector(const float *vector, int v_size,
 104                                             FusedActivationFunctionType activation, float *result)
 105 {
 106   auto activation_func = ActivationFunctor(activation);
 107   for (int v = 0; v < v_size; v++)
 108   {
 109     *result++ = (activation_func)(*vector++);
 110   }
 111 }
 112
 113 inline void PortableSub1Vector(const float *vector, int v_size, float *result)
 114 {
 115   for (int v = 0; v < v_size; v++)
 116   {
 117     *result++ = 1.0f - *vector++;
 118   }
 119 }
 120
 121 inline void PortableSymmetricQuantizeFloats(const float *values, const int size,
 122                                             int8_t *quantized_values, float *min_value,
 123                                             float *max_value, float *scaling_factor)
 124 {
 125   auto minmax = std::minmax_element(values, values + size);
 126   *min_value = *minmax.first;
 127   *max_value = *minmax.second;
 128   const int kScale = 127;
 129   const float range = std::max(std::abs(*min_value), std::abs(*max_value));
 130   if (range == 0)
 131   {
 132     memset(quantized_values, 0, size * sizeof(int8_t));
 133     *scaling_factor = 1;
 134     return;
 135   }
 136   *scaling_factor = range / kScale;
 137   const float scaling_factor_inv = kScale / range;
 138   for (int i = 0; i < size; ++i)
 139   {
 140     const int32_t quantized_value =
 141       static_cast<int32_t>(std::round(values[i] * scaling_factor_inv));
 142     // Clamp: just in case some odd numeric offset.
 143     quantized_values[i] = std::min(kScale, std::max(-kScale, quantized_value));
 144   }
 145 }
 146
 147 inline void PortableAsymmetricQuantizeFloats(const float *values, const int size,
 148                                              int8_t *quantized_values, float *scaling_factor,
 149                                              int32_t *offset)
 150 {
 151   /* Copied from TensorFlow PortableAsymmetricQuantizeFloats */
 152   const int32_t kMinScale = -128;
 153   const int32_t kMaxScale = 127;
 154   const double qmin_double = kMinScale;
 155   const double qmax_double = kMaxScale;
 156   const auto minmax = std::minmax_element(values, values + size);
 157   const double rmin = static_cast<double>(std::min(0.0f, *minmax.first));
 158   const double rmax = static_cast<double>(std::max(0.0f, *minmax.second));
 159   if (rmin == rmax)
 160   {
 161     memset(quantized_values, 0, size * sizeof(int8_t));
 162     *scaling_factor = 1;
 163     *offset = 0;
 164     return;
 165   }
 166   else
 167   {
 168     double scale = (rmax - rmin) / (qmax_double - qmin_double);
 169     const double zero_point_from_min = qmin_double - rmin / scale;
 170     const double zero_point_from_max = qmax_double - rmax / scale;
 171     const double zero_point_from_min_error = std::abs(qmin_double) + std::abs(rmin / scale);
 172     const double zero_point_from_max_error = std::abs(qmax_double) + std::abs(rmax / scale);
 173     const double zero_point_double = zero_point_from_min_error < zero_point_from_max_error
 174                                        ? zero_point_from_min
 175                                        : zero_point_from_max;
 176     int8_t nudged_zero_point = 0;
 177     if (zero_point_double <= qmin_double)
 178     {
 179       nudged_zero_point = kMinScale;
 180     }
 181     else if (zero_point_double >= qmax_double)
 182     {
 183       nudged_zero_point = kMaxScale;
 184     }
 185     else
 186     {
 187       nudged_zero_point = static_cast<int8_t>(round(zero_point_double));
 188     }
 189     *scaling_factor = scale;
 190     *offset = nudged_zero_point;
 191   }
 192   const float scaling_factor_inv = 1.0f / *scaling_factor;
 193   for (int i = 0; i < size; ++i)
 194   {
 195     const int32_t quantized_value =
 196       static_cast<int32_t>(std::round(*offset + values[i] * scaling_factor_inv));
 197     quantized_values[i] = std::min(kMaxScale, std::max(kMinScale, quantized_value));
 198   }
 199 }
 200
 201 inline void PortableMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix,
 202                                                         const int m_rows, const int m_cols,
 203                                                         const int8_t *__restrict__ vectors,
 204                                                         const float *scaling_factors, int n_batch,
 205                                                         float *__restrict__ result,
 206                                                         int result_stride)
 207 {
 208   int batch, row, col;
 209   for (batch = 0; batch < n_batch; ++batch, vectors += m_cols)
 210   {
 211     const float batch_scaling_factor = scaling_factors[batch];
 212     // Get the address of the first row.
 213     const int8_t *row_ptr = matrix;
 214     for (row = 0; row < m_rows; ++row, result += result_stride)
 215     {
 216       // Initialize the dot product sum for the row to 0.
 217       int32_t dotprod = 0;
 218 #if defined(__GNUC__)
 219       // Prefetch the row to cache.
 220       __builtin_prefetch(row_ptr, 0 /* prefetch for read */, 3 /* temporal locality */);
 221 #endif
 222       for (col = 0; col < m_cols; ++col, ++row_ptr)
 223       {
 224         dotprod += (*row_ptr) * (vectors[col]);
 225       } // for col
 226       *result += (dotprod * batch_scaling_factor);
 227     } // for row
 228   }   // for batch
 229 }
 230
 231 inline void PortableMatrixBatchVectorMultiplyAccumulate(const int8_t *__restrict__ matrix,
 232                                                         const int m_rows, const int m_cols,
 233                                                         const int8_t *__restrict__ vector,
 234                                                         const float *scaling_factors, int n_batch,
 235                                                         int32_t *, float *__restrict__ result,
 236                                                         int result_stride, ruy::Context *)
 237 {
 238   PortableMatrixBatchVectorMultiplyAccumulate(matrix, m_rows, m_cols, vector, scaling_factors,
 239                                               n_batch, result, result_stride);
 240 }
 241
 242 inline void PortableMatrixBatchVectorMultiplyAccumulate(const float *matrix, int m_rows, int m_cols,
 243                                                         const float *vector, int n_batch,
 244                                                         float *result, int result_stride)
 245 {
 246   float *result_in_batch = result;
 247   for (int b = 0; b < n_batch; b++)
 248   {
 249     const float *matrix_ptr = matrix;
 250     for (int r = 0; r < m_rows; r++)
 251     {
 252       float dot_prod = 0.0f;
 253       const float *vector_in_batch = vector + b * m_cols;
 254       for (int c = 0; c < m_cols; c++)
 255       {
 256         dot_prod += *matrix_ptr++ * *vector_in_batch++;
 257       }
 258       *result_in_batch += dot_prod;
 259       result_in_batch += result_stride;
 260     }
 261   }
 262 }
 263
 264 inline void PortableMeanStddevNormalization(const float *input_vector, float *output_vector,
 265                                             int v_size, int n_batch)
 266 {
 267   for (int batch = 0; batch < n_batch; ++batch)
 268   {
 269     float sum = 0.0f;
 270     for (int i = 0; i < v_size; ++i)
 271     {
 272       sum += input_vector[i];
 273     }
 274     const float mean = sum / v_size;
 275     float sum_diff_sq = 0.0f;
 276     for (int i = 0; i < v_size; ++i)
 277     {
 278       const float diff = input_vector[i] - mean;
 279       sum_diff_sq += diff * diff;
 280     }
 281     const float variance = sum_diff_sq / v_size;
 282     constexpr float kNormalizationConstant = 1e-8f;
 283     const float stddev_inv = 1.0f / std::sqrt(variance + kNormalizationConstant);
 284     for (int i = 0; i < v_size; ++i)
 285     {
 286       output_vector[i] = (input_vector[i] - mean) * stddev_inv;
 287     }
 288     input_vector += v_size;
 289     output_vector += v_size;
 290   }
 291 }
 292
 293 inline void PortableZeroVector(float *vector, int v_size) { std::fill_n(vector, v_size, 0); }
 294
 295 } // namespace cker
 296 } // namespace nnfw
 297
 298 #endif // __NNFW_CKER_PORTABLE_TENSOR_UTILS_H__