compute/cker/include/cker/operation/FullyConnected.h

   1 /*
   2  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
   3  * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
   4  *
   5  * Licensed under the Apache License, Version 2.0 (the "License");
   6  * you may not use this file except in compliance with the License.
   7  * You may obtain a copy of the License at
   8  *
   9  *      http://www.apache.org/licenses/LICENSE-2.0
  10  *
  11  * Unless required by applicable law or agreed to in writing, software
  12  * distributed under the License is distributed on an "AS IS" BASIS,
  13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14  * See the License for the specific language governing permissions and
  15  * limitations under the License.
  16  */
  17
  18 #ifndef __NNFW_CKER_FULLY_CONNECTED_H__
  19 #define __NNFW_CKER_FULLY_CONNECTED_H__
  20
  21 #include <ruy/context.h>
  22 #include "cker/operation/FullyConnectedDense16x1.h"
  23 #include "cker/operation/FullyConnectedSparse16x1.h"
  24 #include "cker/Shape.h"
  25 #include "cker/Types.h"
  26 #include "cker/Utils.h"
  27 #include "cker/TensorUtils.h"
  28 #include "cker/neon/neon_check.h"
  29
  30 namespace nnfw
  31 {
  32 namespace cker
  33 {
  34
  35 class FCTempArena
  36 {
  37 public:
  38   FCTempArena(void) : prepared(false), input_quantized(), scaling_factors(), accum_scratch()
  39   {
  40     // DO NOTHING
  41   }
  42
  43   void prepare(const Shape &input_shape, const Shape &weights_shape)
  44   {
  45     auto input_size = input_shape.FlatSize();
  46     input_quantized.resize(input_size);
  47
  48     assert(weights_shape.DimensionsCount() == 2);
  49     int batch_size = input_size / weights_shape.Dims(1);
  50     scaling_factors.resize(batch_size);
  51     prepared = true;
  52   }
  53
  54 public:
  55   bool prepared;
  56   std::vector<int8_t> input_quantized;
  57   std::vector<float> scaling_factors;
  58   std::vector<int32_t> accum_scratch;
  59 };
  60
  61 inline void FullyConnected(const FullyConnectedParams &params, const Shape &input_shape,
  62                            const float *input_data, const Shape &weights_shape,
  63                            const float *weights_data, const Shape &, const float *bias_data,
  64                            const Shape &, float *output_data)
  65 {
  66   int total_input_size = input_shape.FlatSize();
  67   int input_size = weights_shape.Dims(1);
  68   const int batch_size = total_input_size / input_size;
  69   const int num_units = weights_shape.Dims(0);
  70
  71   // Output = bias if bias tensor exists.
  72   if (bias_data)
  73   {
  74     VectorBatchVectorAssign(bias_data, num_units, batch_size, output_data);
  75   }
  76   else
  77   {
  78     ZeroVector(output_data, batch_size * num_units);
  79   }
  80
  81   // Compute output += weight * input
  82   MatrixBatchVectorMultiplyAccumulate(weights_data, num_units, input_size, input_data, batch_size,
  83                                       output_data, /*result_stride=*/1);
  84
  85   if (params.activation != FusedActivationFunctionType::kNone)
  86   {
  87     // Apply activation function
  88     ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data);
  89   }
  90 }
  91
  92 inline void FullyConnected(const FullyConnectedParams &params, const Shape &input_shape,
  93                            const uint8_t *input_data, const Shape &filter_shape,
  94                            const uint8_t *filter_data, const Shape &bias_shape,
  95                            const int32_t *bias_data, const Shape &output_shape,
  96                            uint8_t *output_data)
  97 {
  98   UNUSED_RELEASE(input_shape);
  99   UNUSED_RELEASE(bias_shape);
 100   const int32_t input_offset = params.input_offset;
 101   const int32_t filter_offset = params.weights_offset;
 102   const int32_t output_offset = params.output_offset;
 103   const int32_t output_multiplier = params.output_multiplier;
 104   const int output_shift = params.output_shift;
 105   const int32_t output_activation_min = params.quantized_activation_min;
 106   const int32_t output_activation_max = params.quantized_activation_max;
 107   assert(filter_shape.DimensionsCount() >= 2);
 108   assert(output_shape.DimensionsCount() >= 1);
 109
 110   assert(output_activation_min <= output_activation_max);
 111   // TODO(benoitjacob): This really should be:
 112   //     const int batches = ArraySize(output_dims, 1);
 113   // but the current --variable_batch hack consists in overwriting the 3rd
 114   // dimension with the runtime batch size, as we don't keep track for each
 115   // array of which dimension is the batch dimension in it.
 116   const int output_dim_count = output_shape.DimensionsCount();
 117   const int filter_dim_count = filter_shape.DimensionsCount();
 118   const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
 119   const int output_depth =
 120     MatchingDim(filter_shape, filter_dim_count - 2, output_shape, output_dim_count - 1);
 121   const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
 122   for (int b = 0; b < batches; ++b)
 123   {
 124     for (int out_c = 0; out_c < output_depth; ++out_c)
 125     {
 126       int32_t acc = 0;
 127       for (int d = 0; d < accum_depth; ++d)
 128       {
 129         int32_t input_val = input_data[b * accum_depth + d];
 130         int32_t filter_val = filter_data[out_c * accum_depth + d];
 131         acc += (filter_val + filter_offset) * (input_val + input_offset);
 132       }
 133       if (bias_data)
 134       {
 135         acc += bias_data[out_c];
 136       }
 137       acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
 138       acc += output_offset;
 139       acc = std::max(acc, output_activation_min);
 140       acc = std::min(acc, output_activation_max);
 141       output_data[out_c + output_depth * b] = static_cast<uint8_t>(acc);
 142     }
 143   }
 144 }
 145
 146 inline void FullyConnectedHybrid(const FullyConnectedParams &params, const Shape &input_shape,
 147                                  const float *input_data, const Shape &filter_shape,
 148                                  const int8_t *filter_data, const Shape &, const float *bias_data,
 149                                  const Shape &output_shape, float *output_data,
 150                                  FCTempArena &temp_arena, ruy::Context *ruy_context)
 151 {
 152   int total_input_size = input_shape.FlatSize();
 153   const int input_size = filter_shape.Dims(1);
 154   const int batch_size = total_input_size / input_size;
 155   const int num_units = filter_shape.Dims(0);
 156
 157   // Output = bias if bias tensor exists.
 158   if (bias_data)
 159   {
 160     VectorBatchVectorAssign(bias_data, num_units, batch_size, output_data);
 161   }
 162   else
 163   {
 164     ZeroVector(output_data, batch_size * num_units);
 165   }
 166
 167   // Save matrix multiplication computation for all zero input.
 168   if (IsZeroVector(input_data, total_input_size))
 169   {
 170     ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data);
 171     return;
 172   }
 173
 174   // Quantize input from float to uint8 + quantization params (scaling factor).
 175   float unused_min, unused_max;
 176   float *scaling_factors_ptr = temp_arena.scaling_factors.data();
 177   int8_t *quant_data = temp_arena.input_quantized.data();
 178
 179   // Quantize each batch independently.
 180   for (int b = 0; b < batch_size; ++b)
 181   {
 182     const int offset = b * input_size;
 183     SymmetricQuantizeFloats(input_data + offset, input_size, quant_data + offset, &unused_min,
 184                             &unused_max, &scaling_factors_ptr[b]);
 185     // Incorporate scaling of the filter.
 186     scaling_factors_ptr[b] *= params.weights_scale;
 187   }
 188
 189 // Compute output += weight * quantized_input
 190 #ifdef USE_RUY_GEMV
 191   auto output_size = output_shape.FlatSize();
 192   temp_arena.accum_scratch.resize(output_size);
 193   int32_t *scratch = temp_arena.accum_scratch.data();
 194   MatrixBatchVectorMultiplyAccumulate(filter_data, num_units, input_size, quant_data,
 195                                       scaling_factors_ptr, batch_size, scratch, output_data,
 196                                       /*result_stride=*/1, ruy_context);
 197 #else
 198   MatrixBatchVectorMultiplyAccumulate(filter_data, num_units, input_size, quant_data,
 199                                       scaling_factors_ptr, batch_size, output_data,
 200                                       /*result_stride=*/1);
 201   UNUSED_RELEASE(ruy_context);
 202   UNUSED_RELEASE(output_shape);
 203 #endif
 204
 205   // Apply activation function to floats.
 206   if (params.activation != FusedActivationFunctionType::kNone)
 207   {
 208     // Apply activation function
 209     ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data);
 210   }
 211   return;
 212 }
 213
 214 inline void FullyConnectedSparseWeightRandom(const FullyConnectedParams &params,
 215                                              const Shape &input_shape, const float *input_data,
 216                                              const Shape &weights_shape, const float *weights_data,
 217                                              const Shape &bias_shape, const float *bias_data,
 218                                              const Shape &output_shape, float *output_data,
 219                                              const uint16_t *w1_segments,
 220                                              const uint16_t *w1_indices)
 221 {
 222   UNUSED_RELEASE(params);
 223   UNUSED_RELEASE(input_shape);
 224
 225   assert(weights_shape.DimensionsCount() == 2);
 226   assert(output_shape.DimensionsCount() == 2);
 227
 228   const int output_dims_count = output_shape.DimensionsCount();
 229   const int weights_dims_count = weights_shape.DimensionsCount();
 230   const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1);
 231   const int output_depth =
 232     MatchingDim(weights_shape, weights_dims_count - 2, output_shape, output_dims_count - 1);
 233   const int accum_depth = weights_shape.Dims(weights_dims_count - 1);
 234
 235   UNUSED_RELEASE(bias_shape);
 236   if (bias_data)
 237   {
 238     VectorBatchVectorAssign(bias_data, output_depth, batches, output_data);
 239   }
 240   else
 241   {
 242     ZeroVector(output_data, batches * output_depth);
 243   }
 244   for (int b = 0; b < batches; ++b)
 245   {
 246     for (int idx_0 = 0; idx_0 < output_depth; ++idx_0)
 247     {
 248       for (int pw1 = w1_segments[idx_0]; pw1 < w1_segments[idx_0 + 1]; ++pw1)
 249       {
 250         int idx_1 = w1_indices[pw1];
 251         output_data[b * output_depth + idx_0] +=
 252           weights_data[pw1] * input_data[b * accum_depth + idx_1];
 253       }
 254     }
 255   }
 256   if (params.activation != FusedActivationFunctionType::kNone)
 257   {
 258     // Apply activation function
 259     ApplyActivationToVector(output_data, batches * output_depth, params.activation, output_data);
 260   }
 261 }
 262
 263 } // namespace cker
 264 } // namespace nnfw
 265
 266 #endif // __NNFW_CKER_FULLY_CONNECTED_H__