/*
 * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
 * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#ifndef __NNFW_CKER_FULLY_CONNECTED_H__
#define __NNFW_CKER_FULLY_CONNECTED_H__

#include <ruy/context.h>
#include "cker/Shape.h"
#include "cker/Types.h"
#include "cker/Utils.h"
#include "cker/TensorUtils.h"

namespace nnfw
{
namespace cker
{
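
// Scratch buffers for FullyConnectedHybrid: the float input quantized to int8, one scaling
// factor per batch, and an int32 accumulation area. prepare() sizes the first two from the
// input/weights shapes; accum_scratch is resized on demand inside FullyConnectedHybrid.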
class FCTempArena
{
public:
  FCTempArena(void) : prepared(false), input_quantized(), scaling_factors(), accum_scratch() {}

  void prepare(const Shape &input_shape, const Shape &weights_shape)
  {
    auto input_size = input_shape.FlatSize();
    input_quantized.resize(input_size);
    assert(weights_shape.DimensionsCount() == 2);
    int batch_size = input_size / weights_shape.Dims(1);
    scaling_factors.resize(batch_size);
    prepared = true;
  }

  bool prepared;
  std::vector<int8_t> input_quantized;
  std::vector<float> scaling_factors;
  std::vector<int32_t> accum_scratch;
};
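
// Float fully-connected layer: output = activation(weights * input + bias). weights_shape is
// expected to be [num_units, input_size]; the batch size is derived from the flat input size.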
inline void FullyConnected(const FullyConnectedParams &params, const Shape &input_shape,
                           const float *input_data, const Shape &weights_shape,
                           const float *weights_data, const Shape &, const float *bias_data,
                           const Shape &, float *output_data)
{
  int total_input_size = input_shape.FlatSize();
  int input_size = weights_shape.Dims(1);
  const int batch_size = total_input_size / input_size;
  const int num_units = weights_shape.Dims(0);

  // Output = bias if bias tensor exists.
  if (bias_data != nullptr)
  {
    VectorBatchVectorAssign(bias_data, num_units, batch_size, output_data);
  }
  else
  {
    ZeroVector(output_data, batch_size * num_units);
  }

  // Compute output += weight * input
  MatrixBatchVectorMultiplyAccumulate(weights_data, num_units, input_size, input_data, batch_size,
                                      output_data, /*result_stride=*/1);

  if (params.activation != FusedActivationFunctionType::kNone)
  {
    // Apply activation function
    ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data);
  }
}
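
// Quantized (uint8) fully-connected layer. Accumulates (input + input_offset) *
// (filter + filter_offset) in int32, adds the bias if present, requantizes with
// output_multiplier/output_shift, then adds output_offset and clamps to the activation range.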
inline void FullyConnected(const FullyConnectedParams &params, const Shape &input_shape,
                           const uint8_t *input_data, const Shape &filter_shape,
                           const uint8_t *filter_data, const Shape &bias_shape,
                           const int32_t *bias_data, const Shape &output_shape,
                           uint8_t *output_data)
{
  UNUSED_RELEASE(input_shape);
  UNUSED_RELEASE(bias_shape);
  const int32_t input_offset = params.input_offset;
  const int32_t filter_offset = params.weights_offset;
  const int32_t output_offset = params.output_offset;
  const int32_t output_multiplier = params.output_multiplier;
  const int output_shift = params.output_shift;
  const int32_t output_activation_min = params.quantized_activation_min;
  const int32_t output_activation_max = params.quantized_activation_max;
  assert(filter_shape.DimensionsCount() >= 2);
  assert(output_shape.DimensionsCount() >= 1);
  assert(output_activation_min <= output_activation_max);

  // TODO(benoitjacob): This really should be:
  //     const int batches = ArraySize(output_dims, 1);
  // but the current --variable_batch hack consists in overwriting the 3rd
  // dimension with the runtime batch size, as we don't keep track for each
  // array of which dimension is the batch dimension in it.
  const int output_dim_count = output_shape.DimensionsCount();
  const int filter_dim_count = filter_shape.DimensionsCount();
  const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
  const int output_depth =
    MatchingDim(filter_shape, filter_dim_count - 2, output_shape, output_dim_count - 1);
  const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
  for (int b = 0; b < batches; ++b)
  {
    for (int out_c = 0; out_c < output_depth; ++out_c)
    {
      int32_t acc = 0;
      // Accumulate over the depth dimension with zero-point offsets applied.
      for (int d = 0; d < accum_depth; ++d)
      {
        int32_t input_val = input_data[b * accum_depth + d];
        int32_t filter_val = filter_data[out_c * accum_depth + d];
        acc += (filter_val + filter_offset) * (input_val + input_offset);
      }
      if (bias_data)
      {
        acc += bias_data[out_c];
      }
      // Requantize to the output scale, add the output offset and clamp.
      acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
      acc += output_offset;
      acc = std::max(acc, output_activation_min);
      acc = std::min(acc, output_activation_max);
      output_data[out_c + output_depth * b] = static_cast<uint8_t>(acc);
    }
  }
}
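
// Hybrid fully-connected layer: float activations with symmetrically quantized int8 weights.
// Each input batch is quantized on the fly and its scaling factor (combined with
// params.weights_scale) is applied while accumulating back into the float output.
//
// Typical call sequence (a sketch; variable names are illustrative):
//   FCTempArena arena;
//   if (!arena.prepared)
//     arena.prepare(input_shape, filter_shape);
//   FullyConnectedHybrid(params, input_shape, input_data, filter_shape, filter_data,
//                        bias_shape, bias_data, output_shape, output_data, arena, ruy_context);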
inline void FullyConnectedHybrid(const FullyConnectedParams &params, const Shape &input_shape,
                                 const float *input_data, const Shape &filter_shape,
                                 const int8_t *filter_data, const Shape &, const float *bias_data,
                                 const Shape &output_shape, float *output_data,
                                 FCTempArena &temp_arena, ruy::Context *ruy_context)
{
  int total_input_size = input_shape.FlatSize();
  const int input_size = filter_shape.Dims(1);
  const int batch_size = total_input_size / input_size;
  const int num_units = filter_shape.Dims(0);

  // Output = bias if bias tensor exists.
  if (bias_data != nullptr)
  {
    VectorBatchVectorAssign(bias_data, num_units, batch_size, output_data);
  }
  else
  {
    ZeroVector(output_data, batch_size * num_units);
  }

  // Save matrix multiplication computation for all zero input.
  if (IsZeroVector(input_data, total_input_size))
  {
    ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data);
    return;
  }

  // Quantize input from float to int8 + quantization params (scaling factor).
  float unused_min, unused_max;
  float *scaling_factors_ptr = temp_arena.scaling_factors.data();
  int8_t *quant_data = temp_arena.input_quantized.data();

  // Quantize each batch independently.
  for (int b = 0; b < batch_size; ++b)
  {
    const int offset = b * input_size;
    SymmetricQuantizeFloats(input_data + offset, input_size, quant_data + offset, &unused_min,
                            &unused_max, &scaling_factors_ptr[b]);
    // Incorporate scaling of the filter.
    scaling_factors_ptr[b] *= params.weights_scale;
  }

  // Compute output += weight * quantized_input
#ifdef USE_RUY_GEMV
  auto output_size = output_shape.FlatSize();
  temp_arena.accum_scratch.resize(output_size);
  int32_t *scratch = temp_arena.accum_scratch.data();
  MatrixBatchVectorMultiplyAccumulate(filter_data, num_units, input_size, quant_data,
                                      scaling_factors_ptr, batch_size, scratch, output_data,
                                      /*result_stride=*/1, ruy_context);
#else
  MatrixBatchVectorMultiplyAccumulate(filter_data, num_units, input_size, quant_data,
                                      scaling_factors_ptr, batch_size, output_data,
                                      /*result_stride=*/1);
  UNUSED_RELEASE(ruy_context);
  UNUSED_RELEASE(output_shape);
#endif

  // Apply activation function to floats.
  if (params.activation != FusedActivationFunctionType::kNone)
  {
    // Apply activation function
    ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data);
  }
}
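
// Fully-connected layer with sparse weights in a CSR-like layout: for output unit idx_0,
// w1_segments[idx_0] .. w1_segments[idx_0 + 1] delimits its nonzero weights in weights_data,
// and w1_indices[pw1] gives the input (column) index of each nonzero entry.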
inline void FullyConnectedSparseWeight(const FullyConnectedParams &params, const Shape &input_shape,
                                       const float *input_data, const Shape &weights_shape,
                                       const float *weights_data, const Shape &bias_shape,
                                       const float *bias_data, const Shape &output_shape,
                                       float *output_data, int w0_size, const uint16_t *w1_segments,
                                       const uint16_t *w1_indices)
{
  UNUSED_RELEASE(params);
  UNUSED_RELEASE(input_shape);

  assert(weights_shape.DimensionsCount() == 2);
  assert(output_shape.DimensionsCount() == 2);

  const int output_dims_count = output_shape.DimensionsCount();
  const int weights_dims_count = weights_shape.DimensionsCount();
  const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1);
  const int output_depth =
    MatchingDim(weights_shape, weights_dims_count - 2, output_shape, output_dims_count - 1);
  const int accum_depth = weights_shape.Dims(weights_dims_count - 1);

  UNUSED_RELEASE(bias_shape);
  if (bias_data)
  {
    VectorBatchVectorAssign(bias_data, output_depth, batches, output_data);
  }
  else
  {
    ZeroVector(output_data, batches * output_depth);
  }
  for (int b = 0; b < batches; ++b)
  {
    for (int idx_0 = 0; idx_0 < w0_size; ++idx_0)
    {
      for (int pw1 = w1_segments[idx_0]; pw1 < w1_segments[idx_0 + 1]; ++pw1)
      {
        int idx_1 = w1_indices[pw1];
        output_data[b * output_depth + idx_0] +=
          weights_data[pw1] * input_data[b * accum_depth + idx_1];
      }
    }
  }
  if (params.activation != FusedActivationFunctionType::kNone)
  {
    // Apply activation function
    ApplyActivationToVector(output_data, batches * output_depth, params.activation, output_data);
  }
}

} // namespace cker
} // namespace nnfw

#endif // __NNFW_CKER_FULLY_CONNECTED_H__