compute/cker/include/cker/operation/FullyConnected.h

   1 /*
   2  * Copyright (c) 2019 Samsung Electronics Co., Ltd. All Rights Reserved
   3  * Copyright 2017 The TensorFlow Authors. All Rights Reserved.
   4  *
   5  * Licensed under the Apache License, Version 2.0 (the "License");
   6  * you may not use this file except in compliance with the License.
   7  * You may obtain a copy of the License at
   8  *
   9  *      http://www.apache.org/licenses/LICENSE-2.0
  10  *
  11  * Unless required by applicable law or agreed to in writing, software
  12  * distributed under the License is distributed on an "AS IS" BASIS,
  13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14  * See the License for the specific language governing permissions and
  15  * limitations under the License.
  16  */
  17
  18 #ifndef __NNFW_CKER_FULLY_CONNECTED_H__
  19 #define __NNFW_CKER_FULLY_CONNECTED_H__
  20
  21 #include <ruy/context.h>
  22 #include "cker/operation/FullyConnectedDense16x1.h"
  23 #include "cker/operation/FullyConnectedSparse16x1.h"
  24 #include "cker/operation/optimized/Gemm.h"
  25 #include "cker/Shape.h"
  26 #include "cker/Types.h"
  27 #include "cker/Utils.h"
  28 #include "cker/TensorUtils.h"
  29 #include "cker/neon/neon_check.h"
  30
  31 namespace nnfw
  32 {
  33 namespace cker
  34 {
  35
  36 class FCTempArena
  37 {
  38 public:
  39   FCTempArena(void) : prepared(false), input_quantized(), scaling_factors(), accum_scratch()
  40   {
  41     // DO NOTHING
  42   }
  43
  44   void prepare(const Shape &input_shape, const Shape &weights_shape)
  45   {
  46     auto input_size = input_shape.FlatSize();
  47     input_quantized.resize(input_size);
  48
  49     assert(weights_shape.DimensionsCount() == 2);
  50     int batch_size = input_size / weights_shape.Dims(1);
  51     scaling_factors.resize(batch_size);
  52     prepared = true;
  53   }
  54
  55 public:
  56   bool prepared;
  57   std::vector<int8_t> input_quantized;
  58   std::vector<float> scaling_factors;
  59   std::vector<int32_t> accum_scratch;
  60 };
  61
  62 #if defined(CKER_X86_PLATFORM)
  63
  64 // From tensorflow/tensorflow/lite/kernels/internal/optimized/optimized_ops.h
  65 inline void FullyConnected(const FullyConnectedParams &params, const Shape &input_shape,
  66                            const float *input_data, const Shape &weights_shape,
  67                            const float *weights_data, const Shape &,
  68                            const float *optional_bias_data, const Shape &output_shape,
  69                            float *output_data)
  70 {
  71   const int dims_count = weights_shape.DimensionsCount();
  72   const int input_rows = weights_shape.Dims(dims_count - 1);
  73   MatrixParams<float> rhs_params;
  74   rhs_params.order = Order::kColMajor;
  75   rhs_params.rows = input_rows;
  76   rhs_params.cols = input_shape.FlatSize() / input_rows;
  77   rhs_params.cache_policy = optimized::DefaultCachePolicy(params.rhs_cacheable);
  78
  79   MatrixParams<float> lhs_params;
  80   lhs_params.order = Order::kRowMajor;
  81   lhs_params.cols = weights_shape.Dims(dims_count - 1);
  82   lhs_params.rows = FlatSizeSkipDim(weights_shape, dims_count - 1);
  83   lhs_params.cache_policy = optimized::DefaultCachePolicy(params.lhs_cacheable);
  84   MatrixParams<float> dst_params;
  85   dst_params.order = Order::kColMajor;
  86   dst_params.rows = output_shape.Dims(output_shape.DimensionsCount() - 1);
  87   dst_params.cols = FlatSizeSkipDim(output_shape, output_shape.DimensionsCount() - 1);
  88   GemmParams<float, float> gemm_params;
  89   gemm_params.bias = optional_bias_data;
  90   gemm_params.clamp_min = params.float_activation_min;
  91   gemm_params.clamp_max = params.float_activation_max;
  92   optimized::Gemm(lhs_params, weights_data, rhs_params, input_data, dst_params, output_data,
  93                   gemm_params);
  94 }
  95
  96 #else // CKER_X86_PLATFORM
  97
  98 inline void FullyConnected(const FullyConnectedParams &params, const Shape &input_shape,
  99                            const float *input_data, const Shape &weights_shape,
 100                            const float *weights_data, const Shape &, const float *bias_data,
 101                            const Shape &, float *output_data)
 102 {
 103   int total_input_size = input_shape.FlatSize();
 104   int input_size = weights_shape.Dims(1);
 105   const int batch_size = total_input_size / input_size;
 106   const int num_units = weights_shape.Dims(0);
 107
 108   // Output = bias if bias tensor exists.
 109   if (bias_data)
 110   {
 111     VectorBatchVectorAssign(bias_data, num_units, batch_size, output_data);
 112   }
 113   else
 114   {
 115     ZeroVector(output_data, batch_size * num_units);
 116   }
 117
 118   // Compute output += weight * input
 119   MatrixBatchVectorMultiplyAccumulate(weights_data, num_units, input_size, input_data, batch_size,
 120                                       output_data, /*result_stride=*/1);
 121
 122   if (params.activation != FusedActivationFunctionType::kNone)
 123   {
 124     // Apply activation function
 125     ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data);
 126   }
 127 }
 128
 129 #endif // CKER_X86_PLATFORM
 130
 131 inline void FullyConnected(const FullyConnectedParams &params, const Shape &input_shape,
 132                            const uint8_t *input_data, const Shape &filter_shape,
 133                            const uint8_t *filter_data, const Shape &bias_shape,
 134                            const int32_t *bias_data, const Shape &output_shape,
 135                            uint8_t *output_data)
 136 {
 137   UNUSED_RELEASE(input_shape);
 138   UNUSED_RELEASE(bias_shape);
 139   const int32_t input_offset = params.input_offset;
 140   const int32_t filter_offset = params.weights_offset;
 141   const int32_t output_offset = params.output_offset;
 142   const int32_t output_multiplier = params.output_multiplier;
 143   const int output_shift = params.output_shift;
 144   const int32_t output_activation_min = params.quantized_activation_min;
 145   const int32_t output_activation_max = params.quantized_activation_max;
 146   assert(filter_shape.DimensionsCount() >= 2);
 147   assert(output_shape.DimensionsCount() >= 1);
 148
 149   assert(output_activation_min <= output_activation_max);
 150   // TODO(benoitjacob): This really should be:
 151   //     const int batches = ArraySize(output_dims, 1);
 152   // but the current --variable_batch hack consists in overwriting the 3rd
 153   // dimension with the runtime batch size, as we don't keep track for each
 154   // array of which dimension is the batch dimension in it.
 155   const int output_dim_count = output_shape.DimensionsCount();
 156   const int filter_dim_count = filter_shape.DimensionsCount();
 157   const int batches = FlatSizeSkipDim(output_shape, output_dim_count - 1);
 158   const int output_depth =
 159     MatchingDim(filter_shape, filter_dim_count - 2, output_shape, output_dim_count - 1);
 160   const int accum_depth = filter_shape.Dims(filter_dim_count - 1);
 161   for (int b = 0; b < batches; ++b)
 162   {
 163     for (int out_c = 0; out_c < output_depth; ++out_c)
 164     {
 165       int32_t acc = 0;
 166       for (int d = 0; d < accum_depth; ++d)
 167       {
 168         int32_t input_val = input_data[b * accum_depth + d];
 169         int32_t filter_val = filter_data[out_c * accum_depth + d];
 170         acc += (filter_val + filter_offset) * (input_val + input_offset);
 171       }
 172       if (bias_data)
 173       {
 174         acc += bias_data[out_c];
 175       }
 176       acc = MultiplyByQuantizedMultiplier(acc, output_multiplier, output_shift);
 177       acc += output_offset;
 178       acc = std::max(acc, output_activation_min);
 179       acc = std::min(acc, output_activation_max);
 180       output_data[out_c + output_depth * b] = static_cast<uint8_t>(acc);
 181     }
 182   }
 183 }
 184
 185 inline void FullyConnectedHybrid(const FullyConnectedParams &params, const Shape &input_shape,
 186                                  const float *input_data, const Shape &filter_shape,
 187                                  const int8_t *filter_data, const Shape &, const float *bias_data,
 188                                  const Shape &output_shape, float *output_data,
 189                                  FCTempArena &temp_arena, ruy::Context *ruy_context)
 190 {
 191   int total_input_size = input_shape.FlatSize();
 192   const int input_size = filter_shape.Dims(1);
 193   const int batch_size = total_input_size / input_size;
 194   const int num_units = filter_shape.Dims(0);
 195
 196   // Output = bias if bias tensor exists.
 197   if (bias_data)
 198   {
 199     VectorBatchVectorAssign(bias_data, num_units, batch_size, output_data);
 200   }
 201   else
 202   {
 203     ZeroVector(output_data, batch_size * num_units);
 204   }
 205
 206   // Save matrix multiplication computation for all zero input.
 207   if (IsZeroVector(input_data, total_input_size))
 208   {
 209     ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data);
 210     return;
 211   }
 212
 213   // Quantize input from float to uint8 + quantization params (scaling factor).
 214   float unused_min, unused_max;
 215   float *scaling_factors_ptr = temp_arena.scaling_factors.data();
 216   int8_t *quant_data = temp_arena.input_quantized.data();
 217
 218   // Quantize each batch independently.
 219   for (int b = 0; b < batch_size; ++b)
 220   {
 221     const int offset = b * input_size;
 222     SymmetricQuantizeFloats(input_data + offset, input_size, quant_data + offset, &unused_min,
 223                             &unused_max, &scaling_factors_ptr[b]);
 224     // Incorporate scaling of the filter.
 225     scaling_factors_ptr[b] *= params.weights_scale;
 226   }
 227
 228 // Compute output += weight * quantized_input
 229 #ifdef USE_RUY_GEMV
 230   auto output_size = output_shape.FlatSize();
 231   temp_arena.accum_scratch.resize(output_size);
 232   int32_t *scratch = temp_arena.accum_scratch.data();
 233   MatrixBatchVectorMultiplyAccumulate(filter_data, num_units, input_size, quant_data,
 234                                       scaling_factors_ptr, batch_size, scratch, output_data,
 235                                       /*result_stride=*/1, ruy_context);
 236 #else
 237   MatrixBatchVectorMultiplyAccumulate(filter_data, num_units, input_size, quant_data,
 238                                       scaling_factors_ptr, batch_size, output_data,
 239                                       /*result_stride=*/1);
 240   UNUSED_RELEASE(ruy_context);
 241   UNUSED_RELEASE(output_shape);
 242 #endif
 243
 244   // Apply activation function to floats.
 245   if (params.activation != FusedActivationFunctionType::kNone)
 246   {
 247     // Apply activation function
 248     ApplyActivationToVector(output_data, batch_size * num_units, params.activation, output_data);
 249   }
 250   return;
 251 }
 252
 253 inline void FullyConnectedSparseWeightRandom(const FullyConnectedParams &params,
 254                                              const Shape &input_shape, const float *input_data,
 255                                              const Shape &weights_shape, const float *weights_data,
 256                                              const Shape &bias_shape, const float *bias_data,
 257                                              const Shape &output_shape, float *output_data,
 258                                              const uint16_t *w1_segments,
 259                                              const uint16_t *w1_indices)
 260 {
 261   UNUSED_RELEASE(params);
 262   UNUSED_RELEASE(input_shape);
 263
 264   assert(weights_shape.DimensionsCount() == 2);
 265   assert(output_shape.DimensionsCount() == 2);
 266
 267   const int output_dims_count = output_shape.DimensionsCount();
 268   const int weights_dims_count = weights_shape.DimensionsCount();
 269   const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1);
 270   const int output_depth =
 271     MatchingDim(weights_shape, weights_dims_count - 2, output_shape, output_dims_count - 1);
 272   const int accum_depth = weights_shape.Dims(weights_dims_count - 1);
 273
 274   UNUSED_RELEASE(bias_shape);
 275   if (bias_data)
 276   {
 277     VectorBatchVectorAssign(bias_data, output_depth, batches, output_data);
 278   }
 279   else
 280   {
 281     ZeroVector(output_data, batches * output_depth);
 282   }
 283   for (int b = 0; b < batches; ++b)
 284   {
 285     for (int idx_0 = 0; idx_0 < output_depth; ++idx_0)
 286     {
 287       for (int pw1 = w1_segments[idx_0]; pw1 < w1_segments[idx_0 + 1]; ++pw1)
 288       {
 289         int idx_1 = w1_indices[pw1];
 290         output_data[b * output_depth + idx_0] +=
 291           weights_data[pw1] * input_data[b * accum_depth + idx_1];
 292       }
 293     }
 294   }
 295   if (params.activation != FusedActivationFunctionType::kNone)
 296   {
 297     // Apply activation function
 298     ApplyActivationToVector(output_data, batches * output_depth, params.activation, output_data);
 299   }
 300 }
 301
 302 } // namespace cker
 303 } // namespace nnfw
 304
 305 #endif // __NNFW_CKER_FULLY_CONNECTED_H__