runtime/onert/backend/cpu/ops/FullyConnectedLayer.cc
/*
 * Copyright (c) 2018 Samsung Electronics Co., Ltd. All Rights Reserved
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "FullyConnectedLayer.h"

#include "../Tensor.h"
#include <cker/operation/FullyConnected.h>
#include <cker/TensorUtils.h>
#include <misc/polymorphic_downcast.h>

namespace onert
{
namespace backend
{
namespace cpu
{
namespace ops
{

FullyConnectedLayer::FullyConnectedLayer()
  : _input(nullptr), _weights(nullptr), _bias(nullptr), _output(nullptr),
    _activation(ir::Activation::NONE), _temp_arena(new nnfw::cker::FCTempArena()),
    _external_context(nullptr), _is_hybrid(false), _is_shuffled16x1float32(false)
{
  // DO NOTHING
}

FullyConnectedLayer::~FullyConnectedLayer() = default;

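// Dense float32 path: forwards the operands to nnfw::cker::FullyConnected with the fused
// activation folded into op_params.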
void FullyConnectedLayer::fullyConnectedFloat32()
{
  nnfw::cker::FullyConnectedParams op_params;
  op_params.activation = convertActivationType(_activation);

  nnfw::cker::FullyConnected(op_params, getShape(_input), getBuffer<float>(_input),
                             getShape(_weights), getBuffer<float>(_weights), getShape(_bias),
                             _bias ? getBuffer<float>(_bias) : nullptr, getShape(_output),
                             getBuffer<float>(_output));
}

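// Asymmetric uint8 quantized path. The effective output multiplier/shift and the clamped
// activation range are derived from the operands' quantization parameters before calling cker.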
// An execution mutex is used to protect concurrent access to non-thread-safe resources
// like gemmlowp::GemmContext.
void FullyConnectedLayer::fullyConnectedQuant8()
{
  double real_multiplier = 0.0;
  int32_t output_multiplier = 0;
  int32_t output_shift = 0;
  int32_t output_activation_min = 0;
  int32_t output_activation_max = 0;
  GetQuantizedConvolutionMultiplier(_input, _weights, _bias, _output, &real_multiplier);
  QuantizeMultiplier(real_multiplier, &output_multiplier, &output_shift);
  CalculateActivationRangeQuantized(_activation, _output, &output_activation_min,
                                    &output_activation_max);

  nnfw::cker::FullyConnectedParams op_params;
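  // The input and weights zero points are passed negated, following the cker/TFLite offset
  // convention.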
  op_params.input_offset = -_input->data_zero_point();
  op_params.weights_offset = -_weights->data_zero_point();
  op_params.output_offset = _output->data_zero_point();
  op_params.output_multiplier = output_multiplier;
  op_params.output_shift = output_shift;
  op_params.quantized_activation_min = output_activation_min;
  op_params.quantized_activation_max = output_activation_max;

  nnfw::cker::FullyConnected(op_params, getShape(_input), getBuffer<uint8_t>(_input),
                             getShape(_weights), getBuffer<uint8_t>(_weights), getShape(_bias),
                             _bias ? getBuffer<int32_t>(_bias) : nullptr, getShape(_output),
                             getBuffer<uint8_t>(_output));
}

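// Hybrid path: float32 activations with symmetric int8 weights. cker quantizes the float input
// on the fly (using _temp_arena as scratch space) and passes weights_scale so the integer
// accumulators can be rescaled back to float. With USE_RUY_GEMV, the constant weight buffer
// cached in prepare() is handed to ruy for prepacking and is released once it has been prepacked.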
void FullyConnectedLayer::fullyConnectedHybrid()
{
  nnfw::cker::FCTempArena &temp_arena = *_temp_arena;
  if (!temp_arena.prepared)
  {
    temp_arena.prepare(getShape(_input), getShape(_weights));
  }

  nnfw::cker::FullyConnectedParams op_params;
  op_params.activation = convertActivationType(_activation);
  op_params.weights_scale = _weights->data_scale();

#ifndef USE_RUY_GEMV
  nnfw::cker::FullyConnectedHybrid(
    op_params, getShape(_input), getBuffer<float>(_input), getShape(_weights),
    getBuffer<int8_t>(_weights), getShape(_bias), _bias ? getBuffer<float>(_bias) : nullptr,
    getShape(_output), getBuffer<float>(_output), temp_arena, _external_context->ruy_context());
#else
  nnfw::cker::FullyConnectedHybrid(
    op_params, getShape(_input), getBuffer<float>(_input), getShape(_weights),
    (_cached_weights) ? reinterpret_cast<const int8_t *>(_cached_weights)
                      : getBuffer<int8_t>(_weights),
    getShape(_bias), _bias ? getBuffer<float>(_bias) : nullptr, getShape(_output),
    getBuffer<float>(_output), temp_arena, _external_context->ruy_context());

  if (_cached_weights == nullptr || _is_weights_freed)
    return;

  // Reaching this point means '_cached_weights' is not nullptr and '_is_weights_freed' is false,
  // i.e. this weight shape satisfies the condition of the ruy kernel's prepack cache.
  // Once the weights are freed below, this code is not reached again. One exception is handled
  // first: if the input is a zero vector, the ruy kernel path is bypassed (no prepacking
  // happens), so return here without freeing the weights.
  const int input_size = getShape(_input).FlatSize();
  if (nnfw::cker::IsZeroVector(getBuffer<float>(_input), input_size))
    return;

  auto weight_tensor = nnfw::misc::polymorphic_downcast<const Tensor *>(_weights);

  // This weight tensor may also be used as a constant tensor by other ops,
  // so its reference count must be checked as follows.
  auto tensor = const_cast<Tensor *>(weight_tensor);
  if (tensor->buffer() == nullptr) // ref is already 0?
  {
    _is_weights_freed = true;
    return;
  }

  tensor->decrease_ref();
  if (tensor->buffer() == nullptr) // ref == 0?
  {
#if defined(__ANDROID__) && (__ANDROID_API__ >= 26)
    // NOTE This line forces the OS to release any unused memory immediately
    mallopt(M_PURGE, 0);
#endif
    _is_weights_freed = true;
  }
#endif
}

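// Sparse float32 weights path: the weights are stored in the compressed form described by the
// tensor's sparsity metadata (w1_segments/w1_indices). Only random sparsity and 16x1 block
// sparsity are supported.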
void FullyConnectedLayer::fullyConnectedSparseWeight()
{
  nnfw::cker::FullyConnectedParams op_params;
  op_params.activation = convertActivationType(_activation);

  const uint16_t *w1_segments = _weights->sparsity()->w1_segments();
  const uint16_t *w1_indices = _weights->sparsity()->w1_indices();

  auto block_size = _weights->sparsity()->block_size();
  if (block_size.size() == 0)
  {
    nnfw::cker::FullyConnectedSparseWeightRandom(
      op_params, getShape(_input), getBuffer<float>(_input), getShape(_weights),
      getBuffer<float>(_weights), getShape(_bias), _bias ? getBuffer<float>(_bias) : nullptr,
      getShape(_output), getBuffer<float>(_output), w1_segments, w1_indices);
  }
  else if (block_size.size() == 2 && block_size[0] == 16 && block_size[1] == 1)
  {
    nnfw::cker::FullyConnectedSparseWeight16x1(
      op_params, getShape(_input), getBuffer<float>(_input), getShape(_weights),
      getBuffer<float>(_weights), getShape(_bias), _bias ? getBuffer<float>(_bias) : nullptr,
      getShape(_output), getBuffer<float>(_output), w1_segments, w1_indices);
  }
  else
    throw std::runtime_error{"FullyConnected: unsupported sparsity"};
}

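// Dense float32 path for the Shuffled16x1Float32 weights format; available only on aarch64
// builds with NEON enabled.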
void FullyConnectedLayer::fullyConnected16x1Float32()
{
#if defined(__aarch64__) && defined(USE_NEON)
  float output_activation_min = 0, output_activation_max = 0;
  CalculateActivationRange(_activation, &output_activation_min, &output_activation_max);

  nnfw::cker::FullyConnectedParams op_params;
  op_params.activation = convertActivationType(_activation);

  nnfw::cker::FullyConnected16x1Float32(op_params, getShape(_input), getBuffer<float>(_input),
                                        getShape(_weights), getBuffer<float>(_weights),
                                        getShape(_bias), _bias ? getBuffer<float>(_bias) : nullptr,
                                        getShape(_output), getBuffer<float>(_output));
#else
  throw std::runtime_error{"FullyConnected: Shuffled16x1Float32 weights_format is not supported."};
#endif
}

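// Stores the operand pointers and attributes and decides, from the operand data types and the
// weights format, which kernel path run() will dispatch to.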
void FullyConnectedLayer::configure(const IPortableTensor *input, const IPortableTensor *weights,
                                    const IPortableTensor *bias, ir::Activation activation,
                                    ir::FullyConnectedWeightsFormat weights_format,
                                    IPortableTensor *output,
                                    const std::shared_ptr<ExternalContext> &external_context)
{
  _input = input;
  _weights = weights;
  _bias = bias;
  _activation = activation;
  _output = output;
  _is_hybrid = input->data_type() == OperandType::FLOAT32 &&
               weights->data_type() == OperandType::QUANT_INT8_SYMM;
  _is_shuffled16x1float32 = weights_format == ir::FullyConnectedWeightsFormat::Shuffled16x1Float32;
#if !defined(__aarch64__) || !defined(USE_NEON)
  if (_is_shuffled16x1float32)
  {
    throw std::runtime_error{
      "FullyConnected: Shuffled16x1Float32 weights_format is not supported."};
  }
#endif
  _external_context = external_context;
}

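// Dispatches to the hybrid, sparse, float32 (dense or 16x1-shuffled), or uint8 quantized kernel
// depending on how the layer was configured.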
void FullyConnectedLayer::run()
{
  if (_is_hybrid)
  {
    fullyConnectedHybrid();
  }
  else if (_weights->sparsity())
  {
    fullyConnectedSparseWeight();
  }
  else if (_input->data_type() == OperandType::FLOAT32)
  {
    _is_shuffled16x1float32 ? fullyConnected16x1Float32() : fullyConnectedFloat32();
  }
  else if (_input->data_type() == OperandType::QUANT_UINT8_ASYMM)
  {
    fullyConnectedQuant8();
  }
  else
  {
    throw std::runtime_error{"FullyConnected: unsupported data type"};
  }
}

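// One-time preparation: a constant all-zero bias is dropped, and (when ruy GEMV caching is
// enabled) the constant weight buffer is recorded so ruy can cache its prepacked form across
// runs.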
void FullyConnectedLayer::prepare()
{
  if (_bias && _bias->is_constant())
  {
    const int bias_size = getShape(_bias).FlatSize();
    if (nnfw::cker::IsZeroVector(getBuffer<float>(_bias), bias_size))
    {
      _bias = nullptr;
    }
  }

#if (defined(__ARM_NEON__) || defined(__ARM_NEON)) && defined(USE_RUY_GEMV)
  // TODO This is a workaround.
  // Only the hybrid fully-connected path uses the ruy kernel.
  if (_input->data_type() != OperandType::FLOAT32 ||
      _weights->data_type() != OperandType::QUANT_INT8_SYMM)
  {
    return;
  }

  // NOTE The condition for enabling caching in the ruy kernel may change with ruy's version.

  // If the input is dynamic, its total size can change between runs.
  // If the weights are not constant, they cannot be cached.
  if (_input->is_dynamic() || !_weights->is_constant())
    return;

  const int rows = getShape(_weights).Dims(0);
  if (rows % 4 == 0)
  {
    // TODO If precaching can be extracted from the ruy kernel, place it here
    // instead of the code below.

    // This buffer is used by the ruy kernel as a cache key.
    _cached_weights = _weights->buffer();
  }
#endif
}

} // namespace ops
} // namespace cpu
} // namespace backend
} // namespace onert