inference-engine/src/gna_plugin/quantization/scale_factor_calc.hpp
// Copyright (C) 2018-2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once
#include <vector>
#include <algorithm>
#include <utility>
#include <limits>
#include <string>
#include <map>
#include <cmath>
#include "gna_layer_info.hpp"
#include "ie_layers.h"
#include "gna_plugin_log.hpp"

namespace GNAPluginNS {
namespace details {
using namespace InferenceEngine;
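/**
 * @brief result of a per-layer scale factor update: converts to true when propagation can
 * continue, and carries a non-null restartLayer when propagation has to restart from that layer
 */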
struct ScaleFactorUpdateResult {
    CNNLayer *restartLayer = nullptr;
    ScaleFactorUpdateResult() = default;
    explicit ScaleFactorUpdateResult(CNNLayer * restartlayer) : restartLayer(restartlayer) {
    }
    operator bool() {
        return restartLayer == nullptr;
    }
};

/**
 * @brief calculates output scale factor per layer
 * @tparam T layer type to calculate scale factors for
 */
template<class T>
class ScaleFactorPerLayer {
 public:
    /**
     * @brief calculates the weights scale factor to fit the dynamic range into the target bit size,
     * and also calculates the output scale factor for the given layer
     * @param cnnLayer layer to quantize
     * @param weightsSize size of the quantized weights in bytes
     * @param inputScaleFactor scale factor of the network input
     * @param result update result, set when propagation has to restart from another layer
     * @return true if the scale factors were calculated successfully
     */
    bool operator()(T cnnLayer, int weightsSize, float inputScaleFactor, ScaleFactorUpdateResult &result) {
        return false;
    }
};

template<>
class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
 private:
    const float activation_scale_factor = 2048.f;
    const float identity_scale_factor = 2049.0f;
    const float k = 5;
    const float k_identity = 6;

 protected:
    static bool fp32eq(float p1, float p2) {
        return (std::abs(p1 - p2) <= 0.00001f * std::min(std::abs(p1), std::abs(p2)));
    }
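    /**
     * @brief selects the output scale factor for an activation layer: identity activations get
     * identity_scale_factor, all others activation_scale_factor; relu-family outputs are halved
     * when the product with the input scale would overflow int32
     */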
    float getActivationScale(GNAPluginNS::LayerInfo const& layer, QuantizedLayerParams const* quantizedParams) {
            // todo: calculate proper scale factor where we need to expand it a bit to be safe to stay in int16 weights
            float result = layer.isIdentity() ? identity_scale_factor : activation_scale_factor;
            // if the activation is one from the relu family, we need to apply a heuristic to avoid activation output overflow
            if (layer.isRelu() &&
                    static_cast<uint64_t>(result * quantizedParams->_src_quant.scale)
                                                                > std::numeric_limits<int32_t>::max()-1) {
                result *= 0.5f;
            }
            return result;
    }

 public:
    bool operator()(InferenceEngine::CNNLayer *cnnLayer, int weightsSize, float inputScaleFactor, ScaleFactorUpdateResult &result) {
        if (!cnnLayer) {
            THROW_IE_EXCEPTION << "Incorrect CNNLayer pointer \n";
        }
        LayerInfo layerInfo(*cnnLayer);
        // TODO: the current approach sets the input scale factor for true input layer(s) equal to the provided factor
        auto quant = getInjectedData<QuantizedLayerParams>(*cnnLayer);
        if (InferenceEngine::details::CaselessEq<std::string>()(cnnLayer->type, "Memory")) {
            if (CNNNetHasPrevLayer(cnnLayer)) {
                auto prevLayer = CNNNetPrevLayer(cnnLayer);
                auto prevInfo = LayerInfo(prevLayer);
                auto inputQuant = getInjectedData<QuantizedLayerParams>(prevLayer);
                // locating corresponding memory layers with the same id
                for (auto && input : CNNNetGetAllInputLayers(cnnLayer)) {
                    LayerInfo ll(input);
                    if (!ll.isMemory() ||
                        !InferenceEngine::details::CaselessEq<std::string>()(input->params["id"], cnnLayer->params["id"])) {
                        continue;
                    }

                    auto quantSibling = getInjectedData<QuantizedLayerParams>(input);

                    // after restarting from the memory input - quant is fine
                    if (fp32eq(quantSibling->_dst_quant.scale, inputQuant->_dst_quant.scale)) {
                        quant->_src_quant.scale = quant->_dst_quant.scale = inputQuant->_dst_quant.scale;
                        return true;
                    }

                    if (!fp32eq(quantSibling->_dst_quant.scale, 1)) {
                        // means we already restarted propagation from that memory layer - we cannot do much here
                        THROW_GNA_EXCEPTION << "quantization error : input scale factor ( " << inputQuant->_dst_quant.scale <<") "
                                  << " for " << cnnLayer->name << ", that is child of " << prevLayer->name <<" doesn't match : "
                                  << activation_scale_factor;
                    }

                    gnawarn() << "[INFO] quantization : input scale factor (" << inputQuant->_dst_quant.scale <<")"
                              << " for " << cnnLayer->name << ", that is child of " << prevLayer->name <<" doesn't match : "
                              << activation_scale_factor << ", restarting from corresponding memory: "<< input->name << std::endl;

                    // try updating the memory input layer scale factor and restart from it
                    quantSibling->_src_quant.scale = quantSibling->_dst_quant.scale = inputQuant->_dst_quant.scale;
                    result = ScaleFactorUpdateResult(input.get());
                    return true;
                }
            }
            return true;
        }

        if (!CNNNetHasPrevLayer(cnnLayer)) {
            quant->_dst_quant.scale = inputScaleFactor;
            return true;
        }

        // by default a layer passes its input scale factor through to its output
        auto inputQuant = getInjectedData<QuantizedLayerParams>(CNNNetPrevLayer(cnnLayer));
        quant->_dst_quant.scale = inputQuant->_dst_quant.scale;
        quant->_src_quant.scale = inputQuant->_dst_quant.scale;

        if (layerInfo.isActivation()) {
            // todo: calculate proper scale factor where we need to expand it a bit to be safe to stay in int16 weights
            quant->_dst_quant.scale = getActivationScale(layerInfo, quant);
        }
        return true;
    }
};

template<>
class ScaleFactorPerLayer<InferenceEngine::EltwiseLayer*> {
 public:
    bool operator()(InferenceEngine::EltwiseLayer* eltwiseLayer, int weightsSize, float inputScaleFactor, ScaleFactorUpdateResult &result) {
        if (!eltwiseLayer) {
            THROW_GNA_EXCEPTION << "Incorrect Eltwise Layer pointer \n";
        }
        auto in0 = InferenceEngine::CNNNetPrevLayer(eltwiseLayer, 0);
        auto in1 = InferenceEngine::CNNNetPrevLayer(eltwiseLayer, 1);

        auto quantParams0 = InferenceEngine::getInjectedData<QuantizedLayerParams>(in0);
        auto quantParams1 = InferenceEngine::getInjectedData<QuantizedLayerParams>(in1);
        auto quantData = InferenceEngine::getInjectedData<QuantizedLayerParams>(*eltwiseLayer);

        switch (eltwiseLayer->_operation) {
            case InferenceEngine::EltwiseLayer::Prod: {
                quantData->_weights_quant.scale = quantParams1->_dst_quant.scale;
                quantData->_dst_quant.scale     = quantParams0->_dst_quant.scale * quantParams1->_dst_quant.scale;
                break;
            }
            case InferenceEngine::EltwiseLayer::Sum: {
                // detect which input will be used as biases
                if (LayerInfo(in0).has32BOutput()) {
                    std::swap(in0, in1);
                    std::swap(quantParams0, quantParams1);
                }

                // this path might result in significant data loss
                quantData->_weights_quant.scale = quantParams1->_dst_quant.scale / quantParams0->_dst_quant.scale;
                quantData->_dst_quant.scale = quantParams1->_dst_quant.scale;

                // eltwise will always work in int16
                auto maxValue = std::numeric_limits<int16_t>::max() - 1;
                if (quantData->_weights_quant.scale > maxValue + 1) {
                    // rescaling its activation input
                    // iterating through previous layers of eltwise
                    for (uint8_t i = 0; i < 2; ++i) {
                        InferenceEngine::CNNLayerPtr in = InferenceEngine::CNNNetPrevLayer(eltwiseLayer, i);
                        // trick to get the opposite index (0 -> 1, 1 -> 0) by inverting i
                        auto quantParams =
                                InferenceEngine::getInjectedData<QuantizedLayerParams>(InferenceEngine::CNNNetPrevLayer(eltwiseLayer, !i));

                        for (; InferenceEngine::CNNNetHasPrevLayer(in.get()); in = CNNNetPrevLayer(in)) {
                            auto info = LayerInfo(in);
                            // we skip only split layers so far; memory layers still need handling
                            // this case is for the input from port 0
                            if (info.isSplit() || info.isSlice()) {
                                continue;
                            } else if (info.has16BOutput() && info.isActivation()) {
                                auto newOutputScale = quantParams->_dst_quant.scale / maxValue;
                                if (newOutputScale > std::numeric_limits<int16_t>::max() / 2) {
                                    break;
                                }
                                auto quantDataForActivation = InferenceEngine::getInjectedData<QuantizedLayerParams>(*in);
                                gnawarn() << "[WARNING] saturated weights for " << eltwiseLayer->name
                                         << ". Layer new output scale: " << in->name << ", output_scale=" << newOutputScale
                                         << ", was " << quantDataForActivation->_dst_quant.scale <<"\n" << std::flush;
                                quantDataForActivation->_dst_quant.scale = newOutputScale;
                                result = ScaleFactorUpdateResult(in.get());
                                return true;
                            } else if (info.has16BOutput()) {
                                break;
                            }

                            // if we are here it means that we are on port 1
                            if (info.isFullyConnected() || info.isConvolution()) {
                                auto quantDataForInputLayer = InferenceEngine::getInjectedData<QuantizedLayerParams>(*in);
                                auto newOutputScale = quantParams->_dst_quant.scale * maxValue;
                                auto newWeightScale = newOutputScale / quantDataForInputLayer->_src_quant.scale;
                                quantDataForInputLayer->_dst_quant.scale = newOutputScale;
                                quantDataForInputLayer->_weights_quant.scale = newWeightScale;
                                result = ScaleFactorUpdateResult(in.get());
                                return true;
                            }
                        }
                    }
                    // we were unable to rescale the input - results might be bad
                    gnawarn() << "[INFO] weights saturated for " << eltwiseLayer->name << "\n";
                }
                break;
            }
            default : THROW_GNA_EXCEPTION << "Unsupported Eltwise layer for quantization: " << eltwiseLayer->_operation;
        }
        return true;
    }
};

template<>
class ScaleFactorPerLayer<InferenceEngine::ConcatLayer*> {
 public:
    bool operator()(InferenceEngine::ConcatLayer* concatLayer, int weightsSize, float inputScaleFactor, ScaleFactorUpdateResult &result) {
        if (!concatLayer) {
            THROW_GNA_EXCEPTION << "Incorrect Concat Layer pointer \n";
        }
        auto in0 = InferenceEngine::CNNNetPrevLayer(concatLayer, 0);
        auto in1 = InferenceEngine::CNNNetPrevLayer(concatLayer, 1);
        auto infoIn0 = LayerInfo(in0);
        auto infoIn1 = LayerInfo(in1);
        auto quantParams0 = InferenceEngine::getInjectedData<QuantizedLayerParams>(in0);
        auto quantParams1 = InferenceEngine::getInjectedData<QuantizedLayerParams>(in1);
        GNAPluginNS::QuantizedLayerParams* sourceQuantParams = nullptr;
        auto quantData = InferenceEngine::getInjectedData<QuantizedLayerParams>(*concatLayer);

        if (quantParams0->_dst_quant.scale == quantParams1->_dst_quant.scale) {
            return true;
        } else if (infoIn0.isInput() && infoIn1.isInput()) {
            THROW_GNA_EXCEPTION << "Two Input layers have different scales in concat!!! \n";
        }

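        // use the branch fed by a network Input as the reference scale; the other branch
        // is rescaled to it and propagation restarts from that layer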
        int i = 0;
        if (infoIn0.isInput()) {
            sourceQuantParams = quantParams0;
        } else if (infoIn1.isInput()) {
            ++i;
            sourceQuantParams = quantParams1;
        }

        if (!sourceQuantParams) {
            THROW_GNA_EXCEPTION << "Concat quantization for this case needs to be implemented!!! \n";
        }
        auto destinationQuantParams =
                InferenceEngine::getInjectedData<QuantizedLayerParams>(InferenceEngine::CNNNetPrevLayer(concatLayer, !i));
        InferenceEngine::CNNLayerPtr in = InferenceEngine::CNNNetPrevLayer(concatLayer, !i);

        quantData->_dst_quant.scale = sourceQuantParams->_dst_quant.scale;
        quantData->_src_quant.scale = sourceQuantParams->_dst_quant.scale;

        destinationQuantParams->_dst_quant.scale = sourceQuantParams->_dst_quant.scale;
        result = ScaleFactorUpdateResult(in.get());

        return true;
    }
};

template<>
class ScaleFactorPerLayer<InferenceEngine::WeightableLayer*> {
 private:
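    // empirically chosen weight-scale reduction factors and the thresholds (in multiples of
    // int32 max) that select between them; used below to avoid accumulator saturation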
    float const _scale_reduction_50 = 0.50;
    float const _scale_reduction_45 = 0.45;
    float const _scale_reduction_40 = 0.40;
    float const _scale_reduction_35 = 0.35;

    uint16_t const _scale_change_req_threshold = 30;
    uint16_t const _scale_change_threshold_100 = 100;
    uint16_t const _scale_change_threshold_150 = 150;
    uint16_t const _scale_change_threshold_200 = 200;

 public:
    bool operator()(InferenceEngine::WeightableLayer *wl, int weightsSize, float inputScaleFactor, ScaleFactorUpdateResult &result) {
        if (!wl) {
            THROW_GNA_EXCEPTION << "Incorrect Weightable Layer pointer \n";
        } else if (!wl->_weights) {
            THROW_GNA_EXCEPTION << "Incorrect weight value for " << wl->name << ":" << wl->type << "\n";
        }

        auto prevLayer = CNNNetPrevLayer(wl);
        auto quantDataForInputLayer =
            InferenceEngine::getInjectedData<QuantizedLayerParams>(*prevLayer.get());

        auto quant = InferenceEngine::getInjectedData<QuantizedLayerParams>(*wl);
        // TODO: pass 8 bits somehow
        if (quant->_weights_quant.scale == 1.0f) {
            size_t scaleRange = 0;
            if (weightsSize == 2) {
                scaleRange = MAX_VAL_2B_WEIGHT;
            } else if (weightsSize == 1) {
                scaleRange = MAX_VAL_1B_WEIGHT;
            } else {
                THROW_GNA_EXCEPTION << "Unsupported weights size of: " << weightsSize;
            }
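            // derive a weights scale factor so that the dynamic range of the weights fits into scaleRange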
            quant->_weights_quant.scale =
                ScaleFactorForQuantization(wl->_weights->buffer().as<float *>(), scaleRange, wl->_weights->size());

            // TODO: find out why ???
            if (weightsSize == 1) {
                quant->_weights_quant.scale *= MAX_OUT_MULTIPLIER;
            }
        }

        quant->_src_quant.scale = quantDataForInputLayer->_dst_quant.scale;

        double tmp_dst_quant_scale = quant->_weights_quant.scale * quantDataForInputLayer->_dst_quant.scale;

        if (weightsSize == 1 &&
            static_cast<uint64_t>(tmp_dst_quant_scale * quant->_src_quant.scale) >
                                                    static_cast<uint64_t>(std::numeric_limits<int32_t>::max()-1) * _scale_change_req_threshold) {
            gnawarn() << "Output scale for " << wl->name
                                            << " is too large and is being reduced, otherwise saturation will likely happen \n";
            // reduce the weight scale according to an experimental heuristic
            if (quant->_dst_quant.scale * quant->_src_quant.scale / std::numeric_limits<int32_t>::max() < _scale_change_threshold_100) {
                quant->_weights_quant.scale *= _scale_reduction_50;
                tmp_dst_quant_scale *= _scale_reduction_50;
            } else if (quant->_dst_quant.scale * quant->_src_quant.scale / std::numeric_limits<int32_t>::max() < _scale_change_threshold_150) {
                quant->_weights_quant.scale *= _scale_reduction_45;
                tmp_dst_quant_scale *= _scale_reduction_45;
            } else if (quant->_dst_quant.scale * quant->_src_quant.scale / std::numeric_limits<int32_t>::max() < _scale_change_threshold_200) {
                quant->_weights_quant.scale *= _scale_reduction_40;
                tmp_dst_quant_scale *= _scale_reduction_40;
            } else {
                quant->_weights_quant.scale *= _scale_reduction_35;
                tmp_dst_quant_scale *= _scale_reduction_35;
            }
        }

        quant->_dst_quant.scale = tmp_dst_quant_scale;

        return true;
    }
};

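/**
 * ScaleShift reuses the weightable-layer logic, but its weights are always quantized
 * with weightsSize forced to 2 bytes
 */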
template<>
class ScaleFactorPerLayer<InferenceEngine::ScaleShiftLayer*> : public ScaleFactorPerLayer<InferenceEngine::WeightableLayer*> {
 public:
    bool operator()(InferenceEngine::WeightableLayer *wl, int weightsSize, float inputScaleFactor, ScaleFactorUpdateResult &result) {
        return ScaleFactorPerLayer<InferenceEngine::WeightableLayer*>::operator()(wl, 2, inputScaleFactor, result);
    }
};

/**
 * GNA convolutions cannot be quantized in int8; remove this specialization when the library starts to support that
 */
template<>
class ScaleFactorPerLayer<InferenceEngine::ConvolutionLayer*> : public ScaleFactorPerLayer<InferenceEngine::ScaleShiftLayer*> {
};


}  // namespace details

/**
 * @brief scale factor calculator will calculate only output scale factors for the layer
 * if scale factor propagation is not possible, it will indicate a restart condition
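 *
 * A minimal usage sketch (assuming the caller dispatches on the concrete layer type,
 * e.g. via a transformLayer-style helper; names other than ScaleFactorCalculator itself
 * are illustrative):
 * @code
 * ScaleFactorCalculator sf(sortedLayers, weightsBytesSize, inputScaleFactor);
 * while (!sf.allLayersProcessed()) {
 *     for (auto && layer : sf.getStartLayers()) {
 *         transformLayer(layer, sf);      // calls sf.operator()(ConcreteLayer*)
 *         if (sf.needToRestart()) break;  // a scale factor changed upstream - start over
 *     }
 * }
 * @endcode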
 */
class ScaleFactorCalculator {
    using Cnt = std::vector<InferenceEngine::CNNLayerPtr>;
    Cnt  net;
    mutable Cnt::const_iterator idx;
    float inputScaleFactor;
    mutable bool needRestart = false;
    int weightsBytesSize;

 public:
    ScaleFactorCalculator(Cnt &net, int weightsBytesSize, float inputScaleFactor)
            : net(net), inputScaleFactor(inputScaleFactor), weightsBytesSize(weightsBytesSize) {
        idx = std::begin(this->net);
    }
    bool needToRestart() const {
        return needRestart;
    }
    bool allLayersProcessed() const {
        return idx == std::end(net);
    }
    std::vector<InferenceEngine::CNNLayerPtr> getStartLayers() const {
        return std::vector<InferenceEngine::CNNLayerPtr>(idx, std::end(net));
    }
    template<class T>
    bool operator()(T ptr) const {
        needRestart = false;
        details::ScaleFactorUpdateResult result;
        if (!details::ScaleFactorPerLayer<T>()(ptr, weightsBytesSize, inputScaleFactor, result)) {
            return false;
        }
        if (result) {
            idx++;
            return true;
        }

        idx = std::find_if(net.begin(), net.end(), [&](InferenceEngine::CNNLayerPtr cnnLayer) {
            if (!result) {
                return result.restartLayer == cnnLayer.get();
            }
            return ptr == cnnLayer.get();
        });
        idx++;
        needRestart = true;
        return true;
    }
};

}  // namespace GNAPluginNS