inference-engine/src/gna_plugin/quantization/layer_quantizer.hpp

   1 // Copyright (C) 2018-2019 Intel Corporation
   2 // SPDX-License-Identifier: Apache-2.0
   3 //
   4
   5 #pragma once
   6 #include <string>
   7 #include <utility>
   8 #include <gna-api-types-xnn.h>
   9 #include "ie_layers.h"
  10 #include "quantized_layer_params.hpp"
  11 #include "quantization.h"
  12 #include "details/caseless.hpp"
  13 #include "graph_tools.hpp"
  14 #include "blob_factory.hpp"
  15 #include "precision_ex.hpp"
  16 #include "pwl.h"
  17 #include "gna_layer_info.hpp"
  18
  19 namespace GNAPluginNS {
  20 namespace details {
  21
  22 /**
  23  * @brief description of quantisation precision
  24  * @tparam Ip - input precision
  25  * @tparam Wp - weights precision
  26  * @tparam Bp - biases precision
  27  * @tparam Np - network precision - can be auto generated in future
  28  */
  29 template <class Ip, class Op, class Wp, class Bp, class Np>
  30 struct QuantDescTmpl {
  31     using WeightsPrecision = Wp;
  32     using BiasesPrecision = Bp;
  33
  34     InferenceEngine::TPrecision<Ip> _Ip;
  35     InferenceEngine::TPrecision<Op> _Op;
  36     InferenceEngine::TPrecision<Wp> _Wp;
  37     InferenceEngine::TPrecision<Bp> _Bp;
  38     InferenceEngine::TPrecision<Np> _Np;
  39
  40     QuantDescTmpl() = default;
  41     QuantDescTmpl(InferenceEngine::TPrecision<Ip> _Ip,
  42               InferenceEngine::TPrecision<Op> _Op,
  43               InferenceEngine::TPrecision<Wp> _Wp,
  44               InferenceEngine::TPrecision<Bp> _Bp,
  45               InferenceEngine::TPrecision<Np> _Np) : _Op(_Op), _Ip(_Ip), _Wp(_Wp), _Bp(_Bp), _Np(_Np) {
  46     }
  47
  48     InferenceEngine::Precision getInputPrecision() const {
  49         return _Ip;
  50     }
  51     InferenceEngine::Precision getWeightsPrecision() const {
  52         return _Wp;
  53     }
  54     InferenceEngine::Precision getBiasesPrecision() const {
  55         return _Bp;
  56     }
  57     InferenceEngine::Precision getNetPrecision() const {
  58         return _Np;
  59     }
  60     InferenceEngine::Precision getOutputPrecision() const {
  61         return _Op;
  62     }
  63 };
  64
  65 #define P_TYPE(X)\
  66 typename InferenceEngine::PrecisionTrait<InferenceEngine::Precision::X>::value_type
  67
  68 #define PRECISION_TYPE(A, B, C, D, E)\
  69     P_TYPE(A), P_TYPE(B), P_TYPE(C), P_TYPE(D), P_TYPE(E)
  70
  71
  72 struct QuantI16 : public QuantDescTmpl<PRECISION_TYPE(I16, I32, I16, I32, MIXED)> {
  73     QuantI16() {
  74         _Np = InferenceEngine::Precision::MIXED;
  75     }
  76 };
  77 struct QuantI8  : public QuantDescTmpl<P_TYPE(I16), P_TYPE(I32), P_TYPE(I8), intel_compound_bias_t, P_TYPE(MIXED)> {
  78     QuantI8() {
  79         _Np = InferenceEngine::Precision::MIXED;
  80     }
  81 };
  82
  83 template <class A, class B>
  84 struct QuantPair {
  85     using MandatoryType = A;
  86     using OptionalType = B;
  87     static A mandatory () { return A();}
  88     static B optional () { return B();}
  89 };
  90
  91 /**
  92  * @brief should allocated blob for specific data type, in case of src blob is nullptr
  93  * @tparam T
  94  * @return
  95  */
  96 template <class T>
  97 inline bool shouldAlwaysAllocate() {
  98     return false;
  99 }
 100
 101 template <>
 102 inline bool shouldAlwaysAllocate<intel_compound_bias_t>() {
 103     return true;
 104 }
 105
 106
 107 #undef P_TYPE
 108 #undef PRECISION_TYPE
 109
 110 /**
 111  * @brief  designate actual data quantisation functions trait
 112  */
 113 template <class T>
 114 class Quant {
 115  public:
 116     template<class ...Args>
 117     void operator()(Args && ... args) const { }
 118 };
 119
 120 template<>
 121 class Quant<QuantI16> {
 122  public:
 123     template<class ...Args>
 124     void operator()(Args && ... args) const {
 125         QuantizeAffine16(std::forward<Args>(args)...);
 126     }
 127 };
 128
 129 template<>
 130 class Quant<QuantI8> {
 131  public:
 132     template<class ...Args>
 133     void operator()(Args && ... args) const {
 134         QuantizeAffine8(std::forward<Args>(args)...);
 135     }
 136 };
 137
 138 template<class QuantDesc, class QuantFunc>
 139 inline void quantizeWeightsBiases(const QuantDesc & quantDesc,
 140                                   InferenceEngine::WeightableLayer *wl,
 141                                   const QuantFunc &fnc,
 142                                   bool isDiagonal = false) {  // for diagonal layer number of weights and biases significatly smaller
 143     // for quantized weights
 144     auto intWeights =
 145         make_custom_blob<typename QuantDesc::WeightsPrecision>(InferenceEngine::C, InferenceEngine::SizeVector({wl->_weights->size()}));
 146     intWeights->allocate();
 147     if (intWeights->buffer() == nullptr) {
 148         THROW_GNA_EXCEPTION << InferenceEngine::details::as_status << InferenceEngine::NOT_ALLOCATED
 149                             << "cannot copy weights for layer :"<< wl->name << " of size" << intWeights->byteSize();
 150     }
 151
 152
 153     auto getBiasSizeForLayer = [](InferenceEngine::WeightableLayer *wl) {
 154         if (wl->_biases) {
 155             return wl->_biases->size();
 156         }
 157         // calculating biases len using weight dims
 158         auto & dims = wl->outData.front()->getDims();
 159         return dims[1];
 160     };
 161
 162     using BiasesPrecision = typename QuantDesc::BiasesPrecision;
 163     auto biasMaker = [&] () {
 164         InferenceEngine::Blob::Ptr zero;
 165         if (!wl->_biases && !shouldAlwaysAllocate<BiasesPrecision>()) {
 166             return zero;
 167         }
 168         auto bias = make_custom_blob<BiasesPrecision>(InferenceEngine::C, InferenceEngine::SizeVector({
 169             getBiasSizeForLayer(wl)
 170         }));
 171         bias->allocate();
 172         if (bias->buffer() == nullptr) {
 173             THROW_GNA_EXCEPTION << InferenceEngine::details::as_status << InferenceEngine::NOT_ALLOCATED
 174                                 << "cannot copy bias for layer :"<< wl->name <<"of size" << bias->byteSize();
 175         }
 176
 177         memset(bias->buffer(), 0, bias->byteSize());
 178
 179         return bias;
 180     };
 181     auto intBiases = biasMaker();
 182
 183     float input_scale_factor = 1.f;
 184     if (InferenceEngine::CNNNetHasPrevLayer(wl)) {
 185         auto quantDataForInputLayer =
 186             InferenceEngine::getInjectedData<QuantizedLayerParams>(*InferenceEngine::CNNNetPrevLayer(wl).get());
 187         input_scale_factor = quantDataForInputLayer->_dst_quant.scale;
 188         if (std::isnan(input_scale_factor) ||
 189             std::isinf(input_scale_factor)) {
 190             THROW_IE_EXCEPTION << "Unsupported input scale factor value " << input_scale_factor;
 191         }
 192     }
 193     if (wl->outData[0]->getDims().size() < 2) {
 194         THROW_IE_EXCEPTION << "Unsupported output dims size for " << wl->name <<", should be > 1, but " << wl->outData[0]->getDims().size();
 195     }
 196     if (wl->insData[0].lock().get()->getDims().size() < 2) {
 197         THROW_IE_EXCEPTION << "Unsupported input dims size for " << wl->name << ", should be > 1, but " << wl->insData[0].lock().get()->getDims().size();
 198     }
 199     uint32_t num_rows = isDiagonal ? 1 : wl->outData[0]->getDims()[1];
 200     uint32_t num_columns = wl->insData[0].lock().get()->getDims()[1];
 201
 202     if (wl->type == "AffineFilter") {
 203         // for affine filter layer insdata size not equal to actual coded in input layer
 204         num_columns = wl->_weights->size() / num_rows;
 205     }
 206
 207     if (isDiagonal) {
 208         std::swap(num_rows, num_columns);
 209     }
 210
 211     uint32_t num_rows_padded = num_rows;
 212     uint32_t num_columns_padded = num_columns;
 213
 214     // TODO: replace this into fixed scale quantizer then
 215
 216     auto quantData = InferenceEngine::getInjectedData<QuantizedLayerParams>(*wl);
 217     {
 218         fnc(wl->_weights->buffer().as<float *>(),
 219             wl->_biases ? wl->_biases->buffer().as<float *>() : nullptr,
 220             intWeights->buffer(),
 221             intBiases ? intBiases->buffer() : static_cast<BiasesPrecision *>(nullptr),
 222             input_scale_factor,
 223             &quantData->_weights_quant.scale,
 224             &quantData->_dst_quant.scale,
 225             num_rows,
 226             num_columns,
 227             num_rows_padded,
 228             num_columns_padded);
 229     }
 230     wl->_weights = intWeights;
 231     wl->_biases = intBiases;
 232
 233     /**
 234      * correcting precision for outdata
 235      */
 236     wl->precision = quantDesc.getWeightsPrecision();
 237     for (auto &&outData : wl->outData) {
 238         outData->setPrecision(quantDesc.getOutputPrecision());
 239     }
 240 }
 241
 242
 243 template<class QuantDesc, class QuantFunc>
 244 inline void quantizeWeightsBiasesConv(const QuantDesc & quantDesc,
 245                                   InferenceEngine::WeightableLayer *conv,
 246                                   const QuantFunc &fnc) {
 247     // for quantized weights
 248     auto intWeights = make_custom_blob<typename QuantDesc::WeightsPrecision>(InferenceEngine::C, InferenceEngine::SizeVector({conv->_weights->size()}));
 249     intWeights->allocate();
 250     if (intWeights->buffer() == nullptr) {
 251         THROW_GNA_EXCEPTION << InferenceEngine::details::as_status << InferenceEngine::NOT_ALLOCATED
 252                             << "cannot copy weights for layer :"<< conv->name << " of size" << intWeights->byteSize();
 253     }
 254
 255
 256     auto getBiasSizeForLayer = [](InferenceEngine::WeightableLayer *wl) {
 257         if (wl->_biases) {
 258             return wl->_biases->size();
 259         }
 260         // calculating biases len using weight dims
 261         auto & dims = wl->outData.front()->getDims();
 262         return dims[1];
 263     };
 264
 265     using BiasesPrecision = typename QuantDesc::BiasesPrecision;
 266     auto biasMaker = [&] () {
 267         InferenceEngine::Blob::Ptr zero;
 268         if (!conv->_biases && !shouldAlwaysAllocate<BiasesPrecision>()) {
 269             return zero;
 270         }
 271         auto bias = make_custom_blob<BiasesPrecision>(InferenceEngine::C, InferenceEngine::SizeVector({
 272                                                                                                           getBiasSizeForLayer(conv)
 273                                                                                                       }));
 274         bias->allocate();
 275         if (bias->buffer() == nullptr) {
 276             THROW_GNA_EXCEPTION << InferenceEngine::details::as_status << InferenceEngine::NOT_ALLOCATED
 277                                 << "cannot copy bias for layer :"<< conv->name <<"of size" << bias->byteSize();
 278         }
 279         memset(bias->buffer(), 0, bias->byteSize());
 280
 281         return bias;
 282     };
 283     auto intBiases = biasMaker();
 284
 285     float input_scale_factor = 1.f;
 286     if (InferenceEngine::CNNNetHasPrevLayer(conv)) {
 287         auto quantDataForInputLayer =
 288             InferenceEngine::getInjectedData<QuantizedLayerParams>(*InferenceEngine::CNNNetPrevLayer(conv).get());
 289         input_scale_factor = quantDataForInputLayer->_dst_quant.scale;
 290         if (std::isnan(input_scale_factor) ||
 291             std::isinf(input_scale_factor)) {
 292             THROW_IE_EXCEPTION << "Unsupported input scale factor value " << input_scale_factor;
 293         }
 294     }
 295     if (conv->outData[0]->getDims().size() < 2) {
 296         THROW_IE_EXCEPTION << "Unsupported output dims size for " << conv->name <<", should be > 1, but " << conv->outData[0]->getDims().size();
 297     }
 298     if (conv->insData[0].lock().get()->getDims().size() < 2) {
 299         THROW_IE_EXCEPTION << "Unsupported input dims size for " << conv->name << ", should be > 1, but " << conv->insData[0].lock().get()->getDims().size();
 300     }
 301     auto inputData = conv->insData[0].lock();
 302
 303     uint32_t num_rows = getBiasSizeForLayer(conv);
 304     uint32_t num_columns = conv->_weights->size() / num_rows;
 305
 306     uint32_t num_rows_padded = num_rows;
 307     uint32_t num_columns_padded = num_columns;
 308
 309     // TODO: replace this into fixed scale quantizer then
 310
 311     auto quantData = InferenceEngine::getInjectedData<QuantizedLayerParams>(*conv);
 312     {
 313         fnc(conv->_weights->buffer().as<float *>(),
 314             conv->_biases ? conv->_biases->buffer().as<float *>() : nullptr,
 315             intWeights->buffer(),
 316             intBiases ? intBiases->buffer() : static_cast<BiasesPrecision *>(nullptr),
 317             input_scale_factor,
 318             &quantData->_weights_quant.scale,
 319             &quantData->_dst_quant.scale,
 320             num_rows,
 321             num_columns,
 322             num_rows_padded,
 323             num_columns_padded);
 324     }
 325     conv->_weights = intWeights;
 326     conv->_biases = intBiases;
 327
 328     /**
 329      * correcting precision for outdata
 330      */
 331     conv->precision = quantDesc.getWeightsPrecision();
 332     for (auto &&outData : conv->outData) {
 333         outData->setPrecision(quantDesc.getOutputPrecision());
 334     }
 335 }
 336
 337
 338 class DataQuantizerBase {
 339  public:
 340     explicit DataQuantizerBase(float scaleFactor) : scaleFactor(scaleFactor) {
 341     }
 342  protected:
 343     float scaleFactor = 1.0;
 344 };
 345 /**
 346  * Helper class to use partial specialisation of Layer type
 347  * @tparam Desc
 348  * @tparam Layer
 349  */
 350 template<class Desc, class Layer>
 351 class DataQuantizer : public DataQuantizerBase {
 352  public:
 353     explicit DataQuantizer(float scaleFactor) : DataQuantizerBase(scaleFactor) {}
 354     bool operator()(Layer cnnLayer) const {
 355         return false;
 356     }
 357 };
 358
 359 template<class Desc>
 360 class DataQuantizer<Desc, InferenceEngine::CNNLayer *> : public DataQuantizerBase {
 361  public:
 362     explicit DataQuantizer(float scaleFactor) : DataQuantizerBase(scaleFactor) {}
 363
 364     bool operator()(InferenceEngine::CNNLayer *cnnLayer) const {
 365         for (auto &&outData : cnnLayer->outData) {
 366             outData->setPrecision(Desc::mandatory().getOutputPrecision());
 367         }
 368         // set scale factor for input layers
 369         auto quantData = InferenceEngine::getInjectedData<QuantizedLayerParams>(*cnnLayer);
 370         if (cnnLayer->insData.empty()) {
 371             for (auto &&outData : cnnLayer->outData) {
 372                 outData->setPrecision(Desc::mandatory().getInputPrecision());
 373             }
 374         } else {
 375                 if (LayerInfo(*cnnLayer).isActivation() ||
 376                         LayerInfo(*cnnLayer).isCopy()) {
 377                 // precision of activation layers is always equal input precision
 378                 for (auto &&outData : cnnLayer->outData) {
 379                     outData->setPrecision(Desc::mandatory().getInputPrecision());
 380                 }
 381             }
 382         }
 383         cnnLayer->precision = Desc::mandatory().getInputPrecision();
 384
 385         return true;
 386     }
 387 };
 388
 389
 390 template<class Desc>
 391 class DataQuantizer<Desc, InferenceEngine::SplitLayer *> : public DataQuantizer<Desc, InferenceEngine::CNNLayer *> {
 392     using base = DataQuantizer<Desc, InferenceEngine::CNNLayer *>;
 393  public:
 394     explicit DataQuantizer(float scaleFactor) : base(scaleFactor) {}
 395     bool operator()(InferenceEngine::SplitLayer *splitLayer) const {
 396         base::operator()(splitLayer);
 397         // split layer doesnt change it's data at all
 398         for (auto &&outData : splitLayer->outData) {
 399             outData->setPrecision(Desc::mandatory().getInputPrecision());
 400         }
 401         return true;
 402     }
 403 };
 404
 405 template<class Desc>
 406 class DataQuantizer<Desc, InferenceEngine::ConcatLayer *> : public DataQuantizer<Desc, InferenceEngine::CNNLayer *> {
 407     using base = DataQuantizer<Desc, InferenceEngine::CNNLayer *>;
 408  public:
 409     explicit DataQuantizer(float scaleFactor) : base(scaleFactor) {}
 410     bool operator()(InferenceEngine::ConcatLayer *concatLayer) const {
 411         base::operator()(concatLayer);
 412         for (auto &&outData : concatLayer->outData) {
 413             outData->setPrecision(Desc::mandatory().getInputPrecision());
 414         }
 415         return true;
 416     }
 417 };
 418
 419 template<class Desc>
 420 class DataQuantizer<Desc, InferenceEngine::CropLayer *> : public DataQuantizer<Desc, InferenceEngine::CNNLayer *> {
 421     using base = DataQuantizer<Desc, InferenceEngine::CNNLayer *>;
 422  public:
 423     explicit DataQuantizer(float scaleFactor) : base(scaleFactor) {}
 424     bool operator()(InferenceEngine::CropLayer *cropLayer) const {
 425         base::operator()(cropLayer);
 426         for (auto &&outData : cropLayer->outData) {
 427             outData->setPrecision(Desc::mandatory().getInputPrecision());
 428         }
 429         return true;
 430     }
 431 };
 432
 433 template<class Desc>
 434 class DataQuantizer<Desc, InferenceEngine::ReshapeLayer *> : public DataQuantizer<Desc, InferenceEngine::CNNLayer *> {
 435     using base = DataQuantizer<Desc, InferenceEngine::CNNLayer *>;
 436  public:
 437     explicit DataQuantizer(float scaleFactor) : base(scaleFactor) {}
 438     bool operator()(InferenceEngine::ReshapeLayer *reshapeLayer) const {
 439         base::operator()(reshapeLayer);
 440         // reshape layer doesnt change it's data at all
 441         for (auto &&outData : reshapeLayer->outData) {
 442             outData->setPrecision(Desc::mandatory().getInputPrecision());
 443         }
 444         return true;
 445     }
 446 };
 447
 448 template<class Desc>
 449 class DataQuantizer<Desc, InferenceEngine::WeightableLayer *> : public DataQuantizerBase {
 450  public:
 451     explicit DataQuantizer(float scaleFactor) : DataQuantizerBase(scaleFactor) {}
 452     bool operator()(InferenceEngine::WeightableLayer *wl) const {
 453         quantizeWeightsBiases<typename Desc::MandatoryType>(Desc::mandatory(), wl, Quant<typename Desc::MandatoryType>());
 454         return true;
 455     }
 456 };
 457
 458 template<class Desc>
 459 class DataQuantizer<Desc, InferenceEngine::ConvolutionLayer *> : public DataQuantizerBase {
 460  public:
 461     explicit DataQuantizer(float scaleFactor) : DataQuantizerBase(scaleFactor) {}
 462     bool operator()(InferenceEngine::WeightableLayer *wl) const {
 463         quantizeWeightsBiasesConv<typename Desc::OptionalType>(Desc::optional(), wl, Quant<typename Desc::OptionalType>());
 464         return true;
 465     }
 466 };
 467
 468 template<class Desc>
 469 class DataQuantizer<Desc, InferenceEngine::ScaleShiftLayer *> : public DataQuantizerBase {
 470  public:
 471     explicit DataQuantizer(float scaleFactor) : DataQuantizerBase(scaleFactor) {}
 472     bool operator()(InferenceEngine::ScaleShiftLayer *wl) const {
 473         quantizeWeightsBiases<typename Desc::OptionalType>(Desc::optional(), wl, Quant<typename Desc::OptionalType>(), true);
 474         return true;
 475     }
 476 };
 477
 478 }  // namespace details
 479
 480 template<class Desc>
 481 class LayersQuantizer : public details::DataQuantizerBase {
 482  public:
 483     explicit LayersQuantizer(float scaleFactor) : DataQuantizerBase(scaleFactor) {}
 484     template<class T>
 485     bool operator()(T input) const {
 486         return details::DataQuantizer<Desc, T>(scaleFactor)(input);
 487     }
 488 };
 489
 490 using QuantI16 = details::QuantPair<details::QuantI16, details::QuantI16>;
 491 using QuantI8 = details::QuantPair<details::QuantI8, details::QuantI16>;
 492
 493 }  // namespace GNAPluginNS