inference-engine/src/gna_plugin/quantization/scale_factor_calc.hpp
// Copyright (C) 2018-2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once
#include <vector>
#include <algorithm>
#include <utility>
#include <limits>
#include <string>
#include <map>
#include <cmath>
#include "gna_layer_info.hpp"
#include "ie_layers.h"
#include "gna_plugin_log.hpp"

namespace GNAPluginNS {
namespace details {
using namespace InferenceEngine;
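/**
 * @brief result of a per-layer scale factor update: converts to true when propagation can
 * continue, and carries a non-null restartLayer when propagation has to restart from that layer
 */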
struct ScaleFactorUpdateResult {
    CNNLayer *restartLayer = nullptr;
    ScaleFactorUpdateResult() = default;
    explicit ScaleFactorUpdateResult(CNNLayer * restartlayer) : restartLayer(restartlayer) {
    }
    operator bool() {
        return restartLayer == nullptr;
    }
};

/**
 * @brief calculates output scale factor per layer
 * @tparam T layer type to calculate scale factors for
 */
template<class T>
class ScaleFactorPerLayer {
 public:
    /**
     * @brief calculates the weights scale factor to fit the dynamic range into the target bit size,
     * and also calculates the output scale factor for the given layer
     * @param cnnLayer layer to quantize
     * @param weightsSize size of the quantized weights in bytes
     * @param inputScaleFactor scale factor of the network input
     * @param result update result, set when propagation has to restart from another layer
     * @return true if the scale factors were calculated successfully
     */
    bool operator()(T cnnLayer, int weightsSize, float inputScaleFactor, ScaleFactorUpdateResult &result) {
        return false;
    }
};

template<>
class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
 private:
    const float activation_scale_factor = 2048.f;
    const float identity_scale_factor = 2049.0f;
    const float k = 5;
    const float k_identity = 6;

 protected:
    static bool fp32eq(float p1, float p2) {
        return (std::abs(p1 - p2) <= 0.00001f * std::min(std::abs(p1), std::abs(p2)));
    }
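    /**
     * @brief selects the output scale factor for an activation layer: identity activations get
     * identity_scale_factor, all others activation_scale_factor; relu-family outputs are halved
     * when the product with the input scale would overflow int32
     */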
    float getActivationScale(GNAPluginNS::LayerInfo const& layer, QuantizedLayerParams const* quantizedParams) {
            // todo: calculate proper scale factor where we need to expand it a bit to be safe to stay in int16 weights
            float result = layer.isIdentity() ? identity_scale_factor : activation_scale_factor;
            // if the activation is one from the relu family, we need to apply a heuristic to avoid activation output overflow
            if (layer.isRelu() &&
                    static_cast<uint64_t>(result * quantizedParams->_src_quant.scale)
                                                                > std::numeric_limits<int32_t>::max()-1) {
                result *= 0.5f;
            }
            return result;
    }

 public:
    bool operator()(InferenceEngine::CNNLayer *cnnLayer, int weightsSize, float inputScaleFactor, ScaleFactorUpdateResult &result) {
        if (!cnnLayer) {
            THROW_IE_EXCEPTION << "Incorrect CNNLayer pointer \n";
        }
        LayerInfo layerInfo(*cnnLayer);
        // TODO: the current approach sets the input scale factor for true input layer(s) equal to the provided factor
        auto quant = getInjectedData<QuantizedLayerParams>(*cnnLayer);
        if (InferenceEngine::details::CaselessEq<std::string>()(cnnLayer->type, "Memory")) {
            if (CNNNetHasPrevLayer(cnnLayer)) {
                auto prevLayer = CNNNetPrevLayer(cnnLayer);
                auto prevInfo = LayerInfo(prevLayer);
                auto inputQuant = getInjectedData<QuantizedLayerParams>(prevLayer);
                // locating corresponding memory layers with the same id
                for (auto && input : CNNNetGetAllInputLayers(cnnLayer)) {
                    LayerInfo ll(input);
                    if (!ll.isMemory() ||
                        !InferenceEngine::details::CaselessEq<std::string>()(input->params["id"], cnnLayer->params["id"])) {
                        continue;
                    }

                    auto quantSibling = getInjectedData<QuantizedLayerParams>(input);

                    // after restarting from the memory input - quant is fine
                    if (fp32eq(quantSibling->_dst_quant.scale, inputQuant->_dst_quant.scale)) {
                        quant->_src_quant.scale = quant->_dst_quant.scale = inputQuant->_dst_quant.scale;
                        return true;
                    }

                    if (!fp32eq(quantSibling->_dst_quant.scale, 1)) {
                        // means we already restarted propagation from that memory layer - we cannot do much here
                        THROW_GNA_EXCEPTION << "quantization error : input scale factor ( " << inputQuant->_dst_quant.scale <<") "
                                  << " for " << cnnLayer->name << ", that is child of " << prevLayer->name <<" doesn't match : "
                                  << activation_scale_factor;
                    }

                    gnawarn() << "[INFO] quantization : input scale factor (" << inputQuant->_dst_quant.scale <<")"
                              << " for " << cnnLayer->name << ", that is child of " << prevLayer->name <<" doesn't match : "
                              << activation_scale_factor << ", restarting from corresponding memory: "<< input->name << std::endl;

                    // try updating the memory input layer scale factor and restart from it
                    quantSibling->_src_quant.scale = quantSibling->_dst_quant.scale = inputQuant->_dst_quant.scale;
                    result = ScaleFactorUpdateResult(input.get());
                    return true;
                }
            }
            return true;
        }

        if (!CNNNetHasPrevLayer(cnnLayer)) {
            quant->_dst_quant.scale = inputScaleFactor;
            return true;
        }

        // by default a layer passes its input scale factor through to its output
        auto inputQuant = getInjectedData<QuantizedLayerParams>(CNNNetPrevLayer(cnnLayer));
        quant->_dst_quant.scale = inputQuant->_dst_quant.scale;
        quant->_src_quant.scale = inputQuant->_dst_quant.scale;

        if (layerInfo.isActivation()) {
            // todo: calculate proper scale factor where we need to expand it a bit to be safe to stay in int16 weights
            quant->_dst_quant.scale = getActivationScale(layerInfo, quant);
        }
        return true;
    }
};

template<>
class ScaleFactorPerLayer<InferenceEngine::EltwiseLayer*> {
 public:
    bool operator()(InferenceEngine::EltwiseLayer* eltwiseLayer, int weightsSize, float inputScaleFactor, ScaleFactorUpdateResult &result) {
        if (!eltwiseLayer) {
            THROW_GNA_EXCEPTION << "Incorrect Eltwise Layer pointer \n";
        }
        auto in0 = InferenceEngine::CNNNetPrevLayer(eltwiseLayer, 0);
        auto in1 = InferenceEngine::CNNNetPrevLayer(eltwiseLayer, 1);

        auto quantParams0 = InferenceEngine::getInjectedData<QuantizedLayerParams>(in0);
        auto quantParams1 = InferenceEngine::getInjectedData<QuantizedLayerParams>(in1);
        auto quantData = InferenceEngine::getInjectedData<QuantizedLayerParams>(*eltwiseLayer);

        switch (eltwiseLayer->_operation) {
            case InferenceEngine::EltwiseLayer::Prod: {
                quantData->_weights_quant.scale = quantParams1->_dst_quant.scale;
                quantData->_dst_quant.scale     = quantParams0->_dst_quant.scale * quantParams1->_dst_quant.scale;
                break;
            }
            case InferenceEngine::EltwiseLayer::Sum: {
                // detect which input will be used as biases
                if (LayerInfo(in0).has32BOutput()) {
                    std::swap(in0, in1);
                    std::swap(quantParams0, quantParams1);
                }

                // this path might result in significant data loss
                quantData->_weights_quant.scale = quantParams1->_dst_quant.scale / quantParams0->_dst_quant.scale;
                quantData->_dst_quant.scale = quantParams1->_dst_quant.scale;

                // eltwise will always work in int16
                auto maxValue = std::numeric_limits<int16_t>::max() - 1;
                if (quantData->_weights_quant.scale > maxValue + 1) {
                    // rescaling its activation input
                    // iterating through previous layers of eltwise
                    for (uint8_t i = 0; i < 2; ++i) {
                        InferenceEngine::CNNLayerPtr in = InferenceEngine::CNNNetPrevLayer(eltwiseLayer, i);
                        // trick to get the opposite index (0 -> 1, 1 -> 0) by inverting i
                        auto quantParams =
                                InferenceEngine::getInjectedData<QuantizedLayerParams>(InferenceEngine::CNNNetPrevLayer(eltwiseLayer, !i));

                        for (; InferenceEngine::CNNNetHasPrevLayer(in.get()); in = CNNNetPrevLayer(in)) {
                            auto info = LayerInfo(in);
                            // we skip only split layers so far; memory layers still need handling
                            // this case is for the input from port 0
                            if (info.isSplit() || info.isSlice()) {
                                continue;
                            } else if (info.has16BOutput() && info.isActivation()) {
                                auto newOutputScale = quantParams->_dst_quant.scale / maxValue;
                                if (newOutputScale > std::numeric_limits<int16_t>::max() / 2) {
                                    break;
                                }
                                auto quantDataForActivation = InferenceEngine::getInjectedData<QuantizedLayerParams>(*in);
                                gnawarn() << "[WARNING] saturated weights for " << eltwiseLayer->name
                                         << ". Layer new output scale: " << in->name << ", output_scale=" << newOutputScale
                                         << ", was " << quantDataForActivation->_dst_quant.scale <<"\n" << std::flush;
                                quantDataForActivation->_dst_quant.scale = newOutputScale;
                                result = ScaleFactorUpdateResult(in.get());
                                return true;
                            } else if (info.has16BOutput()) {
                                break;
                            }

                            // if we are here it means that we are on port 1
                            if (info.isFullyConnected() || info.isConvolution()) {
                                auto quantDataForInputLayer = InferenceEngine::getInjectedData<QuantizedLayerParams>(*in);
                                auto newOutputScale = quantParams->_dst_quant.scale * maxValue;
                                auto newWeightScale = newOutputScale / quantDataForInputLayer->_src_quant.scale;
                                quantDataForInputLayer->_dst_quant.scale = newOutputScale;
                                quantDataForInputLayer->_weights_quant.scale = newWeightScale;
                                result = ScaleFactorUpdateResult(in.get());
                                return true;
                            }
                        }
                    }
                    // we were unable to rescale the input - results might be bad
                    gnawarn() << "[INFO] weights saturated for " << eltwiseLayer->name << "\n";
                }
                break;
            }
            default : THROW_GNA_EXCEPTION << "Unsupported Eltwise layer for quantization: " << eltwiseLayer->_operation;
        }
        return true;
    }
};

template<>
class ScaleFactorPerLayer<InferenceEngine::ConcatLayer*> {
 public:
    bool operator()(InferenceEngine::ConcatLayer* concatLayer, int weightsSize, float inputScaleFactor, ScaleFactorUpdateResult &result) {
        if (!concatLayer) {
            THROW_GNA_EXCEPTION << "Incorrect Concat Layer pointer \n";
        }
        auto in0 = InferenceEngine::CNNNetPrevLayer(concatLayer, 0);
        auto in1 = InferenceEngine::CNNNetPrevLayer(concatLayer, 1);
        auto infoIn0 = LayerInfo(in0);
        auto infoIn1 = LayerInfo(in1);
        auto quantParams0 = InferenceEngine::getInjectedData<QuantizedLayerParams>(in0);
        auto quantParams1 = InferenceEngine::getInjectedData<QuantizedLayerParams>(in1);
        GNAPluginNS::QuantizedLayerParams* sourceQuantParams = nullptr;
        auto quantData = InferenceEngine::getInjectedData<QuantizedLayerParams>(*concatLayer);

        if (quantParams0->_dst_quant.scale == quantParams1->_dst_quant.scale) {
            return true;
        } else if (infoIn0.isInput() && infoIn1.isInput()) {
            THROW_GNA_EXCEPTION << "Two Input layers have different scales in concat!!! \n";
        }

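        // use the branch fed by a network Input as the reference scale; the other branch
        // is rescaled to it and propagation restarts from that layer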
        int i = 0;
        if (infoIn0.isInput()) {
            sourceQuantParams = quantParams0;
        } else if (infoIn1.isInput()) {
            ++i;
            sourceQuantParams = quantParams1;
        }

        if (!sourceQuantParams) {
            THROW_GNA_EXCEPTION << "Concat quantization for this case needs to be implemented!!! \n";
        }
        auto destinationQuantParams =
                InferenceEngine::getInjectedData<QuantizedLayerParams>(InferenceEngine::CNNNetPrevLayer(concatLayer, !i));
        InferenceEngine::CNNLayerPtr in = InferenceEngine::CNNNetPrevLayer(concatLayer, !i);

        quantData->_dst_quant.scale = sourceQuantParams->_dst_quant.scale;
        quantData->_src_quant.scale = sourceQuantParams->_dst_quant.scale;

        destinationQuantParams->_dst_quant.scale = sourceQuantParams->_dst_quant.scale;
        result = ScaleFactorUpdateResult(in.get());

        return true;
    }
};

template<>
class ScaleFactorPerLayer<InferenceEngine::WeightableLayer*> {
 private:
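    // empirically chosen weight-scale reduction factors and the thresholds (in multiples of
    // int32 max) that select between them; used below to avoid accumulator saturation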
    float const _scale_reduction_50 = 0.50;
    float const _scale_reduction_45 = 0.45;
    float const _scale_reduction_40 = 0.40;
    float const _scale_reduction_35 = 0.35;

    uint16_t const _scale_change_req_threshold = 30;
    uint16_t const _scale_change_threshold_100 = 100;
    uint16_t const _scale_change_threshold_150 = 150;
    uint16_t const _scale_change_threshold_200 = 200;

 public:
    bool operator()(InferenceEngine::WeightableLayer *wl, int weightsSize, float inputScaleFactor, ScaleFactorUpdateResult &result) {
        if (!wl) {
            THROW_GNA_EXCEPTION << "Incorrect Weightable Layer pointer \n";
        } else if (!wl->_weights) {
            THROW_GNA_EXCEPTION << "Incorrect weight value for " << wl->name << ":" << wl->type << "\n";
        }

        auto prevLayer = CNNNetPrevLayer(wl);
        auto quantDataForInputLayer =
            InferenceEngine::getInjectedData<QuantizedLayerParams>(*prevLayer.get());

        auto quant = InferenceEngine::getInjectedData<QuantizedLayerParams>(*wl);
        // TODO: pass 8 bits somehow
        if (quant->_weights_quant.scale == 1.0f) {
            size_t scaleRange = 0;
            if (weightsSize == 2) {
                scaleRange = MAX_VAL_2B_WEIGHT;
            } else if (weightsSize == 1) {
                scaleRange = MAX_VAL_1B_WEIGHT;
            } else {
                THROW_GNA_EXCEPTION << "Unsupported weights size of: " << weightsSize;
            }
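            // derive a weights scale factor so that the dynamic range of the weights fits into scaleRange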
            quant->_weights_quant.scale =
                ScaleFactorForQuantization(wl->_weights->buffer().as<float *>(), scaleRange, wl->_weights->size());

            // TODO: find out why ???
            if (weightsSize == 1) {
                quant->_weights_quant.scale *= MAX_OUT_MULTIPLIER;
            }
        }

        quant->_src_quant.scale = quantDataForInputLayer->_dst_quant.scale;

        double tmp_dst_quant_scale = quant->_weights_quant.scale * quantDataForInputLayer->_dst_quant.scale;

        if (weightsSize == 1 &&
            static_cast<uint64_t>(tmp_dst_quant_scale * quant->_src_quant.scale) >
                                                    static_cast<uint64_t>(std::numeric_limits<int32_t>::max()-1) * _scale_change_req_threshold) {
            gnawarn() << "Output scale for " << wl->name
                                            << " is too large and is being reduced, otherwise saturation will likely happen \n";
            // reduce the weight scale according to an experimental heuristic
            if (quant->_dst_quant.scale * quant->_src_quant.scale / std::numeric_limits<int32_t>::max() < _scale_change_threshold_100) {
                quant->_weights_quant.scale *= _scale_reduction_50;
                tmp_dst_quant_scale *= _scale_reduction_50;
            } else if (quant->_dst_quant.scale * quant->_src_quant.scale / std::numeric_limits<int32_t>::max() < _scale_change_threshold_150) {
                quant->_weights_quant.scale *= _scale_reduction_45;
                tmp_dst_quant_scale *= _scale_reduction_45;
            } else if (quant->_dst_quant.scale * quant->_src_quant.scale / std::numeric_limits<int32_t>::max() < _scale_change_threshold_200) {
                quant->_weights_quant.scale *= _scale_reduction_40;
                tmp_dst_quant_scale *= _scale_reduction_40;
            } else {
                quant->_weights_quant.scale *= _scale_reduction_35;
                tmp_dst_quant_scale *= _scale_reduction_35;
            }
        }

        quant->_dst_quant.scale = tmp_dst_quant_scale;

        return true;
    }
};

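/**
 * ScaleShift reuses the weightable-layer logic, but its weights are always quantized
 * with weightsSize forced to 2 bytes
 */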
template<>
class ScaleFactorPerLayer<InferenceEngine::ScaleShiftLayer*> : public ScaleFactorPerLayer<InferenceEngine::WeightableLayer*> {
 public:
    bool operator()(InferenceEngine::WeightableLayer *wl, int weightsSize, float inputScaleFactor, ScaleFactorUpdateResult &result) {
        return ScaleFactorPerLayer<InferenceEngine::WeightableLayer*>::operator()(wl, 2, inputScaleFactor, result);
    }
};

/**
 * GNA convolutions cannot be quantized in int8; remove this specialization when the library starts to support that
 */
template<>
class ScaleFactorPerLayer<InferenceEngine::ConvolutionLayer*> : public ScaleFactorPerLayer<InferenceEngine::ScaleShiftLayer*> {
};


}  // namespace details

/**
 * @brief scale factor calculator will calculate only output scale factors for the layer
 * if scale factor propagation is not possible, it will indicate a restart condition
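 *
 * A minimal usage sketch (assuming the caller dispatches on the concrete layer type,
 * e.g. via a transformLayer-style helper; names other than ScaleFactorCalculator itself
 * are illustrative):
 * @code
 * ScaleFactorCalculator sf(sortedLayers, weightsBytesSize, inputScaleFactor);
 * while (!sf.allLayersProcessed()) {
 *     for (auto && layer : sf.getStartLayers()) {
 *         transformLayer(layer, sf);      // calls sf.operator()(ConcreteLayer*)
 *         if (sf.needToRestart()) break;  // a scale factor changed upstream - start over
 *     }
 * }
 * @endcode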
 */
class ScaleFactorCalculator {
    using Cnt = std::vector<InferenceEngine::CNNLayerPtr>;
    Cnt  net;
    mutable Cnt::const_iterator idx;
    float inputScaleFactor;
    mutable bool needRestart = false;
    int weightsBytesSize;

 public:
    ScaleFactorCalculator(Cnt &net, int weightsBytesSize, float inputScaleFactor)
            : net(net), inputScaleFactor(inputScaleFactor), weightsBytesSize(weightsBytesSize) {
        idx = std::begin(this->net);
    }
    bool needToRestart() const {
        return needRestart;
    }
    bool allLayersProcessed() const {
        return idx == std::end(net);
    }
    std::vector<InferenceEngine::CNNLayerPtr> getStartLayers() const {
        return std::vector<InferenceEngine::CNNLayerPtr>(idx, std::end(net));
    }
    template<class T>
    bool operator()(T ptr) const {
        needRestart = false;
        details::ScaleFactorUpdateResult result;
        if (!details::ScaleFactorPerLayer<T>()(ptr, weightsBytesSize, inputScaleFactor, result)) {
            return false;
        }
        if (result) {
            idx++;
            return true;
        }

        idx = std::find_if(net.begin(), net.end(), [&](InferenceEngine::CNNLayerPtr cnnLayer) {
            if (!result) {
                return result.restartLayer == cnnLayer.get();
            }
            return ptr == cnnLayer.get();
        });
        idx++;
        needRestart = true;
        return true;
    }
};

}  // namespace GNAPluginNS