5ef267270d77527de13ade7daa78cf7682c72551
[platform/upstream/dldt.git] / inference-engine / src / legacy_api / src / cnn_network_int8_normalizer.cpp
1 // Copyright (C) 2018-2020 Intel Corporation
2 // SPDX-License-Identifier: Apache-2.0
3 //
4
5 #include "cnn_network_int8_normalizer.hpp"
6
7 #include <data_stats.h>
8 #include <details/ie_cnn_network_tools.h>
9 #include <ie_common.h>
10
11 #include <algorithm>
12 #include <blob_factory.hpp>
13 #include <cassert>
14 #include <cmath>
15 #include <details/caseless.hpp>
16 #include <fstream>
17 #include <limits>
18 #include <map>
19 #include <memory>
20 #include <set>
21 #include <string>
22 #include <utility>
23 #include <vector>
24
25 #include "cnn_network_impl.hpp"
26 #include "cnn_network_stats_impl.hpp"
27 #include "ie_util_internal.hpp"
28
29 IE_SUPPRESS_DEPRECATED_START
30
31 using namespace std;
32 using namespace InferenceEngine;
33 using namespace InferenceEngine::details;
34
35 using StatsMap = std::map<std::string, InferenceEngine::NetworkNodeStatsPtr>;
36
37 CNNStatisticHelper::CNNStatisticHelper(CNNNetwork& network,
38                                        const std::map<std::string, NetworkNodeStatsPtr>& internalNodesStats,
39                                        int maxSign, int maxUnsign) {
40     internalNodesStats_ = internalNodesStats;
41     network_ = network;
42     maxSign_ = maxSign;
43     maxUnsign_ = maxUnsign;
44
45     NormalizeStatistic();
46 }
47
48 bool CNNStatisticHelper::canLayerBeQuantized(CNNLayer::Ptr layer) const {
49     // verification of existing statistic for all inputs
50     for (const auto i : layer->insData) {
51         if (internalNodesStats_.find(i.lock()->getCreatorLayer().lock()->name) == internalNodesStats_.end()) {
52             return false;
53         }
54     }
55     // verification if there is a statistic for output of the layer
56     if ((layer->outData.size() > 1) && (internalNodesStats_.find(layer->name) == internalNodesStats_.end())) {
57         return false;
58     }
59     return true;
60 }
61
62 void CNNStatisticHelper::copyStatistics(const std::string& srcName, const std::string& dstName) {
63     internalNodesStats_[dstName] = internalNodesStats_[srcName];
64 }
65
66 bool CNNStatisticHelper::hasNegativeOutput(const std::string& layerName, int outputPort) const {
67     // TODO(amalyshe) parameter outputPort is not used yet, logic of dedication to the port
68     // should be implemented
69
70     NetworkNodeStatsPtr layerStat = internalNodesStats_.at(layerName);
71     for (auto v : layerStat->_minOutputs) {
72         if (v < 0.f) {
73             return true;
74         }
75     }
76     return false;
77 }
78
// Computes the per-channel input scale blob for `layer` from its producer's
// statistics. Returns nullptr when an input edge has expired.
InferenceEngine::Blob::Ptr CNNStatisticHelper::getInputScale(CNNLayer::Ptr layer) const {
    auto inDataPtr = layer->insData[0].lock();
    if (inDataPtr == nullptr)
        return nullptr;
    auto previousLayer = inDataPtr->getCreatorLayer().lock();
    std::string inputLayerName = previousLayer->name;

    // for case when we have the only average pooling before, we need to take this
    // statistic from input of avg pooling to compensate work of average pooling
    // and to stay in int8 as much as we can
    if (previousLayer->type == "Pooling" &&
        (previousLayer->precision == Precision::I8 || previousLayer->precision == Precision::U8)) {
        // take input name to the pooling
        auto prevInDataPtr = previousLayer->insData[0].lock();
        if (prevInDataPtr == nullptr)
            return nullptr;
        // NOTE(review): inputLayerName is reassigned here but never read again in
        // this function — the statistics below are taken from previousLayer
        // directly. Looks like dead code or an unfinished redirection; confirm.
        inputLayerName = prevInDataPtr->getCreatorLayer().lock()->name;
    }
    size_t inputChannels = inDataPtr->getTensorDesc().getDims()[1];
    // statistics must be per-channel for the producer's output
    if (getStatistic(previousLayer)->_minOutputs.size() != inputChannels ||
        getStatistic(previousLayer)->_maxOutputs.size() != inputChannels) {
        THROW_IE_EXCEPTION << "min and max sizes should be equal to input channels count for " << previousLayer->name;
    }

    // current normalization algorithm can have nodes with fp32 edges. it can happen only in places
    // of initial quantization of int8 chains. Currently adding scaleshift adds certain I8/U8 precision
    // but calculation of scales happens before adding of scale shifts.
    // for fixing problem with cases of not determined yet precision and for following of
    // quantization scheme defined by normalizer, we are adding here verification of negative output
    // in some cases and then verify exact precision of I8/U8 on node for covering of fully determined cases
    int maxValue = hasNegativeOutput(previousLayer->name) ? maxSign_ : maxUnsign_;
    if (previousLayer->outData[0]->getPrecision() == Precision::U8) {
        maxValue = maxUnsign_;
    } else if (previousLayer->outData[0]->getPrecision() == Precision::I8) {
        maxValue = maxSign_;
    }

    return calculateScaleFactor(inputChannels, getStatistic(previousLayer), maxValue);
}
118
119 InferenceEngine::Blob::Ptr CNNStatisticHelper::getOutputScale(CNNLayer::Ptr layer) const {
120     // TODO(amalyshe) for now we are looking to precision on the data node
121     size_t outputChannels = layer->outData[0]->getTensorDesc().getDims()[1];
122     if (layer->outData.size() != 1) {
123         THROW_IE_EXCEPTION << "Trying to get scales after layer having multiple output ports";
124     }
125
126     auto it = internalNodesStats_.find(layer->name);
127     if (it == internalNodesStats_.end()) {
128         return std::shared_ptr<Blob>();
129     }
130
131     if (getStatistic(layer)->_minOutputs.size() != outputChannels ||
132         getStatistic(layer)->_maxOutputs.size() != outputChannels) {
133         THROW_IE_EXCEPTION << "min and max sizes should be equal to output channels count for " << layer->name;
134     }
135
136     return calculateScaleFactor(outputChannels, getStatistic(layer),
137                                 layer->outData[0]->getPrecision() == Precision::I8 ? maxSign_ : maxUnsign_);
138 }
139
// Returns the magnitude bound used for signed int8 quantization ranges.
int CNNStatisticHelper::getMaxSignValue() const {
    return maxSign_;
}
143
144 InferenceEngine::Blob::Ptr CNNStatisticHelper::calculateScaleFactor(size_t channels, NetworkNodeStatsPtr stats,
145                                                                     int maxInt) const {
146     if (stats->_minOutputs.size() != channels || stats->_maxOutputs.size() != channels) {
147         THROW_IE_EXCEPTION << "min and max sizes should be equal to channels count";
148     }
149
150     // Creating i-scale blob
151     std::shared_ptr<Data> iScaleData =
152         std::shared_ptr<Data>(new Data("scale", {Precision::FP32, {channels}, Layout::C}));
153     auto iScale = CreateBlobFromData(iScaleData);
154     iScale->allocate();
155     float* iScaleMemory = static_cast<float*>(iScale->buffer());
156
157     for (int c = 0; c < channels; c++) {
158         // maxc = fmax(maxc, fabs(stats[k]->_minOutputs[c]));        // TODO Check if we should take minimums into
159         // account
160         float maxc = fabs(stats->_maxOutputs[c]);
161         maxc = fmax(maxc, fabs(stats->_minOutputs[c]));
162
163         iScaleMemory[c] = maxc / static_cast<float>(maxInt);
164
165         if (fabs(iScaleMemory[c]) < 1e-7) {
166             iScaleMemory[c] = 1.0f;
167         }
168     }
169     return iScale;
170 }
171
172 NetworkNodeStatsPtr CNNStatisticHelper::getStatistic(CNNLayer::Ptr layer) const {
173     // TODO(amalyshe) all logic of traversing over network and get apropriate statistics should be here
174     // for now it is a stub
175     auto it = internalNodesStats_.find(getLatestInFuse(layer)->name);
176     if (it != internalNodesStats_.end()) {
177         return it->second;
178     }
179     THROW_IE_EXCEPTION << "no stat for layer " << getLatestInFuse(layer)->name;
180 }
181
// Walks forward from `layer` to the last layer of its fusion group
// (Conv+ReLU, Conv+Sum, Conv+Sum+ReLU) and returns it. Statistics for the
// whole fuse are attributed to the returned layer.
CNNLayer::Ptr CNNStatisticHelper::getLatestInFuse(CNNLayer::Ptr layer) const {
    // the single ReLU (or ReLU-like Clamp) consumer is fused into this layer
    if (layer->outData[0]->getInputTo().size() == 1 &&
        (CaselessEq<std::string>()(layer->outData[0]->getInputTo().begin()->second->type, "relu") ||
         CNNNetworkInt8Normalizer::isReLULikeClamp(layer->outData[0]->getInputTo().begin()->second))) {
        return layer->outData[0]->getInputTo().begin()->second;
    }
    // Conv-Sum-ReLU fuse
    // We need to return the original layer if it will be used as the sum operand;
    // iterate over the consumers of this layer and look for the only eltwise
    CNNLayer::Ptr eltwise = nullptr;
    if (layer->outData.size() == 1) {
        for (auto it : layer->outData[0]->getInputTo()) {
            if (CaselessEq<std::string>()(it.second->type, "eltwise")) {
                // more than one eltwise consumer is not supported
                if (eltwise) {
                    THROW_IE_EXCEPTION << "Pattern when one layer pass data to several eltwise layers are not "
                                          "supported in int8 quantization";
                }
                eltwise = it.second;
            }
        }
    }

    if (eltwise) {
        // if current layer is not a convolution return it as finish of fuse
        if (!CaselessEq<std::string>()(layer->type, "convolution")) {
            return layer;
        } else {
            // look to the ports of eltwise
            if (eltwise->insData[0].lock() != nullptr
                    && eltwise->insData[1].lock() != nullptr
                    && eltwise->insData[1].lock()->getCreatorLayer().lock() == layer
                    && CaselessEq<std::string>()(eltwise->insData[0].lock()->getCreatorLayer().lock()->type, "convolution")
                    && eltwise->insData[0].lock()->getInputTo().size() == 1) {
                // this is a case when two convolutions come to eltwise, the second one will be selected for fuse,
                // first will be used as sum operator
                return layer;
            }
            // given layer is a convolution and will be used for fuse, but we need to verify if there is ReLU after
            // eltwise
            if (eltwise->outData[0]->getInputTo().size() == 1 &&
                (CaselessEq<std::string>()(eltwise->outData[0]->getInputTo().begin()->second->type, "relu") ||
                 CNNNetworkInt8Normalizer::isReLULikeClamp(eltwise->outData[0]->getInputTo().begin()->second))) {
                return eltwise->outData[0]->getInputTo().begin()->second;
            }
            return eltwise;
        }
    }

    // no fusable consumer: the layer itself ends the fuse
    return layer;
}
232
// Converts the raw collected statistics (internalNodesStats_) into a
// normalized map: ReLU-like clamp minimums are zeroed, statistics are
// propagated through pass-through layers (Pooling/ReLU), concatenated for
// Concat, and reduced to a single per-tensor range where per-channel scaling
// is not applicable. The result replaces internalNodesStats_.
void CNNStatisticHelper::NormalizeStatistic() {
    StatsMap newMap;

    // In case when we have statistics in negative range when min clamped value is 0,
    // we are changing statistics here to non negative. This is not fully correct behaviour since
    // it can extend range and affect accuracy, but this approach works quite well
    std::vector<CNNLayerPtr> sortedLayersRC = CNNNetSortTopologically(network_);
    for (auto l : sortedLayersRC) {
        if (CNNNetworkInt8Normalizer::isReLULikeClamp(l)) {
            if (l->outData.size() == 1) {
                size_t outputChannels = l->outData[0]->getTensorDesc().getDims()[1];
                auto oldStat = internalNodesStats_.find(l->name);
                if ((oldStat != internalNodesStats_.end()) && outputChannels > 1) {
                    // clamp-with-zero-floor cannot produce negatives
                    for (size_t q = 0; q < oldStat->second->_minOutputs.size(); q++) {
                        oldStat->second->_minOutputs[q] = 0.f;
                    }
                }
            }
        }
    }

    float dummy = 0.0f;  // unused out-parameter for GetDataMinMax's max slot

    std::vector<CNNLayerPtr> sortedLayers = CNNNetSortTopologically(network_);
    for (auto l : sortedLayers) {
        // if layer's statistic exists in the newMap, ignore it
        if (newMap.find(l->name) != newMap.end()) {
            continue;
        }
        // verify if layer is starter layer for propagating of statistic
        bool isStarterLayer = false;

        // a case if we do not have converted statistic before the current layer
        // go over all inputs and verify if statistic exists for all of inputs
        bool allInputsHaveStatistics = true;
        for (auto i : l->insData) {
            if (newMap.find(i.lock()->getCreatorLayer().lock()->name) == newMap.end()) {
                allInputsHaveStatistics = false;
                break;
            }
        }
        // if we do not have statistic - verify who is consumer of this layer
        if (!allInputsHaveStatistics) {
            if (l->outData.size() == 1) {
                // a layer feeding a quantizable consumer can start a statistic chain
                for (auto it : l->outData[0]->getInputTo()) {
                    if (CaselessEq<std::string>()(it.second->type, "scaleshift") ||
                        CaselessEq<std::string>()(it.second->type, "convolution") ||
                        CaselessEq<std::string>()(it.second->type, "fullyconnected")) {
                        isStarterLayer = true;
                        break;
                    }
                }
            }
        } else {
            isStarterLayer = true;
        }
        // quantizable layers always start a chain themselves
        if (CaselessEq<std::string>()(l->type, "scaleshift") || CaselessEq<std::string>()(l->type, "convolution") ||
            CaselessEq<std::string>()(l->type, "fullyconnected")) {
            isStarterLayer = true;
        }

        if (!isStarterLayer) {
            continue;
        }

        // we do not support yet layers for quantization which split data
        if (l->outData.size() != 1) {
            continue;
        }

        InferenceEngine::NetworkNodeStatsPtr currentStat = std::make_shared<NetworkNodeStats>();

        bool perChannelScale = true;

        if (CaselessEq<std::string>()(l->type, "concat") && l->outData.size() == 1 &&
            l->outData[0]->getTensorDesc().getDims().size() == 4 && allInputsHaveStatistics) {
            // concat along channels: statistics are the inputs' statistics
            // appended in input order
            size_t concatLayerIdx = 0;
            for (int k = 0; k < l->insData.size(); k++) {
                auto prevKLayer = l->insData[k].lock()->getCreatorLayer().lock();
                // looking for the statistic for prevKLayer
                auto kLayerStat = newMap.find(prevKLayer->name);
                if (kLayerStat != newMap.end()) {
                    for (size_t ikStat = 0; ikStat < kLayerStat->second->_maxOutputs.size();
                         ikStat++, concatLayerIdx++) {
                        currentStat->_maxOutputs.push_back(kLayerStat->second->_maxOutputs[ikStat]);
                        currentStat->_minOutputs.push_back(kLayerStat->second->_minOutputs[ikStat]);
                    }
                } else {
                    THROW_IE_EXCEPTION << "We have incomplete statistic for predecessors of concat layer " << l->name;
                }
            }
        } else if (CaselessEq<std::string>()(l->type, "resample")) {
            // resample after a concat reuses the concat's statistics;
            // otherwise it keeps its own recorded statistics
            if (l->insData.size() == 1) {
                CNNLayerPtr creator = l->insData[0].lock()->getCreatorLayer().lock();
                if (CaselessEq<std::string>()(creator->type, "concat")) {
                    auto concatStat = newMap[creator->name];
                    currentStat->_maxOutputs = concatStat->_maxOutputs;
                    currentStat->_minOutputs = concatStat->_minOutputs;
                    newMap[l->name] = currentStat;
                } else {
                    auto itOld = internalNodesStats_.find(l->name);
                    if (itOld != internalNodesStats_.end()) {
                        currentStat->_maxOutputs = itOld->second->_maxOutputs;
                        currentStat->_minOutputs = itOld->second->_minOutputs;
                        newMap[l->name] = currentStat;
                    }
                }
            }
        } else {
            // go over all children until we get convolution, scaleshift, eltwise or unknown layer
            // layers Pooling and ReLU are passthrough
            // to understand the granularity of the scaling
            // layer concat is a layer which produce statistics and waterfall it down
            std::vector<CNNLayer::Ptr> toAnalyze;
            for (auto it : l->outData[0]->getInputTo()) {
                toAnalyze.push_back(it.second);
            }

            if (CaselessEq<std::string>()(l->type, "eltwise")) {
                perChannelScale = false;
            }
            // DFS over pass-through consumers; any eltwise or non-depthwise
            // convolution downstream forces a per-tensor scale
            while (!toAnalyze.empty() && perChannelScale) {
                CNNLayer::Ptr tl = toAnalyze.back();
                toAnalyze.pop_back();
                if (CaselessEq<std::string>()(tl->type, "pooling") || CaselessEq<std::string>()(tl->type, "relu") ||
                    CNNNetworkInt8Normalizer::isReLULikeClamp(tl) || CaselessEq<std::string>()(tl->type, "concat")) {
                    if (tl->outData.size() == 1) {
                        for (auto it : tl->outData[0]->getInputTo()) {
                            toAnalyze.push_back(it.second);
                        }
                    }
                } else if (CaselessEq<std::string>()(tl->type, "convolution")) {
                    // verify number of groups
                    ConvolutionLayer* pConv = dynamic_cast<ConvolutionLayer*>(tl.get());
                    if (pConv == nullptr) {
                        THROW_IE_EXCEPTION << "Layer " << tl->name << " is not instance of ConvolutionLayer class";
                    }
                    // only depthwise (group == out_depth) keeps channels independent
                    if (pConv->_group != pConv->_out_depth) {
                        perChannelScale = false;
                    }
                } else if (CaselessEq<std::string>()(tl->type, "eltwise")) {
                    perChannelScale = false;
                }
            }

            auto itOld = internalNodesStats_.find(getLatestInFuse(l)->name);
            if (itOld == internalNodesStats_.end()) {
                itOld = internalNodesStats_.find(l->name);
            }
            if (itOld != internalNodesStats_.end()) {
                if (!perChannelScale) {
                    // collapse to a single per-tensor range: max |value| across
                    // channels for max, global minimum for min
                    currentStat->_maxOutputs.resize(itOld->second->_maxOutputs.size());
                    if (!itOld->second->_maxOutputs.empty()) {
                        // NOTE(review): FLT_MIN is the smallest positive float, not
                        // the most negative; presumably GetDataAbsMax only raises it,
                        // where it is effectively zero — confirm.
                        float max = FLT_MIN;
                        DataStats::GetDataAbsMax(&itOld->second->_maxOutputs[0], itOld->second->_maxOutputs.size(),
                                                 max);
                        std::fill(currentStat->_maxOutputs.begin(), currentStat->_maxOutputs.end(), max);
                    }

                    currentStat->_minOutputs.resize(itOld->second->_minOutputs.size());
                    if (!itOld->second->_minOutputs.empty()) {
                        float min = FLT_MAX;
                        DataStats::GetDataMinMax(&itOld->second->_minOutputs[0], itOld->second->_minOutputs.size(), min,
                                                 dummy);
                        std::fill(currentStat->_minOutputs.begin(), currentStat->_minOutputs.end(), min);
                    }
                } else {
                    currentStat->_maxOutputs = itOld->second->_maxOutputs;
                    currentStat->_minOutputs = itOld->second->_minOutputs;
                }
            }

            // a single recorded (min, max) pair is broadcast to all channels
            if (l->outData.size() == 1) {
                size_t ch_indx = l->outData[0]->getTensorDesc().getDims().size() > 1 ? 1 : 0;
                size_t outputChannels = l->outData[0]->getTensorDesc().getDims()[ch_indx];
                auto oldStat = internalNodesStats_.find(l->name);
                if ((oldStat != internalNodesStats_.end()) && outputChannels > 1 &&
                    oldStat->second->_minOutputs.size() == 1) {
                    auto min = oldStat->second->_minOutputs[0];
                    auto max = oldStat->second->_maxOutputs[0];

                    currentStat->_minOutputs = std::vector<float>(outputChannels);
                    currentStat->_maxOutputs = std::vector<float>(outputChannels);
                    std::fill(currentStat->_minOutputs.begin(), currentStat->_minOutputs.end(), min);
                    std::fill(currentStat->_maxOutputs.begin(), currentStat->_maxOutputs.end(), max);
                }
            }
        }

        // propagate this statistic to all layers without scale in primitives
        if (!currentStat->_maxOutputs.empty() && !currentStat->_minOutputs.empty()) {
            std::vector<CNNLayer::Ptr> toAnalyze;
            toAnalyze.push_back(l);
            while (!toAnalyze.empty()) {
                CNNLayer::Ptr tl = toAnalyze.back();
                toAnalyze.pop_back();
                newMap[tl->name] = currentStat;
                if (tl->outData.size() == 1) {
                    for (auto it : tl->outData[0]->getInputTo()) {
                        if (CaselessEq<std::string>()(it.second->type, "pooling") ||
                            CaselessEq<std::string>()(it.second->type, "relu") ||
                            CNNNetworkInt8Normalizer::isReLULikeClamp(it.second)) {
                            toAnalyze.push_back(it.second);
                        }
                    }
                }
            }
        }
    }

    internalNodesStats_ = newMap;
}
445
446 void CNNNetworkInt8Normalizer::AddLayerToCNNNetworkBeforeLayer(CNNLayer::Ptr newLayer, CNNLayer::Ptr successor,
447                                                                size_t port) {
448     // verify if data exists
449     if (newLayer && successor && successor->insData.size() > port) {
450         // get the insData
451         DataPtr pData = successor->insData[port].lock();
452
453         Data* edge2 = new Data(*pData.get());
454         DataPtr newEdge(edge2);
455         newEdge->getInputTo().clear();
456         newEdge->getInputTo()[successor->name] = successor;
457         newEdge->setName(newLayer->name);
458         newEdge->getCreatorLayer() = newLayer;
459         successor->insData[port] = newEdge;
460         newLayer->outData.push_back(newEdge);
461
462         newLayer->insData.push_back(pData);
463         pData->getInputTo().erase(successor->name);
464         pData->getInputTo()[newLayer->name] = newLayer;
465     } else {
466         THROW_IE_EXCEPTION << "Invalid argument";
467     }
468 }
469
470 CNNLayer::Ptr CNNNetworkInt8Normalizer::addU8ToI8Conversion(DataPtr data, CNNLayer::Ptr successor,
471                                                             CNNStatisticHelper& statHelper) {
472     if (data->getPrecision() == Precision::U8 || data->getPrecision() == Precision::I8) {
473         size_t c = static_cast<size_t>(data->getDims()[1]);
474
475         std::vector<float> ssWValues;
476         std::vector<float> ssSValues;
477         for (auto i = 0; i < c; i++) {
478             ssWValues.push_back(1.0f);
479             ssSValues.push_back(0.0f);
480         }
481         std::string layerName = data->getCreatorLayer().lock()->name + "_Eltwise_ScaleShift_U8I8_" + successor->name;
482         CNNLayer::Ptr newLayer = createDWConvolutionForScale(layerName, c, ssWValues.data(), ssSValues.data());
483         newLayer->precision = Precision::I8;
484
485         for (size_t i = 0; i < successor->insData.size(); i++) {
486             if (successor->insData[i].lock() == data) {
487                 AddLayerToCNNNetworkBeforeLayer(newLayer, successor, i);
488
489                 // update statistic to pass quantization smoothly
490                 if (newLayer->insData[0].lock() == nullptr)
491                     continue;
492                 std::string inputLayerName = newLayer->insData[0].lock()->getCreatorLayer().lock()->name;
493                 statHelper.copyStatistics(inputLayerName, layerName);
494                 if (data->getPrecision() == Precision::U8) {
495                     newLayer->outData[0]->setPrecision(Precision::I8);
496                 } else {
497                     newLayer->outData[0]->setPrecision(Precision::U8);
498                 }
499             }
500         }
501         return newLayer;
502     }
503     return nullptr;
504 }
505
506 void CNNNetworkInt8Normalizer::AddLayerToCNNNetworkAfterData(DataPtr pData, CNNLayer::Ptr layer,
507                                                              const std::string& nextLayerName) {
508     // verify if data exists
509     if (pData && layer && pData->getCreatorLayer().lock() &&
510         pData->getInputTo().find(nextLayerName) != pData->getInputTo().end()) {
511         CNNLayerPtr nextLayer = pData->getInputTo()[nextLayerName];
512
513         DataPtr newEdgeAfterLayer(new Data(*pData.get()));
514         newEdgeAfterLayer->setName(layer->name);
515         newEdgeAfterLayer->getCreatorLayer() = layer;
516         newEdgeAfterLayer->getInputTo().clear();
517         newEdgeAfterLayer->getInputTo()[nextLayerName] = nextLayer;
518         newEdgeAfterLayer->setPrecision(Precision::FP32);
519
520         pData->getInputTo().erase(nextLayerName);
521         pData->getInputTo()[layer->name] = layer;
522
523         layer->insData.push_back(pData);
524         layer->outData.push_back(newEdgeAfterLayer);
525
526         for (size_t i = 0; i < nextLayer->insData.size(); i++) {
527             if (nextLayer->insData[i].lock() == pData) {
528                 nextLayer->insData[i] = newEdgeAfterLayer;
529             }
530         }
531     } else {
532         THROW_IE_EXCEPTION << "Invalid argument";
533     }
534 }
535
536 void CNNNetworkInt8Normalizer::fillInScaleShift(ScaleShiftLayer* scshLayer, size_t c, float* weightsN,
537                                                 float* weightsD) {
538     // Setting "scales"
539     SizeVector weightsSize = {c};
540     TensorDesc weightsDesc(Precision::FP32, weightsSize, InferenceEngine::C);
541     scshLayer->_weights = InferenceEngine::make_shared_blob<float>(weightsDesc);
542     scshLayer->_weights->allocate();
543     float* weightsData = scshLayer->_weights->buffer();
544     for (size_t i = 0; i < c; i++) {
545         if (weightsN == nullptr && weightsD != nullptr) {
546             weightsData[i] = 1.0 / weightsD[i];
547         } else if (weightsD == nullptr && weightsN != nullptr) {
548             weightsData[i] = weightsN[i];
549         } else if (weightsN != nullptr && weightsD != nullptr) {
550             weightsData[i] = weightsN[i] / weightsD[i];
551         } else {
552             weightsData[i] = 1.0;
553         }
554     }
555
556     // Setting "shifts"
557     SizeVector shiftsSize = {c};
558     TensorDesc shiftsDesc(Precision::FP32, shiftsSize, InferenceEngine::C);
559     scshLayer->_biases = InferenceEngine::make_shared_blob<float>(shiftsDesc);
560     scshLayer->_biases->allocate();
561     float* biasesData = scshLayer->_biases->buffer();
562     for (size_t i = 0; i < c; i++) {
563         biasesData[i] = 0.f;  // Setting to constant "0"
564     }
565 }
566
567 void CNNNetworkInt8Normalizer::AddScaleShiftBetween(CNNNetwork& net, const CNNLayerPtr layer1, const CNNLayerPtr layer2,
568                                                     CNNStatisticHelper& statHelper) {
569     if (CaselessEq<std::string>()(layer2->type, "priorbox") ||
570         CaselessEq<std::string>()(layer2->type, "priorboxclustered")) {
571         return;
572     }
573
574     // Searching the connection between the layers
575     int l1_out_i = 0;
576     for (; l1_out_i < layer1->outData.size(); l1_out_i++) {
577         if (layer1->outData[l1_out_i]->getInputTo().find(layer2->name) !=
578             layer1->outData[l1_out_i]->getInputTo().end()) {
579             break;
580         }
581     }
582     if (l1_out_i == layer1->outData.size()) {
583         THROW_IE_EXCEPTION << "Can't find layer " << layer2->name << " among layer " << layer1->name << " outputs";
584     }
585
586     int l2_in_i = 0;
587     for (; l2_in_i < layer2->insData.size(); l2_in_i++) {
588         if (layer2->insData[l2_in_i].lock() != nullptr
589                 && layer2->insData[l2_in_i].lock()->getCreatorLayer().lock() == layer1) {
590             break;
591         }
592     }
593     if (l2_in_i == layer2->insData.size()) {
594         THROW_IE_EXCEPTION << "Can't find layer " << layer2->name << " among layer " << layer1->name << " inputs";
595     }
596
597     DataPtr outData = layer1->outData[l1_out_i];
598
599     Blob::Ptr oScaleBlob = nullptr;
600     if (layer1->blobs.find("o-scale") != layer1->blobs.end()) {
601         oScaleBlob = layer1->blobs["o-scale"];
602     }
603
604     Blob::Ptr iScaleBlob = nullptr;
605     if (layer2->blobs.find("i-scale") != layer2->blobs.end()) {
606         iScaleBlob = layer2->blobs["i-scale"];
607     }
608
609     if (iScaleBlob == nullptr && oScaleBlob == nullptr) {
610         return;  // No multipliers found around this edge. We can't create a ScaleShift here;
611     } else {
612         // Creating a ScaleShiftLayer
613         std::string prefix;
614         float *iScaleBuffer = nullptr, *oScaleBuffer = nullptr;
615         if (oScaleBlob != nullptr) {
616             oScaleBuffer = static_cast<float*>(oScaleBlob->buffer());
617             prefix += "o";
618         }
619         if (iScaleBlob != nullptr) {
620             iScaleBuffer = static_cast<float*>(iScaleBlob->buffer());
621             prefix += "i";
622         }
623
624         std::string layerName = layer1->name + "_" + prefix + "ScaleShift_" + layer2->name;
625         LayerParams ssCnnLayerParams {layerName, "ScaleShift", Precision::FP32};
626         CNNLayerPtr ssCnnLayer(new ScaleShiftLayer(ssCnnLayerParams));
627
628         AddLayerToCNNNetworkAfterData(outData, ssCnnLayer, layer2->name);
629
630         size_t c = static_cast<size_t>(outData->getDims()[1]);
631
632         {
633             ScaleShiftLayer* scshLayer = dynamic_cast<ScaleShiftLayer*>(ssCnnLayer.get());
634             if (scshLayer == nullptr) {
635                 THROW_IE_EXCEPTION << "Layer " << ssCnnLayer->name << " is not instance of ScaleShiftLayer class";
636             }
637             fillInScaleShift(scshLayer, c, oScaleBuffer, iScaleBuffer);
638         }
639
640         Precision odPrecision = Precision::FP32;
641         if (layer2->precision == Precision::I8) {
642             odPrecision = statHelper.hasNegativeOutput(layer1->name) ? Precision::I8 : Precision::U8;
643         }
644         ssCnnLayer->outData[0]->setPrecision(odPrecision);
645     }
646 }
647
648 void CNNNetworkInt8Normalizer::AddScaleShifts(CNNNetwork& net, CNNStatisticHelper& statHelper) {
649     std::vector<CNNLayerPtr> sortedLayers = CNNNetSortTopologically(net);
650
651     std::vector<std::pair<CNNLayerPtr, CNNLayerPtr>> pairs;
652
653     for (auto iter : sortedLayers) {
654         for (int l1_out_i = 0; l1_out_i < iter->outData.size(); l1_out_i++) {
655             for (auto nextIter : iter->outData[l1_out_i]->getInputTo()) {
656                 CNNLayer::Ptr next = nextIter.second;
657
658                 // Checking for an INT8 convolution or fully connected with FP32 output
659                 if ((CaselessEq<std::string>()(iter->type, "Convolution") ||
660                      CaselessEq<std::string>()(iter->type, "FullyConnected")) &&
661                     iter->precision == Precision::I8 && next->precision == Precision::FP32 &&
662                     iter->outData[l1_out_i]->getPrecision() == Precision::FP32) {
663                     // Do nothing here only if iter provides data to fp32 layers
664                     // MKLDNNPlugin will generate x8->f32 convolution
665
666                 } else if ((iter->precision != Precision::FP32 && next->precision == Precision::FP32) ||
667                            (iter->precision == Precision::FP32 && next->precision != Precision::FP32)) {
668                     pairs.push_back(std::pair<CNNLayerPtr, CNNLayerPtr>(iter, next));
669                 }
670             }
671         }
672     }
673
674     for (auto& pair : pairs) {
675         AddScaleShiftBetween(net, pair.first, pair.second, statHelper);
676     }
677 }
678
679 void CNNNetworkInt8Normalizer::ClampsToReLU(CNNNetwork& net, CNNStatisticHelper& statHelper) {
680     std::vector<CNNLayerPtr> sortedLayers = CNNNetSortTopologically(net);
681
682     for (auto iter : sortedLayers) {
683         if (isReLULikeClamp(iter) && (iter->precision == Precision::I8 || iter->precision == Precision::U8)) {
684             std::string layerName = iter->name + "_ReLU";
685             LayerParams ssCnnLayerParams {layerName, "ReLU", iter->precision};
686             CNNLayerPtr ssCnnLayer(new ReLULayer(ssCnnLayerParams));
687
688             auto previousLayer = iter->insData[0].lock()->getCreatorLayer().lock();
689             ssCnnLayer->insData.push_back(iter->insData[0]);
690             if (ssCnnLayer->insData[0].lock() == nullptr)
691                 continue;
692             ssCnnLayer->insData[0].lock()->getInputTo().erase(iter->name);
693             ssCnnLayer->insData[0].lock()->getInputTo()[iter->name] = ssCnnLayer;
694
695             ssCnnLayer->outData.push_back(iter->outData[0]);
696             ssCnnLayer->outData[0]->getCreatorLayer() = ssCnnLayer;
697
698             iter->insData.clear();
699             iter->outData.clear();
700         }
701     }
702 }
703
704 void CNNNetworkInt8Normalizer::ScaleDataToInt(const float* srcData, size_t srcSize, Blob::Ptr int8blob,
705                                               const std::vector<float>& scales) {
706     if (scales.size() == 0 || /*srcblob->size()*/ srcSize % scales.size() != 0) {
707         THROW_IE_EXCEPTION << "Wrong number of scale factors";
708     }
709
710     size_t channels = scales.size();
711     size_t channelSize = /*srcblob->size()*/ srcSize / channels;
712
713     const float* data = srcData;
714     if (int8blob->getTensorDesc().getPrecision() == Precision::I8) {
715         int8_t* int8data = static_cast<int8_t*>(int8blob->buffer());
716         int minValue = std::numeric_limits<int8_t>::min();
717         int maxValue = std::numeric_limits<int8_t>::max();
718
719         size_t offset;
720
721         float val;
722
723         for (size_t ch = 0; ch < channels; ch++) {
724             offset = channelSize * ch;
725
726             for (size_t i = 0; i < channelSize; i++) {
727                 val = data[offset + i] * scales[ch];
728
729                 if (val > maxValue) {
730                     val = maxValue;
731                 } else if (val < minValue) {
732                     val = minValue;
733                 }
734
735                 int8data[offset + i] = round(val);
736             }
737         }
738     } else if (int8blob->getTensorDesc().getPrecision() == Precision::I32) {
739         int32_t* int32data = static_cast<int32_t*>(int8blob->buffer());
740         int maxValue = std::numeric_limits<int32_t>::max();
741         int minValue = std::numeric_limits<int32_t>::min();
742
743         size_t offset;
744
745         float val;
746
747         for (size_t ch = 0; ch < channels; ch++) {
748             offset = channelSize * ch;
749
750             for (size_t i = 0; i < channelSize; i++) {
751                 val = data[offset + i] * scales[ch];
752
753                 if (val > maxValue) {
754                     val = maxValue;
755                 } else if (val < minValue) {
756                     val = minValue;
757                 }
758
759                 int32data[offset + i] = round(val);
760             }
761         }
762     }
763 }
764
765 CNNLayer::Ptr CNNNetworkInt8Normalizer::createDWConvolutionForScale(const std::string& layerName, size_t channels,
766                                                                     float* ssWValues, float* ssSValues) {
767     // create new Convolution layer
768     LayerParams params;
769     params.name = layerName;
770     params.precision = Precision::FP32;
771     params.type = "Convolution";
772
773     CNNLayerPtr lptr = std::make_shared<ConvolutionLayer>(params);
774     auto* pConv = dynamic_cast<ConvolutionLayer*>(lptr.get());
775     if (pConv == nullptr) {
776         THROW_IE_EXCEPTION << "Layer " << lptr->name << " is not instance of ConvolutionLayer class";
777     }
778
779     pConv->_kernel.insert(X_AXIS, 1);
780     pConv->_kernel.insert(Y_AXIS, 1);
781     pConv->_stride.insert(X_AXIS, 1);
782     pConv->_stride.insert(Y_AXIS, 1);
783     pConv->_padding.insert(X_AXIS, 0);
784     pConv->_padding.insert(Y_AXIS, 0);
785     pConv->_pads_end.insert(X_AXIS, 0);
786     pConv->_pads_end.insert(Y_AXIS, 0);
787     pConv->_dilation.insert(X_AXIS, 1);
788     pConv->_dilation.insert(Y_AXIS, 1);
789
790     pConv->_out_depth = channels;
791     // mkl-dnn does not have i8 depthwise convolution accepting signed i8 input
792     // when it is available, need to uncomment below lines
793
794     // workaround - creation of new weights for simple convolution
795     if (pConv->_out_depth % 16 == 0) {
796         pConv->_group = pConv->_out_depth / 16;
797         Blob::Ptr weights = nullptr;
798         std::shared_ptr<Data> wData =
799             std::shared_ptr<Data>(new Data("weights", {Precision::FP32, {pConv->_out_depth * 16}, Layout::C}));
800         weights = CreateBlobFromData(wData);
801         weights->allocate();
802         float* buffer = weights->buffer().as<float*>();
803         size_t iDist = 0, iSrc = 0;
804         for (size_t g = 0; g < pConv->_group; g++) {
805             for (size_t k = 0; k < 16; k++) {
806                 for (size_t s = 0; s < 16; s++) {
807                     buffer[iDist++] = (s == k) ? ssWValues[iSrc++] : 0.f;
808                 }
809             }
810         }
811         pConv->_weights = weights;
812         pConv->blobs["weights"] = weights;
813     } else {
814         Blob::Ptr weights = nullptr;
815         std::shared_ptr<Data> wData = std::shared_ptr<Data>(
816             new Data("weights", {Precision::FP32, {pConv->_out_depth * pConv->_out_depth}, Layout::C}));
817         weights = CreateBlobFromData(wData);
818         weights->allocate();
819         float* buffer = weights->buffer().as<float*>();
820         for (size_t i = 0, idx = 0; i < pConv->_out_depth; i++) {
821             for (size_t j = 0; j < pConv->_out_depth; j++) {
822                 if (i == j) {
823                     buffer[idx] = ssWValues[i];
824                 } else {
825                     buffer[idx] = 0.f;
826                 }
827                 idx++;
828             }
829         }
830         pConv->_weights = weights;
831         pConv->blobs["weights"] = weights;
832         pConv->_group = 1;
833     }
834     // end of workaround
835
836     // fililng of biases
837     Blob::Ptr biasesBlob = nullptr;
838     std::shared_ptr<Data> bData =
839         std::shared_ptr<Data>(new Data("biases", {Precision::FP32, {pConv->_out_depth}, Layout::C}));
840     biasesBlob = CreateBlobFromData(bData);
841     biasesBlob->allocate();
842     float* bufferBiases = biasesBlob->buffer().as<float*>();
843     for (size_t c = 0; c < pConv->_out_depth; c++) {
844         bufferBiases[c] = ssSValues[c];
845     }
846     pConv->_biases = biasesBlob;
847
848     pConv->blobs["weights"] = pConv->_weights;
849     pConv->blobs["biases"] = pConv->_biases;
850     return lptr;
851 }
852
853 void CNNNetworkInt8Normalizer::replaceScaleShiftByDWConvolution(CNNNetwork& net) {
854     std::vector<CNNLayerPtr> sortedLayers = CNNNetSortTopologically(net);
855     for (auto layer : sortedLayers) {
856         if (CaselessEq<std::string>()(layer->type, "scaleshift") &&
857             layer->insData[0].lock()->getCreatorLayer().lock() &&
858             !CaselessEq<std::string>()(layer->insData[0].lock()->getCreatorLayer().lock()->type, "input") &&
859             layer->outData[0]->getInputTo().size() > 0) {
860             const auto dims = layer->insData[0].lock()->getTensorDesc().getDims();
861             // only four or five dimensions Convolution layers are supported
862             if ((dims.size() == 4) || (dims.size() == 5)) {
863                 // verification if this layer does not pass data to PriorBox, if it passes, we do not substitute
864                 bool notToPriorBox = true;
865                 for (auto o : layer->outData[0]->getInputTo()) {
866                     if (CaselessEq<std::string>()(o.second->type, "priorbox") ||
867                         CaselessEq<std::string>()(o.second->type, "priorboxclustered")) {
868                         notToPriorBox = false;
869                     }
870                 }
871                 if (notToPriorBox) {
872                     ScaleShiftLayer* pSS = dynamic_cast<ScaleShiftLayer*>(layer.get());
873                     float* ssWValues = pSS->_weights->buffer().as<float*>();
874                     float* ssSValues = pSS->_biases->buffer().as<float*>();
875                     CNNLayer::Ptr newLayer = createDWConvolutionForScale(
876                         layer->name, layer->outData[0]->getTensorDesc().getDims()[1], ssWValues, ssSValues);
877
878                     newLayer->outData = layer->outData;
879                     newLayer->outData[0]->getCreatorLayer() = newLayer;
880                     newLayer->insData = layer->insData;
881                     if (newLayer->insData[0].lock() == nullptr)
882                         continue;
883                     newLayer->insData[0].lock()->getInputTo().erase(layer->name);
884                     newLayer->insData[0].lock()->getInputTo()[newLayer->name] = newLayer;
885                 }
886             }
887         }
888     }
889 }
890
// Quantizes the weights and biases of a Convolution or FullyConnected layer:
// weights become I8 blobs, biases become I32 blobs, and the scale blobs the
// plugin needs at execution time ("i-scale", "w-scale", "o-scale"/"ext-scale")
// are attached to the layer. If no output statistics are available, the output
// edge is forced back to FP32.
void CNNNetworkInt8Normalizer::QuantizeConvolutionOrFullyConnected(CNNLayer::Ptr target_layer,
                                                                   CNNStatisticHelper& statHelper) {
    // dimension 1 is the channel axis for both Convolution and FullyConnected
    size_t inputChannels = target_layer->insData[0].lock()->getTensorDesc().getDims()[1];
    size_t outputChannels = target_layer->outData[0]->getTensorDesc().getDims()[1];

    auto iScale = statHelper.getInputScale(target_layer);
    if (iScale == nullptr)
        THROW_IE_EXCEPTION << "Layer '" << target_layer->name << "'has invalid scale";

    target_layer->blobs["i-scale"] = iScale;

    Blob::Ptr weights = nullptr;
    Blob::Ptr biases = nullptr;

    Blob::Ptr int8weights = nullptr;
    Blob::Ptr int32biases = nullptr;

    if (target_layer->blobs.find("weights") != target_layer->blobs.end()) {
        weights = target_layer->blobs["weights"];

        // Creating int8 weights blob
        std::shared_ptr<Data> int8WeightsData =
            std::shared_ptr<Data>(new Data("weights", TensorDesc(Precision::I8, weights->getTensorDesc().getDims(),
                                                                 weights->getTensorDesc().getLayout())));
        int8weights = CreateBlobFromData(int8WeightsData);
        int8weights->allocate();
        target_layer->blobs["weights"] = int8weights;
    }

    if (target_layer->blobs.find("biases") != target_layer->blobs.end()) {
        biases = target_layer->blobs["biases"];

        // Creating int8 biases blob
        std::shared_ptr<Data> int32BiasesData =
            std::shared_ptr<Data>(new Data("biases", TensorDesc(Precision::I32, biases->getTensorDesc().getDims(),
                                                                biases->getTensorDesc().getLayout())));
        int32biases = CreateBlobFromData(int32BiasesData);
        int32biases->allocate();
        target_layer->blobs["biases"] = int32biases;
    }

    // one scaler per output channel; also reused below to quantize the biases
    std::vector<float> weightScalers;

    // Creating w-scale blob
    if (weights) {
        const float* weight = static_cast<const float*>(weights->buffer());

        // null for FullyConnected - then group defaults to 1
        ConvolutionLayer* pConv1 = dynamic_cast<ConvolutionLayer*>(target_layer.get());

        if (pConv1 != nullptr && pConv1->_group == 0) {
            THROW_IE_EXCEPTION << "Convolution '" << target_layer->name << "'has wrong groups number == 0";
        }
        int group = 1;
        if (pConv1 != nullptr && pConv1->_group != 1) {
            group = pConv1->_group;
        }

        std::vector<float> newWeights;  // "new" weights are weights multiplied by i-scale

        // weights are assumed laid out as [group][out-ch][in-ch][spatial]
        size_t W_CO = outputChannels / group, W_CI = inputChannels / group,
               W_HW = weights->size() / W_CI / W_CO / group;

        {
            // fold the per-input-channel i-scale into the weights so a single
            // per-output-channel multiplier (w-scale) suffices at runtime
            float* iScaleMemory = static_cast<float*>(iScale->buffer());
            for (size_t g = 0; g < group; g++) {
                for (size_t co = 0; co < W_CO; co++) {
                    for (size_t ci = 0; ci < W_CI; ci++) {
                        size_t kernelBase = g * W_CO * W_CI * W_HW + co * W_CI * W_HW + ci * W_HW;
                        for (size_t hw = 0; hw < W_HW; hw++) {
                            newWeights.push_back(weight[kernelBase + hw] * iScaleMemory[g * W_CI + ci]);
                        }
                    }
                }
            }
        }
        if (newWeights.empty())
            THROW_IE_EXCEPTION << "Could not quantize layer '" << target_layer->name << "'. Invalid layer parameters.";
        size_t outChannelSize = weights->getTensorDesc().getDims().back() / W_CO / group;

        // Calculating weights normalization scale factor (w-scale)

        std::set<double> individualsG;   // distinct weight values (capped at 256)
        size_t co;
        float* weight_convolution;
        bool bwquantized = false;        // true if weights already look int8-quantized
        double symQuant = 0.f;           // quantum step restored from the weights

        for (co = 0, weight_convolution = &newWeights[0]; co < outputChannels;
             co++, weight_convolution += outChannelSize) {
            for (size_t i = 0; i < outChannelSize && individualsG.size() < 256; i++) {
                individualsG.insert(static_cast<double>(weight_convolution[i]));
            }
        }
        // If we have 256 quantums for all filters in convolution, it can be already int8 quantized weights
        // We can support symmetric quantization
        // Below conditions verify if weights are symmetric quantized around 0, what are min/max borders
        // These parameters are required to repeat exactly the same quantum as model was trained
        // The algorithm of restoring min/max parameters has couple assumptions which might not work for 100%
        // cases. We want to explicitly define them. We assume that
        // 1. All convolutions have 1st quantum either from positive or negative side. See how we calculate symQuant
        // 2. If quantization is not symmetric, there should be quant on one of the side which demonstrate this
        if (individualsG.size() < 256) {
            // going over weights and verify that weights stay on quant positions
            std::set<double> intervals;
            double prev = 0.f;
            for (auto it = individualsG.begin(); it != individualsG.end(); it++) {
                if (prev) {
                    intervals.insert(*it - prev);
                }
                prev = *it;
            }
            // smallest gap between neighboring distinct values = candidate quantum
            if (!intervals.empty()) {
                symQuant = *(intervals.begin());
            }
            // every gap must be an integer multiple of the candidate quantum
            std::set<double> divs;
            if (symQuant != 0.) {
                prev = 0.f;
                for (auto it = individualsG.begin(); it != individualsG.end(); it++) {
                    if (prev) {
                        divs.insert((*it - prev) / symQuant);
                    }
                    prev = *it;
                }
            }

            bwquantized = true;
            for (auto it3 = divs.begin(); it3 != divs.end(); it3++) {
                if (fabs(round(*it3) - *it3) > 0.001) {
                    bwquantized = false;
                }
            }

            // we want to make sure that quantization is symmetric. this way we are looking for the
            // value in weights matching to the quant (positive or negative
            if (bwquantized) {
                // take the minimal and maximum values on calculated symQuant and compare with data from individuals
                double minCalc = symQuant * -128.0f;
                double maxCalc = symQuant * 128.0f;
                for (auto it = individualsG.begin(); it != individualsG.end(); it++) {
                    if (*it < minCalc || *it > maxCalc) {
                        bwquantized = false;
                    }
                }
            }
        }
        if (bwquantized && symQuant != 0.0f) {
            // pre-quantized weights: one common scaler restores the original quantum
            float max = symQuant * 127.0f;
            for (co = 0, weight_convolution = &newWeights[0]; co < outputChannels;
                 co++, weight_convolution += outChannelSize) {
                float scaler = static_cast<float>(statHelper.getMaxSignValue()) / max;
                weightScalers.push_back(scaler);
            }
        } else {
            // generic case: per-output-channel scaler from the channel's abs-max
            for (co = 0, weight_convolution = &newWeights[0]; co < outputChannels;
                 co++, weight_convolution += outChannelSize) {
                float max = FLT_MIN;
                DataStats::GetDataAbsMax(weight_convolution, outChannelSize, max);

                float scaler = static_cast<float>(statHelper.getMaxSignValue()) / max;
                weightScalers.push_back(scaler);
            }
        }

        std::shared_ptr<Data> wScaleData =
            std::shared_ptr<Data>(new Data("w-scale", {Precision::FP32, {outputChannels}, Layout::C}));
        auto wScale = CreateBlobFromData(wScaleData);
        wScale->allocate();

        float* wScaleMemory = static_cast<float*>(wScale->buffer());

        // w-scale is the inverse of the quantization scaler
        for (size_t i = 0; i < outputChannels; i++) {
            wScaleMemory[i] = 1.0 / weightScalers[i];
        }
        target_layer->blobs["w-scale"] = wScale;

        auto oScale = statHelper.getOutputScale(statHelper.getLatestInFuse(target_layer));
        if (oScale) {
            // there might not be o-scale if we do not have statistic after convolution that means
            // returning to float precision after convolution
            target_layer->blobs["o-scale"] = oScale;

            // debug scales. Need to compare with actual values in FP32 scoring
            target_layer->blobs["ext-scale"] = target_layer->blobs["o-scale"];
        } else {
            // we do not have statistics here, we cannot calculate requantizatin scales,
            // next layer will be calculated in fp32
            // it's time to return forcedly edge to fp32 as well
            target_layer->outData[0]->setPrecision(Precision::FP32);
        }

        // Normalizing the weights
        ScaleDataToInt(&newWeights[0], weights->size(), int8weights, weightScalers);
    }

    // Normalizing the biases
    if (biases) {
        const float* bias = static_cast<const float*>(biases->buffer());
        ScaleDataToInt(bias, biases->size(), int32biases, weightScalers);
    }
}
1091
1092 bool CNNNetworkInt8Normalizer::layerProducesFloat(const CNNLayer::Ptr layer) {
1093     // currently we support only case of layers which have one output port
1094     if (layer->outData.size() > 1) {
1095         return false;
1096     }
1097
1098     bool consumersFP32 = true;
1099     for (const auto dOut : layer->outData[0]->getInputTo()) {
1100         if (dOut.second->precision != Precision::FP32) {
1101             consumersFP32 = false;
1102         }
1103     }
1104     return consumersFP32;
1105 }
1106
// Walks backwards from 'layer' over pooling/concat/relu/relu-like-clamp chains
// and returns to FP32 every tail whose consumers are all FP32, so that no
// int8 data remains at the very end of the quantized region.
void CNNNetworkInt8Normalizer::returnTailToFP32(const CNNLayer::Ptr layer) {
    // worklist of candidate layers to flip back to FP32
    std::set<CNNLayer::Ptr> layersToReturn;
    if (layerProducesFloat(layer)) {
        layersToReturn.insert(layer);
    }

    while (!layersToReturn.empty()) {
        CNNLayer::Ptr layerA = *layersToReturn.begin();
        layersToReturn.erase(layerA);
        // 1. if it is Pooling layer, or concat layer, we can return it to FP32 as well
        // we need to return it's out data
        if ((CaselessEq<std::string>()(layerA->type, "pooling") || CaselessEq<std::string>()(layerA->type, "concat")) &&
            layerA->outData.size() == 1) {
            layerA->precision = Precision::FP32;
            layerA->outData[0]->setPrecision(Precision::FP32);
        }

        // conv/fc/relu keep int8 execution but get an FP32 output edge
        if ((CaselessEq<std::string>()(layerA->type, "convolution") ||
             CaselessEq<std::string>()(layerA->type, "fullyconnected") ||
             CaselessEq<std::string>()(layerA->type, "relu") || isReLULikeClamp(layerA)) &&
            layerA->outData.size() == 1) {
            layerA->outData[0]->setPrecision(Precision::FP32);
            // NOTE(review): when a ReLU's producer is a fusion candidate
            // (canLayerBeI8), the ReLU and its producer's output edge are
            // presumably flipped together so the fused pair stays consistent
            // - confirm against the fusion logic in DefinesExecutionPrecision
            if (CaselessEq<std::string>()(layerA->type, "relu")
                    && layerA->insData[0].lock() != nullptr
                    && canLayerBeI8(layerA->insData[0].lock()->getCreatorLayer().lock())) {
                layerA->precision = Precision::FP32;
                layerA->insData[0].lock()->getCreatorLayer().lock()->outData[0]->setPrecision(Precision::FP32);
            }
        }

        // adding parents for analysis
        if (!CaselessEq<std::string>()(layerA->type, "convolution") &&
            !CaselessEq<std::string>()(layerA->type, "fullyconnected")) {
            // for all parents, if they produce data to only FP32 layers
            for (auto i : layerA->insData) {
                DataPtr d = i.lock();
                if (d != nullptr && d->getCreatorLayer().lock()->precision != Precision::FP32 &&
                    (CaselessEq<std::string>()(layerA->type, "pooling") ||
                     CaselessEq<std::string>()(layerA->type, "relu") || isReLULikeClamp(layerA) ||
                     CaselessEq<std::string>()(layerA->type, "concat"))) {
                    if (layerProducesFloat(d->getCreatorLayer().lock())) {
                        layersToReturn.insert(d->getCreatorLayer().lock());
                    }
                }
            }
        }
    }
}
1155
1156 bool CNNNetworkInt8Normalizer::canLayerBeI8(const CNNLayer::Ptr& layer) {
1157     // fusion can happen only if initial layer supplies data to only one layer
1158     // if it sends to several layers - it is safe to execute initial layer in any precision
1159     if (layer->outData[0]->getInputTo().size() == 1) {
1160         std::string aType = layer->outData[0]->getInputTo().begin()->second->type;
1161         if (CaselessEq<std::string>()(aType, "relu")) {
1162             return true;
1163         } else if (CaselessEq<std::string>()(aType, "clamp")) {
1164             if (!isReLULikeClamp(layer->outData[0]->getInputTo().begin()->second)) {
1165                 return false;
1166             }
1167         } else {
1168             static const InferenceEngine::details::caseless_set<std::string> nonSuportedActivations = {
1169                 "elu",  "clamp",  "tanh",        "logistic",  "square", "abs",
1170                 "sqrt", "linear", "bounded_elu", "sort_relu", "relu6"};
1171             return nonSuportedActivations.find(aType) == nonSuportedActivations.end();
1172         }
1173     }
1174     return true;
1175 }
1176
1177 bool CNNNetworkInt8Normalizer::isNextFusionAllowed(const CNNLayer::Ptr& layer) {
1178     // fusion can happen only if initial layer supplies data to only one layer
1179     // if it sends to several layers - it is safe to execute initial layer in any precision
1180     if (layer->outData[0]->getInputTo().size() == 1) {
1181         std::string aType = layer->outData[0]->getInputTo().begin()->second->type;
1182         if (CaselessEq<std::string>()(aType, "relu")) {
1183             ReLULayer* rL = dynamic_cast<ReLULayer*>(layer->outData[0]->getInputTo().begin()->second.get());
1184             if (rL == nullptr) {
1185                 THROW_IE_EXCEPTION << "Layer " << layer->outData[0]->getInputTo().begin()->second->name
1186                                    << " is not instance of ReLULayer class";
1187             }
1188             if (rL->negative_slope != 0.f) {
1189                 return false;
1190             }
1191         } else if (CaselessEq<std::string>()(aType, "clamp")) {
1192             if (!isReLULikeClamp(layer->outData[0]->getInputTo().begin()->second)) {
1193                 return false;
1194             }
1195         } else {
1196             static const InferenceEngine::details::caseless_set<std::string> nonSuportedActivations = {
1197                 "elu",  "clamp",  "tanh",        "logistic",  "square", "abs",
1198                 "sqrt", "linear", "bounded_elu", "sort_relu", "relu6"};
1199             return nonSuportedActivations.find(aType) == nonSuportedActivations.end();
1200         }
1201     } else {
1202         if (CaselessEq<std::string>()(layer->type, "eltwise")) {
1203             return false;
1204         }
1205     }
1206     return true;
1207 }
1208
1209 bool CNNNetworkInt8Normalizer::isReLULikeClamp(CNNLayer::Ptr layer) {
1210     if (CaselessEq<std::string>()(layer->type, "Clamp")) {
1211         ClampLayer* clamp = dynamic_cast<ClampLayer*>(layer.get());
1212         if (clamp == nullptr) {
1213             THROW_IE_EXCEPTION << "Int8 Normalizer error: cannot cast layer '" << layer->name << "' to Clamp";
1214         }
1215         return clamp->min_value == 0;
1216     }
1217     return false;
1218 }
1219
1220 void CNNNetworkInt8Normalizer::DefinesExecutionPrecision(CNNNetwork& net, CNNStatisticHelper& statHelper) {
1221     std::vector<CNNLayerPtr> sortedLayers = CNNNetSortTopologically(net);
1222
1223     // Converting layers to Int8. Calculating the multipliers if needed
1224     for (auto iter : sortedLayers) {
1225         if (iter->params.find("quantization_level") != iter->params.end() &&
1226             (iter->params["quantization_level"] == "FP32" || iter->params["quantization_level"] == "FP16")) {
1227             continue;
1228         }
1229
1230         // Legacy: FullyConnected should not be converted to Int8,
1231         // if it isn't explicitly marked to.
1232         if (iter->params.find("quantization_level") == iter->params.end() &&
1233             CaselessEq<std::string>()(iter->type, "fullyconnected")) {
1234             continue;
1235         }
1236
1237         if (!statHelper.canLayerBeQuantized(iter)) {
1238             continue;
1239         }
1240
1241         if (CaselessEq<std::string>()(iter->type, "convolution") ||
1242             CaselessEq<std::string>()(iter->type, "fullyconnected")) {
1243             if (canLayerBeI8(iter)) {
1244                 iter->precision = Precision::I8;
1245                 // we will override I8 to U8 during analysing of Conv-ReLU and Conv-Sum-ReLU fusions
1246                 iter->outData[0]->setPrecision(Precision::I8);
1247             }
1248         } else if (CaselessEq<std::string>()(iter->type, "relu") || isReLULikeClamp(iter)) {
1249             // casting to ReLU
1250             ReLULayer* rL = dynamic_cast<ReLULayer*>(iter.get());
1251             DataPtr outData = iter->outData.size() ? iter->outData[0] : nullptr;
1252             auto inputData = iter->insData[0].lock();
1253             if (inputData && inputData->getCreatorLayer().lock()->precision != Precision::FP32 &&
1254                 outData->getPrecision() == Precision::FP32) {
1255                 iter->precision = Precision::I8;
1256                 if (rL != nullptr && rL->negative_slope != 0.0f) {
1257                     outData->setPrecision(Precision::I8);
1258                 } else {
1259                     outData->setPrecision(Precision::U8);
1260                     // if convolution is a predecessor, change its data to U8 also
1261                     CNNLayer::Ptr prevLayer = inputData->getCreatorLayer().lock();
1262                     if (prevLayer && (CaselessEq<std::string>()(prevLayer->type, "convolution") ||
1263                                       CaselessEq<std::string>()(prevLayer->type, "fullyconnected") ||
1264                                       CaselessEq<std::string>()(prevLayer->type, "eltwise"))) {
1265                         if (!isNextFusionAllowed(prevLayer) && inputData->getPrecision() == Precision::I8) {
1266                             outData->setPrecision(Precision::I8);
1267                         } else {
1268                             inputData->setPrecision(Precision::U8);
1269                         }
1270                     }
1271                     // if there is a patter A0 -> Eltwise -> ReLU and Convolution -> Eltwise -> ReLU,
1272                     // need to mark data after conv as U8
1273                     if (prevLayer && CaselessEq<std::string>()(prevLayer->type, "eltwise")) {
1274                         // decising which input will be used for fusion conv-sum-relu
1275                         CNNLayer::Ptr input1 = prevLayer->insData[0].lock()->getCreatorLayer().lock();
1276                         CNNLayer::Ptr input2 = prevLayer->insData[1].lock()->getCreatorLayer().lock();
1277                         CNNLayer::Ptr convLayer = nullptr;
1278                         CNNLayer::Ptr sumLayer = nullptr;
1279
1280                         if (!CaselessEq<std::string>()(input1->type, "convolution")) {
1281                             sumLayer = input1;
1282                             convLayer = input2;
1283                         } else {
1284                             // it covers a case when both inputs are convolutions or when first input is not convolution
1285                             convLayer = input1;
1286                             sumLayer = input2;
1287                         }
1288                         convLayer->outData[0]->setPrecision(sumLayer->outData[0]->getPrecision());
1289                     }
1290                 }
1291             }
1292         } else if (CaselessEq<std::string>()(iter->type, "pooling")) {
1293             auto pool = dynamic_cast<PoolingLayer*>(iter.get());
1294             if (pool == nullptr) {
1295                 THROW_IE_EXCEPTION << "Int8 Normalizer error: cannot cast layer '" << iter->name << "' to pooling";
1296             }
1297
1298             if (pool->_type == PoolingLayer::MAX || (pool->_type == PoolingLayer::AVG && pool->outData.size() == 1)) {
1299                 auto prevLayer = iter->insData[0].lock()->getCreatorLayer().lock();
1300                 if (prevLayer && (prevLayer->precision == Precision::I8 || prevLayer->precision == Precision::U8)) {
1301                     iter->precision = Precision::I8;
1302                     iter->outData[0]->setPrecision(statHelper.hasNegativeOutput(iter->name) ? Precision::I8
1303                                                                                             : Precision::U8);
1304                 }
1305             }
1306         } else if (CaselessEq<std::string>()(iter->type, "concat")) {
1307             // we can do safe
1308             // casting to concat and take axis parameter
            // we can concatenate scales only if concat does concatenation by feature maps
1310             bool axisFeatureMaps = false;
1311             auto concatLayer = dynamic_cast<ConcatLayer*>(iter.get());
1312             if (concatLayer) {
1313                 if (concatLayer->_axis == 1 && concatLayer->insData.size() &&
1314                     concatLayer->insData[0].lock()->getTensorDesc().getDims().size() == 4) {
1315                     axisFeatureMaps = true;
1316                 }
1317             } else {
1318                 THROW_IE_EXCEPTION << "Int8 Normalizer error: cannot cast layer " << iter->name << " to concat";
1319             }
1320
1321             if (axisFeatureMaps) {
1322                 // verification of input data types
1323                 bool inputFP32 = false;
1324                 bool inputI8 = false;
1325                 bool inputU8 = false;
1326
1327                 for (auto inputData : iter->insData) {
1328                     auto data = inputData.lock();
1329                     if (data->getPrecision() == Precision::FP32) {
1330                         inputFP32 = true;
1331                     } else if (data->getPrecision() == Precision::I8) {
1332                         inputI8 = true;
1333                     } else if (data->getPrecision() == Precision::U8) {
1334                         inputU8 = true;
1335                     } else {
1336                         // Is it a case of input, i.e. passing I16 to concat?
1337                         // TODO(amalyshe) to handle inputs as a separate usecase
1338                         THROW_IE_EXCEPTION << "I8 normalizer: input data has unknown precision on the edge for concat: "
1339                                            << data->getName();
1340                     }
1341                 }
1342
1343                 if (inputFP32) {
1344                     for (auto i : iter->insData) {
1345                         if (i.lock()->getCreatorLayer().lock()->precision != Precision::FP32) {
1346                             returnTailToFP32(i.lock()->getCreatorLayer().lock());
1347                         }
1348                     }
1349                 } else {
1350                     iter->precision = Precision::I8;
1351
                    // we set output precision to U8 only if all inputs are U8; in any other case it will be I8
1353                     auto outputPrecision = (inputU8 && !inputI8) ? Precision::U8 : Precision::I8;
1354
1355                     // if we have mixed input for I8 and U8, we have to insert scale to edges having U8 to convert to I8
                    // Yes, it leads to losing some precision and might lead to some performance degradation
1357                     // until we have scale supporting s8/u8 input and s8/u8 output.
1358                     if (inputU8 && inputI8) {
1359                         // looking for all edges having U8
1360                         for (size_t d = 0; d < iter->insData.size(); d++) {
1361                             auto data = iter->insData[d].lock();
1362                             if (data->getPrecision() == Precision::U8) {
1363                                 const size_t c = static_cast<size_t>(data->getDims()[1]);
1364                                 std::vector<float> ssWValues(c, 1.0f);
1365                                 std::vector<float> ssSValues(c, 0.0f);
1366
1367                                 std::string layerName =
1368                                     data->getCreatorLayer().lock()->name + "_Concat_ScaleShift_U8I8_" + iter->name;
1369                                 CNNLayer::Ptr newLayer =
1370                                     createDWConvolutionForScale(layerName, c, ssWValues.data(), ssSValues.data());
1371                                 newLayer->precision = Precision::I8;
1372                                 AddLayerToCNNNetworkBeforeLayer(newLayer, iter, d);
1373
1374                                 // update statistic to pass quantization smoothly
1375                                 std::string inputLayerName =
1376                                     newLayer->insData[0].lock()->getCreatorLayer().lock()->name;
1377                                 statHelper.copyStatistics(inputLayerName, layerName);
1378                                 newLayer->outData[0]->setPrecision(Precision::I8);
1379                             }
1380                         }
1381                     }
1382
1383                     if (iter->outData.size() == 1) {
1384                         for (auto&& out : iter->outData) {
1385                             out->setPrecision(outputPrecision);
1386                         }
1387                     }
1388                 }
1389             }
1390         } else if (CaselessEq<std::string>()(iter->type, "eltwise")) {
1391             // we decide which of the layers will be in int-8 mode and initialize special scale which will be used
1392             // later in "conv-sum-relu" fuse. i8 execution of eltwise always assume this fusion
1393             if (canLayerBeI8(iter)) {
1394                 if (iter->insData.size() == 2) {
1395                     CNNLayer::Ptr input1 = iter->insData[0].lock()->getCreatorLayer().lock();
1396                     CNNLayer::Ptr input2 = iter->insData[1].lock()->getCreatorLayer().lock();
1397                     if ((CaselessEq<std::string>()(input1->type, "convolution") ||
1398                          CaselessEq<std::string>()(input2->type, "convolution")) &&
1399                         !CaselessEq<std::string>()(input1->type, "concat") &&
1400                         !CaselessEq<std::string>()(input2->type, "concat") && input1->precision != Precision::FP32 &&
1401                         input2->precision != Precision::FP32) {
1402                         // understand which layer will be used for sum
1403                         CNNLayer::Ptr sumLayer = nullptr;
1404                         CNNLayer::Ptr convLayer = nullptr;
1405
1406                         if (!CaselessEq<std::string>()(input1->type, "convolution")) {
1407                             sumLayer = input1;
1408                             convLayer = input2;
1409                         } else {
1410                             // it covers a case when both inputs are convolutions or when first input is not convolution
1411                             sumLayer = input2;
1412                             convLayer = input1;
1413                         }
1414
1415                         // if we find supported activation, mark it's output as I8 or U8 depending on statistics
1416                         if (iter->outData.size() == 1 && iter->outData[0]->getInputTo().size() == 1 &&
1417                             (CaselessEq<std::string>()(iter->outData[0]->getInputTo().begin()->second->type, "ReLU") ||
1418                              CNNNetworkInt8Normalizer::isReLULikeClamp(
1419                                  iter->outData[0]->getInputTo().begin()->second))) {
1420                             auto activation = iter->outData[0]->getInputTo().begin()->second;
1421                             activation->precision = Precision::I8;
1422                             if (!statHelper.hasNegativeOutput(statHelper.getLatestInFuse(convLayer)->name)) {
1423                                 activation->outData[0]->setPrecision(Precision::U8);
1424                                 iter->outData[0]->setPrecision(Precision::U8);
1425                             } else {
1426                                 activation->outData[0]->setPrecision(Precision::I8);
1427                                 iter->outData[0]->setPrecision(Precision::I8);
1428                             }
1429                         } else {
1430                             iter->outData[0]->setPrecision(Precision::I8);
1431                         }
1432
1433                         if (convLayer->outData[0]->getTensorDesc().getPrecision() == Precision::I8) {
1434                             // verify precision on input edges before and after eltwise fusion
                            // if we have an i8/u8 mismatch between the sum layer input and conv-sum-activation output,
1436                             // then in this case we have to add requantization to i8 on sum input edge
1437                             auto latestInFuse = statHelper.getLatestInFuse(convLayer);
1438                             if (latestInFuse->outData[0]->getTensorDesc().getPrecision() == Precision::I8) {
1439                                 if (input1 == sumLayer &&
1440                                     iter->insData[0].lock()->getTensorDesc().getPrecision() == Precision::U8) {
1441                                     sumLayer = addU8ToI8Conversion(iter->insData[0].lock(), iter, statHelper);
1442                                 } else if (input2 == sumLayer &&
1443                                            iter->insData[1].lock()->getTensorDesc().getPrecision() == Precision::U8) {
1444                                     sumLayer = addU8ToI8Conversion(iter->insData[0].lock(), iter, statHelper);
1445                                 }
1446                                 if (!sumLayer) {
1447                                     THROW_IE_EXCEPTION << "I8 normalizer had to add U8->I8 conversion before "
1448                                                        << iter->name << " but failed to do this";
1449                                 }
1450                             }
1451
1452                             // mark eltwise as a I8 executable, mark out data as I8
1453                             iter->precision = Precision::I8;
1454                             convLayer->outData[0]->setPrecision(sumLayer->outData[0]->getPrecision());
1455                             // calculate the only scale
1456                             Blob::Ptr sumLayerScales = statHelper.getOutputScale(statHelper.getLatestInFuse(sumLayer));
1457                             Blob::Ptr convLayerScales =
1458                                 statHelper.getOutputScale(statHelper.getLatestInFuse(convLayer));
1459                             float* sumScale = sumLayerScales->buffer().as<float*>();
1460                             float* convScale = convLayerScales->buffer().as<float*>();
1461                             for (size_t i = 0; i < sumLayerScales->size(); i++) {
1462                                 sumScale[i] /= convScale[i];
1463                             }
1464
1465                             iter->blobs["eltwise-sum-scale"] = sumLayerScales;
1466                         }
1467                     }
1468                 }
1469             } else {
                // if there are convolutions as inputs to this eltwise, we forcibly move them to FP32
1471                 for (auto i : iter->insData) {
1472                     auto type = i.lock()->getCreatorLayer().lock()->type;
1473                     if (CaselessEq<std::string>()(type, "convolution") ||
1474                         CaselessEq<std::string>()(type, "fullyconnected")) {
1475                         i.lock()->getCreatorLayer().lock()->precision = Precision::FP32;
1476                         i.lock()->setPrecision(Precision::FP32);
1477                     }
1478                 }
1479             }
1480         } else if (CaselessEq<std::string>()(iter->type, "resample")) {
1481             iter->precision = Precision::I8;
1482             iter->outData[0]->setPrecision(iter->insData[0].lock()->getPrecision());
1483         }
1484     }
1485
1486     // quantization of weights/biases
1487     sortedLayers = CNNNetSortTopologically(net);
1488     for (auto iter : sortedLayers) {
1489         if (iter->precision == Precision::I8 && (CaselessEq<std::string>()(iter->type, "convolution") ||
1490                                                  CaselessEq<std::string>()(iter->type, "fullyconnected"))) {
1491             QuantizeConvolutionOrFullyConnected(iter, statHelper);
1492         }
1493     }
1494
1495     // Returning of tails to FP32 mode if optimistic approach marked them as I8
1496     // no sense to do pooling in i8, we can return just after convolution
1497     for (auto iter : sortedLayers) {
1498         // TODO(amalyshe) here is a handling of case when iter provides data to the only one next layer
1499         // need to extend to cases when it provides data to many layers
1500         if (iter->precision == Precision::I8 && iter->outData.size() == 1) {
1501             if ((iter->outData[0]->getInputTo().size() == 1 &&
1502                  iter->outData[0]->getInputTo().begin()->second->precision == Precision::FP32) ||
1503                 iter->outData[0]->getInputTo().size() == 0) {
1504                 returnTailToFP32(iter);
1505             }
1506         }
1507     }
1508 }
1509
// Propagates de-quantization scale factors ("o-scale" blobs) down the graph.
// Precondition: DefinesExecutionPrecision has already assigned I8/U8 precisions
// and per-layer scale blobs. Three passes over the topologically sorted layers:
//   1) For an I8 Concat whose inputs all carry "i-concat-scale", build the
//      concat's own per-channel o-scale by concatenating the input scales.
//   2) For every layer holding an "o-scale", hand it to each consumer
//      (pooling/relu/eltwise/resample/concat keep int8; others count as FP32
//      consumers) and decide whether the scale stays on this layer.
//   3) Corner case: if an o-scale was propagated through a linear tail of
//      pooling/relu layers, drop it and let the producing convolution emit
//      de-normalized FP32 values instead.
void CNNNetworkInt8Normalizer::PropagateScaleFactors(CNNNetwork& net, const CNNStatisticHelper& statHelper) {
    std::vector<CNNLayerPtr> sortedLayers = CNNNetSortTopologically(net);

    // Moving o-scales down
    for (auto iter : sortedLayers) {
        if (iter->type == "Concat" && iter->precision == Precision::I8) {
            // Checking if all inputs are INT8
            bool all_inputs_are_int8 = true;
            for (int k = 0; k < iter->insData.size(); k++) {
                auto prevKLayer = iter->insData[k].lock()->getCreatorLayer().lock();
                // An input qualifies only if it is quantized AND already carries
                // a per-channel "i-concat-scale" blob to merge from.
                if ((prevKLayer->precision != Precision::I8 && prevKLayer->precision != Precision::U8) ||
                    prevKLayer->blobs.find("i-concat-scale") == prevKLayer->blobs.end()) {
                    all_inputs_are_int8 = false;
                    break;
                }
            }

            if (all_inputs_are_int8) {
                // Merging o-scales of the inputs to make one for the Concat
                // Creating the o-scale for the Concat by concatenating the input concats
                size_t outputChannels = iter->outData[0]->getTensorDesc().getDims()[1];

                std::shared_ptr<Data> oScaleData =
                    std::shared_ptr<Data>(new Data("o-scale", {Precision::FP32, {outputChannels}, Layout::C}));
                auto oScale = CreateBlobFromData(oScaleData);
                oScale->allocate();

                // Copy each input's scale values back-to-back; concat is over the
                // channel axis, so the merged vector must cover all output channels.
                float* oScaleMemory = static_cast<float*>(oScale->buffer());
                int cc = 0;
                for (int in = 0; in < iter->insData.size(); in++) {
                    auto prevOScale = iter->insData[in].lock()->getCreatorLayer().lock()->blobs["i-concat-scale"];
                    float* prevOScaleMemory = static_cast<float*>(prevOScale->buffer());

                    for (int c = 0; c < prevOScale->size(); c++) {
                        oScaleMemory[cc] = prevOScaleMemory[c];
                        cc++;
                    }
                }
                // Sanity check: merged scale length must match the channel count.
                if (cc != outputChannels)
                    THROW_IE_EXCEPTION << "Size of o-scale after " << iter->name
                                       << " isn't equal to the channels count";

                iter->precision = Precision::I8;
                iter->blobs["o-scale"] = oScale;
            }
        }

        // Pass 2: distribute this layer's o-scale to its consumers and count how
        // many of them consume int8 vs FP32 data.
        if (iter->blobs.find("o-scale") != iter->blobs.end()) {
            int int8Consumers = 0;
            int fp32Consumers = 0;
            // The algorithm below assumes a single output port.
            if (iter->outData.size() > 1) {
                THROW_IE_EXCEPTION << "normalization algorithm for int8 found layer having o-scale and multiple ports";
            }
            if (iter->outData.size() == 1) {
                for (auto l : iter->outData[0]->getInputTo()) {
                    if (l.second->precision == Precision::I8 || l.second->precision == Precision::U8) {
                        // Linear layers (pooling/relu/relu-like clamp) just forward the scale.
                        if (CaselessEq<std::string>()(l.second->type, "Pooling") ||
                            CaselessEq<std::string>()(l.second->type, "ReLU") ||
                            CNNNetworkInt8Normalizer::isReLULikeClamp(l.second)) {
                            l.second->blobs["o-scale"] = iter->blobs["o-scale"];
                            // debug scales. Need to compare with actual values in FP32 scoring
                            l.second->blobs["ext-scale"] = l.second->blobs["o-scale"];
                            int8Consumers++;
                        } else if (l.second->type == "Convolution") {
                            // The convolution absorbs the producer's scale; its own
                            // i-scale becomes redundant.
                            l.second->blobs.erase("i-scale");
                            int8Consumers++;
                        } else if (CaselessEq<std::string>()(l.second->type, "Eltwise")) {
                            if (statHelper.getLatestInFuse(iter) != iter) {
                                l.second->blobs["o-scale"] = iter->blobs["o-scale"];
                            }
                            int8Consumers++;
                        } else if ((l.second->precision == Precision::I8 || l.second->precision == Precision::U8) &&
                                   CaselessEq<std::string>()(l.second->type, "Resample")) {
                            // If resample has concat as an input layer it should inherit its
                            // output scale
                            if (l.second->insData.size() == 1) {
                                CNNLayerPtr creator = l.second->insData[0].lock()->getCreatorLayer().lock();
                                if (CaselessEq<std::string>()(creator->type, "Concat")) {
                                    l.second->blobs["o-scale"] = creator->blobs["o-scale"];
                                    l.second->blobs["i-concat-scale"] = l.second->blobs["o-scale"];
                                }
                            }

                            // No concat found, let's use statistics
                            if (l.second->blobs.find("o-scale") == l.second->blobs.end()) {
                                auto oScale = statHelper.getOutputScale(l.second);
                                l.second->blobs["o-scale"] = oScale;
                                l.second->blobs["i-concat-scale"] = l.second->blobs["o-scale"];
                            }
                            int8Consumers++;
                        } else if ((l.second->precision == Precision::I8) &&
                                   CaselessEq<std::string>()(l.second->type, "concat")) {
                            // if concat is i8, we can propagate oscale further to concat.
                            // The logic around o-scale assumes that if we have it in the layer after iteration
                            // in this loop it means that it must not be removed and we need to place
                            // scale. While for concat we return to one layer back and again need to analyze o-scale
                            // and it is not clear if we need to return o-scale or it was only for concat.
                            // Having all of this in mind, it's better to rename o-scale to i-concat-scale
                            iter->blobs["i-concat-scale"] = iter->blobs["o-scale"];
                            int8Consumers++;
                        } else {
                            fp32Consumers++;
                        }
                    } else if (CaselessEq<std::string>()(l.second->type, "priorbox") ||
                               CaselessEq<std::string>()(l.second->type, "priorboxclustered")) {
                        // PriorBox consumers are intentionally ignored: they read shapes,
                        // not data, so they count as neither int8 nor FP32 consumers.
                    } else {
                        // we are leaving o-scale still for adding of scale-shift before FP32 layer
                        fp32Consumers++;
                    }
                }

                // A network output (no consumers) needs de-normalized FP32 data too.
                if (iter->outData[0]->getInputTo().empty()) {
                    fp32Consumers++;
                }

                if (CaselessEq<std::string>()(iter->type, "Convolution") ||
                    CaselessEq<std::string>()(iter->type, "FullyConnected")) {
                    if (int8Consumers) {
                        // Keep the scale on the producer as "oi-scale" so the conv/FC
                        // can emit quantized output directly.
                        iter->blobs["oi-scale"] = iter->blobs["o-scale"];
                    } else {
                        iter->outData[0]->setPrecision(Precision::FP32);
                    }
                }
                // o-scale is only retained when some FP32 consumer still needs a
                // scale-shift inserted in front of it later.
                if (!fp32Consumers) {
                    iter->blobs.erase("o-scale");
                }
            }
        }
    }

    // fixing cornercases when o-scale was propagated through linear tail but it is more efficient to leave
    // conversion to de-normalized values in convolution
    for (auto iter : sortedLayers) {
        if (iter->blobs.find("o-scale") != iter->blobs.end()) {
            // go over out data. if all outputs are fp32, continue this optimization
            bool canOptimize = true;

            // current layer must not be convolution
            if (CaselessEq<std::string>()(iter->type, "convolution")) {
                canOptimize = false;
            }
            for (auto o : iter->outData) {
                for (auto ol : o->getInputTo()) {
                    if (ol.second->precision == Precision::I8) {
                        canOptimize = false;
                    }
                }
            }
            if (!canOptimize) {
                continue;
            }
            // trying to go up until convolution
            // Walk up a strictly linear chain (single input, single output, single
            // consumer) of Pooling/ReLU/relu-like-Clamp layers until the layer
            // holding the matching "oi-scale" (the convolution) is found.
            auto curLayer = iter;
            bool eliminateOScale = true;
            while (curLayer && curLayer->blobs.find("oi-scale") == curLayer->blobs.end() && eliminateOScale) {
                if (curLayer->insData.size() == 1 && curLayer->insData[0].lock()->getCreatorLayer().lock() &&
                    curLayer->insData[0].lock()->getCreatorLayer().lock()->outData.size() == 1 &&
                    curLayer->insData[0].lock()->getInputTo().size() == 1) {
                    curLayer = curLayer->insData[0].lock()->getCreatorLayer().lock();
                    if (!CaselessEq<std::string>()(curLayer->type, "Pooling") &&
                        !CaselessEq<std::string>()(curLayer->type, "ReLU") && !isReLULikeClamp(curLayer) &&
                        !CaselessEq<std::string>()(curLayer->type, "Convolution")) {
                        eliminateOScale = false;
                    }
                } else {
                    eliminateOScale = false;
                }
            }
            if (eliminateOScale && curLayer) {
                // Drop both ends of the scale pair and return the whole tail to FP32.
                for (auto o : iter->outData) {
                    o->setPrecision(Precision::FP32);
                }
                for (auto o : curLayer->outData) {
                    o->setPrecision(Precision::FP32);
                }

                curLayer->blobs.erase("oi-scale");
                iter->blobs.erase("o-scale");
                auto iLayer = iter;
                while (iLayer != curLayer) {
                    if (iLayer->type == "Pooling") {
                        iLayer->precision = Precision::FP32;
                    }
                    iLayer = iLayer->insData[0].lock()->getCreatorLayer().lock();
                }
            }
        }
    }
}
1699
1700 std::string getBlobDimention(const Blob::Ptr blob) {
1701     size_t idx = blob->getTensorDesc().getDims().size();
1702
1703     std::stringstream blobDimention;
1704     blobDimention << "[";
1705     for (auto& dim : blob->getTensorDesc().getDims()) {
1706         blobDimention << dim << ((--idx) != 0u ? ", " : "");
1707     }
1708     blobDimention << "]";
1709
1710     return blobDimention.str();
1711 }
1712
1713 void precisionColoring(const CNNLayerPtr layer, ordered_properties& printed_properties,
1714                        ordered_properties& node_properties) {
1715     // looking for the w-scale
1716     if (layer->blobs.find("w-scale") != layer->blobs.end()) {
1717         printed_properties.insert(
1718             printed_properties.begin(),
1719             std::pair<std::string, std::string>("w-scale", getBlobDimention(layer->blobs.find("w-scale")->second)));
1720     }
1721
1722     // looking for the oi-scale
1723     if (layer->blobs.find("oi-scale") != layer->blobs.end()) {
1724         printed_properties.insert(
1725             printed_properties.begin(),
1726             std::pair<std::string, std::string>("oi-scale", getBlobDimention(layer->blobs.find("oi-scale")->second)));
1727     }
1728
1729     // looking for the o-scale
1730     if (layer->blobs.find("o-scale") != layer->blobs.end()) {
1731         printed_properties.insert(
1732             printed_properties.begin(),
1733             std::pair<std::string, std::string>("o-scale", getBlobDimention(layer->blobs.find("o-scale")->second)));
1734     }
1735     // looking for the i-scale
1736     if (layer->blobs.find("i-scale") != layer->blobs.end()) {
1737         printed_properties.insert(
1738             printed_properties.begin(),
1739             std::pair<std::string, std::string>("i-scale", getBlobDimention(layer->blobs.find("i-scale")->second)));
1740     }
1741
1742     printed_properties.insert(
1743         printed_properties.begin(),
1744         std::pair<std::string, std::string>("Precision", layer->precision == Precision::FP32 ? "FP32" : "I8"));
1745
1746     if (layer->precision == Precision::FP32) {
1747         node_properties.emplace_back("fillcolor", "#5A5DF0");
1748     } else {
1749         node_properties.emplace_back("fillcolor", "#20F608");
1750     }
1751 }
1752
1753 void CNNNetworkInt8Normalizer::NormalizeNetwork(ICNNNetwork& network, ICNNNetworkStats& netStats) {
1754     CNNNetwork cnnn(ICNNNetwork::Ptr(&network, [](void*) {}));
1755
1756     int maxSign = 0x7F;
1757     int maxUnsign = 0xFF;
1758
1759     // Applying int8-conversion
1760     StatsMap statsMap = netStats.getNodesStats();
1761
1762     CNNStatisticHelper statHelper(cnnn, statsMap, maxSign, maxUnsign);
1763
1764     replaceScaleShiftByDWConvolution(cnnn);
1765
1766     DefinesExecutionPrecision(cnnn, statHelper);
1767     PropagateScaleFactors(cnnn, statHelper);
1768     ClampsToReLU(cnnn, statHelper);
1769     AddScaleShifts(cnnn, statHelper);
1770 #ifndef NDEBUG
1771     std::ofstream file("i8_normalized.dot");
1772     saveGraphToDot(cnnn, file, precisionColoring);
1773 #endif
1774 }