// Copyright (C) 2018-2020 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#include "cnn_network_int8_normalizer.hpp"

#include <data_stats.h>
#include <details/ie_cnn_network_tools.h>

#include <blob_factory.hpp>
#include <details/caseless.hpp>

#include "cnn_network_impl.hpp"
#include "cnn_network_stats_impl.hpp"
#include "ie_util_internal.hpp"

IE_SUPPRESS_DEPRECATED_START

using namespace InferenceEngine;
using namespace InferenceEngine::details;

using StatsMap = std::map<std::string, InferenceEngine::NetworkNodeStatsPtr>;
CNNStatisticHelper::CNNStatisticHelper(CNNNetwork& network,
                                       const std::map<std::string, NetworkNodeStatsPtr>& internalNodesStats,
                                       int maxSign, int maxUnsign) {
    internalNodesStats_ = internalNodesStats;
    maxUnsign_ = maxUnsign;
bool CNNStatisticHelper::canLayerBeQuantized(CNNLayer::Ptr layer) const {
    // verify that statistics exist for all inputs of the layer
    for (const auto i : layer->insData) {
        if (internalNodesStats_.find(i.lock()->getCreatorLayer().lock()->name) == internalNodesStats_.end()) {

    // verify that a statistic exists for the output of the layer
    if ((layer->outData.size() > 1) && (internalNodesStats_.find(layer->name) == internalNodesStats_.end())) {
void CNNStatisticHelper::copyStatistics(const std::string& srcName, const std::string& dstName) {
    internalNodesStats_[dstName] = internalNodesStats_[srcName];
bool CNNStatisticHelper::hasNegativeOutput(const std::string& layerName, int outputPort) const {
    // TODO(amalyshe) parameter outputPort is not used yet; per-port selection logic
    // should be implemented

    NetworkNodeStatsPtr layerStat = internalNodesStats_.at(layerName);
    for (auto v : layerStat->_minOutputs) {
InferenceEngine::Blob::Ptr CNNStatisticHelper::getInputScale(CNNLayer::Ptr layer) const {
    auto inDataPtr = layer->insData[0].lock();
    if (inDataPtr == nullptr)
    auto previousLayer = inDataPtr->getCreatorLayer().lock();
    std::string inputLayerName = previousLayer->name;

    // when the only layer before is an average pooling, we need to take the statistic
    // from the input of the avg pooling to compensate for the averaging
    // and to stay in int8 as long as we can
    if (previousLayer->type == "Pooling" &&
        (previousLayer->precision == Precision::I8 || previousLayer->precision == Precision::U8)) {
        // take the name of the input to the pooling
        auto prevInDataPtr = previousLayer->insData[0].lock();
        if (prevInDataPtr == nullptr)
        inputLayerName = prevInDataPtr->getCreatorLayer().lock()->name;

    size_t inputChannels = inDataPtr->getTensorDesc().getDims()[1];
    if (getStatistic(previousLayer)->_minOutputs.size() != inputChannels ||
        getStatistic(previousLayer)->_maxOutputs.size() != inputChannels) {
        THROW_IE_EXCEPTION << "min and max sizes should be equal to input channels count for " << previousLayer->name;

    // the current normalization algorithm can produce nodes with fp32 edges; this can happen only at
    // the initial quantization points of int8 chains. Adding a scale-shift currently assigns a certain
    // I8/U8 precision, but the calculation of scales happens before the scale-shifts are added.
    // To handle precisions that are not yet determined and to follow the quantization scheme defined
    // by the normalizer, we check for negative output here in some cases and then verify the exact
    // I8/U8 precision on the node to cover the fully determined cases
    int maxValue = hasNegativeOutput(previousLayer->name) ? maxSign_ : maxUnsign_;
    if (previousLayer->outData[0]->getPrecision() == Precision::U8) {
        maxValue = maxUnsign_;
    } else if (previousLayer->outData[0]->getPrecision() == Precision::I8) {

    return calculateScaleFactor(inputChannels, getStatistic(previousLayer), maxValue);
InferenceEngine::Blob::Ptr CNNStatisticHelper::getOutputScale(CNNLayer::Ptr layer) const {
    // TODO(amalyshe) for now we look at the precision on the data node
    size_t outputChannels = layer->outData[0]->getTensorDesc().getDims()[1];
    if (layer->outData.size() != 1) {
        THROW_IE_EXCEPTION << "Trying to get scales after a layer having multiple output ports";

    auto it = internalNodesStats_.find(layer->name);
    if (it == internalNodesStats_.end()) {
        return std::shared_ptr<Blob>();

    if (getStatistic(layer)->_minOutputs.size() != outputChannels ||
        getStatistic(layer)->_maxOutputs.size() != outputChannels) {
        THROW_IE_EXCEPTION << "min and max sizes should be equal to output channels count for " << layer->name;

    return calculateScaleFactor(outputChannels, getStatistic(layer),
                                layer->outData[0]->getPrecision() == Precision::I8 ? maxSign_ : maxUnsign_);
int CNNStatisticHelper::getMaxSignValue() const {
InferenceEngine::Blob::Ptr CNNStatisticHelper::calculateScaleFactor(size_t channels, NetworkNodeStatsPtr stats,
                                                                    int maxInt) const {
    if (stats->_minOutputs.size() != channels || stats->_maxOutputs.size() != channels) {
        THROW_IE_EXCEPTION << "min and max sizes should be equal to channels count";

    // Creating the i-scale blob
    std::shared_ptr<Data> iScaleData =
        std::shared_ptr<Data>(new Data("scale", {Precision::FP32, {channels}, Layout::C}));
    auto iScale = CreateBlobFromData(iScaleData);

    float* iScaleMemory = static_cast<float*>(iScale->buffer());

    for (size_t c = 0; c < channels; c++) {
        float maxc = fabs(stats->_maxOutputs[c]);
        maxc = fmax(maxc, fabs(stats->_minOutputs[c]));

        iScaleMemory[c] = maxc / static_cast<float>(maxInt);

        if (fabs(iScaleMemory[c]) < 1e-7) {
            iScaleMemory[c] = 1.0f;
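
    // Illustrative example (hypothetical statistics): for a channel with min = -2.5,
    // max = 1.75 and maxInt = 127 (the signed int8 maximum),
    //   maxc  = fmax(|1.75|, |-2.5|) = 2.5
    //   scale = 2.5 / 127 ≈ 0.0197,
    // so real values are recovered as int8_value * 0.0197, and near-zero channels fall
    // back to a neutral scale of 1.0 to avoid dividing by zero downstream.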
NetworkNodeStatsPtr CNNStatisticHelper::getStatistic(CNNLayer::Ptr layer) const {
    // TODO(amalyshe) all logic of traversing the network and getting the appropriate statistics
    // should be here; for now it is a stub
    auto it = internalNodesStats_.find(getLatestInFuse(layer)->name);
    if (it != internalNodesStats_.end()) {

    THROW_IE_EXCEPTION << "no stat for layer " << getLatestInFuse(layer)->name;
CNNLayer::Ptr CNNStatisticHelper::getLatestInFuse(CNNLayer::Ptr layer) const {
    if (layer->outData[0]->getInputTo().size() == 1 &&
        (CaselessEq<std::string>()(layer->outData[0]->getInputTo().begin()->second->type, "relu") ||
         CNNNetworkInt8Normalizer::isReLULikeClamp(layer->outData[0]->getInputTo().begin()->second))) {
        return layer->outData[0]->getInputTo().begin()->second;

    // Conv-Sum-ReLU fuse
    // We need to return the original layer if it will be used as a sum parameter, and the ReLU otherwise.
    // Iterate over the outputs of the pointed layer and look for the only eltwise
    CNNLayer::Ptr eltwise = nullptr;
    if (layer->outData.size() == 1) {
        for (auto it : layer->outData[0]->getInputTo()) {
            if (CaselessEq<std::string>()(it.second->type, "eltwise")) {
                    THROW_IE_EXCEPTION << "A pattern where one layer passes data to several eltwise layers is not "
                                          "supported in int8 quantization";

    // if the current layer is not a convolution, return it as the end of the fuse
    if (!CaselessEq<std::string>()(layer->type, "convolution")) {

    // look at the ports of the eltwise
    if (eltwise->insData[0].lock() != nullptr
        && eltwise->insData[1].lock() != nullptr
        && eltwise->insData[1].lock()->getCreatorLayer().lock() == layer
        && CaselessEq<std::string>()(eltwise->insData[0].lock()->getCreatorLayer().lock()->type, "convolution")
        && eltwise->insData[0].lock()->getInputTo().size() == 1) {
        // this is the case when two convolutions come to the eltwise; the second one will be selected
        // for the fuse, the first will be used as the sum operand

    // the given layer is a convolution and will be used for the fuse, but we need to verify if there is a ReLU after
    if (eltwise->outData[0]->getInputTo().size() == 1 &&
        (CaselessEq<std::string>()(eltwise->outData[0]->getInputTo().begin()->second->type, "relu") ||
         CNNNetworkInt8Normalizer::isReLULikeClamp(eltwise->outData[0]->getInputTo().begin()->second))) {
        return eltwise->outData[0]->getInputTo().begin()->second;
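
// Sketch of the fuse patterns resolved above:
//   Conv -> ReLU                        => the latest layer in the fuse is the ReLU
//   ConvA ---\
//             Eltwise(Sum) [-> ReLU]    => ConvB is fused with the eltwise (and the ReLU,
//   ConvB ---/                             if present); ConvA stays the sum operand
// Statistics are always read from the latest layer in a fuse, since that is where the
// fused primitive's output actually materializes.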
void CNNStatisticHelper::NormalizeStatistic() {
    // If statistics fall into the negative range while the min clamped value is 0,
    // we change the statistics here to be non-negative. This is not fully correct behaviour since
    // it can extend the range and affect accuracy, but this approach works quite well
    std::vector<CNNLayerPtr> sortedLayersRC = CNNNetSortTopologically(network_);
    for (auto l : sortedLayersRC) {
        if (CNNNetworkInt8Normalizer::isReLULikeClamp(l)) {
            if (l->outData.size() == 1) {
                size_t outputChannels = l->outData[0]->getTensorDesc().getDims()[1];
                auto oldStat = internalNodesStats_.find(l->name);
                if ((oldStat != internalNodesStats_.end()) && outputChannels > 1) {
                    for (size_t q = 0; q < oldStat->second->_minOutputs.size(); q++) {
                        oldStat->second->_minOutputs[q] = 0.f;
    std::vector<CNNLayerPtr> sortedLayers = CNNNetSortTopologically(network_);
    for (auto l : sortedLayers) {
        // if the layer's statistic already exists in newMap, ignore it
        if (newMap.find(l->name) != newMap.end()) {

        // verify whether the layer is a starter layer for statistic propagation
        bool isStarterLayer = false;

        // the case when we do not have converted statistics before the current layer:
        // go over all inputs and verify that statistics exist for all of them
        bool allInputsHaveStatistics = true;
        for (auto i : l->insData) {
            if (newMap.find(i.lock()->getCreatorLayer().lock()->name) == newMap.end()) {
                allInputsHaveStatistics = false;

        // if we do not have statistics, check who consumes this layer
        if (!allInputsHaveStatistics) {
            if (l->outData.size() == 1) {
                for (auto it : l->outData[0]->getInputTo()) {
                    if (CaselessEq<std::string>()(it.second->type, "scaleshift") ||
                        CaselessEq<std::string>()(it.second->type, "convolution") ||
                        CaselessEq<std::string>()(it.second->type, "fullyconnected")) {
                        isStarterLayer = true;

            isStarterLayer = true;

        if (CaselessEq<std::string>()(l->type, "scaleshift") || CaselessEq<std::string>()(l->type, "convolution") ||
            CaselessEq<std::string>()(l->type, "fullyconnected")) {
            isStarterLayer = true;

        if (!isStarterLayer) {

        // we do not yet support quantization of layers which split data
        if (l->outData.size() != 1) {
        InferenceEngine::NetworkNodeStatsPtr currentStat = std::make_shared<NetworkNodeStats>();

        bool perChannelScale = true;

        if (CaselessEq<std::string>()(l->type, "concat") && l->outData.size() == 1 &&
            l->outData[0]->getTensorDesc().getDims().size() == 4 && allInputsHaveStatistics) {
            size_t concatLayerIdx = 0;
            for (size_t k = 0; k < l->insData.size(); k++) {
                auto prevKLayer = l->insData[k].lock()->getCreatorLayer().lock();
                // looking for the statistic for prevKLayer
                auto kLayerStat = newMap.find(prevKLayer->name);
                if (kLayerStat != newMap.end()) {
                    for (size_t ikStat = 0; ikStat < kLayerStat->second->_maxOutputs.size();
                         ikStat++, concatLayerIdx++) {
                        currentStat->_maxOutputs.push_back(kLayerStat->second->_maxOutputs[ikStat]);
                        currentStat->_minOutputs.push_back(kLayerStat->second->_minOutputs[ikStat]);

                    THROW_IE_EXCEPTION << "We have incomplete statistics for predecessors of concat layer " << l->name;

        } else if (CaselessEq<std::string>()(l->type, "resample")) {
            if (l->insData.size() == 1) {
                CNNLayerPtr creator = l->insData[0].lock()->getCreatorLayer().lock();
                if (CaselessEq<std::string>()(creator->type, "concat")) {
                    auto concatStat = newMap[creator->name];
                    currentStat->_maxOutputs = concatStat->_maxOutputs;
                    currentStat->_minOutputs = concatStat->_minOutputs;
                    newMap[l->name] = currentStat;

                    auto itOld = internalNodesStats_.find(l->name);
                    if (itOld != internalNodesStats_.end()) {
                        currentStat->_maxOutputs = itOld->second->_maxOutputs;
                        currentStat->_minOutputs = itOld->second->_minOutputs;
                        newMap[l->name] = currentStat;

            // go over all children until we reach a convolution, scaleshift, eltwise or unknown layer
            // (Pooling and ReLU layers are passthrough) to understand the granularity of the scaling;
            // a concat is a layer which produces statistics and waterfalls them down
            std::vector<CNNLayer::Ptr> toAnalyze;
            for (auto it : l->outData[0]->getInputTo()) {
                toAnalyze.push_back(it.second);

            if (CaselessEq<std::string>()(l->type, "eltwise")) {
                perChannelScale = false;

            while (!toAnalyze.empty() && perChannelScale) {
                CNNLayer::Ptr tl = toAnalyze.back();
                toAnalyze.pop_back();
                if (CaselessEq<std::string>()(tl->type, "pooling") || CaselessEq<std::string>()(tl->type, "relu") ||
                    CNNNetworkInt8Normalizer::isReLULikeClamp(tl) || CaselessEq<std::string>()(tl->type, "concat")) {
                    if (tl->outData.size() == 1) {
                        for (auto it : tl->outData[0]->getInputTo()) {
                            toAnalyze.push_back(it.second);

                } else if (CaselessEq<std::string>()(tl->type, "convolution")) {
                    // verify the number of groups
                    ConvolutionLayer* pConv = dynamic_cast<ConvolutionLayer*>(tl.get());
                    if (pConv == nullptr) {
                        THROW_IE_EXCEPTION << "Layer " << tl->name << " is not an instance of the ConvolutionLayer class";

                    if (pConv->_group != pConv->_out_depth) {
                        perChannelScale = false;

                } else if (CaselessEq<std::string>()(tl->type, "eltwise")) {
                    perChannelScale = false;

            auto itOld = internalNodesStats_.find(getLatestInFuse(l)->name);
            if (itOld == internalNodesStats_.end()) {
                itOld = internalNodesStats_.find(l->name);

            if (itOld != internalNodesStats_.end()) {
                if (!perChannelScale) {
                    currentStat->_maxOutputs.resize(itOld->second->_maxOutputs.size());
                    if (!itOld->second->_maxOutputs.empty()) {
                        DataStats::GetDataAbsMax(&itOld->second->_maxOutputs[0], itOld->second->_maxOutputs.size(),
                        std::fill(currentStat->_maxOutputs.begin(), currentStat->_maxOutputs.end(), max);

                    currentStat->_minOutputs.resize(itOld->second->_minOutputs.size());
                    if (!itOld->second->_minOutputs.empty()) {
                        DataStats::GetDataMinMax(&itOld->second->_minOutputs[0], itOld->second->_minOutputs.size(), min,
                        std::fill(currentStat->_minOutputs.begin(), currentStat->_minOutputs.end(), min);

                    currentStat->_maxOutputs = itOld->second->_maxOutputs;
                    currentStat->_minOutputs = itOld->second->_minOutputs;
        if (l->outData.size() == 1) {
            size_t ch_indx = l->outData[0]->getTensorDesc().getDims().size() > 1 ? 1 : 0;
            size_t outputChannels = l->outData[0]->getTensorDesc().getDims()[ch_indx];
            auto oldStat = internalNodesStats_.find(l->name);
            if ((oldStat != internalNodesStats_.end()) && outputChannels > 1 &&
                oldStat->second->_minOutputs.size() == 1) {
                auto min = oldStat->second->_minOutputs[0];
                auto max = oldStat->second->_maxOutputs[0];

                currentStat->_minOutputs = std::vector<float>(outputChannels);
                currentStat->_maxOutputs = std::vector<float>(outputChannels);
                std::fill(currentStat->_minOutputs.begin(), currentStat->_minOutputs.end(), min);
                std::fill(currentStat->_maxOutputs.begin(), currentStat->_maxOutputs.end(), max);

        // propagate this statistic to all layers that have no scale in their primitives
        if (!currentStat->_maxOutputs.empty() && !currentStat->_minOutputs.empty()) {
            std::vector<CNNLayer::Ptr> toAnalyze;
            toAnalyze.push_back(l);
            while (!toAnalyze.empty()) {
                CNNLayer::Ptr tl = toAnalyze.back();
                toAnalyze.pop_back();
                newMap[tl->name] = currentStat;
                if (tl->outData.size() == 1) {
                    for (auto it : tl->outData[0]->getInputTo()) {
                        if (CaselessEq<std::string>()(it.second->type, "pooling") ||
                            CaselessEq<std::string>()(it.second->type, "relu") ||
                            CNNNetworkInt8Normalizer::isReLULikeClamp(it.second)) {
                            toAnalyze.push_back(it.second);

    internalNodesStats_ = newMap;
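
// Illustrative example of the per-tensor flattening above (hypothetical numbers): when a
// consumer forbids per-channel scales (a grouped convolution with _group != _out_depth, or
// an eltwise), channel statistics such as min = {-1, -3, -2}, max = {2, 6, 4} collapse to
// the single range min = -3, max = 6 replicated over all channels, so every channel ends
// up sharing one common scale factor.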
void CNNNetworkInt8Normalizer::AddLayerToCNNNetworkBeforeLayer(CNNLayer::Ptr newLayer, CNNLayer::Ptr successor,
                                                               size_t port) {
    // verify that the data exists
    if (newLayer && successor && successor->insData.size() > port) {
        DataPtr pData = successor->insData[port].lock();

        Data* edge2 = new Data(*pData.get());
        DataPtr newEdge(edge2);
        newEdge->getInputTo().clear();
        newEdge->getInputTo()[successor->name] = successor;
        newEdge->setName(newLayer->name);
        newEdge->getCreatorLayer() = newLayer;
        successor->insData[port] = newEdge;
        newLayer->outData.push_back(newEdge);

        newLayer->insData.push_back(pData);
        pData->getInputTo().erase(successor->name);
        pData->getInputTo()[newLayer->name] = newLayer;

        THROW_IE_EXCEPTION << "Invalid argument";
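
// Edge rewiring performed above, sketched (before -> after):
//   creator --pData--> successor          becomes
//   creator --pData--> newLayer --newEdge--> successor
// pData keeps its creator but now feeds newLayer; newEdge clones pData's tensor
// description so the successor sees an identically-shaped input.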
CNNLayer::Ptr CNNNetworkInt8Normalizer::addU8ToI8Conversion(DataPtr data, CNNLayer::Ptr successor,
                                                            CNNStatisticHelper& statHelper) {
    if (data->getPrecision() == Precision::U8 || data->getPrecision() == Precision::I8) {
        size_t c = static_cast<size_t>(data->getDims()[1]);

        std::vector<float> ssWValues;
        std::vector<float> ssSValues;
        for (size_t i = 0; i < c; i++) {
            ssWValues.push_back(1.0f);
            ssSValues.push_back(0.0f);

        std::string layerName = data->getCreatorLayer().lock()->name + "_Eltwise_ScaleShift_U8I8_" + successor->name;
        CNNLayer::Ptr newLayer = createDWConvolutionForScale(layerName, c, ssWValues.data(), ssSValues.data());
        newLayer->precision = Precision::I8;

        for (size_t i = 0; i < successor->insData.size(); i++) {
            if (successor->insData[i].lock() == data) {
                AddLayerToCNNNetworkBeforeLayer(newLayer, successor, i);

                // update the statistics to pass quantization smoothly
                if (newLayer->insData[0].lock() == nullptr)
                std::string inputLayerName = newLayer->insData[0].lock()->getCreatorLayer().lock()->name;
                statHelper.copyStatistics(inputLayerName, layerName);
                if (data->getPrecision() == Precision::U8) {
                    newLayer->outData[0]->setPrecision(Precision::I8);

                    newLayer->outData[0]->setPrecision(Precision::U8);
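
// Note: the inserted 1x1 convolution has identity weights (1.0) and zero biases, so it
// does not change the tensor values; it only flips the signedness tag of the edge
// (U8 -> I8 or I8 -> U8) so the downstream primitive sees the precision it expects.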
void CNNNetworkInt8Normalizer::AddLayerToCNNNetworkAfterData(DataPtr pData, CNNLayer::Ptr layer,
                                                             const std::string& nextLayerName) {
    // verify if data exists
    if (pData && layer && pData->getCreatorLayer().lock() &&
        pData->getInputTo().find(nextLayerName) != pData->getInputTo().end()) {
        CNNLayerPtr nextLayer = pData->getInputTo()[nextLayerName];

        DataPtr newEdgeAfterLayer(new Data(*pData.get()));
        newEdgeAfterLayer->setName(layer->name);
        newEdgeAfterLayer->getCreatorLayer() = layer;
        newEdgeAfterLayer->getInputTo().clear();
        newEdgeAfterLayer->getInputTo()[nextLayerName] = nextLayer;
        newEdgeAfterLayer->setPrecision(Precision::FP32);

        pData->getInputTo().erase(nextLayerName);
        pData->getInputTo()[layer->name] = layer;

        layer->insData.push_back(pData);
        layer->outData.push_back(newEdgeAfterLayer);

        for (size_t i = 0; i < nextLayer->insData.size(); i++) {
            if (nextLayer->insData[i].lock() == pData) {
                nextLayer->insData[i] = newEdgeAfterLayer;

        THROW_IE_EXCEPTION << "Invalid argument";
void CNNNetworkInt8Normalizer::fillInScaleShift(ScaleShiftLayer* scshLayer, size_t c, float* weightsN,
                                                float* weightsD) {
    SizeVector weightsSize = {c};
    TensorDesc weightsDesc(Precision::FP32, weightsSize, InferenceEngine::C);
    scshLayer->_weights = InferenceEngine::make_shared_blob<float>(weightsDesc);
    scshLayer->_weights->allocate();
    float* weightsData = scshLayer->_weights->buffer();
    for (size_t i = 0; i < c; i++) {
        if (weightsN == nullptr && weightsD != nullptr) {
            weightsData[i] = 1.0f / weightsD[i];
        } else if (weightsD == nullptr && weightsN != nullptr) {
            weightsData[i] = weightsN[i];
        } else if (weightsN != nullptr && weightsD != nullptr) {
            weightsData[i] = weightsN[i] / weightsD[i];

            weightsData[i] = 1.0f;

    SizeVector shiftsSize = {c};
    TensorDesc shiftsDesc(Precision::FP32, shiftsSize, InferenceEngine::C);
    scshLayer->_biases = InferenceEngine::make_shared_blob<float>(shiftsDesc);
    scshLayer->_biases->allocate();
    float* biasesData = scshLayer->_biases->buffer();
    for (size_t i = 0; i < c; i++) {
        biasesData[i] = 0.f;  // Setting to the constant 0
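
// Illustrative example (hypothetical scales): with weightsN = o-scale = {2.0, 4.0} and
// weightsD = i-scale = {0.5, 2.0}, the resulting ScaleShift computes
//   y[ch] = x[ch] * (weightsN[ch] / weightsD[ch]) + 0 = x[ch] * {4.0, 2.0},
// i.e. it re-quantizes data from one scale domain into the other without any shift.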
void CNNNetworkInt8Normalizer::AddScaleShiftBetween(CNNNetwork& net, const CNNLayerPtr layer1, const CNNLayerPtr layer2,
                                                    CNNStatisticHelper& statHelper) {
    if (CaselessEq<std::string>()(layer2->type, "priorbox") ||
        CaselessEq<std::string>()(layer2->type, "priorboxclustered")) {

    // Searching for the connection between the layers
    for (; l1_out_i < layer1->outData.size(); l1_out_i++) {
        if (layer1->outData[l1_out_i]->getInputTo().find(layer2->name) !=
            layer1->outData[l1_out_i]->getInputTo().end()) {

    if (l1_out_i == layer1->outData.size()) {
        THROW_IE_EXCEPTION << "Can't find layer " << layer2->name << " among layer " << layer1->name << " outputs";

    for (; l2_in_i < layer2->insData.size(); l2_in_i++) {
        if (layer2->insData[l2_in_i].lock() != nullptr
            && layer2->insData[l2_in_i].lock()->getCreatorLayer().lock() == layer1) {

    if (l2_in_i == layer2->insData.size()) {
        THROW_IE_EXCEPTION << "Can't find layer " << layer1->name << " among layer " << layer2->name << " inputs";

    DataPtr outData = layer1->outData[l1_out_i];

    Blob::Ptr oScaleBlob = nullptr;
    if (layer1->blobs.find("o-scale") != layer1->blobs.end()) {
        oScaleBlob = layer1->blobs["o-scale"];

    Blob::Ptr iScaleBlob = nullptr;
    if (layer2->blobs.find("i-scale") != layer2->blobs.end()) {
        iScaleBlob = layer2->blobs["i-scale"];

    if (iScaleBlob == nullptr && oScaleBlob == nullptr) {
        return;  // No multipliers found around this edge. We can't create a ScaleShift here;

    // Creating a ScaleShiftLayer
    float *iScaleBuffer = nullptr, *oScaleBuffer = nullptr;
    if (oScaleBlob != nullptr) {
        oScaleBuffer = static_cast<float*>(oScaleBlob->buffer());

    if (iScaleBlob != nullptr) {
        iScaleBuffer = static_cast<float*>(iScaleBlob->buffer());

    std::string layerName = layer1->name + "_" + prefix + "ScaleShift_" + layer2->name;
    LayerParams ssCnnLayerParams {layerName, "ScaleShift", Precision::FP32};
    CNNLayerPtr ssCnnLayer(new ScaleShiftLayer(ssCnnLayerParams));

    AddLayerToCNNNetworkAfterData(outData, ssCnnLayer, layer2->name);

    size_t c = static_cast<size_t>(outData->getDims()[1]);

    ScaleShiftLayer* scshLayer = dynamic_cast<ScaleShiftLayer*>(ssCnnLayer.get());
    if (scshLayer == nullptr) {
        THROW_IE_EXCEPTION << "Layer " << ssCnnLayer->name << " is not an instance of the ScaleShiftLayer class";

    fillInScaleShift(scshLayer, c, oScaleBuffer, iScaleBuffer);

    Precision odPrecision = Precision::FP32;
    if (layer2->precision == Precision::I8) {
        odPrecision = statHelper.hasNegativeOutput(layer1->name) ? Precision::I8 : Precision::U8;

    ssCnnLayer->outData[0]->setPrecision(odPrecision);
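
// Border example (illustrative): for Conv(I8, carries "o-scale") -> Softmax(FP32), the
// inserted ScaleShift receives only the o-scale (no i-scale on the consumer), so its
// weights equal the o-scale and it dequantizes the convolution output to FP32.
// For the opposite border (FP32 producer -> I8 consumer), only the i-scale is present
// and the weights become 1 / i-scale, quantizing the data instead.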
void CNNNetworkInt8Normalizer::AddScaleShifts(CNNNetwork& net, CNNStatisticHelper& statHelper) {
    std::vector<CNNLayerPtr> sortedLayers = CNNNetSortTopologically(net);

    std::vector<std::pair<CNNLayerPtr, CNNLayerPtr>> pairs;

    for (auto iter : sortedLayers) {
        for (size_t l1_out_i = 0; l1_out_i < iter->outData.size(); l1_out_i++) {
            for (auto nextIter : iter->outData[l1_out_i]->getInputTo()) {
                CNNLayer::Ptr next = nextIter.second;

                // Checking for an INT8 convolution or fully connected layer with FP32 output
                if ((CaselessEq<std::string>()(iter->type, "Convolution") ||
                     CaselessEq<std::string>()(iter->type, "FullyConnected")) &&
                    iter->precision == Precision::I8 && next->precision == Precision::FP32 &&
                    iter->outData[l1_out_i]->getPrecision() == Precision::FP32) {
                    // Do nothing here: iter provides data to fp32 layers only, and
                    // MKLDNNPlugin will generate an x8->f32 convolution

                } else if ((iter->precision != Precision::FP32 && next->precision == Precision::FP32) ||
                           (iter->precision == Precision::FP32 && next->precision != Precision::FP32)) {
                    pairs.push_back(std::pair<CNNLayerPtr, CNNLayerPtr>(iter, next));

    for (auto& pair : pairs) {
        AddScaleShiftBetween(net, pair.first, pair.second, statHelper);
void CNNNetworkInt8Normalizer::ClampsToReLU(CNNNetwork& net, CNNStatisticHelper& statHelper) {
    std::vector<CNNLayerPtr> sortedLayers = CNNNetSortTopologically(net);

    for (auto iter : sortedLayers) {
        if (isReLULikeClamp(iter) && (iter->precision == Precision::I8 || iter->precision == Precision::U8)) {
            std::string layerName = iter->name + "_ReLU";
            LayerParams ssCnnLayerParams {layerName, "ReLU", iter->precision};
            CNNLayerPtr ssCnnLayer(new ReLULayer(ssCnnLayerParams));

            auto previousLayer = iter->insData[0].lock()->getCreatorLayer().lock();
            ssCnnLayer->insData.push_back(iter->insData[0]);
            if (ssCnnLayer->insData[0].lock() == nullptr)
            ssCnnLayer->insData[0].lock()->getInputTo().erase(iter->name);
            ssCnnLayer->insData[0].lock()->getInputTo()[iter->name] = ssCnnLayer;

            ssCnnLayer->outData.push_back(iter->outData[0]);
            ssCnnLayer->outData[0]->getCreatorLayer() = ssCnnLayer;

            iter->insData.clear();
            iter->outData.clear();
void CNNNetworkInt8Normalizer::ScaleDataToInt(const float* srcData, size_t srcSize, Blob::Ptr int8blob,
                                              const std::vector<float>& scales) {
    if (scales.size() == 0 || srcSize % scales.size() != 0) {
        THROW_IE_EXCEPTION << "Wrong number of scale factors";

    size_t channels = scales.size();
    size_t channelSize = srcSize / channels;

    const float* data = srcData;
    if (int8blob->getTensorDesc().getPrecision() == Precision::I8) {
        int8_t* int8data = static_cast<int8_t*>(int8blob->buffer());
        int minValue = std::numeric_limits<int8_t>::min();
        int maxValue = std::numeric_limits<int8_t>::max();

        for (size_t ch = 0; ch < channels; ch++) {
            offset = channelSize * ch;

            for (size_t i = 0; i < channelSize; i++) {
                val = data[offset + i] * scales[ch];

                if (val > maxValue) {

                } else if (val < minValue) {

                int8data[offset + i] = round(val);

    } else if (int8blob->getTensorDesc().getPrecision() == Precision::I32) {
        int32_t* int32data = static_cast<int32_t*>(int8blob->buffer());
        int maxValue = std::numeric_limits<int32_t>::max();
        int minValue = std::numeric_limits<int32_t>::min();

        for (size_t ch = 0; ch < channels; ch++) {
            offset = channelSize * ch;

            for (size_t i = 0; i < channelSize; i++) {
                val = data[offset + i] * scales[ch];

                if (val > maxValue) {

                } else if (val < minValue) {

                int32data[offset + i] = round(val);
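
// Illustrative example (hypothetical values): with scales = {100.0} and
// srcData = {0.42f, -1.30f, 1.275f}, the I8 branch yields
//   0.42  * 100 =  42.0  -> rounds to 42
//  -1.30  * 100 = -130.0 -> clamped to the int8 minimum, -128
//   1.275 * 100 =  127.5 -> clamped to the int8 maximum, 127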
CNNLayer::Ptr CNNNetworkInt8Normalizer::createDWConvolutionForScale(const std::string& layerName, size_t channels,
                                                                    float* ssWValues, float* ssSValues) {
    // create a new Convolution layer
    params.name = layerName;
    params.precision = Precision::FP32;
    params.type = "Convolution";

    CNNLayerPtr lptr = std::make_shared<ConvolutionLayer>(params);
    auto* pConv = dynamic_cast<ConvolutionLayer*>(lptr.get());
    if (pConv == nullptr) {
        THROW_IE_EXCEPTION << "Layer " << lptr->name << " is not an instance of the ConvolutionLayer class";

    pConv->_kernel.insert(X_AXIS, 1);
    pConv->_kernel.insert(Y_AXIS, 1);
    pConv->_stride.insert(X_AXIS, 1);
    pConv->_stride.insert(Y_AXIS, 1);
    pConv->_padding.insert(X_AXIS, 0);
    pConv->_padding.insert(Y_AXIS, 0);
    pConv->_pads_end.insert(X_AXIS, 0);
    pConv->_pads_end.insert(Y_AXIS, 0);
    pConv->_dilation.insert(X_AXIS, 1);
    pConv->_dilation.insert(Y_AXIS, 1);

    pConv->_out_depth = channels;
    // mkl-dnn does not have an i8 depthwise convolution accepting signed i8 input;
    // when it becomes available, the lines below need to be uncommented

    // workaround: create new weights for a simple (grouped) convolution
    if (pConv->_out_depth % 16 == 0) {
        pConv->_group = pConv->_out_depth / 16;
        Blob::Ptr weights = nullptr;
        std::shared_ptr<Data> wData =
            std::shared_ptr<Data>(new Data("weights", {Precision::FP32, {pConv->_out_depth * 16}, Layout::C}));
        weights = CreateBlobFromData(wData);

        float* buffer = weights->buffer().as<float*>();
        size_t iDist = 0, iSrc = 0;
        for (size_t g = 0; g < pConv->_group; g++) {
            for (size_t k = 0; k < 16; k++) {
                for (size_t s = 0; s < 16; s++) {
                    buffer[iDist++] = (s == k) ? ssWValues[iSrc++] : 0.f;

        pConv->_weights = weights;
        pConv->blobs["weights"] = weights;

        Blob::Ptr weights = nullptr;
        std::shared_ptr<Data> wData = std::shared_ptr<Data>(
            new Data("weights", {Precision::FP32, {pConv->_out_depth * pConv->_out_depth}, Layout::C}));
        weights = CreateBlobFromData(wData);

        float* buffer = weights->buffer().as<float*>();
        for (size_t i = 0, idx = 0; i < pConv->_out_depth; i++) {
            for (size_t j = 0; j < pConv->_out_depth; j++) {
                buffer[idx] = ssWValues[i];

        pConv->_weights = weights;
        pConv->blobs["weights"] = weights;

    Blob::Ptr biasesBlob = nullptr;
    std::shared_ptr<Data> bData =
        std::shared_ptr<Data>(new Data("biases", {Precision::FP32, {pConv->_out_depth}, Layout::C}));
    biasesBlob = CreateBlobFromData(bData);
    biasesBlob->allocate();
    float* bufferBiases = biasesBlob->buffer().as<float*>();
    for (size_t c = 0; c < pConv->_out_depth; c++) {
        bufferBiases[c] = ssSValues[c];

    pConv->_biases = biasesBlob;

    pConv->blobs["weights"] = pConv->_weights;
    pConv->blobs["biases"] = pConv->_biases;
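
// Weight layout example for the grouped branch above (channels = 32 => _group = 2, illustrative):
// each group owns a 16x16 block, the blocks are stored back to back, and only the diagonal
// of each block carries a per-channel scale:
//   block g, row k: [0 ... 0, ssWValues[g * 16 + k], 0 ... 0]   (non-zero only at column k)
// so the 1x1 convolution acts as an independent per-channel multiplier plus bias.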
void CNNNetworkInt8Normalizer::replaceScaleShiftByDWConvolution(CNNNetwork& net) {
    std::vector<CNNLayerPtr> sortedLayers = CNNNetSortTopologically(net);
    for (auto layer : sortedLayers) {
        if (CaselessEq<std::string>()(layer->type, "scaleshift") &&
            layer->insData[0].lock()->getCreatorLayer().lock() &&
            !CaselessEq<std::string>()(layer->insData[0].lock()->getCreatorLayer().lock()->type, "input") &&
            layer->outData[0]->getInputTo().size() > 0) {
            const auto dims = layer->insData[0].lock()->getTensorDesc().getDims();
            // only four- or five-dimensional Convolution layers are supported
            if ((dims.size() == 4) || (dims.size() == 5)) {
                // verify that this layer does not pass data to a PriorBox; if it does, we do not substitute
                bool notToPriorBox = true;
                for (auto o : layer->outData[0]->getInputTo()) {
                    if (CaselessEq<std::string>()(o.second->type, "priorbox") ||
                        CaselessEq<std::string>()(o.second->type, "priorboxclustered")) {
                        notToPriorBox = false;

                ScaleShiftLayer* pSS = dynamic_cast<ScaleShiftLayer*>(layer.get());
                float* ssWValues = pSS->_weights->buffer().as<float*>();
                float* ssSValues = pSS->_biases->buffer().as<float*>();
                CNNLayer::Ptr newLayer = createDWConvolutionForScale(
                    layer->name, layer->outData[0]->getTensorDesc().getDims()[1], ssWValues, ssSValues);

                newLayer->outData = layer->outData;
                newLayer->outData[0]->getCreatorLayer() = newLayer;
                newLayer->insData = layer->insData;
                if (newLayer->insData[0].lock() == nullptr)
                newLayer->insData[0].lock()->getInputTo().erase(layer->name);
                newLayer->insData[0].lock()->getInputTo()[newLayer->name] = newLayer;
void CNNNetworkInt8Normalizer::QuantizeConvolutionOrFullyConnected(CNNLayer::Ptr target_layer,
                                                                   CNNStatisticHelper& statHelper) {
    size_t inputChannels = target_layer->insData[0].lock()->getTensorDesc().getDims()[1];
    size_t outputChannels = target_layer->outData[0]->getTensorDesc().getDims()[1];

    auto iScale = statHelper.getInputScale(target_layer);
    if (iScale == nullptr)
        THROW_IE_EXCEPTION << "Layer '" << target_layer->name << "' has an invalid scale";

    target_layer->blobs["i-scale"] = iScale;

    Blob::Ptr weights = nullptr;
    Blob::Ptr biases = nullptr;

    Blob::Ptr int8weights = nullptr;
    Blob::Ptr int32biases = nullptr;

    if (target_layer->blobs.find("weights") != target_layer->blobs.end()) {
        weights = target_layer->blobs["weights"];

        // Creating the int8 weights blob
        std::shared_ptr<Data> int8WeightsData =
            std::shared_ptr<Data>(new Data("weights", TensorDesc(Precision::I8, weights->getTensorDesc().getDims(),
                                                                 weights->getTensorDesc().getLayout())));
        int8weights = CreateBlobFromData(int8WeightsData);
        int8weights->allocate();
        target_layer->blobs["weights"] = int8weights;

    if (target_layer->blobs.find("biases") != target_layer->blobs.end()) {
        biases = target_layer->blobs["biases"];

        // Creating the int32 biases blob
        std::shared_ptr<Data> int32BiasesData =
            std::shared_ptr<Data>(new Data("biases", TensorDesc(Precision::I32, biases->getTensorDesc().getDims(),
                                                                biases->getTensorDesc().getLayout())));
        int32biases = CreateBlobFromData(int32BiasesData);
        int32biases->allocate();
        target_layer->blobs["biases"] = int32biases;
    std::vector<float> weightScalers;

    // Creating the w-scale blob
    const float* weight = static_cast<const float*>(weights->buffer());

    ConvolutionLayer* pConv1 = dynamic_cast<ConvolutionLayer*>(target_layer.get());

    if (pConv1 != nullptr && pConv1->_group == 0) {
        THROW_IE_EXCEPTION << "Convolution '" << target_layer->name << "' has a wrong group number == 0";

    if (pConv1 != nullptr && pConv1->_group != 1) {
        group = pConv1->_group;

    std::vector<float> newWeights;  // "new" weights are the weights multiplied by the i-scale

    size_t W_CO = outputChannels / group, W_CI = inputChannels / group,
           W_HW = weights->size() / W_CI / W_CO / group;

    float* iScaleMemory = static_cast<float*>(iScale->buffer());
    for (size_t g = 0; g < group; g++) {
        for (size_t co = 0; co < W_CO; co++) {
            for (size_t ci = 0; ci < W_CI; ci++) {
                size_t kernelBase = g * W_CO * W_CI * W_HW + co * W_CI * W_HW + ci * W_HW;
                for (size_t hw = 0; hw < W_HW; hw++) {
                    newWeights.push_back(weight[kernelBase + hw] * iScaleMemory[g * W_CI + ci]);

    if (newWeights.empty())
        THROW_IE_EXCEPTION << "Could not quantize layer '" << target_layer->name << "'. Invalid layer parameters.";
    size_t outChannelSize = weights->getTensorDesc().getDims().back() / W_CO / group;

    // Calculating the weights normalization scale factor (w-scale)
    std::set<double> individualsG;
    float* weight_convolution;
    bool bwquantized = false;
    double symQuant = 0.f;

    for (co = 0, weight_convolution = &newWeights[0]; co < outputChannels;
         co++, weight_convolution += outChannelSize) {
        for (size_t i = 0; i < outChannelSize && individualsG.size() < 256; i++) {
            individualsG.insert(static_cast<double>(weight_convolution[i]));
    // If we see fewer than 256 distinct quantums across all filters of the convolution, the weights may
    // already be int8-quantized. We can support symmetric quantization.
    // The conditions below verify whether the weights are quantized symmetrically around 0 and what the
    // min/max borders are. These parameters are required to reproduce exactly the same quantums the model
    // was trained with. The algorithm for restoring the min/max parameters makes a couple of assumptions
    // which might not hold in 100% of cases; we want to state them explicitly. We assume that
    // 1. Every convolution has its 1st quantum on either the positive or the negative side. See how we
    //    calculate symQuant
    // 2. If the quantization is not symmetric, there should be a quantum on one of the sides which
    //    demonstrates this
    if (individualsG.size() < 256) {
        // go over the weights and verify that they stay on quantum positions
        std::set<double> intervals;

        for (auto it = individualsG.begin(); it != individualsG.end(); it++) {
            intervals.insert(*it - prev);

        if (!intervals.empty()) {
            symQuant = *(intervals.begin());

        std::set<double> divs;
        if (symQuant != 0.) {
            for (auto it = individualsG.begin(); it != individualsG.end(); it++) {
                divs.insert((*it - prev) / symQuant);

        for (auto it3 = divs.begin(); it3 != divs.end(); it3++) {
            if (fabs(round(*it3) - *it3) > 0.001) {
                bwquantized = false;

        // we want to make sure that the quantization is symmetric; this way we look for the
        // value in the weights matching the quantum (positive or negative)

        // take the minimal and maximal values for the calculated symQuant and compare with the data in individualsG
        double minCalc = symQuant * -128.0;
        double maxCalc = symQuant * 128.0;
        for (auto it = individualsG.begin(); it != individualsG.end(); it++) {
            if (*it < minCalc || *it > maxCalc) {
                bwquantized = false;

    if (bwquantized && symQuant != 0.0) {
        float max = symQuant * 127.0f;
        for (co = 0, weight_convolution = &newWeights[0]; co < outputChannels;
             co++, weight_convolution += outChannelSize) {
            float scaler = static_cast<float>(statHelper.getMaxSignValue()) / max;
            weightScalers.push_back(scaler);

        for (co = 0, weight_convolution = &newWeights[0]; co < outputChannels;
             co++, weight_convolution += outChannelSize) {
            float max = FLT_MIN;
            DataStats::GetDataAbsMax(weight_convolution, outChannelSize, max);

            float scaler = static_cast<float>(statHelper.getMaxSignValue()) / max;
            weightScalers.push_back(scaler);
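
    // Illustrative detection example (hypothetical weights): if the distinct weight values are
    // {-0.06, -0.04, -0.02, 0.02, 0.04}, the smallest interval gives symQuant = 0.02, every
    // interval divides by it without remainder, and all values lie inside
    // [-128 * 0.02, 128 * 0.02], so the weights are recognized as already symmetrically
    // int8-quantized and one common scaler 127 / (0.02 * 127) = 1 / 0.02 = 50 is used for
    // every filter instead of per-filter abs-max scalers.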
    std::shared_ptr<Data> wScaleData =
        std::shared_ptr<Data>(new Data("w-scale", {Precision::FP32, {outputChannels}, Layout::C}));
    auto wScale = CreateBlobFromData(wScaleData);

    float* wScaleMemory = static_cast<float*>(wScale->buffer());

    for (size_t i = 0; i < outputChannels; i++) {
        wScaleMemory[i] = 1.0f / weightScalers[i];

    target_layer->blobs["w-scale"] = wScale;

    auto oScale = statHelper.getOutputScale(statHelper.getLatestInFuse(target_layer));

    // there might be no o-scale if we have no statistics after the convolution, which means
    // returning to float precision after the convolution
    target_layer->blobs["o-scale"] = oScale;

    // debug scales. Need to compare with the actual values in FP32 scoring
    target_layer->blobs["ext-scale"] = target_layer->blobs["o-scale"];

    // we do not have statistics here, so we cannot calculate the requantization scales and
    // the next layer will be calculated in fp32;
    // it is time to forcibly return the edge to fp32 as well
    target_layer->outData[0]->setPrecision(Precision::FP32);

    // Normalizing the weights
    ScaleDataToInt(&newWeights[0], weights->size(), int8weights, weightScalers);

    // Normalizing the biases
    const float* bias = static_cast<const float*>(biases->buffer());
    ScaleDataToInt(bias, biases->size(), int32biases, weightScalers);
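
// Scale bookkeeping summary for a quantized convolution (illustrative, single-channel numbers):
//   i-scale = 0.02 : dequantizes the int8 input (x_real ≈ x_int * 0.02); the weights are
//                    pre-multiplied by it so the integer convolution absorbs the factor;
//   w-scale = max|w'| / 127 : stored so the plugin can undo the weight scaling;
//   o-scale = 0.05 : maps the real output back to int8 (y_int ≈ y_real / 0.05).
// If no output statistics exist, the o-scale is absent and the edge stays FP32, as above.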
bool CNNNetworkInt8Normalizer::layerProducesFloat(const CNNLayer::Ptr layer) {
    // currently we support only the case of layers which have one output port
    if (layer->outData.size() > 1) {

    bool consumersFP32 = true;
    for (const auto dOut : layer->outData[0]->getInputTo()) {
        if (dOut.second->precision != Precision::FP32) {
            consumersFP32 = false;

    return consumersFP32;
void CNNNetworkInt8Normalizer::returnTailToFP32(const CNNLayer::Ptr layer) {
    std::set<CNNLayer::Ptr> layersToReturn;
    if (layerProducesFloat(layer)) {
        layersToReturn.insert(layer);

    while (!layersToReturn.empty()) {
        CNNLayer::Ptr layerA = *layersToReturn.begin();
        layersToReturn.erase(layerA);
        // 1. if it is a Pooling or Concat layer, we can return it to FP32 as well;
        // we need to return its out data
        if ((CaselessEq<std::string>()(layerA->type, "pooling") || CaselessEq<std::string>()(layerA->type, "concat")) &&
            layerA->outData.size() == 1) {
            layerA->precision = Precision::FP32;
            layerA->outData[0]->setPrecision(Precision::FP32);

        if ((CaselessEq<std::string>()(layerA->type, "convolution") ||
             CaselessEq<std::string>()(layerA->type, "fullyconnected") ||
             CaselessEq<std::string>()(layerA->type, "relu") || isReLULikeClamp(layerA)) &&
            layerA->outData.size() == 1) {
            layerA->outData[0]->setPrecision(Precision::FP32);
            if (CaselessEq<std::string>()(layerA->type, "relu")
                && layerA->insData[0].lock() != nullptr
                && canLayerBeI8(layerA->insData[0].lock()->getCreatorLayer().lock())) {
                layerA->precision = Precision::FP32;
                layerA->insData[0].lock()->getCreatorLayer().lock()->outData[0]->setPrecision(Precision::FP32);

        // adding parents for analysis
        if (!CaselessEq<std::string>()(layerA->type, "convolution") &&
            !CaselessEq<std::string>()(layerA->type, "fullyconnected")) {
            // for all parents, check whether they produce data only to FP32 layers
            for (auto i : layerA->insData) {
                DataPtr d = i.lock();
                if (d != nullptr && d->getCreatorLayer().lock()->precision != Precision::FP32 &&
                    (CaselessEq<std::string>()(layerA->type, "pooling") ||
                     CaselessEq<std::string>()(layerA->type, "relu") || isReLULikeClamp(layerA) ||
                     CaselessEq<std::string>()(layerA->type, "concat"))) {
                    if (layerProducesFloat(d->getCreatorLayer().lock())) {
                        layersToReturn.insert(d->getCreatorLayer().lock());
bool CNNNetworkInt8Normalizer::canLayerBeI8(const CNNLayer::Ptr& layer) {
    // fusion can happen only if the initial layer supplies data to exactly one layer;
    // if it sends data to several layers, it is safe to execute the initial layer in any precision
    if (layer->outData[0]->getInputTo().size() == 1) {
        std::string aType = layer->outData[0]->getInputTo().begin()->second->type;
        if (CaselessEq<std::string>()(aType, "relu")) {

        } else if (CaselessEq<std::string>()(aType, "clamp")) {
            if (!isReLULikeClamp(layer->outData[0]->getInputTo().begin()->second)) {

            static const InferenceEngine::details::caseless_set<std::string> nonSuportedActivations = {
                "elu", "clamp", "tanh", "logistic", "square", "abs",
                "sqrt", "linear", "bounded_elu", "sort_relu", "relu6"};
            return nonSuportedActivations.find(aType) == nonSuportedActivations.end();
bool CNNNetworkInt8Normalizer::isNextFusionAllowed(const CNNLayer::Ptr& layer) {
    // fusion can happen only if the initial layer supplies data to exactly one layer;
    // if it sends data to several layers, it is safe to execute the initial layer in any precision
    if (layer->outData[0]->getInputTo().size() == 1) {
        std::string aType = layer->outData[0]->getInputTo().begin()->second->type;
        if (CaselessEq<std::string>()(aType, "relu")) {
            ReLULayer* rL = dynamic_cast<ReLULayer*>(layer->outData[0]->getInputTo().begin()->second.get());
            if (rL == nullptr) {
                THROW_IE_EXCEPTION << "Layer " << layer->outData[0]->getInputTo().begin()->second->name
                                   << " is not an instance of the ReLULayer class";

            if (rL->negative_slope != 0.f) {

        } else if (CaselessEq<std::string>()(aType, "clamp")) {
            if (!isReLULikeClamp(layer->outData[0]->getInputTo().begin()->second)) {

            static const InferenceEngine::details::caseless_set<std::string> nonSuportedActivations = {
                "elu", "clamp", "tanh", "logistic", "square", "abs",
                "sqrt", "linear", "bounded_elu", "sort_relu", "relu6"};
            return nonSuportedActivations.find(aType) == nonSuportedActivations.end();

    if (CaselessEq<std::string>()(layer->type, "eltwise")) {
bool CNNNetworkInt8Normalizer::isReLULikeClamp(CNNLayer::Ptr layer) {
    if (CaselessEq<std::string>()(layer->type, "Clamp")) {
        ClampLayer* clamp = dynamic_cast<ClampLayer*>(layer.get());
        if (clamp == nullptr) {
            THROW_IE_EXCEPTION << "Int8 Normalizer error: cannot cast layer '" << layer->name << "' to Clamp";

        return clamp->min_value == 0;
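
// Example: Clamp(min_value = 0, max_value = 6), i.e. ReLU6, is treated as ReLU-like and can
// participate in int8 fusions, while Clamp(min_value = -1, max_value = 1) is not, since its
// negative lower bound breaks the unsigned-output assumption used above.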
void CNNNetworkInt8Normalizer::DefinesExecutionPrecision(CNNNetwork& net, CNNStatisticHelper& statHelper) {
    std::vector<CNNLayerPtr> sortedLayers = CNNNetSortTopologically(net);

    // Converting layers to Int8 and calculating the multipliers where needed
    for (auto iter : sortedLayers) {
        if (iter->params.find("quantization_level") != iter->params.end() &&
            (iter->params["quantization_level"] == "FP32" || iter->params["quantization_level"] == "FP16")) {

        // Legacy: FullyConnected should not be converted to Int8
        // unless it is explicitly marked for it.
        if (iter->params.find("quantization_level") == iter->params.end() &&
            CaselessEq<std::string>()(iter->type, "fullyconnected")) {

        if (!statHelper.canLayerBeQuantized(iter)) {

        if (CaselessEq<std::string>()(iter->type, "convolution") ||
            CaselessEq<std::string>()(iter->type, "fullyconnected")) {
            if (canLayerBeI8(iter)) {
                iter->precision = Precision::I8;
                // we will override I8 with U8 while analysing the Conv-ReLU and Conv-Sum-ReLU fusions
                iter->outData[0]->setPrecision(Precision::I8);

        } else if (CaselessEq<std::string>()(iter->type, "relu") || isReLULikeClamp(iter)) {
            ReLULayer* rL = dynamic_cast<ReLULayer*>(iter.get());
            DataPtr outData = iter->outData.size() ? iter->outData[0] : nullptr;
            auto inputData = iter->insData[0].lock();
            if (inputData && inputData->getCreatorLayer().lock()->precision != Precision::FP32 &&
                outData->getPrecision() == Precision::FP32) {
                iter->precision = Precision::I8;
                if (rL != nullptr && rL->negative_slope != 0.0f) {
                    outData->setPrecision(Precision::I8);

                    outData->setPrecision(Precision::U8);
                    // if a convolution is the predecessor, change its data to U8 as well
                    CNNLayer::Ptr prevLayer = inputData->getCreatorLayer().lock();
                    if (prevLayer && (CaselessEq<std::string>()(prevLayer->type, "convolution") ||
                                      CaselessEq<std::string>()(prevLayer->type, "fullyconnected") ||
                                      CaselessEq<std::string>()(prevLayer->type, "eltwise"))) {
                        if (!isNextFusionAllowed(prevLayer) && inputData->getPrecision() == Precision::I8) {
                            outData->setPrecision(Precision::I8);

                        inputData->setPrecision(Precision::U8);

                    // if there is a pattern A0 -> Eltwise -> ReLU or Convolution -> Eltwise -> ReLU,
                    // we need to mark the data after the conv as U8
                    if (prevLayer && CaselessEq<std::string>()(prevLayer->type, "eltwise")) {
                        // deciding which input will be used for the conv-sum-relu fusion
                        CNNLayer::Ptr input1 = prevLayer->insData[0].lock()->getCreatorLayer().lock();
                        CNNLayer::Ptr input2 = prevLayer->insData[1].lock()->getCreatorLayer().lock();
                        CNNLayer::Ptr convLayer = nullptr;
                        CNNLayer::Ptr sumLayer = nullptr;

                        if (!CaselessEq<std::string>()(input1->type, "convolution")) {

                        // this covers the case when both inputs are convolutions or when the first input
                        // is not a convolution

                        convLayer->outData[0]->setPrecision(sumLayer->outData[0]->getPrecision());
        } else if (CaselessEq<std::string>()(iter->type, "pooling")) {
            auto pool = dynamic_cast<PoolingLayer*>(iter.get());
            if (pool == nullptr) {
                THROW_IE_EXCEPTION << "Int8 Normalizer error: cannot cast layer '" << iter->name << "' to pooling";

            if (pool->_type == PoolingLayer::MAX || (pool->_type == PoolingLayer::AVG && pool->outData.size() == 1)) {
                auto prevLayer = iter->insData[0].lock()->getCreatorLayer().lock();
                if (prevLayer && (prevLayer->precision == Precision::I8 || prevLayer->precision == Precision::U8)) {
                    iter->precision = Precision::I8;
                    iter->outData[0]->setPrecision(statHelper.hasNegativeOutput(iter->name) ? Precision::I8
                                                                                            : Precision::U8);

        } else if (CaselessEq<std::string>()(iter->type, "concat")) {
            // cast to concat and take the axis parameter;
            // we can concatenate scales only if the concat concatenates over feature maps
            bool axisFeatureMaps = false;
            auto concatLayer = dynamic_cast<ConcatLayer*>(iter.get());

                if (concatLayer->_axis == 1 && concatLayer->insData.size() &&
                    concatLayer->insData[0].lock()->getTensorDesc().getDims().size() == 4) {
                    axisFeatureMaps = true;

                THROW_IE_EXCEPTION << "Int8 Normalizer error: cannot cast layer " << iter->name << " to concat";

            if (axisFeatureMaps) {
                // verification of the input data types
                bool inputFP32 = false;
                bool inputI8 = false;
                bool inputU8 = false;

                for (auto inputData : iter->insData) {
                    auto data = inputData.lock();
                    if (data->getPrecision() == Precision::FP32) {

                    } else if (data->getPrecision() == Precision::I8) {

                    } else if (data->getPrecision() == Precision::U8) {

                        // Is it a case of a network input, i.e. passing I16 to the concat?
                        // TODO(amalyshe) handle inputs as a separate use case
                        THROW_IE_EXCEPTION << "I8 normalizer: input data has unknown precision on the edge for concat: "

                for (auto i : iter->insData) {
                    if (i.lock()->getCreatorLayer().lock()->precision != Precision::FP32) {
                        returnTailToFP32(i.lock()->getCreatorLayer().lock());

                iter->precision = Precision::I8;
                // we set the output precision to U8 only if all inputs are U8; otherwise it will be I8
                auto outputPrecision = (inputU8 && !inputI8) ? Precision::U8 : Precision::I8;

                // if we have mixed I8 and U8 inputs, we have to insert a scale on the U8 edges to convert
                // them to I8. Yes, this loses some precision and might cause some performance degradation
                // until we have a scale supporting s8/u8 input and s8/u8 output.
                if (inputU8 && inputI8) {
                    // looking for all edges having U8
                    for (size_t d = 0; d < iter->insData.size(); d++) {
                        auto data = iter->insData[d].lock();
                        if (data->getPrecision() == Precision::U8) {
                            const size_t c = static_cast<size_t>(data->getDims()[1]);
                            std::vector<float> ssWValues(c, 1.0f);
                            std::vector<float> ssSValues(c, 0.0f);

                            std::string layerName =
                                data->getCreatorLayer().lock()->name + "_Concat_ScaleShift_U8I8_" + iter->name;
                            CNNLayer::Ptr newLayer =
                                createDWConvolutionForScale(layerName, c, ssWValues.data(), ssSValues.data());
                            newLayer->precision = Precision::I8;
                            AddLayerToCNNNetworkBeforeLayer(newLayer, iter, d);

                            // update the statistics to pass quantization smoothly
                            std::string inputLayerName =
                                newLayer->insData[0].lock()->getCreatorLayer().lock()->name;
                            statHelper.copyStatistics(inputLayerName, layerName);
                            newLayer->outData[0]->setPrecision(Precision::I8);

                if (iter->outData.size() == 1) {
                    for (auto&& out : iter->outData) {
                        out->setPrecision(outputPrecision);
        } else if (CaselessEq<std::string>()(iter->type, "eltwise")) {
            // we decide which of the layers will run in int8 mode and initialize the special scale which
            // will be used later in the "conv-sum-relu" fuse; i8 execution of an eltwise always assumes this fusion
            if (canLayerBeI8(iter)) {
                if (iter->insData.size() == 2) {
                    CNNLayer::Ptr input1 = iter->insData[0].lock()->getCreatorLayer().lock();
                    CNNLayer::Ptr input2 = iter->insData[1].lock()->getCreatorLayer().lock();
                    if ((CaselessEq<std::string>()(input1->type, "convolution") ||
                         CaselessEq<std::string>()(input2->type, "convolution")) &&
                        !CaselessEq<std::string>()(input1->type, "concat") &&
                        !CaselessEq<std::string>()(input2->type, "concat") && input1->precision != Precision::FP32 &&
                        input2->precision != Precision::FP32) {
                        // understand which layer will be used for the sum
                        CNNLayer::Ptr sumLayer = nullptr;
                        CNNLayer::Ptr convLayer = nullptr;

                        if (!CaselessEq<std::string>()(input1->type, "convolution")) {

                        // this covers the case when both inputs are convolutions or when the first input
                        // is not a convolution

                        // if we find a supported activation, mark its output as I8 or U8 depending on the statistics
                        if (iter->outData.size() == 1 && iter->outData[0]->getInputTo().size() == 1 &&
                            (CaselessEq<std::string>()(iter->outData[0]->getInputTo().begin()->second->type, "ReLU") ||
                             CNNNetworkInt8Normalizer::isReLULikeClamp(
                                 iter->outData[0]->getInputTo().begin()->second))) {
                            auto activation = iter->outData[0]->getInputTo().begin()->second;
                            activation->precision = Precision::I8;
                            if (!statHelper.hasNegativeOutput(statHelper.getLatestInFuse(convLayer)->name)) {
                                activation->outData[0]->setPrecision(Precision::U8);
                                iter->outData[0]->setPrecision(Precision::U8);

                                activation->outData[0]->setPrecision(Precision::I8);
                                iter->outData[0]->setPrecision(Precision::I8);

                            iter->outData[0]->setPrecision(Precision::I8);

                        if (convLayer->outData[0]->getTensorDesc().getPrecision() == Precision::I8) {
                            // verify the precision on the input edges before and after the eltwise fusion;
                            // if we have an i8/u8 mismatch between the sum layer input and the
                            // conv-sum-activation output, we have to add a requantization to i8 on the
                            // sum input edge
                            auto latestInFuse = statHelper.getLatestInFuse(convLayer);
                            if (latestInFuse->outData[0]->getTensorDesc().getPrecision() == Precision::I8) {
                                if (input1 == sumLayer &&
                                    iter->insData[0].lock()->getTensorDesc().getPrecision() == Precision::U8) {
                                    sumLayer = addU8ToI8Conversion(iter->insData[0].lock(), iter, statHelper);
                                } else if (input2 == sumLayer &&
                                           iter->insData[1].lock()->getTensorDesc().getPrecision() == Precision::U8) {
                                    sumLayer = addU8ToI8Conversion(iter->insData[1].lock(), iter, statHelper);

                                    THROW_IE_EXCEPTION << "I8 normalizer had to add a U8->I8 conversion before "
                                                       << iter->name << " but failed to do so";

                        // mark the eltwise as I8 executable, mark its out data as I8
                        iter->precision = Precision::I8;
                        convLayer->outData[0]->setPrecision(sumLayer->outData[0]->getPrecision());
                        // calculate the only scale
                        Blob::Ptr sumLayerScales = statHelper.getOutputScale(statHelper.getLatestInFuse(sumLayer));
                        Blob::Ptr convLayerScales =
                            statHelper.getOutputScale(statHelper.getLatestInFuse(convLayer));
                        float* sumScale = sumLayerScales->buffer().as<float*>();
                        float* convScale = convLayerScales->buffer().as<float*>();
                        for (size_t i = 0; i < sumLayerScales->size(); i++) {
                            sumScale[i] /= convScale[i];

                        iter->blobs["eltwise-sum-scale"] = sumLayerScales;

            // if there are convolutions as inputs to this eltwise, we forcibly move them to FP32
            for (auto i : iter->insData) {
                auto type = i.lock()->getCreatorLayer().lock()->type;
                if (CaselessEq<std::string>()(type, "convolution") ||
                    CaselessEq<std::string>()(type, "fullyconnected")) {
                    i.lock()->getCreatorLayer().lock()->precision = Precision::FP32;
                    i.lock()->setPrecision(Precision::FP32);

        } else if (CaselessEq<std::string>()(iter->type, "resample")) {
            iter->precision = Precision::I8;
            iter->outData[0]->setPrecision(iter->insData[0].lock()->getPrecision());
    // quantization of weights/biases
    sortedLayers = CNNNetSortTopologically(net);
    for (auto iter : sortedLayers) {
        if (iter->precision == Precision::I8 && (CaselessEq<std::string>()(iter->type, "convolution") ||
                                                 CaselessEq<std::string>()(iter->type, "fullyconnected"))) {
            QuantizeConvolutionOrFullyConnected(iter, statHelper);

    // Returning tails to FP32 mode if the optimistic approach marked them as I8.
    // There is no sense in doing pooling in i8; we can return to FP32 right after the convolution
    for (auto iter : sortedLayers) {
        // TODO(amalyshe) this handles the case when iter provides data to only one next layer;
        // it needs to be extended to cases when it provides data to many layers
        if (iter->precision == Precision::I8 && iter->outData.size() == 1) {
            if ((iter->outData[0]->getInputTo().size() == 1 &&
                 iter->outData[0]->getInputTo().begin()->second->precision == Precision::FP32) ||
                iter->outData[0]->getInputTo().size() == 0) {
                returnTailToFP32(iter);
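
// Illustrative outcome of this pass on a small chain (hypothetical network):
//   Conv(I8) -> ReLU(I8, output U8) -> Pooling(I8, output U8) -> Softmax(FP32)
// The convolution is marked I8, the ReLU flips its edge to U8 (non-negative range), the
// pooling passes the precision through, and the FP32 tail is unwound by returnTailToFP32.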
void CNNNetworkInt8Normalizer::PropagateScaleFactors(CNNNetwork& net, const CNNStatisticHelper& statHelper) {
    std::vector<CNNLayerPtr> sortedLayers = CNNNetSortTopologically(net);

    // Moving o-scales down
    for (auto iter : sortedLayers) {
        if (iter->type == "Concat" && iter->precision == Precision::I8) {
            // Checking that all inputs are INT8
            bool all_inputs_are_int8 = true;
            for (size_t k = 0; k < iter->insData.size(); k++) {
                auto prevKLayer = iter->insData[k].lock()->getCreatorLayer().lock();
                if ((prevKLayer->precision != Precision::I8 && prevKLayer->precision != Precision::U8) ||
                    prevKLayer->blobs.find("i-concat-scale") == prevKLayer->blobs.end()) {
                    all_inputs_are_int8 = false;

            if (all_inputs_are_int8) {
                // Merging the o-scales of the inputs to make one for the Concat:
                // creating the o-scale for the Concat by concatenating the input o-scales
                size_t outputChannels = iter->outData[0]->getTensorDesc().getDims()[1];

                std::shared_ptr<Data> oScaleData =
                    std::shared_ptr<Data>(new Data("o-scale", {Precision::FP32, {outputChannels}, Layout::C}));
                auto oScale = CreateBlobFromData(oScaleData);

                float* oScaleMemory = static_cast<float*>(oScale->buffer());

                for (size_t in = 0; in < iter->insData.size(); in++) {
                    auto prevOScale = iter->insData[in].lock()->getCreatorLayer().lock()->blobs["i-concat-scale"];
                    float* prevOScaleMemory = static_cast<float*>(prevOScale->buffer());

                    for (size_t c = 0; c < prevOScale->size(); c++) {
                        oScaleMemory[cc] = prevOScaleMemory[c];

                if (cc != outputChannels)
                    THROW_IE_EXCEPTION << "Size of the o-scale after " << iter->name
                                       << " isn't equal to the channels count";

                iter->precision = Precision::I8;
                iter->blobs["o-scale"] = oScale;
1557 if (iter->blobs.find("o-scale") != iter->blobs.end()) {
            int int8Consumers = 0;
            int fp32Consumers = 0;
            if (iter->outData.size() > 1) {
                THROW_IE_EXCEPTION << "int8 normalization algorithm found a layer with an o-scale and multiple output ports";
            }
            if (iter->outData.size() == 1) {
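                // Propagate the o-scale to every consumer: layers that keep the tensor in
                // int8 (Pooling/ReLU/ReLU-like Clamp, Convolution, Eltwise, Resample and
                // int8 Concat) count as int8 consumers; PriorBox consumers are skipped;
                // anything else will need an FP32 de-quantization, so it counts as an FP32 consumer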
                for (auto l : iter->outData[0]->getInputTo()) {
                    if (l.second->precision == Precision::I8 || l.second->precision == Precision::U8) {
                        if (CaselessEq<std::string>()(l.second->type, "Pooling") ||
                            CaselessEq<std::string>()(l.second->type, "ReLU") ||
                            CNNNetworkInt8Normalizer::isReLULikeClamp(l.second)) {
                            l.second->blobs["o-scale"] = iter->blobs["o-scale"];
                            // debug scales. Need to compare with actual values in FP32 scoring
                            l.second->blobs["ext-scale"] = l.second->blobs["o-scale"];
                            int8Consumers++;
                        } else if (l.second->type == "Convolution") {
                            l.second->blobs.erase("i-scale");
                            int8Consumers++;
                        } else if (CaselessEq<std::string>()(l.second->type, "Eltwise")) {
                            if (statHelper.getLatestInFuse(iter) != iter) {
                                l.second->blobs["o-scale"] = iter->blobs["o-scale"];
                            }
                            int8Consumers++;
                        } else if ((l.second->precision == Precision::I8 || l.second->precision == Precision::U8) &&
                                   CaselessEq<std::string>()(l.second->type, "Resample")) {
                            // if the Resample has a Concat as its input layer, it should
                            // inherit that Concat's o-scale
                            if (l.second->insData.size() == 1) {
                                CNNLayerPtr creator = l.second->insData[0].lock()->getCreatorLayer().lock();
                                if (CaselessEq<std::string>()(creator->type, "Concat")) {
                                    l.second->blobs["o-scale"] = creator->blobs["o-scale"];
                                    l.second->blobs["i-concat-scale"] = l.second->blobs["o-scale"];
                                }
                            }

                            // no Concat found, let's use the statistics
                            if (l.second->blobs.find("o-scale") == l.second->blobs.end()) {
                                auto oScale = statHelper.getOutputScale(l.second);
                                l.second->blobs["o-scale"] = oScale;
                                l.second->blobs["i-concat-scale"] = l.second->blobs["o-scale"];
                            }
                            int8Consumers++;
                        } else if ((l.second->precision == Precision::I8) &&
                                   CaselessEq<std::string>()(l.second->type, "concat")) {
                            // If the concat is i8, we can propagate the o-scale further to it.
                            // The logic around o-scale assumes that if a layer still has an
                            // o-scale after this loop, the o-scale must not be removed and a
                            // scale-shift has to be placed there. For concat, however, we would
                            // come back to the layer one step earlier, analyze its o-scale
                            // again, and it would not be clear whether the o-scale has to stay
                            // or existed only for the concat. With all of this in mind, it is
                            // better to rename o-scale to i-concat-scale.
                            iter->blobs["i-concat-scale"] = iter->blobs["o-scale"];
                            int8Consumers++;
                        }
                    } else if (CaselessEq<std::string>()(l.second->type, "priorbox") ||
                               CaselessEq<std::string>()(l.second->type, "priorboxclustered")) {
                    } else {
                        // we are leaving the o-scale in place for adding a scale-shift before the FP32 layer
                        fp32Consumers++;
                    }
                }
            }

            if (iter->outData[0]->getInputTo().empty()) {
                fp32Consumers++;
            }

            if (CaselessEq<std::string>()(iter->type, "Convolution") ||
                CaselessEq<std::string>()(iter->type, "FullyConnected")) {
                if (int8Consumers) {
                    iter->blobs["oi-scale"] = iter->blobs["o-scale"];
                } else {
                    iter->outData[0]->setPrecision(Precision::FP32);
                }
                if (!fp32Consumers) {
                    iter->blobs.erase("o-scale");
                }
            }
        }
    }
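
    // In short: an "oi-scale" tells the convolution to re-quantize its output for int8
    // consumers, while an o-scale that survives this pass later materializes as a
    // de-quantizing scale-shift in front of FP32 consumers (the later AddScaleShifts pass);
    // a layer feeding both kinds of consumers keeps both blobs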

    // Fixing corner cases where the o-scale was propagated through a linear tail, but it
    // is more efficient to leave the conversion to de-normalized values in the convolution
    for (auto iter : sortedLayers) {
        if (iter->blobs.find("o-scale") != iter->blobs.end()) {
            // go over out data: if all outputs are FP32, continue this optimization
            bool canOptimize = true;

            // the current layer must not be a convolution
            if (CaselessEq<std::string>()(iter->type, "convolution")) {
                canOptimize = false;
            }
            for (auto o : iter->outData) {
                for (auto ol : o->getInputTo()) {
                    if (ol.second->precision == Precision::I8) {
                        canOptimize = false;
                    }
                }
            }
            if (!canOptimize) {
                continue;
            }

            // trying to go up until a convolution is met
            auto curLayer = iter;
            bool eliminateOScale = true;
            while (curLayer && curLayer->blobs.find("oi-scale") == curLayer->blobs.end() && eliminateOScale) {
                if (curLayer->insData.size() == 1 && curLayer->insData[0].lock()->getCreatorLayer().lock() &&
                    curLayer->insData[0].lock()->getCreatorLayer().lock()->outData.size() == 1 &&
                    curLayer->insData[0].lock()->getInputTo().size() == 1) {
                    curLayer = curLayer->insData[0].lock()->getCreatorLayer().lock();
                    if (!CaselessEq<std::string>()(curLayer->type, "Pooling") &&
                        !CaselessEq<std::string>()(curLayer->type, "ReLU") && !isReLULikeClamp(curLayer) &&
                        !CaselessEq<std::string>()(curLayer->type, "Convolution")) {
                        eliminateOScale = false;
                    }
                } else {
                    eliminateOScale = false;
                }
            }

            if (eliminateOScale && curLayer) {
                for (auto o : iter->outData) {
                    o->setPrecision(Precision::FP32);
                }
                for (auto o : curLayer->outData) {
                    o->setPrecision(Precision::FP32);
                }

                curLayer->blobs.erase("oi-scale");
                iter->blobs.erase("o-scale");
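
                // Example of the corner case (hypothetical chain): Convolution(oi-scale) ->
                // Pooling(I8) -> <FP32 output>. Instead of pooling in int8 and de-normalizing
                // at the very end, both scales are dropped so the convolution emits FP32
                // directly, and the pooling layers of the tail are returned to FP32 below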

                CNNLayerPtr iLayer = iter;
                while (iLayer != curLayer) {
                    if (iLayer->type == "Pooling") {
                        iLayer->precision = Precision::FP32;
                    }
                    iLayer = iLayer->insData[0].lock()->getCreatorLayer().lock();
                }
            }
        }
    }
}

std::string getBlobDimention(const Blob::Ptr blob) {
    size_t idx = blob->getTensorDesc().getDims().size();

    std::stringstream blobDimention;
    blobDimention << "[";
    for (auto& dim : blob->getTensorDesc().getDims()) {
        blobDimention << dim << ((--idx) != 0u ? ", " : "");
    }
    blobDimention << "]";

    return blobDimention.str();
}
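
// e.g. for a blob with dims {1, 64, 56, 56} getBlobDimention returns "[1, 64, 56, 56]";
// it is used below only to annotate the scale blobs in the dot dump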

void precisionColoring(const CNNLayerPtr layer, ordered_properties& printed_properties,
                       ordered_properties& node_properties) {
    // looking for the w-scale
    if (layer->blobs.find("w-scale") != layer->blobs.end()) {
        printed_properties.insert(
            printed_properties.begin(),
            std::pair<std::string, std::string>("w-scale", getBlobDimention(layer->blobs.find("w-scale")->second)));
    }

    // looking for the oi-scale
    if (layer->blobs.find("oi-scale") != layer->blobs.end()) {
        printed_properties.insert(
            printed_properties.begin(),
            std::pair<std::string, std::string>("oi-scale", getBlobDimention(layer->blobs.find("oi-scale")->second)));
    }

    // looking for the o-scale
    if (layer->blobs.find("o-scale") != layer->blobs.end()) {
        printed_properties.insert(
            printed_properties.begin(),
            std::pair<std::string, std::string>("o-scale", getBlobDimention(layer->blobs.find("o-scale")->second)));
    }

    // looking for the i-scale
    if (layer->blobs.find("i-scale") != layer->blobs.end()) {
        printed_properties.insert(
            printed_properties.begin(),
            std::pair<std::string, std::string>("i-scale", getBlobDimention(layer->blobs.find("i-scale")->second)));
    }

    printed_properties.insert(
        printed_properties.begin(),
        std::pair<std::string, std::string>("Precision", layer->precision == Precision::FP32 ? "FP32" : "I8"));
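
    // color the node by execution precision in the dot dump; reading the hex values,
    // #5A5DF0 is a blue tone (FP32) and #20F608 a green tone (I8)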
    if (layer->precision == Precision::FP32) {
        node_properties.emplace_back("fillcolor", "#5A5DF0");
    } else {
        node_properties.emplace_back("fillcolor", "#20F608");
    }
}

void CNNNetworkInt8Normalizer::NormalizeNetwork(ICNNNetwork& network, ICNNNetworkStats& netStats) {
    CNNNetwork cnnn(ICNNNetwork::Ptr(&network, [](void*) {}));

    int maxSign = 0x7F;
    int maxUnsign = 0xFF;
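
    // 0x7F = 127 is the largest positive value of a signed 8-bit integer and 0xFF = 255
    // of an unsigned one; these are the maxSign/maxUnsign bounds consumed by CNNStatisticHelper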

    // Applying int8-conversion
    StatsMap statsMap = netStats.getNodesStats();

    CNNStatisticHelper statHelper(cnnn, statsMap, maxSign, maxUnsign);

    replaceScaleShiftByDWConvolution(cnnn);

    DefinesExecutionPrecision(cnnn, statHelper);
    PropagateScaleFactors(cnnn, statHelper);
    ClampsToReLU(cnnn, statHelper);
    AddScaleShifts(cnnn, statHelper);
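
    // note the pass order: execution precisions are decided first, scale factors are then
    // propagated along the chosen precisions, Clamps are rewritten as ReLUs, and the explicit
    // scale-shifts are materialized last; the dot dump below records the final assignment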

    std::ofstream file("i8_normalized.dot");
    saveGraphToDot(cnnn, file, precisionColoring);
}