// [LPT] integration: issue #42391 & issue #43001 (#3201)
// inference-engine/src/low_precision_transformations/src/layer_transformation.cpp
1 // Copyright (C) 2020 Intel Corporation
2 // SPDX-License-Identifier: Apache-2.0
3 //
4
5 #include <low_precision/layer_transformation.hpp>
6 #include <low_precision/network_helper.hpp>
7
8
9 #include <algorithm>
10 #include <cmath>
11 #include <limits>
12 #include <map>
13 #include <memory>
14 #include <string>
15 #include <utility>
16 #include <unordered_set>
17 #include <vector>
18 #include <queue>
19
20 namespace ngraph {
21 namespace pass {
22 namespace low_precision {
23
// Suffix appended to an original node's friendly name when a transformed
// replacement takes over that name (see updateOutput below).
const char LayerTransformation::originalLayerPostfix[] = "_original";

// Copies the user-supplied transformation parameters; the two manager pointers
// start as nullptr and are injected later via setParamsManager /
// setLayerTransformationsManager.
LayerTransformation::LayerTransformation(const Params& params) :
    updatePrecisions(params.updatePrecisions),
    quantizedTensorAlignmentOnActivations(params.quantizedTensorAlignmentOnActivations),
    quantizedTensorAlignmentOnWeights(params.quantizedTensorAlignmentOnWeights),
    supportAsymmetricQuantization(params.supportAsymmetricQuantization),
    precisionsOnActivations(params.precisionsOnActivations),
    precisionsOnWeights(params.precisionsOnWeights),
    layerTransformationsManager(nullptr),
    paramsManager(nullptr),
    // relative interval-asymmetry tolerance before a zero point is required
    // (used in getPrecisionDetails)
    quantizationIntervalAsymmetryThreshold(0.002f),
    // interval boundaries with |value| below this are treated as zero
    zeroThreshold(1.e-6f),
    minQuantizationLevels(2ul) {}
38
39 void LayerTransformation::setParamsManager(IParamsManager* paramsManager) noexcept {
40     this->paramsManager = paramsManager;
41 }
42
43 void LayerTransformation::setLayerTransformationsManager(ILayerTransformationsManager* layerTransformationsManager) noexcept {
44     this->layerTransformationsManager = layerTransformationsManager;
45 }
46
47 void LayerTransformation::setUpdatePrecisions(const bool updatePrecisions) {
48     this->updatePrecisions = updatePrecisions;
49 }
50
51 void LayerTransformation::setQuantizedTensorAlignmentOnActivations(
52     const QuantizedTensorAlignment quantizedTensorAlignmentOnActivations) {
53     this->quantizedTensorAlignmentOnActivations = quantizedTensorAlignmentOnActivations;
54 }
55
56 void LayerTransformation::setQuantizedTensorAlignmentOnWeights(
57     const QuantizedTensorAlignment quantizedTensorAlignmentOnWeights) {
58     this->quantizedTensorAlignmentOnWeights = quantizedTensorAlignmentOnWeights;
59 }
60
61 const std::vector<element::Type>& LayerTransformation::getPrecisionsOnActivations() const {
62     return precisionsOnActivations;
63 }
64
65 const std::vector<element::Type>& LayerTransformation::getPrecisionsOnWeights() const {
66     return precisionsOnWeights;
67 }
68
69 bool LayerTransformation::canBeTransformed(const TransformationContext& context, std::shared_ptr<Node> layer) const {
70     if (!isQuantized(layer)) {
71         return false;
72     }
73
74     for (const auto& output : layer->outputs()) {
75         const size_t size = output.get_shape().size();
76         if ((size < 2ul) || (size > 5ul)) {
77             return false;
78         }
79     }
80
81     const auto dequantization = NetworkHelper::getDequantization(layer);
82     if (!dequantization.empty()) {
83         auto perChannelQuantization = [](const Shape dataShape, Shape constShape) {
84             if ((dataShape.size() - constShape.size()) == 1ul) {
85                 constShape.insert(constShape.begin(), 1ul);
86             }
87
88             if ((constShape.size() >= 2ul) && (constShape[0] != 1ul)) {
89                 return false;
90             }
91
92             for (size_t i = 2; i < constShape.size(); ++i) {
93                 if (constShape[i] != 1ul) {
94                     return false;
95                 }
96             }
97             return true;
98         };
99
100         if ((dequantization.subtract != nullptr) && (!perChannelQuantization(
101             dequantization.subtract->output(0).get_shape(),
102             dequantization.subtract->input(1).get_shape()))) {
103             return false;
104         }
105
106         if ((dequantization.multiply != nullptr) && (!perChannelQuantization(
107             dequantization.multiply->output(0).get_shape(),
108             dequantization.multiply->input(1).get_shape()))) {
109             return false;
110         }
111     }
112
113     return true;
114 }
115
116 bool LayerTransformation::canSubtractBeHandled(const std::shared_ptr<Node>& op, const size_t parentIndex) const {
117     return canSubtractBeHandled(op, NetworkHelper::getDequantization(op, parentIndex));
118 }
119
120 bool LayerTransformation::canSubtractBeHandled(const std::shared_ptr<Node>& op, const FakeQuantizeDequantization& dequantization) const {
121     if (dequantization.empty() || (dequantization.subtract == nullptr)) {
122         return true;
123     }
124
125     if (!supportAsymmetricQuantization) {
126         return false;
127     }
128
129     if (!updatePrecisions) {
130         return true;
131     }
132
133     const element::Type operationType = dequantization.convert == nullptr ?
134         dequantization.subtract->input(0).get_element_type() :
135         dequantization.convert->input(0).get_element_type();
136
137     if ((operationType != element::i8) && (operationType != element::u8)) {
138         return false;
139     }
140
141     return true;
142 }
143
#ifdef LPT_PRINT_DEQUANTIZATION_INFO
// Renders at most the first nine dequantization values as "{v0,v1,...}".
std::stringstream toStream(const std::vector<float>& dequantizationValues) {
    std::stringstream ss;
    const size_t printedCount = std::min<size_t>(dequantizationValues.size(), 9ul);
    ss << "{";
    for (size_t i = 0ul; i < printedCount; ++i) {
        if (i != 0ul) {
            ss << ",";
        }
        ss << dequantizationValues[i];
    }
    ss << "}";
    return ss;
}

// Prints the FakeQuantize layer kind (weights/activations), name and
// quantization details to stdout.
void LayerTransformation::printDequantizationInfo(const std::shared_ptr<Node>& layer) {
    const QuantizationDetails quantizationDetails = QuantizationDetails::getDetails(as_type_ptr<opset1::FakeQuantize>(layer));
    std::cout
        << layer->get_type_name()
        << (NetworkHelper::onWeights(layer) ? " on weights " : " on activations ")
        << layer->get_friendly_name() << ":" << std::endl
        << "   details  : " << quantizationDetails << std::endl;
}

// Prints the selected data precision to stdout.
void LayerTransformation::printDequantizationInfo(const DataPrecision& dataPrecision) {
    std::cout << "   precision: " << dataPrecision << std::endl;
}

// Prints dequantization scales and shifts (truncated to nine values each).
void LayerTransformation::printDequantizationValues(
    const std::vector<float>& dequantizationScales,
    const std::vector<float>& dequantizationShifts) {
    std::cout
        << "   scales   : " << toStream(dequantizationScales).str() << std::endl
        << "   shifts   : " << toStream(dequantizationShifts).str() << std::endl;
}
#endif
176
177 void LayerTransformation::setQuantizationIntervalAsymmetryThreshold(const float value) {
178     this->quantizationIntervalAsymmetryThreshold = value;
179 }
180
181 void LayerTransformation::setZeroThreshold(const float value) {
182     this->zeroThreshold = value;
183 }
184
185 void LayerTransformation::setMinQuantizationLevels(const size_t levels) {
186     this->minQuantizationLevels = levels;
187 }
188
// Deduces a target precision (i8 / u8 / undefined) from the FakeQuantize output
// intervals and reports whether a zero point (asymmetric offset) is required.
LayerTransformation::PrecisionDetails LayerTransformation::getPrecisionDetails(const QuantizationDetails& quantizationDetails) const {
    // Expected low/high ratio of an ideal signed 8-bit interval: -128 / 127.
    const float asymmetricIntervalSideRatio256 = -128.f / 127.f;
    bool hasNegative = false;
    // Both flags start true; each interval can veto one of them. A precision is
    // chosen only when exactly one of the two survives all intervals.
    bool signedPrecision = true;
    bool unsignedPrecision = true;

    bool hasZeroPoint = false;
    for (size_t i = 0; i < quantizationDetails.outputLowValues.size(); ++i) {
        // The interval crosses zero when its boundaries have different signs.
        const bool signedInterval = std::signbit(quantizationDetails.outputLowValues[i]) != std::signbit(quantizationDetails.outputHighValues[i]);
        const bool boundaryValuesAreNotZero =
            (std::fabs(quantizationDetails.outputLowValues[i]) >= zeroThreshold) &&
            (std::fabs(quantizationDetails.outputHighValues[i]) >= zeroThreshold);
        if (signedInterval && boundaryValuesAreNotZero) {
            // signed
            unsignedPrecision = false;
            hasNegative = true;

            // A zero point is required when the interval's low/high ratio
            // deviates from the ideal symmetric ratio by more than the
            // configured relative threshold.
            const float expectedRatio = quantizationDetails.levels == 256 ? asymmetricIntervalSideRatio256 : -1.f;
            const float actualRatio = quantizationDetails.outputLowValues[i] / quantizationDetails.outputHighValues[i];
            const float actual = std::fabs((actualRatio - expectedRatio) / std::min(actualRatio, expectedRatio));
            if (actual > quantizationIntervalAsymmetryThreshold) {
                hasZeroPoint = true;
            }
#ifdef LPT_PRINT_DEQUANTIZATION_INFO
            if (hasZeroPoint) {
                std::cout << "   actual: " << actual << ", threshold: " << quantizationIntervalAsymmetryThreshold << std::endl;
                std::cout << "   hasZeroPoint: " << (hasZeroPoint ? "True" : "False") << std::endl;
            }
#endif
        } else {
            // unsigned
            signedPrecision = false;
            if (boundaryValuesAreNotZero) {
                // A one-sided interval not starting at zero needs an offset.
                // (boundaryValuesAreNotZero is true inside this branch, so this
                // effectively sets hasZeroPoint = true.)
                hasZeroPoint = boundaryValuesAreNotZero;
            }

#ifdef LPT_PRINT_DEQUANTIZATION_INFO
            if (hasZeroPoint) {
                const float actual = quantizationDetails.outputLowValues[i] > 0.f ?
                    quantizationDetails.outputLowValues[i] :
                    quantizationDetails.outputHighValues[i];
                std::cout << "   actual: " << actual << ", threshold: 0.0" << std::endl;
                std::cout << "   hasZeroPoint: " << (hasZeroPoint ? "True" : "False") << std::endl;
            }
#endif
        }
    }

    // Only a zero-point-free, unambiguous outcome yields a concrete precision.
    if (!hasZeroPoint) {
        if (signedPrecision && (!unsignedPrecision)) {
            return LayerTransformation::PrecisionDetails(element::i8, hasNegative, hasZeroPoint);
        }

        if ((!signedPrecision) && unsignedPrecision) {
            return LayerTransformation::PrecisionDetails(element::u8, hasNegative, hasZeroPoint);
        }
    }

    return LayerTransformation::PrecisionDetails(element::undefined, hasNegative, hasZeroPoint);
}
249
// Default implementation: every layer is considered quantized. Concrete
// transformations override this to restrict applicability.
bool LayerTransformation::isQuantized(std::shared_ptr<Node> layer) const noexcept {
    return true;
}
253
254 DataPrecision LayerTransformation::getDataPrecision(
255         std::shared_ptr<Node> layer,
256         const QuantizationDetails& quantizationDetails,
257         const bool onWeights) const {
258 #ifdef LPT_PRINT_DEQUANTIZATION_INFO
259     printDequantizationInfo(layer);
260 #endif
261     std::vector<element::Type> precisions = onWeights ? precisionsOnWeights : precisionsOnActivations;
262     PrecisionDetails precisionDetailsAtOutputIntervals = getPrecisionDetails(quantizationDetails);
263     {
264         if (precisionDetailsAtOutputIntervals.precision != element::undefined) {
265             if (!onWeights) {
266                 fillAvailablePrecisions(layer, precisions);
267             }
268
269             // if supportedPrecisions is empty then use the first available, not supported layer will be in original precision
270             if (!precisions.empty()) {
271                 const auto foundIt = std::find(precisions.begin(), precisions.end(), precisionDetailsAtOutputIntervals.precision);
272                 const element::Type resultPrecision = foundIt != precisions.end() ?
273                                                   precisionDetailsAtOutputIntervals.precision :
274                                                   *precisions.begin();
275
276                 const DataPrecision dataPrecision(
277                         resultPrecision,
278                         DataPrecision::getMinValue(resultPrecision, quantizationDetails.levels),
279                         DataPrecision::getMaxValue(resultPrecision, quantizationDetails.levels),
280                         foundIt != precisions.end() ? precisionDetailsAtOutputIntervals.hasZeroPoint : true);
281
282 #ifdef LPT_PRINT_DEQUANTIZATION_INFO
283                 printDequantizationInfo(dataPrecision);
284 #endif
285                 return dataPrecision;
286             }
287         }
288     }
289
290     const DataPrecision dataPrecision = precisions.empty() ?
291                                         DataPrecision(element::undefined, 0.f, 0.f, false) :
292                                         DataPrecision(
293                                                 *precisions.begin(),
294                                                 DataPrecision::getMinValue(*precisions.begin(), quantizationDetails.levels),
295                                                 DataPrecision::getMaxValue(*precisions.begin(), quantizationDetails.levels),
296                                                 true);
297 #ifdef LPT_PRINT_DEQUANTIZATION_INFO
298     printDequantizationInfo(dataPrecision);
299 #endif
300     return dataPrecision;
301 }
302
303 void LayerTransformation::fillAvailablePrecisions(std::shared_ptr<Node> layer, std::vector<element::Type>& availablePrecisions) const {
304     if (availablePrecisions.empty()) {
305         return;
306     }
307
308     const std::vector<std::shared_ptr<Node>> children = NetworkHelper::consumers(layer);
309     for (auto child : children) {
310         if (child->get_type_info().is_castable(opset1::FakeQuantize::get_type_info_static())) {
311             // FakeQuantize layer updates precision
312             continue;
313         }
314
315         if (!layerTransformationsManager->isQuantized(child)) {
316             // low precision chain is interrupted here: next operation supported precisions are ignored
317             continue;
318         }
319
320         const std::vector<element::Type> childPrecisionsOnActivations = paramsManager->getPrecisionsOnActivations(*child);
321         if (childPrecisionsOnActivations.size() == 0ul) {
322             continue;
323         }
324
325         for (size_t index = 0ul; index < availablePrecisions.size();) {
326             const element::Type availablePrecision = availablePrecisions[index];
327             if (!std::any_of(
328                     childPrecisionsOnActivations.begin(),
329                     childPrecisionsOnActivations.end(),
330                     [&](const element::Type precision) { return availablePrecision == precision; })) {
331                 availablePrecisions.erase(availablePrecisions.begin() + index);
332             } else {
333                 ++index;
334             }
335         }
336
337         if (!layerTransformationsManager->isPrecisionPreserved(child)) {
338             continue;
339         }
340
341         fillAvailablePrecisions(child, availablePrecisions);
342         if (availablePrecisions.empty()) {
343             return;
344         }
345     }
346 }
347
348 std::vector<std::shared_ptr<Node>> LayerTransformation::getChildrenRecursivelyExceptPrecisionPreserved(
349         const std::shared_ptr<Node>& op) const noexcept {
350     std::queue<std::shared_ptr<Node>> notHandledChildren;
351
352     for (const auto& output : op->outputs()) {
353         for (const auto& input : output.get_target_inputs()) {
354             std::shared_ptr<Node> child = input.get_node()->shared_from_this();
355             notHandledChildren.emplace(child);
356         }
357     }
358
359     std::vector<std::shared_ptr<Node>> resultChildren;
360
361     while (!notHandledChildren.empty()) {
362         const std::shared_ptr<ngraph::Node> operation = notHandledChildren.front();
363         notHandledChildren.pop();
364
365         if (!this->layerTransformationsManager->isPrecisionPreserved(operation)) {
366             resultChildren.push_back(operation);
367             continue;
368         }
369
370         for (const auto& output : operation->outputs()) {
371             for (const auto& input : output.get_target_inputs()) {
372                 std::shared_ptr<Node> child = input.get_node()->shared_from_this();
373                 notHandledChildren.emplace(child);
374             }
375         }
376     }
377
378     return resultChildren;
379 }
380
381
// If the dequantization subgraph above `node` is shared with other consumers,
// clones it (Convert / Subtract / Multiply and their constants) so that `node`
// gets its own standalone copy, then re-creates `node` on top of that copy.
// Returns the replacement node, or `node` unchanged when nothing is shared.
std::shared_ptr<ngraph::Node> LayerTransformation::separateInStandaloneBranch(std::shared_ptr<ngraph::Node> node) const {
    FakeQuantizeDequantization dequantization = NetworkHelper::getDequantization(node);
    if (dequantization.isShared()) {
        // `parent` tracks the bottom of the cloned chain as it is rebuilt.
        Output<Node> parent = dequantization.data;
        if (dequantization.convert != nullptr) {
            parent = dequantization.convert->clone_with_new_inputs({ parent });
            parent.get_node_shared_ptr()->set_friendly_name(parent.get_node_shared_ptr()->get_name() + "_new");
        }

        if (dequantization.subtract != nullptr) {
            // clone_with_new_inputs({}) on input 1 duplicates the Subtract
            // constant so the copy does not alias the shared original.
            parent = dequantization.subtract->clone_with_new_inputs({
                parent,
                dequantization.subtract->get_input_node_shared_ptr(1)->clone_with_new_inputs({}) });
            parent.get_node_shared_ptr()->set_friendly_name(parent.get_node_shared_ptr()->get_name() + "_new");
        }

        if (dequantization.multiply != nullptr) {
            parent = dequantization.multiply->clone_with_new_inputs({
                parent,
                dequantization.multiply->get_input_node_shared_ptr(1)->clone_with_new_inputs({}) });
            parent.get_node_shared_ptr()->set_friendly_name(parent.get_node_shared_ptr()->get_name() + "_new");
        }

        std::vector<Output<Node>> inputs = NetworkHelper::getInputs(node);
        // NOTE(review): assumes a shared dequantization always ends with a
        // Multiply; getChildInputIndex is passed dequantization.multiply with
        // no null check — confirm this invariant holds for all callers.
        const size_t inputIndex = NetworkHelper::getChildInputIndex(dequantization.multiply, node);
        inputs[inputIndex] = parent;
        const std::shared_ptr<Node> newNode = node->clone_with_new_inputs(inputs);

        // Replace the original node and keep its friendly name on the copy.
        replace_node(node, newNode);
        newNode->set_friendly_name(node->get_friendly_name());

        return newNode;
    }

    return node;
}
418
419 std::shared_ptr<ngraph::Node> LayerTransformation::moveDequantizationAfter(
420     TransformationContext &context,
421     const std::shared_ptr<ngraph::Node>& operation,
422     const FakeQuantizeDequantization& dequantization,
423     const bool updatePrecision,
424     const bool moveSubtract) const {
425     const auto result = ngraph::pass::low_precision::NetworkHelper::moveDequantizationAfter(operation, dequantization, updatePrecision, moveSubtract);
426     updateOutput(context, result.lastDequantization, result.newOperation);
427     return result.newOperation;
428 }
429
430 void LayerTransformation::fuseConvertIfPossible(const std::shared_ptr<ngraph::Node>& operation) const {
431     FakeQuantizeDequantization dequantization = NetworkHelper::getDequantization(operation, 0);
432     if ((dequantization.subtract != nullptr) &&
433         NetworkHelper::checkConstantValuePrecision(
434             dequantization.convert->get_output_element_type(0),
435             dequantization.subtract->get_input_node_shared_ptr(1))) {
436         auto newOperation = separateInStandaloneBranch(operation);
437         dequantization = NetworkHelper::getDequantization(operation, 0);
438         // TODO: It is correct to use optimizeSubtract here: uncomment following rows and fix it
439         //auto newSubtract = NetworkHelper::optimizeSubtract(dequantization.subtract);
440         //replace_node(dequantization.subtract, newSubtract);
441         NetworkHelper::removeConvertIfPossible(operation, dequantization);
442     }
443 }
444
445 void LayerTransformation::updateOutput(
446     TransformationContext &context,
447     std::shared_ptr<ngraph::Node> lastNode,
448     std::shared_ptr<ngraph::Node> originalNode) const {
449     const size_t outputSize = context.function->get_output_size();
450     for (size_t i = 0; i < outputSize; ++i) {
451         std::shared_ptr<ngraph::Node> result = context.function->get_output_op(i);
452         std::shared_ptr<ngraph::Node> outputNode = result->get_input_node_shared_ptr(0);
453         if (outputNode.get() == lastNode.get()) {
454             const std::string originalName = originalNode->get_friendly_name();
455             originalNode->set_friendly_name(originalName + LayerTransformation::originalLayerPostfix);
456             lastNode->set_friendly_name(originalName);
457             break;
458         }
459     }
460 }
461
462 void LayerTransformation::updateOutput(
463     TransformationContext& context,
464     std::shared_ptr<ngraph::Node> lastNode,
465     std::string originalName) const {
466     const size_t outputSize = context.function->get_output_size();
467     for (size_t i = 0; i < outputSize; ++i) {
468         std::shared_ptr<ngraph::Node> result = context.function->get_output_op(i);
469         std::shared_ptr<ngraph::Node> outputNode = result->get_input_node_shared_ptr(0);
470         if (outputNode.get() == lastNode.get()) {
471             lastNode->set_friendly_name(originalName);
472             break;
473         }
474     }
475 }
476
// Registers `patternRoot` as a single-node matcher on the given GraphRewrite
// pass; the matcher callback runs this transformation's transform().
void LayerTransformation::addPattern(ngraph::pass::GraphRewrite& pass, TransformationContext& context, std::shared_ptr<Node> patternRoot) const {
    ngraph::graph_rewrite_callback internal_callback = [this, &context](ngraph::pattern::Matcher &m) {
        // `result` is only consumed by the optional debug print below.
        const bool result = transform(context, m);
#ifdef LPT_DISPLAY_PRECISION
        if (result) {
            auto operationNode = m.get_match_root();
            std::cout << "Operation was transformed: " <<
                operationNode->get_type_name() << ", " <<
                operationNode->get_friendly_name() << ", output operation precision: " <<
                ((operationNode->get_output_size() == 1u) ? operationNode->get_output_element_type(0) : ngraph::element::Type()) <<
                std::endl;
        }
#endif
        // Always report "no change" to the rewriter; graph mutation is done
        // inside transform() itself.
        return false;
    };
    // TODO: better name for matcher? required?
    auto m = std::make_shared<ngraph::pattern::Matcher>(patternRoot, "SingleNodeMatcher");
    pass.add_matcher(m, internal_callback, ngraph::pass::PassProperty::CHANGE_DYNAMIC_STATE);
}
496
497 }  // namespace low_precision
498 }  // namespace pass
499 }  // namespace ngraph