// Copyright (C) 2018-2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <limits>
#include <string>
#include <vector>

#include "gna_layer_info.hpp"
#include "ie_layers.h"
#include "gna_plugin_log.hpp"

namespace GNAPluginNS {
namespace details {
using namespace InferenceEngine;

struct ScaleFactorUpdateResult {
    CNNLayer *restartLayer = nullptr;
    ScaleFactorUpdateResult() = default;
    explicit ScaleFactorUpdateResult(CNNLayer *restartlayer) : restartLayer(restartlayer) {
    }
    operator bool() {
        return restartLayer == nullptr;
    }
};

/**
 * @brief calculates the output scale factor per layer
 */
template<class T>
class ScaleFactorPerLayer {
 public:
    /**
     * @brief calculates the weights scale factor to fit the dynamic range into the target bit size,
     * and also calculates the output scale factor for the given layer
     * @param cnnLayer
     * @param weightsSize
     * @param inputScaleFactor
     * @param result
     * @return
     */
    bool operator()(T cnnLayer, int weightsSize, float inputScaleFactor, ScaleFactorUpdateResult &result) {
        return false;
    }
};

template<>
class ScaleFactorPerLayer<InferenceEngine::CNNLayer *> {
 private:
    const float activation_scale_factor = 2048.f;
    const float identity_scale_factor = 2049.0f;
    const float k_identity = 6;
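
    // fp32eq below compares floats using a relative tolerance: the allowed difference
    // scales with the smaller operand, e.g. fp32eq(2048.0f, 2048.01f) holds
    // while fp32eq(0.001f, 0.0011f) does not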
    static bool fp32eq(float p1, float p2) {
        return (std::abs(p1 - p2) <= 0.00001f * std::min(std::abs(p1), std::abs(p2)));
    }

    float getActivationScale(GNAPluginNS::LayerInfo const& layer, QuantizedLayerParams const* quantizedParams) {
        // TODO: calculate a proper scale factor; it needs some headroom to safely stay within int16 weights
        // set the initial value
        float result = 1.0f;
        result = (layer.isIdentity()) ? identity_scale_factor : activation_scale_factor;
        // if the activation is one of the ReLU family, apply a heuristic to avoid activation output overflow
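        // e.g. with activation_scale_factor == 2048 an input scale above ~2^20
        // would push result * src_scale past int32 max, hence the halving below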
        if (layer.isRelu() &&
                static_cast<uint64_t>(result * quantizedParams->_src_quant.scale)
                    > std::numeric_limits<int32_t>::max() - 1) {
            result = result * 0.5f;
        }
        return result;
    }

 public:
    bool operator()(InferenceEngine::CNNLayer *cnnLayer, int weightsSize, float inputScaleFactor, ScaleFactorUpdateResult &result) {
        if (!cnnLayer) {
            THROW_GNA_EXCEPTION << "Incorrect CNNLayer pointer \n";
        }
        LayerInfo layerInfo(*cnnLayer);
        // TODO: the current approach sets the input scale factor for true input layer(s) equal to the provided factor
        auto quant = getInjectedData<QuantizedLayerParams>(*cnnLayer);
        if (InferenceEngine::details::CaselessEq<std::string>()(cnnLayer->type, "Memory")) {
            if (CNNNetHasPrevLayer(cnnLayer)) {
                auto prevLayer = CNNNetPrevLayer(cnnLayer);
                auto prevInfo = LayerInfo(prevLayer);
                auto inputQuant = getInjectedData<QuantizedLayerParams>(prevLayer);
                // locating the corresponding memory layers with the same id
                for (auto && input : CNNNetGetAllInputLayers(cnnLayer)) {
                    LayerInfo inputInfo(input);
                    if (!inputInfo.isMemory() ||
                        !InferenceEngine::details::CaselessEq<std::string>()(input->params["id"], cnnLayer->params["id"])) {
                        continue;
                    }

                    auto quantSibling = getInjectedData<QuantizedLayerParams>(input);

                    // after restarting from the memory input - the quant data is already consistent
                    if (fp32eq(quantSibling->_dst_quant.scale, inputQuant->_dst_quant.scale)) {
                        quant->_src_quant.scale = quant->_dst_quant.scale = inputQuant->_dst_quant.scale;
                        return true;
                    }

                    if (!fp32eq(quantSibling->_dst_quant.scale, 1)) {
                        // means we have already restarted propagation from that memory layer - we cannot do much here
                        THROW_GNA_EXCEPTION << "quantization error: input scale factor (" << inputQuant->_dst_quant.scale << ")"
                            << " for " << cnnLayer->name << ", that is a child of " << prevLayer->name << ", doesn't match: "
                            << activation_scale_factor;
                    }

                    gnawarn() << "[INFO] quantization : input scale factor (" << inputQuant->_dst_quant.scale << ")"
                        << " for " << cnnLayer->name << ", that is a child of " << prevLayer->name << ", doesn't match: "
                        << activation_scale_factor << ", restarting from the corresponding memory: " << input->name << std::endl;

                    // try updating the memory input layer's scale factor and restart from it
                    quantSibling->_src_quant.scale = quantSibling->_dst_quant.scale = inputQuant->_dst_quant.scale;
                    result = ScaleFactorUpdateResult(input.get());
                    return true;
                }
            }
        }

        if (!CNNNetHasPrevLayer(cnnLayer)) {
            quant->_dst_quant.scale = inputScaleFactor;
            return ScaleFactorUpdateResult();
        }

        // by default a layer passes its input scale factor through unchanged
        auto inputQuant = getInjectedData<QuantizedLayerParams>(CNNNetPrevLayer(cnnLayer));
        quant->_dst_quant.scale = inputQuant->_dst_quant.scale;
        quant->_src_quant.scale = inputQuant->_dst_quant.scale;

        if (layerInfo.isActivation()) {
            // TODO: calculate a proper scale factor; it needs some headroom to safely stay within int16 weights
            // set the initial value
            quant->_dst_quant.scale = getActivationScale(layerInfo, quant);
        }
        return true;
    }
};

template<>
class ScaleFactorPerLayer<InferenceEngine::EltwiseLayer*> {
 public:
    bool operator()(InferenceEngine::EltwiseLayer* eltwiseLayer, int weightsSize, float inputScaleFactor, ScaleFactorUpdateResult &result) {
        if (!eltwiseLayer) {
            THROW_GNA_EXCEPTION << "Incorrect Eltwise Layer pointer \n";
        }
        auto in0 = InferenceEngine::CNNNetPrevLayer(eltwiseLayer, 0);
        auto in1 = InferenceEngine::CNNNetPrevLayer(eltwiseLayer, 1);

        auto quantParams0 = InferenceEngine::getInjectedData<QuantizedLayerParams>(in0);
        auto quantParams1 = InferenceEngine::getInjectedData<QuantizedLayerParams>(in1);
        auto quantData = InferenceEngine::getInjectedData<QuantizedLayerParams>(*eltwiseLayer);

        switch (eltwiseLayer->_operation) {
            case InferenceEngine::EltwiseLayer::Prod: {
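                // for a quantized product the scales multiply:
                // (a * s0) * (b * s1) == (a * b) * (s0 * s1)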
                quantData->_weights_quant.scale = quantParams1->_dst_quant.scale;
                quantData->_dst_quant.scale = quantParams0->_dst_quant.scale * quantParams1->_dst_quant.scale;
                break;
            }
            case InferenceEngine::EltwiseLayer::Sum: {
                // detect which input will be used as biases
                if (LayerInfo(in0).has32BOutput()) {
                    std::swap(in0, in1);
                    std::swap(quantParams0, quantParams1);
                }

                // this path might result in significant data loss
                quantData->_weights_quant.scale = quantParams1->_dst_quant.scale / quantParams0->_dst_quant.scale;
                quantData->_dst_quant.scale = quantParams1->_dst_quant.scale;
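                // scaling input0 by s1/s0 aligns it with input1: (a * s0) * (s1 / s0) == a * s1,
                // so both addends and the resulting sum share input1's scale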

                // eltwise will always work in int16
                auto maxValue = std::numeric_limits<int16_t>::max() - 1;
                if (quantData->_weights_quant.scale > maxValue + 1) {
                    // rescaling its activation input
                    // iterating through the previous layers of the eltwise
                    for (uint8_t i = 0; i < 2; ++i) {
                        InferenceEngine::CNNLayerPtr in = InferenceEngine::CNNNetPrevLayer(eltwiseLayer, i);
                        // trick to get the opposite index (0 -> 1, 1 -> 0) by negating i
                        auto quantParams =
                            InferenceEngine::getInjectedData<QuantizedLayerParams>(InferenceEngine::CNNNetPrevLayer(eltwiseLayer, !i));

                        for (; InferenceEngine::CNNNetHasPrevLayer(in.get()); in = CNNNetPrevLayer(in)) {
                            auto info = LayerInfo(in);
                            // so far we only skip split/slice layers; memory layers still need to be handled
                            // this case handles input from port 0
                            if (info.isSplit() || info.isSlice()) {
                                continue;
                            } else if (info.has16BOutput() && info.isActivation()) {
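                                // lower this activation's output scale so that the derived eltwise
                                // weight scale (opposite input scale / this scale) fits into int16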
                                auto newOutputScale = quantParams->_dst_quant.scale / maxValue;
                                if (newOutputScale > std::numeric_limits<int16_t>::max() / 2) {
                                    break;
                                }
                                auto quantDataForActivation = InferenceEngine::getInjectedData<QuantizedLayerParams>(*in);
                                gnawarn() << "[WARNING] saturated weights for " << eltwiseLayer->name
                                    << ". Layer new output scale: " << in->name << ", output_scale=" << newOutputScale
                                    << ", was " << quantDataForActivation->_dst_quant.scale << "\n" << std::flush;
                                quantDataForActivation->_dst_quant.scale = newOutputScale;
                                result = ScaleFactorUpdateResult(in.get());
                                return true;
                            } else if (info.has16BOutput()) {
                                break;
                            }

                            // if we got here, it means we are on the port 1 path
                            if (info.isFullyConnected() || info.isConvolution()) {
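                                // rescale the producing layer's output so that the eltwise weight
                                // ratio fits into int16, recompute its weight scale accordingly,
                                // and restart propagation from that layer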
                                auto quantDataForInputLayer = InferenceEngine::getInjectedData<QuantizedLayerParams>(*in);
                                auto newOutputScale = quantParams->_dst_quant.scale * maxValue;
                                auto newWeightScale = newOutputScale / quantDataForInputLayer->_src_quant.scale;
                                quantDataForInputLayer->_dst_quant.scale = newOutputScale;
                                quantDataForInputLayer->_weights_quant.scale = newWeightScale;
                                result = ScaleFactorUpdateResult(in.get());
                                return true;
                            }
                        }
                    }

                    // we were unable to rescale the input - results might be inaccurate
                    gnawarn() << "[INFO] weights saturated for " << eltwiseLayer->name << "\n";
                }
                break;
            }
            default: THROW_GNA_EXCEPTION << "Unsupported Eltwise layer for quantization: " << eltwiseLayer->_operation;
        }
        return true;
    }
};

template<>
class ScaleFactorPerLayer<InferenceEngine::ConcatLayer*> {
 public:
    bool operator()(InferenceEngine::ConcatLayer* concatLayer, int weightsSize, float inputScaleFactor, ScaleFactorUpdateResult &result) {
        if (!concatLayer) {
            THROW_GNA_EXCEPTION << "Incorrect Concat Layer pointer \n";
        }
        auto in0 = InferenceEngine::CNNNetPrevLayer(concatLayer, 0);
        auto in1 = InferenceEngine::CNNNetPrevLayer(concatLayer, 1);
        auto infoIn0 = LayerInfo(in0);
        auto infoIn1 = LayerInfo(in1);
        auto quantParams0 = InferenceEngine::getInjectedData<QuantizedLayerParams>(in0);
        auto quantParams1 = InferenceEngine::getInjectedData<QuantizedLayerParams>(in1);
        GNAPluginNS::QuantizedLayerParams* sourceQuantParams = nullptr;
        auto quantData = InferenceEngine::getInjectedData<QuantizedLayerParams>(*concatLayer);
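
        // strategy: if both inputs already share one scale there is nothing to do;
        // otherwise the scale coming from the true network Input branch wins and
        // is pushed into the other branch, restarting propagation from it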
        if (quantParams0->_dst_quant.scale == quantParams1->_dst_quant.scale) {
            return true;
        } else if (infoIn0.isInput() && infoIn1.isInput()) {
            THROW_GNA_EXCEPTION << "Two Input layers have different scales in concat! \n";
        }

        int i = 0;
        if (infoIn0.isInput()) {
            sourceQuantParams = quantParams0;
        } else if (infoIn1.isInput()) {
            i = 1;
            sourceQuantParams = quantParams1;
        }

        if (!sourceQuantParams) {
            THROW_GNA_EXCEPTION << "Concat quantization for this case needs to be implemented! \n";
        }
        auto destinationQuantParams =
            InferenceEngine::getInjectedData<QuantizedLayerParams>(InferenceEngine::CNNNetPrevLayer(concatLayer, !i));
        InferenceEngine::CNNLayerPtr in = InferenceEngine::CNNNetPrevLayer(concatLayer, !i);

        quantData->_dst_quant.scale = sourceQuantParams->_dst_quant.scale;
        quantData->_src_quant.scale = sourceQuantParams->_dst_quant.scale;

        destinationQuantParams->_dst_quant.scale = sourceQuantParams->_dst_quant.scale;
        result = ScaleFactorUpdateResult(in.get());

        return true;
    }
};

template<>
class ScaleFactorPerLayer<InferenceEngine::WeightableLayer*> {
 private:
    float const _scale_reduction_50 = 0.50f;
    float const _scale_reduction_45 = 0.45f;
    float const _scale_reduction_40 = 0.40f;
    float const _scale_reduction_35 = 0.35f;

    uint16_t const _scale_change_req_threshold = 30;
    uint16_t const _scale_change_threshold_100 = 100;
    uint16_t const _scale_change_threshold_150 = 150;
    uint16_t const _scale_change_threshold_200 = 200;
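
    // empirical heuristic: the further the projected int32 accumulator value
    // overshoots its limit, the stronger the weight-scale reduction applied below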

 public:
    bool operator()(InferenceEngine::WeightableLayer *wl, int weightsSize, float inputScaleFactor, ScaleFactorUpdateResult &result) {
        if (!wl) {
            THROW_GNA_EXCEPTION << "Incorrect Weightable Layer pointer \n";
        } else if (!wl->_weights) {
            THROW_GNA_EXCEPTION << "Incorrect weight value for " << wl->name << ":" << wl->type << "\n";
        }

        auto prevLayer = CNNNetPrevLayer(wl);
        auto quantDataForInputLayer =
            InferenceEngine::getInjectedData<QuantizedLayerParams>(*InferenceEngine::CNNNetPrevLayer(wl).get());

        auto quant = InferenceEngine::getInjectedData<QuantizedLayerParams>(*wl);
        // TODO: pass 8 bits somehow
        if (quant->_weights_quant.scale == 1.0f) {
            size_t scaleRange = 0;
            if (weightsSize == 2) {
                scaleRange = MAX_VAL_2B_WEIGHT;
            } else if (weightsSize == 1) {
                scaleRange = MAX_VAL_1B_WEIGHT;
            } else {
                THROW_GNA_EXCEPTION << "Unsupported weights size of: " << weightsSize;
            }
            quant->_weights_quant.scale =
                ScaleFactorForQuantization(wl->_weights->buffer().as<float *>(), scaleRange, wl->_weights->size());
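            // ScaleFactorForQuantization derives the scale from the weights' dynamic
            // range, mapping the largest |weight| onto scaleRange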

            // TODO: find out why
            if (weightsSize == 1) {
                quant->_weights_quant.scale *= MAX_OUT_MULTIPLIER;
            }
        }

        quant->_src_quant.scale = quantDataForInputLayer->_dst_quant.scale;

        double tmp_dst_quant_scale = quant->_weights_quant.scale * quantDataForInputLayer->_dst_quant.scale;

        if (weightsSize == 1 &&
                static_cast<uint64_t>(tmp_dst_quant_scale * quant->_src_quant.scale) >
                static_cast<uint64_t>(std::numeric_limits<int32_t>::max() - 1) * _scale_change_req_threshold) {
            gnawarn() << "Output scale for " << wl->name
                << " is too large and is being reduced, otherwise saturation is likely to happen \n";
            // reduce the weight scale according to an experimental heuristic
            if (quant->_dst_quant.scale * quant->_src_quant.scale / std::numeric_limits<int32_t>::max() < _scale_change_threshold_100) {
                quant->_weights_quant.scale *= _scale_reduction_50;
                tmp_dst_quant_scale *= _scale_reduction_50;
            } else if (quant->_dst_quant.scale * quant->_src_quant.scale / std::numeric_limits<int32_t>::max() < _scale_change_threshold_150) {
                quant->_weights_quant.scale *= _scale_reduction_45;
                tmp_dst_quant_scale *= _scale_reduction_45;
            } else if (quant->_dst_quant.scale * quant->_src_quant.scale / std::numeric_limits<int32_t>::max() < _scale_change_threshold_200) {
                quant->_weights_quant.scale *= _scale_reduction_40;
                tmp_dst_quant_scale *= _scale_reduction_40;
            } else {
                quant->_weights_quant.scale *= _scale_reduction_35;
                tmp_dst_quant_scale *= _scale_reduction_35;
            }
        }

        quant->_dst_quant.scale = tmp_dst_quant_scale;
        return true;
    }
};

template<>
class ScaleFactorPerLayer<InferenceEngine::ScaleShiftLayer*> : public ScaleFactorPerLayer<InferenceEngine::WeightableLayer*> {
 public:
    bool operator()(InferenceEngine::WeightableLayer *wl, int weightsSize, float inputScaleFactor, ScaleFactorUpdateResult &result) {
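        // ScaleShift weights are always quantized as 16-bit values, hence the
        // hard-coded weightsSize of 2 regardless of the requested size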
        return ScaleFactorPerLayer<InferenceEngine::WeightableLayer*>::operator()(wl, 2, inputScaleFactor, result);
    }
};

/**
 * GNA convolutions cannot be quantized in int8; remove this specialization when the library starts supporting it
 */
template<>
class ScaleFactorPerLayer<InferenceEngine::ConvolutionLayer*> : public ScaleFactorPerLayer<InferenceEngine::ScaleShiftLayer*> {
};

}  // namespace details

/**
 * @brief the scale factor calculator computes only the output scale factors for the layers;
 * if scale factor propagation is not possible, it indicates a restart condition
 */
class ScaleFactorCalculator {
    using Cnt = std::vector<InferenceEngine::CNNLayerPtr>;
    Cnt net;
    mutable Cnt::const_iterator idx;
    float inputScaleFactor;
    mutable bool needRestart = false;
    int weightsBytesSize;

 public:
    ScaleFactorCalculator(Cnt &net, int weightsBytesSize, float inputScaleFactor)
        : net(net), inputScaleFactor(inputScaleFactor), weightsBytesSize(weightsBytesSize) {
        idx = std::begin(this->net);
    }

    bool needToRestart() const {
        return needRestart;
    }

    bool allLayersProcessed() const {
        return idx == std::end(net);
    }

    std::vector<InferenceEngine::CNNLayerPtr> getStartLayers() const {
        return std::vector<InferenceEngine::CNNLayerPtr>(idx, std::end(net));
    }

    template<class T>
    bool operator()(T ptr) const {
        idx++;
        details::ScaleFactorUpdateResult result;
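        // run the per-layer calculator; on failure abort the pass, and when a
        // restart is requested reposition idx at the layer that asked for it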
        if (!details::ScaleFactorPerLayer<T>()(ptr, weightsBytesSize, inputScaleFactor, result)) {
            return false;
        }
        if (result) {
            return true;
        }
        needRestart = true;
        idx = std::find_if(net.begin(), net.end(), [&](InferenceEngine::CNNLayerPtr cnnLayer) {
            if (!result) {
                return result.restartLayer == cnnLayer.get();
            }
            return ptr == cnnLayer.get();
        });
        return true;
    }
};

}  // namespace GNAPluginNS