// Copyright (C) 2018 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#include <ie_icnn_network.hpp>
#include <ie_icnn_network_stats.hpp>
#include <cpp/ie_cnn_network.h>

namespace InferenceEngine {
namespace details {

/**
 * We have raw statistics from the stat collection tool, and these statistics should be processed to get the
 * best accuracy. This transformation depends on the topology and on the parameters of the layers,
 * i.e. data going to a regular convolution and to a depth-wise convolution would be scaled differently: in
 * case of a regular convolution it should be scaled with a tensor-wide approach, for a depth-wise
 * convolution it should be scaled with a per-channel approach.
 * This class contains the logic of getting such scales.
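 *
 * As an illustrative sketch only (the functions below are hypothetical, not part of this API): given
 * per-channel maxima of absolute values, the per-channel approach keeps one scale per channel, while
 * the tensor-wide approach first reduces them to a single range:
 * @code
 * #include <algorithm>
 * #include <vector>
 *
 * // absMax[c] = max(|min[c]|, |max[c]|) collected for output channel c
 * std::vector<float> perChannelScales(const std::vector<float>& absMax, float maxInt) {
 *     std::vector<float> scales(absMax.size());
 *     for (size_t c = 0; c < absMax.size(); c++)
 *         scales[c] = absMax[c] > 0.f ? maxInt / absMax[c] : 1.f;  // identity scale for an empty range
 *     return scales;                       // depth-wise case: individual scale per channel
 * }
 *
 * float tensorWideScale(const std::vector<float>& absMax, float maxInt) {
 *     float m = *std::max_element(absMax.begin(), absMax.end());
 *     return m > 0.f ? maxInt / m : 1.f;   // regular case: one scale for the whole tensor
 * }
 * @endcode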
 */
class CNNStatisticHelper {
public:
    /**
     * We need to have the topology to make a decision about scales.
     * @param network initial network to be quantized; the topology can be changed during quantization
     * @param internalNodesStats initial statistics
     * @param maxSign maximal signed value to be used for the calculation of scales
     * @param maxUnsign maximal unsigned value to be used for the calculation of scales
     */
    CNNStatisticHelper(CNNNetwork& network,
                       const std::map<std::string, NetworkNodeStatsPtr>& internalNodesStats,
                       int maxSign,
                       int maxUnsign);

    /**
     * Returns whether we can quantize a layer based on the statistics existing before and after it
     */
    bool canLayerBeQuantized(const std::string &layerName) const;

    /**
     * The topology is allowed to change, and we need to modify the statistics accordingly.
     *
     * Currently only a copy of the statistics is needed.
     *
     * @param srcName name of the layer whose statistics need to be taken
     * @param dstName name of the layer to which the statistics will be applied
     */
    void copyStatistics(const std::string& srcName, const std::string& dstName);

    /**
     * Returns whether the layer produces negative data according to the collected statistics:
     * true means that the layer produces negative values,
     * false means that the layer produces only positive numbers.
     * @param layerName name of the layer of interest
     * @param outputPort number of the port to verify; -1 stands for verification of all outputs of
     * the layer
     */
    bool hasNegativeOutput(const std::string &layerName, int outputPort = -1) const;

    /**
     * Returns the input scale for a layer based on the statistics
     * @return blob with scales per channel
     */
    InferenceEngine::Blob::Ptr getInputScale(CNNLayer::Ptr layer) const;

    /**
     * Returns the output scale for a layer based on the statistics
     * @return blob with scales per channel
     */
    InferenceEngine::Blob::Ptr getOutputScale(CNNLayer::Ptr layer) const;

    /**
     * Provides the maximal signed value as the single point of synchronization with other algorithms in
     * the normalizer which require it
     */
    int getMaxSignValue() const;

    /**
     * Returns the latest layer in a fusion; the data from the returned layer will go to another one. This
     * means that for all layers which will be fused we have to use only the statistics from that latest layer.
     * @param layer layer of interest
     *
     * @return the layer whose statistics should be used for the calculation of all scales for the layer
     * passed as a parameter to this method
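     *
     * A hypothetical usage fragment (assuming the backend fuses Convolution with a following ReLU,
     * so that scales for the fused primitive are derived from the end of the chain Conv -> ReLU):
     * @code
     * CNNLayer::Ptr last = statHelper.getLatestInFuse(conv);   // expected to be the ReLU layer
     * Blob::Ptr outScale = statHelper.getOutputScale(last);    // scales from the ReLU statistics
     * @endcode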
     */
    CNNLayer::Ptr getLatestInFuse(CNNLayer::Ptr layer) const;

private:
    /**
     * Calculates the scale factor according to the statistics for the layer passed to this function. No
     * logic for selecting another layer is implemented here.
     *
     * @param channels redundant parameter, should be removed
     * @param stats redundant parameter, should be removed
     * @param maxInt we can quantize to I8 even if the data is unsigned, so such a max number needs to be provided
     *
     * @return InferenceEngine::Blob::Ptr
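     *
     * The assumed per-channel formula (a sketch consistent with the parameters above, not a verbatim
     * copy of the implementation): scale[c] = maxInt / max(|min[c]|, |max[c]|), falling back to an
     * identity scale of 1.0 when the collected range is empty.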
     */
    InferenceEngine::Blob::Ptr calculateScaleFactor(size_t channels,
                                                    NetworkNodeStatsPtr stats,
                                                    int maxInt) const;

    /**
     * Selects the latest layer in the fusion and returns its statistics
     */
    NetworkNodeStatsPtr getStatistic(CNNLayer::Ptr layer) const;

    /**
     * Passes over all statistics and normalizes them to a single scale per tensor, individual scales per
     * channel, or a mix, depending on the pattern in the network
     */
    void NormalizeStatistic();

    std::map<std::string, NetworkNodeStatsPtr> internalNodesStats_;
};

/**
 * This class normalizes and quantizes a network to the "Int8" state.
 * The converted network will have:
 * 1) ScaleShifts which normalize activation values to the int8 (S8/U8) range
 * 2) quantized weights and biases of convolutions
 * 3) special attributes added to layers, because the semantics of an int8 layer differ from the floating
 * point one. For example, after a convolution we need to return back to denormalized values, and
 * there should be a special scale for that
 * 4) some layers transformed to other ones. For example, if an i8-to-i8 ScaleShift is not supported
 * by the backend, this ScaleShift will be converted to a grouped (depth-wise in the ideal case) convolution
 *
 * This class depends heavily on the backend and its fusion. It assumes that fusion must be executed all
 * the time; we cannot split it into an independent execution of two layers in int8 mode. This is
 * done to calculate the normalization factors in the most optimal way to preserve accuracy.
 * Currently supported fusions:
 * 1. Conv-ReLU
 * 2. Conv-Sum-ReLU, which appears from the pattern where the outputs of a convolution and of some
 * other layer ("Something") are summed by an Eltwise and passed to a ReLU.
 * Here, the output of "Something" will be used as in-place storage for accumulating the
 * convolution results. That leads to a tricky case in int8 when we have a signed int8 input and an
 * unsigned (U8) output.
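 *
 * A worked example of the normalization in item 1 (illustrative numbers, not taken from the code):
 * if the collected statistics say an activation tensor lies in [-2.0, 4.0], the symmetric S8 scale is
 * 127 / max(|-2.0|, |4.0|) = 127 / 4.0 = 31.75, so the value 1.5 is stored as round(1.5 * 31.75) = 48,
 * and 48 / 31.75 restores approximately 1.512.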
 */
class INFERENCE_ENGINE_API_CLASS(CNNNetworkInt8Normalizer) {
public:
    CNNNetworkInt8Normalizer() {
    }

    /** Helper function for filling in ScaleShift weights for the normalization of activations */
    static void fillInScaleShift(ScaleShiftLayer* scshLayer, size_t c, float* weightsN, float* weightsD);

    /** Main function to invoke quantization */
    void NormalizeNetwork(ICNNNetwork& network, ICNNNetworkStats& netStats);

private:
    /** Helper function to add ScaleShifts and other layers for the transformation of the topology */
    void AddLayerToCNNNetworkBeforeLayer(CNNLayer::Ptr newLayer, CNNLayer::Ptr successor, size_t port);
    /** Helper function to add ScaleShifts and other layers for the transformation of the topology */
    void AddLayerToCNNNetworkAfterData(DataPtr pData, CNNLayer::Ptr layer, const std::string& nextLayerName);
    /** Adds a ScaleShift between two specified layers */
    void AddScaleShiftBetween(CNNNetwork& net, const CNNLayerPtr layer1, const CNNLayerPtr layer2, CNNStatisticHelper& statHelper);

    /**
     * Function which recalculates weights according to input scales, quantizes weights and biases, and
     * adds the o-scale and w-scale:
     * w-scale - multiplying the i8 convolution result by this scale produces denormalized fp32 data
     * o-scale - multiplying by this scale converts the above denormalized fp32 data to i8 for the next layer
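     *
     * A sketch of how the factors are assumed to compose (not a verbatim description of the code): if the
     * input activations were normalized by i-scale and the weights were quantized with k-scale, then
     * w-scale ~ 1 / (i-scale * k-scale) returns the i8 accumulator to denormalized fp32, and o-scale is
     * the next layer's input normalization factor, so (conv_i8_result * w-scale) * o-scale lands in the
     * next layer's i8 range.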
     */
    void QuantizeConvolution(CNNLayer::Ptr convolution, CNNStatisticHelper& statHelper);

    /** Adds ScaleShifts everywhere */
    void AddScaleShifts(CNNNetwork& net, CNNStatisticHelper& statHelper);

    /**
     * Goes over all layers, marks which layers will be executed in FP32 or I8, and marks the data between
     * layers as I8/U8/FP32
     */
    void DefinesExecutionPrecision(CNNNetwork& net, CNNStatisticHelper& statHelper);

    /**
     * Since o-scales exist only for convolutions, we need to propagate them down over concats and
     * similar layers
     */
    void PropagateScaleFactors(CNNNetwork& net, const CNNStatisticHelper& statHelper);

    /**
     * Normalizes and quantizes srcData using the scales for normalization and the int8blob precision
     * for quantization
     */
    void ScaleDataToInt(const float* srcData, size_t srcSize, Blob::Ptr int8blob, const std::vector<float>& scales);

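    // A minimal sketch of the assumed per-channel math of ScaleDataToInt (an illustration, not the
    // actual body; it assumes srcData is laid out so that consecutive blocks of
    // srcSize / scales.size() elements belong to one channel, that int8blob holds I8 data, and that
    // <cmath> and <algorithm> are available):
    //
    //     int8_t* dst = int8blob->buffer().as<int8_t*>();
    //     size_t perChannel = srcSize / scales.size();
    //     for (size_t i = 0; i < srcSize; i++) {
    //         float v = std::round(srcData[i] * scales[i / perChannel]);
    //         dst[i] = static_cast<int8_t>(std::max(-128.f, std::min(127.f, v)));
    //     }
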
    /**
     * Replaces all ScaleShift layers met in the model with depth-wise convolutions having the same
     * weights and biases, with the following exceptions:
     *
     * 1. a ScaleShift following after an Input layer is not converted to a depth-wise convolution
     * 2. a ScaleShift producing an output of the network
     * 3. a ScaleShift passing data to a PriorBox
     *
     * This conversion allows avoiding the introduction of one more i8 primitive - a ScaleShift accepting
     * i8 input and producing i8 output
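     *
     * For intuition (a sketch of the equivalence, not of the implementation): per channel c, a ScaleShift
     * computes y = w[c] * x + b[c], which is exactly a 1x1 depth-wise convolution with the group count
     * equal to the number of channels, kernel value w[c] and bias b[c].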
     */
    void replaceScaleShiftByDWConvolution(CNNNetwork& net);

    /** Helper function which creates a DW/Grouped/regular convolution from the passed weights and biases */
    CNNLayer::Ptr createDWConvolutionForScale(const std::string& layerName, size_t channels, float *weights, float *biases);

    /**
     * Returns tails from I8 back to FP32 until a convolution is met - this is the most performant approach
     * because a convolution can convert to FP32 for free, while adding one more scale would decrease performance
     */
    void returnTailToFP32(CNNLayer::Ptr layer);

    /**
     * Verifies whether the next layer has a type which can potentially be fused with a convolution,
     * and whether the activation is supported for int8
     * @return true if the layer does not have an improper activation for fusion
     */
    bool isNextFusionAllowed(CNNLayer::Ptr layer) const;
};

typedef std::shared_ptr<CNNNetworkInt8Normalizer> CNNNetworkNormalizerPtr;

}  // namespace details
}  // namespace InferenceEngine