inference-engine/src/inference_engine/cnn_network_int8_normalizer.hpp

   1 // Copyright (C) 2018 Intel Corporation
   2 // SPDX-License-Identifier: Apache-2.0
   3 //
   4
   5 #pragma once
   6
   7 #include <map>
   8 #include <memory>
   9 #include <float.h>
  10
  11 #include <string>
  12 #include <vector>
  13
  14 #include <ie_icnn_network.hpp>
  15 #include <ie_icnn_network_stats.hpp>
  16 #include <cpp/ie_cnn_network.h>
  17
  18 namespace InferenceEngine {
  19 namespace details {
  20
  21 /**
  22 * The raw statistics produced by the statistics collection tool must be processed to get the best
  23 * accuracy. This processing depends on the topology and on the parameters of the layers,
  24 * i.e. data going to a regular and to a depth-wise convolution is scaled differently: for a
  25 * regular convolution it is scaled with the tensor-wide approach, for a depth-wise convolution it
  26 * is scaled with the per-channel approach.
  27 * This class contains the logic for obtaining the scales.
  28 */
  29 class CNNStatisticHelper {
  30 public:
  31     /**
  32     * The topology is needed to make decisions about scales
  33     * @param network initial network to be quantized, the topology can be changed during quantization
  34     * @param internalNodesStats initial statistics
  35     * @param maxSign - maximal signed value to be used for calculation of scales
  36     * @param maxUnsign - maximal unsigned value to be used for calculation of scales
  37     *
  38     */
  39     CNNStatisticHelper(CNNNetwork& network,
  40                        const std::map<std::string, NetworkNodeStatsPtr>& internalNodesStats,
  41                        int maxSign,
  42                        int maxUnsign);
  43
  44     /**
  45     * Returns whether the layer can be quantized, based on the statistics existing before and
  46     * after the layer
  47     */
  48     bool canLayerBeQuantized(const std::string &layerName) const;
  49
  50     /**
  51      * The topology is allowed to change, so the statistics need to be modified accordingly.
  52      *
  53      * Currently only copying of statistics is needed.
  54      *
  55      * @param srcName name of the layer whose statistics are taken
  56      * @param dstName name of the layer to which the statistics are applied
  57      */
  58     void copyStatistics(const std::string& srcName, const std::string& dstName);
  59
  60     /**
  61     * Returns whether the layer produces negative data according to the collected statistics:
  62     * true means that the layer produces negative values,
  63     * false means that the layer produces only positive numbers
  64     * @param layerName - name of the layer of interest
  65     * @param outputPort - number of the port to verify. -1 stands for verification of all outputs of
  66     * the layer
  67     */
  68     bool hasNegativeOutput(const std::string &layerName, int outputPort = -1) const;
  69
  70     /**
  71      * Returns the input scale for @p layer, based on the statistics
  72      * @return blob with scales per channel
  73      */
  74     InferenceEngine::Blob::Ptr getInputScale(CNNLayer::Ptr layer) const;
  75
  76     /**
  77      * Returns the output scale for @p layer, based on the statistics
  78      * @return blob with scales per channel
  79      */
  80     InferenceEngine::Blob::Ptr getOutputScale(CNNLayer::Ptr layer) const;
  81
  82     /**
  83      * Provides the max signed value; this is the only place for synchronization with the other
  84      * algorithms in the normalizer which require it
  85      */
  86     int getMaxSignValue() const;
  87
  88     /**
  89      * Returns the latest layer in a fusion. The data from the returned layer goes on to another
  90      * layer, which means that for all layers which will be fused only the statistics of that latest layer are used
  91      * @param layer - layer of interest
  92      *
  93      * @return the layer whose statistics should be used for calculation of all scales for the layer
  94      *         passed as a parameter to this method
  95      */
  96     CNNLayer::Ptr getLatestInFuse(CNNLayer::Ptr layer) const;
  97
  98 private:
  99     /**
 100      * Calculates the scale factor according to the statistics for the layer passed to this function.
 101      * No logic for selecting another layer is implemented here.
 102      * NOTE(review): the signature takes no layer, only stats - this description may be stale; confirm
 103      * @param channels redundant parameter, should be removed
 104      * @param stats redundant parameter, should be removed
 105      * @param maxInt - we can quantize to I8 even if data is unsigned, so the max number needs to be
 106      *               provided explicitly
 107      *
 108      * @return InferenceEngine::Blob::Ptr
 109      */
 110     InferenceEngine::Blob::Ptr calculateScaleFactor(size_t channels,
 111                                                     NetworkNodeStatsPtr stats,
 112                                                     int maxInt) const;
 113
 114     /**
 115      * Selects the latest layer in the fusion and returns its statistics
 116     */
 117     NetworkNodeStatsPtr  getStatistic(CNNLayer::Ptr layer) const;
 118
 119     /**
 120      * Passes over all statistics and normalizes them to a single scale per tensor, individual scales
 121      * per channel, or a mix, depending on the pattern in the network
 122      */
 123     void NormalizeStatistic();
 124
 125     CNNNetwork network_;  // presumably the network passed to the constructor - confirm in the .cpp
 126     std::map<std::string, NetworkNodeStatsPtr> internalNodesStats_;  // statistics keyed by string; presumably layer name - confirm
 127     int maxSign_;  // presumably the constructor's maxSign - confirm
 128     int maxUnsign_;  // presumably the constructor's maxUnsign - confirm
 129 };
 130
 131 /**
 132  * This class normalizes and quantizes a network to the "Int8" state.
 133  * The conversion
 134  *  1) adds scaleshifts which normalize activation values to the int8 (S8/U8) range
 135  *  2) quantizes weights and biases of convolutions
 136  *  3) adds special attributes to layers because the semantics of int8 layers differ from floating
 137  *  point ones. For example, after a convolution we need to return to denormalized values and
 138  *  there should be a special scale for that
 139  *  4) transforms some layers into other ones. For example, if an i8 to i8 ScaleShift is not supported
 140  *  by the backend, this scaleshift is converted to a grouped (depth-wise in the ideal case) convolution
 141  *
 142  *  This class depends heavily on the backend and its fusion. It assumes that fusion is always
 143  *  executed: a fused pattern cannot be split into independent execution of two layers in int8 mode.
 144  *  This is done to calculate normalization factors in the way that best preserves accuracy.
 145  *  Currently supported fusions
 146  *  1. Conv-ReLU
 147  *  2. Conv-Sum-ReLU which arises from the pattern
 148  *  Conv        Something
 149  *    \            /
 150  *        Eltwise
 151  *         ReLU
 152  *  Here, the output from "Something" is used as in-place storage for accumulation of the
 153  *  results of the convolution. That leads to a tricky case in int8 when we have a signed int8 input
 154  *  and an unsigned u8 output
 155  *  */
 156 class INFERENCE_ENGINE_API_CLASS(CNNNetworkInt8Normalizer) {
 157 public:
 158     CNNNetworkInt8Normalizer() {
 159     }
 160 private:
 161     /** Helper function for filling in scaleshift weights for normalization of activations */
 162     static void fillInScaleShift(ScaleShiftLayer* scshLayer, size_t c, float* weightsN, float* weightsD);
 163
 164 public:
 165     /** Main function which performs the quantization */
 166     void NormalizeNetwork(ICNNNetwork& network, ICNNNetworkStats& netStats);
 167
 168 protected:
 169     /** Helper function to add scaleshifts and other layers for transformation of the topology */
 170     void AddLayerToCNNNetworkBeforeLayer(CNNLayer::Ptr newLayer, CNNLayer::Ptr successor, size_t port);
 171     /** Helper function to add scaleshifts and other layers for transformation of the topology */
 172     void AddLayerToCNNNetworkAfterData(DataPtr pData, CNNLayer::Ptr layer, const std::string& nextLayerName);
 173     /**  Adds a ScaleShift between the two specified layers  */
 174     void AddScaleShiftBetween(CNNNetwork& net, const CNNLayerPtr layer1, const CNNLayerPtr layer2, CNNStatisticHelper& statHelper);
 175
 176
 177     /**
 178      * Recalculates weights according to input scales, quantizes weights and biases, and
 179      * adds o-scale and w-scale
 180      * w-scale - multiplying the i8 convolution result by this scale produces denormalized fp32
 181      * data
 182      * o-scale - multiplying by this scale converts the above denormalized fp32 to i8 for the next layer
 183      */
 184     void QuantizeConvolution(CNNLayer::Ptr convolution, CNNStatisticHelper& statHelper);
 185
 186     /**  Adds ScaleShifts everywhere */
 187     void AddScaleShifts(CNNNetwork& net, CNNStatisticHelper& statHelper);
 188
 189     /**
 190      * Goes over all layers and marks which layers will be executed in FP32/I8, and marks the data
 191      * between layers as I8/U8/FP32
 192      */
 193     void DefinesExecutionPrecision(CNNNetwork& net, CNNStatisticHelper& statHelper);
 194
 195     /**
 196      * Since o-scales exist only for convolutions, we need to propagate them down over concats and
 197      * linear layers
 198      */
 199     void PropagateScaleFactors(CNNNetwork& net, const CNNStatisticHelper& statHelper);
 200
 201     /**
 202      * Normalizes and quantizes srcData, using scales for normalization and the precision of int8blob
 203      * for quantization
 204      */
 205     void ScaleDataToInt(const float* srcData, size_t srcSize, Blob::Ptr int8blob, const std::vector<float>& scales);
 206
 207     /**
 208      * Replaces all ScaleShift layers met in the model by depth-wise convolutions with the same
 209      * weights and biases.
 210      *
 211      * Exceptions:
 212      * 1. ScaleShift following an Input layer - it is not converted to a depth-wise convolution
 213      * 2. ScaleShift producing an output of the network
 214      * 3. ScaleShift passing data to Priorbox
 215      *
 216      * This conversion avoids introducing one more i8 primitive - a ScaleShift accepting i8 input
 217      * and producing i8 output
 218      */
 219     void replaceScaleShiftByDWConvolution(CNNNetwork& net);
 220
 221     /** Helper function which creates a DW/Grouped/regular convolution from the passed weights and biases */
 222     CNNLayer::Ptr createDWConvolutionForScale(const std::string& layerName, size_t channels, float *weights, float *biases);
 223
 224     /**
 225     * Returns tails from I8 to FP32 until convolution - it is the most performant approach because
 226     * a convolution can convert to FP32 for free, while adding one more scale would decrease performance
 227     */
 228     void returnTailToFP32(CNNLayer::Ptr layer);
 229
 230     /**
 231      * Verifies whether the next layer has a type which can potentially be fused with convolution
 232      * and whether its activation is supported for int8
 233      * @return true if the layer does not have an activation improper for fusion
 234      */
 235     bool isNextFusionAllowed(CNNLayer::Ptr layer) const;
 236 };
 237
 238 typedef std::shared_ptr<CNNNetworkInt8Normalizer> CNNNetworkNormalizerPtr;  // std::shared_ptr alias for CNNNetworkInt8Normalizer
 239
 240 }  // namespace details
 241 }  // namespace InferenceEngine