IVGCVSW-1865 - Support NHWC for Convolution2D (CpuRef)
[platform/upstream/armnn.git] src/backends/reference/workloads/ConvImpl.hpp
//
// Copyright © 2017 Arm Ltd. All rights reserved.
// SPDX-License-Identifier: MIT
//

#pragma once

#include "RefWorkloadUtils.hpp"

#include <armnn/Tensor.hpp>

#include <boost/assert.hpp>
#include <boost/numeric/conversion/cast.hpp>

#include <algorithm>
#include <cmath>
#include <limits>

namespace armnn
{

/// Performs multiplication of an integer with a multiplier which is less than one,
/// using quantized integer arithmetic which is consistent with AndroidNN's CPU executor.
struct QuantizedMultiplierSmallerThanOne
{
public:
    /// Constructs a QuantizedMultiplierSmallerThanOne which will multiply by the given multiplier.
    /// This stores the appropriate integer quantities (derived from the given multiplier) for later use.
    /// The implementation of this function is adapted from Android NN's QuantizeMultiplierSmallerThanOne().
    QuantizedMultiplierSmallerThanOne(float multiplier);

    /// The implementation of this function is adapted from Android NN's MultiplyByQuantizedMultiplierSmallerThanOne().
    int32_t operator*(int32_t rhs) const;

private:
    /// The implementation of this function is adapted from gemmlowp's SaturatingRoundingDoublingHighMul().
    static int32_t SaturatingRoundingDoublingHighMul(int32_t a, int32_t b);

    /// The implementation of this function is adapted from gemmlowp's RoundingDivideByPOT().
    static int32_t RoundingDivideByPOT(int32_t x, int exponent);

    int32_t m_Multiplier;
    int32_t m_RightShift;
};
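
// Illustrative sketch (not part of the original header): conceptually, the constructor
// decomposes the real multiplier into a Q0.31 fixed-point mantissa plus a right shift,
// i.e. multiplier ~= m_Multiplier * 2^-31 * 2^-m_RightShift, with m_Multiplier in [2^30, 2^31).
// For example, for multiplier = 0.25 the stored values would be m_Multiplier == 1 << 30 and
// m_RightShift == 1, so that:
//
//   int32_t y = QuantizedMultiplierSmallerThanOne(0.25f) * 1000;   // y == 250
//
// computes round(0.25 * 1000) entirely in integer arithmetic. This is the pattern ConvImpl
// uses below to requantize its accumulator.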

/// An implementation shared by normal and depthwise convolution.
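/// Filter layout assumptions (inferred from the index arithmetic below): a normal convolution
/// expects [O, I, H, W] weights with NCHW data and [O, H, W, I] weights with NHWC data;
/// a depthwise convolution expects [M, I, H, W] weights, where M is the depth multiplier.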
template<typename ConvData, typename InputType, typename BiasType, typename AccumulatorType>
static void ConvImpl(ConvData data,
                     const InputType* inputData,
                     float inputScale,
                     int32_t inputOffset,
                     const InputType* filterData,
                     float filterScale,
                     int32_t filterOffset,
                     const BiasType* biasData,
                     InputType* outputData,
                     float outputScale,
                     int32_t outputOffset,
                     const TensorInfo& filterInfo,
                     bool depthwise = false)
{
    if (data.m_Parameters.m_BiasEnabled && !biasData)
    {
        throw InvalidArgumentException("Bias is enabled but the bias data is invalid");
    }

    const TensorInfo& inputInfo0  = GetTensorInfo(data.m_Inputs[0]);
    const TensorInfo& outputInfo0 = GetTensorInfo(data.m_Outputs[0]);

    const DataLayoutIndexed dataLayoutIndexed(data.m_Parameters.m_DataLayout);
    const unsigned int channelsIndex = dataLayoutIndexed.GetChannelsIndex();
    const unsigned int heightIndex   = dataLayoutIndexed.GetHeightIndex();
    const unsigned int widthIndex    = dataLayoutIndexed.GetWidthIndex();
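    // For NCHW: channelsIndex == 1, heightIndex == 2, widthIndex == 3.
    // For NHWC: heightIndex == 1, widthIndex == 2, channelsIndex == 3.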

    unsigned int depthMult      = depthwise ? filterInfo.GetShape()[0] : 1;
    unsigned int channelsInput  = filterInfo.GetShape()[channelsIndex];
    unsigned int channelsOutput = depthwise ? channelsInput * depthMult : filterInfo.GetShape()[0];
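    // e.g. a depthwise convolution over 16 input channels with a depth multiplier of 2
    // yields 16 * 2 == 32 output channels.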

    unsigned int batchSize    = outputInfo0.GetShape()[0];
    unsigned int heightOutput = outputInfo0.GetShape()[heightIndex];
    unsigned int widthOutput  = outputInfo0.GetShape()[widthIndex];
    unsigned int heightInput  = inputInfo0.GetShape()[heightIndex];
    unsigned int widthInput   = inputInfo0.GetShape()[widthIndex];

    unsigned int heightFilter = filterInfo.GetShape()[heightIndex];
    unsigned int widthFilter  = filterInfo.GetShape()[widthIndex];

    unsigned int paddingTop  = data.m_Parameters.m_PadTop;
    unsigned int paddingLeft = data.m_Parameters.m_PadLeft;
    unsigned int yStride     = data.m_Parameters.m_StrideY;
    unsigned int xStride     = data.m_Parameters.m_StrideX;

    // The world's least efficient convolution.
    for (unsigned int batchIdx = 0; batchIdx < batchSize; batchIdx++)
    {
        for (unsigned int cOutput = 0; cOutput < channelsOutput; cOutput++)
        {
            for (unsigned int yOutput = 0; yOutput < heightOutput; yOutput++)
            {
                for (unsigned int xOutput = 0; xOutput < widthOutput; xOutput++)
                {
                    // This loop goes over each output element.
                    AccumulatorType sum = AccumulatorType();

                    // For depthwise, each output channel corresponds to exactly one input channel,
                    // so the loop below runs exactly once; for a normal convolution it walks over
                    // every input channel.
                    for (unsigned int cInput = 0; cInput < (depthwise ? 1 : channelsInput); cInput++)
                    {
                        unsigned int depthwiseMultiplierIdx = 0;
                        if (depthwise)
                        {
                            cInput = cOutput / depthMult;
                            depthwiseMultiplierIdx = cOutput % depthMult;
                        }
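                        // e.g. with depthMult == 2, output channels 4 and 5 both read
                        // input channel 2, with multiplier indices 0 and 1 respectively.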

                        for (unsigned int yFilter = 0; yFilter < heightFilter; yFilter++)
                        {
                            for (unsigned int xFilter = 0; xFilter < widthFilter; xFilter++)
                            {
                                // This loop goes over each input element for each output element.

                                unsigned int filterIndex;

                                // The filter tensor layout (and hence the index arithmetic) depends
                                // on whether this is a depthwise convolution and on the data layout.
                                if (depthwise)
                                {
                                    // Depthwise weights are laid out as [M, I, H, W].
                                    filterIndex = depthwiseMultiplierIdx * widthFilter * heightFilter * channelsInput +
                                                  cInput * widthFilter * heightFilter +
                                                  yFilter * widthFilter +
                                                  xFilter;
                                }
                                else if (data.m_Parameters.m_DataLayout == DataLayout::NHWC)
                                {
                                    // [O, H, W, I] weights to match NHWC data.
                                    filterIndex = cOutput * heightFilter * widthFilter * channelsInput +
                                                  yFilter * widthFilter * channelsInput +
                                                  xFilter * channelsInput +
                                                  cInput;
                                }
                                else
                                {
                                    // [O, I, H, W] weights to match NCHW data.
                                    filterIndex = cOutput * widthFilter * heightFilter * channelsInput +
                                                  cInput  * widthFilter * heightFilter +
                                                  yFilter * widthFilter +
                                                  xFilter;
                                }
                                AccumulatorType filterValue = filterData[filterIndex] -
                                    boost::numeric_cast<AccumulatorType>(filterOffset);

                                unsigned int yInput = yOutput * yStride + yFilter;
                                unsigned int xInput = xOutput * xStride + xFilter;

                                AccumulatorType inputValue;

                                // Check if we're in the padding.
                                if (yInput < paddingTop || yInput >= heightInput + paddingTop ||
                                    xInput < paddingLeft || xInput >= widthInput + paddingLeft)
                                {
                                    inputValue = AccumulatorType();
                                }
                                else
                                {
                                    // Compute the flat input index for the data layout in use.
                                    unsigned int inputIndex;
                                    if (data.m_Parameters.m_DataLayout == DataLayout::NHWC)
                                    {
                                        inputIndex = batchIdx * heightInput * widthInput * channelsInput +
                                                     (yInput - paddingTop) * widthInput * channelsInput +
                                                     (xInput - paddingLeft) * channelsInput +
                                                     cInput;
                                    }
                                    else
                                    {
                                        inputIndex = batchIdx * widthInput * heightInput * channelsInput +
                                                     widthInput * heightInput * cInput +
                                                     widthInput * (yInput - paddingTop) +
                                                     xInput - paddingLeft;
                                    }
                                    inputValue = inputData[inputIndex] -
                                        boost::numeric_cast<AccumulatorType>(inputOffset);
                                }
                                sum += filterValue * inputValue;
                            }
                        }
                    }

                    if (data.m_Parameters.m_BiasEnabled)
                    {
                        sum += biasData[cOutput];
                    }

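                    // Requantize: scale the accumulator back into the output's quantized
                    // domain. Worked example (illustrative values): with inputScale == 0.5f,
                    // filterScale == 0.25f and outputScale == 1.0f, the multiplier is 0.125,
                    // so an accumulator of 1000 with outputOffset == 10 becomes
                    // round(0.125 * 1000) + 10 == 135.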
                    if (outputScale != 0.0f)
                    {
                        float multiplier = (inputScale * filterScale) / outputScale;
                        // Apply the multiplier to sum, but do so using some quantized arithmetic which is consistent
                        // with the AndroidNN CPU implementation. This should be (roughly) equivalent to:
                        //  sum = std::round(multiplier * sum + outputOffset);
                        sum = boost::numeric_cast<AccumulatorType>(
                                QuantizedMultiplierSmallerThanOne(multiplier) * boost::numeric_cast<int32_t>(sum))
                            + boost::numeric_cast<AccumulatorType>(outputOffset);
                        // Clamp to the representable range of an unsigned 8-bit quantized value.
                        sum = std::min<AccumulatorType>(std::max<AccumulatorType>(sum, 0), 255);
                    }

                    // Compute the flat output index for the data layout in use.
                    unsigned int outputIndex;
                    if (data.m_Parameters.m_DataLayout == DataLayout::NHWC)
                    {
                        outputIndex = batchIdx * heightOutput * widthOutput * channelsOutput +
                                      yOutput * widthOutput * channelsOutput +
                                      xOutput * channelsOutput +
                                      cOutput;
                    }
                    else
                    {
                        outputIndex = batchIdx * widthOutput * heightOutput * channelsOutput +
                                      widthOutput * heightOutput * cOutput +
                                      widthOutput * yOutput +
                                      xOutput;
                    }
                    outputData[outputIndex] = boost::numeric_cast<InputType>(sum);
                }
            }
        }
    }
}
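
// Illustrative call sketch (hedged): the quantized reference workloads, e.g.
// RefConvolution2dUint8Workload, invoke ConvImpl along these lines; the exact
// call site may differ in detail:
//
//   ConvImpl<Convolution2dQueueDescriptor, uint8_t, int32_t, int32_t>(
//       m_Data,
//       inputData,  inputInfo.GetQuantizationScale(),  inputInfo.GetQuantizationOffset(),
//       weightData, weightInfo.GetQuantizationScale(), weightInfo.GetQuantizationOffset(),
//       biasData,
//       outputData, outputInfo.GetQuantizationScale(), outputInfo.GetQuantizationOffset(),
//       weightInfo);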

} // namespace armnn