IVGCVSW-1865 - Support NHWC for Convolution2D (CpuRef)
[platform/upstream/armnn.git] src/backends/reference/workloads/ConvImpl.hpp
//
// Copyright © 2017 Arm Ltd. All rights reserved.
// SPDX-License-Identifier: MIT
//

#pragma once

#include "RefWorkloadUtils.hpp"

#include <armnn/Tensor.hpp>

#include <boost/assert.hpp>
#include <boost/numeric/conversion/cast.hpp>

#include <algorithm>
#include <cmath>
#include <limits>

namespace armnn
{

/// Performs multiplication of an integer with a multiplier which is less than one,
/// using quantized integer arithmetic which is consistent with AndroidNN's CPU executor.
struct QuantizedMultiplierSmallerThanOne
{
public:
    /// Constructs a QuantizedMultiplierSmallerThanOne which will multiply by the given multiplier.
    /// This stores the appropriate integer quantities (derived from the given multiplier) for later use.
    /// The implementation of this function is adapted from Android NN's QuantizeMultiplierSmallerThanOne().
    QuantizedMultiplierSmallerThanOne(float multiplier);

    /// The implementation of this function is adapted from Android NN's MultiplyByQuantizedMultiplierSmallerThanOne().
    int32_t operator*(int32_t rhs) const;

private:
    /// The implementation of this function is adapted from gemmlowp's SaturatingRoundingDoublingHighMul().
    static int32_t SaturatingRoundingDoublingHighMul(int32_t a, int32_t b);

    /// The implementation of this function is adapted from gemmlowp's RoundingDivideByPOT().
    static int32_t RoundingDivideByPOT(int32_t x, int exponent);

    int32_t m_Multiplier;
    int32_t m_RightShift;
};
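
// Illustrative sketch (not part of the original header): conceptually, the constructor
// decomposes the real multiplier into a Q0.31 fixed-point mantissa plus a right shift,
// i.e. multiplier ~= m_Multiplier * 2^-31 * 2^-m_RightShift, with m_Multiplier in [2^30, 2^31).
// For example, for multiplier = 0.25 the stored values would be m_Multiplier == 1 << 30 and
// m_RightShift == 1, so that:
//
//   int32_t y = QuantizedMultiplierSmallerThanOne(0.25f) * 1000;   // y == 250
//
// computes round(0.25 * 1000) entirely in integer arithmetic. This is the pattern ConvImpl
// uses below to requantize its accumulator.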

/// An implementation shared by normal and depthwise convolution.
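/// Filter layout assumptions (inferred from the index arithmetic below): a normal convolution
/// expects [O, I, H, W] weights with NCHW data and [O, H, W, I] weights with NHWC data;
/// a depthwise convolution expects [M, I, H, W] weights, where M is the depth multiplier.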
template<typename ConvData, typename InputType, typename BiasType, typename AccumulatorType>
static void ConvImpl(ConvData data,
                     const InputType* inputData,
                     float inputScale,
                     int32_t inputOffset,
                     const InputType* filterData,
                     float filterScale,
                     int32_t filterOffset,
                     const BiasType* biasData,
                     InputType* outputData,
                     float outputScale,
                     int32_t outputOffset,
                     const TensorInfo& filterInfo,
                     bool depthwise = false)
{
    if (data.m_Parameters.m_BiasEnabled && !biasData)
    {
        throw InvalidArgumentException("Bias is enabled but the bias data is invalid");
    }

    const TensorInfo& inputInfo0  = GetTensorInfo(data.m_Inputs[0]);
    const TensorInfo& outputInfo0 = GetTensorInfo(data.m_Outputs[0]);

    const DataLayoutIndexed dataLayoutIndexed(data.m_Parameters.m_DataLayout);
    const unsigned int channelsIndex = dataLayoutIndexed.GetChannelsIndex();
    const unsigned int heightIndex   = dataLayoutIndexed.GetHeightIndex();
    const unsigned int widthIndex    = dataLayoutIndexed.GetWidthIndex();
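    // For NCHW: channelsIndex == 1, heightIndex == 2, widthIndex == 3.
    // For NHWC: heightIndex == 1, widthIndex == 2, channelsIndex == 3.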

    unsigned int depthMult      = depthwise ? filterInfo.GetShape()[0] : 1;
    unsigned int channelsInput  = filterInfo.GetShape()[channelsIndex];
    unsigned int channelsOutput = depthwise ? channelsInput * depthMult : filterInfo.GetShape()[0];
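    // e.g. a depthwise convolution over 16 input channels with a depth multiplier of 2
    // yields 16 * 2 == 32 output channels.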

    unsigned int batchSize    = outputInfo0.GetShape()[0];
    unsigned int heightOutput = outputInfo0.GetShape()[heightIndex];
    unsigned int widthOutput  = outputInfo0.GetShape()[widthIndex];
    unsigned int heightInput  = inputInfo0.GetShape()[heightIndex];
    unsigned int widthInput   = inputInfo0.GetShape()[widthIndex];

    unsigned int heightFilter = filterInfo.GetShape()[heightIndex];
    unsigned int widthFilter  = filterInfo.GetShape()[widthIndex];

    unsigned int paddingTop  = data.m_Parameters.m_PadTop;
    unsigned int paddingLeft = data.m_Parameters.m_PadLeft;
    unsigned int yStride     = data.m_Parameters.m_StrideY;
    unsigned int xStride     = data.m_Parameters.m_StrideX;

    // The world's least efficient convolution.
    for (unsigned int batchIdx = 0; batchIdx < batchSize; batchIdx++)
    {
        for (unsigned int cOutput = 0; cOutput < channelsOutput; cOutput++)
        {
            for (unsigned int yOutput = 0; yOutput < heightOutput; yOutput++)
            {
                for (unsigned int xOutput = 0; xOutput < widthOutput; xOutput++)
                {
                    // This loop goes over each output element.
                    AccumulatorType sum = AccumulatorType();

                    // For depthwise, each output channel corresponds to exactly one input channel,
                    // so the loop below runs exactly once; for a normal convolution it walks over
                    // every input channel.
                    for (unsigned int cInput = 0; cInput < (depthwise ? 1 : channelsInput); cInput++)
                    {
                        unsigned int depthwiseMultiplierIdx = 0;
                        if (depthwise)
                        {
                            cInput = cOutput / depthMult;
                            depthwiseMultiplierIdx = cOutput % depthMult;
                        }
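                        // e.g. with depthMult == 2, output channels 4 and 5 both read
                        // input channel 2, with multiplier indices 0 and 1 respectively.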

                        for (unsigned int yFilter = 0; yFilter < heightFilter; yFilter++)
                        {
                            for (unsigned int xFilter = 0; xFilter < widthFilter; xFilter++)
                            {
                                // This loop goes over each input element for each output element.

                                unsigned int filterIndex;

                                // The filter tensor layout (and hence the index arithmetic) depends
                                // on whether this is a depthwise convolution and on the data layout.
                                if (depthwise)
                                {
                                    // Depthwise weights are laid out as [M, I, H, W].
                                    filterIndex = depthwiseMultiplierIdx * widthFilter * heightFilter * channelsInput +
                                                  cInput * widthFilter * heightFilter +
                                                  yFilter * widthFilter +
                                                  xFilter;
                                }
                                else if (data.m_Parameters.m_DataLayout == DataLayout::NHWC)
                                {
                                    // [O, H, W, I] weights to match NHWC data.
                                    filterIndex = cOutput * heightFilter * widthFilter * channelsInput +
                                                  yFilter * widthFilter * channelsInput +
                                                  xFilter * channelsInput +
                                                  cInput;
                                }
                                else
                                {
                                    // [O, I, H, W] weights to match NCHW data.
                                    filterIndex = cOutput * widthFilter * heightFilter * channelsInput +
                                                  cInput  * widthFilter * heightFilter +
                                                  yFilter * widthFilter +
                                                  xFilter;
                                }
                                AccumulatorType filterValue = filterData[filterIndex] -
                                    boost::numeric_cast<AccumulatorType>(filterOffset);

                                unsigned int yInput = yOutput * yStride + yFilter;
                                unsigned int xInput = xOutput * xStride + xFilter;

                                AccumulatorType inputValue;

                                // Check if we're in the padding.
                                if (yInput < paddingTop || yInput >= heightInput + paddingTop ||
                                    xInput < paddingLeft || xInput >= widthInput + paddingLeft)
                                {
                                    inputValue = AccumulatorType();
                                }
                                else
                                {
                                    // Compute the flat input index for the data layout in use.
                                    unsigned int inputIndex;
                                    if (data.m_Parameters.m_DataLayout == DataLayout::NHWC)
                                    {
                                        inputIndex = batchIdx * heightInput * widthInput * channelsInput +
                                                     (yInput - paddingTop) * widthInput * channelsInput +
                                                     (xInput - paddingLeft) * channelsInput +
                                                     cInput;
                                    }
                                    else
                                    {
                                        inputIndex = batchIdx * widthInput * heightInput * channelsInput +
                                                     widthInput * heightInput * cInput +
                                                     widthInput * (yInput - paddingTop) +
                                                     xInput - paddingLeft;
                                    }
                                    inputValue = inputData[inputIndex] -
                                        boost::numeric_cast<AccumulatorType>(inputOffset);
                                }
                                sum += filterValue * inputValue;
                            }
                        }
                    }

                    if (data.m_Parameters.m_BiasEnabled)
                    {
                        sum += biasData[cOutput];
                    }

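                    // Requantize: scale the accumulator back into the output's quantized
                    // domain. Worked example (illustrative values): with inputScale == 0.5f,
                    // filterScale == 0.25f and outputScale == 1.0f, the multiplier is 0.125,
                    // so an accumulator of 1000 with outputOffset == 10 becomes
                    // round(0.125 * 1000) + 10 == 135.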
                    if (outputScale != 0.0f)
                    {
                        float multiplier = (inputScale * filterScale) / outputScale;
                        // Apply the multiplier to sum, but do so using some quantized arithmetic which is consistent
                        // with the AndroidNN CPU implementation. This should be (roughly) equivalent to:
                        //  sum = std::round(multiplier * sum + outputOffset);
                        sum = boost::numeric_cast<AccumulatorType>(
                                QuantizedMultiplierSmallerThanOne(multiplier) * boost::numeric_cast<int32_t>(sum))
                            + boost::numeric_cast<AccumulatorType>(outputOffset);
                        // Clamp to the representable range of an unsigned 8-bit quantized value.
                        sum = std::min<AccumulatorType>(std::max<AccumulatorType>(sum, 0), 255);
                    }

                    // Compute the flat output index for the data layout in use.
                    unsigned int outputIndex;
                    if (data.m_Parameters.m_DataLayout == DataLayout::NHWC)
                    {
                        outputIndex = batchIdx * heightOutput * widthOutput * channelsOutput +
                                      yOutput * widthOutput * channelsOutput +
                                      xOutput * channelsOutput +
                                      cOutput;
                    }
                    else
                    {
                        outputIndex = batchIdx * widthOutput * heightOutput * channelsOutput +
                                      widthOutput * heightOutput * cOutput +
                                      widthOutput * yOutput +
                                      xOutput;
                    }
                    outputData[outputIndex] = boost::numeric_cast<InputType>(sum);
                }
            }
        }
    }
}
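
// Illustrative call sketch (hedged): the quantized reference workloads, e.g.
// RefConvolution2dUint8Workload, invoke ConvImpl along these lines; the exact
// call site may differ in detail:
//
//   ConvImpl<Convolution2dQueueDescriptor, uint8_t, int32_t, int32_t>(
//       m_Data,
//       inputData,  inputInfo.GetQuantizationScale(),  inputInfo.GetQuantizationOffset(),
//       weightData, weightInfo.GetQuantizationScale(), weightInfo.GetQuantizationOffset(),
//       biasData,
//       outputData, outputInfo.GetQuantizationScale(), outputInfo.GetQuantizationOffset(),
//       weightInfo);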

} // namespace armnn