//
// Copyright © 2017 Arm Ltd. All rights reserved.
// See LICENSE file in the project root for full license information.
//

#pragma once

#include "RefWorkloadUtils.hpp"

#include <armnn/Tensor.hpp>

#include <boost/assert.hpp>
#include <boost/numeric/conversion/cast.hpp>

#include <algorithm>
#include <cmath>

namespace armnn
{

/// Performs multiplication of an integer with a multiplier which is less than one,
/// using quantized integer arithmetic which is consistent with AndroidNN's CPU executor.
struct QuantizedMultiplierSmallerThanOne
{
public:
    /// Constructs a QuantizedMultiplierSmallerThanOne which will multiply by the given multiplier.
    /// This stores the appropriate integer quantities (derived from the given multiplier) for later use.
    /// The implementation of this function is adapted from Android NN's QuantizeMultiplierSmallerThanOne().
    QuantizedMultiplierSmallerThanOne(float multiplier);

    /// The implementation of this function is adapted from Android NN's MultiplyByQuantizedMultiplierSmallerThanOne().
    int32_t operator*(int32_t rhs) const;

private:
    /// The implementation of this function is adapted from gemmlowp's SaturatingRoundingDoublingHighMul().
    static int32_t SaturatingRoundingDoublingHighMul(int32_t a, int32_t b);

    /// The implementation of this function is adapted from gemmlowp's RoundingDivideByPOT().
    static int32_t RoundingDivideByPOT(int32_t x, int exponent);

    int32_t m_Multiplier;
    int32_t m_RightShift;
};
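
// Illustrative example only, assuming the Android NN / gemmlowp scheme referenced above:
// a real multiplier of 0.5f would be stored as the fixed-point pair
// (multiplier = 1 << 30, right shift = 0), so that
//     QuantizedMultiplierSmallerThanOne(0.5f) * 100
// evaluates SaturatingRoundingDoublingHighMul(100, 1 << 30) == 50 followed by
// RoundingDivideByPOT(50, 0) == 50, i.e. round(0.5f * 100).
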
/// An implementation shared by normal and depthwise convolution.
template<typename ConvData, typename InputType, typename BiasType, typename AccumulatorType>
static void ConvImpl(ConvData data,
                     const InputType* inputData,
                     float inputScale,
                     int32_t inputOffset,
                     const InputType* filterData,
                     float filterScale,
                     int32_t filterOffset,
                     const BiasType* biasData,
                     InputType* outputData,
                     float outputScale,
                     int32_t outputOffset,
                     const TensorInfo& filterInfo,
                     bool depthwise = false)
{
    if (data.m_Parameters.m_BiasEnabled && !biasData)
    {
        throw InvalidArgumentException("Bias is enabled but the bias data is invalid");
    }

    const TensorInfo& inputInfo0  = GetTensorInfo(data.m_Inputs[0]);
    const TensorInfo& outputInfo0 = GetTensorInfo(data.m_Outputs[0]);
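
    // Note on filter layout (as used by the index calculations further down):
    // for depthwise convolution the filter is assumed to be laid out as
    // [depthMultiplier, channelsInput, heightFilter, widthFilter], while for
    // normal convolution it is [channelsOutput, channelsInput, heightFilter, widthFilter].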
    unsigned int depthMult      = depthwise ? filterInfo.GetShape()[0] : 1;
    unsigned int channelsInput  = filterInfo.GetShape()[1];
    unsigned int channelsOutput = depthwise ? channelsInput * depthMult : filterInfo.GetShape()[0];
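
    // All tensors here are assumed to use the NCHW layout, hence shape indices
    // [2] and [3] select the spatial (height/width) dimensions.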
    unsigned int batchSize    = outputInfo0.GetShape()[0];
    unsigned int heightOutput = outputInfo0.GetShape()[2];
    unsigned int widthOutput  = outputInfo0.GetShape()[3];
    unsigned int heightInput  = inputInfo0.GetShape()[2];
    unsigned int widthInput   = inputInfo0.GetShape()[3];

    unsigned int heightFilter = filterInfo.GetShape()[2];
    unsigned int widthFilter  = filterInfo.GetShape()[3];

    unsigned int paddingTop  = data.m_Parameters.m_PadTop;
    unsigned int paddingLeft = data.m_Parameters.m_PadLeft;
    unsigned int yStride     = data.m_Parameters.m_StrideY;
    unsigned int xStride     = data.m_Parameters.m_StrideX;
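
    // Only the top/left padding is needed explicitly: bottom/right padding is
    // implied by the output dimensions, and the range check in the inner loop
    // treats any out-of-range input coordinate as padding (zero).
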
    // The world's least efficient convolution.
    for (unsigned int batchIdx = 0; batchIdx < batchSize; batchIdx++)
    {
        for (unsigned int cOutput = 0; cOutput < channelsOutput; cOutput++)
        {
            for (unsigned int yOutput = 0; yOutput < heightOutput; yOutput++)
            {
                for (unsigned int xOutput = 0; xOutput < widthOutput; xOutput++)
                {
                    // This loop goes over each output element.
                    AccumulatorType sum = AccumulatorType();

                    // For depthwise, each output channel corresponds to exactly one input channel.
                    // For normal, must loop over each input channel.
                    for (unsigned int cInput = 0; cInput < (depthwise ? 1 : channelsInput); cInput++)
                    {
                        unsigned int depthwiseMultiplierIdx = 0;
                        if (depthwise)
                        {
                            cInput = cOutput / depthMult;
                            depthwiseMultiplierIdx = cOutput % depthMult;
                        }
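
                        // With a depth multiplier of M, output channels [c*M, c*M + M) are all
                        // computed from input channel c. Reassigning cInput here is safe because
                        // the surrounding loop only ever executes its body once in the depthwise case.
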
                        for (unsigned int yFilter = 0; yFilter < heightFilter; yFilter++)
                        {
                            for (unsigned int xFilter = 0; xFilter < widthFilter; xFilter++)
                            {
                                // This loop goes over each input element for each output element.

                                unsigned int filterIndex;

                                // The layout of the filter tensor differs between normal and depthwise
                                // convolution, so the index is calculated differently for each.
                                if (depthwise)
                                {
                                    filterIndex = depthwiseMultiplierIdx * widthFilter * heightFilter * channelsInput +
                                                  cInput * widthFilter * heightFilter +
                                                  yFilter * widthFilter +
                                                  xFilter;
                                }
                                else
                                {
                                    filterIndex = cOutput * widthFilter * heightFilter * channelsInput +
                                                  cInput * widthFilter * heightFilter +
                                                  yFilter * widthFilter +
                                                  xFilter;
                                }
                                AccumulatorType filterValue = filterData[filterIndex] -
                                    boost::numeric_cast<AccumulatorType>(filterOffset);

                                unsigned int yInput = yOutput * yStride + yFilter;
                                unsigned int xInput = xOutput * xStride + xFilter;
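
                                // (yInput, xInput) are coordinates in the padded input image; the
                                // padding offsets are subtracted again below when indexing the
                                // unpadded inputData.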
                                AccumulatorType inputValue;

                                // Check if we're in the padding.
                                if (yInput < paddingTop || yInput >= heightInput + paddingTop ||
                                    xInput < paddingLeft || xInput >= widthInput + paddingLeft)
                                {
                                    inputValue = AccumulatorType();
                                }
                                else
                                {
                                    inputValue = inputData[batchIdx * widthInput * heightInput * channelsInput +
                                                           widthInput * heightInput * cInput +
                                                           widthInput * (yInput - paddingTop) +
                                                           xInput - paddingLeft] -
                                        boost::numeric_cast<AccumulatorType>(inputOffset);
                                }

                                sum += filterValue * inputValue;
                            }
                        }
                    }
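
                    // For quantized data the bias is assumed to be pre-quantized with scale
                    // inputScale * filterScale (as in Android NN), so it can be added to the
                    // accumulator directly without rescaling.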
                    if (data.m_Parameters.m_BiasEnabled)
                    {
                        sum += biasData[cOutput];
                    }

                    if (outputScale != 0.0f)
                    {
                        float multiplier = (inputScale * filterScale) / outputScale;
                        // Apply the multiplier to sum, but do so using quantized arithmetic which is consistent
                        // with the AndroidNN CPU implementation. This should be (roughly) equivalent to:
                        //     sum = std::round(multiplier * sum + outputOffset);
                        sum = boost::numeric_cast<AccumulatorType>(
                            QuantizedMultiplierSmallerThanOne(multiplier) * boost::numeric_cast<int32_t>(sum))
                            + boost::numeric_cast<AccumulatorType>(outputOffset);
                        sum = std::min<AccumulatorType>(std::max<AccumulatorType>(sum, 0), 255);
                    }
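
                    // Worked example (illustrative values): with inputScale = 0.05f,
                    // filterScale = 0.1f and outputScale = 0.1f, the multiplier is 0.05f;
                    // an accumulator value of 1000 requantizes to roughly
                    // round(0.05f * 1000) + outputOffset = 50 + outputOffset, and the final
                    // min/max clamp keeps the result within [0, 255], the representable
                    // range of an unsigned 8-bit quantized value.
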
                    outputData[batchIdx * widthOutput * heightOutput * channelsOutput +
                        widthOutput * heightOutput * cOutput +
                        widthOutput * yOutput +
                        xOutput] = boost::numeric_cast<InputType>(sum);
                }
            }
        }
    }
}

} //namespace armnn