// Copyright (C) 2018-2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <cmath>
#include <cstring>
#include <utility>
#include <vector>

#include <gna-api-types-xnn.h>

#include "gna_plugin_log.hpp"
#include "quantized_layer_params.hpp"
#include "quantization.h"
#include "details/caseless.hpp"
#include "graph_tools.hpp"
#include "blob_factory.hpp"
#include "precision_ex.hpp"
#include "gna_layer_info.hpp"

namespace GNAPluginNS {
namespace details {
/**
 * @brief description of quantisation precision
 * @tparam Ip - input precision
 * @tparam Op - output precision
 * @tparam Wp - weights precision
 * @tparam Bp - biases precision
 * @tparam Np - network precision - can be auto generated in future
 */
template <class Ip, class Op, class Wp, class Bp, class Np>
struct QuantDescTmpl {
    using WeightsPrecision = Wp;
    using BiasesPrecision = Bp;

    InferenceEngine::TPrecision<Ip> _Ip;
    InferenceEngine::TPrecision<Op> _Op;
    InferenceEngine::TPrecision<Wp> _Wp;
    InferenceEngine::TPrecision<Bp> _Bp;
    InferenceEngine::TPrecision<Np> _Np;

    QuantDescTmpl() = default;
    QuantDescTmpl(InferenceEngine::TPrecision<Ip> _Ip,
                  InferenceEngine::TPrecision<Op> _Op,
                  InferenceEngine::TPrecision<Wp> _Wp,
                  InferenceEngine::TPrecision<Bp> _Bp,
                  InferenceEngine::TPrecision<Np> _Np) : _Ip(_Ip), _Op(_Op), _Wp(_Wp), _Bp(_Bp), _Np(_Np) {
    }

    InferenceEngine::Precision getInputPrecision() const {
        return _Ip;
    }
    InferenceEngine::Precision getWeightsPrecision() const {
        return _Wp;
    }
    InferenceEngine::Precision getBiasesPrecision() const {
        return _Bp;
    }
    InferenceEngine::Precision getNetPrecision() const {
        return _Np;
    }
    InferenceEngine::Precision getOutputPrecision() const {
        return _Op;
    }
};
#define P_TYPE(X)\
typename InferenceEngine::PrecisionTrait<InferenceEngine::Precision::X>::value_type

#define PRECISION_TYPE(A, B, C, D, E)\
    P_TYPE(A), P_TYPE(B), P_TYPE(C), P_TYPE(D), P_TYPE(E)
struct QuantI16 : public QuantDescTmpl<PRECISION_TYPE(I16, I32, I16, I32, MIXED)> {
    QuantI16() {
        _Np = InferenceEngine::Precision::MIXED;
    }
};
struct QuantI8 : public QuantDescTmpl<P_TYPE(I16), P_TYPE(I32), P_TYPE(I8), intel_compound_bias_t, P_TYPE(MIXED)> {
    QuantI8() {
        _Np = InferenceEngine::Precision::MIXED;
    }
};
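/**
 * Usage sketch (illustrative, not part of the original sources; assumes
 * TPrecision<T> default-constructs to the precision bound to T):
 * @code
 * QuantI16 q16;
 * q16.getInputPrecision();    // I16
 * q16.getOutputPrecision();   // I32
 * q16.getWeightsPrecision();  // I16
 * q16.getNetPrecision();      // MIXED, forced in the constructor
 * @endcode
 * QuantI8 differs by narrowing the weights to I8 and by keeping biases as
 * intel_compound_bias_t, a GNA structure pairing a bias value with a
 * per-output-row weight multiplier.
 */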
template <class A, class B>
struct QuantPair {
    using MandatoryType = A;
    using OptionalType = B;
    static A mandatory() { return A(); }
    static B optional() { return B(); }
};
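/**
 * Sketch of how a pair is consumed (illustrative; mirrors the QuantI8 alias
 * declared at the bottom of this file). The mandatory descriptor drives
 * weightable layers, the optional one drives convolution and scale-shift:
 * @code
 * using I8Pair = QuantPair<QuantI8, QuantI16>;
 * auto w = I8Pair::mandatory();  // QuantI8  - used by quantizeWeightsBiases
 * auto c = I8Pair::optional();   // QuantI16 - used by quantizeWeightsBiasesConv
 * @endcode
 */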
/**
 * @brief tells whether a blob of the given data type must be allocated even when the source blob is nullptr
 * @tparam T blob data type
 * @return true if the blob has to be allocated unconditionally
 */
template <class T>
inline bool shouldAlwaysAllocate() {
    return false;
}

template <>
inline bool shouldAlwaysAllocate<intel_compound_bias_t>() {
    return true;
}
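/**
 * Dispatch example (illustrative; the rationale for the specialisation is a
 * presumption based on the compound-bias layout):
 * @code
 * shouldAlwaysAllocate<int32_t>();               // false - plain I32 biases stay absent
 * shouldAlwaysAllocate<intel_compound_bias_t>(); // true  - compound biases also carry
 *                                                //         per-row multipliers for I8 weights
 * @endcode
 */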
#undef P_TYPE
#undef PRECISION_TYPE
/**
 * @brief designate actual data quantisation functions trait
 */
template <class T>
class Quant {
 public:
    template<class ...Args>
    void operator()(Args && ... args) const { }
};

template<>
class Quant<QuantI16> {
 public:
    template<class ...Args>
    void operator()(Args && ... args) const {
        QuantizeAffine16(std::forward<Args>(args)...);
    }
};

template<>
class Quant<QuantI8> {
 public:
    template<class ...Args>
    void operator()(Args && ... args) const {
        QuantizeAffine8(std::forward<Args>(args)...);
    }
};
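/**
 * The trait makes the quantization routine a compile-time property of the
 * descriptor. A minimal sketch (illustrative; argument list shortened):
 * @code
 * Quant<QuantI16>()(srcW, srcB, dstW, dstB /*, ... *&#47;);  // forwards to QuantizeAffine16
 * Quant<QuantI8>()(srcW, srcB, dstW, dstB /*, ... *&#47;);   // forwards to QuantizeAffine8
 * Quant<SomeOtherDesc>()();                             // primary template: no-op
 * @endcode
 */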
template<class QuantDesc, class QuantFunc>
inline void quantizeWeightsBiases(const QuantDesc & quantDesc,
                                  InferenceEngine::WeightableLayer *wl,
                                  const QuantFunc &fnc,
                                  bool isDiagonal = false) {  // for a diagonal layer the number of weights and biases is significantly smaller
    // allocate a blob for the quantized weights
    auto intWeights =
            make_custom_blob<typename QuantDesc::WeightsPrecision>(InferenceEngine::C, InferenceEngine::SizeVector({wl->_weights->size()}));
    intWeights->allocate();
    if (intWeights->buffer() == nullptr) {
        THROW_GNA_EXCEPTION << InferenceEngine::details::as_status << InferenceEngine::NOT_ALLOCATED
                            << "cannot copy weights for layer: " << wl->name << " of size " << intWeights->byteSize();
    }
    auto getBiasSizeForLayer = [](InferenceEngine::WeightableLayer *wl) {
        if (wl->_biases) {
            return wl->_biases->size();
        }
        // calculating the bias length using the weight dims
        auto & dims = wl->outData.front()->getDims();
        return dims[1];
    };

    using BiasesPrecision = typename QuantDesc::BiasesPrecision;
    auto biasMaker = [&] () {
        InferenceEngine::Blob::Ptr zero;
        if (!wl->_biases && !shouldAlwaysAllocate<BiasesPrecision>()) {
            return zero;
        }
        auto bias = make_custom_blob<BiasesPrecision>(InferenceEngine::C, InferenceEngine::SizeVector({
            getBiasSizeForLayer(wl)
        }));
        bias->allocate();
        if (bias->buffer() == nullptr) {
            THROW_GNA_EXCEPTION << InferenceEngine::details::as_status << InferenceEngine::NOT_ALLOCATED
                                << "cannot copy bias for layer: " << wl->name << " of size " << bias->byteSize();
        }
        memset(bias->buffer(), 0, bias->byteSize());

        return bias;
    };
    auto intBiases = biasMaker();
    float input_scale_factor = 1.f;
    if (InferenceEngine::CNNNetHasPrevLayer(wl)) {
        auto quantDataForInputLayer =
            InferenceEngine::getInjectedData<QuantizedLayerParams>(*InferenceEngine::CNNNetPrevLayer(wl).get());
        input_scale_factor = quantDataForInputLayer->_dst_quant.scale;
        if (std::isnan(input_scale_factor) ||
            std::isinf(input_scale_factor)) {
            THROW_IE_EXCEPTION << "Unsupported input scale factor value " << input_scale_factor;
        }
    }
    if (wl->outData[0]->getDims().size() < 2) {
        THROW_IE_EXCEPTION << "Unsupported output dims size for " << wl->name << ", should be > 1, but got " << wl->outData[0]->getDims().size();
    }
    if (wl->insData[0].lock().get()->getDims().size() < 2) {
        THROW_IE_EXCEPTION << "Unsupported input dims size for " << wl->name << ", should be > 1, but got " << wl->insData[0].lock().get()->getDims().size();
    }
    uint32_t num_rows = isDiagonal ? 1 : wl->outData[0]->getDims()[1];
    uint32_t num_columns = wl->insData[0].lock().get()->getDims()[1];

    if (wl->type == "AffineFilter") {
        // for the affine filter layer the insData size is not equal to the actual size coded in the input layer
        num_columns = wl->_weights->size() / num_rows;
    }

    if (isDiagonal) {
        std::swap(num_rows, num_columns);
    }

    uint32_t num_rows_padded = num_rows;
    uint32_t num_columns_padded = num_columns;
    // TODO: replace this with a fixed-scale quantizer

    auto quantData = InferenceEngine::getInjectedData<QuantizedLayerParams>(*wl);

    fnc(wl->_weights->buffer().as<float *>(),
        wl->_biases ? wl->_biases->buffer().as<float *>() : nullptr,
        intWeights->buffer(),
        intBiases ? intBiases->buffer() : static_cast<BiasesPrecision *>(nullptr),
        input_scale_factor,
        &quantData->_weights_quant.scale,
        &quantData->_dst_quant.scale,
        num_rows,
        num_columns,
        num_rows_padded,
        num_columns_padded);

    wl->_weights = intWeights;
    wl->_biases = intBiases;

    /**
     * correcting precision for outData
     */
    wl->precision = quantDesc.getWeightsPrecision();
    for (auto &&outData : wl->outData) {
        outData->setPrecision(quantDesc.getOutputPrecision());
    }
}
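/**
 * Typical call site (this is what DataQuantizer<Desc, WeightableLayer *>
 * below effectively does; the conv variant that follows is called the same
 * way, just without the isDiagonal flag):
 * @code
 * InferenceEngine::WeightableLayer *wl = ...;  // FP32 weights/biases attached
 * quantizeWeightsBiases(QuantI16(), wl, Quant<QuantI16>());
 * // wl->_weights / wl->_biases now hold I16/I32 data, and
 * // getInjectedData<QuantizedLayerParams>(*wl) holds the computed scale factors
 * @endcode
 */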
template<class QuantDesc, class QuantFunc>
inline void quantizeWeightsBiasesConv(const QuantDesc & quantDesc,
                                      InferenceEngine::WeightableLayer *conv,
                                      const QuantFunc &fnc) {
    // allocate a blob for the quantized weights
    auto intWeights =
            make_custom_blob<typename QuantDesc::WeightsPrecision>(InferenceEngine::C, InferenceEngine::SizeVector({conv->_weights->size()}));
    intWeights->allocate();
    if (intWeights->buffer() == nullptr) {
        THROW_GNA_EXCEPTION << InferenceEngine::details::as_status << InferenceEngine::NOT_ALLOCATED
                            << "cannot copy weights for layer: " << conv->name << " of size " << intWeights->byteSize();
    }
    auto getBiasSizeForLayer = [](InferenceEngine::WeightableLayer *wl) {
        if (wl->_biases) {
            return wl->_biases->size();
        }
        // calculating the bias length using the weight dims
        auto & dims = wl->outData.front()->getDims();
        return dims[1];
    };

    using BiasesPrecision = typename QuantDesc::BiasesPrecision;
    auto biasMaker = [&] () {
        InferenceEngine::Blob::Ptr zero;
        if (!conv->_biases && !shouldAlwaysAllocate<BiasesPrecision>()) {
            return zero;
        }
        auto bias = make_custom_blob<BiasesPrecision>(InferenceEngine::C, InferenceEngine::SizeVector({
            getBiasSizeForLayer(conv)
        }));
        bias->allocate();
        if (bias->buffer() == nullptr) {
            THROW_GNA_EXCEPTION << InferenceEngine::details::as_status << InferenceEngine::NOT_ALLOCATED
                                << "cannot copy bias for layer: " << conv->name << " of size " << bias->byteSize();
        }
        memset(bias->buffer(), 0, bias->byteSize());

        return bias;
    };
    auto intBiases = biasMaker();
    float input_scale_factor = 1.f;
    if (InferenceEngine::CNNNetHasPrevLayer(conv)) {
        auto quantDataForInputLayer =
            InferenceEngine::getInjectedData<QuantizedLayerParams>(*InferenceEngine::CNNNetPrevLayer(conv).get());
        input_scale_factor = quantDataForInputLayer->_dst_quant.scale;
        if (std::isnan(input_scale_factor) ||
            std::isinf(input_scale_factor)) {
            THROW_IE_EXCEPTION << "Unsupported input scale factor value " << input_scale_factor;
        }
    }
    if (conv->outData[0]->getDims().size() < 2) {
        THROW_IE_EXCEPTION << "Unsupported output dims size for " << conv->name << ", should be > 1, but got " << conv->outData[0]->getDims().size();
    }
    if (conv->insData[0].lock().get()->getDims().size() < 2) {
        THROW_IE_EXCEPTION << "Unsupported input dims size for " << conv->name << ", should be > 1, but got " << conv->insData[0].lock().get()->getDims().size();
    }
    auto inputData = conv->insData[0].lock();

    uint32_t num_rows = getBiasSizeForLayer(conv);
    uint32_t num_columns = conv->_weights->size() / num_rows;

    uint32_t num_rows_padded = num_rows;
    uint32_t num_columns_padded = num_columns;
    // TODO: replace this with a fixed-scale quantizer

    auto quantData = InferenceEngine::getInjectedData<QuantizedLayerParams>(*conv);

    fnc(conv->_weights->buffer().as<float *>(),
        conv->_biases ? conv->_biases->buffer().as<float *>() : nullptr,
        intWeights->buffer(),
        intBiases ? intBiases->buffer() : static_cast<BiasesPrecision *>(nullptr),
        input_scale_factor,
        &quantData->_weights_quant.scale,
        &quantData->_dst_quant.scale,
        num_rows,
        num_columns,
        num_rows_padded,
        num_columns_padded);

    conv->_weights = intWeights;
    conv->_biases = intBiases;

    /**
     * correcting precision for outData
     */
    conv->precision = quantDesc.getWeightsPrecision();
    for (auto &&outData : conv->outData) {
        outData->setPrecision(quantDesc.getOutputPrecision());
    }
}
class DataQuantizerBase {
 public:
    explicit DataQuantizerBase(float scaleFactor) : scaleFactor(scaleFactor) {
    }

 protected:
    float scaleFactor = 1.0f;
};
/**
 * Helper class that uses partial specialisation on the Layer type
 */
template<class Desc, class Layer>
class DataQuantizer : public DataQuantizerBase {
 public:
    explicit DataQuantizer(float scaleFactor) : DataQuantizerBase(scaleFactor) {}

    bool operator()(Layer cnnLayer) const {
        // fallback for layer types without a dedicated specialisation: nothing is quantized
        return false;
    }
};
template<class Desc>
class DataQuantizer<Desc, InferenceEngine::CNNLayer *> : public DataQuantizerBase {
 public:
    explicit DataQuantizer(float scaleFactor) : DataQuantizerBase(scaleFactor) {}

    bool operator()(InferenceEngine::CNNLayer *cnnLayer) const {
        for (auto &&outData : cnnLayer->outData) {
            outData->setPrecision(Desc::mandatory().getOutputPrecision());
        }
        // set the scale factor for input layers
        auto quantData = InferenceEngine::getInjectedData<QuantizedLayerParams>(*cnnLayer);
        if (cnnLayer->insData.empty()) {
            for (auto &&outData : cnnLayer->outData) {
                outData->setPrecision(Desc::mandatory().getInputPrecision());
            }
        }
        if (LayerInfo(*cnnLayer).isActivation() ||
            LayerInfo(*cnnLayer).isCopy()) {
            // the precision of activation layers is always equal to the input precision
            for (auto &&outData : cnnLayer->outData) {
                outData->setPrecision(Desc::mandatory().getInputPrecision());
            }
        }
        cnnLayer->precision = Desc::mandatory().getInputPrecision();

        return true;
    }
};
template<class Desc>
class DataQuantizer<Desc, InferenceEngine::SplitLayer *> : public DataQuantizer<Desc, InferenceEngine::CNNLayer *> {
    using base = DataQuantizer<Desc, InferenceEngine::CNNLayer *>;
 public:
    explicit DataQuantizer(float scaleFactor) : base(scaleFactor) {}

    bool operator()(InferenceEngine::SplitLayer *splitLayer) const {
        base::operator()(splitLayer);
        // the split layer doesn't change its data at all
        for (auto &&outData : splitLayer->outData) {
            outData->setPrecision(Desc::mandatory().getInputPrecision());
        }
        return true;
    }
};
template<class Desc>
class DataQuantizer<Desc, InferenceEngine::ConcatLayer *> : public DataQuantizer<Desc, InferenceEngine::CNNLayer *> {
    using base = DataQuantizer<Desc, InferenceEngine::CNNLayer *>;
 public:
    explicit DataQuantizer(float scaleFactor) : base(scaleFactor) {}

    bool operator()(InferenceEngine::ConcatLayer *concatLayer) const {
        base::operator()(concatLayer);
        for (auto &&outData : concatLayer->outData) {
            outData->setPrecision(Desc::mandatory().getInputPrecision());
        }
        return true;
    }
};
template<class Desc>
class DataQuantizer<Desc, InferenceEngine::CropLayer *> : public DataQuantizer<Desc, InferenceEngine::CNNLayer *> {
    using base = DataQuantizer<Desc, InferenceEngine::CNNLayer *>;
 public:
    explicit DataQuantizer(float scaleFactor) : base(scaleFactor) {}

    bool operator()(InferenceEngine::CropLayer *cropLayer) const {
        base::operator()(cropLayer);
        for (auto &&outData : cropLayer->outData) {
            outData->setPrecision(Desc::mandatory().getInputPrecision());
        }
        return true;
    }
};
template<class Desc>
class DataQuantizer<Desc, InferenceEngine::ReshapeLayer *> : public DataQuantizer<Desc, InferenceEngine::CNNLayer *> {
    using base = DataQuantizer<Desc, InferenceEngine::CNNLayer *>;
 public:
    explicit DataQuantizer(float scaleFactor) : base(scaleFactor) {}

    bool operator()(InferenceEngine::ReshapeLayer *reshapeLayer) const {
        base::operator()(reshapeLayer);
        // the reshape layer doesn't change its data at all
        for (auto &&outData : reshapeLayer->outData) {
            outData->setPrecision(Desc::mandatory().getInputPrecision());
        }
        return true;
    }
};
template<class Desc>
class DataQuantizer<Desc, InferenceEngine::WeightableLayer *> : public DataQuantizerBase {
 public:
    explicit DataQuantizer(float scaleFactor) : DataQuantizerBase(scaleFactor) {}

    bool operator()(InferenceEngine::WeightableLayer *wl) const {
        quantizeWeightsBiases<typename Desc::MandatoryType>(Desc::mandatory(), wl, Quant<typename Desc::MandatoryType>());
        return true;
    }
};
template<class Desc>
class DataQuantizer<Desc, InferenceEngine::ConvolutionLayer *> : public DataQuantizerBase {
 public:
    explicit DataQuantizer(float scaleFactor) : DataQuantizerBase(scaleFactor) {}

    bool operator()(InferenceEngine::WeightableLayer *wl) const {
        quantizeWeightsBiasesConv<typename Desc::OptionalType>(Desc::optional(), wl, Quant<typename Desc::OptionalType>());
        return true;
    }
};
template<class Desc>
class DataQuantizer<Desc, InferenceEngine::ScaleShiftLayer *> : public DataQuantizerBase {
 public:
    explicit DataQuantizer(float scaleFactor) : DataQuantizerBase(scaleFactor) {}

    bool operator()(InferenceEngine::ScaleShiftLayer *wl) const {
        quantizeWeightsBiases<typename Desc::OptionalType>(Desc::optional(), wl, Quant<typename Desc::OptionalType>(), true);
        return true;
    }
};
} // namespace details
template<class Desc>
class LayersQuantizer : public details::DataQuantizerBase {
 public:
    explicit LayersQuantizer(float scaleFactor) : DataQuantizerBase(scaleFactor) {}

    template<class T>
    bool operator()(T input) const {
        return details::DataQuantizer<Desc, T>(scaleFactor)(input);
    }
};
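/**
 * Usage sketch (illustrative; layer pointers are hypothetical). Dispatch is
 * static on T, so callers pass the concrete layer pointer type to pick the
 * matching DataQuantizer specialisation:
 * @code
 * LayersQuantizer<QuantI8> quantizer(inputScaleFactor);
 * quantizer(weightableLayerPtr);   // -> quantizeWeightsBiases(...)
 * quantizer(convolutionLayerPtr);  // -> quantizeWeightsBiasesConv(...)
 * quantizer(plainCnnLayerPtr);     // -> precision fix-up only
 * @endcode
 */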
using QuantI16 = details::QuantPair<details::QuantI16, details::QuantI16>;
using QuantI8 = details::QuantPair<details::QuantI8, details::QuantI16>;

} // namespace GNAPluginNS