From: Vladimir Paramuzov
Date: Thu, 3 Sep 2020 14:41:29 +0000 (+0300)
Subject: [IE CLDNN] Fallback to FP16 for non-quantized layers in quantized FP16+INT8 IR (...
X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=b976782c70616f7202cc9eae2ab2c90e162ceb78;p=platform%2Fupstream%2Fdldt.git

[IE CLDNN] Fallback to FP16 for non-quantized layers in quantized FP16+INT8 IR (#941)
---

diff --git a/inference-engine/include/cldnn/cldnn_config.hpp b/inference-engine/include/cldnn/cldnn_config.hpp
index 5397af3..dcb2efd 100644
--- a/inference-engine/include/cldnn/cldnn_config.hpp
+++ b/inference-engine/include/cldnn/cldnn_config.hpp
@@ -59,9 +59,12 @@ DECLARE_CLDNN_CONFIG_KEY(GRAPH_DUMPS_DIR);
 DECLARE_CLDNN_CONFIG_KEY(SOURCES_DUMPS_DIR);
 
 /**
-* @brief This key turns usage of int8 optimizations and qunatized models on.
+* @brief This key enables FP16 precision for quantized models.
+* By default the model is converted to FP32 precision before running LPT. If this key is enabled (default), then non-quantized layers
+* will be converted back to FP16 after LPT, which might improve the performance if a model has a lot of compute operations in
+* the non-quantized path. This key has no effect if the current device doesn't have INT8 optimization capabilities.
 */
-DECLARE_CLDNN_CONFIG_KEY(INT8_ENABLED);
+DECLARE_CLDNN_CONFIG_KEY(ENABLE_FP16_FOR_QUANTIZED_MODELS);
 
 /**
 * @brief This key should be set to correctly handle NV12 input without pre-processing.
diff --git a/inference-engine/src/cldnn_engine/cldnn_config.cpp b/inference-engine/src/cldnn_engine/cldnn_config.cpp
index 139c295..cd685ed 100644
--- a/inference-engine/src/cldnn_engine/cldnn_config.cpp
+++ b/inference-engine/src/cldnn_engine/cldnn_config.cpp
@@ -189,6 +189,14 @@ void Config::UpdateFromMap(const std::map<std::string, std::string>& configMap)
             } else {
                 THROW_IE_EXCEPTION << NOT_FOUND_str << "Unsupported NV12 flag value: " << val;
             }
+        } else if (key.compare(CLDNNConfigParams::KEY_CLDNN_ENABLE_FP16_FOR_QUANTIZED_MODELS) == 0) {
+            if (val.compare(PluginConfigParams::YES) == 0) {
+                enable_fp16_for_quantized_models = true;
+            } else if (val.compare(PluginConfigParams::NO) == 0) {
+                enable_fp16_for_quantized_models = false;
+            } else {
+                THROW_IE_EXCEPTION << NOT_FOUND_str << "Unsupported KEY_CLDNN_ENABLE_FP16_FOR_QUANTIZED_MODELS flag value: " << val;
+            }
         } else {
             THROW_IE_EXCEPTION << NOT_FOUND_str << "Unsupported property key by plugin: " << key;
         }
@@ -228,6 +236,11 @@ void Config::adjustKeyMapValues() {
     else
         key_config_map[CLDNNConfigParams::KEY_CLDNN_NV12_TWO_INPUTS] = PluginConfigParams::NO;
 
+    if (enable_fp16_for_quantized_models)
+        key_config_map[CLDNNConfigParams::KEY_CLDNN_ENABLE_FP16_FOR_QUANTIZED_MODELS] = PluginConfigParams::YES;
+    else
+        key_config_map[CLDNNConfigParams::KEY_CLDNN_ENABLE_FP16_FOR_QUANTIZED_MODELS] = PluginConfigParams::NO;
+
     {
         std::string qp = "0";
         switch (queuePriority) {
diff --git a/inference-engine/src/cldnn_engine/cldnn_config.h b/inference-engine/src/cldnn_engine/cldnn_config.h
index 9acd9d5..8bc782e 100644
--- a/inference-engine/src/cldnn_engine/cldnn_config.h
+++ b/inference-engine/src/cldnn_engine/cldnn_config.h
@@ -27,6 +27,7 @@ struct Config {
         enableDynamicBatch(false),
         enableInt8(true),
         nv12_two_inputs(false),
+        enable_fp16_for_quantized_models(true),
         queuePriority(cldnn::priority_mode_types::disabled),
         queueThrottle(cldnn::throttle_mode_types::disabled),
         max_dynamic_batch(1),
@@ -49,6 +50,7 @@ struct Config {
     bool enableDynamicBatch;
     bool enableInt8;
    bool nv12_two_inputs;
+    bool enable_fp16_for_quantized_models;
     cldnn::priority_mode_types queuePriority;
     cldnn::throttle_mode_types queueThrottle;
     int max_dynamic_batch;
diff --git a/inference-engine/src/cldnn_engine/cldnn_program.cpp b/inference-engine/src/cldnn_engine/cldnn_program.cpp
index 14a4646..fa57a9d 100644
--- a/inference-engine/src/cldnn_engine/cldnn_program.cpp
+++ b/inference-engine/src/cldnn_engine/cldnn_program.cpp
@@ -78,6 +78,7 @@
 #include
 #include
 #include
+#include
 #include "cldnn_infer_request.h"
 #include
 #include "caseless.hpp"
@@ -98,6 +99,176 @@ using namespace InferenceEngine;
 using namespace InferenceEngine::details;
 
+namespace {
+
+std::vector<CNNLayerPtr> BFSSort(const ICNNNetwork& network) {
+    std::vector<CNNLayerPtr> ordered;
+    std::unordered_set<std::string> used;
+
+    OutputsDataMap outputs;
+    network.getOutputsInfo(outputs);
+
+    InputsDataMap inputs;
+    network.getInputsInfo(inputs);
+
+    auto get_consumers = [](const CNNLayerPtr& node) -> std::vector<CNNLayerPtr> {
+        std::vector<CNNLayerPtr> consumers;
+        for (const auto & output : node->outData) {
+            for (const auto &consumer : getInputTo(output)) {
+                consumers.push_back(consumer.second);
+            }
+        }
+        return consumers;
+    };
+    auto bfs = [&used, &ordered, &get_consumers](const CNNLayerPtr& start_node, bool traverse_via_outputs = false) {
+        if (!start_node) return;
+        std::deque<CNNLayerPtr> q;
+        q.push_front(start_node);
+        while (!q.empty()) {
+            auto node = q.front();
+            q.pop_front();
+            if (used.insert(node->name).second) {
+                ordered.push_back(node);
+            }
+
+            // Traverse via inputs
+            for (const auto & input : node->insData) {
+                auto locked_input = input.lock();
+                if (!locked_input) {
+                    THROW_IE_EXCEPTION << "insData for " << node->name << " is not valid.";
+                }
+                if (auto next_node = getCreatorLayer(locked_input).lock()) {
+                    if (!used.count(next_node->name)) {
+                        // Check that all consumers were used
+                        bool all_consumers_used(true);
+                        for (const auto & consumer : get_consumers(next_node)) {
+                            if (!used.count(consumer->name)) all_consumers_used = false;
+                        }
+                        if (all_consumers_used) {
+                            q.push_front(next_node);
+                        }
+                    }
+                }
+            }
+
+            // Traverse via outputs
+            if (traverse_via_outputs) {
+                for (const auto &consumer : get_consumers(node)) {
+                    if (!used.count(consumer->name)) {
+                        q.push_front(consumer);
+                    }
+                }
+            }
+        }
+    };
+
+    // First we run bfs starting from outputs, which provides a deterministic graph traversal
+    for (const auto & output : outputs) {
+        if (!used.count(output.first)) {
+            bfs(getCreatorLayer(output.second).lock());
+        }
+    }
+
+    // For cases when the graph has no outputs we start bfs from inputs to ensure a topological sort
+    for (const auto & input : inputs) {
+        const auto data_ptr = input.second->getInputData();
+        for (const auto & consumer : getInputTo(data_ptr))
+            if (!used.count(consumer.first)) {
+                bfs(consumer.second, true);
+            }
+    }
+
+    std::reverse(ordered.begin(), ordered.end());
+    return ordered;
+}
+
+template <Precision::ePrecision PREC_FROM, Precision::ePrecision PREC_TO>
+void convertArrayPrecision(typename PrecisionTrait<PREC_TO>::value_type* dst,
+                           const typename PrecisionTrait<PREC_FROM>::value_type* src, size_t nelem) {
+    using dst_type = typename PrecisionTrait<PREC_TO>::value_type;
+
+    for (size_t i = 0; i < nelem; i++) {
+        dst[i] = static_cast<dst_type>(src[i]);
+    }
+}
+
+template <>
+void convertArrayPrecision<Precision::FP16, Precision::FP32>(float* dst, const short* src, size_t nelem) {
+    InferenceEngine::PrecisionUtils::f16tof32Arrays(dst, src, nelem, 1.0f, 0.0f);
+}
+
+template <>
+void convertArrayPrecision<Precision::FP32, Precision::FP16>(short* dst, const float* src, size_t nelem) {
+    InferenceEngine::PrecisionUtils::f32tof16Arrays(dst, src, nelem, 1.0f, 0.0f);
+}
+
+template <Precision::ePrecision PREC_FROM, Precision::ePrecision PREC_TO>
+Blob::Ptr convertBlobPrecision(const Blob::Ptr& blob) {
+    using from_d_type = typename PrecisionTrait<PREC_FROM>::value_type;
+    using to_d_type = typename PrecisionTrait<PREC_TO>::value_type;
+
+    auto tensor_desc = blob->getTensorDesc();
+    Blob::Ptr new_blob = make_shared_blob<to_d_type>(TensorDesc {PREC_TO, tensor_desc.getDims(), tensor_desc.getLayout()});
+    new_blob->allocate();
+    auto target = new_blob->buffer().as<to_d_type*>();
+    auto source = blob->buffer().as<from_d_type*>();
+    convertArrayPrecision<PREC_FROM, PREC_TO>(target, source, blob->size());
+    return new_blob;
+}
+
+template <Precision::ePrecision PREC_FROM, Precision::ePrecision PREC_TO>
+void convertLayerPrecision(const CNNLayerPtr& layer, bool isOutput = false) {
+    if (layer->type == "TensorIterator" && dynamic_cast<TensorIterator*>(layer.get()) != nullptr) {
+        return;
+    }
+
+    using LayerType = CLDNNPlugin::Program::LayerType;
+
+    if (!isOutput) {
+        for (auto &out_data : layer->outData) {
+            if (PREC_FROM == out_data->getPrecision())
+                out_data->setPrecision(PREC_TO);
+        }
+    }
+
+    for (auto &in_data : layer->insData) {
+        auto data = in_data.lock();
+        if (PREC_FROM == data->getPrecision())
+            data->setPrecision(PREC_TO);
+
+        auto prev_layer = getCreatorLayer(data).lock();
+
+        if (CLDNNPlugin::Program::LayerTypeFromStr(prev_layer->type) == LayerType::ConstantBlob &&
+            CLDNNPlugin::Program::LayerTypeFromStr(layer->type) != LayerType::Quantize) {
+            convertLayerPrecision<PREC_FROM, PREC_TO>(prev_layer, false);
+        }
+    }
+
+    if (layer->precision == PREC_FROM)
+        layer->precision = PREC_TO;
+
+    auto wLayer = dynamic_cast<WeightableLayer*>(layer.get());
+    if (wLayer) {
+        if (wLayer->_weights && wLayer->_weights->getTensorDesc().getPrecision() == PREC_FROM) {
+            wLayer->_weights = convertBlobPrecision<PREC_FROM, PREC_TO>(wLayer->_weights);
+        }
+        if (wLayer->_biases && wLayer->_biases->getTensorDesc().getPrecision() == PREC_FROM) {
+            wLayer->_biases = convertBlobPrecision<PREC_FROM, PREC_TO>(wLayer->_biases);
+        }
+    }
+
+    for (auto &blob : layer->blobs) {
+        auto &data = blob.second;
+        if (nullptr != data) {
+            if (data->getTensorDesc().getPrecision() == PREC_FROM) {
+                data = convertBlobPrecision<PREC_FROM, PREC_TO>(data);
+            }
+        }
+    }
+}
+
+}  // namespace
+
 namespace CLDNNPlugin {
 
 const cldnn::primitive_id Program::m_preProcessTag("_cldnn_input_preprocess");
@@ -242,29 +413,103 @@ Program::Program(InferenceEngine::ICNNNetwork& network, std::shared_ptr
             .add(LayerTransformation::Params(params).setSupportAsymmetricQuantization(false), "FullyConnected")
             .add(LayerTransformation::Params(params).setSupportAsymmetricQuantization(false), "GEMM");
 
-        auto it = details::CNNNetworkIterator(&network);
-        auto end = details::CNNNetworkIterator();
         bool fqFound = false;
         bool allFQareSupported = true;
-        while (it != end) {
-            if (CaselessEq<std::string>()((*it)->type, "FakeQuantize")) {
-                fqFound = true;
-                auto levels = (*it)->GetParamAsUInt("levels");
-                if (levels != 255 && levels != 256) {
-                    allFQareSupported = false;
-                    break;
+        bool baselineIsFP16 = false;
+        {
+            auto it = details::CNNNetworkIterator(&network);
+            auto end = details::CNNNetworkIterator();
+            while (it != end) {
+                auto& layer = *it;
+                if (layer->precision == Precision::FP16) {
+                    baselineIsFP16 = true;
+                }
+
+                if (CaselessEq<std::string>()(layer->type, "FakeQuantize")) {
+                    fqFound = true;
+                    auto levels = layer->GetParamAsUInt("levels");
+                    if (levels != 255 && levels != 256) {
+                        allFQareSupported = false;
+                    }
                 }
+                it++;
             }
-            it++;
         }
 
-        // [WA part 1] Convert quantized FP16 model to FP32 to avoid possible overflow and mixed precision errors
+        // [WA part1] Convert quantized FP16 model to FP32 to avoid possible overflow and mixed precision errors
         if (fqFound && allFQareSupported) {
             NetPass::ConvertPrecision(network, Precision::FP16, Precision::FP32);
         }
 
         LowPrecisionTransformer transformer(transforms);
         transformer.transform(network);
+
+        // [WA part2] Try to find non-quantized layers and convert them back to FP16
+        if (fqFound && baselineIsFP16 && config.enable_fp16_for_quantized_models) {
+            auto layersSorted = BFSSort(network);
+
+            for (auto& layer : layersSorted) {
+                if (layer == nullptr)
+                    continue;
+
+                if (layer->outData.empty() || layer->insData.empty())
+                    continue;
+
+                auto canReduceOutputPrecision = [](const CNNLayerPtr& l) -> bool {
+                    auto type = LayerTypeFromStr(l->type);
+                    // Don't do conversion for outputs
+                    auto next = GetNextLayers(l);
+                    if (next.empty()) {
+                        return false;
+                    }
+
+                    if (type == LayerType::ScaleShift) {
+                        // ScaleShift is supposed to return dequantized values, so in most cases we can convert its output to FP16.
+                        // The exception is when the next node is Eltwise: LPT keeps a modified ScaleShift node on one of the branches,
+                        // and this node doesn't do requantization, thus we have to keep the result in FP32 precision.
+                        for (auto n : next) {
+                            if (LayerTypeFromStr(n->type) == LayerType::Eltwise)
+                                return false;
+                        }
+                        return true;
+                    }
+
+                    if (type == LayerType::Quantize) {
+                        auto in = getCreatorLayer(l->insData[0].lock()).lock();
+                        if (l->outData[0]->getPrecision() == Precision::FP32 && in->type != "Input")
+                            return true;
+                    }
+
+                    return false;
+                };
+
+                auto canReducePrecision = [](const CNNLayerPtr& l) -> bool {
+                    auto layerType = LayerTypeFromStr(l->type);
+
+                    bool result = true;
+                    for (auto& in : l->insData) {
+                        auto input = in.lock();
+                        auto precision = input->getPrecision();
+                        auto in_type = LayerTypeFromStr(getCreatorLayer(input).lock()->type);
+                        if (precision != Precision::FP16 && in_type != LayerType::ConstantBlob) {
+                            result = false;
+                            break;
+                        }
+                    }
+
+                    return result;
+                };
+
+                if (canReducePrecision(layer)) {
+                    convertLayerPrecision<Precision::FP32, Precision::FP16>(layer, GetNextLayers(layer).empty());
+                } else if (canReduceOutputPrecision(layer)) {
+                    for (auto &out_data : layer->outData) {
+                        if (out_data->getPrecision() == Precision::FP32)
+                            out_data->setPrecision(Precision::FP16);
+                    }
+                }
+            }
+        }
     }
 
     NetPass::CombineRNNSeq(network);
diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_b_fs_yx_fsv16.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_b_fs_yx_fsv16.cpp
index c3fb2d8..e180e8a 100644
--- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_b_fs_yx_fsv16.cpp
+++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/eltwise/eltwise_kernel_b_fs_yx_fsv16.cpp
@@ -135,7 +135,7 @@ JitConstants EltwiseKernel_b_fs_yx_fsv16::GetJitConstants(const eltwise_params&
         jit.AddConstant(MakeJitConstant("INPUT_STRIDED", 1));
     }
 
-    jit.Merge(MakeActivationJitConstants(params.activations, GetAccumulatorType(params), "_TYPED"));
+    jit.Merge(MakeActivationJitConstants(params.activations, params.output.GetDType(), "_TYPED"));
 
     if (params.output.Feature().v % 16 != 0)
        jit.AddConstant(MakeJitConstant("LEFTOVERS", params.output.Feature().v % 16));
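
For reference, a minimal usage sketch (not part of the patch) of how an application might toggle the new key through the 2020.x Inference Engine C++ API. The API calls (Core::ReadNetwork, Core::LoadNetwork) and the key/value constants come from the public headers touched above; the model file paths are placeholders.

// Sketch: disable the FP16 fallback so non-quantized layers stay in FP32 after LPT.
#include <map>
#include <string>

#include <ie_core.hpp>
#include <cldnn/cldnn_config.hpp>

int main() {
    InferenceEngine::Core core;
    // "model.xml" / "model.bin" are placeholder paths to a quantized FP16+INT8 IR.
    InferenceEngine::CNNNetwork network = core.ReadNetwork("model.xml", "model.bin");

    // KEY_CLDNN_ENABLE_FP16_FOR_QUANTIZED_MODELS defaults to YES (see Config() above);
    // NO keeps the whole non-quantized path in FP32.
    std::map<std::string, std::string> config = {
        { InferenceEngine::CLDNNConfigParams::KEY_CLDNN_ENABLE_FP16_FOR_QUANTIZED_MODELS,
          InferenceEngine::PluginConfigParams::NO }
    };

    auto exec_network = core.LoadNetwork(network, "GPU", config);
    return 0;
}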