From 28ffbf08578538d5e6ce33a2b48b556ae4e57a5e Mon Sep 17 00:00:00 2001 From: Vladimir Paramuzov Date: Thu, 4 Jun 2020 10:30:46 +0300 Subject: [PATCH] [IE CLDNN] Remove unused fused deps for FQ (#712) Remove unused fused FQ kernel arguments to avoid extra setArg() calls which significantly reduces host overhead --- .../quantize/quantize_kernel_params.h | 29 +++++++++++++- .../clDNN/kernel_selector/core/common/jitter.cpp | 46 ++++++++++++---------- inference-engine/thirdparty/clDNN/src/program.cpp | 26 +++++++++++- 3 files changed, 78 insertions(+), 23 deletions(-) diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/quantize/quantize_kernel_params.h b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/quantize/quantize_kernel_params.h index 58ae821..6bcd271 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/quantize/quantize_kernel_params.h +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/actual_kernels/quantize/quantize_kernel_params.h @@ -1,4 +1,4 @@ -// Copyright (c) 2019 Intel Corporation +// Copyright (c) 2019-2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -117,7 +117,25 @@ struct quantize_fuse_params : fuse_params { , in_scale(in_scale) , in_shift(in_shift) , out_scale(out_scale) - , out_shift(out_shift) { } + , out_shift(out_shift) { + size_t index = 0; + if (has_clamp) { + in_range_lo_idx = index++; + in_range_hi_idx = index++; + } + if (!per_tensor_input_scale) { + in_scale_idx = index++; + } + if (!per_tensor_input_shift && has_pre_shift) { + in_shift_idx = index++; + } + if (!per_tensor_output_scale && has_post_scale) { + out_scale_idx = index++; + } + if (!per_tensor_output_shift && has_post_shift) { + out_shift_idx = index++; + } + } bool scale_shift_opt; bool has_post_scale; @@ -137,6 +155,13 @@ struct quantize_fuse_params : fuse_params { float in_shift; float out_scale; float out_shift; + + size_t in_range_lo_idx; + size_t in_range_hi_idx; + size_t in_scale_idx; + size_t in_shift_idx; + size_t out_scale_idx; + size_t out_shift_idx; }; } // namespace kernel_selector diff --git a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/jitter.cpp b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/jitter.cpp index a1ad008..61cd0c2 100644 --- a/inference-engine/thirdparty/clDNN/kernel_selector/core/common/jitter.cpp +++ b/inference-engine/thirdparty/clDNN/kernel_selector/core/common/jitter.cpp @@ -1,5 +1,5 @@ /* -// Copyright (c) 2019 Intel Corporation +// Copyright (c) 2019-2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -1155,26 +1155,32 @@ JitConstants FusedOpsCodeGenerator::MakeOpJitConstants(const FusedOpsConfigurati // We can't convert inputs to output data type, because it might be equal to UINT8 or INT8, so we convert the data // to the zero tensor's (input_lo) type - std::string tmp_var = in_var; - std::string tmp_type; std::string in_converted = in_var; - if (in_type != desc.tensors[0].GetDType()) { - tmp_type = GetType(desc.tensors[0].GetDType(), vec_size); - tmp_var = out_var + "_tmp"; + Datatype tmp_type = desc.tensors.empty() ? in_type : desc.tensors[0].GetDType(); + std::string tmp_type_str = GetType(tmp_type, vec_size); + std::string tmp_var = out_var + "_tmp"; + + if (in_type != tmp_type) { in_converted = ConvertToType(in_var, desc.tensors[0].GetDType(), vec_size); } - auto post_scale = p->per_tensor_output_scale ? Broadcast(std::to_string(p->out_scale), desc.tensors[0].GetDType(), vec_size) : GetInputVarName(6); - auto post_shift = p->per_tensor_output_shift ? Broadcast(std::to_string(p->out_shift), desc.tensors[0].GetDType(), vec_size) : GetInputVarName(7); - auto pre_scale = p->per_tensor_input_scale ? Broadcast(std::to_string(p->in_scale), desc.tensors[0].GetDType(), vec_size) : GetInputVarName(4); - auto pre_shift = p->per_tensor_input_shift ? Broadcast(std::to_string(p->in_shift), desc.tensors[0].GetDType(), vec_size) : GetInputVarName(5); - auto in_lo = p->per_tensor_input_range ? Broadcast(std::to_string(p->in_lo), desc.tensors[0].GetDType(), vec_size) : GetInputVarName(0); - auto in_hi = p->per_tensor_input_range ? Broadcast(std::to_string(p->in_hi), desc.tensors[0].GetDType(), vec_size) : GetInputVarName(1); + auto post_scale = p->per_tensor_output_scale ? Broadcast(std::to_string(p->out_scale), tmp_type, vec_size) + : GetInputVarName(p->out_scale_idx); + auto post_shift = p->per_tensor_output_shift ? Broadcast(std::to_string(p->out_shift), tmp_type, vec_size) + : GetInputVarName(p->out_shift_idx); + auto pre_scale = p->per_tensor_input_scale ? 
Broadcast(std::to_string(p->in_scale), tmp_type, vec_size) + : GetInputVarName(p->in_scale_idx); + auto pre_shift = p->per_tensor_input_shift ? Broadcast(std::to_string(p->in_shift), tmp_type, vec_size) + : GetInputVarName(p->in_shift_idx); + auto in_lo = p->per_tensor_input_range ? Broadcast(std::to_string(p->in_lo), tmp_type, vec_size) + : GetInputVarName(p->in_range_lo_idx); + auto in_hi = p->per_tensor_input_range ? Broadcast(std::to_string(p->in_hi), tmp_type, vec_size) + : GetInputVarName(p->in_range_hi_idx); if (p->has_clamp) { - op_decls += "\\\n\t" + tmp_type + " " + tmp_var + " = min(max(" + in_lo + ", " + in_converted + "), " + in_hi + ");"; + op_decls += "\\\n\t" + tmp_type_str + " " + tmp_var + " = min(max(" + in_lo + ", " + in_converted + "), " + in_hi + ");"; } else { - op_decls += "\\\n\t" + tmp_type + " " + tmp_var + " = " + in_converted + ";"; + op_decls += "\\\n\t" + tmp_type_str + " " + tmp_var + " = " + in_converted + ";"; } op_decls += "\\\n\t" + tmp_var + " = " + tmp_var + "*" + pre_scale + ";"; if (p->has_pre_shift) @@ -1401,17 +1407,17 @@ std::vector<size_t> FusedOpsCodeGenerator::GetRequiredInputs() const { if (p) { std::vector<size_t> res = {}; if (!p->per_tensor_input_range && p->has_clamp) { - res.push_back(0); - res.push_back(1); + res.push_back(p->in_range_lo_idx); + res.push_back(p->in_range_hi_idx); } if (!p->per_tensor_input_scale) - res.push_back(4); + res.push_back(p->in_scale_idx); if (p->has_pre_shift && !p->per_tensor_input_shift) - res.push_back(5); + res.push_back(p->in_shift_idx); if (p->has_post_scale && !p->per_tensor_output_scale) - res.push_back(6); + res.push_back(p->out_scale_idx); if (p->has_post_shift && !p->per_tensor_output_shift) - res.push_back(7); + res.push_back(p->out_shift_idx); return res; } diff --git a/inference-engine/thirdparty/clDNN/src/program.cpp b/inference-engine/thirdparty/clDNN/src/program.cpp index d5a2f3b..69c8d78 100644 --- a/inference-engine/thirdparty/clDNN/src/program.cpp +++ 
b/inference-engine/thirdparty/clDNN/src/program.cpp @@ -1,5 +1,5 @@ /* -// Copyright (c) 2016-2019 Intel Corporation +// Copyright (c) 2016-2020 Intel Corporation // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -36,6 +36,7 @@ #include "binary_convolution_inst.h" #include "resample_inst.h" #include "reshape_inst.h" +#include "quantize_inst.h" #include "activation_inst.h" #include "scale_inst.h" #include "depth_to_space_inst.h" @@ -906,6 +907,29 @@ void program_impl::fuse_nodes(program_node &fused_node, program_node &peer_node) auto& dep = peer_node.get_dependency(i); if (dep.id() == fused_node.id()) continue; + + if (peer_node.is_type<quantize>()) { + quantize_node& q_node = peer_node.as<quantize>(); + if (q_node.get_scale_shift_opt()) { + bool can_drop_input = false; + + // Drop input range if clamp is not needed + can_drop_input |= (i == 1 || i == 2) && !q_node.get_need_clamp(); + // Drop output range - it's not used in scale-shift-opt quantize kernel + can_drop_input |= i == 3 || i == 4; + // Drop tensor with input scale when we have per-tensor parameter + can_drop_input |= i == 5 && q_node.get_per_tensor_input_scale(); + // Drop tensor with input shift when we have per-tensor parameter or it's not needed at all + can_drop_input |= i == 6 && (!q_node.get_need_pre_shift() || q_node.get_per_tensor_input_shift()); + // Drop tensor with output scale when we have per-tensor parameter or it's not needed at all + can_drop_input |= i == 7 && (!q_node.get_need_post_scale() || q_node.get_per_tensor_output_scale()); + // Drop tensor with output shift when we have per-tensor parameter or it's not needed at all + can_drop_input |= i == 8 && (!q_node.get_need_post_shift() || q_node.get_per_tensor_output_shift()); + + if (can_drop_input) + continue; + } + } fused_node.dependencies.push_back(&dep); local_desc.deps.push_back(dep.id()); dep.users.push_back(&fused_node); -- 2.7.4