// Copyright (c) 2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "layout_optimizer.h"
#include "topology_impl.h"
#include "network_impl.h"
#include "primitive_inst.h"
#include "error_handler.h"

#include "data_inst.h"
#include "reorder_inst.h"
#include "generic_layer.hpp"

#include "eltwise_inst.h"
#include "pooling_inst.h"

#include <sstream>
#include <stdexcept>

using namespace cldnn;
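// Heuristic gate for the Winograd 2x3 (F(2x2, 3x3)) convolution kernels.
// Winograd trades multiplies for adds plus transform overhead: each 2x2
// output tile costs 16 multiplies instead of the direct method's 36, a 2.25x
// reduction, but the transformed buffers consume extra memory, hence the
// input-size caps below.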
bool should_use_winograd_2x3_s1(std::shared_ptr<const convolution> const& prim, layout const& input_layout, layout const& weights_layout, bool output_size_handling_enabled)
{
    //cases when NOT to use winograd
    if (input_layout.size.feature[0] % 64 != 0          //current algorithm is effective when ifm is a multiple of 64
        || weights_layout.size.spatial[0] != 3          //weights have to be 3x3 by definition
        || weights_layout.size.spatial[1] != 3          //weights have to be 3x3 by definition
        || weights_layout.size.batch[0] % 64 != 0       //current algorithm is effective when ofm is a multiple of 64
        || prim->stride != tensor{ 1 }                  //stride has to be 1x1 by definition
        || prim->dilation != tensor{ 1 }                //no support for dilation
        || prim->split() != 1                           //no support for split convolutions
        || (output_size_handling_enabled && prim->with_output_size) //no support for convolutions with user-specified output size
        || (input_layout.count() > 3000000)             //limit max input size as winograd consumes more memory
        || (input_layout.count() < 50000)               //limit min input size as winograd is not effective for small inputs
        || (input_layout.size.spatial[0] < 8 && input_layout.size.spatial[1] < 8)) //disable winograd for small spatial sizes as perf is poor
        return false;

    return true;
}
layout_optimizer::layout_optimizer(bool output_size_handling_enabled)
    : _optimization_attributes()
    , _output_size_handling_enabled(output_size_handling_enabled)
{
}
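// Decides when the plain bfyx format is expected to beat yxfb for a
// convolution. The early-outs encode measured crossover points: yxfb is only
// worth it for f16 data, a batch that is a multiple of 16 (but not 16
// itself), ofm a multiple of 16, and one of the listed kernel-size /
// spatial-extent patterns; e.g. any f32 convolution short-circuits to bfyx.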
bool layout_optimizer::convolution_bfyx_opt(layout const& output_layout, const layout& weights_layout, std::shared_ptr<const convolution> conv)
{
    //A set of rules that define when the bfyx mem format has better performance than yxfb
    if (output_layout.size.batch[0] == 16 || output_layout.size.batch[0] % 16 != 0 ||
        output_layout.data_type != data_types::f16 || weights_layout.size.batch[0] % 16 != 0 ||
        !((weights_layout.size.spatial[0] == 1 && weights_layout.size.spatial[1] == 1) ||
          (weights_layout.size.spatial[0] >= 5 && weights_layout.size.spatial[1] >= 5) ||
          (conv->stride.spatial[0] > 1 && conv->stride.spatial[1] > 1) ||
          (weights_layout.size.feature[0] <= 32 && output_layout.size.spatial[0] < 224 && output_layout.size.spatial[1] < 224) ||
          (weights_layout.size.feature[0] <= 64 && output_layout.size.spatial[0] < 112 && output_layout.size.spatial[1] < 112) ||
          (weights_layout.size.feature[0] <= 128 && output_layout.size.spatial[0] < 56 && output_layout.size.spatial[1] < 56) ||
          (weights_layout.size.feature[0] <= 256 && output_layout.size.spatial[0] < 28 && output_layout.size.spatial[1] < 28) ||
          (weights_layout.size.feature[0] <= 512 && output_layout.size.spatial[0] < 14 && output_layout.size.spatial[1] < 14) ||
          (weights_layout.size.feature[0] <= 1024 && output_layout.size.spatial[0] <= 7 && output_layout.size.spatial[1] <= 7)) ||
        //WA for AgeGender, which has one convolution that is better on yxfb, but due to an additional reorder the overall performance is worse than bfyx
        (output_layout.size.spatial[0] == 82 && output_layout.size.spatial[1] == 82) ||
        (_optimization_attributes.splitted_convolution && output_layout.size.batch[0] == 16) ||
        (!_optimization_attributes.splitted_convolution && output_layout.size.batch[0] >= 128) ||
        _optimization_attributes.bfyx_only_layer)
        return true;

    return false;
}
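// byxf is considered in exactly two situations: f16 1x1 stride-1
// convolutions with zero input offset and ofm aligned to 64, or when the
// Winograd 2x3 path above is applicable.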
bool layout_optimizer::convolution_byxf_opt(layout const& output_layout, const layout& weights_layout, std::shared_ptr<const convolution> conv)
{
    //A set of rules that define when the byxf mem format has better performance
    if ((output_layout.data_type == data_types::f16 &&
        weights_layout.size.spatial[0] == 1 && weights_layout.size.spatial[1] == 1 &&
        output_layout.size.feature[0] % 64 == 0 && weights_layout.size.batch[0] % 64 == 0 &&
        conv->stride.spatial[0] == 1 && conv->stride.spatial[1] == 1 &&
        conv->input_offset.spatial[0] == 0 && conv->input_offset.spatial[1] == 0) ||
        should_use_winograd_2x3_s1(conv, output_layout, weights_layout, _output_size_handling_enabled))
        return true;

    return false;
}
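// Checks whether the whole user subtree (up to 'depth' levels) tolerates
// byxf: format-agnostic primitives (eltwise, pooling) and byxf-friendly
// convolutions recurse further; any other user vetoes the optimization.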
bool layout_optimizer::users_for_convolution_byxf_opt(program_node const& node, uint32_t depth)
{
    //This function checks if byxf optimization can be applied to the required depth of node's users.
    //Setting depth to 1 will check only node's users, depth = 2 checks user's users, etc.
    if (depth == 0)
        return true;

    bool use_byxf = false;
    for (auto& user : node.get_users())
    {
        //primitives that support transitions byxf->other format and other format->byxf are valid for byxf opt
        if (user->type() == cldnn::eltwise::type_id() || user->type() == cldnn::pooling::type_id())
            use_byxf = users_for_convolution_byxf_opt(*user, depth - 1);
        //a convolution that is capable of using byxf and is performant with it is also valid for byxf opt
        else if (user->type() == cldnn::convolution::type_id())
        {
            auto conv_prim = user->as<convolution>().get_primitive();
            if (convolution_byxf_opt(user->calc_output_layout(), user->get_dependency(1).get_output_layout(), conv_prim))
                use_byxf = users_for_convolution_byxf_opt(*user, depth - 1);
        }

        if (!use_byxf)
            break;
    }

    return use_byxf;
}
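// Mirror of the check above, walking dependencies (inputs) instead of users:
// true only if every non-data dependency already produces 'format' within
// 'depth' levels, looking through reorders that the graph optimizer is
// expected to fold away.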
bool layout_optimizer::deps_depth_in_same_format(program_node const& node, const cldnn::format format, uint32_t depth)
{
    //This function checks if the requested format is the same for node's dependencies in the required depth.
    //Setting depth to 1 will check only node's dependencies, depth = 2 checks dep's dependencies, etc.
    if (depth == 0)
        return true;

    bool same_format = false;
    for (auto& dep : node.get_dependencies())
    {
        //skip data and generic_layers
        if (dep->type() == cldnn::data::type_id() || dep->type() == cldnn::generic_layer::type_id())
            continue;

        //if dependency is of type reorder and its format differs, skip it and move to its dependency;
        //further in the graph such reorders could be optimized out
        if (dep->type() == cldnn::reorder::type_id() && dep->get_dependencies().size() == 1 && dep->get_output_layout().format != format)
            same_format = deps_depth_in_same_format(dep->get_dependency(0), format, depth);
        else if (dep->get_output_layout().format == format)
        {
            //if dependency is a reorder in the same format whose input is a primitive supporting different
            //input and output formats (eltwise, pooling), the graph optimizer will remove that reorder and
            //the layout of its dependency will change, so check the dependency instead
            if (dep->type() == cldnn::reorder::type_id() &&
                (dep->get_dependency(0).type() == cldnn::eltwise::type_id() || dep->get_dependency(0).type() == cldnn::pooling::type_id()) &&
                dep->get_dependencies().size() == 1)
                same_format = deps_depth_in_same_format(dep->get_dependency(0), format, depth - 1);
            else
                same_format = deps_depth_in_same_format(*dep, format, depth - 1);
        }

        if (!same_format)
            break;
    }

    return same_format;
}
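// Chooses the preferred layout for one operand of a convolution: biases are
// flattened to a 1x1xNx1 bfyx tensor; inputs pick byxf for batch-1 f16
// convolutions in a byxf-friendly neighborhood, keep the current format for
// i8 and the fsv4/osv16_isv4 cases, fall back to bfyx for transposed or
// output-size-constrained convolutions, and default to yxfb otherwise.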
layout layout_optimizer::get_expected_layout(layout const& current_layout, data_type type, convolution_node const& node, layout const& output_or_weights_layout)
{
    auto prim = node.get_primitive();
    auto expected_tensor = current_layout.size;
    auto expected_data_type = current_layout.data_type;
    auto expected_format = current_layout.format;

    if (type == data_type::weights || type == data_type::bias)
    {
        expected_data_type = output_or_weights_layout.data_type;
    }

    switch (type)
    {
    case data_type::bias: //convolution bias
        expected_tensor = cldnn::tensor(1, 1, static_cast<tensor::value_type>(current_layout.count()), 1);
        expected_format = cldnn::format::bfyx;
        break;

    case data_type::input: //convolution input
    {
        if (current_layout.data_type == data_types::f16 &&
            layout_optimizer::convolution_byxf_opt(current_layout, output_or_weights_layout, prim) &&
            (users_for_convolution_byxf_opt(node, 2) || deps_depth_in_same_format(node, cldnn::format::byxf, 2)) &&
            //TODO: remove this condition once yxfb optimizations are disabled
            current_layout.format != cldnn::format::yxfb &&
            current_layout.size.batch[0] == 1 &&
            prim->dilation == tensor{ 1 } &&
            !node.get_transposed())
        {
            expected_tensor = current_layout.size;
            expected_format = cldnn::format::byxf;
        }
        else if (current_layout.format == format::b_fs_yx_fsv4 ||
                 current_layout.format == format::os_is_yx_osv16_isv4)
        {
            // Nothing to do, keep the current layout.
        }
        else if (current_layout.data_type == data_types::i8)
        {
            expected_tensor = current_layout.size;
            expected_format = current_layout.format; //cldnn::format::byxf_af32;
        }
        else if (layout_optimizer::convolution_bfyx_opt(current_layout, output_or_weights_layout, prim)
            || (_output_size_handling_enabled && prim->with_output_size) ||
            node.get_transposed())
        {
            // commented out due to performance reasons, maybe enable in the future
            /*if (current_layout.data_type == data_types::f32 &&
                current_layout.size.batch[0] % 16 == 0 &&
                current_layout.format == format::bfyx &&
                output_or_weights_layout.size.spatial[0] == 1 && output_or_weights_layout.size.spatial[1] == 1 &&
                prim->stride.spatial[0] == 1 && prim->stride.spatial[1] == 1 &&
                prim->input_offset.spatial[0] == 0 && prim->input_offset.spatial[1] == 0 &&
                !node.get_transposed())
            {
                if (!((current_layout.size.feature[0] % 8) == 0 && (current_layout.size.spatial[0] * current_layout.size.spatial[1]) == 16 &&
                    current_layout.data_padding == padding{ { 0, 0, 0, 0 }, 0 }))
                {
                    expected_tensor = current_layout.size.transform(cldnn::format::bf8_xy16, 1);
                    expected_format = cldnn::format::bf8_xy16;
                }
            }
            else*/
            {
                expected_tensor = current_layout.size;
                expected_format = cldnn::format::bfyx;
            }
        }
        else
        {
            expected_tensor = current_layout.size;
            expected_format = cldnn::format::yxfb;
        }

        break;
    }

    default:
        throw std::runtime_error("Unsupported data type in layout_optimizer::get_expected_layout for convolution primitive");
    }

    return layout(expected_data_type, expected_format, expected_tensor);
}
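// The fully-connected, lstm_gemm, deconvolution and embed overloads below all
// apply the same single rule: only the bias is re-laid-out, flattened into a
// 1x1xNx1 bfyx tensor; e.g. a 4096-element bias becomes tensor(1, 1, 4096, 1).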
layout layout_optimizer::get_expected_layout(layout const& current_layout, data_type type, fully_connected_node const& node, layout const& output_or_weights_layout)
{
    auto prim = node.get_primitive();
    auto expected_tensor = current_layout.size;
    auto expected_data_type = current_layout.data_type;
    auto expected_format = current_layout.format;

    if (type == data_type::weights || type == data_type::bias)
    {
        expected_data_type = output_or_weights_layout.data_type;
    }

    switch (type)
    {
    case data_type::bias: //fc bias
        expected_tensor = cldnn::tensor(1, 1, static_cast<tensor::value_type>(current_layout.count()), 1);
        expected_format = cldnn::format::bfyx;
        break;
    default:
        throw std::runtime_error("Unsupported data type in layout_optimizer::get_expected_layout for fully-connected primitive");
    }

    return layout(expected_data_type, expected_format, expected_tensor);
}
layout layout_optimizer::get_expected_layout(layout const& current_layout, data_type type, lstm_gemm_node const& node, layout const& output_or_weights_layout)
{
    auto prim = node.get_primitive();
    auto expected_tensor = current_layout.size;
    auto expected_data_type = current_layout.data_type;
    auto expected_format = current_layout.format;

    if (type == data_type::weights || type == data_type::bias)
    {
        expected_data_type = output_or_weights_layout.data_type;
    }

    switch (type)
    {
    case data_type::bias: //lstm_gemm bias
        expected_tensor = cldnn::tensor(1, 1, static_cast<tensor::value_type>(current_layout.count()), 1);
        expected_format = cldnn::format::bfyx;
        break;
    default:
        throw std::runtime_error("Unsupported data type in layout_optimizer::get_expected_layout for lstm_gemm primitive");
    }

    return layout(expected_data_type, expected_format, expected_tensor);
}
layout layout_optimizer::get_expected_layout(layout const& current_layout, data_type type, deconvolution_node const& node, layout const& output_or_weights_layout)
{
    auto prim = node.get_primitive();
    auto expected_tensor = current_layout.size;
    auto expected_data_type = current_layout.data_type;
    auto expected_format = current_layout.format;

    if (type == data_type::weights || type == data_type::bias)
    {
        expected_data_type = output_or_weights_layout.data_type;
    }

    switch (type)
    {
    case data_type::bias: //deconvolution bias
        expected_tensor = cldnn::tensor(1, 1, static_cast<tensor::value_type>(current_layout.count()), 1);
        expected_format = cldnn::format::bfyx;
        break;
    default:
        throw std::runtime_error("Unsupported data type in layout_optimizer::get_expected_layout for deconvolution primitive");
    }

    return layout(expected_data_type, expected_format, expected_tensor);
}
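// detection_output only allows optimizing its input: the layout is forced to
// f32 in the incoming format, and asking for a weights or bias layout is
// reported as a usage error.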
layout layout_optimizer::get_expected_layout(layout const& current_layout, data_type type, detection_output_node const& node, layout const& output_or_weights_layout)
{
    auto prim = node.get_primitive();
    auto expected_tensor = current_layout.size;
    auto expected_data_type = data_types::f32;
    auto expected_format = output_or_weights_layout.format;

    if (type != data_type::input)
        CLDNN_ERROR_MESSAGE(prim->id, "detection_output only supports optimization of its output (no weights/biases)");

    return layout(expected_data_type, expected_format, expected_tensor);
}
layout layout_optimizer::get_expected_layout(layout const& current_layout, data_type type, embed_node const& node, layout const& output_or_weights_layout)
{
    auto prim = node.get_primitive();
    auto expected_tensor = current_layout.size;
    auto expected_data_type = current_layout.data_type;
    auto expected_format = current_layout.format;

    if (type == data_type::weights || type == data_type::bias)
    {
        expected_data_type = output_or_weights_layout.data_type;
    }

    switch (type)
    {
    case data_type::bias: //embed bias
        expected_tensor = cldnn::tensor(1, 1, static_cast<tensor::value_type>(current_layout.count()), 1);
        expected_format = cldnn::format::bfyx;
        break;
    default:
        throw std::runtime_error("Unsupported data type in layout_optimizer::get_expected_layout for embed primitive");
    }

    return layout(expected_data_type, expected_format, expected_tensor);
}
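// Returns {reorder, cached}: {nullptr, true} when layouts already match,
// {existing reorder, true} on a cache hit, and {new reorder, false} when a
// primitive was created and still needs to be added to the graph.
// Illustrative call site (a sketch; 'lo', 'topology' and 'add_primitive' are
// hypothetical names, not part of this file):
//
//     auto r = lo.create_reorder_if_needed(in_layout, input_id, expected);
//     if (r.first && !r.second)
//         topology.add_primitive(r.first);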
std::pair<std::shared_ptr<cldnn::reorder>, bool>
layout_optimizer::create_reorder_if_needed(const layout& current_layout, const cldnn::primitive_id& memid, layout const& expected_layout)
{
    if (current_layout != expected_layout)
    {
        cache_key ckey{ memid, expected_layout };
        auto itr = _cached_reorders.find(ckey);
        if (itr != _cached_reorders.end())
            return std::make_pair(itr->second, true);

        auto count = _cached_reorders.size();
        std::stringstream ss;
        ss << "reorder_" << count << "_" << memid;

        auto reorder = std::make_shared<cldnn::reorder>(ss.str(), memid, expected_layout);
        _cached_reorders[ckey] = reorder;
        return std::make_pair(reorder, false);
    }

    return std::make_pair(nullptr, true);
}
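// Same caching scheme as above, but always builds a generic_layer carrying
// the kernel-selector reorder parameters, for weight transformations that a
// plain reorder primitive cannot express.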
std::pair<std::shared_ptr<cldnn::generic_layer>, bool>
layout_optimizer::create_reorder_from_given_source(const cldnn::primitive_id& memid, layout const& expected_layout, const kernel_selector::weights_reorder_params& reorder_params)
{
    cache_key ckey{ memid, expected_layout };
    auto itr = _cached_generic_layers.find(ckey);
    if (itr != _cached_generic_layers.end())
        return std::make_pair(itr->second, true);

    auto count = _cached_generic_layers.size();
    std::stringstream ss;
    ss << "generic_layer_" << count << "_" << memid;

    auto reorder = std::make_shared<cldnn::generic_layer>(ss.str(), memid, expected_layout, reorder_params);
    _cached_generic_layers[ckey] = reorder;
    return std::make_pair(reorder, false);
}
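// Builds the weight-reorder chain, up to two stages: an optional plain
// reorder into the CPU kernel's expected input layout/type, then a
// generic_layer into the final weights layout (a flat bfyx buffer of
// newBufferSize / bytes-per-element elements, or the destination image
// layout when toImageType is set).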
std::vector<std::pair<std::shared_ptr<primitive>, bool>> layout_optimizer::get_generic_layer(
    const kernel_selector::weights_reorder_params& reorder_params,
    primitive_id input_id,
    const layout& old_layout,
    data_type type)
{
    if (reorder_params.engine == kernel_selector::weights_reorder_params::Engine::NONE || type != data_type::weights)
        return {};

    std::vector<std::pair<std::shared_ptr<primitive>, bool>> ret;

    if (reorder_params.engine == kernel_selector::weights_reorder_params::Engine::CPU &&
        reorder_params.cpuKernel != nullptr)
    {
        const auto intermediate_format = from_weights_layout(reorder_params.cpuKernel->GetExpectedInputLayout());
        const auto intermediate_type = from_weights_type(reorder_params.cpuKernel->GetExpectedInputType());
        if (intermediate_format != old_layout.format ||
            intermediate_type != old_layout.data_type)
        {
            const layout intermediate_layout = { intermediate_type, intermediate_format, old_layout.size.transform(intermediate_format, 1) };

            auto reorder = create_reorder_if_needed(old_layout, input_id, intermediate_layout);
            if (reorder.first)
            {
                ret.push_back(reorder);
                input_id = reorder.first->id;
            }
        }
    }

    auto new_dtype = from_weights_type(reorder_params.dtype);
    const auto bpp = data_type_traits::size_of(new_dtype);
    tensor expected_size = { 1, 1, 1, (tensor::value_type)(reorder_params.newBufferSize / bpp) };

    if (reorder_params.toImageType)
        expected_size = old_layout.size;

    layout expected_layout = {
        new_dtype,
        reorder_params.toImageType ? from_weights_layout(reorder_params.destLayout) : format::bfyx, // simple linear format (flattened to the x channel)
        expected_size
    };

    auto reorder = create_reorder_from_given_source(input_id, expected_layout, reorder_params);
    if (reorder.first)
        ret.push_back(reorder);

    return ret;
}
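// Toggles consumed by the bfyx/yxfb heuristics above. Illustrative use
// (assuming the enum is nested in layout_optimizer, as its unqualified use
// below suggests):
//
//     lo.set_optimization_attribute(
//         layout_optimizer::optimization_attributes_type::bfyx_only_layer, 1);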
void layout_optimizer::set_optimization_attribute(optimization_attributes_type attribute, int32_t val)
{
    switch (attribute)
    {
    case optimization_attributes_type::splitted_convolution:
        _optimization_attributes.splitted_convolution = val;
        break;
    case optimization_attributes_type::bfyx_only_layer:
        _optimization_attributes.bfyx_only_layer = val;
        break;
    default:
        throw std::out_of_range("unsupported layout optimization attribute");
    }
}