inference-engine / thirdparty / clDNN / src / layout_optimizer.cpp
/*
// Copyright (c) 2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/

#include "layout_optimizer.h"
#include "topology_impl.h"
#include "network_impl.h"
#include "primitive_inst.h"
#include "error_handler.h"

#include "data_inst.h"
#include "reorder_inst.h"
#include "generic_layer.hpp"
#include <sstream>

#include "eltwise_inst.h"
#include "pooling_inst.h"

using namespace cldnn;

namespace {
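    // Heuristic that gates the winograd 2x3 (stride 1) convolution path: returns true only when the input,
    // weights and convolution parameters match the cases the winograd kernels handle efficiently.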
    bool should_use_winograd_2x3_s1(std::shared_ptr<const convolution> const& prim, layout const& input_layout, layout const& weights_layout, bool output_size_handling_enabled)
    {
        //cases when NOT to use winograd
        if (input_layout.size.feature[0] % 64 != 0          //current algorithm is effective when ifm is a multiple of 64
            || weights_layout.size.spatial[0] != 3          //weights have to be 3x3 by definition
            || weights_layout.size.spatial[1] != 3          //weights have to be 3x3 by definition
            || weights_layout.size.batch[0] % 64 != 0       //current algorithm is effective when ofm is a multiple of 64
            || prim->stride != tensor{ 1 }                  //stride has to be 1x1 by definition
            || prim->dilation != tensor{ 1 }                //no support for dilation
            || prim->split() != 1                           //no support for split convolutions
            || (output_size_handling_enabled && prim->with_output_size) //no support for convolutions with user-specified output size
            || (input_layout.count() > 3000000)             //limit max input size as winograd consumes more memory
            || (input_layout.count() < 50000)               //limit min input size as winograd is not effective for small inputs
            || (input_layout.size.spatial[0] < 8 && input_layout.size.spatial[1] < 8)) //disable winograd for small spatial sizes as perf is poor
        {
            return false;
        }

        return true;
    }
}

layout_optimizer::layout_optimizer(bool output_size_handling_enabled)
    : _optimization_attributes()
    , _output_size_handling_enabled(output_size_handling_enabled)
{
}

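// Returns true when the plain bfyx layout is expected to outperform yxfb for this convolution,
// based on batch size, data type and the weights/output shape heuristics below.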
bool layout_optimizer::convolution_bfyx_opt(layout const& output_layout, const layout& weights_layout, std::shared_ptr<const convolution> conv)
{
    //A set of rules that define when bfyx mem format has better performance than yxfb
    if (output_layout.size.batch[0] == 16 || output_layout.size.batch[0] % 16 != 0 ||
        output_layout.data_type != data_types::f16 || weights_layout.size.batch[0] % 16 != 0 ||
        !((weights_layout.size.spatial[0] == 1 && weights_layout.size.spatial[1] == 1) ||
        (weights_layout.size.spatial[0] >= 5 && weights_layout.size.spatial[1] >= 5) ||
            (conv->stride.spatial[0] > 1 && conv->stride.spatial[1] > 1) ||
            (weights_layout.size.feature[0] <= 32 && output_layout.size.spatial[0] < 224 && output_layout.size.spatial[1] < 224) ||
            (weights_layout.size.feature[0] <= 64 && output_layout.size.spatial[0] < 112 && output_layout.size.spatial[1] < 112) ||
            (weights_layout.size.feature[0] <= 128 && output_layout.size.spatial[0] < 56 && output_layout.size.spatial[1] < 56) ||
            (weights_layout.size.feature[0] <= 256 && output_layout.size.spatial[0] < 28 && output_layout.size.spatial[1] < 28) ||
            (weights_layout.size.feature[0] <= 512 && output_layout.size.spatial[0] < 14 && output_layout.size.spatial[1] < 14) ||
            (weights_layout.size.feature[0] <= 1024 && output_layout.size.spatial[0] <= 7 && output_layout.size.spatial[1] <= 7)) ||
        //WA for AgeGender, which has one convolution that is better on yxfb, but due to an additional reorder the overall performance is worse than bfyx
        (output_layout.size.spatial[0] == 82 && output_layout.size.spatial[1] == 82) ||
        (_optimization_attributes.splitted_convolution && output_layout.size.batch[0] == 16) ||
        (!_optimization_attributes.splitted_convolution && output_layout.size.batch[0] >= 128) ||
        _optimization_attributes.bfyx_only_layer)
        return true;

    return false;
}

bool layout_optimizer::convolution_byxf_opt(layout const& output_layout, const layout& weights_layout, std::shared_ptr<const convolution> conv)
{
    //A set of rules that define when byxf mem format has better performance
    if ((output_layout.data_type == data_types::f16 &&
        weights_layout.size.spatial[0] == 1 && weights_layout.size.spatial[1] == 1 &&
        output_layout.size.feature[0] % 64 == 0 && weights_layout.size.batch[0] % 64 == 0 &&
        conv->stride.spatial[0] == 1 && conv->stride.spatial[1] == 1 &&
        conv->input_offset.spatial[0] == 0 && conv->input_offset.spatial[1] == 0) ||
        //Winograd
        should_use_winograd_2x3_s1(conv, output_layout, weights_layout, _output_size_handling_enabled))
        return true;

    return false;
}

bool layout_optimizer::users_for_convolution_byxf_opt(program_node const& node, uint32_t depth)
{
    //This function checks if the byxf optimization can be applied to the required depth of the node's users.
    //Setting depth to 1 will check only the node's users, depth = 2 checks the users' users, and so on.
    if (depth == 0)
        return true;

    bool use_byxf = false;
    for (auto& user : node.get_users())
    {
        //primitives that support byxf->other format and other format->byxf transitions are valid for byxf opt
        if (user->type() == cldnn::eltwise::type_id() || user->type() == cldnn::pooling::type_id())
            use_byxf = users_for_convolution_byxf_opt(*user, depth - 1);
        //a convolution that is capable of using byxf and is performant is also valid for byxf opt
        else if (user->type() == cldnn::convolution::type_id())
        {
            auto conv_prim = user->as<convolution>().get_primitive();
            if (convolution_byxf_opt(user->calc_output_layout(), user->get_dependency(1).get_output_layout(), conv_prim))
                use_byxf = users_for_convolution_byxf_opt(*user, depth - 1);
            else
            {
                use_byxf = false;
                break;
            }
        }
        else
        {
            use_byxf = false;
            break;
        }
    }
    return use_byxf;
}

bool layout_optimizer::deps_depth_in_same_format(program_node const& node, const cldnn::format format, uint32_t depth)
{
    //This function checks if the requested format is the same for the node's dependencies up to the required depth.
    //Setting depth to 1 will check only the node's dependencies, depth = 2 checks the dependencies' dependencies, and so on.
    if (depth == 0)
        return true;

    bool same_format = false;
    for (auto& dep : node.get_dependencies())
    {
        //skip data and generic_layers
        if (dep->type() == cldnn::data::type_id() || dep->type() == cldnn::generic_layer::type_id())
            continue;

        //if the dependency is a reorder and its format is different, skip it and move to its own dependency;
        //further in the graph such reorders could be optimized out
        if (dep->type() == cldnn::reorder::type_id() && dep->get_dependencies().size() == 1 && dep->get_output_layout().format != format)
            same_format = deps_depth_in_same_format(dep->get_dependency(0), format, depth);
        else if (dep->get_output_layout().format == format)
        {
            //if the dependency is a reorder with the same format, check if its users are primitives that support different input and output formats;
            //if that is true, the graph optimizer will optimize out such a reorder and the layout of its dependency will change
            if (dep->type() == cldnn::reorder::type_id() &&
                (dep->get_dependency(0).type() == cldnn::eltwise::type_id() || dep->get_dependency(0).type() == cldnn::pooling::type_id()) &&
                dep->get_dependencies().size() == 1)
                same_format = deps_depth_in_same_format(dep->get_dependency(0), format, depth - 1);
            else
                same_format = deps_depth_in_same_format(*dep, format, depth - 1);
        }
        else
        {
            same_format = false;
            break;
        }
    }
    return same_format;
}

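// Chooses the expected input/bias layout for a convolution node, using the byxf/bfyx heuristics above
// and falling back to yxfb when none of the special cases apply.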
layout layout_optimizer::get_expected_layout(layout const& current_layout, data_type type, convolution_node const& node, layout const& output_or_weights_layout)
{
    auto prim = node.get_primitive();
    auto expected_tensor = current_layout.size;
    auto expected_data_type = current_layout.data_type;
    auto expected_format = current_layout.format;

    if (type == data_type::weights || type == data_type::bias)
    {
        expected_data_type = output_or_weights_layout.data_type;
    }

    switch (type)
    {
    case data_type::bias: //convolution bias
        expected_tensor = cldnn::tensor(1, 1, static_cast<tensor::value_type>(current_layout.count()), 1);
        expected_format = cldnn::format::bfyx;
        break;

    case data_type::input: //convolution input

        if (current_layout.data_type == data_types::f16 &&
            layout_optimizer::convolution_byxf_opt(current_layout, output_or_weights_layout, prim) &&
            (users_for_convolution_byxf_opt(node, 2) || deps_depth_in_same_format(node, cldnn::format::byxf, 2)) &&
            //TODO: remove this condition once yxfb optimizations are disabled
            current_layout.format != cldnn::format::yxfb &&
            current_layout.size.batch[0] == 1 &&
            prim->dilation == tensor{ 1 } &&
            !node.get_transposed())
        {
            expected_tensor = current_layout.size;
            expected_format = cldnn::format::byxf;
        }
        // IMAD case
        else if (current_layout.format == format::b_fs_yx_fsv4 ||
                 current_layout.format == format::os_is_yx_osv16_isv4)
        {
            // Nothing to do, keep the current format.
        }
        // MMAD case
        else if (current_layout.data_type == data_types::i8)
        {
            expected_tensor = current_layout.size;
            expected_format = current_layout.format; //cldnn::format::byxf_af32;
        }
        else if (layout_optimizer::convolution_bfyx_opt(current_layout, output_or_weights_layout, prim)
            || (_output_size_handling_enabled && prim->with_output_size) ||
            node.get_transposed())
        {
            // commented out due to performance reasons, maybe enable in future
            /*if (current_layout.data_type == data_types::f32 &&
                current_layout.size.batch[0] % 16 == 0 &&
                current_layout.format == format::bfyx &&
                output_or_weights_layout.size.spatial[0] == 1 && output_or_weights_layout.size.spatial[1] == 1 &&
                prim->stride.spatial[0] == 1 && prim->stride.spatial[1] == 1 &&
                prim->input_offset.spatial[0] == 0 && prim->input_offset.spatial[1] == 0 &&
                !node.get_transposed())
            {
                if (!((current_layout.size.feature[0] % 8) == 0 && (current_layout.size.spatial[0] * current_layout.size.spatial[1]) == 16 &&
                    current_layout.data_padding == padding{ { 0,0,0,0 }, 0 }))
                {
                    expected_tensor = current_layout.size.transform(cldnn::format::bf8_xy16, 1);
                    expected_format = cldnn::format::bf8_xy16;
                }
            }
            else*/
            {
                expected_tensor = current_layout.size;
                expected_format = cldnn::format::bfyx;
            }
        }
        else
        {
            expected_tensor = current_layout.size;
            expected_format = cldnn::format::yxfb;
        }

        break;

    default:
        throw std::runtime_error("Unsupported data type in layout_optimizer::get_expected_layout for convolution primitive");
    }

    return layout(expected_data_type, expected_format, expected_tensor);
}

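// For fully connected, only the bias layout is normalized here; any other data_type is rejected.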
layout layout_optimizer::get_expected_layout(layout const& current_layout, data_type type, fully_connected_node const& node, layout const& output_or_weights_layout)
{
    auto prim = node.get_primitive();
    auto expected_tensor = current_layout.size;
    auto expected_data_type = current_layout.data_type;
    auto expected_format = current_layout.format;

    if (type == data_type::weights || type == data_type::bias)
    {
        expected_data_type = output_or_weights_layout.data_type;
    }

    switch (type)
    {
    case data_type::bias: //fc bias
        expected_tensor = cldnn::tensor(1, 1, static_cast<tensor::value_type>(current_layout.count()), 1);
        expected_format = cldnn::format::bfyx;
        break;

    default:
        throw std::runtime_error("Unsupported data type in layout_optimizer::get_expected_layout for fully-connected primitive");
    }

    return layout(expected_data_type, expected_format, expected_tensor);
}

layout layout_optimizer::get_expected_layout(layout const& current_layout, data_type type, lstm_gemm_node const& node, layout const& output_or_weights_layout)
{
    auto prim = node.get_primitive();
    auto expected_tensor = current_layout.size;
    auto expected_data_type = current_layout.data_type;
    auto expected_format = current_layout.format;

    if (type == data_type::weights || type == data_type::bias)
    {
        expected_data_type = output_or_weights_layout.data_type;
    }

    switch (type)
    {
    case data_type::bias:
        expected_tensor = cldnn::tensor(1, 1, static_cast<tensor::value_type>(current_layout.count()), 1);
        expected_format = cldnn::format::bfyx;
        break;

    default:
        throw std::runtime_error("Unsupported data type in layout_optimizer::get_expected_layout for lstm_gemm primitive");
    }

    return layout(expected_data_type, expected_format, expected_tensor);
}

layout layout_optimizer::get_expected_layout(layout const& current_layout, data_type type, deconvolution_node const& node, layout const& output_or_weights_layout)
{
    auto prim = node.get_primitive();
    auto expected_tensor = current_layout.size;
    auto expected_data_type = current_layout.data_type;
    auto expected_format = current_layout.format;

    if (type == data_type::weights || type == data_type::bias)
    {
        expected_data_type = output_or_weights_layout.data_type;
    }

    switch (type)
    {
    case data_type::bias: //deconvolution bias
        expected_tensor = cldnn::tensor(1, 1, static_cast<tensor::value_type>(current_layout.count()), 1);
        expected_format = cldnn::format::bfyx;
        break;

    default:
        throw std::runtime_error("Unsupported data type in layout_optimizer::get_expected_layout for deconvolution primitive");
    }

    return layout(expected_data_type, expected_format, expected_tensor);
}

layout layout_optimizer::get_expected_layout(layout const& current_layout, data_type type, detection_output_node const& node, layout const& output_or_weights_layout)
{
    auto prim = node.get_primitive();
    auto expected_tensor = current_layout.size;
    auto expected_data_type = data_types::f32;
    auto expected_format = output_or_weights_layout.format;

    if (type != data_type::input)
        CLDNN_ERROR_MESSAGE(prim->id, "detection_output only supports optimization of its output (no weights/biases)");

    return layout(expected_data_type, expected_format, expected_tensor);
}

layout layout_optimizer::get_expected_layout(layout const& current_layout, data_type type, embed_node const& node, layout const& output_or_weights_layout)
{
    auto prim = node.get_primitive();
    auto expected_tensor = current_layout.size;
    auto expected_data_type = current_layout.data_type;
    auto expected_format = current_layout.format;

    if (type == data_type::weights || type == data_type::bias)
    {
        expected_data_type = output_or_weights_layout.data_type;
    }

    switch (type)
    {
    case data_type::bias:
        expected_tensor = cldnn::tensor(1, 1, static_cast<tensor::value_type>(current_layout.count()), 1);
        expected_format = cldnn::format::bfyx;
        break;

    default:
        throw std::runtime_error("Unsupported data type in layout_optimizer::get_expected_layout for embed primitive");
    }

    return layout(expected_data_type, expected_format, expected_tensor);
}

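// Returns a reorder that converts 'memid' from current_layout to expected_layout. Reorders are cached per
// (memid, expected_layout) pair; the bool is true when an existing reorder was reused (or no reorder is needed at all).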
std::pair<std::shared_ptr<cldnn::reorder>, bool>
layout_optimizer::create_reorder_if_needed(const layout& current_layout, const cldnn::primitive_id& memid, layout const& expected_layout)
{
    if (current_layout != expected_layout)
    {
        cache_key ckey{ memid, expected_layout };
        auto itr = _cached_reorders.find(ckey);
        if (itr != _cached_reorders.end())
            return std::make_pair(itr->second, true);

        auto count = _cached_reorders.size();
        std::stringstream ss;
        ss << "reorder_" << count << "_" << memid;

        auto reorder = std::make_shared<cldnn::reorder>(ss.str(), memid, expected_layout);
        _cached_reorders[ckey] = reorder;
        return std::make_pair(reorder, false);
    }

    return std::make_pair(nullptr, true);
}

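// Same caching scheme as above, but builds a generic_layer primitive that performs the weights reorder
// described by reorder_params.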
std::pair<std::shared_ptr<cldnn::generic_layer>, bool>
layout_optimizer::create_reorder_from_given_source(const cldnn::primitive_id& memid, layout const& expected_layout, const kernel_selector::weights_reorder_params& reorder_params)
{
    cache_key ckey{ memid, expected_layout };
    auto itr = _cached_generic_layers.find(ckey);
    if (itr != _cached_generic_layers.end())
        return std::make_pair(itr->second, true);

    auto count = _cached_generic_layers.size();
    std::stringstream ss;
    ss << "generic_layer_" << count << "_" << memid;

    auto reorder = std::make_shared<cldnn::generic_layer>(ss.str(), memid, expected_layout, reorder_params);
    _cached_generic_layers[ckey] = reorder;
    return std::make_pair(reorder, false);
}

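// Builds the chain of reorders needed to bring weights into the layout expected by the selected kernel:
// an optional plain reorder to the CPU kernel's intermediate layout, followed by a generic_layer that runs
// the kernel-selector weights reorder itself.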
std::vector<std::pair<std::shared_ptr<primitive>, bool>> layout_optimizer::get_generic_layer(
    const kernel_selector::weights_reorder_params& reorder_params,
    primitive_id input_id,
    const layout& old_layout,
    data_type type)
{
    if (reorder_params.engine == kernel_selector::weights_reorder_params::Engine::NONE || type != data_type::weights)
        return {};

    std::vector<std::pair<std::shared_ptr<primitive>, bool>> ret;

    if (reorder_params.engine == kernel_selector::weights_reorder_params::Engine::CPU &&
        reorder_params.cpuKernel != nullptr)
    {
        const auto intermediate_format = from_weights_layout(reorder_params.cpuKernel->GetExpectedInputLayout());
        const auto intermediate_type = from_weights_type(reorder_params.cpuKernel->GetExpectedInputType());
        if (intermediate_format != old_layout.format ||
            intermediate_type != old_layout.data_type)
        {
            const layout intermediate_layout = { intermediate_type, intermediate_format, old_layout.size.transform(intermediate_format, 1) };

            auto reorder = create_reorder_if_needed(old_layout, input_id, intermediate_layout);
            if (reorder.first)
            {
                ret.push_back(reorder);
                input_id = reorder.first->id;
            }
        }
    }

    auto new_dtype = from_weights_type(reorder_params.dtype);
    const auto bpp = data_type_traits::size_of(new_dtype);
    tensor expected_size = { 1, 1, 1, (tensor::value_type)(reorder_params.newBufferSize / bpp) };

    if (reorder_params.toImageType)
        expected_size = old_layout.size;

    layout expected_layout = {
        new_dtype, reorder_params.toImageType ? from_weights_layout(reorder_params.destLayout) : format::bfyx, // simple linear format (flattened to the x channel)
        expected_size
    };

    auto reorder = create_reorder_from_given_source(input_id, expected_layout, reorder_params);
    if (reorder.first)
        ret.push_back(reorder);

    return ret;
}

void layout_optimizer::set_optimization_attribute(optimization_attributes_type attribute, int32_t val)
{
    switch (attribute)
    {
    case optimization_attributes_type::splitted_convolution:
        _optimization_attributes.splitted_convolution = val;
        break;
    case optimization_attributes_type::bfyx_only_layer:
        _optimization_attributes.bfyx_only_layer = val;
        break;
    default:
        throw std::out_of_range("unsupported layout optimization attribute");
    }
}