inference-engine / thirdparty / clDNN / src / layout_optimizer.cpp
/*
// Copyright (c) 2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/

#include "layout_optimizer.h"
#include "topology_impl.h"
#include "network_impl.h"
#include "primitive_inst.h"
#include "error_handler.h"

#include "data_inst.h"
#include "reorder_inst.h"
#include "generic_layer.hpp"
#include <sstream>

#include "eltwise_inst.h"
#include "pooling_inst.h"

using namespace cldnn;

namespace {
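    // Heuristic that gates the winograd 2x3 (stride 1) convolution path: returns true only when the input,
    // weights and convolution parameters match the cases the winograd kernels handle efficiently.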
    bool should_use_winograd_2x3_s1(std::shared_ptr<const convolution> const& prim, layout const& input_layout, layout const& weights_layout, bool output_size_handling_enabled)
    {
        //cases when NOT to use winograd
        if (input_layout.size.feature[0] % 64 != 0          //current algorithm is effective when ifm is a multiple of 64
            || weights_layout.size.spatial[0] != 3          //weights have to be 3x3 by definition
            || weights_layout.size.spatial[1] != 3          //weights have to be 3x3 by definition
            || weights_layout.size.batch[0] % 64 != 0       //current algorithm is effective when ofm is a multiple of 64
            || prim->stride != tensor{ 1 }                  //stride has to be 1x1 by definition
            || prim->dilation != tensor{ 1 }                //no support for dilation
            || prim->split() != 1                           //no support for split convolutions
            || (output_size_handling_enabled && prim->with_output_size) //no support for convolutions with user-specified output size
            || (input_layout.count() > 3000000)             //limit max input size as winograd consumes more memory
            || (input_layout.count() < 50000)               //limit min input size as winograd is not effective for small inputs
            || (input_layout.size.spatial[0] < 8 && input_layout.size.spatial[1] < 8)) //disable winograd for small spatial sizes as perf is poor
        {
            return false;
        }

        return true;
    }
}

layout_optimizer::layout_optimizer(bool output_size_handling_enabled)
    : _optimization_attributes()
    , _output_size_handling_enabled(output_size_handling_enabled)
{
}

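// Returns true when the plain bfyx layout is expected to outperform yxfb for this convolution,
// based on batch size, data type and the weights/output shape heuristics below.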
bool layout_optimizer::convolution_bfyx_opt(layout const& output_layout, const layout& weights_layout, std::shared_ptr<const convolution> conv)
{
    //A set of rules that define when bfyx mem format has better performance than yxfb
    if (output_layout.size.batch[0] == 16 || output_layout.size.batch[0] % 16 != 0 ||
        output_layout.data_type != data_types::f16 || weights_layout.size.batch[0] % 16 != 0 ||
        !((weights_layout.size.spatial[0] == 1 && weights_layout.size.spatial[1] == 1) ||
        (weights_layout.size.spatial[0] >= 5 && weights_layout.size.spatial[1] >= 5) ||
            (conv->stride.spatial[0] > 1 && conv->stride.spatial[1] > 1) ||
            (weights_layout.size.feature[0] <= 32 && output_layout.size.spatial[0] < 224 && output_layout.size.spatial[1] < 224) ||
            (weights_layout.size.feature[0] <= 64 && output_layout.size.spatial[0] < 112 && output_layout.size.spatial[1] < 112) ||
            (weights_layout.size.feature[0] <= 128 && output_layout.size.spatial[0] < 56 && output_layout.size.spatial[1] < 56) ||
            (weights_layout.size.feature[0] <= 256 && output_layout.size.spatial[0] < 28 && output_layout.size.spatial[1] < 28) ||
            (weights_layout.size.feature[0] <= 512 && output_layout.size.spatial[0] < 14 && output_layout.size.spatial[1] < 14) ||
            (weights_layout.size.feature[0] <= 1024 && output_layout.size.spatial[0] <= 7 && output_layout.size.spatial[1] <= 7)) ||
        //WA for AgeGender, which has one convolution that is better on yxfb, but due to an additional reorder the overall performance is worse than bfyx
        (output_layout.size.spatial[0] == 82 && output_layout.size.spatial[1] == 82) ||
        (_optimization_attributes.splitted_convolution && output_layout.size.batch[0] == 16) ||
        (!_optimization_attributes.splitted_convolution && output_layout.size.batch[0] >= 128) ||
        _optimization_attributes.bfyx_only_layer)
        return true;

    return false;
}

bool layout_optimizer::convolution_byxf_opt(layout const& output_layout, const layout& weights_layout, std::shared_ptr<const convolution> conv)
{
    //A set of rules that define when byxf mem format has better performance
    if ((output_layout.data_type == data_types::f16 &&
        weights_layout.size.spatial[0] == 1 && weights_layout.size.spatial[1] == 1 &&
        output_layout.size.feature[0] % 64 == 0 && weights_layout.size.batch[0] % 64 == 0 &&
        conv->stride.spatial[0] == 1 && conv->stride.spatial[1] == 1 &&
        conv->input_offset.spatial[0] == 0 && conv->input_offset.spatial[1] == 0) ||
        //Winograd
        should_use_winograd_2x3_s1(conv, output_layout, weights_layout, _output_size_handling_enabled))
        return true;

    return false;
}

bool layout_optimizer::users_for_convolution_byxf_opt(program_node const& node, uint32_t depth)
{
    //This function checks if the byxf optimization can be applied to the required depth of the node's users.
    //Setting depth to 1 will check only the node's users, depth = 2 checks the users' users, and so on.
    if (depth == 0)
        return true;

    bool use_byxf = false;
    for (auto& user : node.get_users())
    {
        //primitives that support byxf->other format and other format->byxf transitions are valid for byxf opt
        if (user->type() == cldnn::eltwise::type_id() || user->type() == cldnn::pooling::type_id())
            use_byxf = users_for_convolution_byxf_opt(*user, depth - 1);
        //a convolution that is capable of using byxf and is performant is also valid for byxf opt
        else if (user->type() == cldnn::convolution::type_id())
        {
            auto conv_prim = user->as<convolution>().get_primitive();
            if (convolution_byxf_opt(user->calc_output_layout(), user->get_dependency(1).get_output_layout(), conv_prim))
                use_byxf = users_for_convolution_byxf_opt(*user, depth - 1);
            else
            {
                use_byxf = false;
                break;
            }
        }
        else
        {
            use_byxf = false;
            break;
        }
    }
    return use_byxf;
}

bool layout_optimizer::deps_depth_in_same_format(program_node const& node, const cldnn::format format, uint32_t depth)
{
    //This function checks if the requested format is the same for the node's dependencies up to the required depth.
    //Setting depth to 1 will check only the node's dependencies, depth = 2 checks the dependencies' dependencies, and so on.
    if (depth == 0)
        return true;

    bool same_format = false;
    for (auto& dep : node.get_dependencies())
    {
        //skip data and generic_layers
        if (dep->type() == cldnn::data::type_id() || dep->type() == cldnn::generic_layer::type_id())
            continue;

        //if the dependency is a reorder and its format is different, skip it and move to its own dependency;
        //further in the graph such reorders could be optimized out
        if (dep->type() == cldnn::reorder::type_id() && dep->get_dependencies().size() == 1 && dep->get_output_layout().format != format)
            same_format = deps_depth_in_same_format(dep->get_dependency(0), format, depth);
        else if (dep->get_output_layout().format == format)
        {
            //if the dependency is a reorder with the same format, check if its users are primitives that support different input and output formats;
            //if that is true, the graph optimizer will optimize out such a reorder and the layout of its dependency will change
            if (dep->type() == cldnn::reorder::type_id() &&
                (dep->get_dependency(0).type() == cldnn::eltwise::type_id() || dep->get_dependency(0).type() == cldnn::pooling::type_id()) &&
                dep->get_dependencies().size() == 1)
                same_format = deps_depth_in_same_format(dep->get_dependency(0), format, depth - 1);
            else
                same_format = deps_depth_in_same_format(*dep, format, depth - 1);
        }
        else
        {
            same_format = false;
            break;
        }
    }
    return same_format;
}

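// Chooses the expected input/bias layout for a convolution node, using the byxf/bfyx heuristics above
// and falling back to yxfb when none of the special cases apply.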
layout layout_optimizer::get_expected_layout(layout const& current_layout, data_type type, convolution_node const& node, layout const& output_or_weights_layout)
{
    auto prim = node.get_primitive();
    auto expected_tensor = current_layout.size;
    auto expected_data_type = current_layout.data_type;
    auto expected_format = current_layout.format;

    if (type == data_type::weights || type == data_type::bias)
    {
        expected_data_type = output_or_weights_layout.data_type;
    }

    switch (type)
    {
    case data_type::bias: //convolution bias
        expected_tensor = cldnn::tensor(1, 1, static_cast<tensor::value_type>(current_layout.count()), 1);
        expected_format = cldnn::format::bfyx;
        break;

    case data_type::input: //convolution input

        if (current_layout.data_type == data_types::f16 &&
            layout_optimizer::convolution_byxf_opt(current_layout, output_or_weights_layout, prim) &&
            (users_for_convolution_byxf_opt(node, 2) || deps_depth_in_same_format(node, cldnn::format::byxf, 2)) &&
            //TODO: remove this condition once yxfb optimizations are disabled
            current_layout.format != cldnn::format::yxfb &&
            current_layout.size.batch[0] == 1 &&
            prim->dilation == tensor{ 1 } &&
            !node.get_transposed())
        {
            expected_tensor = current_layout.size;
            expected_format = cldnn::format::byxf;
        }
        // IMAD case
        else if (current_layout.format == format::b_fs_yx_fsv4 ||
                 current_layout.format == format::os_is_yx_osv16_isv4)
        {
            // Nothing to do, keep the current format.
        }
        // MMAD case
        else if (current_layout.data_type == data_types::i8)
        {
            expected_tensor = current_layout.size;
            expected_format = current_layout.format; //cldnn::format::byxf_af32;
        }
        else if (layout_optimizer::convolution_bfyx_opt(current_layout, output_or_weights_layout, prim)
            || (_output_size_handling_enabled && prim->with_output_size) ||
            node.get_transposed())
        {
            // commented out due to performance reasons, maybe enable in future
            /*if (current_layout.data_type == data_types::f32 &&
                current_layout.size.batch[0] % 16 == 0 &&
                current_layout.format == format::bfyx &&
                output_or_weights_layout.size.spatial[0] == 1 && output_or_weights_layout.size.spatial[1] == 1 &&
                prim->stride.spatial[0] == 1 && prim->stride.spatial[1] == 1 &&
                prim->input_offset.spatial[0] == 0 && prim->input_offset.spatial[1] == 0 &&
                !node.get_transposed())
            {
                if (!((current_layout.size.feature[0] % 8) == 0 && (current_layout.size.spatial[0] * current_layout.size.spatial[1]) == 16 &&
                    current_layout.data_padding == padding{ { 0,0,0,0 }, 0 }))
                {
                    expected_tensor = current_layout.size.transform(cldnn::format::bf8_xy16, 1);
                    expected_format = cldnn::format::bf8_xy16;
                }
            }
            else*/
            {
                expected_tensor = current_layout.size;
                expected_format = cldnn::format::bfyx;
            }
        }
        else
        {
            expected_tensor = current_layout.size;
            expected_format = cldnn::format::yxfb;
        }

        break;

    default:
        throw std::runtime_error("Unsupported data type in layout_optimizer::get_expected_layout for convolution primitive");
    }

    return layout(expected_data_type, expected_format, expected_tensor);
}

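// For fully connected, only the bias layout is normalized here; any other data_type is rejected.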
layout layout_optimizer::get_expected_layout(layout const& current_layout, data_type type, fully_connected_node const& node, layout const& output_or_weights_layout)
{
    auto prim = node.get_primitive();
    auto expected_tensor = current_layout.size;
    auto expected_data_type = current_layout.data_type;
    auto expected_format = current_layout.format;

    if (type == data_type::weights || type == data_type::bias)
    {
        expected_data_type = output_or_weights_layout.data_type;
    }

    switch (type)
    {
    case data_type::bias: //fc bias
        expected_tensor = cldnn::tensor(1, 1, static_cast<tensor::value_type>(current_layout.count()), 1);
        expected_format = cldnn::format::bfyx;
        break;

    default:
        throw std::runtime_error("Unsupported data type in layout_optimizer::get_expected_layout for fully-connected primitive");
    }

    return layout(expected_data_type, expected_format, expected_tensor);
}

layout layout_optimizer::get_expected_layout(layout const& current_layout, data_type type, lstm_gemm_node const& node, layout const& output_or_weights_layout)
{
    auto prim = node.get_primitive();
    auto expected_tensor = current_layout.size;
    auto expected_data_type = current_layout.data_type;
    auto expected_format = current_layout.format;

    if (type == data_type::weights || type == data_type::bias)
    {
        expected_data_type = output_or_weights_layout.data_type;
    }

    switch (type)
    {
    case data_type::bias:
        expected_tensor = cldnn::tensor(1, 1, static_cast<tensor::value_type>(current_layout.count()), 1);
        expected_format = cldnn::format::bfyx;
        break;

    default:
        throw std::runtime_error("Unsupported data type in layout_optimizer::get_expected_layout for lstm_gemm primitive");
    }

    return layout(expected_data_type, expected_format, expected_tensor);
}

layout layout_optimizer::get_expected_layout(layout const& current_layout, data_type type, deconvolution_node const& node, layout const& output_or_weights_layout)
{
    auto prim = node.get_primitive();
    auto expected_tensor = current_layout.size;
    auto expected_data_type = current_layout.data_type;
    auto expected_format = current_layout.format;

    if (type == data_type::weights || type == data_type::bias)
    {
        expected_data_type = output_or_weights_layout.data_type;
    }

    switch (type)
    {
    case data_type::bias: //deconvolution bias
        expected_tensor = cldnn::tensor(1, 1, static_cast<tensor::value_type>(current_layout.count()), 1);
        expected_format = cldnn::format::bfyx;
        break;

    default:
        throw std::runtime_error("Unsupported data type in layout_optimizer::get_expected_layout for deconvolution primitive");
    }

    return layout(expected_data_type, expected_format, expected_tensor);
}

layout layout_optimizer::get_expected_layout(layout const& current_layout, data_type type, detection_output_node const& node, layout const& output_or_weights_layout)
{
    auto prim = node.get_primitive();
    auto expected_tensor = current_layout.size;
    auto expected_data_type = data_types::f32;
    auto expected_format = output_or_weights_layout.format;

    if (type != data_type::input)
        CLDNN_ERROR_MESSAGE(prim->id, "detection_output only supports optimization of its output (no weights/biases)");

    return layout(expected_data_type, expected_format, expected_tensor);
}

layout layout_optimizer::get_expected_layout(layout const& current_layout, data_type type, embed_node const& node, layout const& output_or_weights_layout)
{
    auto prim = node.get_primitive();
    auto expected_tensor = current_layout.size;
    auto expected_data_type = current_layout.data_type;
    auto expected_format = current_layout.format;

    if (type == data_type::weights || type == data_type::bias)
    {
        expected_data_type = output_or_weights_layout.data_type;
    }

    switch (type)
    {
    case data_type::bias:
        expected_tensor = cldnn::tensor(1, 1, static_cast<tensor::value_type>(current_layout.count()), 1);
        expected_format = cldnn::format::bfyx;
        break;

    default:
        throw std::runtime_error("Unsupported data type in layout_optimizer::get_expected_layout for embed primitive");
    }

    return layout(expected_data_type, expected_format, expected_tensor);
}

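// Returns a reorder that converts 'memid' from current_layout to expected_layout. Reorders are cached per
// (memid, expected_layout) pair; the bool is true when an existing reorder was reused (or no reorder is needed at all).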
std::pair<std::shared_ptr<cldnn::reorder>, bool>
layout_optimizer::create_reorder_if_needed(const layout& current_layout, const cldnn::primitive_id& memid, layout const& expected_layout)
{
    if (current_layout != expected_layout)
    {
        cache_key ckey{ memid, expected_layout };
        auto itr = _cached_reorders.find(ckey);
        if (itr != _cached_reorders.end())
            return std::make_pair(itr->second, true);

        auto count = _cached_reorders.size();
        std::stringstream ss;
        ss << "reorder_" << count << "_" << memid;

        auto reorder = std::make_shared<cldnn::reorder>(ss.str(), memid, expected_layout);
        _cached_reorders[ckey] = reorder;
        return std::make_pair(reorder, false);
    }

    return std::make_pair(nullptr, true);
}

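// Same caching scheme as above, but builds a generic_layer primitive that performs the weights reorder
// described by reorder_params.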
std::pair<std::shared_ptr<cldnn::generic_layer>, bool>
layout_optimizer::create_reorder_from_given_source(const cldnn::primitive_id& memid, layout const& expected_layout, const kernel_selector::weights_reorder_params& reorder_params)
{
    cache_key ckey{ memid, expected_layout };
    auto itr = _cached_generic_layers.find(ckey);
    if (itr != _cached_generic_layers.end())
        return std::make_pair(itr->second, true);

    auto count = _cached_generic_layers.size();
    std::stringstream ss;
    ss << "generic_layer_" << count << "_" << memid;

    auto reorder = std::make_shared<cldnn::generic_layer>(ss.str(), memid, expected_layout, reorder_params);
    _cached_generic_layers[ckey] = reorder;
    return std::make_pair(reorder, false);
}

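// Builds the chain of reorders needed to bring weights into the layout expected by the selected kernel:
// an optional plain reorder to the CPU kernel's intermediate layout, followed by a generic_layer that runs
// the kernel-selector weights reorder itself.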
std::vector<std::pair<std::shared_ptr<primitive>, bool>> layout_optimizer::get_generic_layer(
    const kernel_selector::weights_reorder_params& reorder_params,
    primitive_id input_id,
    const layout& old_layout,
    data_type type)
{
    if (reorder_params.engine == kernel_selector::weights_reorder_params::Engine::NONE || type != data_type::weights)
        return {};

    std::vector<std::pair<std::shared_ptr<primitive>, bool>> ret;

    if (reorder_params.engine == kernel_selector::weights_reorder_params::Engine::CPU &&
        reorder_params.cpuKernel != nullptr)
    {
        const auto intermediate_format = from_weights_layout(reorder_params.cpuKernel->GetExpectedInputLayout());
        const auto intermediate_type = from_weights_type(reorder_params.cpuKernel->GetExpectedInputType());
        if (intermediate_format != old_layout.format ||
            intermediate_type != old_layout.data_type)
        {
            const layout intermediate_layout = { intermediate_type, intermediate_format, old_layout.size.transform(intermediate_format, 1) };

            auto reorder = create_reorder_if_needed(old_layout, input_id, intermediate_layout);
            if (reorder.first)
            {
                ret.push_back(reorder);
                input_id = reorder.first->id;
            }
        }
    }

    auto new_dtype = from_weights_type(reorder_params.dtype);
    const auto bpp = data_type_traits::size_of(new_dtype);
    tensor expected_size = { 1, 1, 1, (tensor::value_type)(reorder_params.newBufferSize / bpp) };

    if (reorder_params.toImageType)
        expected_size = old_layout.size;

    layout expected_layout = {
        new_dtype, reorder_params.toImageType ? from_weights_layout(reorder_params.destLayout) : format::bfyx, // simple linear format (flattened to the x channel)
        expected_size
    };

    auto reorder = create_reorder_from_given_source(input_id, expected_layout, reorder_params);
    if (reorder.first)
        ret.push_back(reorder);

    return ret;
}

void layout_optimizer::set_optimization_attribute(optimization_attributes_type attribute, int32_t val)
{
    switch (attribute)
    {
    case optimization_attributes_type::splitted_convolution:
        _optimization_attributes.splitted_convolution = val;
        break;
    case optimization_attributes_type::bfyx_only_layer:
        _optimization_attributes.bfyx_only_layer = val;
        break;
    default:
        throw std::out_of_range("unsupported layout optimization attribute");
    }
}