Publishing R3
[platform/upstream/dldt.git] / inference-engine / thirdparty / clDNN / src / include / kernel_selector_helper.h
1 /*
2 // Copyright (c) 2016 Intel Corporation
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //      http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 */
16
17 #pragma once
18 #include "api/C/cldnn.h"
19 #include "api/CPP/program.hpp"
20 #include "program_impl.h"
21 #include "gpu/ocl_toolkit.h"
22 #include "tensor_type.h"
23 #include "kernel_selector_params.h"
24 #include "kernel_selector_common.h"
25 #include "jitter.h"
26
27 using namespace cldnn;
28
29 namespace kernel_selector
30 {
31     using n_dims                            = kernel_selector::Tensor::NDims;
32     using kernel_data                       = kernel_selector::KernelData;
33     using kernel_string                     = kernel_selector::KernelString;
34     using cl_kernel_data                    = kernel_selector::clKernelData;
35     using kernel_arguments                  = kernel_selector::Arguments;
36     using kernel_argument_element           = kernel_selector::ArgumentDescriptor;
37     using kernel_argument_types             = kernel_selector::ArgumentDescriptor::Types;
38     using kernel_scalar_arguments           = kernel_selector::Scalars;
39     using kernel_scalar_argument_types      = kernel_selector::ScalarDescriptor::Types;
40     using jit_constants                     = kernel_selector::JitConstants;
41
42     using data_type                         = kernel_selector::Datatype;
43     using kernel_type                       = kernel_selector::KernelType;
44     using weights_type                      = kernel_selector::WeightsType;
45     using activation_function               = kernel_selector::ActivationFunction;
46     using pool_type                         = kernel_selector::PoolType;
47     using pool_remainder                    = kernel_selector::PoolRemainder;
48         using argm_axis                                                 = kernel_selector::ArgMaxMinAxis;
49         using argm_output                                               = kernel_selector::ArgMaxMinOut;
50     using lookt_axis                        = kernel_selector::LookUpTableAxis;
51     using lrn_mode                          = kernel_selector::LRNMode;
52     using normalize_mode                    = kernel_selector::NormalizeMode;
53     using mvn_mode                          = kernel_selector::MVNMode;
54     using kernel_divider_mode               = kernel_selector::KernelDividerMode;
55     using eltwise_mode                      = kernel_selector::EltwiseMode;
56     using eltwise_input_mode                = kernel_selector::EltwiseInputMode;
57     using softmax_dim                       = kernel_selector::SoftmaxDim;
58     using mean_subtruct_mode                = kernel_selector::MeanSubtractMode;
59     using mean_op                           = kernel_selector::MeanOp;
60     using concat_axis                       = kernel_selector::ConcatAxis;
61     using tuning_mode                       = kernel_selector::TuningMode;
62     using sample_type                       = kernel_selector::SampleType;
63
64     using data_tensor                       = kernel_selector::DataTensor;
65     using weights_tensor                    = kernel_selector::WeightsTensor;
66     using data_layout                       = kernel_selector::DataLayout;
67     using weights_layout                    = kernel_selector::WeightsLayout;
68     using multi_data_tensor                 = kernel_selector::MultiDataTensor;
69
70     using params                            = kernel_selector::Params;
71     using weights_reorder_params            = kernel_selector::WeightsReorderParams;
72     using generic_kernel_params             = kernel_selector::GenericKernelParams;
73 }
74
75 inline kernel_selector::data_type to_data_type(data_types dt)
76 {
77     switch (dt)
78     {
79     case cldnn::data_types::i8:     return kernel_selector::data_type::INT8;
80     case cldnn::data_types::u8:     return kernel_selector::data_type::UINT8;
81     case cldnn::data_types::f16:    return kernel_selector::data_type::F16;
82     case cldnn::data_types::f32:    return kernel_selector::data_type::F32;
83     default:
84         assert(0);
85         return kernel_selector::data_type::F16;
86     }
87 }
88
89 inline data_types from_data_type(kernel_selector::data_type dt)
90 {
91     switch (dt)
92     {
93     case kernel_selector::data_type::INT8:   return cldnn::data_types::i8;
94     case kernel_selector::data_type::UINT8:   return cldnn::data_types::u8;
95     case kernel_selector::data_type::F16:    return cldnn::data_types::f16;
96     case kernel_selector::data_type::F32:    return cldnn::data_types::f32;
97     default:
98         assert(0);
99         return cldnn::data_types::f16;
100     }
101 }
102
103 inline kernel_selector::weights_type to_weights_type(data_types dt)
104 {
105     switch (dt)
106     {
107     case cldnn::data_types::i8:     return kernel_selector::weights_type::INT8;
108     case cldnn::data_types::f16:    return kernel_selector::weights_type::F16;
109     case cldnn::data_types::f32:    return kernel_selector::weights_type::F32;
110     default:
111         assert(0);
112         return kernel_selector::weights_type::F16;
113     }
114 }
115
116 inline data_types from_weights_type(kernel_selector::weights_type dt)
117 {
118     switch (dt)
119     {
120     case kernel_selector::weights_type::INT8:   return data_types::i8;
121     case kernel_selector::weights_type::F16:    return data_types::f16;
122     case kernel_selector::weights_type::F32:    return data_types::f32;
123     default:
124         assert(0);
125         return data_types::f16;;
126     }
127 }
128
129 inline kernel_selector::data_layout to_data_layout(format f)
130 {
131     switch (f)
132     {
133     case format::bfyx:              return kernel_selector::data_layout::bfyx;
134     case format::yxfb:              return kernel_selector::data_layout::yxfb;
135     case format::byxf:              return kernel_selector::data_layout::byxf;
136     case format::fyxb:              return kernel_selector::data_layout::fyxb;
137     case format::bs_x_bsv16:        return kernel_selector::data_layout::bs_f_bsv16__af8;
138     case format::bs_xs_xsv8_bsv8:   return kernel_selector::data_layout::bs_f_bsv8__af8;
139     case format::bs_xs_xsv8_bsv16:  return kernel_selector::data_layout::bs_f_bsv16__af8;
140     case format::bf8_xy16:          return kernel_selector::data_layout::bf8_xy16;
141     case format::winograd_2x3_s1_data:  return kernel_selector::data_layout::winograd_2x3_s1_data;
142     case format::byxf_af32: return kernel_selector::data_layout::byxf_af32;
143 //     case format::brfyx:          return kernel_selector::data_layout::brfyx;
144     default:
145         return kernel_selector::data_layout::bfyx;
146     }
147 }
148
149 static inline cldnn::format from_data_layout(kernel_selector::data_layout l)
150 {
151     switch (l)
152     {
153     case kernel_selector::data_layout::bf:                return cldnn::format::bfyx;
154     case kernel_selector::data_layout::fb:                return cldnn::format::fyxb;
155     case kernel_selector::data_layout::bfyx:              return cldnn::format::bfyx;
156     case kernel_selector::data_layout::yxfb:              return cldnn::format::yxfb;
157     case kernel_selector::data_layout::byxf:              return cldnn::format::byxf;
158     case kernel_selector::data_layout::fyxb:              return cldnn::format::fyxb;
159     case kernel_selector::data_layout::bs_f_bsv8__af8:    return cldnn::format::bs_xs_xsv8_bsv8;
160     case kernel_selector::data_layout::bs_f_bsv16__af8:   return cldnn::format::bs_x_bsv16;
161     case kernel_selector::data_layout::bf8_xy16:          return cldnn::format::bf8_xy16;
162     case kernel_selector::data_layout::brfyx:             return cldnn::format::bfyx;
163     case kernel_selector::data_layout::winograd_2x3_s1_data:   return cldnn::format::winograd_2x3_s1_data;
164     case kernel_selector::data_layout::byxf_af32: return cldnn::format::byxf_af32;
165     default:
166         return cldnn::format::bfyx;
167         break;
168     }
169 }
170
171 inline kernel_selector::weights_layout to_weights_layout(format f)
172 {
173     switch (f)
174     {
175     case format::bfyx:              return kernel_selector::weights_layout::oiyx;
176     case format::fyxb:              return kernel_selector::weights_layout::iyxo;
177     case format::byxf:              return kernel_selector::weights_layout::oyxi;
178     case format::yxfb:              return kernel_selector::weights_layout::yxio;
179     case format::os_iyx_osv16:      return kernel_selector::weights_layout::os_iyx_osv16;
180     case format::bs_xs_xsv8_bsv8:   return kernel_selector::weights_layout::os_i_osv8__ai8;
181     case format::bs_xs_xsv8_bsv16:  return kernel_selector::weights_layout::os_i_osv16__ai8;
182     case format::bs_x_bsv16:        return kernel_selector::weights_layout::os_i_osv16;
183     case format::image_2d_weights_c4_fyx_b:     return kernel_selector::weights_layout::image_2d_weights_c4_fyx_b;
184     case format::image_2d_weights_c1_b_fyx:     return kernel_selector::weights_layout::image_2d_weights_c1_b_fyx;
185     case format::winograd_2x3_s1_weights:       return kernel_selector::weights_layout::winograd_2x3_s1_weights;
186     case format::winograd_2x3_s1_fused_weights: return kernel_selector::weights_layout::winograd_2x3_s1_fused_weights;
187     case format::winograd_6x3_s1_fused_weights: return kernel_selector::weights_layout::winograd_6x3_s1_fused_weights;
188     case format::image_2d_weights_winograd_6x3_s1_fbxyb:     return kernel_selector::weights_layout::image_2d_weights_winograd_6x3_s1_fbxyb;
189     case format::image_2d_weights_winograd_6x3_s1_xfbyb:     return kernel_selector::weights_layout::image_2d_weights_winograd_6x3_s1_xfbyb;
190     case format::os_is_yx_isa8_osv8_isv4: return kernel_selector::weights_layout::os_is_yx_isa8_osv8_isv4;
191     default:
192         return kernel_selector::weights_layout::oi;
193     }
194 }
195
196 static inline cldnn::format::type from_weights_layout(kernel_selector::weights_layout l)
197 {
198     switch (l)
199     {
200     case kernel_selector::weights_layout::oi:
201     case kernel_selector::weights_layout::oiyx:               return cldnn::format::bfyx;
202     case kernel_selector::weights_layout::oyxi:               return cldnn::format::byxf;
203     case kernel_selector::weights_layout::io:
204     case kernel_selector::weights_layout::iyxo:               return cldnn::format::fyxb;
205     case kernel_selector::weights_layout::yxio:               return cldnn::format::yxfb;
206     case kernel_selector::weights_layout::os_iyx_osv16:       return cldnn::format::os_iyx_osv16;
207     case kernel_selector::weights_layout::os_i_osv16:         return cldnn::format::bs_x_bsv16;
208     case kernel_selector::weights_layout::os_i_osv8__ai8:     return cldnn::format::bs_xs_xsv8_bsv8;
209     case kernel_selector::weights_layout::os_i_osv16__ai8:    return cldnn::format::bs_xs_xsv8_bsv16;
210     case kernel_selector::weights_layout::image_2d_weights_c4_fyx_b:        return cldnn::format::image_2d_weights_c4_fyx_b;
211     case kernel_selector::weights_layout::image_2d_weights_c1_b_fyx:        return cldnn::format::image_2d_weights_c1_b_fyx;
212     case kernel_selector::weights_layout::winograd_2x3_s1_weights:          return cldnn::format::winograd_2x3_s1_weights;
213     case kernel_selector::weights_layout::winograd_2x3_s1_fused_weights:    return cldnn::format::winograd_2x3_s1_fused_weights;
214     case kernel_selector::weights_layout::winograd_6x3_s1_fused_weights:    return cldnn::format::winograd_6x3_s1_fused_weights;
215     case kernel_selector::weights_layout::image_2d_weights_winograd_6x3_s1_fbxyb:        return cldnn::format::image_2d_weights_winograd_6x3_s1_fbxyb;
216     case kernel_selector::weights_layout::image_2d_weights_winograd_6x3_s1_xfbyb:        return cldnn::format::image_2d_weights_winograd_6x3_s1_xfbyb;
217     case kernel_selector::weights_layout::os_is_yx_isa8_osv8_isv4: return cldnn::format::os_is_yx_isa8_osv8_isv4;
218     default:
219         return cldnn::format::bfyx;
220     }
221 }
222
223 inline kernel_selector::tuning_mode to_tuning_mode(cldnn::tuning_mode mode)
224 {
225     switch (mode)
226     {
227     case cldnn::tuning_mode::tuning_disabled:         return kernel_selector::tuning_mode::TUNING_DISABLED;
228     case cldnn::tuning_mode::tuning_use_cache:        return kernel_selector::tuning_mode::TUNING_USE_CACHE;
229     case cldnn::tuning_mode::tuning_tune_and_cache:   return kernel_selector::tuning_mode::TUNING_TUNE_AND_CACHE;
230     default:
231         return kernel_selector::tuning_mode::TUNING_DISABLED;
232     }
233 }
234
235 inline std::string to_host_version(const cldnn::version_t& version)
236 {
237     std::stringstream ss;
238     ss << version.major << "." << version.minor << "." << version.build << "." << version.revision;
239     return ss.str();
240 }
241
242 inline kernel_selector::data_tensor convert_data_tensor(const layout& l, uint32_t split = 1, const tensor view_offset = {})
243 {
244     const auto& pad = l.data_padding;
245     const auto& vals = l.size.sizes(l.format);
246     const auto& add_offsets = view_offset.sizes(l.format);
247     const auto& lower_pad = pad.lower_size().sizes(l.format);
248     const auto& upper_pad = pad.upper_size().sizes(l.format);
249     const auto ks_layout = to_data_layout(l.format);
250     kernel_selector::n_dims vec(kernel_selector::DataTensor::ChannelsCount(ks_layout));
251
252     size_t pitch = 1;
253     size_t offset = 0;
254
255     auto new_vals = vals;
256
257     if (ks_layout == kernel_selector::Tensor::byxf_af32)
258     {
259         new_vals[3] = align_to(vals[3], 32);
260     }
261
262     for (size_t i = 0; i < vec.size(); i++)
263     {
264         const size_t tensor_index = vec.size() - 1 - i;
265         const auto d = vals[tensor_index];
266         const auto lp = lower_pad[tensor_index];
267         const auto up = upper_pad[tensor_index];
268         // tells us how many elements are reserved in memory for this tensor index
269         const auto reserved_in_mem_count = new_vals[tensor_index];
270
271         auto& elm = vec[i];
272         elm.v = static_cast<size_t>(d - add_offsets[tensor_index]);
273         elm.pitch = pitch;
274         elm.pad.before = lp;
275         elm.pad.after = up;
276
277         offset += pitch*(add_offsets[tensor_index]);
278         pitch *= (reserved_in_mem_count + lp + up);
279     }
280
281     const int feature_index = kernel_selector::DataTensor::Channelndex(ks_layout, kernel_selector::Tensor::DataChannelName::FEATURE);
282     vec[feature_index].v /= split;
283
284     return kernel_selector::data_tensor(
285         vec,
286         to_data_type(l.data_type),
287         ks_layout,
288         offset);
289 }
290
291 inline kernel_selector::weights_tensor convert_weights_tensor(const layout& l)
292 {
293     assert(l.format.dimension() == 4);
294     const auto& t = l.size.sizes(format::bfyx);
295     const auto base_layout = kernel_selector::weights_layout::oiyx;
296     const auto ks_type = to_weights_type(l.data_type);
297     const auto ks_layout = to_weights_layout(l.format);
298     std::vector<size_t> vec(kernel_selector::WeightsTensor::ChannelsCount(base_layout));
299
300     for (size_t i = 0; i < vec.size(); i++)
301     {
302         const size_t tensor_index = t.size() - 1 - i;
303         const auto d = t[tensor_index];
304         vec[i] = static_cast<size_t>(d);
305     }
306
307     return kernel_selector::weights_tensor(
308         vec,
309         ks_type,
310         base_layout).TransformIgnorePadding(ks_layout);
311 }
312
313 template <typename p_type>
314 inline void convert_activation_func_params(const p_type primitive, kernel_selector::base_params& params)
315 {
316     const float negative_slope = primitive->activation_negative_slope;
317     if (negative_slope)
318     {
319         params.activationParams.m = negative_slope;
320         params.activationFunc = kernel_selector::activation_function::RELU_NEGATIVE_SLOPE;
321     }
322     else
323     {
324         params.activationFunc = kernel_selector::activation_function::RELU;
325     }
326 }
327
328 inline kernel_selector::activation_function get_kernel_selector_activation_param(cldnn_activation_func activation_func)
329 {
330     switch (activation_func)
331     {
332     case activation_none:
333         return kernel_selector::activation_function::NONE;
334     case activation_logistic:
335         return kernel_selector::activation_function::LOGISTIC;
336     case activation_hyperbolic_tan:
337         return kernel_selector::activation_function::HYPERBOLIC_TAN;
338     case activation_relu:
339         return kernel_selector::activation_function::RELU;
340     case activation_relu_negative_slope:
341         return kernel_selector::activation_function::RELU_NEGATIVE_SLOPE;
342     case activation_clamp:
343         return kernel_selector::activation_function::CLAMP;
344     case activation_softrelu:
345         return kernel_selector::activation_function::SOFTRELU;
346     case activation_abs:
347         return kernel_selector::activation_function::ABS;
348     case activation_linear:
349         return kernel_selector::activation_function::LINEAR;
350     case activation_square:
351         return kernel_selector::activation_function::SQUARE;
352     case activation_sqrt:
353         return kernel_selector::activation_function::SQRT;
354     case activation_elu:
355         return kernel_selector::activation_function::ELU;
356     default:
357         throw std::runtime_error("Unknown activation function");
358         break;
359     }
360 }
361
362 inline kernel_selector::activation_function get_kernel_selector_activation_grad_param(cldnn_activation_grad_func activation_grad_func)
363 {
364     switch (activation_grad_func)
365     {
366     case activation_grad_none:
367         return kernel_selector::activation_function::NONE_GRAD;
368     case activation_grad_relu:
369         return kernel_selector::activation_function::RELU_GRAD;
370     case activation_grad_relu_negative_slope:
371         return kernel_selector::activation_function::RELU_NEGATIVE_SLOPE_GRAD;
372     default:
373         throw std::runtime_error("Unknown activation_grad function");
374         break;
375     }
376 }
377
378 template <typename arg_t>
379 inline void convert_fused_activation_func_params(const arg_t& arg, kernel_selector::base_params& params)
380 {
381     params.activationParams.m = arg.get_fused_activation_params().a;
382     params.activationParams.n = arg.get_fused_activation_params().b;
383     params.activationFunc = get_kernel_selector_activation_param(arg.get_fused_activation_func());
384 }
385
386 template <typename p_type>
387 inline void convert_new_activation_func(const p_type primitive, kernel_selector::base_params& params)
388 {
389     params.activationFunc = get_kernel_selector_activation_param(primitive->activation_func);
390     params.activationParams.m = primitive->additional_params.a;
391     params.activationParams.n = primitive->additional_params.b;
392 }
393
394 template <typename params_t, typename arg_t>
395 inline params_t get_default_params(const arg_t& arg, uint32_t split = 1)
396 {
397     params_t params;
398
399     const auto& context = arg.get_program().get_engine().get_context();
400     const auto& engine_info = context->get_engine_info();
401
402     params.engineInfo.bSubGroupSupport      = context->extension_supported("cl_intel_subgroups");
403     params.engineInfo.bSubGroupShortSupport = context->extension_supported("cl_intel_subgroups_short");
404     params.engineInfo.bFP16Support          = context->extension_supported("cl_khr_fp16");
405     params.engineInfo.bFP64Support          = context->extension_supported("cl_khr_fp64");
406     params.engineInfo.bImageSupport         = engine_info.supports_image != 0;
407     params.engineInfo.maxWorkGroupSize      = engine_info.max_work_group_size;
408     params.engineInfo.maxLocalMemSize       = engine_info.max_local_mem_size;
409     params.engineInfo.maxImage2dWidth       = engine_info.max_image2d_width;
410     params.engineInfo.maxImage2dHeight      = engine_info.max_image2d_height;
411     params.engineInfo.deviceId              = engine_info.dev_id;
412     params.engineInfo.driverVersion         = engine_info.driver_version;
413     params.engineInfo.hostVersion           = to_host_version(cldnn::get_version());
414     
415     const auto& input_layout    = arg.input().get_output_layout();
416     const auto& output_layout   = arg.get_output_layout();
417
418     params.inputs[0] = convert_data_tensor(input_layout, split);
419     params.output = convert_data_tensor(output_layout, split);
420
421     params.layerID = arg.id();
422
423     convert_fused_activation_func_params(arg, params);
424
425     return params;
426 }
427
428 template <typename params_t, typename arg_t>
429 inline params_t get_weights_bias_default_params(const arg_t& arg, uint32_t split = 1)
430 {
431     params_t params = get_default_params<params_t>(arg, split);
432
433     const auto& weights_layout = arg.weights().get_output_layout();
434     params.weights = convert_weights_tensor(weights_layout);
435
436     if (arg.bias_term())
437     {
438         const auto& bias_layout = arg.bias().get_output_layout();
439         // bias per output is not supported on cldnn
440         params.bias.push_back(convert_data_tensor(bias_layout).FlattenFeatureAndSpatials());
441     }
442
443     return params;
444 }
445
446 template <typename params_t, typename arg_t>
447 inline params_t get_default_learning_params(const arg_t& arg, uint32_t split = 1)
448 {
449         params_t params = get_weights_bias_default_params<params_t>(arg, split);
450
451         const auto learning_params = arg.get_program().get_options().template get<build_option_type::learning_config>()->params;
452
453         if (arg.use_momentum())
454         {
455                 params.use_momentum = true;
456         }
457
458         params.momentum_factor = learning_params.momentum;
459         params.weights_decay = learning_params.weights_decay;
460
461         return params;
462 }
463
464 template <typename optional_params_t>
465 inline optional_params_t get_default_optional_params(const program_impl& program)
466 {
467     optional_params_t params;
468     
469     const auto& context = program.get_engine().get_context();
470
471     params.meaningfulKernelsNames       = context->get_configuration().meaningful_kernels_names;
472     params.allowStaticInputReordering   = program.get_options().get<build_option_type::optimize_data>()->enabled();
473     params.allowInputReordering         = false;
474     params.allowOutputReordering        = false;
475     
476     const auto& tuning_config = program.get_options().get<build_option_type::tuning_config>();
477     params.tuningParams.mode = to_tuning_mode(tuning_config->config.mode);
478     params.tuningParams.cacheFilePath = tuning_config->config.cache_file_path;
479
480     return params;
481 }
482
483 template <typename optional_params_t>
484 inline optional_params_t get_default_weights_bias_optional_params(const program_impl& program)
485 {
486     return get_default_optional_params<optional_params_t>(program);
487 }
488
489 template <typename optional_params_t>
490 inline optional_params_t get_default_learning_optional_params(const program_impl& program)
491 {
492         return get_default_weights_bias_optional_params<optional_params_t>(program);
493 }