/*
// Copyright (c) 2016-2018 Intel Corporation
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*/

#include "convolution_inst.h"
#include "primitive_gpu_base.h"
#include "implementation_map.h"
#include "error_handler.h"
#include "kernel_selector_helper.h"
#include "kernel_runner.h"
#include "convolution/convolution_kernel_selector.h"
#include "convolution/convolution_params.h"

namespace cldnn { namespace gpu {

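// GPU implementation of the convolution primitive. Parameter translation and
// kernel selection are delegated to the kernel selector; this class validates
// inputs and wires the memory objects into the chosen kernel's arguments.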
struct convolution_gpu : typed_primitive_gpu_impl<convolution>
{
    using parent = typed_primitive_gpu_impl<convolution>;
    using parent::parent;

protected:

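    // Check that the instance's input, output and filter memory use
    // compatible data types before execution.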
    virtual bool validate_impl(const typed_primitive_inst<convolution>& instance) const override
    {
        bool res = true;

        auto outer_id = _outer.id();
        auto data_type = instance.node.input().get_output_layout().data_type;

        // Check whether all memory elements use the same unit type (FP16 or FP32).
        CLDNN_ERROR_DATA_TYPES_MISMATCH(outer_id, "Input memory", data_type, "output memory", instance.node.get_output_layout().data_type, "");
        // A signed/unsigned integer mismatch is acceptable for convolution.
        CLDNN_ERROR_DATA_TYPES_MISMATCH_IGNORE_SIGN(outer_id, "Input memory", data_type, "filter memory", instance.weights_memory(0).get_layout().data_type, "");

        return res;
    }

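    // Collect the memory objects (weights, bias, quantization/calibration
    // factors) belonging to the given split as kernel arguments.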
    virtual kernel::kernel_arguments_data get_arguments(typed_primitive_inst<convolution>& instance, int32_t split) const override
    {
        kernel::kernel_arguments_data args = parent::get_arguments(instance, split);

        args.weights              = &instance.weights_memory(split);
        args.bias                 = instance.bias_term() ? &instance.bias_memory(split) : nullptr;
        args.weights_quantization_factors = instance.weights_quantization_factors_term() ? &instance.weights_quantization_factors_memory(split) : nullptr;
        args.output_calibration_factors = instance.output_calibration_factors_term() ? &instance.output_calibration_factors_memory(split) : nullptr;
        return args;
    }

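    // Split and group counts come straight from the program node; a split
    // greater than 1 makes the base class run the kernel once per split with
    // that split's weights and bias.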
    virtual int32_t get_split() const override
    {
        return _outer.get_split();
    }

    virtual uint32_t get_groups() const override
    {
        return _outer.get_groups();
    }

public:

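    // Translate the convolution node into kernel selector parameters and ask
    // the selector for the best matching kernel implementation.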
    static primitive_impl* create(const convolution_node &arg)
    {
        const auto& primitive       = arg.get_primitive();
        const auto& input_layout    = arg.input().get_output_layout();
        const auto& weights_layout  = arg.weights(0).get_output_layout();
        const auto& weights_size    = weights_layout.size;

        const auto& split           = primitive->split();
        const auto& stride          = primitive->stride;
        const auto& dilation        = primitive->dilation;
        const auto& input_offset    = primitive->input_offset;
        const auto& groups          = primitive->groups;

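        // With the depthwise-separable optimization the kernel handles all
        // feature groups at once, so the effective split collapses to 1.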
        const auto depthwise_separable_opt = arg.get_depthwise_sep_opt();
        const auto actual_split = depthwise_separable_opt ? (decltype(split))1 : split;

        const auto transposed = arg.get_transposed();

        assert(arg.get_output_layout().size.feature[0] / primitive->split() == weights_layout.size.batch[0]);

        auto conv_params = get_weights_bias_default_params<kernel_selector::convolution_params>(arg, (groups > 1 && !depthwise_separable_opt) ? groups : actual_split, groups);
        auto conv_optional_params = get_default_weights_bias_optional_params<kernel_selector::convolution_optional_params>(arg.get_program());

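        // Negative input offsets become padding (see below); a positive
        // offset is instead folded into the input tensor descriptor.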
        const auto additional_offset = tensor::max(input_offset, 0);
        if (additional_offset != 0)
        {
            conv_params.inputs[0] = convert_data_tensor(input_layout, (groups > 1 && !depthwise_separable_opt) ? groups : actual_split, additional_offset);
        }

        if (primitive->with_activation)
            convert_activation_func_params(primitive, conv_params.activation);

        conv_params.depthwise_separable_opt = depthwise_separable_opt;
        conv_params.transposed = transposed;

        conv_params.local_convolution = weights_size.local[0] > 1 || weights_size.local[1] > 1;
        conv_params.split = split;
        conv_params.groups = groups;
        conv_params.filterSize = {
            (uint32_t)weights_size.spatial[0],
            (uint32_t)weights_size.spatial[1],
        };

        conv_params.padding = {
            (uint32_t)std::max(-input_offset.spatial[0], 0),
            (uint32_t)std::max(-input_offset.spatial[1], 0)
        };

        conv_params.stride = {
            (uint32_t)stride.spatial[0],
            (uint32_t)stride.spatial[1]
        };
        conv_params.dilation = {
            (uint32_t)dilation.spatial[0],
            (uint32_t)dilation.spatial[1]
        };

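        // INT8 path: forward the per-feature weight quantization factors and,
        // when present, the output calibration factors to the kernel.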
        if (primitive->weights_quantization_factors.size() > 0)
        {
            conv_params.int8_quantization = true;
            conv_params.weights_quantization_factors.push_back(convert_data_tensor(arg.weights_quantization_factors().get_output_layout()).FlattenFeatureAndSpatials());
            conv_params.input_quantization_factor = arg.get_input_qf();

            if (primitive->output_calibration_factors.size() > 0)
            {
                conv_params.output_calibration = true;
                conv_params.output_calibration_factors.push_back(convert_data_tensor(arg.output_calibration_factors().get_output_layout()).FlattenFeatureAndSpatials());
            }
            else
                conv_params.output_quantization_factor = arg.get_output_qf();
        }

        auto& kernel_selector = kernel_selector::convolution_kernel_selector::Instance();

        const auto& tuning_config = arg.get_program().get_options().get<build_option_type::tuning_config>();

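        // In tune-and-cache mode, hand the selector a kernel runner so it can
        // time candidate kernels on the actual engine.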
        if (tuning_config->config.mode == tuning_mode::tuning_tune_and_cache)
        {
            conv_optional_params.tuningParams.runner = std::make_shared<gpu::kernel_runner>(arg.get_program().get_engine(), true);
        }

        kernel_selector::KernelsData best_kernels = kernel_selector.GetBestKernels(conv_params, conv_optional_params);

        CLDNN_ERROR_BOOL(arg.id(), "Best_kernel.empty()", best_kernels.empty(), "Cannot find a proper kernel with these arguments");
        auto conv = new convolution_gpu(arg, best_kernels[0]);

        return conv;
    }
};

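// Static registration: the constructor of this file-local object runs at load
// time and maps each supported (engine, data type, layout) combination to
// convolution_gpu::create in the implementation map.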
namespace {
    struct attach {
        attach() {
            implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::yxfb), convolution_gpu::create);
            implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::yxfb), convolution_gpu::create);
            implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx), convolution_gpu::create);
            implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bfyx), convolution_gpu::create);
            implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::bfyx), convolution_gpu::create);
            implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::winograd_2x3_s1_data), convolution_gpu::create);
            implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::winograd_2x3_s1_data), convolution_gpu::create);
            implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bf8_xy16), convolution_gpu::create);
            implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bf8_xy16), convolution_gpu::create);
            implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::byxf), convolution_gpu::create);
            implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::byxf), convolution_gpu::create);
            // MMAD
            implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), convolution_gpu::create);
            implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byx8_f4), convolution_gpu::create);

            implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::fs_bs_yx_bsv4_fsv32), convolution_gpu::create);
            implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byxf), convolution_gpu::create);
            implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), convolution_gpu::create);
            implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv4), convolution_gpu::create);
        }
        ~attach() {}
    };
    attach attach_impl;
}
} }