2 // Copyright (c) 2016-2018 Intel Corporation
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
8 // http://www.apache.org/licenses/LICENSE-2.0
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
17 #include "convolution_inst.h"
18 #include "primitive_gpu_base.h"
19 #include "implementation_map.h"
20 #include "error_handler.h"
21 #include "kernel_selector_helper.h"
22 #include "kernel_runner.h"
23 #include "convolution/convolution_kernel_selector.h"
24 #include "convolution/convolution_params.h"
26 namespace cldnn { namespace gpu {
// GPU implementation of the convolution primitive. It derives from
// typed_primitive_gpu_impl<convolution> and customizes validation, kernel
// argument collection and the split/groups queries for convolution.
28 struct convolution_gpu : typed_primitive_gpu_impl<convolution>
// Shorthand for the base class; used to call parent::get_arguments().
30 using parent = typed_primitive_gpu_impl<convolution>;
// Validates the data types used by a convolution instance.
//   instance - primitive instance whose input, output and weights are inspected.
// Two checks are made, both against the data type of the node's input:
//   * the node's output data type (CLDNN_ERROR_DATA_TYPES_MISMATCH);
//   * the data type of weights_memory(0) (the _IGNORE_SIGN variant).
// Only the weights memory at index 0 is inspected.
// NOTE(review): how a mismatch is reported is decided by the CLDNN_ERROR_*
// macros, which are not defined in this file - confirm in the error handler.
// NOTE(review): no return statement is visible here although the declared
// return type is bool - confirm against the complete file.
35 virtual bool validate_impl(const typed_primitive_inst<convolution>& instance) const override
// Id of _outer; passed as the first argument to both error macros.
39 auto outer_id = _outer.id();
// Data type of the node's input; the reference value for both comparisons.
40 auto data_type = instance.node.input().get_output_layout().data_type;
42 // Check that input and output memory use the same data type (e.g. FP16 or FP32).
43 CLDNN_ERROR_DATA_TYPES_MISMATCH(outer_id, "Input memory", data_type, "output memory", instance.node.get_output_layout().data_type, "");
44 // A signed/unsigned integer difference between input and filter is acceptable for convolution.
45 CLDNN_ERROR_DATA_TYPES_MISMATCH_IGNORE_SIGN(outer_id, "Input memory", data_type, "filter memory", instance.weights_memory(0).get_layout().data_type, "");
// Collects the kernel arguments for one split of the convolution.
//   instance - primitive instance that owns the referenced memories.
//   split    - index forwarded to the parent and to every per-split accessor.
// Starts from parent::get_arguments() and then fills in the convolution
// specific pointers. Weights are always set; bias, weights quantization
// factors and output calibration factors are set to nullptr when the
// corresponding *_term() query on the instance returns false.
// NOTE(review): no statement returning `args` is visible here although the
// function is declared to return kernel::kernel_arguments_data - confirm
// against the complete file.
50 virtual kernel::kernel_arguments_data get_arguments(typed_primitive_inst<convolution>& instance, int32_t split) const override
// Base argument set produced by the parent implementation.
52 kernel::kernel_arguments_data args = parent::get_arguments(instance, split);
// Mandatory: weights memory for this split.
54 args.weights = &instance.weights_memory(split);
// Optional memories: pointer to the per-split memory, or nullptr when absent.
55 args.bias = instance.bias_term() ? &instance.bias_memory(split) : nullptr;
56 args.weights_quantization_factors = instance.weights_quantization_factors_term() ? &instance.weights_quantization_factors_memory(split) : nullptr;
57 args.output_calibration_factors = instance.output_calibration_factors_term() ? &instance.output_calibration_factors_memory(split) : nullptr;
// Returns the split value reported by _outer (forwards to _outer.get_split()).
61 virtual int32_t get_split() const override
63 return _outer.get_split();
// Returns the groups value reported by _outer (forwards to _outer.get_groups()).
66 virtual uint32_t get_groups() const override
68 return _outer.get_groups();
// Factory for convolution_gpu.
//   arg - convolution node describing the primitive, its inputs and weights.
// Steps performed by the visible code:
//   1. read the primitive attributes (split, stride, dilation, input offset,
//      groups) and the input / first-weights layouts from the node;
//   2. fill kernel_selector::convolution_params from them;
//   3. ask convolution_kernel_selector for the best kernels;
//   4. construct a convolution_gpu from the node and the first returned kernel.
// CLDNN_ERROR_BOOL is invoked with best_kernels.empty() and the message
// "Cannot find a proper kernel with these arguments".
// NOTE(review): presumably that macro aborts creation when the condition is
// true, which is what makes the later best_kernels[0] access safe - confirm
// in the error handler.
// NOTE(review): the object is allocated with `new`; the statement handing it
// to the caller is not visible in this view - confirm ownership and the
// return path against the complete file.
73 static primitive_impl* create(const convolution_node &arg)
// Primitive description plus the layouts of the input and of weights(0).
75 const auto& primitive = arg.get_primitive();
76 const auto& input_layout = arg.input().get_output_layout();
77 const auto& weights_layout = arg.weights(0).get_output_layout();
78 const auto& weights_size = weights_layout.size;
// Convolution attributes taken directly from the primitive.
80 const auto& split = primitive->split();
81 const auto& stride = primitive->stride;
82 const auto& dilation = primitive->dilation;
83 const auto& input_offset = primitive->input_offset;
84 const auto& groups = primitive->groups;
// With the depthwise-separable optimization enabled the effective split is
// forced to 1; otherwise the primitive's own split is used.
86 const auto depthwise_separable_opt = arg.get_depthwise_sep_opt();
87 const auto actual_split = depthwise_separable_opt ? (decltype(split))1 : split;
89 const auto transposed = arg.get_transposed();
// Sanity check via assert: output features divided by the primitive's split
// must equal the batch dimension of the first weights layout.
91 assert(arg.get_output_layout().size.feature[0] / primitive->split() == weights_layout.size.batch[0]);
// Default params. The second argument is `groups` when groups > 1 and the
// depthwise-separable optimization is off, otherwise the effective split.
93 auto conv_params = get_weights_bias_default_params<kernel_selector::convolution_params>(arg, (groups > 1 && !depthwise_separable_opt) ? groups : actual_split, groups);
94 auto conv_optional_params = get_default_weights_bias_optional_params<kernel_selector::convolution_optional_params>(arg.get_program());
// NOTE(review): presumably tensor::max clamps each component of the input
// offset to be >= 0 - confirm its semantics. When the result is non-zero,
// the first input tensor description is rebuilt with that offset, using the
// same groups-or-split selection as above.
96 const auto additional_offset = tensor::max(input_offset, 0);
97 if (additional_offset != 0)
99 conv_params.inputs[0] = convert_data_tensor(input_layout, (groups > 1 && !depthwise_separable_opt) ? groups : actual_split, additional_offset);
// Activation parameters are converted only when the primitive requests activation.
102 if(primitive->with_activation)
103 convert_activation_func_params(primitive, conv_params.activation);
105 conv_params.depthwise_separable_opt = depthwise_separable_opt;
106 conv_params.transposed = transposed;
// Flagged as a local convolution when either local dimension of the weights exceeds 1.
108 conv_params.local_convolution = weights_size.local[0] > 1 || weights_size.local[1] > 1;
// Note: the primitive's split is stored here, not actual_split.
109 conv_params.split = split;
110 conv_params.groups = groups;
// Filter size from the first two spatial dimensions of the weights.
111 conv_params.filterSize = {
112 (uint32_t)weights_size.spatial[0],
113 (uint32_t)weights_size.spatial[1],
// Padding is the negated input offset clamped at zero, so a negative offset
// becomes positive padding and a non-negative offset yields no padding.
116 conv_params.padding = {
117 (uint32_t)std::max(-input_offset.spatial[0], 0),
118 (uint32_t)std::max(-input_offset.spatial[1], 0)
// Stride and dilation from the first two spatial dimensions.
121 conv_params.stride = {
122 (uint32_t)stride.spatial[0],
123 (uint32_t)stride.spatial[1]
125 conv_params.dilation = {
126 (uint32_t)dilation.spatial[0],
127 (uint32_t)dilation.spatial[1]
// int8 quantization setup, driven by the presence of weights quantization
// factors on the primitive.
// NOTE(review): statement grouping under this `if` is not visible in this
// view - confirm which of the following assignments it guards.
130 if (primitive->weights_quantization_factors.size() > 0)
132 conv_params.int8_quantization = true;
133 conv_params.weights_quantization_factors.push_back(convert_data_tensor(arg.weights_quantization_factors().get_output_layout()).FlattenFeatureAndSpatials());
134 conv_params.input_quantization_factor = arg.get_input_qf();
// Output calibration setup, driven by the presence of output calibration
// factors on the primitive.
// NOTE(review): statement grouping is not visible here either; in
// particular, confirm whether the output_quantization_factor assignment
// below is unconditional or an alternative branch.
136 if (primitive->output_calibration_factors.size() > 0)
138 conv_params.output_calibration = true;
139 conv_params.output_calibration_factors.push_back(convert_data_tensor(arg.output_calibration_factors().get_output_layout()).FlattenFeatureAndSpatials());
142 conv_params.output_quantization_factor = arg.get_output_qf();
// Kernel selection.
145 auto& kernel_selector = kernel_selector::convolution_kernel_selector::Instance();
// In tune-and-cache tuning mode a kernel_runner built on the program's
// engine is attached to the optional params.
147 const auto& tuning_config = arg.get_program().get_options().get<build_option_type::tuning_config>();
149 if (tuning_config->config.mode == tuning_mode::tuning_tune_and_cache)
151 conv_optional_params.tuningParams.runner = std::make_shared<gpu::kernel_runner>(arg.get_program().get_engine(), true);
154 kernel_selector::KernelsData best_kernels = kernel_selector.GetBestKernels(conv_params, conv_optional_params);
// Report the case where the selector returned no kernel, then build the
// implementation from the first kernel in the list.
156 CLDNN_ERROR_BOOL(arg.id(), "Best_kernel.empty()", best_kernels.empty(), "Cannot find a proper kernel with these arguments");
157 auto conv = new convolution_gpu(arg, best_kernels[0]);
// Registers convolution_gpu::create in implementation_map<convolution> for
// each supported (engine, data type, format) key. Every key uses the ocl engine.
// Summary of the combinations registered below:
//   f32, f16 : yxfb, bfyx, winograd_2x3_s1_data, bf8_xy16, byxf
//   i8       : bfyx, byxf_af32, byx8_f4, fs_bs_yx_bsv4_fsv32, byxf, b_fs_yx_fsv4
//   u8       : b_fs_yx_fsv4
// NOTE(review): the scope enclosing these calls is not visible in this view;
// confirm when they are executed.
// Floating point, yxfb and bfyx layouts (plus i8 for bfyx).
166 implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::yxfb), convolution_gpu::create);
167 implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::yxfb), convolution_gpu::create);
168 implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bfyx), convolution_gpu::create);
169 implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bfyx), convolution_gpu::create);
170 implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::bfyx), convolution_gpu::create);
// Floating point, winograd_2x3_s1_data, bf8_xy16 and byxf layouts.
171 implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::winograd_2x3_s1_data), convolution_gpu::create);
172 implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::winograd_2x3_s1_data), convolution_gpu::create);
173 implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::bf8_xy16), convolution_gpu::create);
174 implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::bf8_xy16), convolution_gpu::create);
175 implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f32, format::byxf), convolution_gpu::create);
176 implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::f16, format::byxf), convolution_gpu::create);
// 8-bit integer layouts.
178 implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byxf_af32), convolution_gpu::create);
179 implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byx8_f4), convolution_gpu::create);
181 implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::fs_bs_yx_bsv4_fsv32), convolution_gpu::create);
182 implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::byxf), convolution_gpu::create);
183 implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::i8, format::b_fs_yx_fsv4), convolution_gpu::create);
184 implementation_map<convolution>::add(std::make_tuple(engine_types::ocl, data_types::u8, format::b_fs_yx_fsv4), convolution_gpu::create);