2 // Copyright (c) 2016 Intel Corporation
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
8 // http://www.apache.org/licenses/LICENSE-2.0
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
17 #include "custom_gpu_primitive_inst.h"
19 #include "implementation_map.h"
20 #include "kernel_selector_helper.h"
21 #include "network_impl.h"
22 #include "engine_impl.h"
24 #include "error_handler.h"
29 using namespace cldnn;
30 namespace kernel_selector
32 using jit_constants = kernel_selector::JitConstants;
38 struct custom_gpu_primitive_gpu : typed_primitive_impl<custom_gpu_primitive>
40 const custom_gpu_primitive_node& outer;
41 std::shared_ptr<kernel_selector::cl_kernel_data> cl_kernel;
44 custom_gpu_primitive_gpu(const custom_gpu_primitive_node& arg, std::shared_ptr<kernel_selector::cl_kernel_data>& cl_kernel)
46 , cl_kernel(cl_kernel)
47 , _kernel(arg.get_program().get_engine().get_context(), cl_kernel->kernelString, arg.get_program().get_engine().get_context()->get_configuration().dump_custom_program)
50 event_impl::ptr execute_impl(const std::vector<event_impl::ptr>& events, custom_gpu_primitive_inst& instance) override
52 gpu::kernel::kernel_arguments_data args;
53 for (auto& dep : instance.dependencies())
55 args.inputs.push_back(&(dep->output_memory()));
57 args.output = &instance.output_memory();
58 _kernel.set_output_event(instance.node.is_output());
59 return _kernel.run(*cl_kernel.get(), events, args);
63 static kernel_selector::kernel_argument_element get_arg(cldnn_arg arg)
65 kernel_selector::kernel_argument_element ret;
69 ret.t = kernel_selector::kernel_argument_types::INPUT;
72 ret.t = kernel_selector::kernel_argument_types::OUTPUT;
75 throw std::runtime_error("Unknown argument type");
79 ret.index = arg.index;
// Formats one preprocessor definition line: "#define <name> <value>\n".
// An empty `value` still produces the separating space ("#define NAME \n").
std::string value_macro(const std::string& name, const std::string& value)
{
    static const char prefix[] = "#define ";

    std::string line;
    // prefix (without its terminating NUL) + name + ' ' + value + '\n'
    line.reserve((sizeof(prefix) - 1) + name.size() + 1 + value.size() + 1);
    line += prefix;
    line += name;
    line += ' ';
    line += value;
    line += '\n';
    return line;
}
91 static void add_layout_to_jit(kernel_selector::jit_constants& mem_consts, const std::string& name, layout l)
94 // #define INPUT0_DIMS (uint[]) { b, f, y, x, }
95 mem_consts.AddConstant(kernel_selector::MakeJitConstant(name + "_DIMS", l.size.sizes(format::bfyx)));
98 // #define INPUT0_TYPE float
99 static const std::map<data_types, std::string> dataTypeToIndex{
100 { data_types::i8 ,"char" },
101 { data_types::u8 ,"uchar" },
102 { data_types::i32 ,"int" },
103 { data_types::i64 ,"long" },
104 { data_types::f16 ,"half" },
105 { data_types::f32 ,"float" },
108 if (dataTypeToIndex.find(l.data_type) == dataTypeToIndex.end())
110 CLDNN_ERROR_MESSAGE("add layout to jit", "Unhandled data type in layout");
113 mem_consts.AddConstant(kernel_selector::MakeJitConstant(name + "_TYPE", dataTypeToIndex.at(l.data_type)));
116 // #define INPUT0_FORMAT_BFYX
117 mem_consts.AddConstant(kernel_selector::MakeJitConstant(name + "_FORMAT_" + kernel_selector::toString(to_data_layout(l.format)), ""));
119 // Padding (in elements)
120 // #define INPUT0_LOWER_PADDING (uint[]) { 0, 0, 0, 0 }
121 // #define INPUT0_UPPER_PADDING (uint[]) { 0, 0, 0, 0 }
122 mem_consts.AddConstant(kernel_selector::MakeJitConstant(name + "_LOWER_PADDING", l.data_padding.lower_size().sizes(format::bfyx)));
123 mem_consts.AddConstant(kernel_selector::MakeJitConstant(name + "_UPPER_PADDING", l.data_padding.upper_size().sizes(format::bfyx)));
125 // Pitches (in elements)
126 // #define INPUT0_PITCHES (uint[]) { b, f, h, w, }
127 auto padded_sizes = l.get_buffer_size().sizes(format::bfyx);
129 std::vector<tensor::value_type> pitches(4);
134 pitches[2] = padded_sizes[3];
135 pitches[1] = padded_sizes[2] * pitches[2];
136 pitches[0] = padded_sizes[1] * pitches[1];
140 pitches[3] = padded_sizes[1];
141 pitches[2] = padded_sizes[3] * pitches[3];
142 pitches[0] = padded_sizes[2] * pitches[2];
146 pitches[1] = padded_sizes[0];
147 pitches[3] = padded_sizes[1] * pitches[1];
148 pitches[2] = padded_sizes[3] * pitches[3];
152 pitches[3] = padded_sizes[0];
153 pitches[2] = padded_sizes[3] * pitches[3];
154 pitches[1] = padded_sizes[2] * pitches[2];
157 throw std::runtime_error("Unhandled format in pitch calculation");
160 mem_consts.AddConstant(kernel_selector::MakeJitConstant(name + "_PITCHES", pitches));
162 // Offset (in elements)
163 // #define INPUT0_OFFSET 0
165 (pitches[0] * l.data_padding.lower_size().batch[0]) +
166 (pitches[1] * l.data_padding.lower_size().feature[0]) +
167 (pitches[2] * l.data_padding.lower_size().spatial[1]) +
168 (pitches[3] * l.data_padding.lower_size().spatial[0]);
169 mem_consts.AddConstant(kernel_selector::MakeJitConstant(name + "_OFFSET", std::to_string(offset)));
172 static std::string get_jit_constant(const custom_gpu_primitive_node& outer)
174 kernel_selector::jit_constants mem_consts{ kernel_selector::MakeJitConstant("NUM_INPUTS", std::to_string(outer.get_dependencies().size())) };
175 const auto primitive = outer.get_primitive().get();
177 mem_consts.AddConstants({
178 kernel_selector::MakeJitConstant("GLOBAL_WORKSIZE", primitive->gws),
179 kernel_selector::MakeJitConstant("LOCAL_WORKSIZE", primitive->lws),
182 for (size_t i = 0; i < outer.get_dependencies().size(); i++)
184 add_layout_to_jit(mem_consts, "INPUT" + std::to_string(i), outer.input(i).get_output_layout());
187 add_layout_to_jit(mem_consts, "OUTPUT0", outer.get_output_layout());
189 std::ostringstream oss;
190 oss << "// Custom Layer Built-ins\n\n";
191 for (auto& definition : mem_consts.GetDefinitions())
193 oss << value_macro(definition.first, definition.second);
199 static primitive_impl* create(const custom_gpu_primitive_node& arg)
201 const auto primitive = arg.get_primitive().get();
203 auto cl_kernel = std::make_shared<kernel_selector::cl_kernel_data>();
204 cl_kernel->kernelString = std::make_shared<kernel_selector::kernel_string>();
205 cl_kernel->kernelString->entry_point = primitive->kernel_entry_point;
206 cl_kernel->kernelString->options = primitive->build_options;
207 cl_kernel->kernelString->jit = get_jit_constant(arg);
208 for (const auto& s : primitive->kernels_code)
210 cl_kernel->kernelString->str += s + "\n";
213 cl_kernel->workGroups.global = primitive->gws;
214 cl_kernel->workGroups.local = primitive->lws;
216 for (const auto& p : primitive->kernel_arguments)
218 cl_kernel->arguments.push_back(get_arg(p));
221 return new custom_gpu_primitive_gpu(arg, cl_kernel);
227 implementation_map<custom_gpu_primitive>::add({
228 { cldnn::engine_types::ocl, create }