Publishing 2019 R1 content
[platform/upstream/dldt.git] / inference-engine / thirdparty / clDNN / src / gpu / custom_gpu_primitive_gpu.cpp
1 /*
2 // Copyright (c) 2016 Intel Corporation
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //      http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 */
16
17 #include "custom_gpu_primitive_inst.h"
18 #include "kernel.h"
19 #include "implementation_map.h"
20 #include "kernel_selector_helper.h"
21 #include "network_impl.h"
22 #include "engine_impl.h"
23 #include "jitter.h"
24 #include "error_handler.h"
25
26 #include <map>
27 #include <sstream>
28
29 using namespace cldnn;
30 namespace kernel_selector
31 {
32     using jit_constants = kernel_selector::JitConstants;
33 }
34
35 namespace neural
36 {
37
38 struct custom_gpu_primitive_gpu : typed_primitive_impl<custom_gpu_primitive>
39 {
40     const custom_gpu_primitive_node& outer;
41     std::shared_ptr<kernel_selector::cl_kernel_data> cl_kernel;
42     gpu::kernel _kernel;
43
44     custom_gpu_primitive_gpu(const custom_gpu_primitive_node& arg, std::shared_ptr<kernel_selector::cl_kernel_data>& cl_kernel)
45     : outer(arg)
46     , cl_kernel(cl_kernel)
47     , _kernel(arg.get_program().get_engine().get_context(), cl_kernel->kernelString, arg.get_program().get_engine().get_context()->get_configuration().dump_custom_program)
48     {}
49
50     event_impl::ptr execute_impl(const std::vector<event_impl::ptr>& events, custom_gpu_primitive_inst& instance) override
51     {
52         gpu::kernel::kernel_arguments_data args;
53         for (auto& dep : instance.dependencies())
54         {
55             args.inputs.push_back(&(dep->output_memory()));
56         }
57         args.output = &instance.output_memory();
58         _kernel.set_output_event(instance.node.is_output());
59         return _kernel.run(*cl_kernel.get(), events, args);
60     }
61 };
62
63 static kernel_selector::kernel_argument_element get_arg(cldnn_arg arg)
64 {
65     kernel_selector::kernel_argument_element ret;
66     switch (arg.arg_type)
67     {
68     case arg_input:
69         ret.t = kernel_selector::kernel_argument_types::INPUT;
70         break;
71     case arg_output:
72         ret.t = kernel_selector::kernel_argument_types::OUTPUT;
73         break;
74     default:
75         throw std::runtime_error("Unknown argument type");
76         break;
77     }
78
79     ret.index = arg.index;
80
81     return ret;
82 }
83
// Renders a single "#define <name> <value>" preprocessor line, terminated by a
// newline, for injection into the custom kernel's JIT header.
// @param name   macro identifier.
// @param value  macro replacement text (may be empty).
// @return       the complete "#define" line including the trailing '\n'.
std::string value_macro(const std::string& name, const std::string& value)
{
    std::ostringstream oss;
    // '\n' instead of std::endl: flushing an in-memory stream is pointless;
    // the produced string is identical.
    oss << "#define " << name << " " << value << '\n';
    return oss.str();
}
90
91 static void add_layout_to_jit(kernel_selector::jit_constants& mem_consts, const std::string& name, layout l) 
92 {
93     // Size (in elements)
94     // #define INPUT0_DIMS (uint[]) { b, f, y, x, }
95     mem_consts.AddConstant(kernel_selector::MakeJitConstant(name + "_DIMS", l.size.sizes(format::bfyx)));
96
97     // Data type
98     // #define INPUT0_TYPE float 
99     static const std::map<data_types, std::string> dataTypeToIndex{
100         { data_types::i8    ,"char" },
101         { data_types::u8    ,"uchar" },
102         { data_types::i32   ,"int" },
103         { data_types::i64   ,"long" },
104         { data_types::f16   ,"half" },
105         { data_types::f32   ,"float" },
106     };
107
108     if (dataTypeToIndex.find(l.data_type) == dataTypeToIndex.end()) 
109     {
110         CLDNN_ERROR_MESSAGE("add layout to jit", "Unhandled data type in layout");
111     }
112
113     mem_consts.AddConstant(kernel_selector::MakeJitConstant(name + "_TYPE", dataTypeToIndex.at(l.data_type)));
114
115     // Format
116     // #define INPUT0_FORMAT_BFYX
117     mem_consts.AddConstant(kernel_selector::MakeJitConstant(name + "_FORMAT_" + kernel_selector::toString(to_data_layout(l.format)), ""));
118
119     // Padding (in elements)
120     // #define INPUT0_LOWER_PADDING (uint[]) { 0, 0, 0, 0 }
121     // #define INPUT0_UPPER_PADDING (uint[]) { 0, 0, 0, 0 }
122     mem_consts.AddConstant(kernel_selector::MakeJitConstant(name + "_LOWER_PADDING", l.data_padding.lower_size().sizes(format::bfyx)));
123     mem_consts.AddConstant(kernel_selector::MakeJitConstant(name + "_UPPER_PADDING", l.data_padding.upper_size().sizes(format::bfyx)));
124
125     // Pitches (in elements)
126     // #define INPUT0_PITCHES (uint[]) { b, f, h, w, }
127     auto padded_sizes = l.get_buffer_size().sizes(format::bfyx);
128     
129     std::vector<tensor::value_type> pitches(4);
130     switch (l.format)
131     {
132     case format::bfyx:
133         pitches[3] = 1;
134         pitches[2] = padded_sizes[3];
135         pitches[1] = padded_sizes[2] * pitches[2];
136         pitches[0] = padded_sizes[1] * pitches[1];
137         break;
138     case format::byxf:
139         pitches[1] = 1;
140         pitches[3] = padded_sizes[1];
141         pitches[2] = padded_sizes[3] * pitches[3];
142         pitches[0] = padded_sizes[2] * pitches[2];
143         break;
144     case format::yxfb:
145         pitches[0] = 1;
146         pitches[1] = padded_sizes[0];
147         pitches[3] = padded_sizes[1] * pitches[1];
148         pitches[2] = padded_sizes[3] * pitches[3];
149         break;
150     case format::fyxb:
151         pitches[0] = 1;
152         pitches[3] = padded_sizes[0];
153         pitches[2] = padded_sizes[3] * pitches[3];
154         pitches[1] = padded_sizes[2] * pitches[2];
155         break;
156     default:
157         throw std::runtime_error("Unhandled format in pitch calculation");
158     }
159     
160     mem_consts.AddConstant(kernel_selector::MakeJitConstant(name + "_PITCHES", pitches));
161
162     // Offset (in elements)
163     // #define INPUT0_OFFSET 0
164     int32_t offset =
165         (pitches[0] * l.data_padding.lower_size().batch[0]) +
166         (pitches[1] * l.data_padding.lower_size().feature[0]) +
167         (pitches[2] * l.data_padding.lower_size().spatial[1]) +
168         (pitches[3] * l.data_padding.lower_size().spatial[0]);
169     mem_consts.AddConstant(kernel_selector::MakeJitConstant(name + "_OFFSET", std::to_string(offset)));
170 }
171
172 static std::string get_jit_constant(const custom_gpu_primitive_node& outer)
173 {
174     kernel_selector::jit_constants mem_consts{ kernel_selector::MakeJitConstant("NUM_INPUTS", std::to_string(outer.get_dependencies().size())) };
175     const auto primitive = outer.get_primitive().get();
176
177     mem_consts.AddConstants({
178         kernel_selector::MakeJitConstant("GLOBAL_WORKSIZE", primitive->gws),
179         kernel_selector::MakeJitConstant("LOCAL_WORKSIZE", primitive->lws),
180     });
181
182     for (size_t i = 0; i < outer.get_dependencies().size(); i++) 
183     {
184         add_layout_to_jit(mem_consts, "INPUT" + std::to_string(i), outer.input(i).get_output_layout());
185     }
186
187     add_layout_to_jit(mem_consts, "OUTPUT0", outer.get_output_layout());
188
189     std::ostringstream oss;
190     oss << "// Custom Layer Built-ins\n\n";
191     for (auto& definition : mem_consts.GetDefinitions())
192     {
193         oss << value_macro(definition.first, definition.second);
194     }
195
196     return oss.str();
197 }
198
199 static primitive_impl* create(const custom_gpu_primitive_node& arg)
200 {
201     const auto primitive = arg.get_primitive().get();
202     
203     auto cl_kernel = std::make_shared<kernel_selector::cl_kernel_data>();
204     cl_kernel->kernelString = std::make_shared<kernel_selector::kernel_string>();
205     cl_kernel->kernelString->entry_point = primitive->kernel_entry_point;
206     cl_kernel->kernelString->options = primitive->build_options;
207     cl_kernel->kernelString->jit = get_jit_constant(arg);
208     for (const auto& s : primitive->kernels_code)
209     {
210         cl_kernel->kernelString->str += s + "\n";
211     }
212
213     cl_kernel->workGroups.global = primitive->gws;
214     cl_kernel->workGroups.local = primitive->lws;
215
216     for (const auto& p : primitive->kernel_arguments)
217     {
218         cl_kernel->arguments.push_back(get_arg(p));
219     }
220
221     return new custom_gpu_primitive_gpu(arg, cl_kernel);
222 }
223
namespace {
    // Static registrar: constructing the file-scope `attach_impl` object at
    // program startup registers the OCL `create` factory for
    // custom_gpu_primitive in the global implementation map.
    struct attach {
        attach() {
            implementation_map<custom_gpu_primitive>::add({
                { cldnn::engine_types::ocl, create }
            });
        }
        ~attach() {}
    };
    attach attach_impl;
}
235 }