// Publishing 2019 R1 content
// [platform/upstream/dldt.git] / inference-engine / thirdparty / clDNN / src / gpu / ocl_toolkit.cpp
1 /*
2 // Copyright (c) 2016 Intel Corporation
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //      http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 */
16
17 ///////////////////////////////////////////////////////////////////////////////////////////////////
18 #include "ocl_toolkit.h"
19 #include "ocl_base_event.h"
20 #include "ocl_user_event.h"
21 #include "command_queues_builder.h"
22 #include "events_pool.h"
23
24 #include <cassert>
25 #include <iomanip>
26 #include <ios>
27
28 #include <fstream>
29
30 // NOTE: Due to buggy scope transition of warnings we need to disable warning in place of use/instantation
31 //       of some types (even though we already disabled them in scope of definition of these types).
32 //       Moreover this warning is pretty much now only for annoyance: it is generated due to lack
33 //       of proper support for mangling of custom GCC attributes into type name (usually when used
34 //       with templates, even from standard library).
35 #if defined __GNUC__ && __GNUC__ >= 6
36     #pragma GCC diagnostic ignored "-Wignored-attributes"
37 #endif
38
39 namespace {
40     std::string ndrange_to_string(cl::NDRange const& range)
41     {
42         std::string ret = "(";
43         for (cl::size_type i = 0; i < range.dimensions(); ++i)
44             ret += (!i ? "" : ", ") + std::to_string(range.get()[i]);
45
46         ret += ")";
47         return ret;
48     }
49
50     std::string events_list_to_string(std::vector<cldnn::event_impl::ptr> events)
51     {
52         std::string ret = "(";
53         bool empty = true;
54         for (auto& ev : events)
55         {
56             std::string id = "unk";
57             if (auto* ocl_ev = dynamic_cast<cldnn::gpu::base_event*>(ev.get()))
58                 id = std::to_string(ocl_ev->get_queue_stamp());
59
60             ret += (empty ? "" : ", ") + id;
61             empty = false;
62         }
63
64         ret += ")";
65         return ret;
66     }
67 }
68
69 namespace cldnn { namespace gpu {
70
71 ocl_error::ocl_error(cl::Error const & err) : error(err.what() + std::string(", error code: ") + std::to_string(err.err()))
72 {
73 }
74
75 std::shared_ptr<gpu_toolkit> gpu_toolkit::create(const configuration & cfg)
76 {
77     struct make_shared_wa : public gpu_toolkit { make_shared_wa(const configuration& cfg) : gpu_toolkit(cfg) {} };
78     try {
79         return std::make_shared<make_shared_wa>(cfg);
80     }
81     catch (cl::Error const& err) {
82         throw ocl_error(err);
83     }
84 }
85
// Holder for the toolkit's log stream — defined in the .cpp (presumably only
// forward-declared in ocl_toolkit.h; confirm against the header).
struct gpu_toolkit::ocl_logger
{
    // Opened lazily by open_log(); stays open for the toolkit's lifetime.
    std::ofstream _log_file;
};
90
91 gpu_toolkit::gpu_toolkit(const configuration& config)
92     : _configuration(config)
93     , _ocl_builder(config)
94     , _user_context(_ocl_builder.is_user_context())
95     , _neo_driver(strstr(get_device_version().c_str(), "NEO") ? true : false)
96     , _context(_ocl_builder.get_context())
97     , _platform_id(_ocl_builder.get_platform_id())
98     , _engine_info(*this)
99     , _kernels_cache(*this)
100     , _events_pool(new events_pool())
101 {
102     _ocl_builder.get_device().getInfo(CL_DEVICE_EXTENSIONS, &_extensions);
103     build_command_queues(config);
104
105     _logger = std::unique_ptr<ocl_logger>(new ocl_logger());
106     if (logging_enabled())
107     {
108         open_log()
109             << "Engine configuration:\n"
110             << "    profiling: "           << std::boolalpha << _configuration.enable_profiling << "\n"
111             << "    meaningful names: "    << std::boolalpha << _configuration.meaningful_kernels_names << "\n"
112             << "    dump custom program: " << std::boolalpha << _configuration.dump_custom_program << "\n"
113             << "    device type: "         << std::to_string(_configuration.device_type) << "\n"
114             << "    vendor type: "         << std::hex << std::setfill('0') << std::setw(4) << std::right
115                                            << std::to_string(_configuration.device_vendor) << "\n"
116                                            << std::dec << std::setfill(' ') << std::right
117             << "    compiler options: "    << _configuration.compiler_options << "\n"
118             << "    single kernel name: "  << _configuration.single_kernel_name << "\n"
119             << "    out-of-order: "        << std::boolalpha << _configuration.host_out_of_order << "\n"
120             << "    engine log: "          << _configuration.log << "\n"
121             << "    sources dumps: "       << _configuration.ocl_sources_dumps_dir << "\n"
122             << "\nEngine info:\n"
123             << "    device id: "           << _engine_info.dev_id << "\n"
124             << "    cores count: "         << _engine_info.cores_count << "\n"
125             << "    core frequencey: "     << _engine_info.core_frequency << "\n"
126             << "    max work group size: " << _engine_info.max_work_group_size << "\n"
127             << "    local memory size: "   << _engine_info.max_local_mem_size << "\n"
128             << "    fp16: "                << std::boolalpha << (_engine_info.supports_fp16 != 0) << "\n"
129             << "    fp16 denorms: "        << std::boolalpha << (_engine_info.supports_fp16_denorms != 0) << "\n"
130             << "    subgroups short: "     << std::boolalpha << (_engine_info.supports_subgroups_short != 0) << "\n"
131             << "    used defined context: "<< std::boolalpha << _user_context << "\n"
132             << std::endl;
133     }
134 }
135
136 void gpu_toolkit::build_command_queues(const configuration& config)
137 {
138     command_queues_builder queue_builder(_context, _ocl_builder.get_device(), _platform_id);
139     queue_builder.set_profiling(config.enable_profiling);
140     queue_builder.set_out_of_order((config.host_out_of_order && _neo_driver));
141
142     bool priorty_extensions = extension_supported("cl_khr_priority_hints") && extension_supported("cl_khr_create_command_queue");
143     queue_builder.set_priority_mode(config.priority_mode, priorty_extensions);
144
145     bool throttle_extensions = extension_supported("cl_khr_throttle_hints") && extension_supported("cl_khr_create_command_queue");
146     queue_builder.set_throttle_mode(config.throttle_mode, throttle_extensions);
147
148     queue_builder.build();
149
150     _command_queue = queue_builder.queue();
151 }
152
// Enqueues a kernel and returns a pooled event tracking its completion.
// In in-order mode, dependencies become an explicit OpenCL wait list; in
// host-out-of-order mode, ordering is enforced with barriers via sync_events
// and no wait list is passed.
event_impl::ptr gpu_toolkit::enqueue_kernel(cl::Kernel const& kern, cl::NDRange const& global, cl::NDRange const& local, std::vector<event_impl::ptr> const & deps)
{
    std::vector<cl::Event> dep_events;
    auto dep_events_ptr = &dep_events;
    if (!_configuration.host_out_of_order)
    {
        // Collect native events; deps that are not base_event are skipped.
        for (auto& dep : deps)
            if (auto ocl_ev = dynamic_cast<base_event*>(dep.get()))
                dep_events.push_back(ocl_ev->get());
    }
    else
    {
        // Out-of-order queue: barriers (enqueued by sync_events) replace
        // per-kernel wait lists.
        dep_events_ptr = nullptr;
        sync_events(deps);
    }

    cl::Event ret_ev;
    try {
        // Request a return event only when something will consume it
        // (in-order mode, explicit output events, or profiling); otherwise
        // the enqueue is fire-and-forget.
        if (!_configuration.host_out_of_order || _output_event || _configuration.enable_profiling)
        {
            _command_queue.enqueueNDRangeKernel(kern, cl::NullRange, global, local, dep_events_ptr, &ret_ev);
        }
        else
        {
            _command_queue.enqueueNDRangeKernel(kern, cl::NullRange, global, local, dep_events_ptr, nullptr);
        }
    }
    catch (cl::Error const& err) {
        throw ocl_error(err);
    }

    if (logging_enabled())
    {
        auto msg = kern.getInfo<CL_KERNEL_FUNCTION_NAME>() + ", gws: " + ndrange_to_string(global) + ", lws: " + ndrange_to_string(local) + ", deps: ";
        if (_configuration.host_out_of_order)
            msg += "()";
        else
            msg += events_list_to_string(deps);

        // _queue_counter is incremented in the return statement below; log
        // with the stamp the new event will actually receive.
        log(_queue_counter + 1, msg);
    }
    return _events_pool->get_from_base_pool(shared_from_this(), ret_ev, ++_queue_counter);
}
196
197 event_impl::ptr gpu_toolkit::enqueue_marker(std::vector<event_impl::ptr> const& deps)
198 {
199     if (deps.empty())
200         return _events_pool->get_from_user_pool(shared_from_this(), true);
201
202     if (!_configuration.host_out_of_order)
203     {
204         cl::Event ret_ev;
205         if (!enabled_single_kernel())
206         {
207             std::vector<cl::Event> dep_events;
208             for (auto& dep : deps)
209                 if (auto ocl_ev = dynamic_cast<base_event*>(dep.get()))
210                     dep_events.push_back(ocl_ev->get());
211
212             try {
213                 _command_queue.enqueueMarkerWithWaitList(&dep_events, &ret_ev);
214             }
215             catch (cl::Error const& err) {
216                 throw ocl_error(err);
217             }
218         }
219         else
220         {
221             try {
222                 _command_queue.enqueueMarkerWithWaitList(nullptr, &ret_ev);
223             }
224             catch (cl::Error const& err) {
225                 throw ocl_error(err);
226             }
227         }
228
229         if (logging_enabled())
230             log(_queue_counter + 1, "Marker with dependencies: " + events_list_to_string(deps));
231         return _events_pool->get_from_base_pool(shared_from_this(), ret_ev, ++_queue_counter);
232     }
233     else
234     {
235         sync_events(deps);
236         return _events_pool->get_from_base_pool(shared_from_this(), _last_barrier_ev, _last_barrier);
237     }
238 }
239
240 event_impl::ptr gpu_toolkit::group_events(std::vector<event_impl::ptr> const& deps)
241 {
242     return _events_pool->get_from_group_pool(shared_from_this(), deps);
243 }
244
245 event_impl::ptr gpu_toolkit::create_user_event(bool set)
246 {
247     return _events_pool->get_from_user_pool(shared_from_this(), set);
248 }
249
250 void gpu_toolkit::reset_events()
251 {
252     _events_pool->reset_events();
253 }
254
255 void gpu_toolkit::release_events_pool()
256 {
257     _events_pool.reset();
258 }
259
260 void gpu_toolkit::flush()
261 {
262     if (logging_enabled())
263         log(0, "Flush");
264     queue().flush();
265 }
266 void gpu_toolkit::release_pending_memory()
267 {
268     /*
269     TODO: Temp. solution, untill proper API calls from OpenCL are released.
270     */
271     void* ptr = nullptr;
272     ptr = _mm_malloc(4096, 4096);
273     queue().finish();
274     try
275     {
276         cl::Buffer flusher(_context, CL_MEM_USE_HOST_PTR, (size_t)4096, ptr);
277         flusher = (cl_mem)nullptr; //clear buffer
278     }
279     catch (...)
280     {
281         _mm_free(ptr);
282         throw;
283     }
284     _mm_free(ptr);
285 }
286
287 void gpu_toolkit::wait_for_events(std::vector<event_impl::ptr> const & events)
288 {
289     if (logging_enabled())
290         log(0, "Wait for events: " + events_list_to_string(events));
291
292     std::vector<cl::Event> clevents;
293     for (auto& ev : events)
294         if (auto ocl_ev = dynamic_cast<base_event*>(ev.get()))
295             clevents.push_back(ocl_ev->get());
296
297     try {
298         cl::WaitForEvents(clevents);
299     }
300     catch (cl::Error const& err) {
301         throw ocl_error(err);
302     }
303 }
304
305 void gpu_toolkit::log(uint64_t id, std::string const & msg)
306 {
307     if (_configuration.log.empty())
308         return;
309
310     open_log() << "[" << id << "] " << msg << std::endl;
311 }
312
313 void gpu_toolkit::sync_events(std::vector<event_impl::ptr> const & deps)
314 {
315     if (!_configuration.host_out_of_order)
316         return;
317
318     bool needs_barrier = false;
319     for (auto& dep : deps)
320     {
321         auto* ocl_ev = dynamic_cast<ocl_base_event*>(dep.get());
322         if (ocl_ev->get_queue_stamp() > _last_barrier)
323         {
324             needs_barrier = true;
325         }
326     }
327
328     if (needs_barrier)
329     {
330         try {
331             if (_output_event)
332             {
333                 _command_queue.enqueueBarrierWithWaitList(nullptr, &_last_barrier_ev);
334             }
335             else
336             {
337                 _command_queue.enqueueBarrierWithWaitList(nullptr, nullptr);
338             }
339
340         }
341         catch (cl::Error const& err) {
342             throw ocl_error(err);
343         }
344
345         _last_barrier = ++_queue_counter;
346         if (logging_enabled())
347             log(_last_barrier, "Barrier");
348     }
349 }
350
351 std::ofstream& gpu_toolkit::open_log()
352 {
353     if (!_logger->_log_file.is_open())
354     {
355         _logger->_log_file.open(_configuration.log, std::ios::out | std::ios::trunc);
356         if (!_logger->_log_file.good())
357             throw std::runtime_error("Could not initialize ocl_toolkit log file");
358         if (!_logger->_log_file.is_open())
359         {
360             throw std::runtime_error("Could not open ocl_toolkit log file '" + _configuration.log + "' for writing");
361         }
362     }
363
364     return _logger->_log_file;
365 }
366
367 }
368
369 }