2 // Copyright (c) 2016 Intel Corporation
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
8 // http://www.apache.org/licenses/LICENSE-2.0
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
17 ///////////////////////////////////////////////////////////////////////////////////////////////////
18 #include "ocl_toolkit.h"
19 #include "ocl_base_event.h"
20 #include "ocl_user_event.h"
21 #include "command_queues_builder.h"
22 #include "events_pool.h"
30 // NOTE: Due to buggy scope transition of warnings we need to disable warning in place of use/instantiation
31 // of some types (even though we already disabled them in scope of definition of these types).
32 // Moreover, this warning is now mostly an annoyance: it is generated due to the lack
33 // of proper support for mangling custom GCC attributes into type names (usually when they are
34 // used with templates, even from the standard library).
35 #if defined __GNUC__ && __GNUC__ >= 6
36 #pragma GCC diagnostic ignored "-Wignored-attributes"
40 std::string ndrange_to_string(cl::NDRange const& range)
42 std::string ret = "(";
43 for (cl::size_type i = 0; i < range.dimensions(); ++i)
44 ret += (!i ? "" : ", ") + std::to_string(range.get()[i]);
50 std::string events_list_to_string(std::vector<cldnn::event_impl::ptr> events)
52 std::string ret = "(";
54 for (auto& ev : events)
56 std::string id = "unk";
57 if (auto* ocl_ev = dynamic_cast<cldnn::gpu::base_event*>(ev.get()))
58 id = std::to_string(ocl_ev->get_queue_stamp());
60 ret += (empty ? "" : ", ") + id;
69 namespace cldnn { namespace gpu {
71 ocl_error::ocl_error(cl::Error const & err) : error(err.what() + std::string(", error code: ") + std::to_string(err.err()))
75 std::shared_ptr<gpu_toolkit> gpu_toolkit::create(const configuration & cfg)
77 struct make_shared_wa : public gpu_toolkit { make_shared_wa(const configuration& cfg) : gpu_toolkit(cfg) {} };
79 return std::make_shared<make_shared_wa>(cfg);
81 catch (cl::Error const& err) {
// Holds the engine log file stream used by open_log()/log().
// NOTE(review): presumably defined out-of-line so the public header need not
// include <fstream> - confirm against ocl_toolkit.h.
struct gpu_toolkit::ocl_logger
    std::ofstream _log_file;
// Constructs the toolkit around the given configuration: builds or adopts the
// OpenCL context, caches the device extension string, creates the command
// queue(s) and - when a log file is configured - dumps the effective engine
// configuration and detected device capabilities to it.
gpu_toolkit::gpu_toolkit(const configuration& config)
    : _configuration(config)
    , _ocl_builder(config)
    // true when the caller supplied its own OpenCL context to the builder
    , _user_context(_ocl_builder.is_user_context())
    // NEO (Intel compute runtime) detected by substring match on the device
    // version string; gates host out-of-order queue usage (see
    // build_command_queues below)
    , _neo_driver(strstr(get_device_version().c_str(), "NEO") ? true : false)
    , _context(_ocl_builder.get_context())
    , _platform_id(_ocl_builder.get_platform_id())
    , _kernels_cache(*this)
    , _events_pool(new events_pool())
    // cache the extension string once; extension_supported() searches it later
    _ocl_builder.get_device().getInfo(CL_DEVICE_EXTENSIONS, &_extensions);
    build_command_queues(config);
    _logger = std::unique_ptr<ocl_logger>(new ocl_logger());
    if (logging_enabled())
        // One-time dump of the configuration followed by the detected engine
        // info; stream formatting (hex/fill/width) is reset right after the
        // vendor id line.
        << "Engine configuration:\n"
        << " profiling: " << std::boolalpha << _configuration.enable_profiling << "\n"
        << " meaningful names: " << std::boolalpha << _configuration.meaningful_kernels_names << "\n"
        << " dump custom program: " << std::boolalpha << _configuration.dump_custom_program << "\n"
        << " device type: " << std::to_string(_configuration.device_type) << "\n"
        << " vendor type: " << std::hex << std::setfill('0') << std::setw(4) << std::right
        << std::to_string(_configuration.device_vendor) << "\n"
        << std::dec << std::setfill(' ') << std::right
        << " compiler options: " << _configuration.compiler_options << "\n"
        << " single kernel name: " << _configuration.single_kernel_name << "\n"
        << " out-of-order: " << std::boolalpha << _configuration.host_out_of_order << "\n"
        << " engine log: " << _configuration.log << "\n"
        << " sources dumps: " << _configuration.ocl_sources_dumps_dir << "\n"
        << "\nEngine info:\n"
        << " device id: " << _engine_info.dev_id << "\n"
        << " cores count: " << _engine_info.cores_count << "\n"
        << " core frequencey: " << _engine_info.core_frequency << "\n"
        << " max work group size: " << _engine_info.max_work_group_size << "\n"
        << " local memory size: " << _engine_info.max_local_mem_size << "\n"
        << " fp16: " << std::boolalpha << (_engine_info.supports_fp16 != 0) << "\n"
        << " fp16 denorms: " << std::boolalpha << (_engine_info.supports_fp16_denorms != 0) << "\n"
        << " subgroups short: " << std::boolalpha << (_engine_info.supports_subgroups_short != 0) << "\n"
        << " used defined context: "<< std::boolalpha << _user_context << "\n"
136 void gpu_toolkit::build_command_queues(const configuration& config)
138 command_queues_builder queue_builder(_context, _ocl_builder.get_device(), _platform_id);
139 queue_builder.set_profiling(config.enable_profiling);
140 queue_builder.set_out_of_order((config.host_out_of_order && _neo_driver));
142 bool priorty_extensions = extension_supported("cl_khr_priority_hints") && extension_supported("cl_khr_create_command_queue");
143 queue_builder.set_priority_mode(config.priority_mode, priorty_extensions);
145 bool throttle_extensions = extension_supported("cl_khr_throttle_hints") && extension_supported("cl_khr_create_command_queue");
146 queue_builder.set_throttle_mode(config.throttle_mode, throttle_extensions);
148 queue_builder.build();
150 _command_queue = queue_builder.queue();
// Enqueues an NDRange kernel on the toolkit queue and returns a pooled event
// stamped with the command's position in the queue.
// In in-order mode the dependency list is translated into raw cl::Events and
// handed to the driver; in host out-of-order mode ordering was already
// established via barriers (see sync_events), so no wait list is passed.
event_impl::ptr gpu_toolkit::enqueue_kernel(cl::Kernel const& kern, cl::NDRange const& global, cl::NDRange const& local, std::vector<event_impl::ptr> const & deps)
    std::vector<cl::Event> dep_events;
    auto dep_events_ptr = &dep_events;
    if (!_configuration.host_out_of_order)
        // collect raw cl::Events from deps; non-base_event deps are skipped
        for (auto& dep : deps)
            if (auto ocl_ev = dynamic_cast<base_event*>(dep.get()))
                dep_events.push_back(ocl_ev->get());
        // (else branch, host out-of-order:) no explicit wait list for the driver
        dep_events_ptr = nullptr;
    // request an output cl::Event only when someone can observe it: in-order
    // mode, explicit output events, or profiling enabled
    if (!_configuration.host_out_of_order || _output_event || _configuration.enable_profiling)
        _command_queue.enqueueNDRangeKernel(kern, cl::NullRange, global, local, dep_events_ptr, &ret_ev);
        _command_queue.enqueueNDRangeKernel(kern, cl::NullRange, global, local, dep_events_ptr, nullptr);
    catch (cl::Error const& err) {
        throw ocl_error(err);
    if (logging_enabled())
        // log kernel name, global/local work sizes and the dependency stamps
        auto msg = kern.getInfo<CL_KERNEL_FUNCTION_NAME>() + ", gws: " + ndrange_to_string(global) + ", lws: " + ndrange_to_string(local) + ", deps: ";
        if (_configuration.host_out_of_order)
            msg += events_list_to_string(deps);
        // _queue_counter + 1 is the stamp this command receives below
        log(_queue_counter + 1, msg);
    return _events_pool->get_from_base_pool(shared_from_this(), ret_ev, ++_queue_counter);
// Enqueues a marker representing completion of 'deps' and returns it as a
// pooled event. In in-order mode a clEnqueueMarkerWithWaitList carries the
// dependency list; in host out-of-order mode the event of the last barrier
// is reused instead (barriers already order the queue - see sync_events).
event_impl::ptr gpu_toolkit::enqueue_marker(std::vector<event_impl::ptr> const& deps)
    // NOTE(review): the guard for this early return is on a line not visible
    // here - presumably "deps is empty", yielding an already-set user event
    return _events_pool->get_from_user_pool(shared_from_this(), true);
    if (!_configuration.host_out_of_order)
        if (!enabled_single_kernel())
            // translate wrappers into raw cl::Events for the marker wait list
            std::vector<cl::Event> dep_events;
            for (auto& dep : deps)
                if (auto ocl_ev = dynamic_cast<base_event*>(dep.get()))
                    dep_events.push_back(ocl_ev->get());
            _command_queue.enqueueMarkerWithWaitList(&dep_events, &ret_ev);
        catch (cl::Error const& err) {
            throw ocl_error(err);
        // single-kernel debug mode: marker without an explicit wait list
        _command_queue.enqueueMarkerWithWaitList(nullptr, &ret_ev);
        catch (cl::Error const& err) {
            throw ocl_error(err);
        if (logging_enabled())
            log(_queue_counter + 1, "Marker with dependencies: " + events_list_to_string(deps));
        return _events_pool->get_from_base_pool(shared_from_this(), ret_ev, ++_queue_counter);
    // host out-of-order path: hand back the last barrier's event and stamp
    return _events_pool->get_from_base_pool(shared_from_this(), _last_barrier_ev, _last_barrier);
240 event_impl::ptr gpu_toolkit::group_events(std::vector<event_impl::ptr> const& deps)
242 return _events_pool->get_from_group_pool(shared_from_this(), deps);
245 event_impl::ptr gpu_toolkit::create_user_event(bool set)
247 return _events_pool->get_from_user_pool(shared_from_this(), set);
250 void gpu_toolkit::reset_events()
252 _events_pool->reset_events();
255 void gpu_toolkit::release_events_pool()
257 _events_pool.reset();
// Flushes queued work to the device.
// NOTE(review): the body is truncated in this view - only the logging guard
// is visible; the log line and the actual queue flush call follow on lines
// outside this chunk. Confirm against the full file.
void gpu_toolkit::flush()
    if (logging_enabled())
// Attempts to force the driver to release pending memory by creating - and
// immediately dropping - a CL_MEM_USE_HOST_PTR buffer over a 4 KiB-aligned
// host allocation.
// TODO: Temp. solution, until proper API calls from OpenCL are released.
void gpu_toolkit::release_pending_memory()
    // 4 KiB allocation, 4 KiB aligned (page-sized); 'ptr' is declared on a
    // line not visible in this chunk
    ptr = _mm_malloc(4096, 4096);
    cl::Buffer flusher(_context, CL_MEM_USE_HOST_PTR, (size_t)4096, ptr);
    flusher = (cl_mem)nullptr; //clear buffer
    // NOTE(review): the matching _mm_free for 'ptr' is not visible here -
    // confirm the allocation is released in the full file
287 void gpu_toolkit::wait_for_events(std::vector<event_impl::ptr> const & events)
289 if (logging_enabled())
290 log(0, "Wait for events: " + events_list_to_string(events));
292 std::vector<cl::Event> clevents;
293 for (auto& ev : events)
294 if (auto ocl_ev = dynamic_cast<base_event*>(ev.get()))
295 clevents.push_back(ocl_ev->get());
298 cl::WaitForEvents(clevents);
300 catch (cl::Error const& err) {
301 throw ocl_error(err);
// Appends one "[id] msg" line to the engine log file; no-op when no log path
// was configured. std::endl flushes the stream, so every entry is written out
// immediately.
void gpu_toolkit::log(uint64_t id, std::string const & msg)
    if (_configuration.log.empty())
    open_log() << "[" << id << "] " << msg << std::endl;
// In host out-of-order mode, inserts a barrier into the queue when any
// dependency was enqueued after the previous barrier, so subsequently
// enqueued commands are ordered after it. For in-order queues ordering is
// implicit and this is skipped (the early return is on a line not visible
// here).
void gpu_toolkit::sync_events(std::vector<event_impl::ptr> const & deps)
    if (!_configuration.host_out_of_order)
    bool needs_barrier = false;
    for (auto& dep : deps)
        // NOTE(review): the dynamic_cast result is dereferenced without a
        // null check - a dependency that is not an ocl_base_event would crash
        // here; confirm all deps are ocl_base_event instances
        auto* ocl_ev = dynamic_cast<ocl_base_event*>(dep.get());
        // stamps newer than the last barrier mean the barrier no longer
        // covers this dependency
        if (ocl_ev->get_queue_stamp() > _last_barrier)
            needs_barrier = true;
    // one branch records the barrier's cl::Event, the other enqueues it
    // anonymously; the selecting condition is on a line not visible here
    _command_queue.enqueueBarrierWithWaitList(nullptr, &_last_barrier_ev);
    _command_queue.enqueueBarrierWithWaitList(nullptr, nullptr);
    catch (cl::Error const& err) {
        throw ocl_error(err);
    // remember the barrier's queue position so redundant barriers are skipped
    _last_barrier = ++_queue_counter;
    if (logging_enabled())
        log(_last_barrier, "Barrier");
351 std::ofstream& gpu_toolkit::open_log()
353 if (!_logger->_log_file.is_open())
355 _logger->_log_file.open(_configuration.log, std::ios::out | std::ios::trunc);
356 if (!_logger->_log_file.good())
357 throw std::runtime_error("Could not initialize ocl_toolkit log file");
358 if (!_logger->_log_file.is_open())
360 throw std::runtime_error("Could not open ocl_toolkit log file '" + _configuration.log + "' for writing");
364 return _logger->_log_file;