}
bool base_events::get_profiling_info_impl(std::list<instrumentation::profiling_interval>& info) {
    // For every profiling period (i.e. submission / starting / executing), report
    // the total length of the union of the per-event [start, end] intervals —
    // the sum of all disjoint durations of the period's projection on the time axis.
    //
    // The previous in-place merge appended unmerged intervals at the end of the
    // vector (leaving it unsorted) and, when absorbing later intervals into an
    // already-merged one, extended only the right edge via `it--` — both of which
    // under-count the union for out-of-order events.  Instead: collect all
    // intervals, sort by start time, then do a single merge sweep.
    std::map<std::string, std::vector<std::pair<unsigned long long, unsigned long long>>> all_durations;
    for (size_t i = 0; i < _events.size(); i++) {
        auto be = dynamic_cast<base_event*>(_events[i].get());
        // Guard against a failed cast as well as events without profiling data.
        if (!be || !is_event_profiled(be->_event))
            continue;

        for (auto& period : profiling_periods) {
            cl_ulong ev_start;
            cl_ulong ev_end;
            be->_event.getProfilingInfo(period.start, &ev_start);
            be->_event.getProfilingInfo(period.stop, &ev_end);
            all_durations[period.name].emplace_back(static_cast<unsigned long long>(ev_start),
                                                    static_cast<unsigned long long>(ev_end));
        }
    }

    for (auto& period : profiling_periods) {
        auto& durations = all_durations[period.name];
        // Sorting by start time makes every overlapping (or touching) interval
        // adjacent, so one linear pass computes the union exactly.
        std::sort(durations.begin(), durations.end());

        unsigned long long sum = 0;
        bool open = false;                 // is there a currently-open merged interval?
        unsigned long long cur_start = 0;  // left edge of the open interval
        unsigned long long cur_end = 0;    // right edge of the open interval
        for (auto& duration : durations) {
            if (open && duration.first <= cur_end) {
                // Overlaps or touches the open interval — extend its right edge.
                cur_end = std::max(cur_end, duration.second);
            } else {
                // Disjoint — flush the open interval and start a new one.
                if (open)
                    sum += cur_end - cur_start;
                cur_start = duration.first;
                cur_end = duration.second;
                open = true;
            }
        }
        if (open)
            sum += cur_end - cur_start;

        info.push_back(get_profiling_interval(period.name, 0, sum));
    }
    return true;
}
struct base_events : virtual public ocl_base_event {
public:
base_events(std::shared_ptr<gpu_toolkit> ctx, std::vector<event_impl::ptr> const& ev)
- : ocl_base_event(0, true), _ctx(ctx), _events(ev) {
+ : ocl_base_event(0, true), _ctx(ctx) {
+ for (size_t i = 0; i < ev.size(); i++) {
+ auto multiple_events = dynamic_cast<base_events*>(ev[i].get());
+ if (multiple_events) {
+ for (size_t j = 0; j < multiple_events->_events.size(); j++) {
+ _events.push_back(multiple_events->_events[j]);
+ }
+ } else {
+ _events.push_back(ev[i]);
+ }
+ }
set_queue_stamp();
}
void attach_events(const std::vector<event_impl::ptr>& ev) {
if (_attached)
throw std::runtime_error("Trying to attach events to valid event object.");
- _events = ev;
+ for (size_t i = 0; i < ev.size(); i++) {
+ auto multiple_events = dynamic_cast<base_events*>(ev[i].get());
+ if (multiple_events) {
+ for (size_t j = 0; j < multiple_events->_events.size(); j++) {
+ _events.push_back(multiple_events->_events[j]);
+ }
+ } else {
+ _events.push_back(ev[i]);
+ }
+ }
_attached = true;
set_queue_stamp();
}
std::shared_ptr<gpu_toolkit> get_context() const { return _ctx; }
+ const std::vector<event_impl::ptr>& get_events() { return _events; };
private:
void set_queue_stamp() {
uint64_t _queue_stamp_max = 0;
for (size_t i = 0; i < _events.size(); i++) {
- auto* _base_event = dynamic_cast<base_event*>(_events[i].get());
+ auto* _base_event = dynamic_cast<ocl_base_event*>(_events[i].get());
if (_base_event->get_queue_stamp() > _queue_stamp_max)
_queue_stamp_max = _base_event->get_queue_stamp();
}
std::vector<cl::Event> dep_events;
auto dep_events_ptr = &dep_events;
if (!context()->get_configuration().host_out_of_order) {
- for (auto& dep : deps)
- if (auto ocl_ev = dynamic_cast<base_event*>(dep.get()))
- dep_events.push_back(ocl_ev->get());
+ for (auto& dep : deps) {
+ auto multiple_events = dynamic_cast<base_events*>(dep.get());
+ if (multiple_events) {
+ for (size_t i = 0; i < multiple_events->get_events().size(); i++) {
+ if (auto base_ev = dynamic_cast<base_event*>(multiple_events->get_events()[i].get()))
+ dep_events.push_back(base_ev->get());
+ }
+ } else {
+ if (auto base_ev = dynamic_cast<base_event*>(dep.get()))
+ dep_events.push_back(base_ev->get());
+ }
+ }
} else {
dep_events_ptr = nullptr;
cl::Event ret_ev;
if (!enabled_single_kernel) {
std::vector<cl::Event> dep_events;
- for (auto& dep : deps)
- if (auto ocl_ev = dynamic_cast<base_event*>(dep.get()))
- dep_events.push_back(ocl_ev->get());
+ for (auto& dep : deps) {
+ auto multiple_events = dynamic_cast<base_events*>(dep.get());
+ if (multiple_events) {
+ for (size_t i = 0; i < multiple_events->get_events().size(); i++) {
+ if (auto base_ev = dynamic_cast<base_event*>(multiple_events->get_events()[i].get()))
+ dep_events.push_back(base_ev->get());
+ }
+ } else {
+ if (auto base_ev = dynamic_cast<base_event*>(dep.get()))
+ dep_events.push_back(base_ev->get());
+ }
+ }
try {
_command_queue.enqueueMarkerWithWaitList(&dep_events, &ret_ev);
void gpu_toolkit::wait_for_events(std::vector<event_impl::ptr> const& events) {
std::vector<cl::Event> clevents;
- for (auto& ev : events)
- if (auto ocl_ev = dynamic_cast<base_event*>(ev.get()))
- clevents.push_back(ocl_ev->get());
+ for (auto& ev : events) {
+ auto multiple_events = dynamic_cast<base_events*>(ev.get());
+ if (multiple_events) {
+ for (size_t i = 0; i < multiple_events->get_events().size(); i++) {
+ if (auto base_ev = dynamic_cast<base_event*>(multiple_events->get_events()[i].get()))
+ clevents.push_back(base_ev->get());
+ }
+ } else {
+ if (auto base_ev = dynamic_cast<base_event*>(ev.get()))
+ clevents.push_back(base_ev->get());
+ }
+ }
try {
cl::WaitForEvents(clevents);
// Default implementation: depthwise-separable optimization is reported as
// disabled unless a derived class overrides this query.
virtual bool get_depthwise_sep_opt() const { return false; }
// Combines the given events into a single event the caller can wait on.
// One event needs no aggregation; two or more are grouped through the engine
// context; an empty list falls back to an events_waiter run.
event_impl::ptr aggregate_events(const std::vector<event_impl::ptr>& events,
                                 uint32_t net_id) const {
    if (events.size() == 1)
        return events[0];
    if (events.empty())
        return events_waiter(_outer.get_program().get_engine().get_context()).run(net_id, events);
    return _outer.get_program().get_engine().get_context()->group_events(net_id, events);
}
std::vector<event_impl::ptr> tmp_events(events);
+ std::vector<event_impl::ptr> all_events;
// TODO - split should be handle in kernel selector by providing multiple kernels.
auto split = get_split();
auto event = _kernels[k].run(net_id, _kernel_data.kernels[k], tmp_events);
new_events.push_back(event);
+ all_events.push_back(event);
}
tmp_events = new_events;
}
- bool group_events = split > 1 ? true : false;
- return aggregate_events(tmp_events, net_id, group_events);
+ if ((all_events.size() == 0) && (tmp_events.size() > 0))
+ return aggregate_events(tmp_events, net_id);
+
+ return aggregate_events(all_events, net_id);
}
};