From 4d138307f4b532e5cd2e3164f18b926fd0434281 Mon Sep 17 00:00:00 2001
From: Jedrzej Hajduczenia <jedrzej.hajduczenia@intel.com>
Date: Tue, 13 Oct 2020 15:40:30 +0200
Subject: [PATCH] [IE CLDNN] Revert fix for profiling of multiple kernel
 implementations (#2219) (#2595)

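---
Reviewer summary (text placed after the "---" separator is not part of the
commit message and is ignored when the patch is applied):
  * ocl_base_event.cpp: base_events::get_profiling_info_impl() no longer
    merges overlapping per-period intervals. It again tracks the minimum
    QUEUED / SUBMIT / START timestamps over all profiled events and sums
    (END - START) of every profiled event as the execution time.
  * ocl_base_event.h: the base_events constructor and attach_events() store
    the passed events as-is instead of flattening nested base_events,
    get_events() is removed, and set_queue_stamp() casts each stored event
    to base_event instead of ocl_base_event.
  * ocl_queue_wrapper.cpp, ocl_toolkit.cpp: enqueue_kernel(), enqueue_marker()
    and wait_for_events() collect cl::Event handles only from events that
    cast directly to base_event; nested base_events are no longer expanded.
  * primitive_gpu_base.h: aggregate_events() takes a `group` flag again
    (default false, set by the caller when split > 1), and the caller passes
    the final tmp_events instead of a list of all events.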
 .../thirdparty/clDNN/src/gpu/ocl_base_event.cpp    | 87 +++++++---------------
 .../thirdparty/clDNN/src/gpu/ocl_base_event.h      | 26 +------
 .../thirdparty/clDNN/src/gpu/ocl_queue_wrapper.cpp | 30 ++------
 .../thirdparty/clDNN/src/gpu/ocl_toolkit.cpp       | 15 +---
 .../thirdparty/clDNN/src/gpu/primitive_gpu_base.h  | 13 ++--
 5 files changed, 44 insertions(+), 127 deletions(-)
diff --git a/inference-engine/thirdparty/clDNN/src/gpu/ocl_base_event.cpp b/inference-engine/thirdparty/clDNN/src/gpu/ocl_base_event.cpp
index 51ba350..915eb79 100644
--- a/inference-engine/thirdparty/clDNN/src/gpu/ocl_base_event.cpp
+++ b/inference-engine/thirdparty/clDNN/src/gpu/ocl_base_event.cpp
@@ -121,73 +121,40 @@ bool base_events::is_set_impl() {
 }
 
 bool base_events::get_profiling_info_impl(std::list<instrumentation::profiling_interval>& info) {
-
-    // For every profiling period (i.e. submission / starting / executing),
-    // the goal is to sum up all disjoint durations of its projection on the time axis
-
-    std::map<std::string, std::vector<std::pair<unsigned long long, unsigned long long>>> all_durations;
+    cl_ulong min_queue = CL_ULONG_MAX;
+    cl_ulong min_sub = CL_ULONG_MAX;
+    cl_ulong min_start = CL_ULONG_MAX;
+    uint64_t execution_time = 0;
 
     for (size_t i = 0; i < _events.size(); i++) {
         auto be = dynamic_cast<base_event*>(_events[i].get());
         if (!is_event_profiled(be->_event))
             continue;
 
-        for (auto& period : profiling_periods) {
-            cl_ulong ev_start;
-            cl_ulong ev_end;
-            be->_event.getProfilingInfo(period.start, &ev_start);
-            be->_event.getProfilingInfo(period.stop, &ev_end);
-            auto ev_duration = std::make_pair(static_cast<unsigned long long>(ev_start),
-                                              static_cast<unsigned long long>(ev_end));
-
-            auto& durations = all_durations[period.name];
-            bool ev_duration_merged = false;
-            auto it = durations.begin();
-
-            while (it != durations.end()) {
-                auto& duration = *it;
-                if ((duration.second >= ev_duration.first) && (duration.first <= ev_duration.second)) {
-                    if ((duration.first == ev_duration.first) && (duration.second == ev_duration.second)) {
-                        if (!ev_duration_merged) {
-                            ev_duration_merged = true;
-                            break;
-                        } else {
-                            it = durations.erase(it);
-                        }
-                    } else {
-                        if (!ev_duration_merged) {
-                            duration.first = std::min(duration.first, ev_duration.first);
-                            duration.second = std::max(duration.second, ev_duration.second);
-                            ev_duration = duration;
-                            ev_duration_merged = true;
-                            it++;
-                        } else {
-                            if (duration.second > ev_duration.second) {
-                                ev_duration.second = duration.second;
-                                it--;
-                                it->second = ev_duration.second;
-                                it++;
-                            }
-                            it = durations.erase(it);
-                        }
-                    }
-                } else {
-                    it++;
-                }
-            }
-
-            if (!ev_duration_merged) {
-                durations.insert(it, ev_duration);
-            }
-        }
-    }
+        cl_ulong curr_queue;
+        cl_ulong curr_sub;
+        cl_ulong curr_start;
+        cl_ulong curr_end;
+        be->_event.getProfilingInfo(CL_PROFILING_COMMAND_QUEUED, &curr_queue);
+        be->_event.getProfilingInfo(CL_PROFILING_COMMAND_SUBMIT, &curr_sub);
+        be->_event.getProfilingInfo(CL_PROFILING_COMMAND_START, &curr_start);
+        be->_event.getProfilingInfo(CL_PROFILING_COMMAND_END, &curr_end);
 
-    for (auto& period : profiling_periods) {
-        unsigned long long sum = 0;
-        for (auto& duration : all_durations[period.name]) {
-            sum += (duration.second - duration.first);
-        }
-        info.push_back(get_profiling_interval(period.name, 0, sum));
+        if (curr_queue < min_queue)
+            min_queue = curr_queue;
+
+        if (curr_sub < min_sub)
+            min_sub = curr_sub;
+
+        if (curr_start < min_start)
+            min_start = curr_start;
+
+        execution_time += curr_end - curr_start;
     }
+
+    info.push_back(get_profiling_interval(profiling_periods[0].name, min_sub, min_queue));
+    info.push_back(get_profiling_interval(profiling_periods[1].name, min_start, min_sub));
+    info.push_back(get_profiling_interval(profiling_periods[2].name, 0, execution_time));
+
     return true;
 }
diff --git a/inference-engine/thirdparty/clDNN/src/gpu/ocl_base_event.h b/inference-engine/thirdparty/clDNN/src/gpu/ocl_base_event.h
index ffccbce..39e6f04 100644
--- a/inference-engine/thirdparty/clDNN/src/gpu/ocl_base_event.h
+++ b/inference-engine/thirdparty/clDNN/src/gpu/ocl_base_event.h
@@ -77,17 +77,7 @@ protected:
 struct base_events : virtual public ocl_base_event {
 public:
     base_events(std::shared_ptr<gpu_toolkit> ctx, std::vector<event_impl::ptr> const& ev)
-        : ocl_base_event(0, true), _ctx(ctx) {
-        for (size_t i = 0; i < ev.size(); i++) {
-            auto multiple_events = dynamic_cast<base_events*>(ev[i].get());
-            if (multiple_events) {
-                for (size_t j = 0; j < multiple_events->_events.size(); j++) {
-                    _events.push_back(multiple_events->_events[j]);
-                }
-            } else {
-                _events.push_back(ev[i]);
-            }
-        }
+        : ocl_base_event(0, true), _ctx(ctx), _events(ev) {
         set_queue_stamp();
     }
 
@@ -96,28 +86,18 @@ public:
     void attach_events(const std::vector<event_impl::ptr>& ev) {
         if (_attached)
             throw std::runtime_error("Trying to attach events to valid event object.");
-        for (size_t i = 0; i < ev.size(); i++) {
-            auto multiple_events = dynamic_cast<base_events*>(ev[i].get());
-            if (multiple_events) {
-                for (size_t j = 0; j < multiple_events->_events.size(); j++) {
-                    _events.push_back(multiple_events->_events[j]);
-                }
-            } else {
-                _events.push_back(ev[i]);
-            }
-        }
+        _events = ev;
         _attached = true;
         set_queue_stamp();
     }
 
     std::shared_ptr<gpu_toolkit> get_context() const { return _ctx; }
-    const std::vector<event_impl::ptr>& get_events() { return _events; };
 
 private:
     void set_queue_stamp() {
         uint64_t _queue_stamp_max = 0;
         for (size_t i = 0; i < _events.size(); i++) {
-            auto* _base_event = dynamic_cast<ocl_base_event*>(_events[i].get());
+            auto* _base_event = dynamic_cast<base_event*>(_events[i].get());
             if (_base_event->get_queue_stamp() > _queue_stamp_max)
                 _queue_stamp_max = _base_event->get_queue_stamp();
         }
diff --git a/inference-engine/thirdparty/clDNN/src/gpu/ocl_queue_wrapper.cpp b/inference-engine/thirdparty/clDNN/src/gpu/ocl_queue_wrapper.cpp
index 8e3b238..18bc5b1 100644
--- a/inference-engine/thirdparty/clDNN/src/gpu/ocl_queue_wrapper.cpp
+++ b/inference-engine/thirdparty/clDNN/src/gpu/ocl_queue_wrapper.cpp
@@ -79,18 +79,9 @@ event_impl::ptr gpu_queue::enqueue_kernel(kernels_cache::kernel_type const& kern
     std::vector<cl::Event> dep_events;
     auto dep_events_ptr = &dep_events;
     if (!context()->get_configuration().host_out_of_order) {
-        for (auto& dep : deps) {
-            auto multiple_events = dynamic_cast<base_events*>(dep.get());
-            if (multiple_events) {
-                for (size_t i = 0; i < multiple_events->get_events().size(); i++) {
-                    if (auto base_ev = dynamic_cast<base_event*>(multiple_events->get_events()[i].get()))
-                        dep_events.push_back(base_ev->get());
-                }
-            } else {
-                if (auto base_ev = dynamic_cast<base_event*>(dep.get()))
-                    dep_events.push_back(base_ev->get());
-            }
-        }
+        for (auto& dep : deps)
+            if (auto ocl_ev = dynamic_cast<base_event*>(dep.get()))
+                dep_events.push_back(ocl_ev->get());
     } else {
         dep_events_ptr = nullptr;
 
@@ -122,18 +113,9 @@ event_impl::ptr gpu_queue::enqueue_marker(std::vector<event_impl::ptr> const& de
         cl::Event ret_ev;
         if (!enabled_single_kernel) {
             std::vector<cl::Event> dep_events;
-            for (auto& dep : deps) {
-                auto multiple_events = dynamic_cast<base_events*>(dep.get());
-                if (multiple_events) {
-                    for (size_t i = 0; i < multiple_events->get_events().size(); i++) {
-                        if (auto base_ev = dynamic_cast<base_event*>(multiple_events->get_events()[i].get()))
-                            dep_events.push_back(base_ev->get());
-                    }
-                } else {
-                    if (auto base_ev = dynamic_cast<base_event*>(dep.get()))
-                        dep_events.push_back(base_ev->get());
-                }
-            }
+            for (auto& dep : deps)
+                if (auto ocl_ev = dynamic_cast<base_event*>(dep.get()))
+                    dep_events.push_back(ocl_ev->get());
 
             try {
                 _command_queue.enqueueMarkerWithWaitList(&dep_events, &ret_ev);
diff --git a/inference-engine/thirdparty/clDNN/src/gpu/ocl_toolkit.cpp b/inference-engine/thirdparty/clDNN/src/gpu/ocl_toolkit.cpp
index 5299d58..0d1f2d3 100644
--- a/inference-engine/thirdparty/clDNN/src/gpu/ocl_toolkit.cpp
+++ b/inference-engine/thirdparty/clDNN/src/gpu/ocl_toolkit.cpp
@@ -229,18 +229,9 @@ void gpu_toolkit::release_pending_memory(uint32_t queue_id) { get_command_queue(
 
 void gpu_toolkit::wait_for_events(std::vector<event_impl::ptr> const& events) {
     std::vector<cl::Event> clevents;
-    for (auto& ev : events) {
-        auto multiple_events = dynamic_cast<base_events*>(ev.get());
-        if (multiple_events) {
-            for (size_t i = 0; i < multiple_events->get_events().size(); i++) {
-                if (auto base_ev = dynamic_cast<base_event*>(multiple_events->get_events()[i].get()))
-                    clevents.push_back(base_ev->get());
-            }
-        } else {
-            if (auto base_ev = dynamic_cast<base_event*>(ev.get()))
-                clevents.push_back(base_ev->get());
-        }
-    }
+    for (auto& ev : events)
+        if (auto ocl_ev = dynamic_cast<base_event*>(ev.get()))
+            clevents.push_back(ocl_ev->get());
 
     try {
         cl::WaitForEvents(clevents);
diff --git a/inference-engine/thirdparty/clDNN/src/gpu/primitive_gpu_base.h b/inference-engine/thirdparty/clDNN/src/gpu/primitive_gpu_base.h
index d5f7fa6..0c01641 100644
--- a/inference-engine/thirdparty/clDNN/src/gpu/primitive_gpu_base.h
+++ b/inference-engine/thirdparty/clDNN/src/gpu/primitive_gpu_base.h
@@ -107,11 +107,12 @@ protected:
     virtual bool get_depthwise_sep_opt() const { return false; }
 
     event_impl::ptr aggregate_events(const std::vector<event_impl::ptr>& events,
-                                     uint32_t net_id) const {
+                                     uint32_t net_id,
+                                     bool group = false) const {
         if (events.size() == 1)
             return events[0];
 
-        if (events.size() > 1)
+        if (group)
             return _outer.get_program().get_engine().get_context()->group_events(net_id, events);
 
         return events_waiter(_outer.get_program().get_engine().get_context()).run(net_id, events);
@@ -160,7 +161,6 @@ protected:
         }
 
         std::vector<event_impl::ptr> tmp_events(events);
-        std::vector<event_impl::ptr> all_events;
 
         // TODO - split should be handle in kernel selector by providing multiple kernels.
         auto split = get_split();
@@ -181,16 +181,13 @@ protected:
 
                 auto event = _kernels[k].run(net_id, _kernel_data.kernels[k], tmp_events);
                 new_events.push_back(event);
-                all_events.push_back(event);
             }
 
             tmp_events = new_events;
         }
 
-        if ((all_events.size() == 0) && (tmp_events.size() > 0))
-            return aggregate_events(tmp_events, net_id);
-
-        return aggregate_events(all_events, net_id);
+        bool group_events = split > 1 ? true : false;
+        return aggregate_events(tmp_events, net_id, group_events);
     }
 };
 
-- 
2.7.4