[IE CLDNN] Another attempt to fix profiling of multiple-kernel implementations (#2630)
author Jedrzej Hajduczenia <jedrzej.hajduczenia@intel.com>
Wed, 21 Oct 2020 10:36:32 +0000 (12:36 +0200)
committer GitHub <noreply@github.com>
Wed, 21 Oct 2020 10:36:32 +0000 (13:36 +0300)
inference-engine/thirdparty/clDNN/src/gpu/ocl_base_event.cpp
inference-engine/thirdparty/clDNN/src/gpu/ocl_base_event.h
inference-engine/thirdparty/clDNN/src/gpu/ocl_queue_wrapper.cpp
inference-engine/thirdparty/clDNN/src/gpu/ocl_toolkit.cpp
inference-engine/thirdparty/clDNN/src/gpu/primitive_gpu_base.h
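
The core change is in base_events::get_profiling_info_impl(): instead of taking the minimum
queued/submit/start timestamps and summing raw execution times, the grouped event now collects,
for every profiling period (submission / starting / executing), the [start, end] interval of each
kernel event and reports the total length of the disjoint union of those intervals, so overlapping
kernels are no longer double-counted. base_events also flattens nested grouped events and waits
only on the OCL event with the highest queue stamp, and primitive_gpu_base now aggregates the
events of all executed kernels rather than only those of the last iteration.

For illustration only (not part of the patch, and with hypothetical names such as union_length),
a minimal standalone C++ sketch of the same union-length idea, using a simple sort-and-sweep
instead of the in-place merge performed in the patch:

    // Illustrative sketch: total length of the union of possibly overlapping
    // [start, end] intervals, as reported per profiling period by the new code.
    #include <algorithm>
    #include <iostream>
    #include <utility>
    #include <vector>

    using interval = std::pair<unsigned long long, unsigned long long>;

    unsigned long long union_length(std::vector<interval> intervals) {
        if (intervals.empty())
            return 0;
        std::sort(intervals.begin(), intervals.end());  // order by start time
        unsigned long long sum = 0;
        interval cur = intervals.front();
        for (size_t i = 1; i < intervals.size(); ++i) {
            if (intervals[i].first <= cur.second) {
                // Overlapping (or touching) interval: extend the current span
                cur.second = std::max(cur.second, intervals[i].second);
            } else {
                // Disjoint interval: close the current span and start a new one
                sum += cur.second - cur.first;
                cur = intervals[i];
            }
        }
        return sum + (cur.second - cur.first);
    }

    int main() {
        // Two overlapping kernels plus one disjoint kernel: (30 - 0) + (60 - 50) = 40
        std::cout << union_length({{0, 20}, {10, 30}, {50, 60}}) << std::endl;
        return 0;
    }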

diff --git a/inference-engine/thirdparty/clDNN/src/gpu/ocl_base_event.cpp b/inference-engine/thirdparty/clDNN/src/gpu/ocl_base_event.cpp
index 915eb79..3e4ec04 100644
@@ -102,59 +102,90 @@ bool base_event::get_profiling_info_impl(std::list<instrumentation::profiling_in
 }
 
 void base_events::wait_impl() {
-    if (!_events.empty()) {
-        for (size_t i = 0; i < _events.size(); i++) {
-            _events[i]->wait();
+    if (_last_ocl_event.get() != nullptr) {
+        _last_ocl_event.wait();
+        if (get_context()->logging_enabled()) {
+            get_context()->log(0, "Wait for event: " + std::to_string(_queue_stamp));
         }
     }
 }
 
 bool base_events::is_set_impl() {
-    if (!_events.empty()) {
-        for (size_t i = 0; i < _events.size(); i++) {
-            if (!_events[i]->is_set())
-                return false;
-        }
-        return true;
+    if (_last_ocl_event.get() != nullptr) {
+        return _last_ocl_event.getInfo<CL_EVENT_COMMAND_EXECUTION_STATUS>() == CL_COMPLETE;
     }
     return true;
 }
 
 bool base_events::get_profiling_info_impl(std::list<instrumentation::profiling_interval>& info) {
-    cl_ulong min_queue = CL_ULONG_MAX;
-    cl_ulong min_sub = CL_ULONG_MAX;
-    cl_ulong min_start = CL_ULONG_MAX;
-    uint64_t execution_time = 0;
+
+    // For every profiling period (submission / starting / executing), sum the lengths
+    // of the disjoint intervals that make up its projection onto the time axis
+
+    std::map<std::string, std::vector<std::pair<unsigned long long, unsigned long long>>> all_durations;
 
     for (size_t i = 0; i < _events.size(); i++) {
         auto be = dynamic_cast<base_event*>(_events[i].get());
         if (!is_event_profiled(be->_event))
             continue;
 
-        cl_ulong curr_queue;
-        cl_ulong curr_sub;
-        cl_ulong curr_start;
-        cl_ulong curr_end;
-        be->_event.getProfilingInfo(CL_PROFILING_COMMAND_QUEUED, &curr_queue);
-        be->_event.getProfilingInfo(CL_PROFILING_COMMAND_SUBMIT, &curr_sub);
-        be->_event.getProfilingInfo(CL_PROFILING_COMMAND_START, &curr_start);
-        be->_event.getProfilingInfo(CL_PROFILING_COMMAND_END, &curr_end);
-
-        if (curr_queue < min_queue)
-            min_queue = curr_queue;
-
-        if (curr_sub < min_sub)
-            min_sub = curr_sub;
-
-        if (curr_start < min_start)
-            min_start = curr_start;
-
-        execution_time += curr_end - curr_start;
+        for (auto& period : profiling_periods) {
+            cl_ulong ev_start;
+            cl_ulong ev_end;
+            be->_event.getProfilingInfo(period.start, &ev_start);
+            be->_event.getProfilingInfo(period.stop, &ev_end);
+            auto ev_duration = std::make_pair(static_cast<unsigned long long>(ev_start),
+                                              static_cast<unsigned long long>(ev_end));
+
+            auto& durations = all_durations[period.name];
+            bool ev_duration_merged = false;
+            auto it = durations.begin();
+
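+            // Merge ev_duration into the set of disjoint intervals already collected for this period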
+            while (it != durations.end()) {
+                auto& duration = *it;
+                if ((duration.second >= ev_duration.first) && (duration.first <= ev_duration.second)) {
+                    if ((duration.first == ev_duration.first) && (duration.second == ev_duration.second)) {
+                        if (!ev_duration_merged) {
+                            ev_duration_merged = true;
+                            break;
+                        } else {
+                            it = durations.erase(it);
+                        }
+                    } else {
+                        if (!ev_duration_merged) {
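+                            // First overlap: widen this existing interval to cover both, then keep scanning with the widened bounds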
+                            duration.first = std::min(duration.first, ev_duration.first);
+                            duration.second = std::max(duration.second, ev_duration.second);
+                            ev_duration = duration;
+                            ev_duration_merged = true;
+                            it++;
+                        } else {
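+                            // Already merged earlier in the scan: absorb this interval into the widened one and erase it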
+                            if (duration.second > ev_duration.second) {
+                                ev_duration.second = duration.second;
+                                it--;
+                                it->second = ev_duration.second;
+                                it++;
+                            }
+                            it = durations.erase(it);
+                        }
+                    }
+                } else {
+                    it++;
+                }
+            }
+
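+            // No overlap with anything collected so far: insert it as a new disjoint interval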
+            if (!ev_duration_merged) {
+                durations.insert(it, ev_duration);
+            }
+        }
     }
 
-    info.push_back(get_profiling_interval(profiling_periods[0].name, min_sub, min_queue));
-    info.push_back(get_profiling_interval(profiling_periods[1].name, min_start, min_sub));
-    info.push_back(get_profiling_interval(profiling_periods[2].name, 0, execution_time));
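+    // Report, for every period, the total length of the union of its intervals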
+    for (auto& period : profiling_periods) {
+        unsigned long long sum = 0;
+        for (auto& duration : all_durations[period.name]) {
+            sum += (duration.second - duration.first);
+        }
+        info.push_back(get_profiling_interval(period.name, 0, sum));
+    }
 
     return true;
 }
diff --git a/inference-engine/thirdparty/clDNN/src/gpu/ocl_base_event.h b/inference-engine/thirdparty/clDNN/src/gpu/ocl_base_event.h
index 39e6f04..e449661 100644
@@ -34,6 +34,7 @@ struct ocl_base_event : virtual public event_impl {
 public:
     explicit ocl_base_event(uint64_t queue_stamp = 0, bool valid = false) : _queue_stamp(queue_stamp) { _attached = valid; }
     uint64_t get_queue_stamp() const { return _queue_stamp; }
+    virtual cl::Event get() = 0;
 
 protected:
     uint64_t _queue_stamp = 0;
@@ -77,8 +78,8 @@ protected:
 struct base_events : virtual public ocl_base_event {
 public:
     base_events(std::shared_ptr<gpu_toolkit> ctx, std::vector<event_impl::ptr> const& ev)
-        : ocl_base_event(0, true), _ctx(ctx), _events(ev) {
-        set_queue_stamp();
+        : ocl_base_event(0, true), _ctx(ctx) {
+        process_events(ev);
     }
 
     explicit base_events(std::shared_ptr<gpu_toolkit> ctx) : ocl_base_event(0, false), _ctx(ctx) {}
@@ -86,28 +87,47 @@ public:
     void attach_events(const std::vector<event_impl::ptr>& ev) {
         if (_attached)
             throw std::runtime_error("Trying to attach events to valid event object.");
-        _events = ev;
+        process_events(ev);
         _attached = true;
-        set_queue_stamp();
     }
 
+    cl::Event get() { return _last_ocl_event; }
     std::shared_ptr<gpu_toolkit> get_context() const { return _ctx; }
 
 private:
-    void set_queue_stamp() {
-        uint64_t _queue_stamp_max = 0;
-        for (size_t i = 0; i < _events.size(); i++) {
-            auto* _base_event = dynamic_cast<base_event*>(_events[i].get());
-            if (_base_event->get_queue_stamp() > _queue_stamp_max)
-                _queue_stamp_max = _base_event->get_queue_stamp();
-        }
-        _queue_stamp = _queue_stamp_max;
-    }
     void wait_impl() override;
     bool is_set_impl() override;
 
+    void process_events(const std::vector<event_impl::ptr>& ev) {
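+        // Flatten nested grouped events and remember the OCL event with the highest queue stamp;
+        // wait_impl() and is_set_impl() only consult that last event.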
+        for (size_t i = 0; i < ev.size(); i++) {
+            auto multiple_events = dynamic_cast<base_events*>(ev[i].get());
+            if (multiple_events) {
+                for (size_t j = 0; j < multiple_events->_events.size(); j++) {
+                    if (auto base_ev = dynamic_cast<base_event*>(multiple_events->_events[j].get())) {
+                        auto current_ev_queue_stamp = base_ev->get_queue_stamp();
+                        if ((_queue_stamp == 0) || (current_ev_queue_stamp > _queue_stamp)) {
+                            _queue_stamp = current_ev_queue_stamp;
+                            _last_ocl_event = base_ev->get();
+                        }
+                    }
+                    _events.push_back(multiple_events->_events[j]);
+                }
+            } else {
+                if (auto base_ev = dynamic_cast<base_event*>(ev[i].get())) {
+                    auto current_ev_queue_stamp = base_ev->get_queue_stamp();
+                    if ((_queue_stamp == 0) || (current_ev_queue_stamp > _queue_stamp)) {
+                        _queue_stamp = current_ev_queue_stamp;
+                        _last_ocl_event = base_ev->get();
+                    }
+                }
+                _events.push_back(ev[i]);
+            }
+        }
+    }
+
     bool get_profiling_info_impl(std::list<instrumentation::profiling_interval>& info) override;
 
+    cl::Event _last_ocl_event;
     std::shared_ptr<gpu_toolkit> _ctx;
     std::vector<event_impl::ptr> _events;
 };
diff --git a/inference-engine/thirdparty/clDNN/src/gpu/ocl_queue_wrapper.cpp b/inference-engine/thirdparty/clDNN/src/gpu/ocl_queue_wrapper.cpp
index 18bc5b1..c06c914 100644
@@ -79,9 +79,11 @@ event_impl::ptr gpu_queue::enqueue_kernel(kernels_cache::kernel_type const& kern
     std::vector<cl::Event> dep_events;
     auto dep_events_ptr = &dep_events;
     if (!context()->get_configuration().host_out_of_order) {
-        for (auto& dep : deps)
-            if (auto ocl_ev = dynamic_cast<base_event*>(dep.get()))
-                dep_events.push_back(ocl_ev->get());
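+        // Cast to the common ocl_base_event interface so grouped events (base_events) also provide their last OCL event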
+        for (auto& dep : deps) {
+            if (auto ocl_base_ev = dynamic_cast<ocl_base_event*>(dep.get())) {
+                dep_events.push_back(ocl_base_ev->get());
+            }
+        }
     } else {
         dep_events_ptr = nullptr;
 
@@ -113,9 +115,10 @@ event_impl::ptr gpu_queue::enqueue_marker(std::vector<event_impl::ptr> const& de
         cl::Event ret_ev;
         if (!enabled_single_kernel) {
             std::vector<cl::Event> dep_events;
-            for (auto& dep : deps)
-                if (auto ocl_ev = dynamic_cast<base_event*>(dep.get()))
-                    dep_events.push_back(ocl_ev->get());
+            for (auto& dep : deps) {
+                if (auto ocl_base_ev = dynamic_cast<ocl_base_event*>(dep.get()))
+                    dep_events.push_back(ocl_base_ev->get());
+            }
 
             try {
                 _command_queue.enqueueMarkerWithWaitList(&dep_events, &ret_ev);
@@ -169,8 +172,8 @@ void gpu_queue::release_pending_memory() {
 void gpu_queue::sync_events(std::vector<event_impl::ptr> const& deps) {
     bool needs_barrier = false;
     for (auto& dep : deps) {
-        auto* ocl_ev = dynamic_cast<ocl_base_event*>(dep.get());
-        if (ocl_ev->get_queue_stamp() > _last_barrier) {
+        auto* ocl_base_ev = dynamic_cast<ocl_base_event*>(dep.get());
+        if (ocl_base_ev->get_queue_stamp() > _last_barrier) {
             needs_barrier = true;
         }
     }
diff --git a/inference-engine/thirdparty/clDNN/src/gpu/ocl_toolkit.cpp b/inference-engine/thirdparty/clDNN/src/gpu/ocl_toolkit.cpp
index 0d1f2d3..775fb92 100644
@@ -229,9 +229,10 @@ void gpu_toolkit::release_pending_memory(uint32_t queue_id) { get_command_queue(
 
 void gpu_toolkit::wait_for_events(std::vector<event_impl::ptr> const& events) {
     std::vector<cl::Event> clevents;
-    for (auto& ev : events)
-        if (auto ocl_ev = dynamic_cast<base_event*>(ev.get()))
-            clevents.push_back(ocl_ev->get());
+    for (auto& ev : events) {
+        if (auto ocl_base_ev = dynamic_cast<ocl_base_event*>(ev.get()))
+            clevents.push_back(ocl_base_ev->get());
+    }
 
     try {
         cl::WaitForEvents(clevents);
diff --git a/inference-engine/thirdparty/clDNN/src/gpu/primitive_gpu_base.h b/inference-engine/thirdparty/clDNN/src/gpu/primitive_gpu_base.h
index 0c01641..0a8a974 100644
@@ -161,6 +161,7 @@ protected:
         }
 
         std::vector<event_impl::ptr> tmp_events(events);
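+        // Collect every kernel's event so profiling covers all kernels, not only those of the last iteration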
+        std::vector<event_impl::ptr> all_events;
 
         // TODO - split should be handle in kernel selector by providing multiple kernels.
         auto split = get_split();
@@ -181,13 +182,17 @@ protected:
 
                 auto event = _kernels[k].run(net_id, _kernel_data.kernels[k], tmp_events);
                 new_events.push_back(event);
+                all_events.push_back(event);
             }
 
             tmp_events = new_events;
         }
 
-        bool group_events = split > 1 ? true : false;
-        return aggregate_events(tmp_events, net_id, group_events);
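+        // If no kernel event was recorded, fall back to aggregating the incoming dependency events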
+        if ((all_events.size() == 0) && (tmp_events.size() > 0))
+            return aggregate_events(tmp_events, net_id);
+
+        bool group_events = (all_events.size() > 1);
+        return aggregate_events(all_events, net_id, group_events);
     }
 };