[Profile] Support layer manipulation in kernel profiling

author Dongju Chae <dongju.chae@samsung.com>

Thu, 25 Mar 2021 06:10:36 +0000 (15:10 +0900)

committer 채동주/On-Device Lab(SR)/Staff Engineer/삼성전자 <dongju.chae@samsung.com>

Thu, 25 Mar 2021 10:23:33 +0000 (19:23 +0900)
author Dongju Chae <dongju.chae@samsung.com>
Thu, 25 Mar 2021 06:10:36 +0000 (15:10 +0900)
committer 채동주/On-Device Lab(SR)/Staff Engineer/삼성전자 <dongju.chae@samsung.com>
Thu, 25 Mar 2021 10:23:33 +0000 (19:23 +0900)
diff --git a/src/core/meson.build b/src/core/meson.build

index 85e9e13..c2c808b 100644 (file)
--- a/src/core/meson.build
+++ b/src/core/meson.build
@@ -16,6 +16,7 @@ ne_core_sources = [
    'ne-mem.cc',
    'ne-data.cc',
    'ne-handler.cc',
+  'ne-profiler.cc',
    'ne-scheduler.cc',
    'ne-host-input-service.cc',
    'ne-hw-input-service.cc',
diff --git a/src/core/ne-handler.cc b/src/core/ne-handler.cc

index 1550b79..260fca9 100644 (file)
--- a/src/core/ne-handler.cc
+++ b/src/core/ne-handler.cc
@@ -35,11 +35,13 @@ HostHandler::HostHandler (Device *device)
      /* ignored as we don't use double buffering anymore, but for backward-compatibility */
      async_mode_ (NPUASYNC_WAIT)
  {
+  profiler_ = new ModelProfiler (device_->getDriverAPI ());
  }
  
  /** @brief host handler destructor */
  HostHandler::~HostHandler ()
  {
+  delete profiler_;
  }
  
  /**
@@ -133,19 +135,10 @@ HostHandler::getProfile (int task_id, npu_profile *profile)
      return -EINVAL;
    }
  
-  const DriverAPI * api = device_->getDriverAPI ();
-  assert (api != nullptr);
-
    profile->num_layers = 0;
    profile->layers = nullptr;
  
-  int status = api->getProfile (task_id, profile);
-  if (status != 0) {
-    logerr (TAG, "Failed to get profile information: %d\n", status);
-    return status;
-  }
-
-  return 0;
+  return profiler_->getTaskProfile (task_id, profile);
  }
  
  /**
@@ -354,7 +347,12 @@ HostHandler::runAsync (uint32_t modelid, const input_buffers *input,
    }
  
    device_->setAsyncMode (mode);
-  return device_->run (NPUINPUT_HOST, model, input, cb, cb_data, sequence);
+
+  int task_id = device_->run (NPUINPUT_HOST, model, input, cb, cb_data, sequence);
+  if (task_id > 0)
+    profiler_->appendTask (task_id, model);
+
+  return task_id;
  }
  
  /**
diff --git a/src/core/ne-handler.h b/src/core/ne-handler.h

index 47b41ab..f9fa4f6 100644 (file)
--- a/src/core/ne-handler.h
+++ b/src/core/ne-handler.h
@@ -22,6 +22,7 @@
  #include "ne-scheduler.h"
  #include "ne-model.h"
  #include "ne-utils.h"
+#include "ne-profiler.h"
  
  class Device;
  /** @brief class def. of host handler */
@@ -73,6 +74,8 @@ class HostHandler {
      HostHandler (Device *device);
  
      Device *device_;  /**< dedicated device instance */
+    ModelProfiler *profiler_;
+
      ThreadSafeMap<uint32_t, Model> models_;
                        /**< registerd models */
      npu_async_mode async_mode_;
diff --git a/src/core/ne-model.h b/src/core/ne-model.h

index e60be65..dc2afef 100644 (file)
--- a/src/core/ne-model.h
+++ b/src/core/ne-model.h
@@ -347,6 +347,7 @@ class Model : public HWmem {
      uint64_t getInternalID () const { return internal_id_; }
      HWmem * getWeightData () const { return weight_data_; }
      HWmem * getProgramData () const { return program_data_; }
+    HWmem * getExtendedMetadata () const { return extended_meta_; }
  
      uint32_t getInputTensorNum () const;
      uint32_t getOutputTensorNum () const;
diff --git a/src/core/ne-profiler.cc b/src/core/ne-profiler.cc

new file mode 100644 (file)

index 0000000..3b6b1c0
--- /dev/null
+++ b/src/core/ne-profiler.cc
@@ -0,0 +1,175 @@
+/**
+ * Proprietary
+ * Copyright (C) 2021 Samsung Electronics
+ * Copyright (C) 2021 Dongju Chae <dongju.chae@samsung.com>
+ */
+/**
+ * @file ne-profiler.cc
+ * @date 25 Mar 2021
+ * @brief Model profiler for NPU Engine (NE) users.
+ * @author Dongju Chae <dongju.chae@samsung.com>
+ * @bug No known bugs except for NYI items
+ */
+
+#include "ne-profiler.h"
+
+/**
+ * @brief Construct a model profiler bound to a driver API instance
+ * @param[in] api driver API stored for later profile queries; this class
+ *            never deletes it
+ */
+ModelProfiler::ModelProfiler (const DriverAPI * api)
+  : api_ {api}
+{
+}
+
+/** @brief Destroy the profiler, clearing every task entry still kept in the profile map */
+ModelProfiler::~ModelProfiler ()
+{
+  profile_map_.clear ();
+}
+
+/**
+ * @brief Remember which model a submitted task runs, keyed by its task ID
+ * @param[in] task_id ID of the submitted task
+ * @param[in] model model executed by the task
+ * @return the status reported by the profile map's insert ()
+ * @note NOTE(review): the new entry is handed to the map as a raw pointer;
+ *       presumably the map takes ownership even when the insertion fails -
+ *       confirm against ThreadSafeMap::insert ().
+ */
+int
+ModelProfiler::appendTask (int task_id, const Model * model)
+{
+  return profile_map_.insert (task_id, new ProfileData (task_id, model));
+}
+
+/**
+ * @brief Fetch the profile of a task and, if possible, convert it to per-layer data
+ * @param[in] task_id ID of a task previously registered with appendTask ()
+ * @param[out] profile profile filled by the driver API; rewritten by
+ *             manipulateProfile () when the model has extended metadata
+ * @return 0 on success; -ENOENT if the task is not registered; -EINVAL if the
+ *         task has no model, or if the driver API or the profile pointer is
+ *         null; otherwise the non-zero status returned by the driver API
+ * @note the task entry is removed from the map only on success; it stays
+ *       registered whenever an error is returned.
+ */
+int
+ModelProfiler::getTaskProfile (int task_id, npu_profile *profile)
+{
+  ProfileData * data = profile_map_.find (task_id);
+  if (data == nullptr)
+    return -ENOENT;
+
+  const Model * model = data->getModel ();
+  if (model == nullptr)
+    return -EINVAL;
+
+  /* both pointers are used below; reject null ones instead of crashing */
+  if (api_ == nullptr || profile == nullptr)
+    return -EINVAL;
+
+  int status = api_->getProfile (task_id, profile);
+  if (status != 0)
+    return status;
+
+  /* models without extended metadata keep the raw profile from the driver API */
+  HWmem * extended = model->getExtendedMetadata ();
+  if (extended != nullptr)
+    manipulateProfile (extended, profile);
+
+  profile_map_.remove (task_id);
+
+  return 0;
+}
+
+/** @brief reset every accumulated counter of a synthesized layer entry */
+static void
+resetLayerCounters (npu_profile_layer * layer)
+{
+  layer->running_cycles = 0;
+  layer->dram_read_bytes = 0;
+  layer->dram_write_bytes = 0;
+  layer->sram_read_bytes = 0;
+  layer->sram_write_bytes = 0;
+  /* always defined, even for layers that no vISA entry refers to */
+  layer->visa_exec_seq = -1;
+}
+
+/** @brief add a 1/divisor share of a per-vISA entry to a layer entry */
+static void
+addLayerShare (npu_profile_layer * dst, const npu_profile_layer & src,
+    uint32_t divisor)
+{
+  dst->running_cycles += src.running_cycles / divisor;
+  dst->dram_read_bytes += src.dram_read_bytes / divisor;
+  dst->dram_write_bytes += src.dram_write_bytes / divisor;
+  dst->sram_read_bytes += src.sram_read_bytes / divisor;
+  dst->sram_write_bytes += src.sram_write_bytes / divisor;
+}
+
+/**
+ * @brief Rewrite a per-vISA profile into a per-layer profile
+ * @param[in] extended memory whose data is interpreted as npubin_meta_profile
+ * @param[in,out] profile profile to rewrite. On success its layer array is
+ *                deleted with delete [] and replaced by a new array holding
+ *                one entry per node-table entry plus a trailing
+ *                "Unclassified" entry.
+ *
+ * @details The node table (id, length, name) is read from the start of the
+ *          entry data and the vISA table (id, node_num, node ids) from offset
+ *          node_table_size. The counters of the i-th profile entry are split
+ *          evenly over the nodes listed by the i-th vISA entry; entries that
+ *          list no node go to "Unclassified". The profile is left untouched
+ *          when an argument or the metadata is missing, when a node name has
+ *          zero length, when the profile holds fewer entries than the vISA
+ *          table, or when the node table is empty.
+ *
+ * @note NOTE(review): the size of the metadata buffer is not available here,
+ *       so table offsets are trusted as-is - consider validating them against
+ *       the buffer size.
+ */
+void
+ModelProfiler::manipulateProfile (HWmem * extended, npu_profile *profile)
+{
+  if (extended == nullptr || profile == nullptr)
+    return;
+
+  npubin_meta_profile *meta_profile =
+    reinterpret_cast <npubin_meta_profile *> (extended->getData ());
+  if (meta_profile == nullptr) {
+    std::cerr << "No available extended metadata" << std::endl;
+    return;
+  }
+
+  const auto node_entry_num = meta_profile->node_entry_num;
+  const auto visa_entry_num = meta_profile->visa_entry_num;
+
+  /* the i-th vISA entry is paired with profile->layers[i] below */
+  if (visa_entry_num > 0 &&
+      (profile->layers == nullptr || profile->num_layers < visa_entry_num)) {
+    std::cerr << "Profile has fewer entries than the vISA table: ";
+    std::cerr << profile->num_layers << " vs. " << visa_entry_num << std::endl;
+    return;
+  }
+
+  /* value-initialized so that fields not set below are never indeterminate */
+  npu_profile_layer * new_layers =
+    new npu_profile_layer[static_cast<size_t> (node_entry_num) + 1] ();
+
+  /* the extra last entry collects vISA entries that belong to no node */
+  npu_profile_layer * unclassified = &new_layers[node_entry_num];
+
+  snprintf (unclassified->name, NPU_OPNAME_MAX - 1, "%s", "Unclassified");
+  unclassified->name[NPU_OPNAME_MAX - 1] = '\x00';
+  unclassified->node_id = -1;
+  resetLayerCounters (unclassified);
+
+  /** 1) parsing node table */
+  std::unordered_map<uint32_t, npu_profile_layer *> node_table;
+  uint32_t pos = 0;
+
+  node_table.reserve (node_entry_num);
+
+  for (uint32_t i = 0; i < node_entry_num; i++) {
+    uint32_t id, length;
+
+    memcpy (&id, meta_profile->entry_data + pos, sizeof (uint32_t));
+    pos += sizeof (uint32_t);
+
+    memcpy (&length, meta_profile->entry_data + pos, sizeof (uint32_t));
+    pos += sizeof (uint32_t);
+
+    if (length == 0) {
+      std::cerr << "Zero length detected at ";
+      std::cerr << id << "th node" << std::endl;
+
+      delete [] new_layers;
+      return;
+    }
+
+    npu_profile_layer * layer = &new_layers[i];
+
+    /* never read past the stored name length, even without a terminator */
+    const uint32_t max_chars = static_cast<uint32_t> (NPU_OPNAME_MAX - 1);
+    const int name_len =
+      static_cast<int> (length < max_chars ? length : max_chars);
+
+    snprintf (layer->name, NPU_OPNAME_MAX - 1, "%.*s", name_len,
+        meta_profile->entry_data + pos);
+    layer->name[NPU_OPNAME_MAX - 1] = '\x00';
+    pos += length;
+
+    layer->node_id = id;
+    resetLayerCounters (layer);
+
+    node_table.insert (std::make_pair (id, layer));
+  }
+
+  /** 2) parsing visa table */
+  pos = meta_profile->node_table_size;
+  for (uint32_t i = 0; i < visa_entry_num; i++) {
+    uint32_t node_num;
+
+    /* the vISA id itself is not needed; skip over it */
+    pos += sizeof (uint32_t);
+
+    memcpy (&node_num, meta_profile->entry_data + pos, sizeof (uint32_t));
+    pos += sizeof (uint32_t);
+
+    const npu_profile_layer & visa = profile->layers[i];
+
+    if (node_num == 0)
+      addLayerShare (unclassified, visa, 1);
+
+    for (uint32_t j = 0; j < node_num; j++) {
+      uint32_t node_id;
+
+      /* node IDs may sit at an unaligned offset; copy instead of casting */
+      memcpy (&node_id, meta_profile->entry_data + pos, sizeof (uint32_t));
+      pos += sizeof (uint32_t);
+
+      auto it = node_table.find (node_id);
+      if (it == node_table.end ()) {
+        std::cerr << "Unable to find the node ID " << node_id << std::endl;
+        continue;
+      }
+
+      /** TODO: evenly divided to fused layers */
+      addLayerShare (it->second, visa, node_num);
+    }
+  }
+
+  /** 3) profile data mapping */
+  if (node_entry_num > 0) {
+    delete [] profile->layers;
+
+    profile->layers = new_layers;
+    /* count array slots (not unique IDs) so "Unclassified" is always included */
+    profile->num_layers = node_entry_num + 1;
+  } else {
+    delete [] new_layers;
+  }
+}
+
+
diff --git a/src/core/ne-profiler.h b/src/core/ne-profiler.h

new file mode 100644 (file)

index 0000000..3e3bd97
--- /dev/null
+++ b/src/core/ne-profiler.h
@@ -0,0 +1,49 @@
+/**
+ * Proprietary
+ * Copyright (C) 2021 Samsung Electronics
+ * Copyright (C) 2021 Dongju Chae <dongju.chae@samsung.com>
+ */
+/**
+ * @file ne-profiler.h
+ * @date 25 Mar 2021
+ * @brief Model profiler for NPU Engine (NE) users.
+ * @author Dongju Chae <dongju.chae@samsung.com>
+ * @bug No known bugs except for NYI items
+ */
+
+#ifndef __NPU_ENGINE_PROFILER_H__
+#define __NPU_ENGINE_PROFILER_H__
+
+#include <typedef.h>
+#include "ne-model.h"
+
+/** @brief Profiling bookkeeping of a single task: its ID and the model it runs */
+class ProfileData
+{
+  public:
+    /**
+     * @brief Construct the bookkeeping entry of a task
+     * @param[in] task_id ID of the task
+     * @param[in] model model executed by the task (stored as-is, not validated)
+     */
+    ProfileData (int task_id, const Model * model) :
+      task_id_ (task_id), model_ (model) {}
+
+    /** @brief get the model given at construction (nullptr if none was given) */
+    const Model * getModel () const { return model_; }
+
+  private:
+    int task_id_;         /**< ID of the task */
+    const Model * model_; /**< model executed by the task; never deleted here */
+};
+
+/** @brief Keeps a task-to-model map and converts task profiles to per-layer data */
+class ModelProfiler
+{
+  public:
+    /** @brief construct a profiler that queries profiles through the given driver API */
+    ModelProfiler (const DriverAPI * api);
+    /** @brief destroy the profiler and clear the remaining task entries */
+    ~ModelProfiler ();
+
+    /** @brief register the model that a task runs, keyed by the task ID */
+    int appendTask (int task_id, const Model * model);
+    /** @brief get the profile of a registered task; the task entry is removed on success */
+    int getTaskProfile (int task_id, npu_profile *profile);
+
+    /** @brief rewrite the layers of a profile using the model's extended metadata */
+    void manipulateProfile (HWmem * extended, npu_profile *profile);
+
+  private:
+    const DriverAPI * api_;   /**< driver API used to fetch profiles */
+    ThreadSafeMap<int, ProfileData> profile_map_;   /**< task ID to profile data */
+};
+
+#endif
diff --git a/src/core/npu/NPUdrvAPI.h b/src/core/npu/NPUdrvAPI.h

index fbe813a..a251e63 100644 (file)
--- a/src/core/npu/NPUdrvAPI.h
+++ b/src/core/npu/NPUdrvAPI.h
@@ -108,6 +108,7 @@ class DriverAPI {
          void *addr, size_t size) const { return -EPERM; }
  #endif
  
+    /** @brief get profile data for vISA instructions */
      virtual int getProfile (int task_id, npu_profile *profile) const { return -EPERM; }
  
      virtual int getStatApps (npu_stat_apps *stat) const { return -EPERM; }
diff --git a/src/core/npu/NPUdrvAPI_emul.cc b/src/core/npu/NPUdrvAPI_emul.cc

index e221009..993c9b4 100644 (file)
--- a/src/core/npu/NPUdrvAPI_emul.cc
+++ b/src/core/npu/NPUdrvAPI_emul.cc
@@ -30,8 +30,7 @@
  class EmulTask {
    public:
      EmulTask (int taskid) :
-      taskid_ (taskid), stop_ (false), first_run_ (false),
-      extended_dbuf_fd_ (0), extended_size_ (0) {}
+      taskid_ (taskid), stop_ (false), first_run_ (false) {}
  
      void run_emul (char *prog, char **segt, char *metadata,
          std::string cmd_path, std::string prof_path) {
@@ -57,19 +56,6 @@ class EmulTask {
        task_.join ();
      }
  
-    void set_extended (int dbuf_fd, size_t size) {
-      extended_dbuf_fd_ = dbuf_fd;
-      extended_size_ = size;
-    }
-
-    int get_extended_dbuf_fd () {
-      return extended_dbuf_fd_;
-    }
-
-    size_t get_extended_size () {
-      return extended_size_;
-    }
-
      bool get_profile (npu_profile *profile) {
        std::string path (DEFAULT_PROFILE_PATH);
        path += "/ne_profile." + std::to_string (taskid_) + ".rec";
@@ -191,9 +177,6 @@ class EmulTask {
      bool stop_;
      bool first_run_;
      std::thread task_;
-
-    int extended_dbuf_fd_;
-    size_t extended_size_;
  };
  
  /**
@@ -643,7 +626,6 @@ TrinityEmulAPI::runInput (input_config_t *input_config) const
          prog, segment_table, static_cast <char*> (elem_metadata->getAddr ()),
          cmd_path, prof_path);
  
-    task->set_extended (model->metadata_ext_dbuf_fd, model->metadata_ext_size);
      task->run (func);
      status = taskid;
  
@@ -692,139 +674,6 @@ TrinityEmulAPI::stop_target (int taskid) const
    return 0;
  }
  
-void
-TrinityEmulAPI::manipulateProfile (EmulTask *task, npu_profile *profile) const
-{
-  int dbuf_fd = task->get_extended_dbuf_fd ();
-  size_t size = task->get_extended_size ();
-
-  EmulElement *elem = elem_map_.find (dbuf_fd);
-  if (elem == nullptr || elem->getAddr () == nullptr) {
-    std::cerr << "No available extended metadata" << std::endl;
-    return;
-  }
-
-  if (elem->getSize () != size) {
-    std::cerr << "Extended metadata size mismatch: ";
-    std::cerr << elem->getSize () << " vs. " << size << std::endl;
-    return;
-  }
-
-  npubin_meta_profile *meta_profile =
-         static_cast<npubin_meta_profile *> (elem->getAddr ());
-  npu_profile_layer * new_layers =
-    new npu_profile_layer[meta_profile->node_entry_num + 1];
-
-  npu_profile_layer * unclassified =
-    &new_layers[meta_profile->node_entry_num];
-
-  snprintf (unclassified->name, NPU_OPNAME_MAX - 1, "%s", "Unclassified");
-  unclassified->name[NPU_OPNAME_MAX - 1] = '\x00';
-  unclassified->node_id = -1;
-
-  unclassified->running_cycles = 0;
-  unclassified->dram_read_bytes = 0;
-  unclassified->dram_write_bytes = 0;
-  unclassified->sram_read_bytes = 0;
-  unclassified->sram_write_bytes = 0;
-
-  /** 1) parsing node table */
-  std::unordered_map<uint32_t, npu_profile_layer *> node_table;
-  uint32_t pos = 0;
-
-  node_table.reserve (meta_profile->node_entry_num);
-
-  for (uint32_t i = 0; i < meta_profile->node_entry_num; i++) {
-    uint32_t id, length;
-
-    memcpy (&id, meta_profile->entry_data + pos, sizeof (uint32_t));
-    pos += sizeof (uint32_t);
-
-    memcpy (&length, meta_profile->entry_data + pos, sizeof (uint32_t));
-    pos += sizeof (uint32_t);
-
-    if (length == 0) {
-      std::cerr << "Zero length detected at ";
-      std::cerr << id << "th node" << std::endl;
-
-      delete [] new_layers;
-      return;
-    }
-
-    std::string name (meta_profile->entry_data + pos);
-    pos += length;
-
-    npu_profile_layer * layer = &new_layers[i];
-
-    snprintf (layer->name, NPU_OPNAME_MAX - 1, "%s", name.c_str ());
-    layer->name[NPU_OPNAME_MAX - 1] = '\x00';
-    layer->node_id = id;
-
-    layer->running_cycles = 0;
-    layer->dram_read_bytes = 0;
-    layer->dram_write_bytes = 0;
-    layer->sram_read_bytes = 0;
-    layer->sram_write_bytes = 0;
-
-    node_table.insert(std::make_pair(id, layer));
-  }
-
-  /** 2) parsing visa table */
-  pos = meta_profile->node_table_size;
-  for (uint32_t i = 0; i < meta_profile->visa_entry_num; i++) {
-    uint32_t id, node_num;
-
-    memcpy (&id, meta_profile->entry_data + pos, sizeof (uint32_t));
-    pos += sizeof (uint32_t);
-
-    memcpy (&node_num, meta_profile->entry_data + pos, sizeof (uint32_t));
-    pos += sizeof (uint32_t);
-
-    if (node_num > 0) {
-      uint32_t * node_ids = (uint32_t *) (meta_profile->entry_data + pos);
-
-      for (uint32_t j = 0; j < node_num; j++) {
-        uint32_t node_id = node_ids[j];
-        auto it = node_table.find (node_id);
-
-        if (it != node_table.end ()) {
-          npu_profile_layer * layer = it->second;
-
-          /** TODO: evenly divided to fused layers */
-          layer->running_cycles += profile->layers[i].running_cycles / node_num;
-          layer->dram_read_bytes += profile->layers[i].dram_read_bytes / node_num;
-          layer->dram_write_bytes += profile->layers[i].dram_write_bytes / node_num;
-          layer->sram_read_bytes += profile->layers[i].sram_read_bytes / node_num;
-          layer->sram_write_bytes += profile->layers[i].sram_write_bytes / node_num;
-          layer->visa_exec_seq = -1;
-        } else {
-          std::cerr << "Unable to find the node ID " << node_id << std::endl;
-        }
-      }
-    } else {
-      unclassified->running_cycles += profile->layers[i].running_cycles;
-      unclassified->dram_read_bytes += profile->layers[i].dram_read_bytes;
-      unclassified->dram_write_bytes += profile->layers[i].dram_write_bytes;
-      unclassified->sram_read_bytes += profile->layers[i].sram_read_bytes;
-      unclassified->sram_write_bytes += profile->layers[i].sram_write_bytes;
-      unclassified->visa_exec_seq = -1;
-    }
-
-    pos += sizeof (uint32_t) * node_num;
-  }
-
-  /** 3) profile data mapping */
-  size_t num_layers = node_table.size ();
-  if (node_table.size () > 0) {
-    delete [] profile->layers;
-
-    profile->layers = new_layers;
-    profile->num_layers = num_layers + 1;
-  } else {
-    delete [] new_layers;
-  }
-}
-
  int
  TrinityEmulAPI::getProfile (int taskid, npu_profile *profile) const
  {
@@ -838,9 +687,6 @@ TrinityEmulAPI::getProfile (int taskid, npu_profile *profile) const
    if (!task->get_profile (profile))
      return -EINVAL;
  
-  if (task->get_extended_size () != 0)
-    manipulateProfile (task, profile);
-
    task_map_.remove (taskid);
  
    return 0;
diff --git a/src/core/npu/NPUdrvAPI_triv2.cc b/src/core/npu/NPUdrvAPI_triv2.cc

index 175e0b3..eccbc5a 100644 (file)
--- a/src/core/npu/NPUdrvAPI_triv2.cc
+++ b/src/core/npu/NPUdrvAPI_triv2.cc
@@ -473,8 +473,6 @@ TrinityVision2API::getProfile (int task_id, npu_profile *profile) const
      return -errno;
    }
  
-  /** TODO: manipulate the profiling info later (i.e., per-visa to per-layer) */
-
    return 0;
  }
author	Dongju Chae <dongju.chae@samsung.com>
	Thu, 25 Mar 2021 06:10:36 +0000 (15:10 +0900)
committer	채동주/On-Device Lab(SR)/Staff Engineer/삼성전자 <dongju.chae@samsung.com>
	Thu, 25 Mar 2021 10:23:33 +0000 (19:23 +0900)
src/core/meson.build		patch \| blob \| history
src/core/ne-handler.cc		patch \| blob \| history
src/core/ne-handler.h		patch \| blob \| history
src/core/ne-model.h		patch \| blob \| history
src/core/ne-profiler.cc	[new file with mode: 0644]	patch \| blob
src/core/ne-profiler.h	[new file with mode: 0644]	patch \| blob
src/core/npu/NPUdrvAPI.h		patch \| blob \| history
src/core/npu/NPUdrvAPI_emul.cc		patch \| blob \| history
src/core/npu/NPUdrvAPI_triv2.cc		patch \| blob \| history