This patch adds support for layer manipulation in kernel profiling.
Previously, layer manipulation was supported only in the emulated
driver API, which uses the mrpsim simulator.
This patch adds a ModelProfiler class that provides a common
interface used by both the simulator and the kernel driver.
Signed-off-by: Dongju Chae <dongju.chae@samsung.com>
'ne-mem.cc',
'ne-data.cc',
'ne-handler.cc',
+ 'ne-profiler.cc',
'ne-scheduler.cc',
'ne-host-input-service.cc',
'ne-hw-input-service.cc',
/* ignored as we don't use double buffering anymore, but for backward-compatibility */
async_mode_ (NPUASYNC_WAIT)
{
+ profiler_ = new ModelProfiler (device_->getDriverAPI ());
}
/** @brief host handler destructor */
HostHandler::~HostHandler ()
{
+ delete profiler_;
}
/**
return -EINVAL;
}
- const DriverAPI * api = device_->getDriverAPI ();
- assert (api != nullptr);
-
profile->num_layers = 0;
profile->layers = nullptr;
- int status = api->getProfile (task_id, profile);
- if (status != 0) {
- logerr (TAG, "Failed to get profile information: %d\n", status);
- return status;
- }
-
- return 0;
+ return profiler_->getTaskProfile (task_id, profile);
}
/**
}
device_->setAsyncMode (mode);
- return device_->run (NPUINPUT_HOST, model, input, cb, cb_data, sequence);
+
+ int task_id = device_->run (NPUINPUT_HOST, model, input, cb, cb_data, sequence);
+ if (task_id > 0)
+ profiler_->appendTask (task_id, model);
+
+ return task_id;
}
/**
#include "ne-scheduler.h"
#include "ne-model.h"
#include "ne-utils.h"
+#include "ne-profiler.h"
class Device;
/** @brief class def. of host handler */
HostHandler (Device *device);
Device *device_; /**< dedicated device instance */
+ ModelProfiler *profiler_;
+
ThreadSafeMap<uint32_t, Model> models_;
/**< registerd models */
npu_async_mode async_mode_;
uint64_t getInternalID () const { return internal_id_; }
HWmem * getWeightData () const { return weight_data_; }
HWmem * getProgramData () const { return program_data_; }
+ HWmem * getExtendedMetadata () const { return extended_meta_; }
uint32_t getInputTensorNum () const;
uint32_t getOutputTensorNum () const;
--- /dev/null
+/**
+ * Proprietary
+ * Copyright (C) 2021 Samsung Electronics
+ * Copyright (C) 2021 Dongju Chae <dongju.chae@samsung.com>
+ */
+/**
+ * @file ne-profiler.cc
+ * @date 25 Mar 2021
+ * @brief Model profiler for NPU Engine (NE) users.
+ * @author Dongju Chae <dongju.chae@samsung.com>
+ * @bug No known bugs except for NYI items
+ */
+
+#include "ne-profiler.h"
+
+/**
+ * @brief Construct a model profiler bound to a driver API
+ * @param[in] api driver API instance later used to query task profiles;
+ *                only the pointer is stored
+ */
+ModelProfiler::ModelProfiler (const DriverAPI * api) : api_ (api) {}
+
+/** @brief Destructor; clears the map of tasks that were registered for profiling */
+ModelProfiler::~ModelProfiler () { profile_map_.clear (); }
+
+/**
+ * @brief Register a task so that its profile can be looked up later
+ * @param[in] task_id ID of the task; used as the key of the profile map
+ * @param[in] model model that the task was submitted with
+ * @return the value returned by the profile map's insert ()
+ *         (presumably 0 on success and a negative errno otherwise -- TODO confirm)
+ */
+int
+ModelProfiler::appendTask (int task_id, const Model * model)
+{
+  return profile_map_.insert (task_id, new ProfileData (task_id, model));
+}
+
+/**
+ * @brief Fetch the profile of a registered task and post-process it if possible
+ * @param[in] task_id ID of a task previously registered with appendTask ()
+ * @param[out] profile profile structure filled in by the driver API
+ * @return 0 on success.
+ *         -EINVAL if the driver API or @a profile is missing, or if the
+ *         registered task has no model.
+ *         -ENOENT if @a task_id was never registered (or was already consumed).
+ *         Otherwise the non-zero status returned by DriverAPI::getProfile ().
+ * @note The task entry is removed from the map only on success; it stays
+ *       registered when the query fails.
+ */
+int
+ModelProfiler::getTaskProfile (int task_id, npu_profile *profile)
+{
+  /* fail gracefully instead of dereferencing a null driver API or output */
+  if (api_ == nullptr || profile == nullptr)
+    return -EINVAL;
+
+  ProfileData * data = profile_map_.find (task_id);
+  if (data == nullptr)
+    return -ENOENT;
+
+  const Model * model = data->getModel ();
+  if (model == nullptr)
+    return -EINVAL;
+
+  int status = api_->getProfile (task_id, profile);
+  if (status != 0)
+    return status;
+
+  /* per-layer conversion is only possible when the model has extended metadata */
+  HWmem * extended = model->getExtendedMetadata ();
+  if (extended != nullptr)
+    manipulateProfile (extended, profile);
+
+  profile_map_.remove (task_id);
+
+  return 0;
+}
+
+/**
+ * @brief Convert a per-vISA-instruction profile into a per-layer (node) profile
+ * @param[in] extended extended metadata whose data is read as npubin_meta_profile
+ * @param[in,out] profile on entry, one record per vISA table entry; on success
+ *                its layer array is replaced by one record per node table entry
+ *                plus a trailing "Unclassified" record
+ * @details The metadata is read as two tables stored back to back in entry_data:
+ *          1) node table: { uint32 id, uint32 length, char name[length] } entries
+ *          2) vISA table, starting at node_table_size:
+ *             { uint32 id, uint32 node_num, uint32 node_ids[node_num] } entries
+ *          The i-th vISA entry is matched with profile->layers[i]. Its counters
+ *          are split evenly over the listed nodes, or added to "Unclassified"
+ *          when no node is listed.
+ *          The profile is left untouched when the metadata is unusable (zero-length
+ *          name, empty node table, fewer profile records than vISA entries).
+ */
+void
+ModelProfiler::manipulateProfile (HWmem * extended, npu_profile *profile)
+{
+  if (extended == nullptr || profile == nullptr)
+    return;
+
+  npubin_meta_profile *meta_profile =
+    reinterpret_cast <npubin_meta_profile *> (extended->getData ());
+  if (meta_profile == nullptr)
+    return;
+
+  const uint32_t node_entry_num = meta_profile->node_entry_num;
+  const uint32_t visa_entry_num = meta_profile->visa_entry_num;
+
+  /* profile->layers[i] is read for every vISA entry; never index past its end */
+  if (visa_entry_num > 0 &&
+      (profile->layers == nullptr ||
+       static_cast<uint64_t> (profile->num_layers) < visa_entry_num)) {
+    std::cerr << "Profile has fewer entries than the vISA table" << std::endl;
+    return;
+  }
+
+  /* value-initialized: every counter (and any field not set below) starts at zero */
+  npu_profile_layer * new_layers =
+    new npu_profile_layer[static_cast<size_t> (node_entry_num) + 1] ();
+
+  /* the extra last record collects vISA entries that belong to no node */
+  npu_profile_layer * unclassified = &new_layers[node_entry_num];
+
+  snprintf (unclassified->name, NPU_OPNAME_MAX - 1, "%s", "Unclassified");
+  unclassified->name[NPU_OPNAME_MAX - 1] = '\x00';
+  unclassified->node_id = -1;
+  unclassified->visa_exec_seq = -1;
+
+  /** 1) parsing node table */
+  std::unordered_map<uint32_t, npu_profile_layer *> node_table;
+  uint32_t pos = 0;
+
+  node_table.reserve (node_entry_num);
+
+  for (uint32_t i = 0; i < node_entry_num; i++) {
+    uint32_t id, length;
+
+    memcpy (&id, meta_profile->entry_data + pos, sizeof (uint32_t));
+    pos += sizeof (uint32_t);
+
+    memcpy (&length, meta_profile->entry_data + pos, sizeof (uint32_t));
+    pos += sizeof (uint32_t);
+
+    if (length == 0) {
+      std::cerr << "Zero length detected at ";
+      std::cerr << id << "th node" << std::endl;
+
+      delete [] new_layers;
+      return;
+    }
+
+    npu_profile_layer * layer = &new_layers[i];
+
+    /* the precision keeps the read inside this entry even if the name lacks a NUL */
+    const uint32_t name_max = static_cast<uint32_t> (NPU_OPNAME_MAX);
+    const int name_len = static_cast<int> (length < name_max ? length : name_max);
+
+    snprintf (layer->name, NPU_OPNAME_MAX - 1, "%.*s", name_len,
+        meta_profile->entry_data + pos);
+    layer->name[NPU_OPNAME_MAX - 1] = '\x00';
+    layer->node_id = id;
+    /* a merged layer has no single execution sequence, even if it gets no data */
+    layer->visa_exec_seq = -1;
+    pos += length;
+
+    node_table.insert (std::make_pair (id, layer));
+  }
+
+  /** 2) parsing visa table */
+  pos = meta_profile->node_table_size;
+  for (uint32_t i = 0; i < visa_entry_num; i++) {
+    uint32_t node_num;
+
+    /* the vISA ID itself is not needed; entries are matched by position */
+    pos += sizeof (uint32_t);
+
+    memcpy (&node_num, meta_profile->entry_data + pos, sizeof (uint32_t));
+    pos += sizeof (uint32_t);
+
+    const npu_profile_layer & visa = profile->layers[i];
+
+    if (node_num > 0) {
+      for (uint32_t j = 0; j < node_num; j++) {
+        uint32_t node_id;
+
+        /* names have arbitrary lengths, so the IDs may be unaligned: copy them out */
+        memcpy (&node_id,
+            meta_profile->entry_data + pos + sizeof (uint32_t) * j,
+            sizeof (uint32_t));
+
+        auto it = node_table.find (node_id);
+        if (it == node_table.end ()) {
+          std::cerr << "Unable to find the node ID " << node_id << std::endl;
+          continue;
+        }
+
+        npu_profile_layer * layer = it->second;
+
+        /** TODO: evenly divided to fused layers */
+        layer->running_cycles += visa.running_cycles / node_num;
+        layer->dram_read_bytes += visa.dram_read_bytes / node_num;
+        layer->dram_write_bytes += visa.dram_write_bytes / node_num;
+        layer->sram_read_bytes += visa.sram_read_bytes / node_num;
+        layer->sram_write_bytes += visa.sram_write_bytes / node_num;
+      }
+    } else {
+      unclassified->running_cycles += visa.running_cycles;
+      unclassified->dram_read_bytes += visa.dram_read_bytes;
+      unclassified->dram_write_bytes += visa.dram_write_bytes;
+      unclassified->sram_read_bytes += visa.sram_read_bytes;
+      unclassified->sram_write_bytes += visa.sram_write_bytes;
+    }
+
+    pos += sizeof (uint32_t) * node_num;
+  }
+
+  /** 3) profile data mapping */
+  if (!node_table.empty ()) {
+    /* NOTE(review): assumes profile->layers was allocated with new[] -- confirm */
+    delete [] profile->layers;
+
+    profile->layers = new_layers;
+    /* count array slots, not unique IDs, so "Unclassified" (the last slot) is
+     * still reported when the node table contains duplicated IDs */
+    profile->num_layers = node_entry_num + 1;
+  } else {
+    delete [] new_layers;
+  }
+}
+
+
--- /dev/null
+/**
+ * Proprietary
+ * Copyright (C) 2021 Samsung Electronics
+ * Copyright (C) 2021 Dongju Chae <dongju.chae@samsung.com>
+ */
+/**
+ * @file ne-profiler.h
+ * @date 25 Mar 2021
+ * @brief Model profiler for NPU Engine (NE) users.
+ * @author Dongju Chae <dongju.chae@samsung.com>
+ * @bug No known bugs except for NYI items
+ */
+
+#ifndef __NPU_ENGINE_PROFILER_H__
+#define __NPU_ENGINE_PROFILER_H__
+
+#include <typedef.h>
+#include "ne-model.h"
+
+/** @brief Record that ties a task ID to the model the task was submitted with */
+class ProfileData
+{
+  public:
+    /**
+     * @brief Create a record for a task
+     * @param[in] task_id ID of the task
+     * @param[in] model model associated with the task; only the pointer is kept
+     */
+    ProfileData (int task_id, const Model * model)
+      : task_id_ (task_id), model_ (model)
+    {
+    }
+
+    /** @brief Get the model pointer given at construction */
+    const Model * getModel () { return model_; }
+
+  private:
+    int task_id_;         /**< ID of the task */
+    const Model * model_; /**< model associated with the task */
+};
+
+/**
+ * @brief Profiler that tracks submitted tasks and post-processes their profiles
+ *        through a given driver API
+ */
+class ModelProfiler
+{
+  public:
+    /** @brief Constructor; @a api is the driver API used to query profiles */
+    ModelProfiler (const DriverAPI * api);
+    /** @brief Destructor */
+    ~ModelProfiler ();
+
+    /** @brief Register a task and the model it was submitted with */
+    int appendTask (int task_id, const Model * model);
+    /** @brief Get the profile of a registered task */
+    int getTaskProfile (int task_id, npu_profile *profile);
+
+    /** @brief Rewrite a profile using the model's extended metadata */
+    void manipulateProfile (HWmem * extended, npu_profile *profile);
+
+  private:
+    const DriverAPI * api_; /**< driver API used to query profiles */
+    ThreadSafeMap<int, ProfileData> profile_map_; /**< registered tasks, keyed by task ID */
+};
+
+#endif
void *addr, size_t size) const { return -EPERM; }
#endif
+ /** @brief get profile data for vISA instructions */
virtual int getProfile (int task_id, npu_profile *profile) const { return -EPERM; }
virtual int getStatApps (npu_stat_apps *stat) const { return -EPERM; }
class EmulTask {
public:
EmulTask (int taskid) :
- taskid_ (taskid), stop_ (false), first_run_ (false),
- extended_dbuf_fd_ (0), extended_size_ (0) {}
+ taskid_ (taskid), stop_ (false), first_run_ (false) {}
void run_emul (char *prog, char **segt, char *metadata,
std::string cmd_path, std::string prof_path) {
task_.join ();
}
- void set_extended (int dbuf_fd, size_t size) {
- extended_dbuf_fd_ = dbuf_fd;
- extended_size_ = size;
- }
-
- int get_extended_dbuf_fd () {
- return extended_dbuf_fd_;
- }
-
- size_t get_extended_size () {
- return extended_size_;
- }
-
bool get_profile (npu_profile *profile) {
std::string path (DEFAULT_PROFILE_PATH);
path += "/ne_profile." + std::to_string (taskid_) + ".rec";
bool stop_;
bool first_run_;
std::thread task_;
-
- int extended_dbuf_fd_;
- size_t extended_size_;
};
/**
prog, segment_table, static_cast <char*> (elem_metadata->getAddr ()),
cmd_path, prof_path);
- task->set_extended (model->metadata_ext_dbuf_fd, model->metadata_ext_size);
task->run (func);
status = taskid;
return 0;
}
-void
-TrinityEmulAPI::manipulateProfile (EmulTask *task, npu_profile *profile) const
-{
- int dbuf_fd = task->get_extended_dbuf_fd ();
- size_t size = task->get_extended_size ();
-
- EmulElement *elem = elem_map_.find (dbuf_fd);
- if (elem == nullptr || elem->getAddr () == nullptr) {
- std::cerr << "No available extended metadata" << std::endl;
- return;
- }
-
- if (elem->getSize () != size) {
- std::cerr << "Extended metadata size mismatch: ";
- std::cerr << elem->getSize () << " vs. " << size << std::endl;
- return;
- }
-
- npubin_meta_profile *meta_profile =
- static_cast<npubin_meta_profile *> (elem->getAddr ());
- npu_profile_layer * new_layers =
- new npu_profile_layer[meta_profile->node_entry_num + 1];
-
- npu_profile_layer * unclassified =
- &new_layers[meta_profile->node_entry_num];
-
- snprintf (unclassified->name, NPU_OPNAME_MAX - 1, "%s", "Unclassified");
- unclassified->name[NPU_OPNAME_MAX - 1] = '\x00';
- unclassified->node_id = -1;
-
- unclassified->running_cycles = 0;
- unclassified->dram_read_bytes = 0;
- unclassified->dram_write_bytes = 0;
- unclassified->sram_read_bytes = 0;
- unclassified->sram_write_bytes = 0;
-
- /** 1) parsing node table */
- std::unordered_map<uint32_t, npu_profile_layer *> node_table;
- uint32_t pos = 0;
-
- node_table.reserve (meta_profile->node_entry_num);
-
- for (uint32_t i = 0; i < meta_profile->node_entry_num; i++) {
- uint32_t id, length;
-
- memcpy (&id, meta_profile->entry_data + pos, sizeof (uint32_t));
- pos += sizeof (uint32_t);
-
- memcpy (&length, meta_profile->entry_data + pos, sizeof (uint32_t));
- pos += sizeof (uint32_t);
-
- if (length == 0) {
- std::cerr << "Zero length detected at ";
- std::cerr << id << "th node" << std::endl;
-
- delete [] new_layers;
- return;
- }
-
- std::string name (meta_profile->entry_data + pos);
- pos += length;
-
- npu_profile_layer * layer = &new_layers[i];
-
- snprintf (layer->name, NPU_OPNAME_MAX - 1, "%s", name.c_str ());
- layer->name[NPU_OPNAME_MAX - 1] = '\x00';
- layer->node_id = id;
-
- layer->running_cycles = 0;
- layer->dram_read_bytes = 0;
- layer->dram_write_bytes = 0;
- layer->sram_read_bytes = 0;
- layer->sram_write_bytes = 0;
-
- node_table.insert(std::make_pair(id, layer));
- }
-
- /** 2) parsing visa table */
- pos = meta_profile->node_table_size;
- for (uint32_t i = 0; i < meta_profile->visa_entry_num; i++) {
- uint32_t id, node_num;
-
- memcpy (&id, meta_profile->entry_data + pos, sizeof (uint32_t));
- pos += sizeof (uint32_t);
-
- memcpy (&node_num, meta_profile->entry_data + pos, sizeof (uint32_t));
- pos += sizeof (uint32_t);
-
- if (node_num > 0) {
- uint32_t * node_ids = (uint32_t *) (meta_profile->entry_data + pos);
-
- for (uint32_t j = 0; j < node_num; j++) {
- uint32_t node_id = node_ids[j];
- auto it = node_table.find (node_id);
-
- if (it != node_table.end ()) {
- npu_profile_layer * layer = it->second;
-
- /** TODO: evenly divided to fused layers */
- layer->running_cycles += profile->layers[i].running_cycles / node_num;
- layer->dram_read_bytes += profile->layers[i].dram_read_bytes / node_num;
- layer->dram_write_bytes += profile->layers[i].dram_write_bytes / node_num;
- layer->sram_read_bytes += profile->layers[i].sram_read_bytes / node_num;
- layer->sram_write_bytes += profile->layers[i].sram_write_bytes / node_num;
- layer->visa_exec_seq = -1;
- } else {
- std::cerr << "Unable to find the node ID " << node_id << std::endl;
- }
- }
- } else {
- unclassified->running_cycles += profile->layers[i].running_cycles;
- unclassified->dram_read_bytes += profile->layers[i].dram_read_bytes;
- unclassified->dram_write_bytes += profile->layers[i].dram_write_bytes;
- unclassified->sram_read_bytes += profile->layers[i].sram_read_bytes;
- unclassified->sram_write_bytes += profile->layers[i].sram_write_bytes;
- unclassified->visa_exec_seq = -1;
- }
-
- pos += sizeof (uint32_t) * node_num;
- }
-
- /** 3) profile data mapping */
- size_t num_layers = node_table.size ();
- if (node_table.size () > 0) {
- delete [] profile->layers;
-
- profile->layers = new_layers;
- profile->num_layers = num_layers + 1;
- } else {
- delete [] new_layers;
- }
-}
-
int
TrinityEmulAPI::getProfile (int taskid, npu_profile *profile) const
{
if (!task->get_profile (profile))
return -EINVAL;
- if (task->get_extended_size () != 0)
- manipulateProfile (task, profile);
-
task_map_.remove (taskid);
return 0;
return -errno;
}
- /** TODO: manipulate the profiling info later (i.e., per-visa to per-layer) */
-
return 0;
}