* other requests. FIFO is used among the same priority requests.
*/
typedef enum {
- NPU_PRIORITY_LOW = 0, /**< Low priority: tasks could be delayed or canceled */
- NPU_PRIORITY_MID = 1, /**< Mid priority: tasks could be slightly delayed */
- NPU_PRIORITY_HIGH = 2, /**< High priority: tasks should be issued immediately */
+ NPU_PRIORITY_LOW = 0, /**< Low priority: tasks could be delayed or canceled */
+ NPU_PRIORITY_MID = 1, /**< Mid priority: tasks could be slightly delayed */
+ NPU_PRIORITY_HIGH = 2, /**< High priority: tasks should be issued immediately */
+ NPU_PRIORITY_PROFILE = 3, /**< Special priority for profiling: required for getNPU_profile (); the host input service invokes such requests directly instead of queueing a thread task */
} npu_priority;
/**
typedef struct {
char name[NPU_OPNAME_MAX];
- uint32_t latency_ms;
- uint64_t latency_cycles;
+ uint64_t running_cycles; /**< Cycles spent by this entry (the emulator reports cycle_end - cycle_start) */
- uint64_t mem_read_bytes;
- uint64_t mem_write_bytes;
- /* TBD */
+ uint64_t dram_read_bytes; /**< Bytes read from DRAM (the emulator sets this for input-DMA entries) */
+ uint64_t dram_write_bytes; /**< Bytes written to DRAM (the emulator sets this for output-DMA entries) */
+
+ uint64_t sram_read_bytes; /**< Bytes read from SRAM (the emulator sets this for output-DMA entries) */
+ uint64_t sram_write_bytes; /**< Bytes written to SRAM (the emulator sets this for input-DMA entries) */
} npu_profile_layer;
typedef struct {
uint32_t num_layers;
+
+ uint64_t total_running_cycles; /**< Total cycles of the profiled run */
+ uint64_t dram_input_footprint; /**< DRAM access footprint of the input DMA, in bytes */
+ uint64_t dram_output_footprint; /**< DRAM access footprint of the output DMA, in bytes */
+
npu_profile_layer *layers;
} npu_profile;
* @param[in] task_id Identifier for each inference (obtained by runNPU_*)
* @param[out] profile Profile instance
* @return 0 if no error, otherwise a negative errno.
+ * @note This API supports only models whose priority is NPU_PRIORITY_PROFILE.
*/
int getNPU_profile (npudev_h dev, int task_id, npu_profile *profile);
{
public:
Quantizer () : to_npu_ (true), zero_ (0), scale_ (0.0) {}
- ~Quantizer () {}
+ virtual ~Quantizer () {}
void set_direction (bool to_npu) { to_npu_ = to_npu; }
const DriverAPI * api = device_->getDriverAPI ();
assert (api != nullptr);
- void *profile_buffer;
- size_t profile_size;
- int status = api->getProfile (task_id, &profile_buffer, &profile_size);
+ profile->num_layers = 0;
+ profile->layers = nullptr;
+
+ int status = api->getProfile (task_id, profile);
if (status != 0) {
logerr (TAG, "Failed to get profile information: %d\n", status);
return status;
}
- profile->num_layers = 0;
- profile->layers = nullptr;
- if (profile_buffer != nullptr) {
- // TODO: Perform parsing
- }
-
return 0;
}
if (api == nullptr)
return -EINVAL;
+ if (model != nullptr) {
+ npuConstraint constraint = model->getConstraint ();
+ if (constraint.priority == NPU_PRIORITY_PROFILE)
+ return invoke_buffer (api, model, buffer, callback);
+ }
+
taskFunc func = std::bind (&HostInputService::invoke_buffer, this,
api, model, buffer, callback);
ThreadTask *task = new ThreadTask (id, func);
if (api == nullptr || model == nullptr)
return -EINVAL;
+ npuConstraint constraint = model->getConstraint ();
+ if (constraint.priority == NPU_PRIORITY_PROFILE)
+ return invoke_segt (api, model, segt, callback);
+
taskFunc func = std::bind (&HostInputService::invoke_segt, this,
api, model, segt, callback);
ThreadTask *task = new ThreadTask (id, func);
* @param[in] model the target model
* @param[in] buffer the target buffer
* @param[in] callback output callback
+ * @return 0 if no error, otherwise a negative errno
* @note this function should be used with TRIV driver!
*/
-void
+int
HostInputService::invoke_buffer (const DriverAPI *api, const Model *model,
Buffer *buffer, outputCallback callback)
{
input_config_t input_config;
device_state_t state;
- int ret;
+ int ret = -EINVAL;
state = api->isReady();
if (state != device_state_t::STATE_READY) {
/** should call the callback regardless of failure, to avoid deadlock */
if (callback != nullptr)
callback ();
+
+ return ret;
}
/**
* @param[in] model the target model
* @param[in] segt the target segment table
* @param[in] callback output callback
+ * @return 0 if no error, otherwise a negative errno
* @note this function should be used with TRIV2 driver!
*/
-void
+int
HostInputService::invoke_segt (const DriverAPI *api, const Model *model,
SegmentTable *segt, outputCallback callback)
{
input_config_t input_config;
device_state_t state;
npuConstraint constraint;
- int ret;
+ int ret = -EINVAL;
state = api->isReady();
if (state != device_state_t::STATE_READY) {
/** should call the callback regardless of failure, to avoid deadlock */
if (callback != nullptr)
callback ();
+
+ return ret;
}
private:
/** do not allow to directly call invoke () */
- void invoke_buffer (const DriverAPI *api, const Model *model, Buffer *buffer,
+ int invoke_buffer (const DriverAPI *api, const Model *model, Buffer *buffer,
outputCallback callback);
- void invoke_segt (const DriverAPI *api, const Model *model, SegmentTable *segt,
+ int invoke_segt (const DriverAPI *api, const Model *model, SegmentTable *segt,
outputCallback callback);
static std::unique_ptr<HostInputService> instance_;
} else
status = -EINVAL;
- if (status != 0) {
+ if (status < 0) {
/** if failed to invoke input service, directly handle callback (if exists) */
handleCallback (req);
}
void *addr, size_t size) const { return -EPERM; }
#endif
- virtual int getProfile (int task_id, void **profile_buf,
- size_t *profile_size) const { return -EPERM; }
+ virtual int getProfile (int task_id, npu_profile *profile) const { return -EPERM; }
virtual int getStatApps (npu_stat_apps *stat) const { return -EPERM; }
virtual int getStatTasks (int appid, npu_stat_tasks *stat) const { return -EPERM; }
void *addr, size_t size) const;
#endif
- int getProfile (int task_id, void **profile_buf,
- size_t *profile_size) const;
+ int getProfile (int task_id, npu_profile *profile) const;
int getStatApps (npu_stat_apps *stat) const;
int getStatTasks (int appid, npu_stat_tasks *stat) const;
int registerModel (model_config_t *model, uint64_t npu_version) const;
int deregisterModel (unsigned long long id) const;
- int getProfile (int task_id, void **profile_buf,
- size_t *profile_size) const;
+ int getProfile (int task_id, npu_profile *profile) const;
private:
static std::atomic<int> global_fd_;
#include <assert.h>
#include <atomic>
#include <memory>
+#include <chrono>
-#include <mrpsim.h>
+#include <triv2profile.h>
#include <npubinfmt.h>
#include <ne-conf.h>
#define MAX_EMUL_DEVICES (3)
#define ENV_PREFIX_SHARE "NE_PREFIX_SHARE"
#define DEFAULT_PREFIX_SHARE "/opt/trinity/share"
+#define DEFAULT_PROFILE_PATH "/tmp"
class EmulTask {
public:
- EmulTask () : stop_ (false) {}
+ /** @brief create a task bound to @a taskid (explicit: no implicit conversion from int) */
+ explicit EmulTask (int taskid) : taskid_ (taskid), stop_ (false), first_run_ (false) {}
void run_emul (char *prog, char **segt, char *metadata,
- const char *cmd_path) {
- while (!stop_)
- run_triv2_emul (prog, segt, metadata, cmd_path);
+     std::string cmd_path, std::string prof_path) {
+   /**
+    * The emulator must run at least once, even when a stop request
+    * arrives before this loop is entered; first_run_ guards that case.
+    */
+   first_run_ = true;
+
+   for (;;) {
+     /** leave only on a stop request, and never before the first run */
+     if (stop_ && !first_run_)
+       break;
+
+     run_triv2_emul (prog, segt, metadata, cmd_path.c_str (), prof_path.c_str ());
+     first_run_ = false;
+
+     /** a pending stop request skips the pause and re-checks the guard */
+     if (stop_)
+       continue;
+
+     /** pause for one second between consecutive runs */
+     std::this_thread::sleep_for (std::chrono::seconds (1));
+   }
delete [] segt;
}
task_.join ();
}
+ /**
+  * @brief Fill @a profile from the profile record file of this task
+  * @param[out] profile profile instance to fill (ignored if null)
+  * @note The records are read from
+  *       DEFAULT_PROFILE_PATH/ne_profile.<taskid>.rec. @a profile is left
+  *       untouched when the file cannot be opened, its header cannot be
+  *       read, or it announces no records. When a record is truncated or
+  *       has an unknown block id, parsing stops there and num_layers only
+  *       counts the records parsed so far.
+  */
+ void get_profile (npu_profile *profile) {
+   if (profile == nullptr)
+     return;
+
+   std::string path (DEFAULT_PROFILE_PATH);
+   path += "/ne_profile." + std::to_string (taskid_) + ".rec";
+
+   std::ifstream ifs (path, std::ios::binary);
+   if (!ifs.good ())
+     return;
+
+   /** never trust the header unless it has been read completely */
+   T2PF_HEAD head;
+   if (!ifs.read (reinterpret_cast<char *> (&head), sizeof (T2PF_HEAD)))
+     return;
+
+   uint32_t total_dump =
+       head.nna.num_of_dump + head.nna_dma_in.num_of_dump + head.nna_dma_out.num_of_dump;
+   if (total_dump == 0)
+     return;
+
+   /** value-initialized: entries that are never filled stay zeroed */
+   profile->layers = new npu_profile_layer [total_dump] ();
+   profile->num_layers = total_dump;
+   profile->total_running_cycles = head.total_cycles;
+   profile->dram_input_footprint = head.nna_dma_in.access_footprint_byte;
+   profile->dram_output_footprint = head.nna_dma_out.access_footprint_byte;
+
+   T2PF_DUMP_NNA nna;
+   T2PF_DUMP_DMA nna_dma;
+   uint32_t parsed = 0;
+
+   while (parsed < total_dump) {
+     npu_profile_layer *layer = &profile->layers[parsed];
+
+     /** peek the common part to learn the record type, then rewind */
+     T2PF_DUMP common;
+     const std::streampos pos = ifs.tellg ();
+     if (!ifs.read (reinterpret_cast<char *> (&common), sizeof (T2PF_DUMP)))
+       break;
+     ifs.seekg (pos);
+
+     bool filled = false;
+
+     switch (common.block_id) {
+       case TRIV2PROF_BLOCKID_NNA:
+         if (ifs.read (reinterpret_cast<char *> (&nna), sizeof (T2PF_DUMP_NNA))) {
+           snprintf (layer->name, NPU_OPNAME_MAX, "%s", nna.op_name);
+           layer->running_cycles = common.cycle_end - common.cycle_start;
+           filled = true;
+         }
+         break;
+       case TRIV2PROF_BLOCKID_NNA_DMA_IN:
+         if (ifs.read (reinterpret_cast<char *> (&nna_dma), sizeof (T2PF_DUMP_DMA))) {
+           snprintf (layer->name, NPU_OPNAME_MAX, "%s", "ADMA_IN");
+           layer->running_cycles = common.cycle_end - common.cycle_start;
+           layer->dram_read_bytes = nna_dma.src_addr_end - nna_dma.src_addr_start;
+           layer->sram_write_bytes = nna_dma.dest_addr_end - nna_dma.dest_addr_start;
+           filled = true;
+         }
+         break;
+       case TRIV2PROF_BLOCKID_NNA_DMA_OUT:
+         if (ifs.read (reinterpret_cast<char *> (&nna_dma), sizeof (T2PF_DUMP_DMA))) {
+           snprintf (layer->name, NPU_OPNAME_MAX, "%s", "ADMA_OUT");
+           layer->running_cycles = common.cycle_end - common.cycle_start;
+           layer->dram_write_bytes = nna_dma.src_addr_end - nna_dma.src_addr_start;
+           layer->sram_read_bytes = nna_dma.dest_addr_end - nna_dma.dest_addr_start;
+           filled = true;
+         }
+         break;
+       default:
+         /**
+          * The size of an unknown record is not known, so the stream cannot
+          * be advanced past it; continuing would re-parse the same bytes.
+          */
+         std::cerr << "Unknown block id detected: " << common.block_id << std::endl;
+         break;
+     }
+
+     if (!filled)
+       break;
+
+     parsed++;
+   }
+
+   /** report only the records that were actually parsed */
+   profile->num_layers = parsed;
+ }
+
private:
+ int taskid_;
bool stop_;
+ bool first_run_;
std::thread task_;
};
else
cmd_path += "/mRPsim/triv2.cmd";
- if (input_config->input_mode == INPUT_HW) {
- int taskid = global_fd_.fetch_add (1);
- EmulTask *task = new EmulTask;
+ int taskid = global_fd_.fetch_add (1);
+ EmulTask *task = new EmulTask (taskid);
- status = task_map_.insert (taskid, task);
- if (status != 0) {
- delete [] segment_table;
- return status;
- }
-
- auto func = std::bind (&EmulTask::run_emul, task,
- prog, segment_table, static_cast <char*> (elem_metadata->getAddr ()),
- cmd_path.c_str ());
-
- task->run (func);
- status = taskid;
- } else {
- status = run_triv2_emul (prog, segment_table,
- static_cast <char*> (elem_metadata->getAddr ()),
- cmd_path.c_str ());
+ status = task_map_.insert (taskid, task);
+ if (status != 0) {
delete [] segment_table;
+ return status;
}
+
+ std::string prof_path (DEFAULT_PROFILE_PATH);
+
+ prof_path += "/ne_profile." + std::to_string (taskid);
+
+ auto func = std::bind (&EmulTask::run_emul, task,
+ prog, segment_table, static_cast <char*> (elem_metadata->getAddr ()),
+ cmd_path, prof_path);
+
+ task->run (func);
+ status = taskid;
+
+ if (input_config->input_mode != INPUT_HW)
+ task->stop ();
}
return status;
}
int
-TrinityEmulAPI::getProfile (int task_id, void **profile_buf,
- size_t *profile_size) const
+TrinityEmulAPI::getProfile (int taskid, npu_profile *profile) const
{
- // TODO: allocate the buffer and call APIs from simulator
+   /** refuse the request unless initialized () holds */
+   if (!initialized ())
+     return -EPERM;
+
+   /** look up the emulated task registered under this id */
+   EmulTask *emul_task = task_map_.find (taskid);
+   if (emul_task == nullptr)
+     return -ENOENT;
+
+   /** fill the profile first, then remove the task entry from the map */
+   emul_task->get_profile (profile);
+   task_map_.remove (taskid);
+
return 0;
}
#endif
int
-TrinityVision2API::getProfile (int task_id, void **profile_buf,
- size_t *profile_size) const
+TrinityVision2API::getProfile (int task_id, npu_profile *profile) const
{
- struct trinity_profile profile;
+ struct trinity_profile t_profile;
size_t size = max_buf_size;
void * buf;
int ret;
if (!buf)
return -ENOMEM;
- profile.task_id = task_id;
- profile.buf = buf;
- profile.buf_size = size;
- profile.next_size = 0;
+ t_profile.task_id = task_id;
+ t_profile.buf = buf;
+ t_profile.buf_size = size;
+ t_profile.next_size = 0;
- ret = ioctl (this->getDeviceFD (), TRINITY_IOCTL_GET_PROFILE, &profile);
+ ret = ioctl (this->getDeviceFD (), TRINITY_IOCTL_GET_PROFILE, &t_profile);
if (ret != 0)
goto ioctl_fail;
- if (profile.next_size != 0) {
- buf = realloc (profile.buf, profile.buf_size + profile.next_size);
+ if (t_profile.next_size != 0) {
+ buf = realloc (t_profile.buf, t_profile.buf_size + t_profile.next_size);
if (!buf) {
- free (profile.buf);
+ free (t_profile.buf);
return -ENOMEM;
}
- profile.buf = (char *) buf + profile.buf_size;
- profile.buf_size = profile.next_size;
- profile.next_size = 0;
+ t_profile.buf = (char *) buf + t_profile.buf_size;
+ t_profile.buf_size = t_profile.next_size;
+ t_profile.next_size = 0;
- size += profile.next_size;
+ size += t_profile.next_size;
- ret = ioctl (this->getDeviceFD (), TRINITY_IOCTL_GET_PROFILE, &profile);
+ ret = ioctl (this->getDeviceFD (), TRINITY_IOCTL_GET_PROFILE, &t_profile);
if (ret != 0)
goto ioctl_fail;
}
- *profile_buf = buf;
- *profile_size = size;
-
+ /** TODO: manipulate 'profile' from the obtained 't_profile' buffer */
+ free (buf);
return 0;
ioctl_fail:
- free (profile.buf);
+ free (buf);
return -errno;
}
*/
void putNPU_profile (npu_profile *profile)
{
- if (profile != nullptr)
- free (profile);
+   /** nothing to release for a null instance */
+   if (profile == nullptr)
+     return;
+
+   /** release the layer array, then reset every field of the instance */
+   delete [] profile->layers;
+   memset (profile, 0, sizeof (npu_profile));
}
/**
}
return UtilTRIV2::loadModel (
- model_dir, &model_id_, NPU_PRIORITY_MID, NPU_TIMEOUT_MS);
+ model_dir, &model_id_, NPU_PRIORITY_PROFILE, NPU_TIMEOUT_MS);
}
/** @brief run the inference */
int status = UtilTRIV2::getProfile (task_id, &profile);
if (status == 0) {
if (profile.layers != nullptr) {
+ cerr << "Total System Cycles : " << profile.total_running_cycles << "\n";
+ cerr << "Dram Input Footprint (KB) : " << (profile.dram_input_footprint >> 10) << "\n";
+ cerr << "Dram Output Footprint (KB) : " << (profile.dram_output_footprint >> 10) << "\n";
+ cerr << "\n";
+
for (uint32_t i = 0; i < profile.num_layers; i++) {
cerr << "[" << i << "] " << profile.layers[i].name << "\n";
- cerr << "\tLatency (msec) : " << profile.layers[i].latency_ms << "\n";
- cerr << "\tLatency (cycles) : " << profile.layers[i].latency_cycles << "\n";
- cerr << "\tMemRead (KBytes) : " << (profile.layers[i].mem_read_bytes >> 10) << "\n";
- cerr << "\tMemWrite (KBytes) : " << (profile.layers[i].mem_write_bytes >> 10) << "\n";
+ cerr << "\tRunning Cycles : " << profile.layers[i].running_cycles << "\n";
+ cerr << "\tDram Read (KB) : " << (profile.layers[i].dram_read_bytes >> 10) << "\n";
+ cerr << "\tDram Write (KB) : " << (profile.layers[i].dram_write_bytes >> 10) << "\n";
+ cerr << "\tSRAM Read (KB) : " << (profile.layers[i].sram_read_bytes >> 10) << "\n";
+ cerr << "\tSRAM Write (KB) : " << (profile.layers[i].sram_write_bytes >> 10) << "\n";
}
- free (profile.layers);
}
+
+ putNPU_profile (&profile);
} else {
cerr << "Failed to get profile: " << status << "\n";
}
class UtilTrinity {
public:
UtilTrinity (dev_type type, bool need_model, bool verify);
- ~UtilTrinity ();
+ virtual ~UtilTrinity ();
npudev_h getDeviceHandle () { return dev_; }
lib_ver += to_string (minor) + ".";
lib_ver += to_string (extra);
+#ifndef ENABLE_EMUL
uint32_t api_level = get_api_level ();
+#endif
ss_ << std::ctime (&now_time);
ss_ << "+------------------------------------------------------------+\n";