3 * Copyright (C) 2021 Samsung Electronics
4 * Copyright (C) 2021 Dongju Chae <dongju.chae@samsung.com>
9 * @brief Model profiler for NPU Engine (NE) users.
10 * @author Dongju Chae <dongju.chae@samsung.com>
11 * @bug No known bugs except for NYI items
14 #include "ne-profiler.h"
/**
 * @brief Constructor of the model profiler.
 * @param[in] api driver API handle supplied by the caller.
 * @note NOTE(review): the initializer list/body is not visible in this
 *       excerpt; presumably it stores `api` in `api_`, which
 *       getTaskProfile() dereferences -- confirm.
 */
16 ModelProfiler::ModelProfiler (const DriverAPI * api)
/**
 * @brief Destructor of the model profiler; empties the task profile map.
 * @note NOTE(review): whether clear() also releases the ProfileData objects
 *       allocated in appendTask() depends on the map type, which is not
 *       visible here -- confirm.
 */
21 ModelProfiler::~ModelProfiler ()
/* drop every registered task entry */
23 profile_map_.clear ();
/**
 * @brief Register a task so that its profile can be queried later.
 * @param[in] task_id key under which the new entry is stored in profile_map_.
 * @param[in] model model handed to the new ProfileData entry.
 * @return whatever profile_map_.insert() returns for this entry.
 * @note NOTE(review): `data` is allocated with a raw `new`; if insert() can
 *       fail without taking ownership (e.g., duplicated task_id), the object
 *       would leak -- confirm the map's ownership semantics.
 */
27 ModelProfiler::appendTask (int task_id, const Model * model)
/* per-task bookkeeping object binding the task ID to its model */
29 ProfileData * data = new ProfileData (task_id, model);
30 return profile_map_.insert (task_id, data);
/**
 * @brief Fetch the profile of a registered task and post-process it.
 * @param[in] task_id key used to look up the entry stored by appendTask().
 * @param[out] profile passed to api_->getProfile() and, when the model has
 *             extended metadata, to manipulateProfile().
 * @note The task entry is removed from profile_map_ once the visible steps
 *       have run.
 * @note NOTE(review): the error checks between the statements below (on
 *       `data`, `model` and `status`) and the return statements are not
 *       visible in this excerpt -- confirm that each failure path returns
 *       before the pointer is dereferenced.
 */
34 ModelProfiler::getTaskProfile (int task_id, npu_profile *profile)
/* look up the bookkeeping entry registered for this task */
36 ProfileData * data = profile_map_.find (task_id);
40 const Model * model = data->getModel ();
/* ask the driver API to fill in the profile for this task */
44 int status = api_->getProfile (task_id, profile);
/* extended metadata is optional; only models that carry it get remapped */
48 HWmem * extended = model->getExtendedMetadata ();
49 if (extended != nullptr)
50 manipulateProfile (extended, profile);
/* the entry is removed after this query */
52 profile_map_.remove (task_id);
/**
 * @brief Re-map the profile layers onto the nodes listed in the model's
 *        extended metadata.
 * @param[in] extended memory whose data is interpreted as npubin_meta_profile.
 * @param[in,out] profile profile whose `layers` are read per visa entry and,
 *                when at least one node was parsed, replaced by a new array
 *                (one layer per node entry plus an "Unclassified" layer).
 * @details Three steps: 1) parse the node table (id, length, name) into
 *          freshly allocated layers, 2) walk the visa table and accumulate
 *          each profile layer's counters into the nodes it references,
 *          3) swap the new layer array into `profile`.
 * @note NOTE(review): several statements of this function (declarations of
 *       `pos`, `id`, `length`, some branches and braces) are not visible in
 *       this excerpt; comments below only describe what is shown.
 */
58 ModelProfiler::manipulateProfile (HWmem * extended, npu_profile *profile)
/* the extended metadata buffer is viewed as a profile meta header */
60 npubin_meta_profile *meta_profile =
61 reinterpret_cast <npubin_meta_profile *> (extended->getData ());
/* one layer per node entry, plus one extra slot for unclassified data */
62 npu_profile_layer * new_layers =
63 new npu_profile_layer[meta_profile->node_entry_num + 1];
/* the extra slot is the last element of the array */
65 npu_profile_layer * unclassified =
66 &new_layers[meta_profile->node_entry_num];
/* the last byte is terminated explicitly regardless of what snprintf wrote */
68 snprintf (unclassified->name, NPU_OPNAME_MAX - 1, "%s", "Unclassified");
69 unclassified->name[NPU_OPNAME_MAX - 1] = '\x00';
70 unclassified->node_id = -1;
/* counters start from zero and are accumulated in step 2 */
72 unclassified->running_cycles = 0;
73 unclassified->dram_read_bytes = 0;
74 unclassified->dram_write_bytes = 0;
75 unclassified->sram_read_bytes = 0;
76 unclassified->sram_write_bytes = 0;
78 /** 1) parsing node table */
/* maps a node ID to its layer slot inside new_layers */
79 std::unordered_map<uint32_t, npu_profile_layer *> node_table;
82 node_table.reserve (meta_profile->node_entry_num);
84 for (uint32_t i = 0; i < meta_profile->node_entry_num; i++) {
/* each entry starts with two uint32 fields (id, length), copied via memcpy */
87 memcpy (&id, meta_profile->entry_data + pos, sizeof (uint32_t));
88 pos += sizeof (uint32_t);
90 memcpy (&length, meta_profile->entry_data + pos, sizeof (uint32_t));
91 pos += sizeof (uint32_t);
/* NOTE(review): the condition and the follow-up action of this error report
 * are not visible in this excerpt -- confirm how a zero length is handled */
94 std::cerr << "Zero length detected at ";
95 std::cerr << id << "th node" << std::endl;
/* the name is read as a NUL-terminated string starting at the current offset;
 * NOTE(review): how `pos` advances past the name is not visible here */
101 std::string name (meta_profile->entry_data + pos);
/* the i-th node entry fills the i-th new layer */
104 npu_profile_layer * layer = &new_layers[i];
106 snprintf (layer->name, NPU_OPNAME_MAX - 1, "%s", name.c_str ());
107 layer->name[NPU_OPNAME_MAX - 1] = '\x00';
110 layer->running_cycles = 0;
111 layer->dram_read_bytes = 0;
112 layer->dram_write_bytes = 0;
113 layer->sram_read_bytes = 0;
114 layer->sram_write_bytes = 0;
/* std::unordered_map::insert keeps the first mapping if `id` is duplicated */
116 node_table.insert(std::make_pair(id, layer));
119 /** 2) parsing visa table */
/* the visa table begins at offset node_table_size within entry_data */
120 pos = meta_profile->node_table_size;
121 for (uint32_t i = 0; i < meta_profile->visa_entry_num; i++) {
122 uint32_t id, node_num;
124 memcpy (&id, meta_profile->entry_data + pos, sizeof (uint32_t));
125 pos += sizeof (uint32_t);
127 memcpy (&node_num, meta_profile->entry_data + pos, sizeof (uint32_t));
128 pos += sizeof (uint32_t);
/* node_num node IDs follow the two header fields;
 * NOTE(review): unlike the memcpy reads above, these are read through a cast
 * pointer, which assumes suitable alignment of entry_data + pos -- confirm */
131 uint32_t * node_ids = (uint32_t *) (meta_profile->entry_data + pos);
133 for (uint32_t j = 0; j < node_num; j++) {
134 uint32_t node_id = node_ids[j];
135 auto it = node_table.find (node_id);
137 if (it != node_table.end ()) {
138 npu_profile_layer * layer = it->second;
/* NOTE(review): profile->layers is indexed by the visa entry index `i`; no
 * check against the number of layers in `profile` is visible -- confirm */
140 /** TODO: evenly divided to fused layers */
141 layer->running_cycles += profile->layers[i].running_cycles / node_num;
142 layer->dram_read_bytes += profile->layers[i].dram_read_bytes / node_num;
143 layer->dram_write_bytes += profile->layers[i].dram_write_bytes / node_num;
144 layer->sram_read_bytes += profile->layers[i].sram_read_bytes / node_num;
145 layer->sram_write_bytes += profile->layers[i].sram_write_bytes / node_num;
146 layer->visa_exec_seq = -1;
148 std::cerr << "Unable to find the node ID " << node_id << std::endl;
/* the whole (undivided) counters of the i-th profile layer go to the
 * unclassified slot; NOTE(review): the condition guarding this accumulation
 * is not visible in this excerpt -- confirm */
152 unclassified->running_cycles += profile->layers[i].running_cycles;
153 unclassified->dram_read_bytes += profile->layers[i].dram_read_bytes;
154 unclassified->dram_write_bytes += profile->layers[i].dram_write_bytes;
155 unclassified->sram_read_bytes += profile->layers[i].sram_read_bytes;
156 unclassified->sram_write_bytes += profile->layers[i].sram_write_bytes;
157 unclassified->visa_exec_seq = -1;
/* skip the node ID list to reach the next visa entry */
160 pos += sizeof (uint32_t) * node_num;
163 /** 3) profile data mapping */
164 size_t num_layers = node_table.size ();
165 if (num_layers > 0) {
/* the previous layer array is released and replaced by the new one */
166 delete [] profile->layers;
168 profile->layers = new_layers;
/* +1 accounts for the unclassified layer;
 * NOTE(review): the unclassified slot sits at index node_entry_num, so if
 * node_table.size () is smaller than node_entry_num (e.g., duplicated IDs),
 * it would fall outside the reported num_layers -- confirm */
169 profile->num_layers = num_layers + 1;
/* presumably the branch taken when no node was parsed, so the unused array
 * is released -- the enclosing branch is not visible here, confirm */
171 delete [] new_layers;