3 * Copyright (C) 2021 Samsung Electronics
4 * Copyright (C) 2021 Dongju Chae <dongju.chae@samsung.com>
9 * @brief Model profiler for NPU Engine (NE) users.
10 * @author Dongju Chae <dongju.chae@samsung.com>
11 * @bug No known bugs except for NYI items
14 #include "ne-profiler.h"
/**
 * @brief Constructor of the model profiler.
 * @param[in] api driver API pointer
 * @note The initializer list and body are not visible in this excerpt;
 *       presumably `api` is kept in api_ (which getProfile () calls into)
 *       - TODO confirm.
 */
16 ModelProfiler::ModelProfiler (const DriverAPI * api)
/**
 * @brief Destructor of the model profiler; clears profile_map_.
 * @note NOTE(review): appendRequest () stores ProfileData objects created
 *       with `new`. Whether clear () releases them depends on the map type,
 *       which is declared outside this file - confirm nothing is leaked.
 */
21 ModelProfiler::~ModelProfiler ()
23 profile_map_.clear ();
/**
 * @brief Register a request so that its profile can be looked up later.
 * @param[in] req_id request id, used as the key in profile_map_
 * @param[in] model model handed to the newly created ProfileData
 * @return the value returned by profile_map_.insert ()
 */
27 ModelProfiler::appendRequest (int req_id, const Model * model)
/** NOTE(review): who frees `data` if insert () fails is not visible here - confirm it is not leaked */
29 ProfileData * data = new ProfileData (req_id, model);
30 return profile_map_.insert (req_id, data);
/**
 * @brief Fetch the profile of a registered request and, when the model
 *        carries extended metadata, remap it with manipulateProfile ().
 * @param[in] req_id request id previously passed to appendRequest ()
 * @param[in,out] profile profile structure filled by the driver API and
 *                possibly rewritten by manipulateProfile ()
 * @note Several lines of the original body (the checks between the
 *       statements below and the return statement) are not visible in this
 *       excerpt, so the error handling is not described here.
 */
34 ModelProfiler::getProfile (int req_id, npu_profile *profile)
/** look up the entry created by appendRequest () */
36 ProfileData * data = profile_map_.find (req_id);
/** NOTE(review): presumably guarded by a null check on `data` in the lines not shown - confirm */
40 const Model * model = data->getModel ();
/** ask the driver API to fill `profile` for this request */
44 int status = api_->getProfile (req_id, profile);
/** remapping is applied only when the model provides extended metadata */
48 HWmem * extended = model->getExtendedMetadata ();
49 if (extended != nullptr)
50 manipulateProfile (extended, profile);
/** the request entry is dropped from the map at this point */
52 profile_map_.remove (req_id);
/**
 * @brief Rebuild profile->layers using the node/visa tables stored in the
 *        model's extended metadata.
 * @param[in] extended memory whose data is interpreted as npubin_meta_profile
 * @param[in,out] profile profile whose per-entry counters are read through
 *                profile->layers[i] and whose layers/num_layers are replaced
 *                when at least one node was registered
 * @details Steps visible in this excerpt:
 *          1) a new layer array with node_entry_num + 1 slots is allocated;
 *             the extra last slot is the "Unclassified" layer,
 *          2) the node table is parsed into an id -> layer lookup,
 *          3) the visa table is parsed and the counters of profile->layers[i]
 *             are accumulated into the referenced node layers,
 *          4) profile->layers is swapped for the new array.
 * @note Some lines of the original body (declarations of pos/id/length,
 *       braces, else branches, a few conditions) are not visible in this
 *       excerpt; comments below describe only what the visible lines do.
 */
58 ModelProfiler::manipulateProfile (HWmem * extended, npu_profile *profile)
/** view the extended metadata buffer as a npubin_meta_profile header */
60 npubin_meta_profile *meta_profile =
61 reinterpret_cast <npubin_meta_profile *> (extended->getData ());
/** one slot per node entry plus one trailing slot for "Unclassified" */
62 npu_profile_layer * new_layers =
63 new npu_profile_layer[meta_profile->node_entry_num + 1];
/** the trailing slot, at index node_entry_num */
65 npu_profile_layer * unclassified =
66 &new_layers[meta_profile->node_entry_num];
/** name is written with a NPU_OPNAME_MAX - 1 limit and then terminated explicitly at the last index */
68 snprintf (unclassified->name, NPU_OPNAME_MAX - 1, "%s", "Unclassified");
69 unclassified->name[NPU_OPNAME_MAX - 1] = '\x00';
/** the unclassified layer gets node id -1 */
70 unclassified->node_id = -1;
/** counters start from zero and are accumulated while parsing the visa table */
72 unclassified->running_cycles = 0;
73 unclassified->start_cycles = 0;
74 unclassified->end_cycles = 0;
75 unclassified->dram_read_bytes = 0;
76 unclassified->dram_write_bytes = 0;
77 unclassified->sram_read_bytes = 0;
78 unclassified->sram_write_bytes = 0;
80 /** 1) parsing node table */
/** lookup from node id to its slot in new_layers */
81 std::unordered_map<uint32_t, npu_profile_layer *> node_table;
84 node_table.reserve (meta_profile->node_entry_num);
86 for (uint32_t i = 0; i < meta_profile->node_entry_num; i++) {
/** each entry starts with two uint32_t fields (id, length), read with memcpy from entry_data at offset pos */
89 memcpy (&id, meta_profile->entry_data + pos, sizeof (uint32_t));
90 pos += sizeof (uint32_t);
92 memcpy (&length, meta_profile->entry_data + pos, sizeof (uint32_t));
93 pos += sizeof (uint32_t);
/** diagnostic for a zero length; the condition and the action taken afterwards are in lines not shown here */
96 std::cerr << "Zero length detected at ";
97 std::cerr << id << "th node" << std::endl;
/**
 * The name is read as a C string starting at entry_data + pos.
 * NOTE(review): this visible line does not use `length`, so it relies on a
 * NUL terminator being present inside the entry - confirm against the
 * metadata format.
 */
103 std::string name (meta_profile->entry_data + pos);
/** the i-th node entry uses the i-th slot of new_layers */
106 npu_profile_layer * layer = &new_layers[i];
/** same bounded copy + explicit termination as for the unclassified name */
108 snprintf (layer->name, NPU_OPNAME_MAX - 1, "%s", name.c_str ());
109 layer->name[NPU_OPNAME_MAX - 1] = '\x00';
112 layer->running_cycles = 0;
113 layer->start_cycles = 0;
114 layer->end_cycles = 0;
115 layer->dram_read_bytes = 0;
116 layer->dram_write_bytes = 0;
117 layer->sram_read_bytes = 0;
118 layer->sram_write_bytes = 0;
/**
 * NOTE(review): unordered_map::insert keeps the existing element when the
 * key is already present, so a duplicated id would leave node_table.size ()
 * smaller than node_entry_num (see step 3).
 */
120 node_table.insert(std::make_pair(id, layer));
123 /** 2) parsing visa table */
/** the visa table is read starting at offset node_table_size of entry_data */
124 pos = meta_profile->node_table_size;
125 for (uint32_t i = 0; i < meta_profile->visa_entry_num; i++) {
126 uint32_t id, node_num;
/** each visa entry starts with two uint32_t fields (id, node_num), followed by node_num node ids */
128 memcpy (&id, meta_profile->entry_data + pos, sizeof (uint32_t));
129 pos += sizeof (uint32_t);
131 memcpy (&node_num, meta_profile->entry_data + pos, sizeof (uint32_t));
132 pos += sizeof (uint32_t);
/**
 * NOTE(review): unlike the memcpy reads above, the id list is accessed
 * through a casted pointer into entry_data; this assumes entry_data + pos is
 * suitably aligned for uint32_t - confirm.
 */
135 uint32_t * node_ids = (uint32_t *) (meta_profile->entry_data + pos);
137 for (uint32_t j = 0; j < node_num; j++) {
138 uint32_t node_id = node_ids[j];
139 auto it = node_table.find (node_id);
141 if (it != node_table.end ()) {
142 npu_profile_layer * layer = it->second;
144 /** TODO: evenly divided to fused layers */
/**
 * Counters of profile->layers[i] are split by integer division over the
 * node_num nodes of this visa entry (the remainder is dropped).
 * NOTE(review): i is bounded by visa_entry_num only; no check against the
 * size of profile->layers is visible here - confirm they always match.
 */
145 layer->running_cycles += profile->layers[i].running_cycles / node_num;
/** start_cycles is taken only while the layer's value is still 0 */
146 if (layer->start_cycles == 0)
147 layer->start_cycles = profile->layers[i].start_cycles;
/** end_cycles keeps the largest value seen so far */
148 if (layer->end_cycles < profile->layers[i].end_cycles)
149 layer->end_cycles = profile->layers[i].end_cycles;
150 layer->dram_read_bytes += profile->layers[i].dram_read_bytes / node_num;
151 layer->dram_write_bytes += profile->layers[i].dram_write_bytes / node_num;
152 layer->sram_read_bytes += profile->layers[i].sram_read_bytes / node_num;
153 layer->sram_write_bytes += profile->layers[i].sram_write_bytes / node_num;
154 layer->visa_exec_seq = -1;
/** diagnostic for a node id missing from node_table (presumably the else branch of the lookup above) */
156 std::cerr << "Unable to find the node ID " << node_id << std::endl;
/**
 * The whole (undivided) counters of profile->layers[i] are added to the
 * unclassified layer. The condition guarding this block is in lines not
 * shown here - presumably visa entries without nodes; TODO confirm.
 */
160 unclassified->running_cycles += profile->layers[i].running_cycles;
161 unclassified->dram_read_bytes += profile->layers[i].dram_read_bytes;
162 unclassified->dram_write_bytes += profile->layers[i].dram_write_bytes;
163 unclassified->sram_read_bytes += profile->layers[i].sram_read_bytes;
164 unclassified->sram_write_bytes += profile->layers[i].sram_write_bytes;
165 unclassified->visa_exec_seq = -1;
/** skip the node id list to reach the next visa entry */
168 pos += sizeof (uint32_t) * node_num;
171 /** 3) profile data mapping */
/**
 * NOTE(review): the reported layer count is node_table.size () + 1, while
 * the unclassified layer sits at index node_entry_num. The two agree only
 * if every node entry was inserted with a distinct id - confirm.
 */
172 size_t num_layers = node_table.size ();
173 if (num_layers > 0) {
/** the previous layer array is released with delete [] and replaced by new_layers */
174 delete [] profile->layers;
176 profile->layers = new_layers;
177 profile->num_layers = num_layers + 1;
/** presumably the else branch: new_layers is released when no node was registered and profile is left as it was */
179 delete [] new_layers;