3 * Copyright (C) 2021 Samsung Electronics
4 * Copyright (C) 2021 Dongju Chae <dongju.chae@samsung.com>
9 * @brief Model profiler for NPU Engine (NE) users.
10 * @author Dongju Chae <dongju.chae@samsung.com>
11 * @bug No known bugs except for NYI items
14 #include "ne-profiler.h"
/**
 * @brief Construct a model profiler bound to a driver API.
 * @param[in] api driver API handle. getProfile () later calls
 *                api_->getProfile (), so this is presumably stored in api_
 *                (the initializer is not visible here - confirm).
 */
16 ModelProfiler::ModelProfiler (const DriverAPI * api)
/**
 * @brief Destructor; empties profile_map_.
 */
21 ModelProfiler::~ModelProfiler ()
/* NOTE(review): whether clear () also releases the ProfileData objects that
 * appendRequest () allocates depends on the map type, which is not visible
 * here - confirm. */
23 profile_map_.clear ();
/**
 * @brief Register a request so that its profile can be fetched later.
 * @param[in] req_id request id, used as the key in profile_map_
 * @param[in] model model associated with the request
 * @return whatever profile_map_.insert () returns for the new entry
 */
27 ModelProfiler::appendRequest (int req_id, const Model * model)
/* a fresh ProfileData is heap-allocated for every request */
29 ProfileData * data = new ProfileData (req_id, model);
/* NOTE(review): if insert () can fail (e.g. duplicated req_id), it is unclear
 * who frees 'data' - verify against the map implementation. */
30 return profile_map_.insert (req_id, data);
/**
 * @brief Fetch the profile of a previously appended request.
 * @param[in] req_id request id passed to appendRequest ()
 * @param[out] profile profile filled by the driver API and, when the model has
 *                     extended metadata, rewritten by manipulateProfile ()
 * @note The checks performed between the steps below (and the return values)
 *       are not visible in this view.
 */
34 ModelProfiler::getProfile (int req_id, npu_profile *profile)
/* look up the entry registered by appendRequest () */
36 ProfileData * data = profile_map_.find (req_id);
40 const Model * model = data->getModel ();
/* raw profile comes from the driver API */
44 int status = api_->getProfile (req_id, profile);
/* extended metadata is optional; the profile is post-processed only when the
 * model provides it */
48 HWmem * extended = model->getExtendedMetadata ();
49 if (extended != nullptr)
50 manipulateProfile (extended, profile);
/* the entry is dropped once its profile has been handled */
52 profile_map_.remove (req_id);
/**
 * @brief Rebuild the profile layers using the model's extended metadata.
 * @param[in] extended memory whose data is interpreted as npubin_meta_profile
 * @param[in,out] profile profile whose layers are read as input and, when the
 *                        node table is not empty, replaced by a new array
 * @details Three steps: 1) create one zeroed layer per node-table entry plus a
 *          trailing "Unclassified" layer, 2) walk the visa table and fold each
 *          profile->layers[i] into the layers of the node IDs it lists,
 *          3) swap the new array into the profile.
 * @note Several statements of this function (declarations of pos/id/length,
 *       some branch conditions and early exits) are not visible in this view.
 */
57 ModelProfiler::manipulateProfile (HWmem * extended, npu_profile *profile)
59 npubin_meta_profile *meta_profile =
60 reinterpret_cast <npubin_meta_profile *> (extended->getData ());
/* one slot per node entry, plus one extra slot at the end */
61 npu_profile_layer * new_layers =
62 new npu_profile_layer[meta_profile->node_entry_num + 1];
/* the extra (last) slot collects data that is not attributed to any node */
64 npu_profile_layer * unclassified =
65 &new_layers[meta_profile->node_entry_num];
/* size is limited to NPU_OPNAME_MAX - 1 and the last byte is terminated
 * explicitly afterwards */
67 snprintf (unclassified->name, NPU_OPNAME_MAX - 1, "%s", "Unclassified");
68 unclassified->name[NPU_OPNAME_MAX - 1] = '\x00';
69 unclassified->node_id = -1;
/* counters start from zero because they are accumulated below */
71 unclassified->running_cycles = 0;
72 unclassified->start_cycles = 0;
73 unclassified->end_cycles = 0;
74 unclassified->dram_read_bytes = 0;
75 unclassified->dram_write_bytes = 0;
76 unclassified->sram_read_bytes = 0;
77 unclassified->sram_write_bytes = 0;
79 /** 1) parsing node table */
/* maps a node ID to its slot in new_layers */
80 std::unordered_map<uint32_t, npu_profile_layer *> node_table;
83 node_table.reserve (meta_profile->node_entry_num);
85 for (uint32_t i = 0; i < meta_profile->node_entry_num; i++) {
/* each entry starts with two uint32_t fields (id, length), copied out with
 * memcpy from the byte offset 'pos' */
88 memcpy (&id, meta_profile->entry_data + pos, sizeof (uint32_t));
89 pos += sizeof (uint32_t);
91 memcpy (&length, meta_profile->entry_data + pos, sizeof (uint32_t));
92 pos += sizeof (uint32_t);
/* NOTE(review): the condition and the exit path of this error report are not
 * visible here - make sure new_layers is released on that path. */
95 std::cerr << "Zero length detected at ";
96 std::cerr << id << "th node" << std::endl;
/* NOTE(review): the name is read up to the first NUL byte; nothing visible
 * here bounds the read by 'length' - confirm the data is NUL-terminated. */
102 std::string name (meta_profile->entry_data + pos);
/* the i-th node entry owns the i-th slot */
105 npu_profile_layer * layer = &new_layers[i];
107 snprintf (layer->name, NPU_OPNAME_MAX - 1, "%s", name.c_str ());
108 layer->name[NPU_OPNAME_MAX - 1] = '\x00';
111 layer->running_cycles = 0;
112 layer->start_cycles = 0;
113 layer->end_cycles = 0;
114 layer->dram_read_bytes = 0;
115 layer->dram_write_bytes = 0;
116 layer->sram_read_bytes = 0;
117 layer->sram_write_bytes = 0;
/* std::unordered_map::insert keeps the first entry when an ID is repeated */
119 node_table.insert(std::make_pair(id, layer));
122 /** 2) parsing visa table */
/* the visa table starts at byte offset node_table_size of entry_data */
123 pos = meta_profile->node_table_size;
/* NOTE(review): profile->layers[i] is indexed by the visa entry index below;
 * nothing visible here checks visa_entry_num against the number of layers in
 * 'profile' - confirm. */
124 for (uint32_t i = 0; i < meta_profile->visa_entry_num; i++) {
125 uint32_t id, node_num;
127 memcpy (&id, meta_profile->entry_data + pos, sizeof (uint32_t));
128 pos += sizeof (uint32_t);
130 memcpy (&node_num, meta_profile->entry_data + pos, sizeof (uint32_t));
131 pos += sizeof (uint32_t);
/* NOTE(review): unlike the memcpy reads above, the node IDs are read through
 * a casted pointer; nothing visible here guarantees that entry_data + pos is
 * suitably aligned for uint32_t - confirm. */
134 uint32_t * node_ids = (uint32_t *) (meta_profile->entry_data + pos);
136 for (uint32_t j = 0; j < node_num; j++) {
137 uint32_t node_id = node_ids[j];
138 auto it = node_table.find (node_id);
140 if (it != node_table.end ()) {
141 npu_profile_layer * layer = it->second;
143 /** TODO: evenly divided to fused layers */
/* cycle and byte counters of visa entry i are split by integer division over
 * its node_num nodes (any remainder is dropped) */
144 layer->running_cycles += profile->layers[i].running_cycles / node_num;
/* start_cycles is taken only while still 0; end_cycles keeps the largest
 * value seen */
145 if (layer->start_cycles == 0)
146 layer->start_cycles = profile->layers[i].start_cycles;
147 if (layer->end_cycles < profile->layers[i].end_cycles)
148 layer->end_cycles = profile->layers[i].end_cycles;
149 layer->dram_read_bytes += profile->layers[i].dram_read_bytes / node_num;
150 layer->dram_write_bytes += profile->layers[i].dram_write_bytes / node_num;
151 layer->sram_read_bytes += profile->layers[i].sram_read_bytes / node_num;
152 layer->sram_write_bytes += profile->layers[i].sram_write_bytes / node_num;
153 layer->visa_exec_seq = -1;
155 std::cerr << "Unable to find the node ID " << node_id << std::endl;
/* whole (undivided) counters of visa entry i go to the "Unclassified" layer;
 * presumably this is the branch for entries without any node - the branch
 * condition is not visible here, confirm. */
159 unclassified->running_cycles += profile->layers[i].running_cycles;
160 unclassified->dram_read_bytes += profile->layers[i].dram_read_bytes;
161 unclassified->dram_write_bytes += profile->layers[i].dram_write_bytes;
162 unclassified->sram_read_bytes += profile->layers[i].sram_read_bytes;
163 unclassified->sram_write_bytes += profile->layers[i].sram_write_bytes;
164 unclassified->visa_exec_seq = -1;
/* skip the node ID list of this visa entry */
167 pos += sizeof (uint32_t) * node_num;
170 /** 3) profile data mapping */
/* NOTE(review): node_table.size () can be smaller than node_entry_num when
 * node IDs repeat, while "Unclassified" always sits at index node_entry_num;
 * in that case num_layers + 1 would not reach it - confirm IDs are unique. */
171 size_t num_layers = node_table.size ();
172 if (num_layers > 0) {
/* the previous array is released with delete [] and replaced */
173 delete [] profile->layers;
175 profile->layers = new_layers;
/* + 1 accounts for the trailing "Unclassified" layer */
176 profile->num_layers = num_layers + 1;
/* presumably the else-branch: nothing to map, so the new array is dropped */
178 delete [] new_layers;