[Profile] Revise profiling interface for TRIV2 kernel driver
author Dongju Chae <dongju.chae@samsung.com>
Wed, 9 Dec 2020 07:14:49 +0000 (16:14 +0900)
committer 송욱/On-Device Lab(SR)/Staff Engineer/삼성전자 <wook16.song@samsung.com>
Mon, 14 Dec 2020 10:36:17 +0000 (19:36 +0900)
This patch revises the profiling interface for the TRIV2 kernel driver.
It tries to keep the interface the same as the simulator's.

Signed-off-by: Dongju Chae <dongju.chae@samsung.com>
include/common/typedef.h
src/core/npu/NPUdrvAPI_emul.cc
src/core/npu/NPUdrvAPI_triv2.cc
tests/apptests/tvn_triv2_profile.cc

index 9e2f290..d75dba4 100644 (file)
@@ -324,28 +324,40 @@ typedef struct {
   npu_stat_task *stat;
 } npu_stat_tasks;
 
-/** NPU Profiling (both for emulated/real-device envionment) */
+/**
+ * NPU Profiling (both for emulated/real-device environment)
+ *
+ * Note that a negative value means the profiling info is not supported.
+ */
 
-#define NPU_OPNAME_MAX (32)
+#define NPU_PROFILE_SIZE (128)
+#define NPU_OPNAME_MAX    (32)
 
 typedef struct {
-  char name[NPU_OPNAME_MAX];
+  union {
+    struct {
+      char name[NPU_OPNAME_MAX];
 
-  uint64_t running_cycles;
+      int64_t running_cycles;
 
-  uint64_t dram_read_bytes;
-  uint64_t dram_write_bytes;
+      int64_t dram_read_bytes;
+      int64_t dram_write_bytes;
 
-  uint64_t sram_read_bytes;
-  uint64_t sram_write_bytes;
+      int64_t sram_read_bytes;
+      int64_t sram_write_bytes;
+
+      /** TODO: Add more info */
+    };
+    char reserved[NPU_PROFILE_SIZE];
+  };
 } npu_profile_layer;
 
 typedef struct {
   uint32_t num_layers;
 
-  uint64_t total_running_cycles;
-  uint64_t dram_input_footprint;
-  uint64_t dram_output_footprint;
+  int64_t total_system_cycles;
+  int64_t dram_input_footprint;
+  int64_t dram_output_footprint;
 
   npu_profile_layer *layers;
 } npu_profile;
index b2f8da9..142025b 100644 (file)
@@ -74,7 +74,7 @@ class EmulTask {
       if (total_dump > 0) {
         profile->layers = new npu_profile_layer [total_dump];
         profile->num_layers = total_dump;
-        profile->total_running_cycles = head.total_cycles;
+        profile->total_system_cycles = head.total_cycles;
         profile->dram_input_footprint = head.nna_dma_in.access_footprint_byte;
         profile->dram_output_footprint = head.nna_dma_out.access_footprint_byte;
 
index 7b023db..19d357b 100644 (file)
@@ -12,7 +12,7 @@
 #include <npubinfmt.h>
 
 constexpr int max_num_devs = ((1<<CHAR_BIT) - 1);
-constexpr size_t max_buf_size = (256 * PAGE_SIZE);
+constexpr size_t default_buf_size = (256 * PAGE_SIZE);
 
 const std::string TrinityVision2API::dev_node_base = "triv2";
 std::bitset<CHAR_BIT> TrinityVision2API::dev_bitset = 0;
@@ -430,48 +430,42 @@ int
 TrinityVision2API::getProfile (int task_id, npu_profile *profile) const
 {
   struct trinity_profile t_profile;
-  size_t size = max_buf_size;
-  void * buf;
-  int ret;
+  int ret = 0;
 
-  buf = malloc(max_buf_size);
-  if (!buf)
-    return -ENOMEM;
+  if (profile == nullptr)
+    return -EINVAL;
 
-  t_profile.task_id = task_id;
-  t_profile.buf = buf;
-  t_profile.buf_size = size;
-  t_profile.next_size = 0;
+  t_profile.taskid = task_id;
 
-  ret = ioctl (this->getDeviceFD (), TRINITY_IOCTL_GET_PROFILE, &t_profile);
+  ret = ioctl (this->getDeviceFD (), TRINITY_IOCTL_GET_PROFILE_META, &t_profile);
   if (ret != 0)
-      goto ioctl_fail;
+    return -errno;
 
-  if (t_profile.next_size != 0) {
-    buf = realloc (t_profile.buf, t_profile.buf_size + t_profile.next_size);
-    if (!buf) {
-      free (t_profile.buf);
-      return -ENOMEM;
-    }
+  /** no profiling buffer provided but it's okay */
+  if (t_profile.total_ops == 0)
+    return 0;
 
-    t_profile.buf = (char *) buf + t_profile.buf_size;
-    t_profile.buf_size = t_profile.next_size;
-    t_profile.next_size = 0;
+  profile->total_system_cycles = t_profile.total_cycles;
+  profile->dram_input_footprint = -1;
+  profile->dram_output_footprint = -1;
 
-    size += t_profile.next_size;
+  profile->num_layers = t_profile.total_ops;
+  profile->layers = new npu_profile_layer[profile->num_layers];
 
-    ret = ioctl (this->getDeviceFD (), TRINITY_IOCTL_GET_PROFILE, &t_profile);
-    if (ret != 0)
-      goto ioctl_fail;
+  t_profile.profile_buf = profile->layers;
+  ret = ioctl (this->getDeviceFD (), TRINITY_IOCTL_GET_PROFILE_BUFF, &t_profile);
+  if (ret != 0) {
+    delete [] (profile->layers);
+
+    profile->layers = nullptr;
+    profile->num_layers = 0;
+
+    return -errno;
   }
 
-  /** TODO: manipulate 'profile' from the obtained 't_profile' buffer */
-  free (buf);
-  return 0;
+  /** TODO: manipulate the profiling info later (i.e., per-visa to per-layer) */
 
-ioctl_fail:
-  free (buf);
-  return -errno;
+  return 0;
 }
 
 int
index 88c5c92..f7a2367 100644 (file)
@@ -56,19 +56,26 @@ class Tester : public UtilTRIV2
       npu_profile profile;
       int status = UtilTRIV2::getProfile (task_id, &profile);
       if (status == 0) {
-        if (profile.layers != nullptr) {
-          cerr << "Total System Cycles : " << profile.total_running_cycles << "\n";
+        cerr << "Total System Cycles : " << profile.total_system_cycles << "\n";
+        if (profile.dram_input_footprint > 0)
           cerr << "Dram Input Footprint (KB) : " << (profile.dram_input_footprint >> 10) << "\n";
+        if (profile.dram_output_footprint > 0)
           cerr << "Dram Output Footprint (KB) : " << (profile.dram_output_footprint >> 10) << "\n";
-          cerr << "\n";
 
+        if (profile.layers != nullptr) {
+          cerr << "\n";
           for (uint32_t i = 0; i < profile.num_layers; i++) {
             cerr << "[" << i << "] " << profile.layers[i].name << "\n";
-            cerr << "\tRunning Cycles  : " << profile.layers[i].running_cycles << "\n";
-            cerr << "\tDram Read  (KB) : " << (profile.layers[i].dram_read_bytes >> 10) << "\n";
-            cerr << "\tDram Write (KB) : " << (profile.layers[i].dram_write_bytes >> 10) << "\n";
-            cerr << "\tSRAM Read  (KB) : " << (profile.layers[i].sram_read_bytes >> 10) << "\n";
-            cerr << "\tSRAM Write (KB) : " << (profile.layers[i].sram_write_bytes >> 10) << "\n";
+            if (profile.layers[i].running_cycles > 0)
+              cerr << "\tRunning Cycles  : " << profile.layers[i].running_cycles << "\n";
+            if (profile.layers[i].dram_read_bytes > 0)
+              cerr << "\tDram Read  (KB) : " << (profile.layers[i].dram_read_bytes >> 10) << "\n";
+            if (profile.layers[i].dram_write_bytes > 0)
+              cerr << "\tDram Write (KB) : " << (profile.layers[i].dram_write_bytes >> 10) << "\n";
+            if (profile.layers[i].sram_read_bytes > 0)
+              cerr << "\tSRAM Read  (KB) : " << (profile.layers[i].sram_read_bytes >> 10) << "\n";
+            if (profile.layers[i].sram_write_bytes > 0)
+              cerr << "\tSRAM Write (KB) : " << (profile.layers[i].sram_write_bytes >> 10) << "\n";
           }
         }