[Profile] Add start/end cycles for profiling
author Dongju Chae <dongju.chae@samsung.com>
Tue, 6 Apr 2021 05:00:03 +0000 (14:00 +0900)
committer 채동주/On-Device Lab(SR)/Staff Engineer/삼성전자 <dongju.chae@samsung.com>
Wed, 7 Apr 2021 01:55:42 +0000 (10:55 +0900)
This patch adds start/end cycles for profiling.

Signed-off-by: Dongju Chae <dongju.chae@samsung.com>
include/common/typedef.h
src/core/ne-profiler.cc
src/core/npu/NPUdrvAPI_emul.cc
tests/apptests/tvn_triv2_profile.cc

index 7c3f037..24bfc43 100644 (file)
@@ -342,6 +342,9 @@ typedef struct {
 
       int64_t visa_exec_seq;     /**< vISA global execution sequence */
 
+      int64_t start_cycles;
+      int64_t end_cycles;
+
       /** TODO: Add more info */
     } __attribute__((packed, aligned));
     char reserved[NPU_PROFILE_SIZE];
index 3b6b1c0..98d5f15 100644 (file)
@@ -70,6 +70,8 @@ ModelProfiler::manipulateProfile (HWmem * extended, npu_profile *profile)
   unclassified->node_id = -1;
 
   unclassified->running_cycles = 0;
+  unclassified->start_cycles = 0;
+  unclassified->end_cycles = 0;
   unclassified->dram_read_bytes = 0;
   unclassified->dram_write_bytes = 0;
   unclassified->sram_read_bytes = 0;
@@ -108,6 +110,8 @@ ModelProfiler::manipulateProfile (HWmem * extended, npu_profile *profile)
     layer->node_id = id;
 
     layer->running_cycles = 0;
+    layer->start_cycles = 0;
+    layer->end_cycles = 0;
     layer->dram_read_bytes = 0;
     layer->dram_write_bytes = 0;
     layer->sram_read_bytes = 0;
@@ -139,6 +143,10 @@ ModelProfiler::manipulateProfile (HWmem * extended, npu_profile *profile)
 
           /** TODO: evenly divided to fused layers */
           layer->running_cycles += profile->layers[i].running_cycles / node_num;
+          if (layer->start_cycles == 0)
+            layer->start_cycles = profile->layers[i].start_cycles;
+          if (layer->end_cycles < profile->layers[i].end_cycles)
+            layer->end_cycles = profile->layers[i].end_cycles;
           layer->dram_read_bytes += profile->layers[i].dram_read_bytes / node_num;
           layer->dram_write_bytes += profile->layers[i].dram_write_bytes / node_num;
           layer->sram_read_bytes += profile->layers[i].sram_read_bytes / node_num;
index 652e8ce..1bc6567 100644 (file)
@@ -113,18 +113,20 @@ class EmulTask {
 
           memset (layer, '\x00', sizeof (npu_profile_layer));
 
+          layer->running_cycles = common.cycle_end - common.cycle_start;
+          layer->start_cycles = common.cycle_start;
+          layer->end_cycles = common.cycle_end;
+
           switch (common.block_id) {
             case TRIV2PROF_BLOCKID_NNA:
               ifs.read ((char *) &nna, sizeof (T2PF_DUMP_NNA));
 
               snprintf (layer->name, NPU_OPNAME_MAX, "%s", nna.op_name);
-              layer->running_cycles = common.cycle_end - common.cycle_start;
               break;
             case TRIV2PROF_BLOCKID_NNA_DMA_IN:
               ifs.read ((char *) &nna_dma, sizeof (T2PF_DUMP_DMA));
 
               snprintf (layer->name, NPU_OPNAME_MAX, "%s", "ADMA_IN");
-              layer->running_cycles = common.cycle_end - common.cycle_start;
               layer->dram_read_bytes = nna_dma.src_addr_end - nna_dma.src_addr_start;
               layer->sram_write_bytes = nna_dma.dest_addr_end - nna_dma.dest_addr_start;
               break;
@@ -132,7 +134,6 @@ class EmulTask {
               ifs.read ((char *) &nna_dma, sizeof (T2PF_DUMP_DMA));
 
               snprintf (layer->name, NPU_OPNAME_MAX, "%s", "ADMA_OUT");
-              layer->running_cycles = common.cycle_end - common.cycle_start;
               layer->dram_write_bytes = nna_dma.dest_addr_end - nna_dma.dest_addr_start;
               layer->sram_read_bytes = nna_dma.src_addr_end - nna_dma.src_addr_start;
               break;
@@ -140,13 +141,11 @@ class EmulTask {
               ifs.read ((char *) &dsp, sizeof (T2PF_DUMP_DSP));
 
               snprintf (layer->name, NPU_OPNAME_MAX, "%s", dsp.op_name);
-              layer->running_cycles = common.cycle_end - common.cycle_start;
               break;
             case TRIV2PROF_BLOCKID_DSP_DMA_IN:
               ifs.read ((char *) &dsp_dma, sizeof (T2PF_DUMP_DMA));
 
               snprintf (layer->name, NPU_OPNAME_MAX, "%s", "PDMA_IN");
-              layer->running_cycles = common.cycle_end - common.cycle_start;
               layer->dram_read_bytes = dsp_dma.src_addr_end - dsp_dma.src_addr_start;
               layer->sram_write_bytes = dsp_dma.dest_addr_end - dsp_dma.dest_addr_start;
               break;
@@ -154,7 +153,6 @@ class EmulTask {
               ifs.read ((char *) &dsp_dma, sizeof (T2PF_DUMP_DMA));
 
               snprintf (layer->name, NPU_OPNAME_MAX, "%s", "PDMA_OUT");
-              layer->running_cycles = common.cycle_end - common.cycle_start;
               layer->dram_write_bytes = dsp_dma.dest_addr_end - dsp_dma.dest_addr_start;
               layer->sram_read_bytes = dsp_dma.src_addr_end - dsp_dma.src_addr_start;
               break;
index 609f6c3..175d241 100644 (file)
@@ -68,6 +68,10 @@ class Tester : public UtilTRIV2
             cerr << "[" << i << "] " << profile.layers[i].name << "\n";
             if (profile.layers[i].running_cycles > 0)
               cerr << "\tRunning Cycles  : " << profile.layers[i].running_cycles << "\n";
+            if (profile.layers[i].start_cycles > 0)
+              cerr << "\tStart Cycles  : " << profile.layers[i].start_cycles << "\n";
+            if (profile.layers[i].end_cycles > 0)
+              cerr << "\tEnd Cycles  : " << profile.layers[i].end_cycles << "\n";
             if (profile.layers[i].dram_read_bytes > 0)
               cerr << "\tDRAM Read  (KB) : " << (profile.layers[i].dram_read_bytes >> 10) << "\n";
             if (profile.layers[i].dram_write_bytes > 0)