From: Dongju Chae Date: Tue, 6 Apr 2021 05:00:03 +0000 (+0900) Subject: [Profile] Add start/end cycles for profiling X-Git-Tag: accepted/tizen/unified/20220103.130045~193 X-Git-Url: http://review.tizen.org/git/?a=commitdiff_plain;h=7ca7791d1e9d306aa2928886d7a3ef950cfcaf57;p=platform%2Fadaptation%2Fnpu%2Ftrix-engine.git [Profile] Add start/end cycles for profiling This patch adds start/end cycles for profiling. Signed-off-by: Dongju Chae --- diff --git a/include/common/typedef.h b/include/common/typedef.h index 7c3f037..24bfc43 100644 --- a/include/common/typedef.h +++ b/include/common/typedef.h @@ -342,6 +342,9 @@ typedef struct { int64_t visa_exec_seq; /**< vISA global execution sequence */ + int64_t start_cycles; + int64_t end_cycles; + /** TODO: Add more info */ } __attribute__((packed, aligned)); char reserved[NPU_PROFILE_SIZE]; diff --git a/src/core/ne-profiler.cc b/src/core/ne-profiler.cc index 3b6b1c0..98d5f15 100644 --- a/src/core/ne-profiler.cc +++ b/src/core/ne-profiler.cc @@ -70,6 +70,8 @@ ModelProfiler::manipulateProfile (HWmem * extended, npu_profile *profile) unclassified->node_id = -1; unclassified->running_cycles = 0; + unclassified->start_cycles = 0; + unclassified->end_cycles = 0; unclassified->dram_read_bytes = 0; unclassified->dram_write_bytes = 0; unclassified->sram_read_bytes = 0; @@ -108,6 +110,8 @@ ModelProfiler::manipulateProfile (HWmem * extended, npu_profile *profile) layer->node_id = id; layer->running_cycles = 0; + layer->start_cycles = 0; + layer->end_cycles = 0; layer->dram_read_bytes = 0; layer->dram_write_bytes = 0; layer->sram_read_bytes = 0; @@ -139,6 +143,10 @@ ModelProfiler::manipulateProfile (HWmem * extended, npu_profile *profile) /** TODO: evenly divided to fused layers */ layer->running_cycles += profile->layers[i].running_cycles / node_num; + if (layer->start_cycles == 0) + layer->start_cycles = profile->layers[i].start_cycles; + if (layer->end_cycles < profile->layers[i].end_cycles) + layer->end_cycles = profile->layers[i].end_cycles; layer->dram_read_bytes += profile->layers[i].dram_read_bytes / node_num; layer->dram_write_bytes += profile->layers[i].dram_write_bytes / node_num; layer->sram_read_bytes += profile->layers[i].sram_read_bytes / node_num; diff --git a/src/core/npu/NPUdrvAPI_emul.cc b/src/core/npu/NPUdrvAPI_emul.cc index 652e8ce..1bc6567 100644 --- a/src/core/npu/NPUdrvAPI_emul.cc +++ b/src/core/npu/NPUdrvAPI_emul.cc @@ -113,18 +113,20 @@ class EmulTask { memset (layer, '\x00', sizeof (npu_profile_layer)); + layer->running_cycles = common.cycle_end - common.cycle_start; + layer->start_cycles = common.cycle_start; + layer->end_cycles = common.cycle_end; + switch (common.block_id) { case TRIV2PROF_BLOCKID_NNA: ifs.read ((char *) &nna, sizeof (T2PF_DUMP_NNA)); snprintf (layer->name, NPU_OPNAME_MAX, "%s", nna.op_name); - layer->running_cycles = common.cycle_end - common.cycle_start; break; case TRIV2PROF_BLOCKID_NNA_DMA_IN: ifs.read ((char *) &nna_dma, sizeof (T2PF_DUMP_DMA)); snprintf (layer->name, NPU_OPNAME_MAX, "%s", "ADMA_IN"); - layer->running_cycles = common.cycle_end - common.cycle_start; layer->dram_read_bytes = nna_dma.src_addr_end - nna_dma.src_addr_start; layer->sram_write_bytes = nna_dma.dest_addr_end - nna_dma.dest_addr_start; break; @@ -132,7 +134,6 @@ class EmulTask { ifs.read ((char *) &nna_dma, sizeof (T2PF_DUMP_DMA)); snprintf (layer->name, NPU_OPNAME_MAX, "%s", "ADMA_OUT"); - layer->running_cycles = common.cycle_end - common.cycle_start; layer->dram_write_bytes = nna_dma.dest_addr_end - nna_dma.dest_addr_start; layer->sram_read_bytes = nna_dma.src_addr_end - nna_dma.src_addr_start; break; @@ -140,13 +141,11 @@ class EmulTask { ifs.read ((char *) &dsp, sizeof (T2PF_DUMP_DSP)); snprintf (layer->name, NPU_OPNAME_MAX, "%s", dsp.op_name); - layer->running_cycles = common.cycle_end - common.cycle_start; break; case TRIV2PROF_BLOCKID_DSP_DMA_IN: ifs.read ((char *) &dsp_dma, sizeof (T2PF_DUMP_DMA)); snprintf (layer->name, NPU_OPNAME_MAX, "%s", "PDMA_IN"); - layer->running_cycles = common.cycle_end - common.cycle_start; layer->dram_read_bytes = dsp_dma.src_addr_end - dsp_dma.src_addr_start; layer->sram_write_bytes = dsp_dma.dest_addr_end - dsp_dma.dest_addr_start; break; @@ -154,7 +153,6 @@ class EmulTask { ifs.read ((char *) &dsp_dma, sizeof (T2PF_DUMP_DMA)); snprintf (layer->name, NPU_OPNAME_MAX, "%s", "PDMA_OUT"); - layer->running_cycles = common.cycle_end - common.cycle_start; layer->dram_write_bytes = dsp_dma.dest_addr_end - dsp_dma.dest_addr_start; layer->sram_read_bytes = dsp_dma.src_addr_end - dsp_dma.src_addr_start; break; diff --git a/tests/apptests/tvn_triv2_profile.cc b/tests/apptests/tvn_triv2_profile.cc index 609f6c3..175d241 100644 --- a/tests/apptests/tvn_triv2_profile.cc +++ b/tests/apptests/tvn_triv2_profile.cc @@ -68,6 +68,10 @@ class Tester : public UtilTRIV2 cerr << "[" << i << "] " << profile.layers[i].name << "\n"; if (profile.layers[i].running_cycles > 0) cerr << "\tRunning Cycles : " << profile.layers[i].running_cycles << "\n"; + if (profile.layers[i].start_cycles > 0) + cerr << "\tStart Cycles : " << profile.layers[i].start_cycles << "\n"; + if (profile.layers[i].end_cycles > 0) + cerr << "\tEnd Cycles : " << profile.layers[i].end_cycles << "\n"; if (profile.layers[i].dram_read_bytes > 0) cerr << "\tDRAM Read (KB) : " << (profile.layers[i].dram_read_bytes >> 10) << "\n"; if (profile.layers[i].dram_write_bytes > 0)