core: npu: Apply triv2.4 profile format
authorJiho Chu <jiho.chu@samsung.com>
Fri, 1 Jul 2022 08:00:48 +0000 (17:00 +0900)
committer추지호/NPU Lab(SR)/삼성전자 <jiho.chu@samsung.com>
Fri, 22 Jul 2022 10:06:40 +0000 (19:06 +0900)
This applies the triv2.4 profile format.
A T24PF_HEAD structure is added, in which the nna entry is doubled (nna0 and nna1).
CUSE only supports the 2.4 profile.

Signed-off-by: Jiho Chu <jiho.chu@samsung.com>
src/core/npu/NPUdrvAPI_emul.cc
utils/trinity_cuse/trinity-cuse-triv2.cc

index f158d7c..e65a96d 100644 (file)
 #define RESERVED_DSPM_SIZE (64 * 1024) /* 64 KiB */
 #define ENVNAME_DSPM_SIZE ("MRPSIM_SPM_SIZE")
 
-static uint64_t global_exec_seq = 0;
+namespace {
+
+uint64_t global_exec_seq = 0;
+
+/**
+ * @brief Split a packed profile arch version into major and minor numbers
+ * @param[in] version the packed version word
+ * @param[out] major the 16-bit field located at TRIV2PROF_MAJOR_SHIFT
+ * @param[out] minor the lowest 16 bits of the version word
+ */
+inline void getProfileArchVersion (uint32_t version, int *major, int *minor) {
+  const uint32_t field_mask = 0xffff;
+  const uint32_t shifted = version >> TRIV2PROF_MAJOR_SHIFT;
+
+  *major = shifted & field_mask;
+  *minor = version & field_mask;
+}
+
+/**
+ * @brief Parse a profile dump laid out in the TRIV2.3 format
+ * @param[in] ifs binary stream to read from; the header is read from its current position
+ * @param[out] profile profile instance to be filled with the parsed layers
+ * @return 0 if no error, otherwise -EINVAL (short read or unknown block id)
+ * @note if parsing fails after the layer array was allocated, the array is
+ *       released and both profile->layers and profile->num_layers are reset.
+ */
+int
+parseProfile23 (std::ifstream *ifs, npu_profile *profile) {
+  T2PF_HEAD head;
+  ifs->read ((char *) &head, sizeof (T2PF_HEAD));
+  if (ifs->fail ()) {
+    /* a short read would leave the dump counters below uninitialized */
+    logerr (TAG, "Failed to read the profile header");
+    return -EINVAL;
+  }
+
+  uint32_t total_dump = 0;
+
+  /* Neural Network Accelerator (NNA) */
+  total_dump += head.nna.num_of_dump;
+  total_dump += head.nna_dma_in.num_of_dump;
+  total_dump += head.nna_dma_out.num_of_dump;
+
+  /* Digital Signal Processor (DSP) */
+  total_dump += head.dsp.num_of_dump;
+  total_dump += head.dsp_dma_in.num_of_dump;
+  total_dump += head.dsp_dma_out.num_of_dump;
+
+  if (total_dump > 0) {
+    int status = 0;
+
+    profile->layers = new npu_profile_layer[total_dump];
+    profile->num_layers = total_dump;
+    profile->total_system_cycles = head.total_cycles;
+    profile->dram_input_footprint = head.nna_dma_in.access_footprint_byte;
+    profile->dram_output_footprint = head.nna_dma_out.access_footprint_byte;
+    profile->dram_input_footprint += head.dsp_dma_in.access_footprint_byte;
+    profile->dram_output_footprint += head.dsp_dma_out.access_footprint_byte;
+
+    for (uint32_t i = 0; i < total_dump; i++) {
+      npu_profile_layer *layer = &profile->layers[i];
+
+      T2PF_DUMP common;
+      T2PF_DUMP_NNA nna;
+      T2PF_DUMP_DMA nna_dma;
+      T2PF_DUMP_DSP dsp;
+      T2PF_DUMP_DMA dsp_dma;
+
+      /* peek the common part to learn the block id, then rewind so that
+       * the block-specific record is read from its beginning */
+      std::streampos pos = ifs->tellg ();
+      ifs->read ((char *) &common, sizeof (T2PF_DUMP));
+      if (ifs->fail ()) {
+        logerr (TAG, "Profile data is truncated at dump %u", i);
+        status = -EINVAL;
+        break;
+      }
+      ifs->seekg (pos);
+
+      memset (layer, '\x00', sizeof (npu_profile_layer));
+
+      layer->running_cycles = common.cycle_end - common.cycle_start;
+      layer->start_cycles = common.cycle_start;
+      layer->end_cycles = common.cycle_end;
+      layer->visa_prog_seq = i;
+      /* In the first run, program sequence == exec sequence */
+      layer->visa_exec_seq = global_exec_seq++;
+
+      /* each case leaves the switch right after a failed read so that an
+       * uninitialized record is never copied into the layer */
+      switch (common.block_id) {
+        case TRIV2PROF_BLOCKID_NNA:
+          ifs->read ((char *) &nna, sizeof (T2PF_DUMP_NNA));
+          if (ifs->fail ())
+            break;
+
+          layer->visa_opcode = nna.opcode;
+          snprintf (layer->name, NPU_OPNAME_MAX, "%s", nna.op_name);
+          break;
+        case TRIV2PROF_BLOCKID_NNA_DMA_IN:
+          ifs->read ((char *) &nna_dma, sizeof (T2PF_DUMP_DMA));
+          if (ifs->fail ())
+            break;
+
+          layer->visa_opcode = 0x02;
+          snprintf (layer->name, NPU_OPNAME_MAX, "%s", "ADMA_IN");
+          layer->dram_read_bytes = nna_dma.src_addr_end - nna_dma.src_addr_start;
+          layer->sram_write_bytes = nna_dma.dest_addr_end - nna_dma.dest_addr_start;
+          break;
+        case TRIV2PROF_BLOCKID_NNA_DMA_OUT:
+          ifs->read ((char *) &nna_dma, sizeof (T2PF_DUMP_DMA));
+          if (ifs->fail ())
+            break;
+
+          layer->visa_opcode = 0x03;
+          snprintf (layer->name, NPU_OPNAME_MAX, "%s", "ADMA_OUT");
+          layer->dram_write_bytes = nna_dma.dest_addr_end - nna_dma.dest_addr_start;
+          layer->sram_read_bytes = nna_dma.src_addr_end - nna_dma.src_addr_start;
+          break;
+        case TRIV2PROF_BLOCKID_DSP:
+          ifs->read ((char *) &dsp, sizeof (T2PF_DUMP_DSP));
+          if (ifs->fail ())
+            break;
+
+          layer->visa_opcode = dsp.opcode;
+          snprintf (layer->name, NPU_OPNAME_MAX, "%s", dsp.op_name);
+          break;
+        case TRIV2PROF_BLOCKID_DSP_DMA_IN:
+          ifs->read ((char *) &dsp_dma, sizeof (T2PF_DUMP_DMA));
+          if (ifs->fail ())
+            break;
+
+          layer->visa_opcode = 0x40;
+          snprintf (layer->name, NPU_OPNAME_MAX, "%s", "PDMA_IN");
+          layer->dram_read_bytes = dsp_dma.src_addr_end - dsp_dma.src_addr_start;
+          layer->sram_write_bytes = dsp_dma.dest_addr_end - dsp_dma.dest_addr_start;
+          break;
+        case TRIV2PROF_BLOCKID_DSP_DMA_OUT:
+          ifs->read ((char *) &dsp_dma, sizeof (T2PF_DUMP_DMA));
+          if (ifs->fail ())
+            break;
+
+          layer->visa_opcode = 0x41;
+          snprintf (layer->name, NPU_OPNAME_MAX, "%s", "PDMA_OUT");
+          layer->dram_write_bytes = dsp_dma.dest_addr_end - dsp_dma.dest_addr_start;
+          layer->sram_read_bytes = dsp_dma.src_addr_end - dsp_dma.src_addr_start;
+          break;
+        default:
+          logerr (TAG, "Unknown block id detected: %u", common.block_id);
+          status = -EINVAL;
+          break;
+      }
+
+      if (status == 0 && ifs->fail ()) {
+        logerr (TAG, "Profile data is truncated at dump %u", i);
+        status = -EINVAL;
+      }
+      if (status != 0)
+        break;
+    }
+
+    if (status != 0) {
+      /* do not leave a layer count behind that refers to a released array */
+      delete[] profile->layers;
+      profile->layers = nullptr;
+      profile->num_layers = 0;
+      return status;
+    }
+  }
+
+  return 0;
+}
+
+/**
+ * @brief Parse a profile dump laid out in the TRIV2.4 format
+ * @param[in] ifs binary stream to read from; the header is read from its current position
+ * @param[out] profile profile instance to be filled with the parsed layers
+ * @return 0 if no error, otherwise -EINVAL (short read or unknown block id)
+ * @note if parsing fails after the layer array was allocated, the array is
+ *       released and both profile->layers and profile->num_layers are reset.
+ */
+int
+parseProfile24 (std::ifstream *ifs, npu_profile *profile) {
+  T24PF_HEAD head;
+  ifs->read ((char *) &head, sizeof (T24PF_HEAD));
+  if (ifs->fail ()) {
+    /* a short read would leave the dump counters below uninitialized */
+    logerr (TAG, "Failed to read the profile header");
+    return -EINVAL;
+  }
+
+  uint32_t total_dump = 0;
+
+  /* Neural Network Accelerator (NNA) */
+  total_dump += head.nna0.num_of_dump;
+  total_dump += head.nna1.num_of_dump;
+  total_dump += head.dma_in.num_of_dump;
+  total_dump += head.dma_out.num_of_dump;
+
+  /* Digital Signal Processor (DSP) */
+  total_dump += head.dsp.num_of_dump;
+
+  if (total_dump > 0) {
+    int status = 0;
+
+    profile->layers = new npu_profile_layer[total_dump];
+    profile->num_layers = total_dump;
+    profile->total_system_cycles = head.total_cycles;
+    profile->dram_input_footprint = head.dma_in.access_footprint_byte;
+    profile->dram_output_footprint = head.dma_out.access_footprint_byte;
+
+    for (uint32_t i = 0; i < total_dump; i++) {
+      npu_profile_layer *layer = &profile->layers[i];
+
+      T2PF_DUMP common;
+      T2PF_DUMP_NNA nna;
+      T2PF_DUMP_DMA dma;
+      T2PF_DUMP_DSP dsp;
+
+      /* peek the common part to learn the block id, then rewind so that
+       * the block-specific record is read from its beginning */
+      std::streampos pos = ifs->tellg ();
+      ifs->read ((char *) &common, sizeof (T2PF_DUMP));
+      if (ifs->fail ()) {
+        logerr (TAG, "Profile data is truncated at dump %u", i);
+        status = -EINVAL;
+        break;
+      }
+      ifs->seekg (pos);
+
+      memset (layer, '\x00', sizeof (npu_profile_layer));
+
+      layer->running_cycles = common.cycle_end - common.cycle_start;
+      layer->start_cycles = common.cycle_start;
+      layer->end_cycles = common.cycle_end;
+      layer->visa_prog_seq = i;
+      /* In the first run, program sequence == exec sequence */
+      layer->visa_exec_seq = global_exec_seq++;
+
+      /* each case leaves the switch right after a failed read so that an
+       * uninitialized record is never copied into the layer */
+      switch (common.block_id) {
+        case TRIV2PROF_BLOCKID_NNA0:
+        case TRIV2PROF_BLOCKID_NNA1:
+          ifs->read ((char *) &nna, sizeof (T2PF_DUMP_NNA));
+          if (ifs->fail ())
+            break;
+
+          layer->visa_opcode = nna.opcode;
+          snprintf (layer->name, NPU_OPNAME_MAX, "%s", nna.op_name);
+          break;
+        case TRIV2PROF_BLOCKID_DMA_IN:
+          ifs->read ((char *) &dma, sizeof (T2PF_DUMP_DMA));
+          if (ifs->fail ())
+            break;
+
+          layer->visa_opcode = 0x02;
+          snprintf (layer->name, NPU_OPNAME_MAX, "%s", "DMA_IN");
+          layer->dram_read_bytes = dma.src_addr_end - dma.src_addr_start;
+          layer->sram_write_bytes = dma.dest_addr_end - dma.dest_addr_start;
+          break;
+        case TRIV2PROF_BLOCKID_DMA_OUT:
+          ifs->read ((char *) &dma, sizeof (T2PF_DUMP_DMA));
+          if (ifs->fail ())
+            break;
+
+          layer->visa_opcode = 0x03;
+          snprintf (layer->name, NPU_OPNAME_MAX, "%s", "DMA_OUT");
+          layer->dram_write_bytes = dma.dest_addr_end - dma.dest_addr_start;
+          layer->sram_read_bytes = dma.src_addr_end - dma.src_addr_start;
+          break;
+        case TRIV2PROF_BLOCKID_DSP:
+          ifs->read ((char *) &dsp, sizeof (T2PF_DUMP_DSP));
+          if (ifs->fail ())
+            break;
+
+          layer->visa_opcode = dsp.opcode;
+          snprintf (layer->name, NPU_OPNAME_MAX, "%s", dsp.op_name);
+          break;
+        default:
+          logerr (TAG, "Unknown block id detected: %u", common.block_id);
+          status = -EINVAL;
+          break;
+      }
+
+      if (status == 0 && ifs->fail ()) {
+        logerr (TAG, "Profile data is truncated at dump %u", i);
+        status = -EINVAL;
+      }
+      if (status != 0)
+        break;
+    }
+
+    if (status != 0) {
+      /* do not leave a layer count behind that refers to a released array */
+      delete[] profile->layers;
+      profile->layers = nullptr;
+      profile->num_layers = 0;
+      return status;
+    }
+  }
+
+  return 0;
+}
+
+};  // namespace
 
 class EmulReq {
  public:
@@ -116,16 +320,15 @@ class EmulElement {
   model_config_t *getModelConfig (uint64_t id) { return model_config_map_.find (id); }
 
   void unsetModelConfig (uint64_t id) { model_config_map_.remove (id); }
-
-  void setNpuTops (uint64_t tops) {
-    /** for backward-compatibility, 0-tops is regarded as 8-tops */
-    if (tops == 0)
-      tops_ = 8;
-    else
-      tops_ = tops;
+  void setNpuVersion(uint64_t npu_version) {
+    npu_version_ = npu_version;
   }
 
-  uint64_t getNpuTops () const { return tops_; }
+  uint64_t getNpuVersion() const { return npu_version_; }
+  uint64_t getNpuTops () const {
+    uint64_t tops = NPU_VERSION_TOPS(npu_version_);
+    return tops != 0 ? tops : 8;
+  }
 
  private:
   static std::atomic<int> global_id_;
@@ -135,7 +338,7 @@ class EmulElement {
   size_t size_; /**< the allocated size */
 
   ThreadSafeMap<uint64_t, model_config_t> model_config_map_;
-  uint64_t tops_; /**< npu tops */
+  uint64_t npu_version_; /**< npu version */
 };
 
 /**
@@ -150,7 +353,7 @@ EmulElement::EmulElement (size_t size) {
 
   size_ = size;
   dmabuf_ = global_id_.fetch_add (1);
-  tops_ = 8;
+  npu_version_ = 0;
 }
 
 /** @brief dmabuf-to-element map */
@@ -432,7 +635,7 @@ TrinityEmulAPI::registerModel (model_config_t *model_config, uint64_t npu_versio
   memcpy (config, model_config, sizeof (model_config_t));
 
   elem->setModelConfig (config);
-  elem->setNpuTops (NPU_VERSION_TOPS (npu_version));
+  elem->setNpuVersion (npu_version);
 
   return 0;
 }
@@ -546,10 +749,18 @@ TrinityEmulAPI::runInput (input_config_t *input_config) const {
       delete[] segt;
     } else {
       std::string cmd_path (prefix_share_);
-      if (elem_model->getNpuTops () == 2)
-        cmd_path += "/mRPsim/triv2_2tops.cmd";
-      else
-        cmd_path += "/mRPsim/triv2.cmd";
+      if (NPU_VERSION_MINOR(elem_model->getNpuVersion()) == 3) {
+        if (elem_model->getNpuTops() == 2)
+          cmd_path += "/mRPsim/triv-3.9.0_2tops.cmd";
+        else
+          cmd_path += "/mRPsim/triv-3.9.0.cmd";
+      } else if (NPU_VERSION_MINOR(elem_model->getNpuVersion()) == 4) {
+        cmd_path += "/mRPsim/triv-4.0.0.cmd";
+      } else {
+        logerr (TAG, "Invalid model version: minor(%d)(%d)\n", model->version, NPU_VERSION_MINOR(model->version));
+        delete[] segt;
+        return -EINVAL;
+      }
 
       std::string prof_path (prefix_profile_);
       prof_path += "/ne_profile_" + std::to_string (getpid ());
@@ -625,111 +836,33 @@ TrinityEmulAPI::getProfile (int req_id, npu_profile *profile) const {
   T2PF_HEAD head;
   ifs.read ((char *) &head, sizeof (T2PF_HEAD));
 
+  ifs.clear ();
+  ifs.seekg (0, std::ios::beg);
+
   if (head.fmt_vesion != TRIV2PROF_FMT_VER) {
-    logerr (TAG, "Profile data format mismatch (%x vs. %x)", head.fmt_vesion, TRIV2PROF_FMT_VER);
+    ifs.close ();
     return -EINVAL;
   }
 
-  uint32_t total_dump = 0;
-
-  /* Neual Network Accelerator (NNA) */
-  total_dump += head.nna.num_of_dump;
-  total_dump += head.nna_dma_in.num_of_dump;
-  total_dump += head.nna_dma_out.num_of_dump;
-
-  /* Digital Signal Processor (DSP) */
-  total_dump += head.dsp.num_of_dump;
-  total_dump += head.dsp_dma_in.num_of_dump;
-  total_dump += head.dsp_dma_out.num_of_dump;
-
-  if (total_dump > 0) {
-    profile->layers = new npu_profile_layer[total_dump];
-    profile->num_layers = total_dump;
-    profile->total_system_cycles = head.total_cycles;
-    profile->dram_input_footprint = head.nna_dma_in.access_footprint_byte;
-    profile->dram_output_footprint = head.nna_dma_out.access_footprint_byte;
-    profile->dram_input_footprint += head.dsp_dma_in.access_footprint_byte;
-    profile->dram_output_footprint += head.dsp_dma_out.access_footprint_byte;
-
-    for (uint32_t i = 0; i < total_dump; i++) {
-      npu_profile_layer *layer = &profile->layers[i];
-
-      T2PF_DUMP common;
-      T2PF_DUMP_NNA nna;
-      T2PF_DUMP_DMA nna_dma;
-      T2PF_DUMP_DSP dsp;
-      T2PF_DUMP_DMA dsp_dma;
-
-      std::streampos pos;
-
-      pos = ifs.tellg ();
-      ifs.read ((char *) &common, sizeof (T2PF_DUMP));
-      ifs.seekg (pos);
-
-      memset (layer, '\x00', sizeof (npu_profile_layer));
-
-      layer->running_cycles = common.cycle_end - common.cycle_start;
-      layer->start_cycles = common.cycle_start;
-      layer->end_cycles = common.cycle_end;
-      layer->visa_prog_seq = i;
-      /* In the first run, program sequence == exec sequence */
-      layer->visa_exec_seq = global_exec_seq++;
-
-      switch (common.block_id) {
-        case TRIV2PROF_BLOCKID_NNA:
-          ifs.read ((char *) &nna, sizeof (T2PF_DUMP_NNA));
-
-          layer->visa_opcode = nna.opcode;
-          snprintf (layer->name, NPU_OPNAME_MAX, "%s", nna.op_name);
-          break;
-        case TRIV2PROF_BLOCKID_NNA_DMA_IN:
-          ifs.read ((char *) &nna_dma, sizeof (T2PF_DUMP_DMA));
-
-          layer->visa_opcode = 0x02;
-          snprintf (layer->name, NPU_OPNAME_MAX, "%s", "ADMA_IN");
-          layer->dram_read_bytes = nna_dma.src_addr_end - nna_dma.src_addr_start;
-          layer->sram_write_bytes = nna_dma.dest_addr_end - nna_dma.dest_addr_start;
-          break;
-        case TRIV2PROF_BLOCKID_NNA_DMA_OUT:
-          ifs.read ((char *) &nna_dma, sizeof (T2PF_DUMP_DMA));
-
-          layer->visa_opcode = 0x03;
-          snprintf (layer->name, NPU_OPNAME_MAX, "%s", "ADMA_OUT");
-          layer->dram_write_bytes = nna_dma.dest_addr_end - nna_dma.dest_addr_start;
-          layer->sram_read_bytes = nna_dma.src_addr_end - nna_dma.src_addr_start;
-          break;
-        case TRIV2PROF_BLOCKID_DSP:
-          ifs.read ((char *) &dsp, sizeof (T2PF_DUMP_DSP));
-
-          layer->visa_opcode = dsp.opcode;
-          snprintf (layer->name, NPU_OPNAME_MAX, "%s", dsp.op_name);
-          break;
-        case TRIV2PROF_BLOCKID_DSP_DMA_IN:
-          ifs.read ((char *) &dsp_dma, sizeof (T2PF_DUMP_DMA));
-
-          layer->visa_opcode = 0x40;
-          snprintf (layer->name, NPU_OPNAME_MAX, "%s", "PDMA_IN");
-          layer->dram_read_bytes = dsp_dma.src_addr_end - dsp_dma.src_addr_start;
-          layer->sram_write_bytes = dsp_dma.dest_addr_end - dsp_dma.dest_addr_start;
-          break;
-        case TRIV2PROF_BLOCKID_DSP_DMA_OUT:
-          ifs.read ((char *) &dsp_dma, sizeof (T2PF_DUMP_DMA));
+  int major, minor;
+  getProfileArchVersion(head.arch_vesion, &major, &minor);
+  if (major != 2) {
+    logerr (TAG, "Invalid profile arch version (%d.%d)", major, minor);
+    ifs.close ();
+    return -EINVAL;
+  }
 
-          layer->visa_opcode = 0x41;
-          snprintf (layer->name, NPU_OPNAME_MAX, "%s", "PDMA_OUT");
-          layer->dram_write_bytes = dsp_dma.dest_addr_end - dsp_dma.dest_addr_start;
-          layer->sram_read_bytes = dsp_dma.src_addr_end - dsp_dma.src_addr_start;
-          break;
-        default:
-          logerr (TAG, "Unknown block id detected: %u", common.block_id);
-          delete[] profile->layers;
-          profile->layers = nullptr;
-          ifs.close ();
-          return -EINVAL;
-      }
-    }
+  int ret;
+  if (minor == 3) {
+    ret = parseProfile23 (&ifs, profile);
+  } else if (minor == 4) {
+    ret = parseProfile24 (&ifs, profile);
+  } else {
+    logerr (TAG, "Invalid profile arch version (%d.%d)", major, minor);
+    ret = -EINVAL;
   }
 
   ifs.close ();
-  return 0;
+
+  return ret;
 }
index 5179f17..467ee62 100644 (file)
@@ -207,31 +207,18 @@ class EmulProfile {
   void *getData () const { return profile_.layers; }
   size_t getDataSize () const { return profile_.num_layers * sizeof (npu_profile_layer); }
 
-  bool parse () {
-    std::ifstream ifs (prof_path_, std::ios::binary);
-    if (!ifs.good ()) {
-      std::cerr << "Failed to find the profile data " << prof_path_ << "\n";
-      return false;
-    }
-
-    profile_.prof_path = strdup (prof_path_.c_str ());
-    if (!profile_.prof_path) {
-      std::cerr << "Unable to duplicate the profile path " << prof_path_ << "\n";
-      return false;
-    }
+  /**
+   * @brief Split a packed profile arch version into major and minor numbers
+   * @param[in] version the packed version word
+   * @param[out] major the 16-bit field located at TRIV2PROF_MAJOR_SHIFT
+   * @param[out] minor the lowest 16 bits of the version word
+   */
+  void getProfileArchVersion (uint32_t version, int *major, int *minor) {
+    const uint32_t field_mask = 0xffff;
+    const uint32_t shifted = version >> TRIV2PROF_MAJOR_SHIFT;
+
+    *major = shifted & field_mask;
+    *minor = version & field_mask;
+  }
 
+  bool parseProfile23 (std::ifstream &ifs) {
     T2PF_HEAD head;
     ifs.read ((char *) &head, sizeof (T2PF_HEAD));
 
-    if (head.fmt_vesion != TRIV2PROF_FMT_VER) {
-      std::cerr << "Profile data format mismatch: "
-                << "(" << head.fmt_vesion << " vs. " << TRIV2PROF_FMT_VER << ")\n";
-      return false;
-    }
-
     uint32_t total_dump = 0;
 
-    /* Neual Network Accelerator (NNA) */
+    /* Neural Network Accelerator (NNA) */
     total_dump += head.nna.num_of_dump;
     total_dump += head.nna_dma_in.num_of_dump;
     total_dump += head.nna_dma_out.num_of_dump;
@@ -270,16 +257,20 @@ class EmulProfile {
         layer->running_cycles = common.cycle_end - common.cycle_start;
         layer->start_cycles = common.cycle_start;
         layer->end_cycles = common.cycle_end;
+        layer->visa_prog_seq = i;
+        layer->visa_exec_seq = exec_seq_++;
 
         switch (common.block_id) {
           case TRIV2PROF_BLOCKID_NNA:
             ifs.read ((char *) &nna, sizeof (T2PF_DUMP_NNA));
 
+            layer->visa_opcode = nna.opcode;
             snprintf (layer->name, NPU_OPNAME_MAX, "%s", nna.op_name);
             break;
           case TRIV2PROF_BLOCKID_NNA_DMA_IN:
             ifs.read ((char *) &nna_dma, sizeof (T2PF_DUMP_DMA));
 
+            layer->visa_opcode = 0x02;
             snprintf (layer->name, NPU_OPNAME_MAX, "%s", "ADMA_IN");
             layer->dram_read_bytes = nna_dma.src_addr_end - nna_dma.src_addr_start;
             layer->sram_write_bytes = nna_dma.dest_addr_end - nna_dma.dest_addr_start;
@@ -287,6 +278,7 @@ class EmulProfile {
           case TRIV2PROF_BLOCKID_NNA_DMA_OUT:
             ifs.read ((char *) &nna_dma, sizeof (T2PF_DUMP_DMA));
 
+            layer->visa_opcode = 0x03;
             snprintf (layer->name, NPU_OPNAME_MAX, "%s", "ADMA_OUT");
             layer->dram_write_bytes = nna_dma.dest_addr_end - nna_dma.dest_addr_start;
             layer->sram_read_bytes = nna_dma.src_addr_end - nna_dma.src_addr_start;
@@ -294,11 +286,13 @@ class EmulProfile {
           case TRIV2PROF_BLOCKID_DSP:
             ifs.read ((char *) &dsp, sizeof (T2PF_DUMP_DSP));
 
+            layer->visa_opcode = dsp.opcode;
             snprintf (layer->name, NPU_OPNAME_MAX, "%s", dsp.op_name);
             break;
           case TRIV2PROF_BLOCKID_DSP_DMA_IN:
             ifs.read ((char *) &dsp_dma, sizeof (T2PF_DUMP_DMA));
 
+            layer->visa_opcode = 0x40;
             snprintf (layer->name, NPU_OPNAME_MAX, "%s", "PDMA_IN");
             layer->dram_read_bytes = dsp_dma.src_addr_end - dsp_dma.src_addr_start;
             layer->sram_write_bytes = dsp_dma.dest_addr_end - dsp_dma.dest_addr_start;
@@ -306,6 +300,7 @@ class EmulProfile {
           case TRIV2PROF_BLOCKID_DSP_DMA_OUT:
             ifs.read ((char *) &dsp_dma, sizeof (T2PF_DUMP_DMA));
 
+            layer->visa_opcode = 0x41;
             snprintf (layer->name, NPU_OPNAME_MAX, "%s", "PDMA_OUT");
             layer->dram_write_bytes = dsp_dma.dest_addr_end - dsp_dma.dest_addr_start;
             layer->sram_read_bytes = dsp_dma.src_addr_end - dsp_dma.src_addr_start;
@@ -314,18 +309,150 @@ class EmulProfile {
             std::cerr << "Unknown block id detected: " << common.block_id << std::endl;
             delete[] profile_.layers;
             profile_.layers = nullptr;
-            ifs.close ();
             return false;
         }
       }
     }
 
-    ifs.close ();
     return true;
   }
 
+  /**
+   * @brief Parse a profile dump laid out in the TRIV2.4 format into profile_
+   * @param[in] ifs binary stream to read from; the header is read from its current position
+   * @return true if the dump was parsed, false on a short read or an unknown block id
+   * @note the return type is bool to match the values actually returned and
+   *       the sibling parseProfile23(); callers storing it in an int still
+   *       observe 1/0 as before.
+   * @note if parsing fails after the layer array was allocated, the array is
+   *       released and both profile_.layers and profile_.num_layers are reset.
+   */
+  bool parseProfile24 (std::ifstream &ifs) {
+    T24PF_HEAD head;
+    ifs.read ((char *) &head, sizeof (T24PF_HEAD));
+    if (ifs.fail ()) {
+      /* a short read would leave the dump counters below uninitialized */
+      std::cerr << "Failed to read the profile header" << std::endl;
+      return false;
+    }
+
+    uint32_t total_dump = 0;
+
+    /* Neural Network Accelerator (NNA) */
+    total_dump += head.nna0.num_of_dump;
+    total_dump += head.nna1.num_of_dump;
+    total_dump += head.dma_in.num_of_dump;
+    total_dump += head.dma_out.num_of_dump;
+
+    /* Digital Signal Processor (DSP) */
+    total_dump += head.dsp.num_of_dump;
+
+    if (total_dump > 0) {
+      bool ok = true;
+
+      profile_.layers = new npu_profile_layer[total_dump];
+      profile_.num_layers = total_dump;
+      profile_.total_system_cycles = head.total_cycles;
+      profile_.dram_input_footprint = head.dma_in.access_footprint_byte;
+      profile_.dram_output_footprint = head.dma_out.access_footprint_byte;
+
+      for (uint32_t i = 0; i < total_dump; i++) {
+        npu_profile_layer *layer = &profile_.layers[i];
+
+        T2PF_DUMP common;
+        T2PF_DUMP_NNA nna;
+        T2PF_DUMP_DMA dma;
+        T2PF_DUMP_DSP dsp;
+
+        /* peek the common part to learn the block id, then rewind so that
+         * the block-specific record is read from its beginning */
+        std::streampos pos = ifs.tellg ();
+        ifs.read ((char *) &common, sizeof (T2PF_DUMP));
+        if (ifs.fail ()) {
+          std::cerr << "Profile data is truncated at dump " << i << std::endl;
+          ok = false;
+          break;
+        }
+        ifs.seekg (pos);
+
+        memset (layer, '\x00', sizeof (npu_profile_layer));
+
+        layer->running_cycles = common.cycle_end - common.cycle_start;
+        layer->start_cycles = common.cycle_start;
+        layer->end_cycles = common.cycle_end;
+        layer->visa_prog_seq = i;
+        layer->visa_exec_seq = exec_seq_++;
+
+        /* each case leaves the switch right after a failed read so that an
+         * uninitialized record is never copied into the layer */
+        switch (common.block_id) {
+          case TRIV2PROF_BLOCKID_NNA0:
+          case TRIV2PROF_BLOCKID_NNA1:
+            ifs.read ((char *) &nna, sizeof (T2PF_DUMP_NNA));
+            if (ifs.fail ())
+              break;
+
+            layer->visa_opcode = nna.opcode;
+            snprintf (layer->name, NPU_OPNAME_MAX, "%s", nna.op_name);
+            break;
+          case TRIV2PROF_BLOCKID_DMA_IN:
+            ifs.read ((char *) &dma, sizeof (T2PF_DUMP_DMA));
+            if (ifs.fail ())
+              break;
+
+            layer->visa_opcode = 0x02;
+            snprintf (layer->name, NPU_OPNAME_MAX, "%s", "ADMA_IN");
+            layer->dram_read_bytes = dma.src_addr_end - dma.src_addr_start;
+            layer->sram_write_bytes = dma.dest_addr_end - dma.dest_addr_start;
+            break;
+          case TRIV2PROF_BLOCKID_DMA_OUT:
+            ifs.read ((char *) &dma, sizeof (T2PF_DUMP_DMA));
+            if (ifs.fail ())
+              break;
+
+            layer->visa_opcode = 0x03;
+            snprintf (layer->name, NPU_OPNAME_MAX, "%s", "ADMA_OUT");
+            layer->dram_write_bytes = dma.dest_addr_end - dma.dest_addr_start;
+            layer->sram_read_bytes = dma.src_addr_end - dma.src_addr_start;
+            break;
+          case TRIV2PROF_BLOCKID_DSP:
+            ifs.read ((char *) &dsp, sizeof (T2PF_DUMP_DSP));
+            if (ifs.fail ())
+              break;
+
+            layer->visa_opcode = dsp.opcode;
+            snprintf (layer->name, NPU_OPNAME_MAX, "%s", dsp.op_name);
+            break;
+          default:
+            std::cerr << "Unknown block id detected: " << common.block_id << std::endl;
+            ok = false;
+            break;
+        }
+
+        if (ok && ifs.fail ()) {
+          std::cerr << "Profile data is truncated at dump " << i << std::endl;
+          ok = false;
+        }
+        if (!ok)
+          break;
+      }
+
+      if (!ok) {
+        /* do not leave a layer count behind that refers to a released array */
+        delete[] profile_.layers;
+        profile_.layers = nullptr;
+        profile_.num_layers = 0;
+        return false;
+      }
+    }
+
+    return true;
+  }
+
+  /**
+   * @brief Open the profile dump at prof_path_ and parse it according to the
+   *        arch version recorded in its header
+   * @return true if the profile was parsed, false otherwise
+   * @note every failure path must return false. Returning -EINVAL from this
+   *       bool function would be converted to true and reported as success.
+   */
+  bool parse () {
+    std::ifstream ifs (prof_path_, std::ios::binary);
+    if (!ifs.good ()) {
+      std::cerr << "Failed to find the profile data " << prof_path_ << "\n";
+      return false;
+    }
+
+    profile_.prof_path = strdup (prof_path_.c_str ());
+    if (!profile_.prof_path) {
+      std::cerr << "Unable to duplicate the profile path " << prof_path_ << "\n";
+      return false;
+    }
+
+    /* The header is read here only to learn the versions. It is zero-filled
+     * first so that a short read never exposes uninitialized fields, and the
+     * stream state is cleared and rewound afterwards so that the
+     * version-specific parser reads its own header from offset 0. */
+    T2PF_HEAD head;
+    memset (&head, '\x00', sizeof (T2PF_HEAD));
+    ifs.read ((char *) &head, sizeof (T2PF_HEAD));
+
+    ifs.clear ();
+    ifs.seekg (0, std::ios::beg);
+
+    /* early returns rely on the std::ifstream destructor to close the file */
+    if (head.fmt_vesion != TRIV2PROF_FMT_VER) {
+      std::cerr << "Profile data format mismatch: "
+                << "(" << head.fmt_vesion << " vs. " << TRIV2PROF_FMT_VER << ")\n";
+      return false;
+    }
+
+    int major, minor;
+    getProfileArchVersion (head.arch_vesion, &major, &minor);
+
+    bool parsed = false;
+    if (major == 2 && minor == 3) {
+      parsed = parseProfile23 (ifs);
+    } else if (major == 2 && minor == 4) {
+      parsed = parseProfile24 (ifs);
+    } else {
+      std::cerr <<"Invalid profile arch version (" << major << "." << minor << ")" << std::endl;
+    }
+
+    ifs.close ();
+
+    return parsed;
+  }
+
  private:
   int req_id_;
+  uint64_t exec_seq_;
   std::string prof_path_;
   npu_profile profile_;
 };
@@ -492,7 +619,7 @@ triv2_release (trinity_cuse_context *ctx) {
  */
 static int
 triv2_get_version (trinity_cuse_context *ctx, uint32_t *version) {
-  *version = trinity_gen_ver (TRINITY_DEV_VISION2_CUSE, 2, 0, 0);
+  *version = trinity_gen_ver (TRINITY_DEV_VISION2_CUSE, 2, 4, 0);
   return 0;
 }
 
@@ -657,9 +784,6 @@ triv2_run_input (trinity_cuse_context *ctx, const trinity_cuse_input *in, trinit
   if (stat == nullptr)
     return -ENOENT;
 
-  if (model->getVersion () != 3)
-    return -EINVAL;
-
   EmulDmabuf *dbuf_model = global_dmabuf_map.find (model->getDbufFD ());
   if (dbuf_model == nullptr)
     return -ENOENT;
@@ -705,10 +829,7 @@ triv2_run_input (trinity_cuse_context *ctx, const trinity_cuse_input *in, trinit
   triv2_get_tops (ctx, &tops);
 
   std::string cmd_path (ctx->prefix_share);
-  if (tops == 2)
-    cmd_path += "/mRPsim/triv2_2tops.cmd";
-  else
-    cmd_path += "/mRPsim/triv2.cmd";
+  cmd_path += "/mRPsim/triv-4.0.0.cmd";
 
   int req_id = in->req_id;
   std::string prof_path (ctx->prefix_profile);