*/
#include <NPUdrvAPI.h>
+#include <GEMdrvAPI.h>
#include <NPUemul.h>
+#include <npubinfmt.h>
+#include <TrinityCore.h>
+#include <DataGen.h>
+
+#include <sys/mman.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <string.h>
#include <errno.h>
+#include <iostream>
+
+#define PAGE_SIZE 4096
+
+using namespace std;
+using namespace trinity_vision;
+
+/**
+ * @brief NPU emulation using TrinityCore
+ *
+ * The emulator keeps the latest model and input configurations handed over
+ * through set_model() / set_input(). run() maps both dmabufs, walks the
+ * program section of the model one instruction at a time, and feeds each
+ * supported instruction to TrinityCore (do_main_operation). Results are
+ * copied back into the mapped input/output buffer.
+ *
+ * Model layout as used by this class:
+ *   [ meta : NPUBIN_META_SIZE ][ program : meta.program_size ][ weights ]
+ */
+class NPUCoreEmul : public TrinityCore<64>, public DataGen {
+ public:
+  /** @brief create an emulator with no model and no input configured */
+  NPUCoreEmul ()
+    : model_set (false), input_set (false), PC (NULL), model_base (NULL),
+      program_base (NULL), weight_base (NULL), buffer_base (NULL) {
+    meta.size = 0;
+    meta.buffer_size = 0;
+    meta.program_size = 0;
+  }
+
+  /**
+   * @brief set npu model for emulation
+   * @param[in] config model configuration; its dmabuf_id is used to read the
+   *                   model metadata. A NULL config is ignored.
+   */
+  void set_model (npu_model_config *config) {
+    if (!config)
+      return;
+
+    model_config = *config;
+    set_meta (model_config.dmabuf_id);
+    model_set = true;
+  }
+
+  /**
+   * @brief set input buffer for emulation
+   * @param[in] config input configuration; a NULL config is ignored.
+   */
+  void set_input (npu_input_config *config) {
+    if (!config)
+      return;
+
+    input_config = *config;
+    input_set = true;
+  }
+
+  /**
+   * @brief run inference using emulation
+   * @return the value of run_inference() when the emulation was executed,
+   *         0 when the emulator is not ready or a buffer cannot be mapped
+   *         (kept as 0 for compatibility with existing callers).
+   */
+  int run (void) {
+    if (!model_set || !input_set)
+      return 0;
+
+    if (meta.size == 0 || meta.buffer_size == 0)
+      return 0;
+
+    /** the program section must lie inside the mapped model */
+    if (!emul_in_range (NPUBIN_META_SIZE,
+          static_cast<int64_t>(meta.program_size), meta.size)) {
+      cerr << "Invalid program size in the model metadata" << endl;
+      return 0;
+    }
+
+    void *model = gem_mmap (model_config.dmabuf_id, 0, meta.size, PROT_READ);
+    if (!emul_is_mapped (model)) {
+      cerr << "Fail to map the model buffer" << endl;
+      return 0;
+    }
+
+    /** results are written back into this buffer, so it must be writable */
+    void *input = gem_mmap (input_config.dmabuf_id, 0, meta.buffer_size,
+        PROT_READ | PROT_WRITE);
+    if (!emul_is_mapped (input)) {
+      cerr << "Fail to map the input buffer" << endl;
+      /** do not leak the model mapping obtained above */
+      gem_munmap (model, meta.size);
+      return 0;
+    }
+
+    model_base = static_cast<char*>(model);
+    buffer_base = static_cast<char*>(input);
+    program_base = model_base + NPUBIN_META_SIZE;
+    weight_base = program_base + meta.program_size;
+
+    int ret = run_inference ();
+
+    gem_munmap (model, meta.size);
+    gem_munmap (input, meta.buffer_size);
+
+    /** the mappings are gone; do not keep dangling pointers around */
+    PC = NULL;
+    model_base = program_base = weight_base = buffer_base = NULL;
+
+    return ret;
+  }
+
+ protected:
+  /**
+   * @brief set metadata from the dmabuf
+   * @param[in] dmabuf_id id of the dmabuf holding the model
+   * @note on mapping failure the cached sizes are zeroed, so that run()
+   *       refuses to execute with stale metadata.
+   */
+  void set_meta (int dmabuf_id) {
+    /** in test mode, dmabuf_fd == dmabuf_id */
+    void *meta_addr = gem_mmap (dmabuf_id, 0, NPUBIN_META_SIZE, PROT_READ);
+
+    if (!emul_is_mapped (meta_addr)) {
+      cerr << "Fail to map the model metadata" << endl;
+      meta.size = 0;
+      meta.buffer_size = 0;
+      meta.program_size = 0;
+      return;
+    }
+
+    /** NOTE(review): assumes sizeof(npubin_meta) >= NPUBIN_META_SIZE - confirm */
+    memcpy (&meta, meta_addr, NPUBIN_META_SIZE);
+    gem_munmap (meta_addr, NPUBIN_META_SIZE);
+  }
+
+  /**
+   * @brief get the current opcode
+   * @return the first byte at PC interpreted as an opcode
+   */
+  visa_opcode get_opcode () {
+    /** go through unsigned char so that values >= 0x80 are not sign-extended */
+    return static_cast<visa_opcode>(static_cast<unsigned char>(PC[0]));
+  }
+
+  /**
+   * @brief dump the output data to a file and read it back into memory
+   * @param[in] out_addr offset of the output within the mapped buffer
+   */
+  void dump_fmap_out (uint32_t out_addr) {
+    Point1D<int8_t, 8> data_mem;
+
+    /** TrinityCore provides API to dump data to a file only */
+    trinity_fmap_out.write_data_file("/tmp", "output_fmap", true, true, true);
+
+    data_mem.clean();
+    if (!read_data_file("/tmp", "output_fmap", data_mem))
+      return;
+
+    /** never write past the end of the mapped buffer */
+    if (!emul_in_range (out_addr,
+          static_cast<int64_t>(data_mem.get_mem_size()), meta.buffer_size)) {
+      cerr << "Output data does not fit in the buffer" << endl;
+      return;
+    }
+
+    memcpy(buffer_base + out_addr,
+        data_mem.get_p_data(),
+        data_mem.get_mem_size());
+  }
+
+  /**
+   * @brief run ESUM (elementwise sum) op
+   * @param[in,out] op operation parameters filled from the instruction at PC
+   * @note PC is advanced by sizeof(esum_fields) whether or not the op ran.
+   */
+  void run_esum_op (TRINITY_CORE_PARA_OP& op) {
+    esum_fields esum;
+
+    memcpy(&esum, PC, sizeof(esum_fields));
+
+    /** NOTE(review): 7 is presumably TrinityCore's id for ESUM - confirm */
+    op.info.OPCODE = 7;
+    op.cfg.WGT_QBIT = 1;
+    op.cfg.FLT_H = 1;
+    op.cfg.FLT_W = 1;
+    /** the instruction stores sizes minus one ("_m1") */
+    op.data_size.OUT_H = esum.out_h_m1 + 1;
+    op.data_size.OUT_W = esum.out_w_m1 + 1;
+    op.data_size.OUT_D = esum.out_d_m1 + 1;
+    op.data_size.IN0_D = esum.out_d_m1 + 1;
+    op.quant.IN0_ZERO = esum.in0_zero;
+    op.quant.IN1_ZERO = esum.in1_zero;
+    op.quant.OUT_ZERO = esum.out_zero;
+    op.quant.OUT_LSHAMT = esum.out_shamt;
+    op.quant.IN0_LSHAMT = 20;
+    op.quant.IN0_RSHAMT = esum.in0_shamt;
+    op.quant.IN1_LSHAMT = 20;
+    op.quant.IN1_RSHAMT = esum.in1_shamt;
+    op.quant.IN0_MULT = esum.in0_mult;
+    op.quant.IN1_MULT = esum.in1_mult;
+    op.quant.OUT_MULT = esum.out_mult;
+
+    if (alloc_trinity_data (op, esum.in0_eaddr0, esum.in1_eaddr0, -1)) {
+      dump_fmap_out(esum.out_eaddr0);
+    }
+
+    PC += sizeof(esum_fields);
+  }
+
+  /**
+   * @brief run MAXP (max pooling) op
+   * @param[in,out] op operation parameters filled from the instruction at PC
+   * @note PC is advanced by sizeof(maxp_fields) whether or not the op ran.
+   */
+  void run_maxp_op (TRINITY_CORE_PARA_OP& op) {
+    maxp_fields maxp;
+
+    memcpy(&maxp, PC, sizeof(maxp_fields));
+
+    /** NOTE(review): 4 is presumably TrinityCore's id for MAXP - confirm */
+    op.info.OPCODE = 4;
+    op.cfg.WGT_QBIT = 1;
+    op.cfg.FLT_H = maxp.flt_h_m1 + 1;
+    op.cfg.FLT_W = maxp.flt_w_m1 + 1;
+    op.cfg.STR_Y = maxp.str_y_m1 + 1;
+    /**
+     * NOTE(review): STR_X is taken from str_y_m1, which looks like a
+     * copy-paste slip; switch to the x-stride field if maxp_fields has one.
+     */
+    op.cfg.STR_X = maxp.str_y_m1 + 1;
+    op.cfg.PAD_L = maxp.pad_l;
+    op.cfg.PAD_R = maxp.pad_r;
+    op.cfg.PAD_T = maxp.pad_t;
+    op.cfg.PAD_B = maxp.pad_b;
+    op.data_size.OUT_H = maxp.out_h_m1 + 1;
+    op.data_size.OUT_W = maxp.out_w_m1 + 1;
+    op.data_size.OUT_D = maxp.out_d_m1 + 1;
+    op.data_size.IN0_D = maxp.out_d_m1 + 1;
+    op.quant.IN0_ZERO = 0;
+    op.quant.OUT_ZERO = 0;
+
+    if (alloc_trinity_data (op, maxp.in0_eaddr0, -1, -1)) {
+      dump_fmap_out(maxp.out_eaddr0);
+    }
+
+    PC += sizeof(maxp_fields);
+  }
+
+  /**
+   * @brief run CONV (normal convolution) op
+   * @param[in,out] op operation parameters filled from the instruction at PC
+   * @note PC is advanced by sizeof(conv_fields) whether or not the op ran.
+   */
+  void run_conv_op (TRINITY_CORE_PARA_OP& op) {
+    conv_fields conv;
+
+    memcpy(&conv, PC, sizeof(conv_fields));
+
+    /** NOTE(review): 0 is presumably TrinityCore's id for CONV - confirm */
+    op.info.OPCODE = 0;
+    op.info.CNV_RELU_EN = conv.relu_en;
+    op.cfg.CNV_BIAS_EN = conv.bias_en;
+    op.cfg.WGT_QBIT = conv.wgt_q + 1;
+    op.cfg.FLT_H = conv.flt_h_m1 + 1;
+    op.cfg.FLT_W = conv.flt_w_m1 + 1;
+    op.cfg.STR_Y = conv.str_y_m1 + 1;
+    /**
+     * NOTE(review): STR_X is taken from str_y_m1, which looks like a
+     * copy-paste slip; switch to the x-stride field if conv_fields has one.
+     */
+    op.cfg.STR_X = conv.str_y_m1 + 1;
+    op.cfg.PAD_L = conv.pad_l;
+    op.cfg.PAD_R = conv.pad_r;
+    op.cfg.PAD_T = conv.pad_t;
+    op.cfg.PAD_B = conv.pad_b;
+    op.data_size.OUT_H = conv.out_h_m1 + 1;
+    op.data_size.OUT_W = conv.out_w_m1 + 1;
+    op.data_size.OUT_D = conv.out_d_m1 + 1;
+    op.data_size.IN0_D = conv.in0_d_m1 + 1;
+    op.quant.IN0_ZERO = conv.in0_zero;
+    op.quant.OUT_ZERO = conv.out_zero;
+    op.quant.OUT_LSHAMT = conv.out_shamt;
+    op.quant.OUT_MULT = conv.out_mult;
+
+    if (alloc_trinity_data (op, conv.in0_eaddr0, -1, conv.wgt_eaddr0)) {
+      dump_fmap_out(conv.out_eaddr0);
+    }
+
+    PC += sizeof(conv_fields);
+  }
+
+  /**
+   * @brief allocate data for emulation and execute the op
+   * @param[in] op operation parameters
+   * @param[in] in0_eaddr offset of the first input within the buffer
+   * @param[in] in1_eaddr offset of the second input, or -1 if unused
+   * @param[in] wgt_eaddr offset of the weights within the weight section,
+   *                      or -1 if unused
+   * @return true if the op was executed, false if the op could not be parsed
+   *         or one of its operands lies outside the mapped memory
+   */
+  bool alloc_trinity_data (TRINITY_CORE_PARA_OP& op,
+      int32_t in0_eaddr, int32_t in1_eaddr, int32_t wgt_eaddr) {
+    TRINITY_FMAP_PARA para_fmap_in;
+    TRINITY_FMAP_PARA para_fmap_out;
+    WGT_PARA para_weight;
+    SIZE3D size3d_in, size3d_out;
+
+    if (!calc_tensor_data_size(op, para_fmap_in, para_weight, para_fmap_out)) {
+      cerr << "Fail to parse a trinity op" << endl;
+      return false;
+    }
+
+    size3d_in.depth = para_fmap_in.depth;
+    size3d_in.height = para_fmap_in.height;
+    size3d_in.width = para_fmap_in.width;
+
+    size3d_out.depth = para_fmap_out.depth;
+    size3d_out.height = para_fmap_out.height;
+    size3d_out.width = para_fmap_out.width;
+
+    /** compute in 64 bits so that a huge shape cannot wrap around silently */
+    const int64_t input_bytes = static_cast<int64_t>(size3d_in.depth) *
+        static_cast<int64_t>(size3d_in.height) *
+        static_cast<int64_t>(size3d_in.width);
+
+    if (input_bytes > INT32_MAX ||
+        !emul_in_range (in0_eaddr, input_bytes, meta.buffer_size)) {
+      cerr << "Input fmap is out of the buffer range" << endl;
+      return false;
+    }
+
+    if (in1_eaddr != -1 &&
+        !emul_in_range (in1_eaddr, input_bytes, meta.buffer_size)) {
+      cerr << "Input esum is out of the buffer range" << endl;
+      return false;
+    }
+
+    const int32_t input_size = static_cast<int32_t>(input_bytes);
+
+    /** input fmap */
+    Point1D<int8_t, 8> input_data;
+
+    input_data.clean();
+    input_data.alloc (input_size);
+    memcpy(input_data.get_p_data(), buffer_base + in0_eaddr, input_size);
+
+    trinity_fmap_in.alloc(size3d_in);
+    trinity_fmap_in.set_data(input_data);
+
+    /** input esum (optional) */
+    if (in1_eaddr != -1) {
+      input_data.clean();
+      input_data.alloc (input_size);
+      memcpy(input_data.get_p_data(), buffer_base + in1_eaddr, input_size);
+
+      trinity_esum_in.alloc(size3d_in);
+      trinity_esum_in.set_data(input_data);
+    }
+
+    /** weight (optional) */
+    if (wgt_eaddr != -1) {
+      Point1D<int8_t, 8> wgt_data;
+      int32_t weight_size = para_weight.CACL_SIZE_BYTE;
+
+      /** weights live after the meta and program sections of the model */
+      const uint64_t weight_offset =
+          static_cast<uint64_t>(NPUBIN_META_SIZE) + meta.program_size;
+      const uint64_t weight_limit =
+          (meta.size > weight_offset) ? (meta.size - weight_offset) : 0;
+
+      if (!emul_in_range (wgt_eaddr, weight_size, weight_limit)) {
+        cerr << "Weight is out of the model range" << endl;
+        return false;
+      }
+
+      wgt_data.clean();
+      wgt_data.alloc (weight_size);
+      memcpy(wgt_data.get_p_data(), weight_base + wgt_eaddr, weight_size);
+
+      trinity_weight.alloc(para_weight);
+      trinity_weight.set_data(wgt_data, WBIN_NORMAL);
+    }
+
+    /** output fmap */
+    trinity_fmap_out.alloc(size3d_out);
+
+    do_main_operation(op,
+        trinity_fmap_in, trinity_esum_in,
+        trinity_weight, trinity_fmap_out);
+
+    return true;
+  }
+
+  /**
+   * @brief inference main loop
+   * @return always 0
+   * @note the loop ends at VISA_SAW, at an unknown opcode, or when PC leaves
+   *       the program section.
+   */
+  int run_inference () {
+    bool stop = false;
+
+    PC = program_base;
+
+    while (!stop && PC < program_base + meta.program_size) {
+      visa_opcode opcode = get_opcode ();
+      /** handlers fill only some fields; start from a value-initialized op */
+      TRINITY_CORE_PARA_OP op = TRINITY_CORE_PARA_OP ();
+
+      switch (opcode) {
+        case VISA_RELU: /** @todo */
+        case VISA_AVGP: /** @todo */
+        case VISA_NOP: /** do nothing */
+          PC += sizeof(uint32_t);
+          break;
+        case VISA_SAW: /** program end */
+          PC += sizeof(uint32_t);
+          stop = true;
+          break;
+        case VISA_ESUM:
+          run_esum_op (op);
+          break;
+        case VISA_MAXP:
+          run_maxp_op (op);
+          break;
+        default:
+          /** CONV/CONVE are matched with the lowest opcode bit ignored */
+          if ((opcode & 0xFE) == VISA_CONV)
+            run_conv_op (op);
+          else if ((opcode & 0xFE) == VISA_CONVE)
+            PC += sizeof(uint32_t); /** @todo */
+          else
+            stop = true;
+          break;
+      }
+    }
+
+    return 0;
+  }
+
+ private:
+  /** @brief check whether a gem_mmap() result is usable (neither NULL nor MAP_FAILED) */
+  static bool emul_is_mapped (const void *addr) {
+    return addr != NULL && addr != MAP_FAILED;
+  }
+
+  /** @brief check that [offset, offset + length) lies within [0, limit) */
+  static bool emul_in_range (int64_t offset, int64_t length, uint64_t limit) {
+    if (offset < 0 || length < 0)
+      return false;
+    return static_cast<uint64_t>(offset) + static_cast<uint64_t>(length) <= limit;
+  }
+
+  npubin_meta meta;               /**< metadata copied from the model dmabuf */
+  npu_model_config model_config;  /**< latest model configuration */
+  npu_input_config input_config;  /**< latest input configuration */
+
+  bool model_set;                 /**< set_model() has been called */
+  bool input_set;                 /**< set_input() has been called */
+
+  char *PC;                       /**< current position in the program */
+  char *model_base;               /**< start of the mapped model */
+  char *program_base;             /**< start of the program section */
+  char *weight_base;              /**< start of the weight section */
+  char *buffer_base;              /**< start of the mapped input/output buffer */
+
+  TR_FMAP trinity_fmap_in;
+  TR_FMAP trinity_esum_in;
+  TR_FMAP trinity_fmap_out;
+  TrinityWgt trinity_weight;
+};
+
+static NPUCoreEmul emul;
+
/**
* @brief npu ioctl emulation
* @param[in] fd file descriptor of the device
*/
int npu_ioctl_emul (int fd, unsigned long cmd, void *arg)
{
+ if (!arg)
+ return -EINVAL;
+
switch (cmd) {
case SRNPU_IOCTL_WAIT_READY:
usleep(10000); /** 10 ms */
return 0;
}
case SRNPU_IOCTL_SET_MODEL:
- /** @todo */
+ {
+ npu_model_config * model_config;
+ model_config = static_cast<npu_model_config *>(arg);
+ emul.set_model(model_config);
+ return 0;
+ }
case SRNPU_IOCTL_RUN_INPUT:
- /** @todo */
- return 0;
+ {
+ npu_input_config * input_config;
+ input_config = static_cast<npu_input_config *>(arg);
+ emul.set_input(input_config);
+ return emul.run();
+ }
default:
return -EINVAL;
}