src/core/ne-handler.cc

   1 /**
   2  * Proprietary
   3  * Copyright (C) 2020 Samsung Electronics
   4  * Copyright (C) 2020 Dongju Chae <dongju.chae@samsung.com>
   5  */
   6 /**
 * @file ne-handler.cc
   8  * @date 03 Apr 2020
   9  * @brief Implementation of APIs to access NPU from Host
  10  * @see https://code.sec.samsung.net/confluence/display/ODLC/2020+Overall+Software+Stack
  11  * @author Dongju Chae <dongju.chae@samsung.com>
  12  * @bug No known bugs except for NYI items
  13  */
  14
  15 #include "ne-handler.h"
  16
  17 #include <libnpuhost.h>
  18 #include <npubinfmt.h>
  19 #include <NPUdrvAPI.h>
  20 #include <CommPlugin.h>
  21
  22 #include <string.h>
  23 #include <assert.h>
  24
  25 #include <condition_variable>
  26 #include <functional>
  27 #include <atomic>
  28 #include <map>
  29
  30 #define TAG _N2
  31
  32 #define INIT_HOST_HANDLER(handler, dev) \
  33   Device *tdev = static_cast <Device *> (dev); \
  34   if (tdev == nullptr) return -EINVAL; \
  35   HostHandler *handler = tdev->getHostHandler (); \
  36   if (handler == nullptr) return -EINVAL;
  37
/**
 * Most recently obtained device handle; kept only for backward-compatibility.
 * It is overwritten by every successful HostHandler::getDevice() call.
 */
npudev_h HostHandler::latest_dev_ = nullptr;
  40
  41 /** implement libnpuhost APIs */
  42
  43 /**
  44  * @brief Returns the number of available NPU devices.
  45  * @return @c The number of NPU devices.
  46  * @retval 0 if no NPU devices available. if positive (number of NPUs) if NPU devices available. otherwise, a negative error value.
  47  * @note the caller should call putNPUdevice() to release the device handle
  48  */
  49 int getnumNPUdeviceByType (dev_type type)
  50 {
  51   return HostHandler::getNumDevices (type);
  52 }
  53
  54 /**
  55  * @brief Returns the handle of the chosen NPU devices.
  56  * @param[out] dev The NPU device handle
  57  * @param[in] id The NPU id to get the handle. 0 <= id < getnumNPUdeviceByType().
  58  * @return @c 0 if no error. otherwise a negative error value
  59  * @note the caller should call putNPUdevice() to release the device handle
  60  */
  61 int getNPUdeviceByType (npudev_h *dev, dev_type type, uint32_t id)
  62 {
  63   return HostHandler::getDevice (dev, type, id);
  64 }
  65
  66 /**
  67  * @brief release the NPU device instance obtained by getDevice ()
  68  * @param[in] dev the NPU device handle
  69  */
  70 void putNPUdevice (npudev_h dev)
  71 {
  72   if (dev != nullptr)
  73     delete static_cast<Device *> (dev);
  74 }
  75
  76 /**
  77  * @brief Send the NN model to NPU.
  78  * @param[in] dev The NPU device handle
  79  * @param[in] modelfile The filepath to the compiled NPU NN model in any buffer_type
  80  * @param[out] modelid The modelid allocated for this instance of NN model.
  81  * @return @c 0 if no error. otherwise a negative error value
  82  *
  83  * @detail For ASR devices, which do not accept models, but have models
  84  *         embedded in devices, you do not need to call register and
  85  *         register calls for ASR are ignored.
  86  *
  87  * @todo Add a variation: in-memory model register.
  88  */
  89 int registerNPUmodel (npudev_h dev, generic_buffer *modelfile, uint32_t *modelid)
  90 {
  91   INIT_HOST_HANDLER (host_handler, dev);
  92
  93   return host_handler->registerModel (modelfile, modelid);
  94 }
  95
  96 /**
  97  * @brief Remove the NN model from NPU
  98  * @param[in] dev The NPU device handle
  99  * @param[in] modelid The model to be removed from the NPU.
 100  * @return @c 0 if no error. otherwise a negative error value
 101  * @detail This may incur some latency with memory compatcion.
 102  */
 103 int unregisterNPUmodel(npudev_h dev, uint32_t modelid)
 104 {
 105   INIT_HOST_HANDLER (host_handler, dev);
 106
 107   return host_handler->unregisterModel (modelid);
 108 }
 109
 110 /**
 111  * @brief Remove all NN models from NPU
 112  * @param[in] dev The NPU device handle
 113  * @return @c 0 if no error. otherwise a negative error value
 114  */
 115 int unregisterNPUmodel_all(npudev_h dev)
 116 {
 117   INIT_HOST_HANDLER (host_handler, dev);
 118
 119   return host_handler->unregisterModels ();
 120 }
 121
 122 /**
 123  * @brief [OPTIONAL] Set the data layout for input/output tensors
 124  * @param[in] dev The NPU device handle
 125  * @param[in] modelid The ID of model whose layouts are set
 126  * @param[in] info_in the layout/type info for input tensors
 127  * @param[in] info_out the layout/type info for output tensors
 128  * @return @c 0 if no error. otherwise a negative error value
 129  * @note if this function is not called, default layout/type will be used.
 130  */
 131 int setNPU_dataInfo(npudev_h dev, uint32_t modelid,
 132     tensors_data_info *info_in, tensors_data_info *info_out)
 133 {
 134   INIT_HOST_HANDLER (host_handler, dev);
 135
 136   return host_handler->setDataInfo (modelid, info_in, info_out);
 137 }
 138
 139 /**
 140  * @brief [OPTIONAL] Set the inference constraint for next NPU inferences
 141  * @param[in] dev The NPU device handle
 142  * @param[in] modelid The target model id
 143  * @param[in] constraint inference constraint (e.g., timeout, priority)
 144  * @return @c 0 if no error. otherwise a negative error value
 145  * @note If this function is not called, default values are used.
 146  */
 147 int setNPU_constraint(npudev_h dev, uint32_t modelid, npuConstraint constraint)
 148 {
 149   INIT_HOST_HANDLER (host_handler, dev);
 150
 151   return host_handler->setConstraint (modelid, constraint);
 152 }
 153
 154 /**
 155  * @brief Execute inference. Wait (block) until the output is available.
 156  * @param[in] dev The NPU device handle
 157  * @param[in] modelid The model to be inferred.
 158  * @param[in] input The input data to be inferred.
 159  * @param[out] output The output result. The caller MUST allocate appropriately before calling this.
 160  * @return @c 0 if no error. otherwise a negative error value
 161  *
 162  * @detail This is a syntactic sugar of runNPU_async().
 163  *         CAUTION: There is a memcpy for the output buffer.
 164  */
 165 int runNPU_sync(npudev_h dev, uint32_t modelid, const input_buffers *input,
 166     output_buffers *output)
 167 {
 168   INIT_HOST_HANDLER (host_handler, dev);
 169
 170   return host_handler->runSync (modelid, input, output);
 171 }
 172
 173 /**
 174  * @brief Invoke NPU inference. Unblocking call.
 175  * @param[in] dev The NPU device handle
 176  * @param[in] modelid The model to be inferred.
 177  * @param[in] input The input data to be inferred.
 178  * @param[in] cb The output buffer handler.
 179  * @param[out] sequence The sequence number returned with runNPU_async.
 180  * @param[in] data The data given as a parameter to the runNPU_async call.
 181  * @param[in] mode Configures how this operation works.
 182  * @return @c 0 if no error. otherwise a negative error value
 183  */
 184 int runNPU_async(npudev_h dev, uint32_t modelid, const input_buffers *input,
 185     npuOutputNotify cb, uint64_t *sequence, void *data,
 186     npu_async_mode mode)
 187 {
 188   INIT_HOST_HANDLER (host_handler, dev);
 189
 190   return host_handler->runAsync (modelid, input, cb, data, mode, sequence);
 191 }
 192
 193 /**
 194  * @brief Allocate a generic buffer with the requested buffer type.
 195  * @param[in] dev The NPU device handle
 196  * @param[in/out] Buffer the buffer pointer where memory is allocated.
 197  * @return 0 if no error, otherwise a negative errno.
 198  */
 199 int allocNPU_genericBuffer (npudev_h dev, generic_buffer * buffer)
 200 {
 201   INIT_HOST_HANDLER (host_handler, dev);
 202
 203   return host_handler->allocGenericBuffer (buffer);
 204 }
 205
 206 /**
 207  * @brief Free the generic buffer and remove the address mapping
 208  * @param[in] dev The NPU device handle
 209  * @param[in] buffer the model buffer
 210  * @return 0 if no error, otherwise a negative errno.
 211  */
 212 int cleanNPU_genericBuffer (npudev_h dev, generic_buffer * buffer)
 213 {
 214   INIT_HOST_HANDLER (host_handler, dev);
 215
 216   return host_handler->deallocGenericBuffer (buffer);
 217 }
 218
 219 /**
 220  * @brief Allocate generic buffers, which have multiple instances of generic_buffer
 221  * @param[in] dev The NPU device handle
 222  * @param[in/out] buffers generic buffers.
 223  * @return 0 if no error, otherwise a negative errno.
 224  * @note it reuses allocGenericBuffer().
 225  */
 226 int allocNPU_genericBuffers (npudev_h dev, generic_buffers * buffers)
 227 {
 228   INIT_HOST_HANDLER (host_handler, dev);
 229
 230   return host_handler->allocGenericBuffer (buffers);
 231 }
 232
 233 /**
 234  * @brief Free generic buffers allocated by allocGenericBuffers().
 235  * @param[in] dev The NPU device handle
 236  * @param[in/out] buffers generic buffers.
 237  * @note it reuses cleanGenericbuffer().
 238  * @return 0 if no error, otherwise a negative errno.
 239  */
 240 int cleanNPU_genericBuffers (npudev_h dev, generic_buffers * buffers)
 241 {
 242   INIT_HOST_HANDLER (host_handler, dev);
 243
 244   return host_handler->deallocGenericBuffer (buffers);
 245 }
 246
 247 /**
 248  * @brief alias of allocNPU_genericBuffer for model buffer
 249  */
 250 int allocNPU_modelBuffer (npudev_h dev, generic_buffer * model)
 251 {
 252   return allocNPU_genericBuffer (dev, model);
 253 }
 254
 255 /**
 256  * @brief alias of cleanNPU_genericBuffer for model buffer
 257  */
 258 int cleanNPU_modelBuffer (npudev_h dev, generic_buffer * model)
 259 {
 260   return cleanNPU_genericBuffer (dev, model);
 261 }
 262
 263 /**
 264  * @brief alias of allocNPU_genericBuffer for input buffer
 265  */
 266 int allocNPU_inputBuffer (npudev_h dev, generic_buffer * input)
 267 {
 268   return allocNPU_genericBuffer (dev, input);
 269 }
 270
 271 /**
 272  * @brief alias of cleanNPU_genericBuffer for input buffer
 273  */
 274 int cleanNPU_inputBuffer (npudev_h dev, generic_buffer * input)
 275 {
 276   return cleanNPU_genericBuffer (dev, input);
 277 }
 278
 279 /**
 280  * @brief alias of allocNPU_genericBuffers for input buffers
 281  */
 282 int allocNPU_inputBuffers (npudev_h dev, input_buffers * input)
 283 {
 284   return allocNPU_genericBuffers (dev, input);
 285 }
 286
 287 /**
 288  * @brief alias of cleanNPU_genericBuffers for input buffers
 289  */
 290 int cleanNPU_inputBuffers (npudev_h dev, input_buffers * input)
 291 {
 292   return cleanNPU_genericBuffers (dev, input);
 293 }
 294
 295 /**
 296  * @brief get the current memory status for the given device
 297  * @param[in] dev The NPU device handle
 298  * @param[out] alloc_total The size of allocated memory until now
 299  * @param[out] free_total The size of freed memory until now
 300  * @return @c 0 if no error. otherwise a negatice error value
 301  */
 302 int getNPU_memoryStatus(npudev_h dev, size_t *alloc_total, size_t *free_total)
 303 {
 304   INIT_HOST_HANDLER (host_handler, dev);
 305
 306   return host_handler->getMemoryStatus (alloc_total, free_total);
 307 }
 308
 309 /**
 310  * @brief Get the current device status to be used
 311  * @param[in] dev The NPU device handle
 312  * @param[out] status the device status
 313  * @param[out] num_requests the number of running requests (or pending)
 314  * @return 0 if no error, otherwise a negative errno.
 315  */
 316 int getNPU_deviceStatus(npudev_h dev, npu_status *status, uint32_t *num_requests)
 317 {
 318   INIT_HOST_HANDLER (host_handler, dev);
 319
 320   return host_handler->getDeviceStatus (status, num_requests);
 321 }
 322
 323 /**
 324  * @brief Get metadata for NPU model
 325  * @param[in] model The path of model binary file
 326  * @param[in] need_extra whether you want to extract the extra data in metadata
 327  * @return the metadata structure to be filled if no error, otherwise nullptr
 328  *
 329  * @note For most npu-engine users, the extra data is not useful because it will be
 330  *       used for second-party users (e.g., compiler, simulator).
 331  *       Also, the caller needs to free the metadata.
 332  *
 333  * @note the caller needs to free the metadata
 334  */
 335 npubin_meta * getNPUmodel_metadata (const char *model, bool need_extra)
 336 {
 337   npubin_meta *meta;
 338   FILE *fp;
 339   size_t ret;
 340
 341   if (!model)
 342     return nullptr;
 343
 344   fp = fopen (model, "rb");
 345   if (!fp) {
 346     logerr (TAG, "Failed to open the model binary: %d\n", -errno);
 347     return nullptr;
 348   }
 349
 350   meta = (npubin_meta *) malloc (NPUBIN_META_SIZE);
 351   if (!meta) {
 352     logerr (TAG, "Failed to allocate metadata\n");
 353     goto exit_err;
 354   }
 355
 356   ret = fread (meta, 1, NPUBIN_META_SIZE, fp);
 357   if (ret != NPUBIN_META_SIZE) {
 358     logerr (TAG, "Failed to read the metadata\n");
 359     goto exit_free;
 360   }
 361
 362   if (!CHECK_NPUBIN (meta->magiccode)) {
 363     logerr (TAG, "Invalid metadata provided\n");
 364     goto exit_free;
 365   }
 366
 367   if (need_extra && NPUBIN_META_EXTRA (meta->magiccode) > 0) {
 368     npubin_meta *new_meta;
 369
 370     new_meta = (npubin_meta *) realloc (meta, NPUBIN_META_TOTAL_SIZE(meta->magiccode));
 371     if (!new_meta) {
 372       logerr (TAG, "Failed to allocate extra metadata\n");
 373       goto exit_free;
 374     }
 375
 376     ret = fread (new_meta->reserved_extra, 1, NPUBIN_META_EXTRA_SIZE (meta->magiccode), fp);
 377     if (ret != NPUBIN_META_EXTRA_SIZE (meta->magiccode)) {
 378       logerr (TAG, "Invalid extra metadata provided\n");
 379       free (new_meta);
 380       goto exit_err;
 381     }
 382
 383     meta = new_meta;
 384   }
 385
 386   fclose (fp);
 387
 388   return meta;
 389
 390 exit_free:
 391   free (meta);
 392 exit_err:
 393   fclose (fp);
 394
 395   return nullptr;
 396 }
 397
 398 /** implement methods of HostHandler class */
 399
 400 /** @brief host handler constructor */
 401 HostHandler::HostHandler (Device *device)
 402   : device_(device),
 403     /* ignored as we don't use double buffering anymore, but for backward-compatibility */
 404     async_mode_ (NPUASYNC_WAIT)
 405 {
 406 }
 407
 408 /** @brief host handler destructor */
 409 HostHandler::~HostHandler ()
 410 {
 411 }
 412
 413 /**
 414  * @brief register model from generic buffer
 415  * @param[in] model_buf model buffer
 416  * @param[out] modelid model id
 417  * @return 0 if no error. otherwise a negative errno
 418  */
 419 int
 420 HostHandler::registerModel (generic_buffer *model_buf, uint32_t *modelid)
 421 {
 422   if (model_buf == nullptr || modelid == nullptr) {
 423     logerr (TAG, "Invalid arguments given\n");
 424     return -EINVAL;
 425   }
 426
 427   Model *model = nullptr;
 428   int status = device_->setModel (model_buf, &model);
 429   if (status != 0) {
 430     logerr (TAG, "Failed to set model: %d\n", status);
 431     return status;
 432   }
 433
 434   assert (model != nullptr);
 435
 436   status = models_.insert (model->getID(), model);
 437   if (status != 0) {
 438     logerr (TAG, "Failed to insert model id\n");
 439     delete model;
 440     return status;
 441   }
 442
 443   *modelid = model->getID();
 444   return 0;
 445 }
 446
 447 /**
 448  * @brief remove the registered model
 449  * @param[in] modelid model id
 450  * @return 0 if no error. otherwise a negative errno
 451  */
 452 int
 453 HostHandler::unregisterModel (uint32_t modelid)
 454 {
 455   Model *model = models_.find (modelid);
 456   if (model == nullptr)
 457     return -ENOENT;
 458
 459   int status = device_->unsetModel (model);
 460   if (status != 0) {
 461     logerr (TAG, "Failed to unset model: %d\n", status);
 462     return status;
 463   }
 464
 465   return models_.remove (modelid);
 466 }
 467
 468 /**
 469  * @brief remove all registered models
 470  * @return 0
 471  */
 472 int
 473 HostHandler::unregisterModels ()
 474 {
 475   models_.clear ();
 476   return 0;
 477 }
 478
 479 /**
 480  * @brief Set the data layout for input/output tensors
 481  * @param[in] modelid The ID of model whose layouts are set
 482  * @param[in] in the layout/type info for input tensors
 483  * @param[in] out the layout/type info for output tensors
 484  * @return @c 0 if no error. otherwise a negative error value
 485  * @note if this function is not called, default layout/type will be used.
 486  */
 487 int
 488 HostHandler::setDataInfo (uint32_t modelid, tensors_data_info *in,
 489     tensors_data_info *out)
 490 {
 491   Model *model = models_.find (modelid);
 492   if (model == nullptr)
 493     return -ENOENT;
 494
 495   return model->setDataInfo (in, out);
 496 }
 497
 498 /**
 499  * @brief Set the inference constraint for next NPU inferences
 500  * @param[in] modelid The target model id
 501  * @param[in] constraint inference constraint (e.g., timeout, priority)
 502  * @return @c 0 if no error. otherwise a negative error value
 503  * @note If this function is not called, default values are used.
 504  */
 505 int
 506 HostHandler::setConstraint (uint32_t modelid, npuConstraint constraint)
 507 {
 508   Model *model = models_.find (modelid);
 509   if (model == nullptr)
 510     return -ENOENT;
 511
 512   model->setConstraint (constraint);
 513
 514   return 0;
 515 }
 516
 517 /**
 518  * @brief find and return model instance
 519  * @param[in] modelid model id
 520  * @return model instance if found. otherwise nullptr
 521  */
 522 Model *
 523 HostHandler::getModel (uint32_t modelid)
 524 {
 525   return models_.find (modelid);
 526 }
 527
 528 /** @brief dummay callback for runSync. */
 529 class callbackSync {
 530   public:
 531     callbackSync (output_buffers *output) : output_(output), done_(false) {}
 532
 533     static void callback (output_buffers *output, uint64_t sequence, void *data) {
 534       callbackSync *sync = static_cast<callbackSync *>(data);
 535       sync->callback (output, sequence);
 536     }
 537
 538     void callback (output_buffers *output, uint64_t sequence) {
 539       if (output_ != nullptr) {
 540         /** just copy internal variables of output buffers */
 541         memcpy (output_, output, sizeof (output_buffers));
 542       }
 543       done_ = true;
 544       cv_.notify_one ();
 545     }
 546
 547     void wait () {
 548       std::unique_lock<std::mutex> lock (m_);
 549       cv_.wait (lock, [this]() { return done_; });
 550     }
 551
 552   private:
 553     std::mutex m_;
 554     std::condition_variable cv_;
 555     output_buffers *output_;
 556     bool done_;
 557 };
 558
 559 /**
 560  * @brief Execute inference. Wait (block) until the output is available.
 561  * @param[in] modelid The model to be inferred.
 562  * @param[in] input The input data to be inferred.
 563  * @param[out] output The output result.
 564  * @return @c 0 if no error. otherwise a negative error value
 565  */
 566 int
 567 HostHandler::runSync (uint32_t modelid, const input_buffers *input,
 568     output_buffers *output)
 569 {
 570   callbackSync sync (output);
 571   int status = runAsync (modelid, input, callbackSync::callback,
 572       static_cast <void*> (&sync), NPUASYNC_DROP_OLD, nullptr);
 573   if (status == 0) {
 574     /** sync needs to wait callback */
 575     sync.wait ();
 576   }
 577   return status;
 578 }
 579
 580 /**
 581  * @brief Invoke NPU inference. Unblocking call.
 582  * @param[in] modelid The model to be inferred.
 583  * @param[in] input The input data to be inferred.
 584  * @param[in] cb The output buffer handler.
 585  * @param[in] cb_data The data given as a parameter to the runNPU_async call.
 586  * @param[in] mode Configures how this operation works.
 587  * @param[out] sequence The sequence number returned with runNPU_async.
 588  * @return @c 0 if no error. otherwise a negative error value
 589  */
 590 int
 591 HostHandler::runAsync (uint32_t modelid, const input_buffers *input,
 592     npuOutputNotify cb, void *cb_data, npu_async_mode mode, uint64_t *sequence)
 593 {
 594   Model *model = nullptr;
 595
 596   if (device_->needModel()) {
 597     model = getModel (modelid);
 598     if (model == nullptr)
 599       return -ENOENT;
 600   }
 601
 602   /* check the given model before running */
 603   if (!model->finalize ()) {
 604     logerr (TAG, "Failed to finalize the model. Please see the log messages\n");
 605     return -EINVAL;
 606   }
 607
 608   device_->setAsyncMode (mode);
 609   return device_->run (NPUINPUT_HOST, model, input, cb, cb_data, sequence);
 610 }
 611
 612 /**
 613  * @brief get number of available devices
 614  * @param[in] type device type
 615  * @return number of devices
 616  */
 617 int
 618 HostHandler::getNumDevices (dev_type type)
 619 {
 620   return DriverAPI::getNumDevices (type);
 621 }
 622
 623 /**
 624  * @brief get device instance
 625  * @param[out] dev device instance
 626  * @param[in] type device type
 627  * @param[in] id device id
 628  * @return 0 if no error. otherwise a negative errno
 629  */
 630 int
 631 HostHandler::getDevice (npudev_h *dev, dev_type type, uint32_t id)
 632 {
 633   int num_devices = getNumDevices (type);
 634
 635   /** check the validity of device id */
 636   if (!(num_devices > 0 && id < static_cast<uint32_t>(num_devices))) {
 637     logerr (TAG, "Invalid arguments provided\n");
 638     return -ENODEV;
 639   }
 640
 641   Device *device = Device::createInstance (type, id);
 642   if (device == nullptr) {
 643     logerr (TAG, "Failed to create a device with the given type\n");
 644     return -EINVAL;
 645   }
 646
 647   *dev = device;
 648   /** This is just for backward-compatility; we don't guarantee its corresness */
 649   latest_dev_ = *dev;
 650
 651   return 0;
 652 }
 653
 654 /**
 655  * @brief allocate generic buffer (just for users)
 656  * @param[out] buffer buffer instance
 657  * @return 0 if no error. otherwise a negative errno
 658  */
 659 int
 660 HostHandler::allocGenericBuffer (generic_buffer *buffer)
 661 {
 662   if (buffer == NULL)
 663     return -EINVAL;
 664
 665   if (buffer->size == 0) {
 666     logerr (TAG, "Invalid size\n");
 667     return -EINVAL;
 668   }
 669
 670   if (buffer->size > UINT32_MAX) {
 671     logerr (TAG, "Don't support such a large size");
 672     return -ENOMEM;
 673   }
 674
 675   switch (buffer->type) {
 676     case BUFFER_FILE:
 677       /* nothing to do */
 678       if (buffer->filepath == nullptr)
 679         return -EINVAL;
 680       break;
 681     case BUFFER_MAPPED:
 682     {
 683       /* now, npu-engine always provides dmabuf-based allocation */
 684       void *addr = nullptr;
 685       int dmabuf = device_->allocMemory (buffer->size, &addr);
 686       if (dmabuf < 0)
 687         return dmabuf;
 688
 689       buffer->dmabuf = dmabuf;
 690       buffer->offset = 0;
 691       buffer->addr = addr;
 692     } break;
 693     default:
 694       return -EINVAL;
 695   }
 696
 697   return 0;
 698 }
 699
 700 /**
 701  * @brief deallocate generic buffer (just for users)
 702  * @param[in] buffer buffer instance
 703  * @return 0 if no error. otherwise a negative errno
 704  */
 705 int
 706 HostHandler::deallocGenericBuffer (generic_buffer *buffer)
 707 {
 708   if (buffer == NULL)
 709     return -EINVAL;
 710
 711   switch (buffer->type) {
 712     case BUFFER_FILE:
 713       /** always true cuz nothing to do */
 714       break;
 715     case BUFFER_MAPPED:
 716       return device_->deallocMemory (buffer->dmabuf, buffer->size, buffer->addr);
 717     default:
 718       return -EINVAL;
 719   }
 720
 721   return 0;
 722 }
 723
 724 /**
 725  * @brief allocate multiple generic buffers (just for users)
 726  * @param[out] buffers multi-buffer instance
 727  * @return 0 if no error. otherwise a negative errno
 728  */
 729 int
 730 HostHandler::allocGenericBuffer (generic_buffers *buffers)
 731 {
 732   uint32_t idx;
 733   int status = 0;
 734
 735   if (buffers == NULL || buffers->num_buffers < 1)
 736     return -EINVAL;
 737
 738   for (idx = 0; idx < buffers->num_buffers; idx++) {
 739     status = allocGenericBuffer (&buffers->bufs[idx]);
 740     if (status != 0)
 741       goto free_buffer;
 742   }
 743
 744   return 0;
 745
 746 free_buffer:
 747   for (idx = idx - 1; idx >= 0; idx--) {
 748     deallocGenericBuffer (&buffers->bufs[idx]);
 749   }
 750
 751   return status;
 752 }
 753
 754 /**
 755  * @brief deallocate multiple generic buffers (just for users)
 756  * @param[in] buffers multi-buffer instance
 757  * @return 0 if no error. otherwise a negative errno
 758  */
 759 int
 760 HostHandler::deallocGenericBuffer (generic_buffers *buffers)
 761 {
 762   if (buffers == NULL || buffers->num_buffers < 1)
 763     return -EINVAL;
 764
 765   for (uint32_t idx = 0; idx < buffers->num_buffers; idx++)
 766     deallocGenericBuffer (&buffers->bufs[idx]);
 767   buffers->num_buffers = 0;
 768
 769   return 0;
 770 }
 771
 772 /**
 773  * @brief get the current memory status
 774  * @param[out] alloc_total The size of allocated memory until now
 775  * @param[out] free_total The size of freed memory until now
 776  * @return 0 if no error. otherwise a negatice error value
 777  */
 778 int
 779 HostHandler::getMemoryStatus (size_t *alloc_total, size_t *free_total)
 780 {
 781   /** API is always set in initialize () */
 782   const DriverAPI * api = device_->getDriverAPI ();
 783   assert (api != nullptr);
 784
 785   return api->getMemoryStatus (alloc_total, free_total);
 786 }
 787
 788 /**
 789  * @brief Get the current device status to be used
 790  * @param[out] status the device status
 791  * @param[out] num_requests the number of running requests (or pending)
 792  * @return 0 if no error, otherwise a negative errno.
 793  */
 794 int
 795 HostHandler::getDeviceStatus (npu_status *status, uint32_t *num_requests)
 796 {
 797   /** API is always set in initialize () */
 798   const DriverAPI * api = device_->getDriverAPI ();
 799   assert (api != nullptr);
 800
 801   device_state_t state = api->isReady ();
 802   if (state == device_state_t::STATE_READY) {
 803     *num_requests = api->numRequests ();
 804     if (*num_requests > 0)
 805       *status = NPU_READY;
 806     else
 807       *status = NPU_IDLE;
 808   } else {
 809     *num_requests = 0;
 810     *status = NPU_ERROR;
 811   }
 812
 813   return 0;
 814 }
 815
 816 /** implement methods of Device class */
 817
 818 /** @brief constructor of device */
 819 Device::Device (dev_type type, int id, bool need_model)
 820   : comm_ (CommPlugin::getCommPlugin()), type_ (type), id_ (id), need_model_ (true),
 821     mode_ (NPUASYNC_WAIT), initialized_ (false), atomic_flag_ (ATOMIC_FLAG_INIT)
 822 {
 823 }
 824
 825 /**
 826  * @brief create device instance depending on device type and id
 827  * @param[in] type device type
 828  * @param[in] id device id
 829  * @return device instance
 830  */
 831 Device *
 832 Device::createInstance (dev_type type, int id)
 833 {
 834   Device *device = nullptr;
 835
 836   switch (type & DEVICETYPE_MASK) {
 837     case DEVICETYPE_TRIV:
 838       device = new TrinityVision (id);
 839       break;
 840     case DEVICETYPE_TRIV2:
 841       device = new TrinityVision2 (id);
 842       break;
 843     case DEVICETYPE_TRIA:
 844       device = new TrinityAsr (id);
 845       break;
 846     default:
 847       break;
 848   }
 849
 850   if (device != nullptr && device->init () != 0) {
 851     delete device;
 852     device = nullptr;
 853   }
 854
 855   return device;
 856 }
 857
 858 /**
 859  * @brief device initialization
 860  * @return 0 if no error, otherwise a negative errno
 861  * @note Init failures come from createDriverAPI() only.
 862  */
 863 int
 864 Device::init ()
 865 {
 866   /** should be initilizaed only once */
 867   if (!atomic_flag_.test_and_set()) {
 868     /** create the corresponding driver API */
 869     api_ = DriverAPI::createDriverAPI (type_, id_);
 870     if (api_.get() == nullptr) {
 871       atomic_flag_.clear();
 872       logerr (TAG, "Failed to create driver API\n");
 873       return -EINVAL;
 874     }
 875
 876     handler_.reset (new HostHandler (this));
 877     scheduler_.reset (new Scheduler (api_.get()));
 878     mem_ = MemAllocator::createInstance (api_.get());
 879
 880     initialized_ = true;  /** c++11 does not provide test() of atomic flag */
 881   }
 882
 883   return 0;
 884 }
 885
 886 /**
 887  * @brief stop all requests from this device
 888  * @param[in] force_stop indicate the schedduler waits until to handle previous requests
 889  * @return 0 if no error, otherwise a negative errno
 890  */
 891 int
 892 Device::stop (bool force_stop)
 893 {
 894   if (!initialized ()) {
 895     logerr (TAG, "Uninitialized device; should use libnpuhost APIs\n");
 896     return -EPERM;
 897   }
 898
 899   Request *req = new Request (NPUINPUT_STOP);
 900   req->setForceStop (force_stop);
 901   return scheduler_->submitRequest (req);
 902 }
 903
 904 /**
 905  * @brief allocate generic memory buffer
 906  * @param[in] size the size to allocate
 907  * @param[out] addr the mapped address
 908  * @return dmabuf fd if no error, otherwise a negative errno
 909  */
 910 int
 911 Device::allocMemory (size_t size, void **addr)
 912 {
 913   if (!initialized ()) {
 914     logerr (TAG, "Uninitialized device; should use libnpuhost APIs\n");
 915     return -EPERM;
 916   }
 917
 918   if (size == 0 || addr == nullptr) {
 919     logerr (TAG, "Invalid arguments\n");
 920     return -EINVAL;
 921   }
 922
 923   return mem_->allocMemory (size, addr);
 924 }
 925
 926 /**
 927  * @brief deallocate generic memory buffer
 928  * @param[in] dmabuf_fd dmabuf file descriptor
 929  * @param[in] size buffer size
 930  * @param[in] addr mapped addr
 931  * @return 0 if no error, otherwise a negative errno
 932  */
 933 int
 934 Device::deallocMemory (int dmabuf_fd, size_t size, void * addr)
 935 {
 936   if (!initialized ()) {
 937     logerr (TAG, "Uninitialized device; should use libnpuhost APIs\n");
 938     return -EPERM;
 939   }
 940
 941   if (dmabuf_fd < 0 || size == 0 || addr == nullptr) {
 942     logerr (TAG, "Invalid arguments\n");
 943     return -EINVAL;
 944   }
 945
 946   return mem_->deallocMemory (dmabuf_fd, size, addr);
 947 }
 948
 949 /**
 950  * @brief extract the buffer instance from input generic buffers
 951  * @param[in] meta the model metadata
 952  * @param[in] input the input generic buffers
 953  * @return the buffer instance
 954  */
 955 Buffer *
 956 TrinityVision::prepareInputBuffers (const Metadata *meta, const input_buffers *input)
 957 {
 958   if (meta == nullptr || input == nullptr ||
 959       meta->getInputNum() != input->num_buffers) {
 960     logerr (TAG, "Invalid metadata info provided\n");
 961     return nullptr;
 962   }
 963
 964   Buffer * buffer;
 965   const generic_buffer *first = &input->bufs[0];
 966   if (first->type == BUFFER_DMABUF) {
 967     buffer = mem_->allocBuffer (new HWmemExternal);
 968     if (buffer == nullptr)
 969       return nullptr;
 970
 971     buffer->setDmabuf (first->dmabuf);
 972     buffer->setOffset (first->offset);
 973     buffer->setSize (meta->getBufferSize());
 974   } else {
 975     buffer = mem_->allocBuffer (new HWmemDevice);
 976     if (buffer == nullptr)
 977       return nullptr;
 978
 979     int status = buffer->alloc (meta->getBufferSize ());
 980     if (status != 0) {
 981       logerr (TAG, "Failed to allocate buffer: %d\n", status);
 982       delete buffer;
 983       return nullptr;
 984     }
 985   }
 986
 987   int status = buffer->createTensors (meta);
 988   if (status != 0) {
 989     logerr (TAG, "Failed to create tensors: %d\n", status);
 990     delete buffer;
 991     buffer = nullptr;
 992   }
 993
 994   return buffer;
 995 }
 996
 997 /**
 998  * @brief implementation of TRIV's setModel ()
 999  * @param[in] model_buf the model generic buffer
1000  * @param[out] model the model instance
1001  * @return 0 if no error, otherwise a negative errno
1002  */
1003 int
1004 TrinityVision::setModel (const generic_buffer *model_buf, Model ** model_ptr)
1005 {
1006   if (!initialized ()) {
1007     logerr (TAG, "Uninitialized device; should use libnpuhost APIs\n");
1008     return -EPERM;
1009   }
1010
1011   if (model_buf == nullptr || model_ptr == nullptr)
1012     return -EINVAL;
1013
1014   Model *model = nullptr;
1015   HWmem * hwmem_prog = nullptr;
1016   HWmem * hwmem_weight = nullptr;
1017   int status;
1018
1019   /** In TRIV1, model data (including program/weight) should be contiguous */
1020
1021   switch (model_buf->type) {
1022   case BUFFER_FILE:
1023   case BUFFER_MAPPED:
1024     model = mem_->allocModel (new HWmemDevice);
1025     if (model == nullptr) {
1026       logerr (TAG, "Failed to allocate model\n");
1027       return -ENOMEM;
1028     }
1029
1030     status = model->alloc (model_buf->size);
1031     if (status != 0) {
1032       logerr (TAG, "Failed to allocate model: %d\n", status);
1033       goto delete_exit;
1034     }
1035
1036     /** extract the whole model data */
1037     status = comm_.extractGenericBuffer (model_buf, model->getData(), nullptr);
1038     if (status != 0) {
1039       logerr (TAG, "Failed to extract generic buffer: %d\n", status);
1040       goto delete_exit;
1041     }
1042     break;
1043   default:
1044     return -EINVAL;
1045   }
1046
1047   status = model->setMetadata (model->getData());
1048   if (status != 0)
1049     goto delete_exit;
1050
1051   /** allocate program (optional; NOP) */
1052   if (model->getMetadata()->getProgramSize() > 0) {
1053     hwmem_prog = new HWmem (new HWmemChunk);
1054     model->setProgramData (hwmem_prog);
1055
1056     hwmem_prog->setParent (model);
1057     hwmem_prog->setOffset (model->getMetadata()->getMetaSize());
1058     status = hwmem_prog->alloc (model->getMetadata()->getProgramSize());
1059     if (status != 0) {
1060       logerr (TAG, "Failed to allocate program\n");
1061       goto delete_exit;
1062     }
1063   }
1064
1065   /** allocate weight (optional) */
1066   if (model->getMetadata()->getWeightSize() > 0) {
1067     hwmem_weight = new HWmem (new HWmemChunk);
1068     model->setWeightData (hwmem_weight);
1069
1070     hwmem_weight->setParent (model);
1071     hwmem_weight->setOffset (model->getMetadata()->getMetaSize() +
1072         model->getMetadata()->getProgramSize());
1073     status = hwmem_weight->alloc (model->getMetadata()->getWeightSize());
1074     if (status != 0) {
1075       logerr (TAG, "Failed to allocate program\n");
1076       goto delete_exit;
1077     }
1078   }
1079
1080   if (hwmem_prog != nullptr) {
1081     /** register this model to the driver */
1082     model_config_t config;
1083     config.dbuf_fd = hwmem_prog->getDmabuf ();
1084     config.program_size = hwmem_prog->getSize ();
1085     config.program_offset_addr = hwmem_prog->getOffset ();
1086     if (hwmem_weight != nullptr)
1087       config.weight_offset_addr = hwmem_weight->getOffset ();
1088
1089     status = api_->registerModel (&config);
1090     if (status != 0)
1091       goto delete_exit;
1092
1093     model->setInternalID(config.id);
1094   }
1095
1096   *model_ptr = model;
1097   return status;
1098
1099 delete_exit:
1100   delete model;
1101   return status;
1102 }
1103
1104 /**
1105  * @brief implementation of TRIV's unsetModel ()
1106  * @param[in] model the model instance
1107  * @return 0 if no error, otherwise a negative errno
1108  */
1109 int
1110 TrinityVision::unsetModel (Model * model)
1111 {
1112   if (!initialized ()) {
1113     logerr (TAG, "Uninitialized device; should use libnpuhost APIs\n");
1114     return -EPERM;
1115   }
1116
1117   if (model == nullptr) {
1118     logerr (TAG, "Invalid model instance\n");
1119     return -EINVAL;
1120   }
1121
1122   if (model->getMetadata()->getProgramSize() > 0)
1123     return api_->deregisterModel (model->getInternalID ());
1124
1125   return 0;
1126 }
1127
1128 /**
1129  * @brief implementation of TRIV's run()
1130  * @param[in] opmode input opmode
1131  * @param[in] model the model instance
1132  * @param[in] input generic buffers of input data
1133  * @param[in] cb the output callback
1134  * @param[in] cb_data the output callback data
1135  * @param[out] sequence The sequence number returned with runNPU_async.
1136  */
1137 int
1138 TrinityVision::run (npu_input_opmode opmode, const Model *model,
1139     const input_buffers *input, npuOutputNotify cb, void *cb_data,
1140     uint64_t *sequence)
1141 {
1142   if (!initialized ()) {
1143     logerr (TAG, "Uninitialized device; should use libnpuhost APIs\n");
1144     return -EPERM;
1145   }
1146
1147   if (opmode != NPUINPUT_HOST) {
1148     logerr (TAG, "TRIV supports only host inputservice\n");
1149     return -EINVAL;
1150   }
1151
1152   if (model == nullptr || input == nullptr) {
1153     logerr (TAG, "TRIV requires both model and input buffers\n");
1154     return -EINVAL;
1155   }
1156
1157   Buffer *buffer = prepareInputBuffers (model->getMetadata(), input);
1158   if (buffer == nullptr) {
1159     logerr (TAG, "Failed to extract buffer instance\n");
1160     return -EINVAL;
1161   }
1162
1163   if (!buffer->isExternal ()) {
1164     for (uint32_t idx = 0; idx < input->num_buffers; idx++) {
1165       auto func = std::bind (TrinityVision::manipulateData, model, idx, true,
1166           std::placeholders::_1, std::placeholders::_2, std::placeholders::_3);
1167       int status = comm_.extractGenericBuffer (&input->bufs[idx],
1168           buffer->getInputTensor(idx)->getData(), func);
1169       if (status != 0) {
1170         logerr (TAG, "Failed to feed input buffer: %d\n", status);
1171         return status;
1172       }
1173     }
1174   }
1175
1176   /** this device uses CMA buffer */
1177
1178   Request *req = new Request (opmode);
1179   req->setModel (model);
1180   req->setBuffer (buffer);
1181
1182   if (cb != nullptr)
1183     req->setCallback (std::bind (&TrinityVision::callback, this, req, cb, cb_data));
1184
1185   if (sequence != nullptr)
1186     *sequence = req->getID();
1187
1188   return scheduler_->submitRequest (req);
1189 }
1190
1191 /**
1192  * @brief callback of TRIV2 request
1193  * @param[in] req the request instance
1194  * @param[in] cb callback for completion
1195  * @param[in] cb_data callback data
1196  * @note The callback invoke does not gurantee the request was successful
1197  * @todo Check the request failures
1198  */
1199 void
1200 TrinityVision::callback (Request *req, npuOutputNotify cb, void *cb_data)
1201 {
1202   const Model *model = req->getModel ();
1203   Buffer *buffer = req->getBuffer ();
1204   output_buffers output = {
1205     .num_buffers = buffer->getOutputNum ()
1206   };
1207
1208   for (uint32_t idx = 0; idx < output.num_buffers; idx++) {
1209     uint32_t output_tensor_size = model->getOutputTensorSize (idx);
1210
1211     if (buffer->isExternal ()) {
1212       output.bufs[idx].type = BUFFER_DMABUF;
1213       output.bufs[idx].size = output_tensor_size;
1214       output.bufs[idx].addr = buffer->getOutputTensor(idx)->getData();
1215     } else {
1216       output.bufs[idx].type = BUFFER_MAPPED;
1217       output.bufs[idx].size = output_tensor_size;
1218       /** user needs to free this */
1219       output.bufs[idx].addr = malloc (output_tensor_size);
1220
1221       auto func = std::bind (TrinityVision::manipulateData, model, idx, false,
1222           std::placeholders::_1, std::placeholders::_2, std::placeholders::_3);
1223       int status = comm_.insertGenericBuffer (buffer->getOutputTensor(idx)->getData(),
1224           &output.bufs[idx], func);
1225       if (status != 0) {
1226         logerr (TAG, "Failed to return output buffer: %d\n", status);
1227       }
1228     }
1229   }
1230
1231   cb (&output, req->getID(), cb_data);
1232
1233   delete buffer;
1234 }
1235
1236 /**
1237  * @brief extract the segment table instance from input generic buffers
1238  * @param[in] model the model instance
1239  * @param[in] input the input generic buffers
1240  * @return the segment table instance
1241  */
1242 SegmentTable *
1243 TrinityVision2::prepareSegmentTable (const Model *model, const input_buffers *input)
1244 {
1245   if (model == nullptr || input == nullptr) {
1246     logerr (TAG, "Invalid arguments provided\n");
1247     return nullptr;
1248   }
1249
1250   const Metadata *meta = model->getMetadata ();
1251   if (meta == nullptr ||
1252       meta->getInputNum() != input->num_buffers) {
1253     logerr (TAG, "Invalid metadata info provided\n");
1254     return nullptr;
1255   }
1256
1257   SegmentTable * segt = mem_->allocSegmentTable (new HWmemDevice);
1258   int status = segt->alloc ();
1259   if (status != 0) {
1260     logerr (TAG, "Failed to allocate segment table: %d\n", status);
1261     goto delete_segt;
1262   }
1263
1264   status = segt->createSegments (model, input);
1265   if (status != 0) {
1266     logerr (TAG, "Failed to create segments: %d\n", status);
1267     goto delete_segt;
1268   }
1269
1270   return segt;
1271
1272 delete_segt:
1273   delete segt;
1274   return nullptr;
1275 }
1276
1277 /**
1278  * @brief implementation of TRIV2's setModel ()
1279  * @param[in] model_buf the model generic buffer
1280  * @param[out] model the model instance
1281  * @return 0 if no error, otherwise a negative errno
1282  */
1283 int
1284 TrinityVision2::setModel (const generic_buffer *model_buf, Model ** model_ptr)
1285 {
1286   if (!initialized ()) {
1287     logerr (TAG, "Uninitialized device; should use libnpuhost APIs\n");
1288     return -EPERM;
1289   }
1290
1291   if (model_buf == nullptr || model_ptr == nullptr)
1292     return -EINVAL;
1293
1294   Model *model;
1295   int status;
1296
1297   switch (model_buf->type) {
1298   case BUFFER_FILE:
1299   case BUFFER_MAPPED:
1300     model = mem_->allocModel (new HWmemDevice);
1301     if (model == nullptr) {
1302       logerr (TAG, "Failed to allocate model\n");
1303       return -ENOMEM;
1304     }
1305
1306     status = model->alloc (NPUBIN_META_SIZE);
1307     if (status != 0) {
1308       logerr (TAG, "Failed to allocate model: %d\n", status);
1309       goto delete_exit;
1310     }
1311
1312     status = comm_.extractGenericBuffer (model_buf, model->getData(), nullptr,
1313         0, NPUBIN_META_SIZE);
1314     if (status != 0) {
1315       logerr (TAG, "Failed to extract generic buffer: %d\n", status);
1316       goto delete_exit;
1317     }
1318     break;
1319   default:
1320     return -EINVAL;
1321   }
1322
1323   status = model->setMetadata (model->getData());
1324   if (status != 0)
1325     goto delete_exit;
1326
1327   /** allocate program (optional; NOP) */
1328   if (model->getMetadata()->getProgramSize() > 0) {
1329     HWmem * hwmem_prog = new HWmem (new HWmemDevice);
1330     hwmem_prog->setDriverAPI (api_.get());
1331
1332     model->setProgramData (hwmem_prog);
1333
1334     status = hwmem_prog->alloc (model->getMetadata()->getProgramSize());
1335     if (status != 0) {
1336       logerr (TAG, "Failed to allocate program\n");
1337       goto delete_exit;
1338     }
1339
1340     status = comm_.extractGenericBuffer (model_buf, hwmem_prog->getData(), nullptr,
1341         model->getMetadata()->getMetaSize(),
1342         model->getMetadata()->getProgramSize());
1343     if (status != 0) {
1344       logerr (TAG, "Failed to extract generic buffer: %d\n", status);
1345       goto delete_exit;
1346     }
1347
1348     /** register this model to the driver */
1349     model_config_t config;
1350     config.dbuf_fd = hwmem_prog->getDmabuf ();
1351     config.program_size = hwmem_prog->getSize ();
1352     config.program_offset_addr = 0;
1353
1354     status = api_->registerModel (&config);
1355     if (status != 0)
1356       goto delete_exit;
1357
1358     model->setInternalID(config.id);
1359   }
1360
1361   /** allocate weight (optional) */
1362   if (model->getMetadata()->getWeightSize() > 0) {
1363     HWmem * hwmem_weight = new HWmem (new HWmemDevice);
1364     hwmem_weight->setDriverAPI (api_.get());
1365
1366     model->setWeightData (hwmem_weight);
1367
1368     status = hwmem_weight->alloc (model->getMetadata()->getWeightSize());
1369     if (status != 0) {
1370       logerr (TAG, "Failed to allocate program\n");
1371       goto delete_exit;
1372     }
1373
1374     status = comm_.extractGenericBuffer (model_buf, hwmem_weight->getData(), nullptr,
1375         model->getMetadata()->getMetaSize() + model->getMetadata()->getProgramSize(),
1376         model->getMetadata()->getWeightSize());
1377     if (status != 0) {
1378       logerr (TAG, "Failed to extract generic buffer: %d\n", status);
1379       goto delete_exit;
1380     }
1381   }
1382
1383   *model_ptr = model;
1384   return status;
1385
1386 delete_exit:
1387   delete model;
1388   return status;
1389 }
1390
1391 /**
1392  * @brief implementation of TRIV2's unsetModel ()
1393  * @param[in] model the model instance
1394  * @return 0 if no error, otherwise a negative errno
1395  */
1396 int
1397 TrinityVision2::unsetModel (Model * model)
1398 {
1399   if (!initialized ()) {
1400     logerr (TAG, "Uninitialized device; should use libnpuhost APIs\n");
1401     return -EPERM;
1402   }
1403
1404   if (model == nullptr) {
1405     logerr (TAG, "Invalid model instance\n");
1406     return -EINVAL;
1407   }
1408
1409   if (model->getMetadata()->getProgramSize() > 0)
1410     return api_->deregisterModel (model->getInternalID ());
1411
1412   return 0;
1413 }
1414
1415 /** @brief implementation of TRIV2's run() */
1416 int
1417 TrinityVision2::run (npu_input_opmode opmode, const Model *model,
1418     const input_buffers *input, npuOutputNotify cb, void *cb_data,
1419     uint64_t *sequence)
1420 {
1421   if (!initialized ()) {
1422     logerr (TAG, "Uninitialized device; should use libnpuhost APIs\n");
1423     return -EPERM;
1424   }
1425
1426   if (opmode != NPUINPUT_HOST && opmode != NPUINPUT_HW_RECURRING)
1427     return -EINVAL;
1428
1429   /** this device uses segment table */
1430   SegmentTable * segt = prepareSegmentTable (model, input);
1431   if (segt == nullptr) {
1432     logerr (TAG, "Failed to create segment table instance\n");
1433     return -EINVAL;
1434   }
1435
1436   /** extract input data */
1437   for (uint32_t idx = 0; idx < input->num_buffers; idx++) {
1438     size_t max_seg_size = segt->getInputSegment(idx)->getSize();
1439     uint32_t seg_offset = segt->getInputSegmentOffset(idx);
1440
1441     if (input->bufs[idx].size + seg_offset > max_seg_size) {
1442       logerr (TAG, "Too large input data provided: max segment size (%zu)\n",
1443           max_seg_size);
1444       return -ERANGE;
1445     }
1446
1447     if (!segt->getInputSegment(idx)->isExternal ()) {
1448       auto func = std::bind (TrinityVision2::manipulateData, model, idx, true,
1449           std::placeholders::_1, std::placeholders::_2, std::placeholders::_3);
1450       int status = comm_.extractGenericBuffer (
1451           &input->bufs[idx],
1452           segt->getInputSegment(idx)->getData() + seg_offset,
1453           func);
1454       if (status != 0) {
1455         logerr (TAG, "Failed to feed input segment: %d\n", status);
1456         return status;
1457       }
1458     }
1459   }
1460
1461   Request *req = new Request (opmode);
1462   req->setModel (model);
1463   req->setSegmentTable (segt);
1464   req->setCallback (std::bind (&TrinityVision2::callback, this, req, cb, cb_data));
1465
1466   if (sequence)
1467     *sequence = req->getID();
1468
1469   return scheduler_->submitRequest (req);
1470 }
1471
1472 /** @brief callback of TRIV2 request */
1473 void
1474 TrinityVision2::callback (Request *req, npuOutputNotify cb, void *cb_data)
1475 {
1476   const Model *model = req->getModel ();
1477   SegmentTable *segt = req->getSegmentTable ();
1478   output_buffers output = {
1479     .num_buffers = segt->getNumOutputSegments ()
1480   };
1481
1482   for (uint32_t idx = 0; idx < output.num_buffers; idx++) {
1483     uint32_t output_tensor_size = model->getOutputTensorSize (idx);
1484
1485     output.bufs[idx].type = BUFFER_MAPPED;
1486     output.bufs[idx].size = output_tensor_size;
1487     /** user needs to free this */
1488     output.bufs[idx].addr = calloc (1, output_tensor_size);
1489
1490     auto func = std::bind (TrinityVision2::manipulateData, model, idx, false,
1491         std::placeholders::_1, std::placeholders::_2, std::placeholders::_3);
1492     int status = comm_.insertGenericBuffer (
1493         segt->getOutputSegment(idx)->getData() + segt->getOutputSegmentOffset(idx),
1494         &output.bufs[idx], func);
1495
1496     if (status != 0) {
1497       logerr (TAG, "Failed to return output buffer: %d\n", status);
1498     }
1499   }
1500
1501   cb (&output, req->getID(), cb_data);
1502
1503   delete segt;
1504 }
1505
1506 /** @brief implementation of TRIA's run(): WIP */
1507 int
1508 TrinityAsr::run (npu_input_opmode opmode, const Model *model,
1509     const input_buffers *input, npuOutputNotify cb, void *cb_data,
1510     uint64_t *sequence)
1511 {
1512   if (!initialized ()) {
1513     logerr (TAG, "Uninitialized device; should use libnpuhost APIs\n");
1514     return -EPERM;
1515   }
1516
1517   if (opmode != NPUINPUT_HOST)
1518     return -EINVAL;
1519
1520   Buffer * buffer;
1521   int status;
1522   /** ASR does not require model and support only a single tensor */
1523   const generic_buffer *first_buf = &input->bufs[0];
1524   if (first_buf->type == BUFFER_DMABUF) {
1525     buffer = mem_->allocBuffer (new HWmemExternal);
1526     if (buffer == nullptr)
1527       return -ENOMEM;
1528
1529     buffer->setDmabuf (first_buf->dmabuf);
1530     buffer->setOffset (first_buf->offset);
1531     buffer->setSize (first_buf->size);
1532   } else {
1533     buffer = mem_->allocBuffer (new HWmemDevice);
1534     if (buffer == nullptr)
1535       return -ENOMEM;
1536
1537     status = buffer->alloc (first_buf->size);
1538     if (status != 0) {
1539       delete buffer;
1540       return status;
1541     }
1542   }
1543
1544   status = buffer->createTensors ();
1545   if (status != 0) {
1546     logerr (TAG, "Failed to create tensors: %d\n", status);
1547     delete buffer;
1548     return status;
1549   }
1550
1551   if (!buffer->isExternal ()) {
1552     status = comm_.extractGenericBuffer (first_buf,
1553         buffer->getInputTensor(0)->getData(), nullptr);
1554     if (status != 0)
1555       return status;
1556   }
1557
1558   Request *req = new Request (opmode);
1559   req->setBuffer (buffer);
1560   req->setCallback (std::bind (&TrinityAsr::callback, this, req, cb, cb_data));
1561
1562   if (sequence)
1563     *sequence = req->getID();
1564
1565   return scheduler_->submitRequest (req);
1566 }
1567
1568 /** @brief callback of TRIA request: WIP */
1569 void
1570 TrinityAsr::callback (Request *req, npuOutputNotify cb, void *cb_data)
1571 {
1572 }
1573
1574 /** Implement data manipulation (each device may have different impl.) */
1575
1576 #ifdef ENABLE_MANIP
1577
/**
 * @brief element-wise copy with (de)quantization; the body of memcpy_with_quant ()
 *
 * The macro reads these names from the expanding scope: quant, src, dst,
 * num_elems, _scale, _zero_point, data_size, val and idx.
 *
 * - quant == true : reads num_elems values of 'type' from src and writes
 *   (value / _scale + _zero_point), clamped to [0, 255], as uint8_t to dst.
 * - quant == false: reads num_elems uint8_t values from src and writes
 *   ((value - _zero_point) * _scale) as 'type' to dst. Here dst advances by
 *   data_size bytes and src by one byte per element, so both pointers are
 *   modified.
 */
#define do_quantized_memcpy(type) do {\
    idx = 0;\
    if (quant) {\
      while (idx < num_elems) {\
          val = ((const type *) src)[idx];\
          val = val / _scale;\
          val += _zero_point;\
          val = (val > 255.0) ? 255.0 : ((val < 0.0) ? 0.0 : val);\
          ((uint8_t *) dst)[idx++] = (uint8_t) val;\
      }\
    } else {\
      while (idx < num_elems) {\
          val = *(const uint8_t *) src;\
          val -= _zero_point;\
          val *= _scale;\
          *(type *) dst = (type) val;\
          dst = (void*)(((uint8_t *) dst) + data_size);\
          src = (const void*)(((const uint8_t *) src) + 1);\
          idx++;\
      }\
    }\
  } while (0)
1599
1600 /**
1601  * @brief memcpy during quantization
1602  */
1603 static void memcpy_with_quant (bool quant, data_type type, float scale, uint32_t zero_point,
1604     void *dst, const void *src, uint32_t num_elems)
1605 {
1606   double _scale = (double) scale;
1607   double _zero_point = (double) zero_point;
1608   double val;
1609   uint32_t data_size = get_data_size (type);
1610   uint32_t idx;
1611
1612   switch (type) {
1613     case DATA_TYPE_INT8:
1614       do_quantized_memcpy (int8_t);
1615       break;
1616     case DATA_TYPE_UINT8:
1617       do_quantized_memcpy (uint8_t);
1618       break;
1619     case DATA_TYPE_INT16:
1620       do_quantized_memcpy (int16_t);
1621       break;
1622     case DATA_TYPE_UINT16:
1623       do_quantized_memcpy (uint16_t);
1624       break;
1625     case DATA_TYPE_INT32:
1626       do_quantized_memcpy (int32_t);
1627       break;
1628     case DATA_TYPE_UINT32:
1629       do_quantized_memcpy (uint32_t);
1630       break;
1631     case DATA_TYPE_INT64:
1632       do_quantized_memcpy (int64_t);
1633       break;
1634     case DATA_TYPE_UINT64:
1635       do_quantized_memcpy (uint64_t);
1636       break;
1637     case DATA_TYPE_FLOAT32:
1638       do_quantized_memcpy (float);
1639       break;
1640     case DATA_TYPE_FLOAT64:
1641       do_quantized_memcpy (double);
1642       break;
1643     default:
1644       logerr (TAG, "Unsupported datatype %d\n", type);
1645   }
1646 }
1647
1648 /**
1649  * @brief perform data manipulation
1650  * @param[in] model model instance
1651  * @param[in] idx tensor index
1652  * @param[in] is_input indicate it's input manipulation
1653  * @param[out] dst destination buffer
1654  * @param[in] src source buffer (feature map)
1655  * @param[in] size size to be copied
1656  * @return size of memory copy if no error, otherwise zero
1657  *
1658  * @note the input data format should be NHWC
1659  * @detail rules for the memory address of activations in NPU HW.
1660  *         (https://code.sec.samsung.net/confluence/pages/viewpage.action?pageId=146491864)
1661  *
1662  * 1) Special case (depth == 3)
1663  * - addr(x,y,z) = addr(0,0,0) + (z) + 3 * (x + width * y)
1664  *
1665  * 2) Common case
1666  * - addr(x,y,z) = addr(0,0,0) + (z % MPA_L) + MPA_L * (x + width * (y + height * (z / MPA_L)))
1667  *
1668  * Thus, if depth is not a multiple of MPA_L (i.e., 64), zero padding is required
1669  */
1670 size_t
1671 TrinityVision::manipulateData (const Model *model, uint32_t idx, bool is_input,
1672     void *dst, void *src, size_t size)
1673 {
1674   const Metadata *meta = model->getMetadata();
1675   const tensor_data_info* info;
1676   const uint32_t *dims;
1677   uint32_t zero_point;
1678   float scale;
1679
1680   /** extract required information from the metadata */
1681   if (is_input) {
1682     if (idx >= meta->getInputNum()) {
1683       logerr (TAG, "Wrong information for input tensors in metadata\n");
1684       return 0;
1685     }
1686
1687     info = model->getInputDataInfo (idx);
1688     dims = meta->getInputDims (idx);
1689     zero_point = meta->getInputQuantZero (idx);
1690     scale = meta->getInputQuantScale (idx);
1691   } else {
1692     if (idx >= meta->getOutputNum()) {
1693       logerr (TAG, "Wrong information for output tensors in metadata\n");
1694       return 0;
1695     }
1696
1697     info = model->getOutputDataInfo (idx);
1698     dims = meta->getOutputDims (idx);
1699     zero_point = meta->getOutputQuantZero (idx);
1700     scale = meta->getOutputQuantScale (idx);
1701   }
1702
1703   if (info == nullptr) {
1704     logerr (TAG, "Unmatched tensors info\n");
1705     return 0;
1706   }
1707
1708   uint32_t batch = dims[0];
1709   uint32_t height = dims[1];
1710   uint32_t width = dims[2];
1711   uint32_t depth = dims[3];
1712
1713   uint32_t data_size = get_data_size (info->type);
1714   if (data_size == 0) {
1715     logerr (TAG, "Invalid data size\n");
1716     return 0;
1717   }
1718
1719   bool need_quantization = false;
1720   /**
1721    * note that we assume DATA_TYPE_SRNPU is the smallest data type that we consider.
1722    * Also, DATA_TYPE_SRNPU and uint8_t may be regarded as the same in the view of apps.
1723    */
1724   if (info->type != DATA_TYPE_SRNPU) {
1725     assert (data_size >= get_data_size (DATA_TYPE_SRNPU));
1726
1727     if (data_size > get_data_size (DATA_TYPE_SRNPU) ||
1728         !(zero_point == default_quant_zero && scale == default_quant_scale))
1729       need_quantization = true;
1730   }
1731
1732   /** check data manipulation is required */
1733   if (depth != 3 && depth != 64 && info->layout != DATA_LAYOUT_SRNPU) {
1734     uint32_t MPA_L = DATA_GRANULARITY;
1735     uint32_t n, h, w, d;
1736     uint32_t std_offset;  /* standard offset in NHWC data format */
1737     uint32_t npu_offset;  /* npu offset in NPU HW data format*/
1738     uint32_t src_offset;
1739     uint32_t dst_offset;
1740     uint32_t slice_size;
1741
1742     /* @todo we currently support only NHWC */
1743     if (info->layout != DATA_LAYOUT_NHWC) {
1744       logerr (TAG, "data manipulation is supported for NHWC only\n");
1745       return 0;
1746     }
1747
1748     for (n = 0; n < batch; n++) {
1749       for (h = 0; h < height; h++) {
1750         for (w = 0; w < width; w++) {
1751           for (d = 0; d < depth; d += MPA_L) {
1752             std_offset = d + depth * (w + width * (h + n * height));
1753             npu_offset = MPA_L * (w + width * (h + (n + d / MPA_L) * height));
1754             slice_size = (depth - d >= MPA_L) ? MPA_L : depth - d;
1755
1756             if (is_input) {
1757               src_offset = std_offset * data_size;
1758               dst_offset = npu_offset;
1759             } else {
1760               src_offset = npu_offset;
1761               dst_offset = std_offset * data_size;
1762             }
1763
1764             /* if depth is not a multiple of MPA_L, add zero paddings (not exact values) */
1765             if (need_quantization) {
1766               memcpy_with_quant (is_input, info->type, scale, zero_point,
1767                   static_cast<char*>(dst) + dst_offset,
1768                   static_cast<char*>(src) + src_offset,
1769                   slice_size);
1770             } else {
1771               memcpy (
1772                   static_cast<char*>(dst) + dst_offset,
1773                   static_cast<char*>(src) + src_offset,
1774                   slice_size);
1775             }
1776           }
1777         }
1778       }
1779     }
1780   } else if (need_quantization) {
1781     /** depth == 3 || depth == 64; special cases which can directly copy input tensor data */
1782     memcpy_with_quant (is_input, info->type, scale, zero_point,
1783         dst, src, is_input ? size / data_size : size);
1784   } else {
1785     memcpy (dst, src, size);
1786   }
1787
1788   return size;
1789 }
1790
1791 #else
1792
1793 size_t
1794 TrinityVision::manipulateData (const Model *model, uint32_t idx, bool is_input,
1795     void *dst, void *src, size_t size)
1796 {
1797   memcpy (dst, src, size);
1798   return size;
1799 }
1800
1801 #endif
1802
1803 /** other device types don't have data manip impl. yet */
1804
1805 size_t
1806 TrinityVision2::manipulateData (const Model *model, uint32_t idx, bool is_input,
1807     void *dst, void *src, size_t size)
1808 {
1809   memcpy (dst, src, size);
1810   return size;
1811 }
1812
1813 size_t
1814 TrinityAsr::manipulateData (const Model *model, uint32_t idx, bool is_input,
1815     void *dst, void *src, size_t size)
1816 {
1817   memcpy (dst, src, size);
1818   return size;
1819 }