// Copyright (C) 2018-2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "cpp_interfaces/base/ie_plugin_base.hpp"
#include "dnn.h"
#include "gna_memory.hpp"
#include "gna_device.hpp"
#include <map>
#include <unordered_map>
#include <list>
#include <string>
#include <utility>
#include <memory>
#include <vector>
#include <tuple>
#include <gna-api-status.h>
#include <gna-api.h>
#include <cpp_interfaces/interface/ie_iplugin_internal.hpp>
#include <cpp_interfaces/impl/ie_plugin_internal.hpp>
#include <cpp_interfaces/impl/ie_executable_network_thread_safe_default.hpp>
#include <graph_tools.hpp>
#include "gna_allocator.hpp"
#include "gna_api_wrapper.hpp"
#include "gna_plugin_policy.hpp"

namespace GNAPluginNS {

void ConvertToInt16(int16_t *ptr_dst,
                    const float *ptr_src,
                    const uint32_t num_rows,
                    const uint32_t num_columns,
                    const float scale_factor);
void ConvertToFloat(float *ptr_dst,
                    int32_t *ptr_src,
                    const uint32_t num_rows,
                    const uint32_t num_columns,
                    const float scale_factor);

int16_t ConvertFloatToInt16(float src);
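
// Illustrative sketch (an assumption, not part of the original header): these helpers
// follow the usual linear quantization scheme, roughly
//   int16_t q = static_cast<int16_t>(std::min(std::max(src * scale_factor, -32768.0f), 32767.0f));
//   float   f = static_cast<float>(src_int32) / scale_factor;
// where scale_factor maps the floating-point dynamic range onto the integer range.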

class GNAPlugin : public InferenceEngine::IInferencePluginInternal, public std::enable_shared_from_this<GNAPlugin> {
 protected:
    AmIntelDnn dnn;
    using dnn_ptr = std::shared_ptr<CPPWrapper<intel_nnet_type_t>>;

    /**
     * @brief Copy of the nnet structure and an indicator that the related infer request has not been synced yet
     */
    std::vector<std::tuple<dnn_ptr, int32_t, InferenceEngine::BlobMap>> nnets;

    std::unordered_map<std::string, intel_dnn_orientation_t> orientation_in;
    intel_dnn_orientation_t orientation_out = kDnnUnknownOrientation;

    /**
     * @brief Temporary solution to support multiple scale factors
     * @return the input scale factor
     */
    float get_input_scale_factor() const;
    std::unordered_map<std::string, double> input_scale_factor;

    double output_scale_factor = 1.0;
    uint32_t num_rotate_rows = 0;
    uint32_t num_rotate_columns = 0;


    uint32_t num_feature_maps = 1;
    uint32_t num_memory_bytes;

    std::unordered_map<std::string, std::list<std::vector<void *>>::iterator> ptr_inputs_global_id;
    std::list<std::vector<void *>> ptr_inputs_global_storage;

    std::vector<void *>& get_ptr_inputs_global(std::string name);

    std::vector<void *> ptr_outputs_global;

    uint32_t *ptr_active_indices = NULL;
    uint32_t num_active_indices = 0;
    uint32_t num_group_in = 0;
    uint32_t num_bytes_weight;
    uint32_t num_bytes_per_output = 0;

    bool use_dynamic_quantization = false;
    bool compact_mode = true;
    bool exclusive_async_requests = false;
    bool uniformPwlDesign = false;
    uint8_t gna_lib_async_threads_num = 1;
    bool gna_openmp_multithreading = false;
    // precision of GNA hardware model
    InferenceEngine::Precision gnaPrecision = InferenceEngine::Precision::I16;

    bool performance_counting = false;

    intel_dnn_number_type_t output_type = kDnnInt;
    std::string utterance_name;

    // internal types
    enum LayerType {
        Input,
        Convolution,
        ReLU,
        LeakyReLU,
        Sigmoid,
        TanH,
        Activation,
        Pooling,
        FullyConnected,
        InnerProduct,
        Reshape,
        Split,
        Slice,
        Eltwise,
        ScaleShift,
        Clamp,
        Concat,
        Copy,
        Permute,
        Memory,
        Power,
        Crop,
        NO_TYPE
    };

 public:
    explicit GNAPlugin(const std::map<std::string, std::string>& configMap);
    /**
     * @brief Constructs from AOT rather than from a CNN network
     */
    GNAPlugin() = default;

    void LoadNetwork(InferenceEngine::ICNNNetwork &network) override;
    using InferenceEngine::IInferencePluginInternal::Infer;

    void Infer(const InferenceEngine::BlobMap &input, InferenceEngine::BlobMap &result) override;
    void GetPerformanceCounts(std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> &perfMap) override;
    void AddExtension(InferenceEngine::IExtensionPtr extension) override;
    void SetConfig(const std::map<std::string, std::string> &config) override;
    void LoadNetwork(InferenceEngine::IExecutableNetwork::Ptr &executableNetwork,
                     InferenceEngine::ICNNNetwork &network,
                     const std::map<std::string, std::string> &config) override { THROW_GNA_EXCEPTION << "Not implemented"; }
    void Infer(const InferenceEngine::Blob &input, InferenceEngine::Blob &result) override;
    void SetLogCallback(InferenceEngine::IErrorListener &listener) override {}
    void Reset();
    /**
     * @deprecated Use the version with config parameter
     */
    void QueryNetwork(const InferenceEngine::ICNNNetwork &network,
                      InferenceEngine::QueryNetworkResult &res) const override;
    void QueryNetwork(const InferenceEngine::ICNNNetwork &network,
                      const std::map<std::string, std::string>& config,
                      InferenceEngine::QueryNetworkResult &res) const override;
    uint32_t QueueInference(const InferenceEngine::BlobMap &input, InferenceEngine::BlobMap &result);
    void Wait(uint32_t idx = 0);

    /**
     * @brief Waits for the given GNA sync point and writes the result into the provided blob
     * @param sync - GNA sync point returned by QueueInference
     * @param result - blob that receives the inference result
     */
    void Wait(uint32_t sync, InferenceEngine::Blob &result);
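    // Illustrative asynchronous usage sketch (hypothetical names; assumes a plugin with a
    // loaded network and pre-populated "inputs"/"outputs" BlobMaps):
    //   uint32_t sync = plugin.QueueInference(inputs, outputs);
    //   /* ... other work ... */
    //   plugin.Wait(sync);  // blocks until the queued request completes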

    void Export(const std::string &fileName);
    InferenceEngine::IExecutableNetwork::Ptr ImportNetwork(const std::string &modelFileName
        , const std::map<std::string, std::string> &config) override { THROW_GNA_EXCEPTION << "Not implemented"; }
    InferenceEngine::IExecutableNetwork::Ptr ImportNetwork(const std::string &modelFileName);
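
    // Illustrative AOT round-trip sketch (hypothetical file name; an assumption, not a
    // definitive workflow): a compiled model can be dumped and later restored without the IR:
    //   plugin.Export("gna_model.aot");
    //   auto network = plugin.ImportNetwork("gna_model.aot");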

    bool IsExclusiveAsyncRequests() { return exclusive_async_requests; }

    /**
     * @brief Utilities that expose the input and output blobs externally, for use by InferenceEngine request API clients
     */
    InferenceEngine::Blob::Ptr GetInputBlob(std::string name, InferenceEngine::Precision precision);
    InferenceEngine::Blob::Ptr GetOutputBlob(InferenceEngine::Precision precision);
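    // Illustrative sketch (hypothetical names, an assumption about typical use): request-API
    // clients may obtain externally managed blobs, e.g.
    //   auto in  = plugin.GetInputBlob("input", InferenceEngine::Precision::FP32);
    //   auto out = plugin.GetOutputBlob(InferenceEngine::Precision::FP32);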
    /**
     * @brief Helpers that provide input and output info for an AOT (imported) network
     */
    InferenceEngine::InputsDataMap GetInputs() {return inputsDataMap;}
    InferenceEngine::OutputsDataMap GetOutputs() {return outputsDataMap;}
    /**
     * @brief QueryState API
     * @return memory states of the loaded network
     */
    std::vector<InferenceEngine::IMemoryStateInternal::Ptr> QueryState();

    /**
     * @brief Test-only API
     */
    void SetPolicy(Policy p) {policy = p;}

 protected:
    Policy policy;
    uint32_t num_cnn_rows_out = 0;
    bool done = false;
    std::string dumpXNNPath;
    intel_gna_proc_t gna_proc_type = static_cast<intel_gna_proc_t>(GNA_SOFTWARE & GNA_HARDWARE);

    void DumpXNNToFile() const;
    void CreateLayerPrimitive(InferenceEngine::CNNLayerPtr);
    void AffinePrimitive(InferenceEngine::CNNLayerPtr, bool isDiag = false);
    void AffineFilterPrimitive(InferenceEngine::CNNLayerPtr);
    void DiagonalPrimitive(InferenceEngine::CNNLayerPtr);
    void ConvolutionPrimitive(InferenceEngine::CNNLayerPtr);
    void PermutePrimitive(InferenceEngine::CNNLayerPtr);
    void PoolingPrimitive(InferenceEngine::CNNLayerPtr);
    void PowerPrimitive(InferenceEngine::CNNLayerPtr);
    void ConcatPrimitive(InferenceEngine::CNNLayerPtr);
    void CropPrimitive(InferenceEngine::CNNLayerPtr);
    void EltwisePrimitive(InferenceEngine::CNNLayerPtr);
    void SplitPrimitive(InferenceEngine::CNNLayerPtr);
    void SlicePrimitive(InferenceEngine::CNNLayerPtr);
    void PWLPrimitive(InferenceEngine::CNNLayerPtr);
    void CopyPrimitive(InferenceEngine::CNNLayerPtr);
    bool AreLayersSupported(InferenceEngine::ICNNNetwork& network, std::string& errMessage);
    LayerType LayerTypeFromStr(std::string const &str) const;
    /**
     * Maps a memory connection to its input and output layers; also stores the GNA pointer for the allocation request
     */
    class GNAMemoryLayer {
        InferenceEngine::CNNLayerPtr inputLayer;
        InferenceEngine::CNNLayerPtr outputLayer;
     public:
        GNAMemoryLayer(InferenceEngine::CNNLayerPtr inLayer, InferenceEngine::CNNLayerPtr outLayer) :
            inputLayer(inLayer), outputLayer(outLayer) {
        }

        InferenceEngine::CNNLayerPtr getInput() { return inputLayer; }
        InferenceEngine::CNNLayerPtr getOutput() { return outputLayer; }

        /**
         * pointer to the GNA memory request
         */
        void *gna_ptr = nullptr;
        /**
         * size of the reserved GNA memory
         */
        size_t reserved_size = 0;
        /**
         * offset of the reserved GNA memory from gna_ptr
         */
        size_t reserved_offset = 0;
    };
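    // Illustrative note (an assumption drawn from the fields above): the effective address of
    // the reserved region for such a layer would be computed roughly as
    //   void *effective = static_cast<uint8_t *>(gna_ptr) + reserved_offset;
    // with reserved_size bytes available starting from that address.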

    class GNAConcatLayer {
        InferenceEngine::CNNLayerPtr concatLayer;

     public:
        explicit GNAConcatLayer(InferenceEngine::CNNLayerPtr layer) : concatLayer(layer) {}

        InferenceEngine::CNNLayerPtr getConcat() { return concatLayer; }
        /**
         * pointer to the GNA memory request
         */
        void *gna_ptr = nullptr;
        /**
         * size of the GNA memory reserved for the concat
         */
        size_t reserved_size = 0;
        bool output_allocation_flag = false;
        /**
         * name and offset (from gna_ptr) of a layer connected to the concat
         */
        struct ConcatConnectedLayerInfo {
            ConcatConnectedLayerInfo(const std::string& n,
                                     size_t o) :
                                     name(n),
                                     offset(o) {}
            std::string name = "";
            size_t offset = 0;
        };

        std::vector<ConcatConnectedLayerInfo> concatInputLayers;
    };

    // Split, Slice
    class GNASplitLayer {
        InferenceEngine::CNNLayerPtr splitLayer;

     public:
        explicit GNASplitLayer(InferenceEngine::CNNLayerPtr layer) : splitLayer(layer), splitInputLayer() {}

        InferenceEngine::CNNLayerPtr getSplit() { return splitLayer; }
        /**
         * size of the GNA memory reserved for the split
         */
        size_t reserved_size = 0;
        bool output_allocation_flag = false;
        /**
         * name, offset (from the split buffer start), and pure size of a layer connected to the split
         */
        struct SplitConnectedLayerInfo {
            SplitConnectedLayerInfo() {}
            SplitConnectedLayerInfo(std::string& n,
                                    size_t o,
                                    size_t p) :
                                     name(n),
                                     offset(o),
                                     pure_size(p) {}

            SplitConnectedLayerInfo& operator=(SplitConnectedLayerInfo const& layerInfo) {
                this->name      = layerInfo.name;
                this->offset    = layerInfo.offset;
                this->pure_size = layerInfo.pure_size;
                return *this;
            }
            std::string name = "";
            size_t offset    = 0;
            size_t pure_size = 0;
        };
        SplitConnectedLayerInfo splitInputLayer;
        std::vector<SplitConnectedLayerInfo> splitOutputLayers;
    };

    class GNACropLayer {
        InferenceEngine::CNNLayerPtr cropLayer;

    public:
        explicit GNACropLayer(InferenceEngine::CNNLayerPtr layer) : cropLayer(layer) {}

        InferenceEngine::CNNLayerPtr getCrop() { return cropLayer; }
        /**
         * pointer to the beginning of the cropped GNA memory
         */
        void *gna_ptr = nullptr;
    };
    using MemoryConnection = std::list<std::pair<std::string, GNAMemoryLayer>>;
    using ConcatConnection = std::unordered_map<std::string, GNAConcatLayer>;
    using SplitConnection  = std::unordered_map<std::string, GNASplitLayer>;
    using CropConnection   = std::unordered_map<std::string, GNACropLayer>;
    // layers with extra storage for connections and additional
    // non-trivial processing
    MemoryConnection memory_connection;
    ConcatConnection concat_connection;
    SplitConnection  split_connection;
    CropConnection   crop_connection;
    void fillMemoryConnections(std::unordered_map<std::string,
                                 std::vector<InferenceEngine::CNNLayerPtr>> &memoryPairs);

    void fillConcatConnections(InferenceEngine::CNNLayerPtr layer);
    void fillSplitConnections(InferenceEngine::CNNLayerPtr layer);
    /**
     * Maps a layer name to its dnn.component; in topological order, so previous nodes are initialized first
     */
    using DnnComponentsForLayer = std::list<std::pair<std::string, intel_dnn_component_t>>;
    DnnComponentsForLayer dnnComponentsForLayer;

    /**
     * @brief Returns the corresponding dnn component for a topology layer
     * @param __layer - topology layer
     * @return pointer to the matching dnn component
     */
    intel_dnn_component_t * findDnnLayer(InferenceEngine::CNNLayerPtr __layer);

    using allocator_type = PolymorphAllocator<uint8_t>;
    using gna_memory_type = GNAMemory<allocator_type>;

    std::unique_ptr<GNADeviceHelper> gnadevice;
    /**
     * @brief size of RW segment without extra memory for parallel execution
     */
    uint32_t rwSegmentSize = 0;
    std::unique_ptr<gna_memory_type> gnamem;

    /**
     * Fills in the weights of the aligning filter (affine) layer
     * @param layer - filter layer pointer
     * @param ptrWeights - pointer to the weights memory
     * @param offset - memory before this offset will be zeroed
     * @param isQuantized - whether the layer is quantized
     */
    void FillWeightOfAligningFilter(InferenceEngine::CNNLayerPtr layer, void* ptrWeights, size_t offset, bool isQuantized = false);

    /**
     * Connects either a memory output or a generic output to a layer
     * @param layer - layer pointer
     * @param ptr_outputs - pointer to the pointer where the output layer information is stored
     * @param ptr_inputs - pointer to the layer inputs
     * @param sz - size of the output blob in bytes
     */
    void connectOutput(InferenceEngine::CNNLayerPtr layer, void *ptr_outputs, void *ptr_inputs, size_t sz);
    /**
     * Connects a certain input to this layer
     * @param layer - layer that the input is connected to
     * @param pVoid - pointer that holds the current layer pointer in the gna_mem request
     * @param num_data_bytes_in - size of the input data in bytes
     * @param offset - number of bytes to advance in the buffer
     * @param idx - index of the input port being connected
     * @return layer used as input
     */
    struct ConnectionDetails {
        InferenceEngine::CNNLayerPtr  input;
        bool needTransposeWeights = false;
        InferenceEngine::CNNLayerPtr permute;
        ConnectionDetails(InferenceEngine::CNNLayerPtr input,
                          bool bTranspose = false,
                          InferenceEngine::CNNLayerPtr permute = nullptr)
            : input(input)
            , needTransposeWeights(bTranspose)
            , permute(permute) {
        }
    };
    ConnectionDetails connectInput(InferenceEngine::CNNLayerPtr layer,
                      void *pVoid,
                      size_t num_data_bytes_in,
                      int32_t offset = 0,
                      int idx = 0);
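
    // Illustrative sketch of how a primitive might wire its buffers (hypothetical variable
    // names; a sketch of the intended usage, not a definitive implementation):
    //   void *ptr_inputs = nullptr, *ptr_outputs = nullptr;
    //   connectInput(layer, &ptr_inputs, num_data_bytes_in);
    //   connectOutput(layer, &ptr_outputs, &ptr_inputs, num_data_bytes_out);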

    void ImportFrames(void *ptr_dst,
                     const void *ptr_src,
                     InferenceEngine::Precision input_precision,
                     intel_dnn_orientation_t orientation,
                     uint32_t num_frames,
                     uint32_t num_group,
                     uint32_t num_vector_elements,
                     uint32_t num_vector_stride);

    void ExportScores(void *ptr_dst,
                     void *ptr_src,
                     intel_dnn_orientation_t orientation,
                     uint32_t num_frames,
                     uint32_t num_group,
                     uint32_t num_vector_elements,
                     uint32_t num_active_elements,
                     uint32_t num_vector_stride,
                     uint32_t num_bytes_per_element_input,
                     uint32_t num_bytes_per_element);

    friend void GNAPluginNS::ConvertToInt16(int16_t *ptr_dst,
                    const float *ptr_src,
                    const uint32_t num_rows,
                    const uint32_t num_columns,
                    const float scale_factor);
    friend void GNAPluginNS::ConvertToFloat(float *ptr_dst,
                    int32_t *ptr_src,
                    const uint32_t num_rows,
                    const uint32_t num_columns,
                    const float scale_factor);

    friend int16_t GNAPluginNS::ConvertFloatToInt16(float src);

    template <typename T, typename U>
    void copyInputData(T *dst,
                    const U *src,
                    uint32_t num_frames,
                    uint32_t num_group,
                    uint32_t num_vector_elements,
                    uint32_t num_vector_stride,
                    intel_dnn_orientation_t orientation);

    template <typename T, typename U>
    void copyInputDataWithSplit(T *const dst,
                    const U *src,
                    const GNASplitLayer& splitInfo,
                    size_t precision_size);
    /**
     * @brief GNA affine layers always have an activation attached, while IR layers may not
     */
    void insertIdentityLayer(std::vector<InferenceEngine::CNNLayerPtr> &layers);

    /**
     * @brief GNA does not support broadcast, so weights and biases are tiled for the ScaleShift layer
     */
    void substituteScaleShiftBroadCast(std::vector<InferenceEngine::CNNLayerPtr> &layers);


    /**
     * @brief GNA convolution layers use a deinterleaved layout, while affine layers do not,
     * so Model Optimizer inserts permute layers between convolution and affine layers.
     * Since GNA hardware already supports conv->affine in permuted form, this pass reverses
     * the MO behavior: it removes permutations of certain forms between conv->conv and conv->affine,
     * and inserts a permutation between conv->affine when it is missing in the IR.
     * @param layers
     */
    void reversePermutations(std::vector<InferenceEngine::CNNLayerPtr> &layers);


    /**
     * @brief Searches for a specific pattern in the graph (6 layers are replaced by a single one)
     * @param layers
     */
    void substitutePRelu(std::vector<InferenceEngine::CNNLayerPtr> &layers);

    std::vector<InferenceEngine::CNNLayerPtr> getCandidatesForIdentityInsertion(const InferenceEngine::CNNLayerPtr layer);

    /**
     * Diagonal layer insertion is required when an activation is followed by split layers or any other
     * topology-changing layers
     */
    void insertDiagonalLayer(std::vector<InferenceEngine::CNNLayerPtr> & layers);

    /**
     * @brief MaxPool can be reordered with the activation; on GNA the strategy is conv->maxpool->activation,
     * which means maxpool receives 4-byte values and produces 4-byte values
     */
    void reorderMaxPool(std::vector<InferenceEngine::CNNLayerPtr> & layers);

    /**
     * Copy layer insertion is required when an input layer does not have output memory
     */
    void insertCopyLayer(std::vector<InferenceEngine::CNNLayerPtr> & layers);

    /**
     * Aligning filter layer insertion is required when split/slice outputs are connected at unaligned addresses
     */
    void insertAligningFilterLayer(std::vector<InferenceEngine::CNNLayerPtr> & layers);

    intel_dnn_component_t * find_first_unused_input(InferenceEngine::CNNLayerPtr current);
    std::map<std::string, int> bytes_alllocated_for_input;
    InferenceEngine::InputsDataMap inputsDataMap;

    InferenceEngine::SizeVector outputDims;
    InferenceEngine::OutputsDataMap outputsDataMap;
};
}  // namespace GNAPluginNS