// Copyright (C) 2018-2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "cpp_interfaces/base/ie_plugin_base.hpp"
#include "dnn.h"
#include "gna_memory.hpp"
#include "gna_device.hpp"
#include <map>
#include <unordered_map>
#include <list>
#include <string>
#include <utility>
#include <memory>
#include <vector>
#include <tuple>
#include <gna-api-status.h>
#include <gna-api.h>
#include <cpp_interfaces/interface/ie_iplugin_internal.hpp>
#include <cpp_interfaces/impl/ie_plugin_internal.hpp>
#include <cpp_interfaces/impl/ie_executable_network_thread_safe_default.hpp>
#include <graph_tools.hpp>
#include "gna_allocator.hpp"
#include "gna_api_wrapper.hpp"
#include "gna_plugin_policy.hpp"

namespace GNAPluginNS {

void ConvertToInt16(int16_t *ptr_dst,
                    const float *ptr_src,
                    const uint32_t num_rows,
                    const uint32_t num_columns,
                    const float scale_factor);
void ConvertToFloat(float *ptr_dst,
                    int32_t *ptr_src,
                    const uint32_t num_rows,
                    const uint32_t num_columns,
                    const float scale_factor);

int16_t ConvertFloatToInt16(float src);
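
// Illustrative sketch (an assumption, not part of the original header): these helpers
// follow the usual linear quantization scheme, roughly
//   int16_t q = static_cast<int16_t>(std::min(std::max(src * scale_factor, -32768.0f), 32767.0f));
//   float   f = static_cast<float>(src_int32) / scale_factor;
// where scale_factor maps the floating-point dynamic range onto the integer range.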

class GNAPlugin : public InferenceEngine::IInferencePluginInternal, public std::enable_shared_from_this<GNAPlugin> {
 protected:
    AmIntelDnn dnn;
    using dnn_ptr = std::shared_ptr<CPPWrapper<intel_nnet_type_t>>;

    /**
     * @brief Copy of the nnet structure and an indicator that the related infer request has not been synced yet
     */
    std::vector<std::tuple<dnn_ptr, int32_t, InferenceEngine::BlobMap>> nnets;

    std::unordered_map<std::string, intel_dnn_orientation_t> orientation_in;
    intel_dnn_orientation_t orientation_out = kDnnUnknownOrientation;

    /**
     * @brief Temporary solution to support multiple scale factors
     * @return the input scale factor
     */
    float get_input_scale_factor() const;
    std::unordered_map<std::string, double> input_scale_factor;

    double output_scale_factor = 1.0;
    uint32_t num_rotate_rows = 0;
    uint32_t num_rotate_columns = 0;


    uint32_t num_feature_maps = 1;
    uint32_t num_memory_bytes;

    std::unordered_map<std::string, std::list<std::vector<void *>>::iterator> ptr_inputs_global_id;
    std::list<std::vector<void *>> ptr_inputs_global_storage;

    std::vector<void *>& get_ptr_inputs_global(std::string name);

    std::vector<void *> ptr_outputs_global;

    uint32_t *ptr_active_indices = NULL;
    uint32_t num_active_indices = 0;
    uint32_t num_group_in = 0;
    uint32_t num_bytes_weight;
    uint32_t num_bytes_per_output = 0;

    bool use_dynamic_quantization = false;
    bool compact_mode = true;
    bool exclusive_async_requests = false;
    bool uniformPwlDesign = false;
    uint8_t gna_lib_async_threads_num = 1;
    bool gna_openmp_multithreading = false;
    // precision of GNA hardware model
    InferenceEngine::Precision gnaPrecision = InferenceEngine::Precision::I16;

    bool performance_counting = false;

    intel_dnn_number_type_t output_type = kDnnInt;
    std::string utterance_name;

    // internal types
    enum LayerType {
        Input,
        Convolution,
        ReLU,
        LeakyReLU,
        Sigmoid,
        TanH,
        Activation,
        Pooling,
        FullyConnected,
        InnerProduct,
        Reshape,
        Split,
        Slice,
        Eltwise,
        ScaleShift,
        Clamp,
        Concat,
        Copy,
        Permute,
        Memory,
        Power,
        Crop,
        NO_TYPE
    };

 public:
    explicit GNAPlugin(const std::map<std::string, std::string>& configMap);
    /**
     * @brief Constructs from AOT rather than from a CNN network
     */
    GNAPlugin() = default;

    void LoadNetwork(InferenceEngine::ICNNNetwork &network) override;
    using InferenceEngine::IInferencePluginInternal::Infer;

    void Infer(const InferenceEngine::BlobMap &input, InferenceEngine::BlobMap &result) override;
    void GetPerformanceCounts(std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> &perfMap) override;
    void AddExtension(InferenceEngine::IExtensionPtr extension) override;
    void SetConfig(const std::map<std::string, std::string> &config) override;
    void LoadNetwork(InferenceEngine::IExecutableNetwork::Ptr &executableNetwork,
                     InferenceEngine::ICNNNetwork &network,
                     const std::map<std::string, std::string> &config) override { THROW_GNA_EXCEPTION << "Not implemented"; }
    void Infer(const InferenceEngine::Blob &input, InferenceEngine::Blob &result) override;
    void SetLogCallback(InferenceEngine::IErrorListener &listener) override {}
    void Reset();
    /**
     * @deprecated Use the version with config parameter
     */
    void QueryNetwork(const InferenceEngine::ICNNNetwork &network,
                      InferenceEngine::QueryNetworkResult &res) const override;
    void QueryNetwork(const InferenceEngine::ICNNNetwork &network,
                      const std::map<std::string, std::string>& config,
                      InferenceEngine::QueryNetworkResult &res) const override;
    uint32_t QueueInference(const InferenceEngine::BlobMap &input, InferenceEngine::BlobMap &result);
    void Wait(uint32_t idx = 0);

    /**
     * @brief Waits for the given GNA sync point and writes the result into the provided blob
     * @param sync - GNA sync point returned by QueueInference
     * @param result - blob that receives the inference result
     */
    void Wait(uint32_t sync, InferenceEngine::Blob &result);
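    // Illustrative asynchronous usage sketch (hypothetical names; assumes a plugin with a
    // loaded network and pre-populated "inputs"/"outputs" BlobMaps):
    //   uint32_t sync = plugin.QueueInference(inputs, outputs);
    //   /* ... other work ... */
    //   plugin.Wait(sync);  // blocks until the queued request completes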

    void Export(const std::string &fileName);
    InferenceEngine::IExecutableNetwork::Ptr ImportNetwork(const std::string &modelFileName
        , const std::map<std::string, std::string> &config) override { THROW_GNA_EXCEPTION << "Not implemented"; }
    InferenceEngine::IExecutableNetwork::Ptr ImportNetwork(const std::string &modelFileName);
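
    // Illustrative AOT round-trip sketch (hypothetical file name; an assumption, not a
    // definitive workflow): a compiled model can be dumped and later restored without the IR:
    //   plugin.Export("gna_model.aot");
    //   auto network = plugin.ImportNetwork("gna_model.aot");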

    bool IsExclusiveAsyncRequests() { return exclusive_async_requests; }

    /**
     * @brief Utilities that expose the input and output blobs externally, for use by InferenceEngine request API clients
     */
    InferenceEngine::Blob::Ptr GetInputBlob(std::string name, InferenceEngine::Precision precision);
    InferenceEngine::Blob::Ptr GetOutputBlob(InferenceEngine::Precision precision);
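    // Illustrative sketch (hypothetical names, an assumption about typical use): request-API
    // clients may obtain externally managed blobs, e.g.
    //   auto in  = plugin.GetInputBlob("input", InferenceEngine::Precision::FP32);
    //   auto out = plugin.GetOutputBlob(InferenceEngine::Precision::FP32);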
    /**
     * @brief Helpers that provide input and output info for an AOT (imported) network
     */
    InferenceEngine::InputsDataMap GetInputs() {return inputsDataMap;}
    InferenceEngine::OutputsDataMap GetOutputs() {return outputsDataMap;}
    /**
     * @brief QueryState API
     * @return memory states of the loaded network
     */
    std::vector<InferenceEngine::IMemoryStateInternal::Ptr> QueryState();

    /**
     * @brief Test-only API
     */
    void SetPolicy(Policy p) {policy = p;}

 protected:
    Policy policy;
    uint32_t num_cnn_rows_out = 0;
    bool done = false;
    std::string dumpXNNPath;
    intel_gna_proc_t gna_proc_type = static_cast<intel_gna_proc_t>(GNA_SOFTWARE & GNA_HARDWARE);

    void DumpXNNToFile() const;
    void CreateLayerPrimitive(InferenceEngine::CNNLayerPtr);
    void AffinePrimitive(InferenceEngine::CNNLayerPtr, bool isDiag = false);
    void AffineFilterPrimitive(InferenceEngine::CNNLayerPtr);
    void DiagonalPrimitive(InferenceEngine::CNNLayerPtr);
    void ConvolutionPrimitive(InferenceEngine::CNNLayerPtr);
    void PermutePrimitive(InferenceEngine::CNNLayerPtr);
    void PoolingPrimitive(InferenceEngine::CNNLayerPtr);
    void PowerPrimitive(InferenceEngine::CNNLayerPtr);
    void ConcatPrimitive(InferenceEngine::CNNLayerPtr);
    void CropPrimitive(InferenceEngine::CNNLayerPtr);
    void EltwisePrimitive(InferenceEngine::CNNLayerPtr);
    void SplitPrimitive(InferenceEngine::CNNLayerPtr);
    void SlicePrimitive(InferenceEngine::CNNLayerPtr);
    void PWLPrimitive(InferenceEngine::CNNLayerPtr);
    void CopyPrimitive(InferenceEngine::CNNLayerPtr);
    bool AreLayersSupported(InferenceEngine::ICNNNetwork& network, std::string& errMessage);
    LayerType LayerTypeFromStr(std::string const &str) const;
    /**
     * Maps a memory connection to its input and output layers; also stores the GNA pointer for the allocation request
     */
    class GNAMemoryLayer {
        InferenceEngine::CNNLayerPtr inputLayer;
        InferenceEngine::CNNLayerPtr outputLayer;
     public:
        GNAMemoryLayer(InferenceEngine::CNNLayerPtr inLayer, InferenceEngine::CNNLayerPtr outLayer) :
            inputLayer(inLayer), outputLayer(outLayer) {
        }

        InferenceEngine::CNNLayerPtr getInput() { return inputLayer; }
        InferenceEngine::CNNLayerPtr getOutput() { return outputLayer; }

        /**
         * pointer to the GNA memory request
         */
        void *gna_ptr = nullptr;
        /**
         * size of the reserved GNA memory
         */
        size_t reserved_size = 0;
        /**
         * offset of the reserved GNA memory from gna_ptr
         */
        size_t reserved_offset = 0;
    };
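    // Illustrative note (an assumption drawn from the fields above): the effective address of
    // the reserved region for such a layer would be computed roughly as
    //   void *effective = static_cast<uint8_t *>(gna_ptr) + reserved_offset;
    // with reserved_size bytes available starting from that address.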

    class GNAConcatLayer {
        InferenceEngine::CNNLayerPtr concatLayer;

     public:
        explicit GNAConcatLayer(InferenceEngine::CNNLayerPtr layer) : concatLayer(layer) {}

        InferenceEngine::CNNLayerPtr getConcat() { return concatLayer; }
        /**
         * pointer to the GNA memory request
         */
        void *gna_ptr = nullptr;
        /**
         * size of the GNA memory reserved for the concat
         */
        size_t reserved_size = 0;
        bool output_allocation_flag = false;
        /**
         * name and offset (from gna_ptr) of a layer connected to the concat
         */
        struct ConcatConnectedLayerInfo {
            ConcatConnectedLayerInfo(const std::string& n,
                                     size_t o) :
                                     name(n),
                                     offset(o) {}
            std::string name = "";
            size_t offset = 0;
        };

        std::vector<ConcatConnectedLayerInfo> concatInputLayers;
    };

    // Split, Slice
    class GNASplitLayer {
        InferenceEngine::CNNLayerPtr splitLayer;

     public:
        explicit GNASplitLayer(InferenceEngine::CNNLayerPtr layer) : splitLayer(layer), splitInputLayer() {}

        InferenceEngine::CNNLayerPtr getSplit() { return splitLayer; }
        /**
         * size of the GNA memory reserved for the split
         */
        size_t reserved_size = 0;
        bool output_allocation_flag = false;
        /**
         * name, offset (from the split buffer start), and pure size of a layer connected to the split
         */
        struct SplitConnectedLayerInfo {
            SplitConnectedLayerInfo() {}
            SplitConnectedLayerInfo(std::string& n,
                                    size_t o,
                                    size_t p) :
                                     name(n),
                                     offset(o),
                                     pure_size(p) {}

            SplitConnectedLayerInfo& operator=(SplitConnectedLayerInfo const& layerInfo) {
                this->name      = layerInfo.name;
                this->offset    = layerInfo.offset;
                this->pure_size = layerInfo.pure_size;
                return *this;
            }
            std::string name = "";
            size_t offset    = 0;
            size_t pure_size = 0;
        };
        SplitConnectedLayerInfo splitInputLayer;
        std::vector<SplitConnectedLayerInfo> splitOutputLayers;
    };

    class GNACropLayer {
        InferenceEngine::CNNLayerPtr cropLayer;

    public:
        explicit GNACropLayer(InferenceEngine::CNNLayerPtr layer) : cropLayer(layer) {}

        InferenceEngine::CNNLayerPtr getCrop() { return cropLayer; }
        /**
         * pointer to the beginning of the cropped GNA memory
         */
        void *gna_ptr = nullptr;
    };
    using MemoryConnection = std::list<std::pair<std::string, GNAMemoryLayer>>;
    using ConcatConnection = std::unordered_map<std::string, GNAConcatLayer>;
    using SplitConnection  = std::unordered_map<std::string, GNASplitLayer>;
    using CropConnection   = std::unordered_map<std::string, GNACropLayer>;
    // layers with extra storage for connections and additional
    // non-trivial processing
    MemoryConnection memory_connection;
    ConcatConnection concat_connection;
    SplitConnection  split_connection;
    CropConnection   crop_connection;
    void fillMemoryConnections(std::unordered_map<std::string,
                                 std::vector<InferenceEngine::CNNLayerPtr>> &memoryPairs);

    void fillConcatConnections(InferenceEngine::CNNLayerPtr layer);
    void fillSplitConnections(InferenceEngine::CNNLayerPtr layer);
    /**
     * Maps a layer name to its dnn.component; in topological order, so previous nodes are initialized first
     */
    using DnnComponentsForLayer = std::list<std::pair<std::string, intel_dnn_component_t>>;
    DnnComponentsForLayer dnnComponentsForLayer;

    /**
     * @brief Returns the corresponding dnn component for a topology layer
     * @param __layer - topology layer
     * @return pointer to the matching dnn component
     */
    intel_dnn_component_t * findDnnLayer(InferenceEngine::CNNLayerPtr __layer);

    using allocator_type = PolymorphAllocator<uint8_t>;
    using gna_memory_type = GNAMemory<allocator_type>;

    std::unique_ptr<GNADeviceHelper> gnadevice;
    /**
     * @brief size of RW segment without extra memory for parallel execution
     */
    uint32_t rwSegmentSize = 0;
    std::unique_ptr<gna_memory_type> gnamem;

    /**
     * Fills in the weights of the aligning filter (affine) layer
     * @param layer - filter layer pointer
     * @param ptrWeights - pointer to the weights memory
     * @param offset - memory before this offset will be zeroed
     * @param isQuantized - whether the layer is quantized
     */
    void FillWeightOfAligningFilter(InferenceEngine::CNNLayerPtr layer, void* ptrWeights, size_t offset, bool isQuantized = false);

    /**
     * Connects either a memory output or a generic output to a layer
     * @param layer - layer pointer
     * @param ptr_outputs - pointer to the pointer where the output layer information is stored
     * @param ptr_inputs - pointer to the layer inputs
     * @param sz - size of the output blob in bytes
     */
    void connectOutput(InferenceEngine::CNNLayerPtr layer, void *ptr_outputs, void *ptr_inputs, size_t sz);
    /**
     * Connects a certain input to this layer
     * @param layer - layer that the input is connected to
     * @param pVoid - pointer that holds the current layer pointer in the gna_mem request
     * @param num_data_bytes_in - size of the input data in bytes
     * @param offset - number of bytes to advance in the buffer
     * @param idx - index of the input port being connected
     * @return layer used as input
     */
    struct ConnectionDetails {
        InferenceEngine::CNNLayerPtr  input;
        bool needTransposeWeights = false;
        InferenceEngine::CNNLayerPtr permute;
        ConnectionDetails(InferenceEngine::CNNLayerPtr input,
                          bool bTranspose = false,
                          InferenceEngine::CNNLayerPtr permute = nullptr)
            : input(input)
            , needTransposeWeights(bTranspose)
            , permute(permute) {
        }
    };
    ConnectionDetails connectInput(InferenceEngine::CNNLayerPtr layer,
                      void *pVoid,
                      size_t num_data_bytes_in,
                      int32_t offset = 0,
                      int idx = 0);
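
    // Illustrative sketch of how a primitive might wire its buffers (hypothetical variable
    // names; a sketch of the intended usage, not a definitive implementation):
    //   void *ptr_inputs = nullptr, *ptr_outputs = nullptr;
    //   connectInput(layer, &ptr_inputs, num_data_bytes_in);
    //   connectOutput(layer, &ptr_outputs, &ptr_inputs, num_data_bytes_out);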

    void ImportFrames(void *ptr_dst,
                     const void *ptr_src,
                     InferenceEngine::Precision input_precision,
                     intel_dnn_orientation_t orientation,
                     uint32_t num_frames,
                     uint32_t num_group,
                     uint32_t num_vector_elements,
                     uint32_t num_vector_stride);

    void ExportScores(void *ptr_dst,
                     void *ptr_src,
                     intel_dnn_orientation_t orientation,
                     uint32_t num_frames,
                     uint32_t num_group,
                     uint32_t num_vector_elements,
                     uint32_t num_active_elements,
                     uint32_t num_vector_stride,
                     uint32_t num_bytes_per_element_input,
                     uint32_t num_bytes_per_element);

    friend void GNAPluginNS::ConvertToInt16(int16_t *ptr_dst,
                    const float *ptr_src,
                    const uint32_t num_rows,
                    const uint32_t num_columns,
                    const float scale_factor);
    friend void GNAPluginNS::ConvertToFloat(float *ptr_dst,
                    int32_t *ptr_src,
                    const uint32_t num_rows,
                    const uint32_t num_columns,
                    const float scale_factor);

    friend int16_t GNAPluginNS::ConvertFloatToInt16(float src);

    template <typename T, typename U>
    void copyInputData(T *dst,
                    const U *src,
                    uint32_t num_frames,
                    uint32_t num_group,
                    uint32_t num_vector_elements,
                    uint32_t num_vector_stride,
                    intel_dnn_orientation_t orientation);

    template <typename T, typename U>
    void copyInputDataWithSplit(T *const dst,
                    const U *src,
                    const GNASplitLayer& splitInfo,
                    size_t precision_size);
    /**
     * @brief GNA affine layers always have an activation attached, while IR layers may not
     */
    void insertIdentityLayer(std::vector<InferenceEngine::CNNLayerPtr> &layers);

    /**
     * @brief GNA does not support broadcast, so weights and biases are tiled for the ScaleShift layer
     */
    void substituteScaleShiftBroadCast(std::vector<InferenceEngine::CNNLayerPtr> &layers);


    /**
     * @brief GNA convolution layers use a deinterleaved layout, while affine layers do not,
     * so Model Optimizer inserts permute layers between convolution and affine layers.
     * Since GNA hardware already supports conv->affine in permuted form, this pass reverses
     * the MO behavior: it removes permutations of certain forms between conv->conv and conv->affine,
     * and inserts a permutation between conv->affine when it is missing in the IR.
     * @param layers
     */
    void reversePermutations(std::vector<InferenceEngine::CNNLayerPtr> &layers);


    /**
     * @brief Searches for a specific pattern in the graph (6 layers are replaced by a single one)
     * @param layers
     */
    void substitutePRelu(std::vector<InferenceEngine::CNNLayerPtr> &layers);

    std::vector<InferenceEngine::CNNLayerPtr> getCandidatesForIdentityInsertion(const InferenceEngine::CNNLayerPtr layer);

    /**
     * Diagonal layer insertion is required when an activation is followed by split layers or any other
     * topology-changing layers
     */
    void insertDiagonalLayer(std::vector<InferenceEngine::CNNLayerPtr> & layers);

    /**
     * @brief MaxPool can be reordered with the activation; on GNA the strategy is conv->maxpool->activation,
     * which means maxpool receives 4-byte values and produces 4-byte values
     */
    void reorderMaxPool(std::vector<InferenceEngine::CNNLayerPtr> & layers);

    /**
     * Copy layer insertion is required when an input layer does not have output memory
     */
    void insertCopyLayer(std::vector<InferenceEngine::CNNLayerPtr> & layers);

    /**
     * Aligning filter layer insertion is required when split/slice outputs are connected at unaligned addresses
     */
    void insertAligningFilterLayer(std::vector<InferenceEngine::CNNLayerPtr> & layers);

    intel_dnn_component_t * find_first_unused_input(InferenceEngine::CNNLayerPtr current);
    std::map<std::string, int> bytes_alllocated_for_input;
    InferenceEngine::InputsDataMap inputsDataMap;

    InferenceEngine::SizeVector outputDims;
    InferenceEngine::OutputsDataMap outputsDataMap;
};
}  // namespace GNAPluginNS