// Copyright (C) 2018-2019 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "cpp_interfaces/base/ie_plugin_base.hpp"
#include "gna_memory.hpp"
#include "gna_device.hpp"

#include <list>
#include <map>
#include <memory>
#include <string>
#include <tuple>
#include <unordered_map>
#include <utility>
#include <vector>

#include <gna-api-status.h>
#include <cpp_interfaces/interface/ie_iplugin_internal.hpp>
#include <cpp_interfaces/impl/ie_plugin_internal.hpp>
#include <cpp_interfaces/impl/ie_executable_network_thread_safe_default.hpp>
#include <graph_tools.hpp>
#include "gna_allocator.hpp"
#include "gna_api_wrapper.hpp"
#include "gna_plugin_policy.hpp"

namespace GNAPluginNS {

void ConvertToInt16(int16_t *ptr_dst,
                    const float *ptr_src,
                    const uint32_t num_rows,
                    const uint32_t num_columns,
                    const float scale_factor);
void ConvertToFloat(float *ptr_dst,
                    int32_t *ptr_src,
                    const uint32_t num_rows,
                    const uint32_t num_columns,
                    const float scale_factor);

int16_t ConvertFloatToInt16(float src);
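
// Illustrative sketch only (not necessarily the exact implementation): converting a
// float sample to the GNA int16 representation generally means scaling and then
// saturating to the int16 range, roughly:
//
//   float scaled = src * scale_factor;
//   if (scaled > 32767.0f)  scaled = 32767.0f;    // clamp to INT16_MAX
//   if (scaled < -32768.0f) scaled = -32768.0f;   // clamp to INT16_MIN
//   int16_t dst = static_cast<int16_t>(scaled);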

class GNAPlugin : public InferenceEngine::IInferencePluginInternal, public std::enable_shared_from_this<GNAPlugin> {
 protected:
    using dnn_ptr = std::shared_ptr<CPPWrapper<intel_nnet_type_t>>;

    /**
     * @brief copy of the nnet structure and an indicator that the related infer request has not been synced yet
     */
    std::vector<std::tuple<dnn_ptr, int32_t, InferenceEngine::BlobMap>> nnets;

    std::unordered_map<std::string, intel_dnn_orientation_t> orientation_in;
    intel_dnn_orientation_t orientation_out = kDnnUnknownOrientation;

    /**
     * @brief temporary solution to support multiple scale factors
     */
    float get_input_scale_factor() const;
    std::unordered_map<std::string, double> input_scale_factor;

    double output_scale_factor = 1.0;
    uint32_t num_rotate_rows = 0;
    uint32_t num_rotate_columns = 0;
    uint32_t num_feature_maps = 1;
    uint32_t num_memory_bytes;

    std::unordered_map<std::string, std::list<std::vector<void *>>::iterator> ptr_inputs_global_id;
    std::list<std::vector<void *>> ptr_inputs_global_storage;

    std::vector<void *>& get_ptr_inputs_global(std::string name);
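
    // Illustrative sketch of how a name-keyed lookup with stable storage like this usually
    // behaves (assumed, not necessarily the exact implementation): the list keeps each
    // per-input vector at a stable address, and the map stores an iterator into it.
    //
    //   auto it = ptr_inputs_global_id.find(name);
    //   if (it == ptr_inputs_global_id.end()) {
    //       ptr_inputs_global_storage.push_front({});   // create storage for a new input
    //       it = ptr_inputs_global_id.emplace(name, ptr_inputs_global_storage.begin()).first;
    //   }
    //   return *it->second;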

    std::vector<void *> ptr_outputs_global;

    uint32_t *ptr_active_indices = nullptr;
    uint32_t num_active_indices = 0;
    uint32_t num_group_in = 0;
    uint32_t num_bytes_weight;
    uint32_t num_bytes_per_output = 0;

    bool use_dynamic_quantization = false;
    bool compact_mode = true;
    bool exclusive_async_requests = false;
    bool uniformPwlDesign = false;
    uint8_t gna_lib_async_threads_num = 1;
    bool gna_openmp_multithreading = false;

    // precision of GNA hardware model
    InferenceEngine::Precision gnaPrecision = InferenceEngine::Precision::I16;

    bool performance_counting = false;

    intel_dnn_number_type_t output_type = kDnnInt;
    std::string utterance_name;

 public:
    explicit GNAPlugin(const std::map<std::string, std::string>& configMap);
    /**
     * @brief constructs the plugin when importing from an AOT blob rather than from a CNN network
     */
    GNAPlugin() = default;

    void LoadNetwork(InferenceEngine::ICNNNetwork &network) override;
    using InferenceEngine::IInferencePluginInternal::Infer;
    void Infer(const InferenceEngine::BlobMap &input, InferenceEngine::BlobMap &result) override;
    void GetPerformanceCounts(std::map<std::string, InferenceEngine::InferenceEngineProfileInfo> &perfMap) override;
    void AddExtension(InferenceEngine::IExtensionPtr extension) override;
    void SetConfig(const std::map<std::string, std::string> &config) override;
    void LoadNetwork(InferenceEngine::IExecutableNetwork::Ptr &executableNetwork,
                     InferenceEngine::ICNNNetwork &network,
                     const std::map<std::string, std::string> &config) override { THROW_GNA_EXCEPTION << "Not implemented"; }
    void Infer(const InferenceEngine::Blob &input, InferenceEngine::Blob &result) override;
    void SetLogCallback(InferenceEngine::IErrorListener &listener) override {}

    /**
     * @deprecated Use the version with the config parameter
     */
    void QueryNetwork(const InferenceEngine::ICNNNetwork &network,
                      InferenceEngine::QueryNetworkResult &res) const override;
    void QueryNetwork(const InferenceEngine::ICNNNetwork &network,
                      const std::map<std::string, std::string>& config,
                      InferenceEngine::QueryNetworkResult &res) const override;

    uint32_t QueueInference(const InferenceEngine::BlobMap &input, InferenceEngine::BlobMap &result);
    void Wait(uint32_t idx = 0);

    /**
     * @param sync - points to the GNA sync point to wait on
     * @param result - blob that receives the output of the synced request
     */
    void Wait(uint32_t sync, InferenceEngine::Blob &result);
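
    // A minimal usage sketch of the asynchronous path (illustrative only; the "inputs"
    // and "outputs" blob maps are hypothetical and assumed to be pre-allocated):
    //
    //   uint32_t reqIdx = plugin.QueueInference(inputs, outputs);  // enqueue on GNA
    //   plugin.Wait(reqIdx);                                       // block until "outputs" is filled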

    void Export(const std::string &fileName);
    InferenceEngine::IExecutableNetwork::Ptr ImportNetwork(const std::string &modelFileName
                                                           , const std::map<std::string, std::string> &config) override { THROW_GNA_EXCEPTION << "Not implemented"; }
    InferenceEngine::IExecutableNetwork::Ptr ImportNetwork(const std::string &modelFileName);
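
    // A sketch of the AOT flow these two methods enable (illustrative only; the file name
    // and the preceding LoadNetwork call are assumptions of this example):
    //
    //   plugin.LoadNetwork(network);                        // compile the model for GNA
    //   plugin.Export("model.gna");                         // dump the compiled blob
    //   auto exeNet = plugin.ImportNetwork("model.gna");    // later: restore without the IR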

    bool IsExclusiveAsyncRequests() { return exclusive_async_requests; }

    /**
     * @brief utility to provide input and output blobs externally, to be used by InferenceEngine request API clients
     */
    InferenceEngine::Blob::Ptr GetInputBlob(std::string name, InferenceEngine::Precision precision);
    InferenceEngine::Blob::Ptr GetOutputBlob(InferenceEngine::Precision precision);

    /**
     * @brief helpers to provide input and output info for an AOT network
     */
    InferenceEngine::InputsDataMap GetInputs() { return inputsDataMap; }
    InferenceEngine::OutputsDataMap GetOutputs() { return outputsDataMap; }
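
    // A minimal synchronous usage sketch (illustrative only; the "input"/"output" names,
    // the config map, and the pre-filled data are assumptions of this example):
    //
    //   GNAPlugin plugin(config);
    //   plugin.LoadNetwork(network);
    //   InferenceEngine::BlobMap inputs  = {{"input",  plugin.GetInputBlob("input", InferenceEngine::Precision::FP32)}};
    //   InferenceEngine::BlobMap outputs = {{"output", plugin.GetOutputBlob(InferenceEngine::Precision::FP32)}};
    //   plugin.Infer(inputs, outputs);   // results end up in the "outputs" blobs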

    std::vector<InferenceEngine::IMemoryStateInternal::Ptr> QueryState();

    void SetPolicy(Policy p) { policy = p; }

 protected:
    uint32_t num_cnn_rows_out = 0;

    std::string dumpXNNPath;
    intel_gna_proc_t gna_proc_type = static_cast<intel_gna_proc_t>(GNA_SOFTWARE & GNA_HARDWARE);

    void DumpXNNToFile() const;

    void CreateLayerPrimitive(InferenceEngine::CNNLayerPtr);
    void AffinePrimitive(InferenceEngine::CNNLayerPtr, bool isDiag = false);
    void AffineFilterPrimitive(InferenceEngine::CNNLayerPtr);
    void DiagonalPrimitive(InferenceEngine::CNNLayerPtr);
    void ConvolutionPrimitive(InferenceEngine::CNNLayerPtr);
    void PermutePrimitive(InferenceEngine::CNNLayerPtr);
    void PoolingPrimitive(InferenceEngine::CNNLayerPtr);
    void PowerPrimitive(InferenceEngine::CNNLayerPtr);
    void ConcatPrimitive(InferenceEngine::CNNLayerPtr);
    void CropPrimitive(InferenceEngine::CNNLayerPtr);
    void EltwisePrimitive(InferenceEngine::CNNLayerPtr);
    void SplitPrimitive(InferenceEngine::CNNLayerPtr);
    void SlicePrimitive(InferenceEngine::CNNLayerPtr);
    void PWLPrimitive(InferenceEngine::CNNLayerPtr);
    void CopyPrimitive(InferenceEngine::CNNLayerPtr);

    bool AreLayersSupported(InferenceEngine::ICNNNetwork& network, std::string& errMessage);
    LayerType LayerTypeFromStr(std::string const &str) const;

    /**
     * @brief maps the type of connection to its input and output layers, and also stores the GNA pointer for the allocation request
     */
    class GNAMemoryLayer {
        InferenceEngine::CNNLayerPtr inputLayer;
        InferenceEngine::CNNLayerPtr outputLayer;

     public:
        GNAMemoryLayer(InferenceEngine::CNNLayerPtr inLayer, InferenceEngine::CNNLayerPtr outLayer) :
            inputLayer(inLayer), outputLayer(outLayer) {
        }

        InferenceEngine::CNNLayerPtr getInput() { return inputLayer; }
        InferenceEngine::CNNLayerPtr getOutput() { return outputLayer; }

        /** pointer to the GNA memory request */
        void *gna_ptr = nullptr;
        /** this much GNA memory is reserved */
        size_t reserved_size = 0;
        /** offset of the reserved GNA memory from gna_ptr */
        size_t reserved_offset = 0;
    };

    /** tracks a concat layer, its reserved GNA memory, and the inputs connected to it */
    class GNAConcatLayer {
        InferenceEngine::CNNLayerPtr concatLayer;

     public:
        explicit GNAConcatLayer(InferenceEngine::CNNLayerPtr layer) :
            concatLayer(layer) {
        }

        InferenceEngine::CNNLayerPtr getConcat() { return concatLayer; }
        /** pointer to the GNA memory request */
        void *gna_ptr = nullptr;
        /** this much GNA memory is reserved for the concat */
        size_t reserved_size = 0;
        bool output_allocation_flag = false;

        /** offset of the GNA memory from gna_ptr, per connected input */
        struct ConcatConnectedLayerInfo {
            ConcatConnectedLayerInfo(const std::string& n,
                                     size_t o) : name(n), offset(o) {}
            std::string name = "";
            size_t offset = 0;
        };

        std::vector<ConcatConnectedLayerInfo> concatInputLayers;
    };

    /** tracks a split/slice layer, its reserved GNA memory, and the layers connected to it */
    class GNASplitLayer {
        InferenceEngine::CNNLayerPtr splitLayer;

     public:
        explicit GNASplitLayer(InferenceEngine::CNNLayerPtr layer) :
            splitLayer(layer) {
        }

        InferenceEngine::CNNLayerPtr getSplit() { return splitLayer; }
        /** this much GNA memory is reserved for the split */
        size_t reserved_size = 0;
        bool output_allocation_flag = false;

        /** offset of the GNA memory from gna_ptr, per connected layer */
        struct SplitConnectedLayerInfo {
            SplitConnectedLayerInfo() {}
            SplitConnectedLayerInfo(std::string& n,
                                    size_t o,
                                    size_t p) : name(n), offset(o), pure_size(p) {}

            SplitConnectedLayerInfo& operator=
                (SplitConnectedLayerInfo const& layerInfo) {
                this->name = layerInfo.name;
                this->offset = layerInfo.offset;
                this->pure_size = layerInfo.pure_size;
                return *this;
            }

            std::string name = "";
            size_t offset = 0;
            size_t pure_size = 0;
        };

        SplitConnectedLayerInfo splitInputLayer;
        std::vector<SplitConnectedLayerInfo> splitOutputLayers;
    };

    class GNACropLayer {
        InferenceEngine::CNNLayerPtr cropLayer;
     public:
        explicit GNACropLayer(InferenceEngine::CNNLayerPtr layer) :
            cropLayer(layer) {}
        InferenceEngine::CNNLayerPtr getCrop() { return cropLayer; }
        /** pointer to the beginning of the cropped GNA memory */
        void *gna_ptr = nullptr;
    };

    using MemoryConnection = std::list<std::pair<std::string, GNAMemoryLayer>>;
    using ConcatConnection = std::unordered_map<std::string, GNAConcatLayer>;
    using SplitConnection = std::unordered_map<std::string, GNASplitLayer>;
    using CropConnection = std::unordered_map<std::string, GNACropLayer>;

    // layers with extra storage for connections and additional non-trivial processing
    MemoryConnection memory_connection;
    ConcatConnection concat_connection;
    SplitConnection split_connection;
    CropConnection crop_connection;

    void fillMemoryConnections(std::unordered_map<std::string,
                               std::vector<InferenceEngine::CNNLayerPtr>> &memoryPairs);
    void fillConcatConnections(InferenceEngine::CNNLayerPtr layer);
    void fillSplitConnections(InferenceEngine::CNNLayerPtr layer);

    /**
     * @brief maps a layer name to its dnn component; in topological order previous nodes are initialized first
     */
    using DnnComponentsForLayer = std::list<std::pair<std::string, intel_dnn_component_t>>;
    DnnComponentsForLayer dnnComponentsForLayer;

    /**
     * @brief returns the corresponding dnn component for a topology layer
     */
    intel_dnn_component_t * findDnnLayer(InferenceEngine::CNNLayerPtr __layer);
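
    // Illustrative sketch of what such a lookup typically amounts to (assumed, not
    // necessarily the exact implementation): a linear search of the list by layer name.
    //
    //   auto it = std::find_if(dnnComponentsForLayer.begin(), dnnComponentsForLayer.end(),
    //                          [&](const auto &p) { return p.first == layer->name; });
    //   return it == dnnComponentsForLayer.end() ? nullptr : &it->second;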

    using allocator_type = PolymorphAllocator<uint8_t>;
    using gna_memory_type = GNAMemory<allocator_type>;

    std::unique_ptr<GNADeviceHelper> gnadevice;
    /**
     * @brief size of the RW segment without the extra memory needed for parallel execution
     */
    uint32_t rwSegmentSize = 0;
    std::unique_ptr<gna_memory_type> gnamem;

    /**
     * @brief fills in the weights of an affine aligning-filter layer
     * @param layer - affine layer pointer
     * @param ptrWeights - pointer to the weights memory
     * @param offset - memory before this offset value will be zeroed
     * @param isQuantized - information about layer quantization
     */
    void FillWeightOfAligningFilter(InferenceEngine::CNNLayerPtr layer, void* ptrWeights, size_t offset, bool isQuantized = false);

    /**
     * @brief connects either a memory output or a generic output to a layer
     * @param layer - layer pointer
     * @param ptr_outputs - pointer to the pointer where the output layer information is stored
     * @param ptr_inputs - pointer to the layer inputs
     * @param sz - size of the output blob
     */
    void connectOutput(InferenceEngine::CNNLayerPtr layer, void *ptr_outputs, void *ptr_inputs, size_t sz);

    /**
     * @brief connects a certain input to this layer
     * @param layer - layer that we connect the input to
     * @param pVoid - pointer that holds the current layer pointer in the gna_mem request
     * @param num_data_bytes_in - size of the input data in bytes
     * @param offset - number of bytes to advance in the buffer
     * @param idx - index of the input port that we are connecting
     * @return layer used as input
     */
    struct ConnectionDetails {
        InferenceEngine::CNNLayerPtr input;
        bool needTransposeWeights = false;
        InferenceEngine::CNNLayerPtr permute;
        ConnectionDetails(InferenceEngine::CNNLayerPtr input,
                          bool bTranspose = false,
                          InferenceEngine::CNNLayerPtr permute = nullptr)
            : input(input)
            , needTransposeWeights(bTranspose)
            , permute(permute) {
        }
    };
    ConnectionDetails connectInput(InferenceEngine::CNNLayerPtr layer,
                                   void *pVoid,
                                   size_t num_data_bytes_in,
                                   size_t offset = 0,
                                   int idx = 0);

    void ImportFrames(void *ptr_dst,
                      InferenceEngine::Precision input_precision,
                      intel_dnn_orientation_t orientation,
                      uint32_t num_vector_elements,
                      uint32_t num_vector_stride);

    void ExportScores(void *ptr_dst,
                      intel_dnn_orientation_t orientation,
                      uint32_t num_vector_elements,
                      uint32_t num_active_elements,
                      uint32_t num_vector_stride,
                      uint32_t num_bytes_per_element_input,
                      uint32_t num_bytes_per_element);
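
    // Illustrative sketch of the orientation handling these helpers deal with (assumed
    // semantics, not necessarily the exact implementation): an interleaved buffer keeps one
    // element index contiguous across all frames, while a deinterleaved buffer keeps whole
    // frames contiguous, so exporting is essentially a strided, transposing copy:
    //
    //   for (uint32_t f = 0; f < num_frames; ++f)                      // num_frames is assumed here
    //       for (uint32_t i = 0; i < num_vector_elements; ++i)
    //           dst[i * num_frames + f] = src[f * num_vector_stride + i];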

    friend void GNAPluginNS::ConvertToInt16(int16_t *ptr_dst,
                                            const float *ptr_src,
                                            const uint32_t num_rows,
                                            const uint32_t num_columns,
                                            const float scale_factor);
    friend void GNAPluginNS::ConvertToFloat(float *ptr_dst,
                                            int32_t *ptr_src,
                                            const uint32_t num_rows,
                                            const uint32_t num_columns,
                                            const float scale_factor);
    friend int16_t GNAPluginNS::ConvertFloatToInt16(float src);

    template <typename T, typename U>
    void copyInputData(T *dst,
                       const U *src,
                       uint32_t num_vector_elements,
                       uint32_t num_vector_stride,
                       intel_dnn_orientation_t orientation);

    template <typename T, typename U>
    void copyInputDataWithSplit(T *const dst,
                                const U *src,
                                const GNASplitLayer& splitInfo,
                                size_t precision_size);

    /**
     * @brief GNA affine layers always have an activation attached, while IR layers may not, so identity activations are inserted where needed
     */
    void insertIdentityLayer(std::vector<InferenceEngine::CNNLayerPtr> &layers);

    /**
     * @brief GNA does not support broadcast, so the weights and biases of a ScaleShift layer are tiled instead
     */
    void substituteScaleShiftBroadCast(std::vector<InferenceEngine::CNNLayerPtr> &layers);
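
    // A minimal sketch of the tiling idea (illustrative only; the buffer names and sizes are
    // assumptions): the per-channel weight vector is repeated for every spatial position so
    // the ScaleShift becomes an ordinary elementwise multiply over the whole tensor.
    //
    //   for (size_t pos = 0; pos < num_positions; ++pos)
    //       for (size_t c = 0; c < num_channels; ++c)
    //           tiled_weights[pos * num_channels + c] = weights[c];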

    /**
     * @brief GNA convolution layers use a deinterleaved layout, while affine layers do not,
     * so permute layers need to be inserted between convolution and affine layers;
     * the current MO approach is to insert such permutations.
     * Since GNA hardware already supports conv->affine in permuted form, this pass inverts the MO behavior:
     * it removes permutations of a certain form between conv->conv and conv->affine,
     * and inserts a permutation between conv->affine if it is missing in the IR
     */
    void reversePermutations(std::vector<InferenceEngine::CNNLayerPtr> &layers);

    /**
     * @brief searches for a specific PRelu pattern in the graph (6 layers are replaced by a single one)
     */
    void substitutePRelu(std::vector<InferenceEngine::CNNLayerPtr> &layers);

    std::vector<InferenceEngine::CNNLayerPtr> getCandidatesForIdentityInsertion(const InferenceEngine::CNNLayerPtr layer);

    /**
     * @brief diagonal layer insertion is required in cases where an activation is followed by a split layer, or any other topology-changing layer
     */
    void insertDiagonalLayer(std::vector<InferenceEngine::CNNLayerPtr> & layers);

    /**
     * @brief MaxPool can be reordered with an activation; on GNA the strategy is to have conv->maxpool->activation,
     * which means maxpool receives 4-byte values and produces 4-byte values
     */
    void reorderMaxPool(std::vector<InferenceEngine::CNNLayerPtr> & layers);

    /**
     * @brief copy layer insertion is required in cases where the input layer does not have output memory
     */
    void insertCopyLayer(std::vector<InferenceEngine::CNNLayerPtr> & layers);

    /**
     * @brief aligning filter layer insertion is required when split/slice outputs are connected at unaligned addresses
     */
    void insertAligningFilterLayer(std::vector<InferenceEngine::CNNLayerPtr> & layers);

    intel_dnn_component_t * find_first_unused_input(InferenceEngine::CNNLayerPtr current);

    std::map<std::string, int> bytes_alllocated_for_input;
    InferenceEngine::InputsDataMap inputsDataMap;
    InferenceEngine::SizeVector outputDims;
    InferenceEngine::OutputsDataMap outputsDataMap;
};

}  // namespace GNAPluginNS