1 // Copyright (C) 2018-2019 Intel Corporation
2 // SPDX-License-Identifier: Apache-2.0
11 #include <xmmintrin.h>
17 #include <type_traits>
// Compile-time constants used by the DNN code. Their consumers are not visible
// in this excerpt, so the notes below are read from the names and values only.
// NOTE(review): confirm the intended meaning against the code that uses them.
// Presumably upper bounds on batch size, input count and output count.
21 #define DNN_MAX_BATCH_SIZE 8
22 #define DNN_MAX_INPUTS 3072
23 #define DNN_MAX_OUTPUTS 8192
// Presumably an error tolerance for floating-point comparisons - TODO confirm.
24 #define DNN_MAX_ERROR 1.0e-4f
// Presumably byte widths of integer bias and affine-output values - TODO confirm.
25 #define DNN_NUM_BYTES_INT_BIAS 4
26 #define DNN_NUM_BYTES_INT_AFFINE_OUT 4
// Presumably amplitudes for randomly generated values, one per number type - TODO confirm.
27 #define DNN_RAND_INT8_AMPLITUDE 127.0f
28 #define DNN_RAND_INT16_AMPLITUDE 16384.0f
29 #define DNN_RAND_INT32_AMPLITUDE 1048576.0f
30 #define DNN_RAND_FLOAT32_AMPLITUDE 8.0f
// Identifies the kind of activation function; this is the type of
// DnnActivation::type. Only part of the enumerator list is visible in this excerpt.
32 enum DnnActivationType {
39 kActKaldiLstmClipping,
// Describes an activation: its type plus parameters. fromType() below assigns
// `type` and `negative_slope`; the declaration of negative_slope is on a line
// not visible in this excerpt.
43 struct DnnActivation {
45 DnnActivationType type;
// Implicit conversion to DnnActivationType (body not visible here; presumably
// returns `type` - TODO confirm).
47 operator DnnActivationType () const noexcept {
// Creates a local DnnActivation, assigns the requested type and sets
// negative_slope to 0 (the return statement is not visible in this excerpt).
// Presumably a factory function is used instead of a constructor so that the
// struct stays trivial, as the static_assert on DnnActivation in this file demands.
50 static DnnActivation fromType(DnnActivationType type) {
51 DnnActivation activation;
52 activation.type = type;
53 activation.negative_slope = 0.0f;
// DnnActivation must remain a trivial type; compilation fails with this message otherwise.
58 static_assert(std::is_trivial<DnnActivation>::value, "DnnActivation is not a trivial type");
// Printable names for activation types; the array has kActNumType entries.
// Presumably indexed by DnnActivationType with entries in enumerator order -
// TODO confirm against the full enum.
60 static const char *intel_dnn_activation_name[kActNumType] = {
67 "kActKaldiLstmClipping",
// Softmax variants; the enum is also available under the typedef name
// intel_dnn_softmax_type_t. Only part of the enumerator list is visible here.
71 typedef enum DnnSoftmaxType {
73 kSoftmaxKaldiSumgroup,
77 } intel_dnn_softmax_type_t;
// Printable names for softmax types; the array has kSoftmaxNumType entries.
79 static const char *intel_dnn_softmax_name[kSoftmaxNumType] = {
// NOTE(review): this string is spelled "SumGroup" while the enumerator in
// DnnSoftmaxType is kSoftmaxKaldiSumgroup. Left unchanged because the strings
// may be written to or matched against external files - confirm before normalizing.
81 "kSoftmaxKaldiSumGroup",
82 "kSoftmaxKaldiApplyLog",
// Data orientation values (the opening line of this definition is not visible
// in this excerpt). intel_dnn_orientation_t is the type of the orientation_in
// and orientation_out fields of intel_dnn_component_t.
87 kDnnUnknownOrientation,
88 kDnnInterleavedOrientation,
89 kDnnNonInterleavedOrientation,
91 } intel_dnn_orientation_t;
// Operation kinds (only part of the list is visible in this excerpt).
// intel_dnn_operation_t is the type of intel_dnn_component_t::operation.
97 kDnnConvolutional1dOp,
98 kDnnPiecewiselinearOp,
105 } intel_dnn_operation_t;
// Printable names for operations; the array has kDnnNumOp entries. Presumably
// indexed by intel_dnn_operation_t in enumerator order - TODO confirm.
107 static const char *intel_dnn_operation_name[kDnnNumOp] = {
111 "kDnnConvolutional1dOp",
112 "kDnnPiecewiselinearOp",
116 "kDnnDeinterleaveOp",
// Closes the definition of intel_dnn_macro_operation_t; its members are not
// visible in this excerpt.
125 } intel_dnn_macro_operation_t;
// Printable names for macro operations; the array has kDnnNumMacroOp entries
// (the entries themselves are not visible in this excerpt).
127 static const char *intel_dnn_macro_operation_name[kDnnNumMacroOp] = {
// Closes the definition of intel_dnn_number_type_t; its members are not
// visible in this excerpt.
137 } intel_dnn_number_type_t;
// Printable names for number types; the array has kDnnNumNumberType entries
// (the entries themselves are not visible in this excerpt).
139 static const char *intel_dnn_number_type_name[kDnnNumNumberType] = {
// Fields of an operation-parameter struct whose opening and closing lines are
// not visible in this excerpt (presumably intel_affine_t - TODO confirm):
// byte widths of weights and biases, and the weight scale factor.
145 uint32_t num_bytes_per_weight;
146 uint32_t num_bytes_per_bias;
147 float weight_scale_factor;
// Parameters of a convolution operation (intel_convolutionalD_t); this type is
// the conv1D member of the operation union in intel_dnn_component_t. The opening
// line of the struct and at least one field are not visible in this excerpt.
153 uint32_t num_bytes_per_weight;
154 uint32_t num_bytes_per_bias;
// Filter geometry.
155 uint32_t num_filters;
156 uint32_t num_filter_rows;
157 uint32_t num_filter_coefficients;
// Feature-map geometry.
158 uint32_t num_feature_maps;
159 uint32_t num_feature_map_rows;
160 uint32_t num_feature_map_columns;
161 float weight_scale_factor;
162 void *ptr_filters; // filters stored one after the other
164 } intel_convolutionalD_t;
// Pooling parameters; the enclosing struct's opening and closing lines are not
// visible in this excerpt (presumably intel_maxpool_t - TODO confirm).
167 uint32_t num_inputs; // pool size
168 uint32_t num_inputs_step; // pool step
169 uint32_t num_inputs_stride; // pool stride (number of convolution filters)
// Parameters of a piecewise-linear operation (intel_piecewiselinear_t); this type
// is the pwl member of the operation union in intel_dnn_component_t. The opening
// line of the struct is not visible in this excerpt.
174 DnnActivation func_id; // identifies function being approximated
// Number of entries in ptr_segments - presumably; TODO confirm against users.
175 uint32_t num_segments;
176 intel_pwl_segment_t *ptr_segments;
177 } intel_piecewiselinear_t;
// Fields of an operation-parameter struct whose opening and closing lines are
// not visible in this excerpt (presumably intel_recurrent_t - TODO confirm).
180 uint32_t num_vector_delay;
181 uint32_t num_bytes_per_weight;
182 uint32_t num_bytes_per_bias;
183 float weight_scale_factor;
// Closing lines of the intel_interleave_t and intel_deinterleave_t definitions;
// their members are not visible in this excerpt. Both types appear as members
// of the operation union in intel_dnn_component_t.
190 } intel_interleave_t;
193 } intel_deinterleave_t;
// Copy-operation extents; the enclosing struct's opening and closing lines are
// not visible in this excerpt.
196 uint32_t num_copy_columns; // number of columns to copy
197 uint32_t num_copy_rows; // number of rows to copy
// One network component (intel_dnn_component_t): input/output dimensions,
// element byte widths, the operation performed, data orientations and
// operation-specific parameters. The opening line of the struct and a few
// members are not visible in this excerpt.
201 uint32_t num_rows_in;
202 uint32_t num_columns_in;
203 uint32_t num_rows_out;
204 uint32_t num_columns_out;
205 uint32_t num_bytes_per_input;
206 uint32_t num_bytes_per_output;
207 intel_dnn_operation_t operation;
208 intel_dnn_macro_operation_t macro_operation;
209 intel_dnn_orientation_t orientation_in;
210 intel_dnn_orientation_t orientation_out;
// Operation-specific parameters. Presumably the member matching `operation`
// is the one in use - TODO confirm against the code that fills this union.
211 union operation_struct_t {
212 intel_affine_t affine;
213 intel_convolutionalD_t conv1D;
214 intel_maxpool_t maxpool;
215 intel_piecewiselinear_t pwl;
216 intel_recurrent_t recurrent;
217 intel_interleave_t interleave;
218 intel_deinterleave_t deinterleave;
223 float output_scale_factor;
224 } intel_dnn_component_t;
// Error accumulators of intel_score_error_t (other members and the opening
// line are not visible in this excerpt). Used by CompareScores, ClearScoreError
// and UpdateScoreError declared in this file.
234 float sum_squared_error;
237 float sum_squared_rel_error;
238 } intel_score_error_t;
// Member-initializer list of a constructor whose header line is not visible in
// this excerpt. Pointers start as NULL, counters as 0, the rotate flag as false,
// the softmax type as kSoftmaxNone and the input scale factor as 1.0.
// NOTE(review): the order written here (ptr_active_outputs_ first,
// ptr_dnn_memory_ last) differs from the order in which these members are
// declared further down in the class; C++ initializes members in declaration
// order regardless of this list, which compilers report under -Wreorder.
243 : ptr_active_outputs_(NULL),
244 num_active_outputs_(0),
245 input_scale_factor_(1.0),
247 num_right_context(0),
248 do_rotate_input(false),
// num_rotate_columns also has a default member initializer (= 0) at its declaration.
250 num_rotate_columns(0),
251 softmax_type(kSoftmaxNone),
252 ptr_sumgroup_sizes(NULL),
253 num_sumgroup_sizes(0),
255 ptr_dnn_memory_(NULL) {
// Cleanup fragment (presumably the destructor body; its header line is not
// visible in this excerpt). Releases ptr_sumgroup_sizes and ptr_priors when they
// are non-null. Because _mm_free is used, these buffers are expected to come
// from _mm_malloc - the allocation sites are not visible here; TODO confirm.
260 if (ptr_sumgroup_sizes != NULL) {
261 _mm_free(ptr_sumgroup_sizes);
263 if (ptr_priors != NULL) {
264 _mm_free(ptr_priors);
// Returns the number of entries in `component`, narrowed to uint32_t.
268 uint32_t num_components() { return static_cast<uint32_t>(component.size()); }
// Declarations only; the implementations are not part of this excerpt.
// Init: presumably attaches an externally provided memory block of
// num_memory_bytes bytes and records the number type and input scale factor -
// TODO confirm against the definition.
270 void Init(void *ptr_memory, uint32_t num_memory_bytes, intel_dnn_number_type_t number_type, float scale_factor);
// InitActiveList: presumably sets up the active-output list from ptr_active_list -
// TODO confirm against the definition.
271 void InitActiveList(uint32_t *ptr_active_list);
// Static overload that configures the component passed in as `comp` by
// forwarding to InitAffineComponentPrivate. Some parameters (including the
// pointer arguments) are declared on lines not visible in this excerpt.
// A, B, C, D are presumably the element types of the inputs, outputs, weights
// and biases pointers - TODO confirm.
273 template<class A, class B, class C, class D>
274 static void InitAffineComponent(intel_dnn_component_t &comp,
275 uint32_t num_rows_in,
276 uint32_t num_columns,
277 uint32_t num_rows_out,
278 uint32_t num_bytes_per_input,
279 uint32_t num_bytes_per_output,
280 uint32_t num_bytes_per_weight,
281 uint32_t num_bytes_per_bias,
282 float weight_scale_factor,
283 float output_scale_factor,
288 bool isDiag = false) {
289 InitAffineComponentPrivate(comp,
294 num_bytes_per_output,
295 num_bytes_per_weight,
// The pointer arguments are handed over as `void *&` through C-style casts,
// i.e. the callee receives references to these pointer variables themselves.
299 (void *&) ptr_inputs,
300 (void *&) ptr_outputs,
301 (void *&) ptr_weights,
302 (void *&) ptr_biases,
// Index-based overload: configures component[component_index] by forwarding to
// InitAffineComponentPrivate. component_index is used with
// std::vector::operator[], so it is not range-checked here. Some parameters
// (including the pointer arguments) are declared on lines not visible in this
// excerpt.
307 template<class A, class B, class C, class D>
308 void InitAffineComponent(uint32_t component_index,
309 uint32_t num_rows_in,
310 uint32_t num_columns,
311 uint32_t num_rows_out,
312 uint32_t num_bytes_per_input,
313 uint32_t num_bytes_per_output,
314 uint32_t num_bytes_per_weight,
315 uint32_t num_bytes_per_bias,
316 float weight_scale_factor,
317 float output_scale_factor,
322 bool isDiag = false) {
323 InitAffineComponentPrivate(component[component_index],
328 num_bytes_per_output,
329 num_bytes_per_weight,
// The pointer arguments are handed over as `void *&` through C-style casts.
333 (void *&) ptr_inputs,
334 (void *&) ptr_outputs,
335 (void *&) ptr_weights,
336 (void *&) ptr_biases,
// Signature fragment of InitDiagonalComponent; the rest of the parameter list
// and any body are not visible in this excerpt. The visible parameters mirror
// those of the index-based InitAffineComponent overload.
341 void InitDiagonalComponent(uint32_t component_index,
342 uint32_t num_rows_in,
343 uint32_t num_columns,
344 uint32_t num_rows_out,
345 uint32_t num_bytes_per_input,
346 uint32_t num_bytes_per_output,
347 uint32_t num_bytes_per_weight,
348 uint32_t num_bytes_per_bias,
349 float weight_scale_factor,
350 float output_scale_factor,
// Index-based overload: configures component[component_index] as a 1-D
// convolution by forwarding to InitConvolutional1DComponentPrivate.
// component_index is used with std::vector::operator[], so it is not
// range-checked here. The pointer parameters are declared on lines not visible
// in this excerpt.
356 template<class A, class B, class C, class D>
357 void InitConvolutional1DComponent(uint32_t component_index,
358 uint32_t num_rows_in,
359 uint32_t num_columns_in,
360 uint32_t num_rows_out,
361 uint32_t num_columns_out,
362 uint32_t num_bytes_per_input,
363 uint32_t num_bytes_per_output,
364 uint32_t num_bytes_per_weight,
365 uint32_t num_bytes_per_bias,
366 uint32_t num_filters,
367 uint32_t num_filter_rows,
368 uint32_t num_filter_coefficients,
369 uint32_t num_feature_maps,
370 uint32_t num_feature_map_rows,
371 uint32_t num_feature_map_columns,
372 float weight_scale_factor,
373 float output_scale_factor,
378 InitConvolutional1DComponentPrivate(component[component_index],
384 num_bytes_per_output,
385 num_bytes_per_weight,
389 num_filter_coefficients,
391 num_feature_map_rows,
392 num_feature_map_columns,
// The pointer arguments are handed over as `void *&` through C-style casts.
395 (void *&) ptr_inputs,
396 (void *&) ptr_outputs,
397 (void *&) ptr_filters,
398 (void *&) ptr_biases,
// Static overload: configures the component passed in as `comp` as a 1-D
// convolution by forwarding to InitConvolutional1DComponentPrivate. The pointer
// parameters are declared on lines not visible in this excerpt.
402 template<class A, class B, class C, class D>
403 static void InitConvolutional1DComponent(intel_dnn_component_t &comp,
404 uint32_t num_rows_in,
405 uint32_t num_columns_in,
406 uint32_t num_rows_out,
407 uint32_t num_columns_out,
408 uint32_t num_bytes_per_input,
409 uint32_t num_bytes_per_output,
410 uint32_t num_bytes_per_weight,
411 uint32_t num_bytes_per_bias,
412 uint32_t num_filters,
413 uint32_t num_filter_rows,
414 uint32_t num_filter_coefficients,
415 uint32_t num_feature_maps,
416 uint32_t num_feature_map_rows,
417 uint32_t num_feature_map_columns,
418 float weight_scale_factor,
419 float output_scale_factor,
424 InitConvolutional1DComponentPrivate(comp,
430 num_bytes_per_output,
431 num_bytes_per_weight,
435 num_filter_coefficients,
437 num_feature_map_rows,
438 num_feature_map_columns,
// The pointer arguments are handed over as `void *&` through C-style casts.
441 (void *&) ptr_inputs,
442 (void *&) ptr_outputs,
443 (void *&) ptr_filters,
444 (void *&) ptr_biases,
450 // TODO: this overload, which accepts component_index, is only used in legacy code
// Index-based overload: configures component[component_index] as a max-pooling
// component by forwarding to InitMaxpoolComponentPrivate. component_index is
// used with std::vector::operator[], so it is not range-checked here. Some
// parameters are declared on lines not visible in this excerpt.
451 void InitMaxpoolComponent(uint32_t component_index,
452 uint32_t num_rows_in,
453 uint32_t num_columns_in,
454 uint32_t num_rows_out,
455 uint32_t num_columns_out,
456 uint32_t num_bytes_per_input,
457 uint32_t num_bytes_per_output,
458 uint32_t num_pool_size,
459 uint32_t num_pool_step,
460 uint32_t num_pool_stride,
462 float output_scale_factor,
// ptr_outputs is taken by value as a plain `void *` in this overload.
464 void * ptr_outputs) {
465 InitMaxpoolComponentPrivate(component[component_index],
471 num_bytes_per_output,
// Passed on as `void *&`; for ptr_outputs the reference binds to this
// function's own parameter copy.
477 (void *&) ptr_inputs,
478 (void *&) ptr_outputs,
// Static overload: configures the component passed in as `cmp` as a
// max-pooling component by forwarding to InitMaxpoolComponentPrivate. The
// pointer parameters are declared on lines not visible in this excerpt;
// A and B are presumably their element types - TODO confirm.
482 template<class A, class B>
483 static void InitMaxpoolComponent(intel_dnn_component_t &cmp,
484 uint32_t num_rows_in,
485 uint32_t num_columns_in,
486 uint32_t num_rows_out,
487 uint32_t num_columns_out,
488 uint32_t num_bytes_per_input,
489 uint32_t num_bytes_per_output,
490 uint32_t num_pool_size,
491 uint32_t num_pool_step,
492 uint32_t num_pool_stride,
494 float output_scale_factor,
497 InitMaxpoolComponentPrivate(cmp,
503 num_bytes_per_output,
// The pointer arguments are handed over as `void *&` through C-style casts.
509 (void *&) ptr_inputs,
510 (void *&) ptr_outputs,
// Index-based overload: configures component[component_index] as a
// piecewise-linear component by forwarding to
// InitPiecewiseLinearComponentPrivate. component_index is used with
// std::vector::operator[], so it is not range-checked here. Some parameters are
// declared on lines not visible in this excerpt.
517 void InitPiecewiseLinearComponent(uint32_t component_index,
518 DnnActivation function_id,
519 intel_dnn_orientation_t orientation,
521 uint32_t num_columns,
522 uint32_t num_bytes_per_input,
523 uint32_t num_bytes_per_output,
524 uint32_t num_segments,
525 float output_scale_factor,
528 intel_pwl_segment_t *ptr_segments) {
529 InitPiecewiseLinearComponentPrivate(component[component_index],
535 num_bytes_per_output,
// Static overload: configures the component passed in as `cmp` as a
// piecewise-linear component by forwarding to
// InitPiecewiseLinearComponentPrivate. Some parameters (including the
// input/output pointers) are declared on lines not visible in this excerpt.
543 template<class A, class B>
544 static void InitPiecewiseLinearComponent(intel_dnn_component_t &cmp,
545 DnnActivation function_id,
546 intel_dnn_orientation_t orientation,
548 uint32_t num_columns,
549 uint32_t num_bytes_per_input,
550 uint32_t num_bytes_per_output,
551 uint32_t num_segments,
552 float output_scale_factor,
555 intel_pwl_segment_t *ptr_segments) {
556 InitPiecewiseLinearComponentPrivate(cmp,
562 num_bytes_per_output,
// The pointer arguments are handed over as `void *&` through C-style casts.
565 (void *&) ptr_inputs,
566 (void *&) ptr_outputs,
// Signature fragment of InitRecurrentComponent; the rest of the parameter list
// and any body are not visible in this excerpt.
572 void InitRecurrentComponent(uint32_t component_index,
574 uint32_t num_columns_in,
575 uint32_t num_columns_out,
576 uint32_t num_bytes_per_input,
577 uint32_t num_bytes_per_output,
578 uint32_t num_vector_delay,
579 uint32_t num_bytes_per_weight,
580 uint32_t num_bytes_per_bias,
581 float weight_scale_factor,
582 float output_scale_factor,
// Signature fragment of InitInterleaveComponent; the rest of the parameter list
// and any body are not visible in this excerpt.
588 void InitInterleaveComponent(uint32_t component_index,
590 uint32_t num_columns,
591 uint32_t num_bytes_per_input,
592 uint32_t num_bytes_per_output,
593 float output_scale_factor,
// Signature fragment of InitDeinterleaveComponent; the rest of the parameter
// list and any body are not visible in this excerpt.
596 void InitDeinterleaveComponent(uint32_t component_index,
598 uint32_t num_columns,
599 uint32_t num_bytes_per_input,
600 uint32_t num_bytes_per_output,
601 float output_scale_factor,
// Index-based overload: configures component[component_index] as a copy
// component by forwarding to InitCopyComponentPrivate. component_index is used
// with std::vector::operator[], so it is not range-checked here. The pointer
// parameters are declared on lines not visible in this excerpt.
604 void InitCopyComponent(uint32_t component_index,
605 intel_dnn_orientation_t orientation,
606 uint32_t num_rows_in,
607 uint32_t num_columns_in,
608 uint32_t num_rows_out,
609 uint32_t num_columns_out,
610 uint32_t num_bytes_per_input,
611 uint32_t num_bytes_per_output,
612 float output_scale_factor,
613 uint32_t num_copy_rows,
614 uint32_t num_copy_columns,
617 InitCopyComponentPrivate(component[component_index],
624 num_bytes_per_output,
// Static overload: configures the component passed in as `cmp` as a copy
// component by forwarding to InitCopyComponentPrivate. The pointer parameters
// are declared on lines not visible in this excerpt; A and B are presumably
// their element types - TODO confirm.
633 template<class A, class B>
634 static void InitCopyComponent(intel_dnn_component_t &cmp,
635 intel_dnn_orientation_t orientation,
636 uint32_t num_rows_in,
637 uint32_t num_columns_in,
638 uint32_t num_rows_out,
639 uint32_t num_columns_out,
640 uint32_t num_bytes_per_input,
641 uint32_t num_bytes_per_output,
642 float output_scale_factor,
643 uint32_t num_copy_rows,
644 uint32_t num_copy_columns,
647 InitCopyComponentPrivate(cmp,
654 num_bytes_per_output,
// The pointer arguments are handed over as `void *&` through C-style casts.
658 (void *&) ptr_inputs,
659 (void *&) ptr_outputs,
// Declarations only; the implementations are not part of this excerpt, so the
// notes below are inferred from the names - TODO confirm against the definitions.
// Presumably appends num_components_to_add entries to `component`.
662 void AddComponents(uint32_t num_components_to_add);
// Presumably resets component[component_index].
663 void ClearComponent(uint32_t component_index);
// Presumably copies active_list[list_index] into the active-output list and
// returns the number of entries copied.
665 uint32_t CopyActiveList(std::vector<std::vector<uint32_t> > &active_list, uint32_t list_index);
// Accessors for per-component macro operation and scale factors. Except where
// a body is shown, the implementations are not part of this excerpt.
667 intel_dnn_macro_operation_t MacroOperation(uint32_t component_index);
668 void SetMacroOperation(uint32_t component_index, intel_dnn_macro_operation_t macro_operation);
669 float InputScaleFactor(uint32_t component_index);
670 float WeightScaleFactor(uint32_t component_index);
// Index-based convenience overload: forwards to the overload taking a component
// reference. component_index is not range-checked (std::vector::operator[]).
671 float OutputScaleFactor(uint32_t component_index) {
672 return OutputScaleFactor(component[component_index]);
674 float OutputScaleFactor(intel_dnn_component_t &comp);
// Stores scale_factor in input_scale_factor_.
675 void SetInputScaleFactor(float scale_factor) { input_scale_factor_ = scale_factor; }
676 void SetOutputScaleFactor(uint32_t component_index, float scale_factor);
// Debugging, scoring, serialization and GNA-structure helpers. Declarations
// only; the implementations are not part of this excerpt, so behaviour must be
// checked against the definitions.
677 void PrintOutputs(uint32_t component_index);
678 uint32_t CompareScores(void *ptr_scores, intel_score_error_t *score_error, uint32_t num_frames);
// NOTE(review): "GraphWiz" presumably refers to Graphviz; the name is part of
// the public interface and is left unchanged.
679 void WriteGraphWizModel(const char *filename);
680 void WriteDnnText(const char *filename, intel_dnn_number_type_t number_type);
681 uint32_t MemoryRequiredToReadDnnText(const char *filename);
682 void ReadDnnText(const char *filename, void *ptr_memory, uint32_t num_memory_bytes, float *ptr_scale_in);
// intel_nnet_type_t is declared outside this excerpt.
684 void InitGNAStruct(intel_nnet_type_t *ptr_nnet);
685 void DestroyGNAStruct(intel_nnet_type_t *ptr_nnet);
686 void GetScaledOutput(float *ptr_output, uint32_t component_index);
// Returns the stored pointer to the active-output list (ptr_active_outputs_).
687 uint32_t *ptr_active_outputs() { return ptr_active_outputs_; }
// Returns the stored active-output count (num_active_outputs_).
688 uint32_t num_active_outputs() { return num_active_outputs_; }
// Walks all components and tests whether each one's operation is affine,
// diagonal, 1-D convolution, copy, deinterleave, interleave or recurrent.
// The body of the `if` and the return statement are not visible in this
// excerpt; presumably num_layers is incremented for each match and returned.
// kDnnPiecewiselinearOp is not among the operations tested here.
689 uint32_t num_gna_layers() {
690 uint32_t num_layers = 0;
691 for (uint32_t i = 0; i < component.size(); i++) {
692 if ((component[i].operation == kDnnAffineOp) || (component[i].operation == kDnnDiagonalOp)
693 || (component[i].operation == kDnnConvolutional1dOp) || (component[i].operation == kDnnCopyOp)
694 || (component[i].operation == kDnnDeinterleaveOp) || (component[i].operation == kDnnInterleaveOp)
695 || (component[i].operation == kDnnRecurrentOp)) {
// Returns 0 when there are no components. Otherwise looks at the first
// component: num_columns_in if its input orientation is interleaved,
// num_rows_in for any other orientation.
701 uint32_t num_group_in() {
702 return ((component.size() > 0) ? ((component[0].orientation_in == kDnnInterleavedOrientation)
703 ? component[0].num_columns_in : component[0].num_rows_in) : 0);
// Returns 0 when there are no components. Otherwise looks at the last
// component: num_columns_out if its output orientation is interleaved,
// num_rows_out for any other orientation.
705 uint32_t num_group_out() {
706 return ((component.size() > 0) ? ((component[component.size() - 1].orientation_out
707 == kDnnInterleavedOrientation) ? component[component.size() - 1].num_columns_out : component[
708 component.size() - 1].num_rows_out) : 0);
// Data members (the accessors above read `component` directly).
// The list of network components, in order; num_group_in() reads the first
// entry and num_group_out() the last.
711 std::vector<intel_dnn_component_t> component;
712 uint32_t num_left_context;
713 uint32_t num_right_context;
714 bool do_rotate_input;
// These two carry default member initializers; the visible part of the
// constructor's initializer list additionally sets num_rotate_columns to 0.
715 uint32_t num_rotate_rows = 0;
716 uint32_t num_rotate_columns = 0;
717 DnnSoftmaxType softmax_type;
// Released with _mm_free in the cleanup code shown earlier in this class.
718 uint32_t *ptr_sumgroup_sizes;
// Presumably the number of entries in ptr_sumgroup_sizes - TODO confirm.
719 uint32_t num_sumgroup_sizes;
// Output helpers. Declarations only; the implementations are not part of this
// excerpt, so behaviour must be checked against the definitions.
722 void WriteInputAndOutputText();
// Static variant operating on a GNA network structure (intel_nnet_type_t is
// declared outside this excerpt).
723 static void WriteInputAndOutputTextGNA(intel_nnet_type_t * nnet);
724 void BeginNewWrite();
// Internal state. ptr_dnn_memory_, ptr_active_outputs_, num_active_outputs_ and
// input_scale_factor_ are set in the visible part of the constructor's
// initializer list; whether num_bytes_dnn_memory_ and number_type_ are
// initialized there cannot be told from this excerpt.
727 void *ptr_dnn_memory_;
// Presumably the size in bytes of the block ptr_dnn_memory_ points to - TODO confirm.
728 uint32_t num_bytes_dnn_memory_;
// Exposed read-only through ptr_active_outputs() / num_active_outputs().
729 uint32_t *ptr_active_outputs_;
730 uint32_t num_active_outputs_;
731 intel_dnn_number_type_t number_type_;
// Written by SetInputScaleFactor().
732 float input_scale_factor_;
// Signature fragment of the static helper that both InitCopyComponent
// overloads forward to. The rest of the parameter list is not visible in this
// excerpt, and the implementation is not part of it.
734 static void InitCopyComponentPrivate(intel_dnn_component_t &cmp,
735 intel_dnn_orientation_t orientation,
736 uint32_t num_rows_in,
737 uint32_t num_columns_in,
738 uint32_t num_rows_out,
739 uint32_t num_columns_out,
740 uint32_t num_bytes_per_input,
741 uint32_t num_bytes_per_output,
742 float output_scale_factor,
743 uint32_t num_copy_rows,
744 uint32_t num_copy_columns,
// Signature fragment of the static helper that both InitMaxpoolComponent
// overloads forward to. The rest of the parameter list is not visible in this
// excerpt, and the implementation is not part of it.
749 static void InitMaxpoolComponentPrivate(intel_dnn_component_t &cmp,
750 uint32_t num_rows_in,
751 uint32_t num_columns_in,
752 uint32_t num_rows_out,
753 uint32_t num_columns_out,
754 uint32_t num_bytes_per_input,
755 uint32_t num_bytes_per_output,
756 uint32_t num_pool_size,
757 uint32_t num_pool_step,
758 uint32_t num_pool_stride,
760 float output_scale_factor,
// Signature fragment of the static helper that both
// InitPiecewiseLinearComponent overloads forward to. Parts of the parameter
// list are not visible in this excerpt, and the implementation is not part of it.
765 static void InitPiecewiseLinearComponentPrivate(intel_dnn_component_t &cmp,
766 DnnActivation function_id,
767 intel_dnn_orientation_t orientation,
769 uint32_t num_columns,
770 uint32_t num_bytes_per_input,
771 uint32_t num_bytes_per_output,
772 uint32_t num_segments,
773 float output_scale_factor,
776 intel_pwl_segment_t *ptr_segments,
// Signature fragment of the static helper that both
// InitConvolutional1DComponent overloads forward to. The rest of the parameter
// list is not visible in this excerpt, and the implementation is not part of it.
779 static void InitConvolutional1DComponentPrivate(intel_dnn_component_t &comp,
780 uint32_t num_rows_in,
781 uint32_t num_columns_in,
782 uint32_t num_rows_out,
783 uint32_t num_columns_out,
784 uint32_t num_bytes_per_input,
785 uint32_t num_bytes_per_output,
786 uint32_t num_bytes_per_weight,
787 uint32_t num_bytes_per_bias,
788 uint32_t num_filters,
789 uint32_t num_filter_rows,
790 uint32_t num_filter_coefficients,
791 uint32_t num_feature_maps,
792 uint32_t num_feature_map_rows,
793 uint32_t num_feature_map_columns,
794 float weight_scale_factor,
795 float output_scale_factor,
// Signature fragment of the static helper that both InitAffineComponent
// overloads forward to. The rest of the parameter list is not visible in this
// excerpt, and the implementation is not part of it.
802 static void InitAffineComponentPrivate(intel_dnn_component_t &comp,
803 uint32_t num_rows_in,
804 uint32_t num_columns,
805 uint32_t num_rows_out,
806 uint32_t num_bytes_per_input,
807 uint32_t num_bytes_per_output,
808 uint32_t num_bytes_per_weight,
809 uint32_t num_bytes_per_bias,
810 float weight_scale_factor,
811 float output_scale_factor,
// Free helper functions. Declarations only; the implementations are not part
// of this excerpt, so behaviour must be checked against the definitions.
820 void PlotFloatIntDnn(AmIntelDnn *dnn, AmIntelDnn *dnn_int);
// NOTE(review): both arguments are taken by value, so each call copies two
// AmIntelDnn objects. The class's cleanup code releases ptr_sumgroup_sizes and
// ptr_priors with _mm_free; unless copying is handled on lines not visible
// here, the copies would free the same buffers as the originals. Confirm, and
// consider const references when the definition can be changed together with
// this declaration.
821 bool isCompatibleDnn(AmIntelDnn dnn1, AmIntelDnn dnn2);
// Presumably resets the accumulators in *error - TODO confirm.
822 void ClearScoreError(intel_score_error_t *error);
// Presumably folds *error into *total_error - TODO confirm.
823 void UpdateScoreError(intel_score_error_t *error, intel_score_error_t *total_error);
824 void SoftmaxGoogle(float *ptr_output, float *ptr_input, const uint32_t num_outputs, const uint32_t num_inputs);