Publishing 2019 R1 content
[platform/upstream/dldt.git] / inference-engine / src / gna_plugin / dnn.cpp
1 // Copyright (C) 2018-2019 Intel Corporation
2 // SPDX-License-Identifier: Apache-2.0
3 //
4 // dnn.cpp : component based neural network class for ease of use
5 //
6 extern bool global_debug;
7
8 #include <cstdlib>
9 #include <cstdio>
10 #include <cmath>
11 #include <set>
12 #include <details/ie_exception.hpp>
13 #include <algorithm>
14 #include <gna-api-types-xnn.h>
15
16 #ifndef _NO_MKL_
17 #include <mkl_dnn.h>
18 #endif
19 #include "dnn.h"
20 #ifdef INTEGER_REF
21 #include "convnet.h"
22 #include "igemv16.h"
23 #include "igemv8.h"
24 #include "sgemm.h"
25 #else
26 #include "floatmath.h"
27 #endif
28 #include "pwl.h"
29 #include "util.h"
30 #include "gna_plugin_log.hpp"
31
32 #ifdef WIN32
33 # define rand_r(X) rand()
34 #endif
35
36 /**
37  * whether to dump weights and biases
38  */
39 #define DUMP_WB
40 /**
41  * in light mode only layer names are dumped
42  * @param filename
43  * @param number_type
44  * @return
45  */
46 #define LIGHT_DUMP
47
/**
 * Shared counter identifying the current dump session.
 * @return mutable reference to a function-local static counter (starts at 0)
 */
static int & getDumpFolderId() {
    static int dump_folder_id = 0;
    return dump_folder_id;
}
52
53 static std::string getDumpFolderNameGNA() {
54     return std::string("./gna_layers/")+std::to_string(getDumpFolderId() - 1)+"/";
55 }
56
57 static std::string getDumpFolderName() {
58     return std::string("./layers/")+std::to_string(getDumpFolderId() - 1)+"/";
59 }
60
61 static std::string getRefFolderName() {
62     return std::string("./ref_layers/")+std::to_string(getDumpFolderId() - 1)+"/";
63 }
64
65 void AmIntelDnn::BeginNewWrite() {
66     getDumpFolderId()++;
67 }
68
69
/**
 * Binds the network to an externally allocated memory region and resets
 * all per-network bookkeeping fields to their empty defaults.
 * @param ptr_memory       base address of the caller-owned model memory
 * @param num_memory_bytes size of that region in bytes
 * @param number_type      numeric representation of the model data
 * @param scale_factor     quantization scale factor applied to the input
 */
void AmIntelDnn::Init(void *ptr_memory,
                      uint32_t num_memory_bytes,
                      intel_dnn_number_type_t number_type,
                      float scale_factor) {
    ptr_dnn_memory_ = ptr_memory;
    num_bytes_dnn_memory_ = num_memory_bytes;
    number_type_ = number_type;
    input_scale_factor_ = scale_factor;

    // No active-output restriction, no left/right context frames, no input
    // rotation, no softmax, no sum-groups and no priors until configured.
    ptr_active_outputs_ = nullptr;
    num_active_outputs_ = 0;
    num_left_context = 0;
    num_right_context = 0;
    do_rotate_input = false;
    softmax_type = kSoftmaxNone;
    ptr_sumgroup_sizes = nullptr;
    num_sumgroup_sizes = 0;
    ptr_priors = nullptr;


    // NOTE(review): the component vector is intentionally left untouched
    // here — the clear() below was already commented out upstream.
    //  component.clear();
}
92
93 void AmIntelDnn::InitActiveList(uint32_t *ptr_active_list) {
94     ptr_active_outputs_ = ptr_active_list;
95     if (ptr_active_list == nullptr) {
96         if (component[component.size() - 1].orientation_out == kDnnInterleavedOrientation) {
97             num_active_outputs_ = component[component.size() - 1].num_rows_out;
98         } else {
99             num_active_outputs_ = component[component.size() - 1].num_columns_out;
100         }
101     } else {
102         num_active_outputs_ = 0;
103     }
104 }
105
106 void AmIntelDnn::AddComponents(uint32_t num_components_to_add) {
107     component.resize(component.size() + num_components_to_add);
108     for (uint32_t i = 0; i < num_components_to_add; i++) {
109         ClearComponent(component.size() - i - 1);
110     }
111 }
112
113 void AmIntelDnn::ClearComponent(uint32_t component_index) {
114     if (component_index > component.size() - 1) {
115         fprintf(stderr, "Error:  attempt to clear non-existent component!\n");
116         throw -1;
117     }
118     component[component_index].num_rows_in = 0;
119     component[component_index].num_columns_in = 0;
120     component[component_index].num_rows_out = 0;
121     component[component_index].num_columns_out = 0;
122     component[component_index].num_bytes_per_input = 0;
123     component[component_index].num_bytes_per_output = 0;
124     component[component_index].operation = kDnnNullOp;
125     component[component_index].macro_operation = kDnnMacroOpNone;
126     component[component_index].orientation_in = kDnnUnknownOrientation;
127     component[component_index].orientation_out = kDnnUnknownOrientation;
128     component[component_index].ptr_inputs = nullptr;
129     component[component_index].ptr_outputs = nullptr;
130     memset(&component[component_index].op, 0, sizeof(component[component_index].op));
131 }
132
133 void AmIntelDnn::ClearState() {
134     // To support recurrent networks, provide mechanism to clear persistent state
135     // (e.g., between utterances for speech recognition).  For recurrent component,
136     // this means clearing the feedback buffer.  For other components, just clear the
137     // output buffer since any feedback will come from some component's output.
138     for (uint32_t i = 0; i < component.size(); i++) {
139         if (component[i].operation == kDnnRecurrentOp) {
140             memset(component[i].op.recurrent.ptr_feedbacks,
141                    0,
142                    component[i].op.recurrent.num_vector_delay * component[i].num_columns_out
143                        * component[i].num_bytes_per_input);
144         } else {
145             memset(component[i].ptr_outputs,
146                    0,
147                    component[i].num_bytes_per_output * component[i].num_rows_out * component[i].num_columns_out);
148         }
149     }
150 }
151
152 void AmIntelDnn::InitAffineComponentPrivate(intel_dnn_component_t &comp,
153                                             uint32_t num_rows_in,
154                                             uint32_t num_columns,
155                                             uint32_t num_rows_out,
156                                             uint32_t num_bytes_per_input,
157                                             uint32_t num_bytes_per_output,
158                                             uint32_t num_bytes_per_weight,
159                                             uint32_t num_bytes_per_bias,
160                                             float weight_scale_factor,
161                                             float output_scale_factor,
162                                             void *&ptr_inputs,
163                                             void *&ptr_outputs,
164                                             void *&ptr_weights,
165                                             void *&ptr_biases,
166                                             bool isDiag,
167                                             bool postInitMem) {
168     comp.num_rows_in = num_rows_in;
169     comp.num_columns_in = num_columns;
170     comp.num_rows_out = num_rows_out;
171     comp.num_columns_out = num_columns;
172     comp.num_bytes_per_input = num_bytes_per_input;
173     comp.num_bytes_per_output = num_bytes_per_output;
174     comp.operation = isDiag ? kDnnDiagonalOp : kDnnAffineOp;
175     comp.macro_operation = kDnnMacroOpNone;
176     comp.orientation_in = kDnnInterleavedOrientation;
177     comp.orientation_out = kDnnInterleavedOrientation;
178     comp.op.affine.num_bytes_per_weight = num_bytes_per_weight;
179     comp.op.affine.num_bytes_per_bias = num_bytes_per_bias;
180     comp.op.affine.weight_scale_factor = weight_scale_factor;
181     comp.output_scale_factor = output_scale_factor;
182     if (!postInitMem) {
183         comp.op.affine.ptr_weights = ptr_weights;
184         comp.op.affine.ptr_biases = ptr_biases;
185         comp.ptr_inputs = ptr_inputs;
186         comp.ptr_outputs = ptr_outputs;
187     } else {
188         ptr_weights = &comp.op.affine.ptr_weights;
189         ptr_biases = &comp.op.affine.ptr_biases;
190         ptr_inputs = &comp.ptr_inputs;
191         ptr_outputs = &comp.ptr_outputs;
192     }
193 }
194
195 void AmIntelDnn::InitDiagonalComponent(uint32_t component_index,
196                                        uint32_t num_rows_in,
197                                        uint32_t num_columns,
198                                        uint32_t num_rows_out,
199                                        uint32_t num_bytes_per_input,
200                                        uint32_t num_bytes_per_output,
201                                        uint32_t num_bytes_per_weight,
202                                        uint32_t num_bytes_per_bias,
203                                        float weight_scale_factor,
204                                        float output_scale_factor,
205                                        void *ptr_inputs,
206                                        void *ptr_outputs,
207                                        void *ptr_weights,
208                                        void *ptr_biases) {
209     component[component_index].num_rows_in = num_rows_in;
210     component[component_index].num_columns_in = num_columns;
211     component[component_index].num_rows_out = num_rows_out;
212     component[component_index].num_columns_out = num_columns;
213     component[component_index].num_bytes_per_input = num_bytes_per_input;
214     component[component_index].num_bytes_per_output = num_bytes_per_output;
215     component[component_index].operation = kDnnDiagonalOp;
216     component[component_index].macro_operation = kDnnMacroOpNone;
217     component[component_index].orientation_in = kDnnInterleavedOrientation;
218     component[component_index].orientation_out = kDnnInterleavedOrientation;
219     component[component_index].ptr_inputs = ptr_inputs;
220     component[component_index].ptr_outputs = ptr_outputs;
221     component[component_index].op.affine.num_bytes_per_weight = num_bytes_per_weight;
222     component[component_index].op.affine.num_bytes_per_bias = num_bytes_per_bias;
223     component[component_index].op.affine.weight_scale_factor = weight_scale_factor;
224     component[component_index].output_scale_factor = output_scale_factor;
225     component[component_index].op.affine.ptr_weights = ptr_weights;
226     component[component_index].op.affine.ptr_biases = ptr_biases;
227 }
228
229 void AmIntelDnn::InitConvolutional1DComponentPrivate(intel_dnn_component_t &comp,
230                                               uint32_t num_rows_in,
231                                               uint32_t num_columns_in,
232                                               uint32_t num_rows_out,
233                                               uint32_t num_columns_out,
234                                               uint32_t num_bytes_per_input,
235                                               uint32_t num_bytes_per_output,
236                                               uint32_t num_bytes_per_weight,
237                                               uint32_t num_bytes_per_bias,
238                                               uint32_t num_filters,
239                                               uint32_t num_filter_rows,
240                                               uint32_t num_filter_coefficients,
241                                               uint32_t num_feature_maps,
242                                               uint32_t num_feature_map_rows,
243                                               uint32_t num_feature_map_columns,
244                                               float weight_scale_factor,
245                                               float output_scale_factor,
246                                               void *&ptr_inputs,
247                                               void *&ptr_outputs,
248                                               void *&ptr_filters,
249                                               void *&ptr_biases,
250                                               bool postInitMem) {
251     comp.num_rows_in = num_rows_in;
252     comp.num_columns_in = num_columns_in;
253     comp.num_rows_out = num_rows_out;
254     comp.num_columns_out = num_columns_out;
255     comp.num_bytes_per_input = num_bytes_per_input;
256     comp.num_bytes_per_output = num_bytes_per_output;
257     comp.operation = kDnnConvolutional1dOp;
258     comp.macro_operation = kDnnMacroOpNone;
259     comp.orientation_in = kDnnNonInterleavedOrientation;
260     comp.orientation_out = kDnnNonInterleavedOrientation;
261     comp.ptr_inputs = ptr_inputs;
262     comp.ptr_outputs = ptr_outputs;
263     comp.op.conv1D.num_bytes_per_weight = num_bytes_per_weight;
264     comp.op.conv1D.num_bytes_per_bias = num_bytes_per_bias;
265     comp.op.conv1D.num_filters = num_filters;
266     comp.op.conv1D.num_filter_rows = num_filter_rows;
267     comp.op.conv1D.num_filter_coefficients = num_filter_coefficients;
268     comp.op.conv1D.num_feature_maps = num_feature_maps;
269     comp.op.conv1D.num_feature_map_rows = num_feature_map_rows;
270     comp.op.conv1D.num_feature_map_columns = num_feature_map_columns;
271     comp.op.conv1D.weight_scale_factor = weight_scale_factor;
272     comp.output_scale_factor = output_scale_factor;
273
274     if (!postInitMem) {
275         comp.op.conv1D.ptr_filters = ptr_filters;
276         comp.op.conv1D.ptr_biases  = ptr_biases;
277         comp.ptr_inputs = ptr_inputs;
278         comp.ptr_outputs = ptr_outputs;
279     } else {
280         ptr_filters = &comp.op.conv1D.ptr_filters;
281         ptr_biases  = &comp.op.conv1D.ptr_biases;
282         ptr_inputs  = &comp.ptr_inputs;
283         ptr_outputs = &comp.ptr_outputs;
284     }
285 }
286
287 void AmIntelDnn::InitMaxpoolComponentPrivate(intel_dnn_component_t &comp,
288                                       uint32_t num_rows_in,
289                                       uint32_t num_columns_in,
290                                       uint32_t num_rows_out,
291                                       uint32_t num_columns_out,
292                                       uint32_t num_bytes_per_input,
293                                       uint32_t num_bytes_per_output,
294                                       uint32_t num_pool_size,
295                                       uint32_t num_pool_step,
296                                       uint32_t num_pool_stride,
297                                       bool do_sum_not_max,
298                                       float output_scale_factor,
299                                       void *&ptr_inputs,
300                                       void *&ptr_outputs,
301                                       bool postInitMem) {
302     comp.num_rows_in = num_rows_in;
303     comp.num_columns_in = num_columns_in;
304     comp.num_rows_out = num_rows_out;
305     comp.num_columns_out = num_columns_out;
306     comp.num_bytes_per_input = num_bytes_per_input;
307     comp.num_bytes_per_output = num_bytes_per_output;
308     comp.operation = kDnnMaxPoolOp;
309     comp.macro_operation = kDnnMacroOpNone;
310     comp.orientation_in = kDnnNonInterleavedOrientation;
311     comp.orientation_out = kDnnNonInterleavedOrientation;
312     comp.op.maxpool.num_inputs = num_pool_size;
313     comp.op.maxpool.num_inputs_step = num_pool_step;
314     comp.op.maxpool.num_inputs_stride = num_pool_stride;
315     comp.op.maxpool.do_sum_not_max = do_sum_not_max;
316     comp.output_scale_factor = output_scale_factor;
317
318     if (!postInitMem) {
319         comp.ptr_inputs = ptr_inputs;
320         comp.ptr_outputs = ptr_outputs;
321     } else {
322         ptr_inputs  = &comp.ptr_inputs;
323         ptr_outputs = &comp.ptr_outputs;
324     }
325 }
326
327 void AmIntelDnn::InitCopyComponentPrivate(intel_dnn_component_t &comp,
328                                           intel_dnn_orientation_t orientation,
329                                           uint32_t num_rows_in,
330                                           uint32_t num_columns_in,
331                                           uint32_t num_rows_out,
332                                           uint32_t num_columns_out,
333                                           uint32_t num_bytes_per_input,
334                                           uint32_t num_bytes_per_output,
335                                           float output_scale_factor,
336                                           uint32_t num_copy_rows,
337                                           uint32_t num_copy_columns,
338                                           void *&ptr_inputs,
339                                           void *&ptr_outputs,
340                                           bool postInitMem) {
341     comp.num_rows_in = num_rows_in;
342     comp.num_columns_in = num_columns_in;
343     comp.num_rows_out = num_rows_out;
344     comp.num_columns_out = num_columns_out;
345     comp.num_bytes_per_input = num_bytes_per_input;
346     comp.num_bytes_per_output = num_bytes_per_output;
347     comp.operation = kDnnCopyOp;
348     comp.macro_operation = kDnnMacroOpNone;
349     comp.orientation_in = orientation;
350     comp.orientation_out = orientation;
351     comp.ptr_inputs = ptr_inputs;
352     comp.ptr_outputs = ptr_outputs;
353     comp.output_scale_factor = output_scale_factor;
354     comp.op.copy.num_copy_rows = num_copy_rows;
355     comp.op.copy.num_copy_columns = num_copy_columns;
356
357     if (!postInitMem) {
358         comp.ptr_inputs = ptr_inputs;
359         comp.ptr_outputs = ptr_outputs;
360     } else {
361         ptr_inputs  = &comp.ptr_inputs;
362         ptr_outputs = &comp.ptr_outputs;
363     }
364 }
365
/**
 * Fills in a component descriptor for a piecewise-linear (PWL) activation.
 * Input and output share the same num_rows x num_columns shape and the
 * caller-supplied orientation; the activation curve is described by
 * num_segments PWL segments.
 * When postInitMem is true the input/output memory (and optionally the
 * segment table) is not yet allocated: the caller's pointer variables are
 * redirected to the component's own fields so they can be patched later.
 */
void AmIntelDnn::InitPiecewiseLinearComponentPrivate(intel_dnn_component_t &comp,
                                                     DnnActivation function_id,
                                                     intel_dnn_orientation_t orientation,
                                                     uint32_t num_rows,
                                                     uint32_t num_columns,
                                                     uint32_t num_bytes_per_input,
                                                     uint32_t num_bytes_per_output,
                                                     uint32_t num_segments,
                                                     float output_scale_factor,
                                                     void *&ptr_inputs,
                                                     void *&ptr_outputs,
                                                     intel_pwl_segment_t *ptr_segments,
                                                     bool postInitMem) {
    comp.num_rows_in = num_rows;
    comp.num_columns_in = num_columns;
    comp.num_rows_out = num_rows;
    comp.num_columns_out = num_columns;
    comp.num_bytes_per_input = num_bytes_per_input;
    comp.num_bytes_per_output = num_bytes_per_output;
    comp.operation = kDnnPiecewiselinearOp;
    comp.macro_operation = kDnnMacroOpNone;
    comp.orientation_in = orientation;
    comp.orientation_out = orientation;
    comp.op.pwl.func_id = function_id;
    comp.op.pwl.num_segments = num_segments;
    comp.output_scale_factor = output_scale_factor;

    if (!postInitMem) {
        comp.ptr_inputs = ptr_inputs;
        comp.ptr_outputs = ptr_outputs;
        comp.op.pwl.ptr_segments = ptr_segments;
    } else {
        ptr_inputs = &comp.ptr_inputs;
        ptr_outputs = &comp.ptr_outputs;
        if (ptr_segments != nullptr) {
            // NOTE(review): ptr_segments acts as an out-parameter here —
            // the caller apparently passes the address of its own
            // intel_pwl_segment_t* cast to intel_pwl_segment_t*, and the
            // address of comp.op.pwl.ptr_segments is written through it.
            // Confirm against callers before touching these casts.
            *reinterpret_cast<intel_pwl_segment_t **>(ptr_segments) =
                reinterpret_cast<intel_pwl_segment_t *>(& comp.op.pwl.ptr_segments);
        }
    }
}
406
407 void AmIntelDnn::InitRecurrentComponent(uint32_t component_index,
408                                         uint32_t num_rows,
409                                         uint32_t num_columns_in,
410                                         uint32_t num_columns_out,
411                                         uint32_t num_bytes_per_input,
412                                         uint32_t num_bytes_per_output,
413                                         uint32_t num_vector_delay,
414                                         uint32_t num_bytes_per_weight,
415                                         uint32_t num_bytes_per_bias,
416                                         float weight_scale_factor,
417                                         float output_scale_factor,
418                                         void *ptr_inputs,
419                                         void *ptr_feedbacks,
420                                         void *ptr_outputs,
421                                         void *ptr_weights,
422                                         void *ptr_biases) {
423     component[component_index].num_rows_in = num_rows;
424     component[component_index].num_columns_in = num_columns_in;
425     component[component_index].num_rows_out = num_rows;
426     component[component_index].num_columns_out = num_columns_out;
427     component[component_index].num_bytes_per_input = num_bytes_per_input;
428     component[component_index].num_bytes_per_output = num_bytes_per_output;
429     component[component_index].operation = kDnnRecurrentOp;
430     component[component_index].macro_operation = kDnnMacroOpNone;
431     component[component_index].orientation_in = kDnnNonInterleavedOrientation;
432     component[component_index].orientation_out = kDnnNonInterleavedOrientation;
433     component[component_index].ptr_inputs = ptr_inputs;
434     component[component_index].ptr_outputs = ptr_outputs;
435     component[component_index].op.recurrent.num_vector_delay = num_vector_delay;
436     component[component_index].op.recurrent.num_bytes_per_weight = num_bytes_per_weight;
437     component[component_index].op.recurrent.num_bytes_per_bias = num_bytes_per_bias;
438     component[component_index].op.recurrent.weight_scale_factor = weight_scale_factor;
439     component[component_index].output_scale_factor = output_scale_factor;
440     component[component_index].op.recurrent.ptr_feedbacks = ptr_feedbacks;
441     component[component_index].op.recurrent.ptr_weights = ptr_weights;
442     component[component_index].op.recurrent.ptr_biases = ptr_biases;
443 }
444
445 void AmIntelDnn::InitInterleaveComponent(uint32_t component_index, uint32_t num_rows, uint32_t num_columns,
446                                          uint32_t num_bytes_per_input, uint32_t num_bytes_per_output,
447                                          float output_scale_factor, void *ptr_inputs, void *ptr_outputs) {
448     component[component_index].num_rows_in = num_rows;
449     component[component_index].num_columns_in = num_columns;
450     component[component_index].num_rows_out = num_columns;
451     component[component_index].num_columns_out = num_rows;
452     component[component_index].num_bytes_per_input = num_bytes_per_input;
453     component[component_index].num_bytes_per_output = num_bytes_per_output;
454     component[component_index].operation = kDnnInterleaveOp;
455     component[component_index].macro_operation = kDnnMacroOpNone;
456     component[component_index].orientation_in = kDnnNonInterleavedOrientation;
457     component[component_index].orientation_out = kDnnInterleavedOrientation;
458     component[component_index].ptr_inputs = ptr_inputs;
459     component[component_index].ptr_outputs = ptr_outputs;
460     component[component_index].output_scale_factor = output_scale_factor;
461 }
462
463 void AmIntelDnn::InitDeinterleaveComponent(uint32_t component_index, uint32_t num_rows, uint32_t num_columns,
464                                            uint32_t num_bytes_per_input, uint32_t num_bytes_per_output,
465                                            float output_scale_factor, void *ptr_inputs, void *ptr_outputs) {
466     component[component_index].num_rows_in = num_rows;
467     component[component_index].num_columns_in = num_columns;
468     component[component_index].num_rows_out = num_columns;
469     component[component_index].num_columns_out = num_rows;
470     component[component_index].num_bytes_per_input = num_bytes_per_input;
471     component[component_index].num_bytes_per_output = num_bytes_per_output;
472     component[component_index].operation = kDnnDeinterleaveOp;
473     component[component_index].macro_operation = kDnnMacroOpNone;
474     component[component_index].orientation_in = kDnnInterleavedOrientation;
475     component[component_index].orientation_out = kDnnNonInterleavedOrientation;
476     component[component_index].ptr_inputs = ptr_inputs;
477     component[component_index].ptr_outputs = ptr_outputs;
478     component[component_index].output_scale_factor = output_scale_factor;
479 }
480
/**
 * Reference (CPU) implementation of the affine layer: each (active) output
 * row of C is first set to its bias value, then weights x inputs is
 * accumulated on top via GEMM (C = A*B + bias).
 * When `list` is non-null, only the `listsize` output rows named in `list`
 * are computed and the results are packed into the first `listsize` rows
 * of C (active-list mode).
 * Dispatches on input precision: 2-byte integer paths (INTEGER_REF builds
 * only) and the 4-byte float path via cblas_sgemm1 / cblas_sgemm_subset.
 */
__inline void ApplyAffineTransform(intel_dnn_component_t *component, uint32_t *list, uint32_t listsize) {
    auto transform = &component->op.affine;
    // GEMM dimensions: C(m x n) = A(m x k) * B(k x n); leading dimensions
    // come straight from the component geometry.
    int m = component->num_rows_out;
    int n = component->num_columns_in;
    int k = component->num_rows_in;
    int lda = component->num_rows_in;
    int ldb = component->num_columns_in;
    int ldc = component->num_columns_out;

    switch (component->num_bytes_per_input) {
#ifdef INTEGER_REF
        case 2:
            // int16 inputs with either int8 (compound-bias) or int16 weights.
            if (component->op.affine.num_bytes_per_weight == 1) {
                int8_t *A = reinterpret_cast<int8_t*>(transform->ptr_weights);
                int16_t *B = reinterpret_cast<int16_t*>(component->ptr_inputs);
                int32_t *C = reinterpret_cast<int32_t*>(component->ptr_outputs);
                intel_compound_bias_t *bias = reinterpret_cast<intel_compound_bias_t*>(transform->ptr_biases);
                if (list == nullptr) {
                    //  PrintMatrixInt8("W int8", W, k, m, ldw, component->op.affine.weight_scale_factor);
                    //  PrintMatrixInt16("X int16", X, k, n, ldx, component->op.affine.weight_scale_factor);
                    //  PrintMatrixInt32("Y int32", Y, m, n, ldy, component->output_scale_factor);
                    igemm8_gna(m, n, k, A, lda, B, ldb, bias, C, ldc);
                } else {
                    //  PrintMatrixInt8("W int8", W, k, m, ldw, component->op.affine.weight_scale_factor);
                    //  PrintMatrixInt16("X int16", X, k, n, ldx, component->op.affine.weight_scale_factor);
                    //  PrintMatrixInt32("Y int32", Y, m, n, ldy, component->output_scale_factor);
                    igemm8_gna_subset(m, n, k, A, lda, B, ldb, bias, C, ldc, list, listsize);
                }
                //  PrintMatrixInt32("C int32", C, m, n, ldc, component->output_scale_factor);
            } else if (component->op.affine.num_bytes_per_weight == 2) {
                int16_t *A = reinterpret_cast<int16_t*>(transform->ptr_weights);
                int16_t *B = reinterpret_cast<int16_t*>(component->ptr_inputs);
                int32_t *C = reinterpret_cast<int32_t*>(component->ptr_outputs);
                int32_t *bias = reinterpret_cast<int32_t*>(transform->ptr_biases);
                if (list == nullptr) {
                    // Seed every output row with its bias before the GEMM
                    // accumulation (beta == 1.0 below).
                    for (uint32_t i = 0; i < m; i++) {
                        for (uint32_t j = 0; j < n; j++) {
                            C[i*ldc+j] = bias[i];
                        }
                    }
                    //  PrintMatrixInt16("A int16", A, k, m, lda, component->op.affine.weight_scale_factor);
                    //  PrintMatrixInt16("trans(B) int16", B, k, n, ldb, component->op.affine.weight_scale_factor);
                    //  PrintMatrixInt32("C int32", C, m, n, ldc, component->output_scale_factor);
                    cblas_igemm16(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, A, lda, B, ldb, 1.0, C, ldc);
                } else {
                    // Active-list mode: row l of C corresponds to output
                    // index list[l]; seed it with that row's bias.
                    for (int l = 0; l < listsize; l++) {
                        int i = list[l];
                        for (uint32_t j = 0; j < n; j++) {
                            C[l*ldc+j] = bias[i];
                        }
                    }
                    //  PrintMatrixInt16("A int16", A, k, m, lda, component->op.affine.scale_factor);
                    //  PrintMatrixInt16("trans(B) int16", B, k, n, ldb, component->op.affine.scale_factor);
                    //  PrintMatrixInt32("C int32", C, m, n, ldc, component->op.affine.scale_factor * component->op.affine.scale_factor);
                    cblas_igemm16_subset(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, A, lda, B, ldb, 1.0, C, ldc, list, listsize);
                }
                //  PrintMatrixInt32("C int32", C, m, n, ldc, component->output_scale_factor);
            } else {
                fprintf(stderr, "Bad weight width in ApplyAffineTransform!\n");
                throw -1;
            }
            break;
#endif  // #ifdef INTEGER_REF
        case 4: {
            // float32 path.
            auto A = reinterpret_cast<float *>(transform->ptr_weights);
            auto B = reinterpret_cast<float *>(component->ptr_inputs);
            auto C = reinterpret_cast<float *>(component->ptr_outputs);
            auto bias = reinterpret_cast<float *>(transform->ptr_biases);
            if (list == nullptr) {
                // Seed every output row with its bias before the GEMM
                // accumulation (beta == 1.0 below).
                // NOTE(review): loop counters are uint32_t while m/n are
                // int — relies on implicit conversion; fine for the
                // non-negative dimensions used here.
                for (uint32_t i = 0; i < m; i++) {
                    for (uint32_t j = 0; j < n; j++) {
                        C[i * ldc + j] = bias[i];
                    }
                }
                //  if (global_debug) PrintMatrixFloat32("A float", A, m, k, lda);
                //  if (global_debug) PrintMatrixFloat32("B float", B, k, n, ldb);
                //  if (global_debug) PrintMatrixFloat32("C float before", C, m, n, ldc);
                cblas_sgemm1(CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, 1.0, A, lda, B, ldb, 1.0, C, ldc);
                //  if (global_debug) PrintMatrixFloat32("C float after", C, m, n, ldc);
            } else {
                // Active-list mode: row l of C corresponds to output index
                // list[l]; seed it with that row's bias.
                for (int l = 0; l < listsize; l++) {
                    int i = list[l];
                    for (uint32_t j = 0; j < n; j++) {
                        C[l * ldc + j] = bias[i];
                    }
                }
                //  PrintMatrixFloat32("A float", A, k, m, lda);
                //  PrintMatrixFloat32("trans(B) float", B, k, n, ldb);
                //  PrintMatrixFloat32("C float before", C, listsize, n, ldc);
                cblas_sgemm_subset(CblasRowMajor,
                                   CblasNoTrans,
                                   CblasNoTrans,
                                   m,
                                   n,
                                   k,
                                   1.0,
                                   A,
                                   lda,
                                   B,
                                   ldb,
                                   1.0,
                                   C,
                                   ldc,
                                   list,
                                   listsize);
                //  PrintMatrixFloat32("C float after", C, listsize, n, ldc);
            }
        }
            break;
        default:fprintf(stderr, "Bad data width in ApplyAffineTransform!\n");
            throw -1;
    }
}
594
// Applies y = diag(w) * x + b, i.e. an affine transform whose weight matrix
// is diagonal (one weight per output row), column-major over frames:
// each of the n input columns is scaled element-wise and biased.
// Throws -1 on an unsupported data or weight width.
__inline void ApplyDiagonalTransform(intel_dnn_component_t *component) {
    auto transform = &component->op.affine;
    int m = component->num_rows_out;       // diagonal length / rows per column
    int n = component->num_columns_in;     // number of input columns (frames)
    int ldb = component->num_columns_in;   // input row stride
    int ldc = component->num_columns_out;  // output row stride

    switch (component->num_bytes_per_input) {
#ifdef INTEGER_REF
        // NOTE(review): this branch references `lda`, which is not declared in
        // this function - verify it still compiles with INTEGER_REF defined.
        case 2:
            if (component->op.affine.num_bytes_per_weight == 1) {
                // 8-bit weights carry a compound bias (bias + per-row multiplier)
                int8_t *A = reinterpret_cast<int8_t*>(transform->ptr_weights);
                int16_t *B = reinterpret_cast<int16_t*>(component->ptr_inputs);
                int32_t *C = reinterpret_cast<int32_t*>(component->ptr_outputs);
                intel_compound_bias_t *bias = reinterpret_cast<intel_compound_bias_t*>(transform->ptr_biases);
                //  PrintMatrixInt8("W int8", W, k, m, ldw, component->op.affine.weight_scale_factor);
                //  PrintMatrixInt16("X int16", X, k, n, ldx, component->op.affine.weight_scale_factor);
                //  PrintMatrixInt32("Y int32", Y, m, n, ldy, component->output_scale_factor);
                isbmm8_gna(m, n, A, lda, B, ldb, bias, C, ldc);
                //  PrintMatrixInt32("C int32", C, m, n, ldc, component->output_scale_factor);
            } else if (component->op.affine.num_bytes_per_weight == 2) {
                int16_t *A = reinterpret_cast<int16_t*>(transform->ptr_weights);
                int16_t *B = reinterpret_cast<int16_t*>(component->ptr_inputs);
                int32_t *C = reinterpret_cast<int32_t*>(component->ptr_outputs);
                int32_t *bias = reinterpret_cast<int32_t*>(transform->ptr_biases);
                // pre-fill every output column with the bias; the multiply
                // below then accumulates on top of it
                for (uint32_t i = 0; i < m; i++) {
                    for (uint32_t j = 0; j < n; j++) {
                        C[i*ldc+j] = bias[i];
                    }
                }
                //  PrintMatrixInt16("A int16", A, 1, m, lda, component->op.affine.weight_scale_factor);
                //  PrintMatrixInt16("trans(B) int16", B, k, n, ldb, component->op.affine.weight_scale_factor);
                //  PrintMatrixInt32("C int32", C, m, n, ldc, component->output_scale_factor);
                cblas_isbmm16(m, n, A, lda, B, ldb, C, ldc);
                //  PrintMatrixInt32("C int32", C, m, n, ldc, component->output_scale_factor);
            } else {
                fprintf(stderr, "Bad weight width in ApplyDiagonalTransform!\n");
                throw -1;
            }
            break;
#endif  // #ifdef INTEGER_REF
        case 4: {  // 32-bit float path
            auto A = reinterpret_cast<float *>(transform->ptr_weights);
            auto B = reinterpret_cast<float *>(component->ptr_inputs);
            auto C = reinterpret_cast<float *>(component->ptr_outputs);
            auto bias = reinterpret_cast<float *>(transform->ptr_biases);
            // seed outputs with the bias so the banded multiply (beta = 1.0)
            // accumulates onto it
            for (uint32_t i = 0; i < m; i++) {
                for (uint32_t j = 0; j < n; j++) {
                    C[i * ldc + j] = bias[i];
                }
            }
            //  PrintMatrixFloat32("A float", A, 1, m, lda);
            //  PrintMatrixFloat32("B float", B, k, n, ldb);
            //  PrintMatrixFloat32("C float before", C, m, n, ldc);
            // per column: C_col += diag(A) * B_col via a bandwidth-0 (diagonal)
            // symmetric banded matrix-vector multiply
            for (uint32_t j = 0; j < n; j++) {
                float *Bcol = B + j * ldb;
                float *Ccol = C + j * ldc;
                cblas_ssbmv1(CblasRowMajor, CblasLower, m, 0, 1.0, A, 1, Bcol, 1, 1.0, Ccol, 1);
            }
            //  PrintMatrixFloat32("C float after", C, m, n, ldc);
        }
            break;
        default:fprintf(stderr, "Bad data width in ApplyDiagonalTransform!\n");
            throw -1;
    }
}
661
// Applies one step of a recurrent affine transform for a single input row:
// output = W * [input_row ; feedback] + bias, where the feedback vector
// (length k2) is supplied by the caller via ptr_feedbacks.
// Throws -1 if the component's feedback pointer is unset or the data/weight
// width is unsupported.
__inline void ApplyRecurrentTransform(intel_dnn_component_t *component, uint32_t row, void *ptr_feedbacks) {
    intel_recurrent_t *transform = &component->op.recurrent;
    int k1 = component->num_columns_in;   // length of the current input row
    int k2 = component->num_columns_out;  // length of the feedback (previous output)
    int n = k2;                           // output length equals feedback length

    if (component->op.recurrent.ptr_feedbacks == nullptr) {
        fprintf(stderr, "nullptr feedback pointer in ApplyRecurrentTransform()!\n");
        throw -1;
    }

    switch (component->num_bytes_per_input) {
#ifdef INTEGER_REF
        case 2:
            if (component->op.recurrent.num_bytes_per_weight == 1) {
                // 8-bit weights use a compound bias (bias + per-row multiplier)
                int16_t *A1 = reinterpret_cast<int16_t*>(component->ptr_inputs) + row * component->num_columns_in;
                int16_t *A2 = reinterpret_cast<int16_t*>(ptr_feedbacks);
                int8_t *X = reinterpret_cast<int8_t*>(transform->ptr_weights);
                intel_compound_bias_t *B = reinterpret_cast<intel_compound_bias_t*>(transform->ptr_biases);
                int32_t *C = reinterpret_cast<int32_t*>(component->ptr_outputs) + row * component->num_columns_out;
                //  PrintMatrixInt16("A1 int", A1, 1, k1, k1, component->op.recurrent.weight_scale_factor);
                //  PrintMatrixInt16("A2 int", A2, 1, k2, k2);
                //  PrintMatrixInt8("X int", X, k, n, n, component->op.recurrent.weight_scale_factor);
                //  PrintMatrixInt32("B int", B, 1, 2*n, 2*n, component->output_scale_factor);
                igemv8_gna_split(n, k1, k2, A1, A2, X, B, C);
                //  PrintMatrixInt32("C int", C, 1, n, n, component->output_scale_factor);
            } else if (component->op.recurrent.num_bytes_per_weight == 2) {
                int16_t *A1 = reinterpret_cast<int16_t*>(component->ptr_inputs) + row * component->num_columns_in;
                int16_t *A2 = reinterpret_cast<int16_t*>(ptr_feedbacks);
                int16_t *X = reinterpret_cast<int16_t*>(transform->ptr_weights);
                int32_t *B = reinterpret_cast<int32_t*>(transform->ptr_biases);
                int32_t *C = reinterpret_cast<int32_t*>(component->ptr_outputs) + row * component->num_columns_out;
                //  PrintMatrixInt16("A1 int", A1, 1, k1, k1, component->op.recurrent.weight_scale_factor);
                //  PrintMatrixInt16("A2 int", A2, 1, k2, k2, component->op.recurrent.weight_scale_factor);
                //  PrintMatrixInt16("X int", X, k, n, n, component->op.recurrent.weight_scale_factor);
                //  PrintMatrixInt32("B int", B, 1, n, n, component->output_scale_factor);
                igemv16_split(n, k1, k2, A1, A2, X, B, C);
                //  PrintMatrixInt32("C int", C, 1, n, n, component->output_scale_factor);
            } else {
                fprintf(stderr, "Weight width not supported in ApplyRecurrentTransform!\n");
                throw -1;
            }
            break;
#endif  // #ifdef INTEGER_REF
        case 4: {  // 32-bit float path: A1 = current row, A2 = feedback
            auto A1 = reinterpret_cast<float *>(component->ptr_inputs) + row * component->num_columns_in;
            auto A2 = reinterpret_cast<float *>(ptr_feedbacks);
            auto X = reinterpret_cast<float *>(transform->ptr_weights);
            auto B = reinterpret_cast<float *>(transform->ptr_biases);
            auto C = reinterpret_cast<float *>(component->ptr_outputs) + row * component->num_columns_out;
            //  PrintMatrixFloat32("A1 float", A1, 1, k1, k1);
            //  PrintMatrixFloat32("A2 float", A2, 1, k2, k2);
            //  PrintMatrixFloat32("X float", X, k, n, n);
            //  PrintMatrixFloat32("B float", B, 1, n, n);
            sgemv_split(n, k1, k2, A1, A2, X, B, C);
            //  PrintMatrixFloat32("C float", C, 1, n, n);
        }
            break;
        default:fprintf(stderr, "Bad data width in ApplyRecurrentTransform!\n");
            throw -1;
    }
}
724
725 __inline void ApplyConvolutional1DTransform(intel_dnn_component_t *component) {
726     switch (component->num_bytes_per_input) {
727 #ifdef INTEGER_REF
728         case 2:
729             CNNFilter16(component);
730             break;
731 #endif  // #ifdef INTEGER_REF
732         case 4:
733             //  PrintMatrixFloat32("Input float", reinterpret_cast<float*>(component->ptr_inputs),
734             //  component->num_rows_in, component->num_columns_in, component->num_columns_in);
735             //  PrintMatrixFloat32("Filt float", reinterpret_cast<float*>(component->op.conv1D.ptr_filters),
736             //  component->op.conv1D.num_filters,
737             //  component->op.conv1D.num_filter_rows*component->op.conv1D.num_feature_map_columns*component->op.conv1D.num_feature_maps,
738             //  component->op.conv1D.num_filter_rows*component->op.conv1D.num_feature_map_columns*component->op.conv1D.num_feature_maps);
739             //  PrintMatrixFloat32("Bias float", reinterpret_cast<float*>(component->op.conv1D.ptr_biases), 1,
740             // component->op.conv1D.num_filters, component->op.conv1D.num_filters);
741             CNNFilter32(component);
742             //  PrintMatrixFloat32("Output float", reinterpret_cast<float*>(component->ptr_outputs, component->num_rows_out,
743             // component->num_columns_out, component->num_columns_out);
744             break;
745         default:fprintf(stderr, "Bad data width in ApplyConvolutionalTransform!\n");
746             throw -1;
747     }
748 }
749
// Applies the component's piecewise-linear activation to the first
// `listsize` outputs.  Dispatches on the network number type: float uses
// PwlApply32, 16-bit integer output uses PwlApply16 (INTEGER_REF builds
// only).  Throws -1 for unsupported widths.
// NOTE(review): the braces of the if/else chain are deliberately interleaved
// with the #ifdef so both preprocessor variants parse - edit with care.
__inline void ApplyPiecewiseLinearTransform(intel_dnn_component_t *component,
                                            intel_dnn_number_type_t number_type,
                                            uint32_t listsize) {
    if (number_type == kDnnFloat) {
        // PrintMatrixFloat32("PWL Input float", reinterpret_cast<float*>(component->ptr_inputs), component->num_rows_in,
        // component->num_columns_in, component->num_columns_in);
        PwlApply32(component, listsize);
        // PrintMatrixFloat32("PWL Output float", reinterpret_cast<float*>(component->ptr_outputs), component->num_rows_out,
        // component->num_columns_out, component->num_columns_out);
#ifdef INTEGER_REF
        } else if (component->num_bytes_per_output == 2) {
            PwlApply16(component, listsize);
#endif  // #ifdef INTEGER_REF
    } else {
        fprintf(stderr, "Bad data width in ApplyPiecewiseLinearTransform!\n");
        throw -1;
    }
}
768
// Overload used by the recurrent path: applies the piecewise-linear
// activation to a single output row (`num_row`), over columns
// [0, listsize - 1].  Dispatch mirrors the three-argument overload above.
// NOTE(review): braces are interleaved with the #ifdef on purpose so both
// preprocessor variants parse - edit with care.
__inline void ApplyPiecewiseLinearTransform(intel_dnn_component_t *component,
                                            intel_dnn_number_type_t number_type,
                                            uint32_t listsize,
                                            uint32_t num_row) {
    if (number_type == kDnnFloat) {
        PwlApply32(component, num_row, num_row, 0, listsize - 1);
#ifdef INTEGER_REF
        } else if (component->num_bytes_per_output == 2) {
            PwlApply16(component, num_row, num_row, 0, listsize-1);
#endif  // #ifdef INTEGER_REF
    } else {
        fprintf(stderr, "Bad data width in ApplyPiecewiseLinearTransform!\n");
        throw -1;
    }
}
784
785 __inline void ApplyMaxPoolTransform(intel_dnn_component_t *component, intel_dnn_number_type_t number_type) {
786     if (component->num_bytes_per_input == 4) {
787         // PrintMatrixFloat32("Input float", reinterpret_cast<float*>(component->ptr_inputs), component->num_rows_in,
788         // component->num_columns_in, component->num_columns_in);
789         CNNMaxPool(component, number_type);
790         // PrintMatrixFloat32("Output float", reinterpret_cast<float*>(component->ptr_outputs), component->num_rows_out,
791         // component->num_columns_out, component->num_columns_out);
792     } else {
793         fprintf(stderr, "Bad data width in ApplyMaxPoolTransform!\n");
794         throw -1;
795     }
796 }
797
// Transposes the component's input matrix into its output buffer:
// B = transpose(A), where A is m x n and B is n x m.  Used for both the
// interleave and deinterleave operations.  Throws -1 on an unsupported
// element width.
__inline void ApplyTranspose(intel_dnn_component_t *component) {
    int m = component->num_rows_in;
    int n = component->num_columns_in;
    int lda = component->num_columns_in;   // input row stride
    int ldb = component->num_columns_out;  // output row stride
    // B = Transpose(A) where A is mxn and B is nxm
    switch (component->num_bytes_per_input) {
#ifdef INTEGER_REF
        case 1:
            {
                int8_t *A = reinterpret_cast<int8_t*>(component->ptr_inputs);
                int8_t *B = reinterpret_cast<int8_t*>(component->ptr_outputs);
                for (uint32_t row = 0; row < m; row++) {
                    for (uint32_t col = 0; col < n; col++) {
                        B[col*ldb+row] = A[row*lda+col];
                    }
                }
            }
            break;
        case 2:
            {
                int16_t *A = reinterpret_cast<int16_t*>(component->ptr_inputs);
                int16_t *B = reinterpret_cast<int16_t*>(component->ptr_outputs);
                for (uint32_t row = 0; row < m; row++) {
                    for (uint32_t col = 0; col < n; col++) {
                        B[col*ldb+row] = A[row*lda+col];
                    }
                }
            }
            break;
#endif  // #ifdef INTEGER_REF
        case 4: {  // 32-bit float path
            auto A = reinterpret_cast<float *>(component->ptr_inputs);
            auto B = reinterpret_cast<float *>(component->ptr_outputs);
            for (uint32_t row = 0; row < m; row++) {
                for (uint32_t col = 0; col < n; col++) {
                    B[col * ldb + row] = A[row * lda + col];
                }
            }
        }
            break;
        default:fprintf(stderr, "Bad data width in ApplyInterleave!\n");
            throw -1;
    }
}
843
844 __inline void ApplyCopy(intel_dnn_component_t *component) {
845     auto src = reinterpret_cast<uint8_t *>(component->ptr_inputs);
846     auto dst = reinterpret_cast<uint8_t *>(component->ptr_outputs);
847     int32_t m = component->op.copy.num_copy_rows;
848     int32_t n = component->op.copy.num_copy_columns;
849     int32_t lda = component->num_columns_in;
850     int32_t ldb = component->num_columns_out;
851     if (m > component->num_rows_in) {
852         fprintf(stderr, "Error:  attempt to copy more columns than matrix has!\n");
853         throw -1;
854     } else {
855         switch (component->num_bytes_per_input) {
856 #ifdef INTEGER_REF
857             case 2:
858                 {
859                     int16_t *A = reinterpret_cast<int16_t*>(src);
860                     int16_t *B = reinterpret_cast<int16_t*>(dst);
861                     for (uint32_t row = 0; row < m; row++) {
862                         for (uint32_t col = 0; col < n; col++) {
863                             B[row*ldb + col] = A[row*lda + col];
864                         }
865                     }
866                 }
867                 break;
868 #endif  // #ifdef INTEGER_REF
869             case 4: {
870                 auto A = reinterpret_cast<float *>(src);
871                 auto B = reinterpret_cast<float *>(dst);
872                 for (uint32_t row = 0; row < m; row++) {
873                     for (uint32_t col = 0; col < n; col++) {
874                         B[row * ldb + col] = A[row * lda + col];
875                     }
876                 }
877             }
878                 break;
879             default:fprintf(stderr, "Bad data width in ApplyCopy!\n");
880                 throw -1;
881         }
882     }
883 }
884
885 uint32_t AmIntelDnn::CopyActiveList(std::vector<std::vector<uint32_t> > &active_list, uint32_t list_index) {
886     if (component[component.size() - 1].orientation_out == kDnnInterleavedOrientation) {
887         num_active_outputs_ = component[component.size() - 1].num_rows_out;
888     } else {
889         num_active_outputs_ = component[component.size() - 1].num_columns_out;
890     }
891
892     if (!active_list.empty()) {
893         if (list_index >= active_list.size()) {
894             fprintf(stderr, "Index %d beyond end of active list in CopyActiveList()\n", list_index);
895             throw -1;
896         }
897         if (active_list[list_index].size() > component[component.size() - 1].num_rows_out) {
898             fprintf(stderr, "Active list too large in CopyActiveList()\n");
899             throw -1;
900         }
901
902         if (ptr_active_outputs_ != nullptr) {
903             num_active_outputs_ = active_list[list_index].size();
904             memcpy(ptr_active_outputs_, active_list[list_index].data(), num_active_outputs_ * sizeof(uint32_t));
905         }
906     }
907
908     return (num_active_outputs_);
909 }
910
// Runs a forward pass: executes every component in order, dispatching each
// to its Apply* kernel.  The active-output list is applied only to the last
// component (or the last affine when it is followed by a PWL activation).
// A recurrent component must be immediately followed by a PWL component;
// the pair is executed row-by-row and the loop index skips the PWL.
// Throws -1 on an unknown operation.
void AmIntelDnn::Propagate() {
    for (uint32_t i = 0; i < component.size(); i++) {
        intel_dnn_component_t *comp = &component[i];
        uint32_t *ptr_active_outputs = nullptr;
        // default output count depends on orientation: interleaved outputs
        // are counted by rows, otherwise by columns
        uint32_t num_active_outputs = (comp->orientation_out == kDnnInterleavedOrientation)
                                      ? comp->num_rows_out : comp->num_columns_out;

        if (i == component.size() - 1) {  // active list applies to last component
            ptr_active_outputs = ptr_active_outputs_;
            num_active_outputs = num_active_outputs_;
        } else if (i == component.size() - 2) {  // also applies to last two components when last is PWL
            if ((component[i].operation == kDnnAffineOp) && (component[i + 1].operation == kDnnPiecewiselinearOp)) {
                ptr_active_outputs = ptr_active_outputs_;
                num_active_outputs = num_active_outputs_;
            }
        }

        switch (comp->operation) {
            case kDnnAffineOp :ApplyAffineTransform(comp, ptr_active_outputs, num_active_outputs);
                break;
            case kDnnDiagonalOp:ApplyDiagonalTransform(comp);
                break;
            case kDnnRecurrentOp:
                // recurrent + PWL are fused: each row's recurrent step is
                // immediately activated so the next row can consume the
                // activated feedback
                if ((i < component.size() - 1) && (component[i + 1].operation == kDnnPiecewiselinearOp)) {
                    intel_dnn_component_t *comp_pwl = &component[i + 1];
                    for (uint32_t j = 0; j < comp->num_rows_in; j++) {
                        // feedback for row j lives j rows into the feedback buffer
                        void *ptr_feedbacks =
                            reinterpret_cast<void *>(reinterpret_cast<int32_t *>(comp->op.recurrent.ptr_feedbacks) + j * comp_pwl->num_columns_out);
                        ApplyRecurrentTransform(comp, j, ptr_feedbacks);
                        //  PrintOutputs(i);
                        ApplyPiecewiseLinearTransform(comp_pwl, number_type_, num_active_outputs, j);
                    }
                    i++;  // skip next component
                } else {
                    fprintf(stderr, "Missing PiecewiseLinear component after Recurrent component in Propagate!\n");
                    throw -1;
                }
                break;
            case kDnnConvolutional1dOp:ApplyConvolutional1DTransform(comp);
                break;
            case kDnnPiecewiselinearOp:ApplyPiecewiseLinearTransform(comp, number_type_, num_active_outputs);
                break;
            case kDnnMaxPoolOp:ApplyMaxPoolTransform(comp, number_type_);
                break;
            case kDnnInterleaveOp:ApplyTranspose(comp);
                break;
            case kDnnDeinterleaveOp:ApplyTranspose(comp);
                break;
            case kDnnCopyOp:ApplyCopy(comp);
                break;
            default:fprintf(stderr, "Bad operation in Propagate!\n");
                throw -1;
                break;
        }
        //  PrintOutputs(i); fflush(stdout);
    }
}
968
969 intel_dnn_macro_operation_t AmIntelDnn::MacroOperation(uint32_t component_index) {
970     return (component[component_index].macro_operation);
971 }
972
973 void AmIntelDnn::SetMacroOperation(uint32_t component_index, intel_dnn_macro_operation_t macro_operation) {
974     component[component_index].macro_operation = macro_operation;
975 }
976
977 float AmIntelDnn::InputScaleFactor(uint32_t component_index) {
978     float scale_factor = 1.0;
979
980     if (component_index == 0) {
981         scale_factor = input_scale_factor_;
982     } else {
983         if (component[component_index - 1].operation == kDnnAffineOp) {
984             scale_factor = component[component_index - 1].output_scale_factor;
985         } else if (component[component_index - 1].operation == kDnnDiagonalOp) {
986             scale_factor = component[component_index - 1].output_scale_factor;
987         } else if (component[component_index - 1].operation == kDnnConvolutional1dOp) {
988             scale_factor = component[component_index - 1].output_scale_factor;
989         } else if (component[component_index - 1].operation == kDnnRecurrentOp) {
990             scale_factor = component[component_index - 1].output_scale_factor;
991         } else if (component[component_index - 1].operation == kDnnInterleaveOp) {
992             scale_factor = component[component_index - 1].output_scale_factor;
993         } else if (component[component_index - 1].operation == kDnnDeinterleaveOp) {
994             scale_factor = component[component_index - 1].output_scale_factor;
995         } else if (component[component_index - 1].operation == kDnnCopyOp) {
996             scale_factor = component[component_index - 1].output_scale_factor;
997         }
998     }
999
1000     return (scale_factor);
1001 }
1002
1003 float AmIntelDnn::WeightScaleFactor(uint32_t component_index) {
1004     float scale_factor = 1.0;
1005
1006     if (component[component_index].operation == kDnnAffineOp) {
1007         scale_factor = component[component_index].op.affine.weight_scale_factor;
1008     } else if (component[component_index].operation == kDnnDiagonalOp) {
1009         scale_factor = component[component_index].op.affine.weight_scale_factor;
1010     } else if (component[component_index].operation == kDnnConvolutional1dOp) {
1011         scale_factor = component[component_index].op.conv1D.weight_scale_factor;
1012     } else if (component[component_index].operation == kDnnRecurrentOp) {
1013         scale_factor = component[component_index].op.recurrent.weight_scale_factor;
1014     }
1015
1016     return (scale_factor);
1017 }
1018
1019 float AmIntelDnn::OutputScaleFactor(intel_dnn_component_t &comp) {
1020     return comp.output_scale_factor;
1021 }
1022
1023 void AmIntelDnn::SetOutputScaleFactor(uint32_t component_index, float scale_factor) {
1024     component[component_index].output_scale_factor = scale_factor;
1025 }
1026
1027 void AmIntelDnn::PrintOutputs(uint32_t component_index) {
1028     float scale_factor = OutputScaleFactor(component_index);
1029     uint32_t num_rows = component[component_index].num_rows_out;
1030     uint32_t num_columns = component[component_index].num_columns_out;
1031
1032     printf("component %d : %s\n", component_index, intel_dnn_operation_name[component[component_index].operation]);
1033     if (number_type_ == kDnnFloat) {
1034         auto ptr_output = reinterpret_cast<float *>(component[component_index].ptr_outputs);
1035         for (int i = 0; i < num_rows; i++) {
1036             for (int j = 0; j < num_columns; j++) {
1037                 printf("%d %d : %e\n", i, j, ptr_output[i * num_columns + j] / scale_factor);
1038             }
1039         }
1040     } else {
1041         switch (component[component_index].num_bytes_per_output) {
1042             case 1: {
1043                 auto ptr_output = reinterpret_cast<int8_t *>(component[component_index].ptr_outputs);
1044                 for (int i = 0; i < num_rows; i++) {
1045                     for (int j = 0; j < num_columns; j++) {
1046                         printf("%d %d : %e\n", i, j, static_cast<float>(ptr_output[i * num_columns + j]) / scale_factor);
1047                     }
1048                 }
1049             }
1050                 break;
1051             case 2: {
1052                 auto ptr_output = reinterpret_cast<int16_t *>(component[component_index].ptr_outputs);
1053                 for (int i = 0; i < num_rows; i++) {
1054                     for (int j = 0; j < num_columns; j++) {
1055                         printf("%d %d : %e\n", i, j, static_cast<float>(ptr_output[i * num_columns + j]) / scale_factor);
1056                     }
1057                 }
1058             }
1059                 break;
1060             case 4: {
1061                 auto ptr_output = reinterpret_cast<int32_t *>(component[component_index].ptr_outputs);
1062                 for (int i = 0; i < num_rows; i++) {
1063                     for (int j = 0; j < num_columns; j++) {
1064                         printf("%d %d : %e\n", i, j, static_cast<float>(ptr_output[i * num_columns + j]) / scale_factor);
1065                     }
1066                 }
1067             }
1068                 break;
1069             default:
1070                 fprintf(stderr,
1071                         "Bad num_bytes_per_output in component %d in AmIntelDnn::PrintOutputs()\n",
1072                         component_index);
1073                 throw -1;
1074         }
1075     }
1076 }
1077
// Compares the final component's outputs against a float reference score
// array and accumulates error statistics into score_error.
// @param ptr_refscorearray  reference scores (float), laid out frame-major
//                           when the output orientation is non-interleaved.
// @param score_error        accumulator for abs/relative error statistics;
//                           cleared before accumulation.
// @param num_frames         number of frames being compared.
// @return number of scores whose absolute error exceeds score_error->threshold.
// @throws int (-1) on an unsupported output width or number type.
uint32_t AmIntelDnn::CompareScores(void *ptr_refscorearray, intel_score_error_t *score_error, uint32_t num_frames) {
    intel_dnn_component_t *ptr_component = &component[component.size() - 1];
    intel_dnn_orientation_t orientation = ptr_component->orientation_out;
    float scale_factor = OutputScaleFactor(component.size() - 1);
    uint32_t num_errors = 0;
    // interleaved: rows are outputs, columns are frames; otherwise transposed
    uint32_t num_rows = (orientation == kDnnInterleavedOrientation) ? ptr_component->num_rows_out : num_frames;
    uint32_t num_columns = (orientation == kDnnInterleavedOrientation) ? num_frames : ptr_component->num_columns_out;
    // the reference array is indexed with the opposite row/column roles when
    // orientations differ, hence the separate stride
    uint32_t num_row_step_ref =
        (orientation == kDnnInterleavedOrientation) ? ptr_component->num_rows_out : ptr_component->num_columns_out;
    uint32_t num_row_step = ptr_component->num_columns_out;

    // an affine last layer may use an active-output list: only compare those
    if (ptr_component->operation == kDnnAffineOp) {
        num_rows = num_active_outputs_;
    }

    ClearScoreError(score_error);

    if (number_type_ == kDnnFloat) {
        auto A = reinterpret_cast<float *>(ptr_component->ptr_outputs);
        auto B = reinterpret_cast<float *>(ptr_refscorearray);
        for (int i = 0; i < num_rows; i++) {
            for (int j = 0; j < num_columns; j++) {
                float score = A[i * num_row_step + j];
                // reference is transposed relative to output when interleaved
                float refscore =
                    (orientation == kDnnInterleavedOrientation) ? B[j * num_row_step_ref + i] : B[i * num_row_step_ref
                        + j];
                float scaled_score = score / scale_factor;
                float error = fabs(refscore - scaled_score);
                // +1e-20 guards against division by zero for zero references
                float rel_error = error / (fabs(refscore) + 1e-20);
                float squared_error = error * error;
                float squared_rel_error = rel_error * rel_error;
                score_error->num_scores++;
                score_error->sum_error += error;
                score_error->sum_squared_error += squared_error;
                if (error > score_error->max_error) {
                    score_error->max_error = error;
                }
                score_error->sum_rel_error += rel_error;
                score_error->sum_squared_rel_error += squared_rel_error;
                if (rel_error > score_error->max_rel_error) {
                    score_error->max_rel_error = rel_error;
                }
                if (error > score_error->threshold) {
                    num_errors++;
                }
            }
        }
    } else if (number_type_ == kDnnInt) {
        auto B = reinterpret_cast<float *>(ptr_refscorearray);
        for (int i = 0; i < num_rows; i++) {
            for (int j = 0; j < num_columns; j++) {
                float score;
                // integer outputs are widened to float before de-scaling
                if (ptr_component->num_bytes_per_output == 4) {
                    auto A = reinterpret_cast<int32_t *>(ptr_component->ptr_outputs);
                    score = static_cast<float>(A[i * num_row_step + j]);
                } else if (ptr_component->num_bytes_per_output == 2) {
                    auto A = reinterpret_cast<int16_t *>(ptr_component->ptr_outputs);
                    score = static_cast<float>(A[i * num_row_step + j]);
                } else {
                    fprintf(stderr,
                            "Unsupported output width (%d) in AmIntelDnn::CompareScores()!\n",
                            ptr_component->num_bytes_per_output);
                    throw -1;
                }
                float refscore =
                    (orientation == kDnnInterleavedOrientation) ? B[j * num_row_step_ref + i] : B[i * num_row_step_ref
                        + j];
                float scaled_score = score / scale_factor;
                float error = fabs(refscore - scaled_score);
                // +1e-20 guards against division by zero for zero references
                float rel_error = error / (fabs(refscore) + 1e-20);
                float squared_error = error * error;
                float squared_rel_error = rel_error * rel_error;
                score_error->num_scores++;
                score_error->sum_error += error;
                score_error->sum_squared_error += squared_error;
                if (error > score_error->max_error) {
                    score_error->max_error = error;
                }
                score_error->sum_rel_error += rel_error;
                score_error->sum_squared_rel_error += squared_rel_error;
                if (rel_error > score_error->max_rel_error) {
                    score_error->max_rel_error = rel_error;
                }
                if (error > score_error->threshold) {
                    num_errors++;
                }
            }
        }
    } else {
        fprintf(stderr, "Unknown number type in AmIntelDnn::CompareScores()!\n");
        throw -1;
    }

    score_error->num_errors = num_errors;

    return (num_errors);
}
1175
1176 void AmIntelDnn::WriteGraphWizModel(const char *filename) {
1177     auto & components = component;
1178
1179 #define IS_AFFINE(k)\
1180     (components[k].operation == kDnnAffineOp ||\
1181      components[k].operation == kDnnDiagonalOp)
1182
1183 #define IS_CONV(k)\
1184     (components[k].operation == kDnnConvolutional1dOp)
1185
1186 #define IS_RELU(k)\
1187     (components[k].operation == kDnnPiecewiselinearOp &&\
1188      components[k].op.pwl.func_id == kActRelu)
1189
1190
1191 #define IS_DIAG(k)\
1192     (components[k].operation == kDnnDiagonalOp)
1193
1194 #define OUTPUTS(idx)\
1195     components[idx].ptr_outputs, components[idx].num_rows_out*components[idx].num_columns_out * components[idx].num_bytes_per_output
1196
1197 #define INPUTS(idx)\
1198     components[idx].ptr_inputs, components[idx].num_rows_in*components[idx].num_columns_in * components[idx].num_bytes_per_input
1199
1200 #define BIASES(idx)\
1201     components[idx].op.affine.ptr_biases,  components[idx].num_rows_in*components[idx].num_columns_in * components[idx].op.affine.num_bytes_per_bias
1202
1203 #define WEIGHTS(idx)\
1204     components[idx].op.affine.ptr_weights, components[idx].op.affine.num_bytes_per_weight * components[idx].num_rows_in*components[idx].num_columns_in * \
1205             (IS_DIAG(idx) ? 1 : components[idx].num_rows_out*components[idx].num_columns_out)
1206
1207     auto intersected = [](void * ptra, size_t asize, void * ptrb, size_t bsize) {
1208         return !(((reinterpret_cast<char*>(ptra) + asize) <= ptrb) || ((reinterpret_cast<char*>(ptrb) + bsize) <= ptra));
1209     };
1210
1211     auto equals = [](void * ptra, size_t asize, void * ptrb, size_t bsize) {
1212         // return !((((char*)ptra + asize) < ptrb) || (((char*)ptrb + bsize) < ptra));
1213         return ptra >= ptrb  && ptra < reinterpret_cast<char*>(ptrb) + bsize;
1214     };
1215
1216     std::fstream graph("graph.dot", std::ios::out);
1217     graph << "strict digraph {";
1218     std::set<void*> weights;
1219     std::set<void*> biases;
1220     std::set<void*> outputs;
1221     std::set<std::string> layersNames;
1222
1223     auto generate_layer_name = [&](int k) {
1224         std::string l;
1225         if (components[k].operation == kDnnPiecewiselinearOp) {
1226             l += intel_dnn_activation_name[components[k].op.pwl.func_id];
1227         } else {
1228             l += intel_dnn_operation_name[components[k].operation];
1229         }
1230         l += "_" + std::to_string(k);
1231         if (components[k].operation == kDnnPiecewiselinearOp) {
1232             graph << l << " [shape=box, style=filled, fillcolor=yellow";
1233         } else {
1234             graph << l << " [shape=box";
1235         }
1236
1237         graph << ", label=<<TABLE BORDER=\"0\" CELLBORDER=\"1\" CELLSPACING=\"0\">\n"
1238             "  <TR><TD  colspan=\"2\">" <<  l << "</TD></TR>\n"
1239             "  <TR><TD  colspan=\"2\">" <<  components[k].num_rows_in << "x" <<  components[k].num_rows_out<< "</TD></TR>\n";
1240         if (IS_AFFINE(k)) {
1241             graph << "  <TR><TD> wscale</TD><TD>" <<  components[k].op.affine.weight_scale_factor<< "</TD></TR>\n";
1242             graph << "  <TR><TD> wbit</TD><TD>" <<  components[k].op.affine.num_bytes_per_weight<< "</TD></TR>\n";
1243             graph << "  <TR><TD> bbit</TD><TD>" <<  components[k].op.affine.num_bytes_per_bias<< "</TD></TR>\n";
1244         }
1245         if (IS_RELU(k)) {
1246             graph << "  <TR><TD> negative_slope</TD><TD>" <<  components[k].op.pwl.func_id.negative_slope<< "</TD></TR>\n";
1247         }
1248         if (IS_CONV(k)) {
1249             auto &conv = components[k].op.conv1D;
1250             graph << "  <TR><TD> num_filters</TD><TD>" <<  conv.num_filters<< "</TD></TR>\n";
1251             graph << "  <TR><TD> num_filter_rows</TD><TD>" <<  conv.num_filter_rows<< "</TD></TR>\n";
1252             graph << "  <TR><TD> num_filter_coefficients</TD><TD>" <<  conv.num_filter_coefficients<< "</TD></TR>\n";
1253             graph << "  <TR><TD> num_feature_maps</TD><TD>" <<  conv.num_feature_maps<< "</TD></TR>\n";
1254             graph << "  <TR><TD> num_feature_map_rows</TD><TD>" <<  conv.num_feature_map_rows<< "</TD></TR>\n";
1255             graph << "  <TR><TD> num_feature_map_columns</TD><TD>" <<  conv.num_feature_map_columns<< "</TD></TR>\n";
1256             graph << "  <TR><TD> wscale</TD><TD>" <<  conv.weight_scale_factor<< "</TD></TR>\n";
1257             graph << "  <TR><TD> wbit</TD><TD>" <<  conv.num_bytes_per_weight<< "</TD></TR>\n";
1258             graph << "  <TR><TD> bbit</TD><TD>" <<  conv.num_bytes_per_bias<< "</TD></TR>\n";
1259         }
1260         graph<<   "  <TR><TD> num_rows_in</TD><TD>" <<  components[k].num_rows_in<< "</TD></TR>\n"
1261                   "  <TR><TD> num_columns_in</TD><TD>" <<  components[k].num_columns_in<< "</TD></TR>\n"
1262                   "  <TR><TD> num_rows_out</TD><TD>" <<  components[k].num_rows_out<< "</TD></TR>\n"
1263                   "  <TR><TD> num_columns_out</TD><TD>" <<  components[k].num_columns_out<< "</TD></TR>\n"
1264                   "  <TR><TD> oscale</TD><TD>" <<  components[k].output_scale_factor<< "</TD></TR>\n"
1265                   "  <TR><TD> ibit</TD><TD>" <<  components[k].num_bytes_per_input<< "</TD></TR>\n"
1266                   "  <TR><TD> obit</TD><TD>" <<  components[k].num_bytes_per_output<< "</TD></TR>\n"
1267             "</TABLE>>];\n";
1268
1269         return l;
1270     };
1271
1272
1273     for (int k = 0; k < components.size(); ++k) {
1274         std::string l = generate_layer_name(k);
1275         layersNames.insert(l);
1276         int lidx = std::distance(layersNames.begin(), layersNames.find(l));
1277         int widx = 0;
1278         int bidx = 0;
1279
1280         if (IS_AFFINE(k)) {
1281             weights.insert(components[k].op.affine.ptr_weights);
1282             biases.insert(components[k].op.affine.ptr_biases);
1283
1284             widx = std::distance(weights.begin(), weights.find(components[k].op.affine.ptr_weights));
1285             bidx = std::distance(biases.begin(), biases.find(components[k].op.affine.ptr_biases));
1286         }
1287
1288
1289         auto lw =  "weights_" +  std::to_string(lidx) + "_" + std::to_string(widx);;
1290         auto lb =  "biases_" +  std::to_string(lidx) + "_" + std::to_string(bidx);
1291
1292         if (IS_AFFINE(k)) {
1293             graph << lw << " -> " << l << "[style=bold];";
1294             graph << lb << " -> " << l << "[style=bold];";
1295         }
1296
1297         graph << "\n";
1298
1299         bool inputConnected = false;
1300
1301         for (int k2 = 0; k2 < components.size(); ++k2) {
1302             if (k2 == k) continue;
1303
1304
1305             std::string r = generate_layer_name(k2);
1306
1307             int w2idx = 0;
1308             int b2idx = 0;
1309
1310             if (IS_AFFINE(k2)) {
1311                 weights.insert(components[k2].op.affine.ptr_weights);
1312                 biases.insert(components[k2].op.affine.ptr_biases);
1313
1314                 w2idx = std::distance(weights.begin(), weights.find(components[k2].op.affine.ptr_weights));
1315                 b2idx = std::distance(biases.begin(), biases.find(components[k2].op.affine.ptr_biases));
1316             }
1317
1318             auto rw =  "weights_" + std::to_string(w2idx);
1319             auto rb =  "biases_" + std::to_string(b2idx);
1320
1321             // ----------------------------------------------------------
1322             // output to input connections
1323             if (intersected(OUTPUTS(k2), INPUTS(k))) {
1324                 graph << r <<" -> "<< l << ";";
1325                 inputConnected = true;
1326             }
1327
1328             // ----------------------------------------------------------
1329             // output to biases connections
1330             if (IS_AFFINE(k) && intersected(OUTPUTS(k2), BIASES(k))) {
1331                 graph << r << " -> " << lb << " [label=\"OB\", fontcolor=blue, color=blue, style=dashed];";
1332             }
1333
1334             // ----------------------------------------------------------
1335             // output to weights connections
1336             if (IS_AFFINE(k) && equals(OUTPUTS(k2), WEIGHTS(k))) {
1337                 graph << r << " -> " << lw << " [label=\"OW\", fontcolor=magenta, color=magenta, style=dashed];";
1338             }
1339
1340             // ----------------------------------------------------------
1341             // weights to input connections
1342             if (IS_AFFINE(k2) && equals(WEIGHTS(k2), INPUTS(k))) {
1343                 graph << rw << " -> " << l << " [label=\"WI\", fontcolor=red, color=red, style=dashed];";
1344                 inputConnected = true;
1345             }
1346
1347             // ----------------------------------------------------------
1348             // weights to bias connections
1349             if (IS_AFFINE(k2) && IS_AFFINE(k) && equals(WEIGHTS(k2), BIASES(k))) {
1350                 graph << rw << " -> " << lb << " [label=\"WB\", fontcolor=darkgreen,color=darkgreen, style=dashed];";
1351             }
1352         }
1353         if (!inputConnected) {
1354             // drawing tmp connection
1355             outputs.insert(components[k].ptr_inputs);
1356             auto tidx = std::distance(outputs.begin(), outputs.find(components[k].ptr_inputs));
1357             graph << tidx << " -> " << l
1358                   << " [label=\"FROM_TMP\", fontcolor=darkgreen,color=orange, style=dashed];";
1359         }
1360     }
1361
1362     for (int k = 0; k < components.size(); ++k) {
1363         std::string l = generate_layer_name(k);
1364
1365         int tidx = 0;
1366         for (auto tmpOutPtrs : outputs) {
1367             if (components[k].ptr_outputs == tmpOutPtrs) {
1368                 graph << l << " -> " << tidx << " [label=\"TO_TMP\", fontcolor=darkgreen,color=orange, style=dashed];";
1369             }
1370             tidx++;
1371         }
1372     }
1373
1374     graph << "}";
1375 }
1376
1377 void AmIntelDnn::WriteDnnText(const char *filename, intel_dnn_number_type_t number_type) {
1378     if ((number_type_ == kDnnFloat) && (number_type == kDnnInt)) {
1379         fprintf(stderr, "Error trying to write floating point DNN as integer in AmIntelDnn::WriteDnnText().\n");
1380         fprintf(stderr, "  Please convert to integer first.\n");
1381         throw -1;
1382     }
1383 #ifndef LIGHT_DUMP
1384     std::ofstream out_file1(filename, std::ios::out);
1385     std::ofstream &out_file = out_file1;
1386 #else
1387     std::ofstream out_file((std::string(filename) + ".light").c_str(), std::ios::out);
1388 #endif
1389     if (out_file.good()) {
1390         uint32_t num_inputs = component[0].num_rows_in;
1391         uint32_t num_outputs =
1392             (component[component.size() - 1].orientation_out == kDnnInterleavedOrientation) ? component[component.size()
1393                 - 1].num_rows_out : component[component.size() - 1].num_columns_out;
1394         uint32_t num_layers = num_gna_layers();
1395         uint32_t num_group = this->num_group_in();
1396         uint32_t layer = 0;
1397
1398         out_file << "<intel_dnn_file>\n";
1399         out_file << "<number_type> " << intel_dnn_number_type_name[number_type] << "\n";
1400         out_file << "<softmax_type> " << intel_dnn_softmax_name[softmax_type] << "\n";
1401         out_file << "<num_memory_bytes> " << std::dec << num_bytes_dnn_memory_ << "\n";
1402         out_file << "<num_group> " << std::dec << num_group << "\n";
1403         out_file << "<number_inputs> " << std::dec << num_inputs << "\n";
1404         out_file << "<num_outputs> " << std::dec << num_outputs << "\n";
1405         out_file << "<num_layers> " << std::dec << num_layers << "\n";
1406         for (uint32_t i = 0; i < component.size(); i++) {
1407 #ifdef LIGHT_DUMP
1408             std::stringstream out_file_name;
1409             out_file_name << getDumpFolderName() << std::setfill('0') << std::setw(2) << i << "_"
1410                           << intel_dnn_operation_name[component[i].operation]
1411                           << "-" << component[i].num_rows_in
1412                           << "-" << component[i].num_rows_out;
1413             if (component[i].operation == kDnnPiecewiselinearOp) {
1414                 out_file_name << "-" << intel_dnn_activation_name[component[i].op.pwl.func_id.type];
1415             }
1416             std::ofstream out_file((out_file_name.str() + ".txt").c_str(), std::ios::out);
1417 #endif
1418
1419             uint32_t num_rows_in = component[i].num_rows_in;
1420             uint32_t num_columns_in = component[i].num_columns_in;
1421             uint32_t num_rows_out = component[i].num_rows_out;
1422             uint32_t num_columns_out = component[i].num_columns_out;
1423             uint32_t num_bytes_per_input = component[i].num_bytes_per_input;
1424             uint32_t num_bytes_per_output = component[i].num_bytes_per_output;
1425             if ((component[i].operation == kDnnAffineOp)
1426                 || (component[i].operation == kDnnDiagonalOp)
1427                 || (component[i].operation == kDnnRecurrentOp)
1428                 || (component[i].operation == kDnnConvolutional1dOp)
1429                 || (component[i].operation == kDnnInterleaveOp)
1430                 || (component[i].operation == kDnnDeinterleaveOp)
1431                 || (component[i].operation == kDnnCopyOp)) {
1432                 out_file << "<layer_index> " << std::dec << layer << "\n";
1433                 layer++;
1434             }
1435             out_file << "<component_operation> " << intel_dnn_operation_name[component[i].operation] << "\n";
1436             out_file << "<macro_operation> " << intel_dnn_macro_operation_name[component[i].macro_operation] << "\n";
1437             out_file << "<num_rows_in> " << std::dec << num_rows_in << "\n";
1438             out_file << "<num_columns_in> " << std::dec << num_columns_in << "\n";
1439             out_file << "<num_rows_out> " << std::dec << num_rows_out << "\n";
1440             out_file << "<num_columns_out> " << std::dec << num_columns_out << "\n";
1441             out_file << "<orientation_in> " << std::dec << (component[i].orientation_in == kDnnInterleavedOrientation ?
1442             "interleaved" : "deinterleaved") << "\n";
1443             out_file << "<orientation_out> " << std::dec << (component[i].orientation_out == kDnnInterleavedOrientation ?
1444                                                             "interleaved" : "deinterleaved") << "\n";
1445
1446             if ((number_type_ == kDnnInt) && (number_type == kDnnFloat)) {
1447                 out_file << "<num_bytes_per_input> " << std::dec << sizeof(float) << "\n";
1448                 out_file << "<num_bytes_per_output> " << std::dec << sizeof(float) << "\n";
1449             } else {
1450                 out_file << "<num_bytes_per_input> " << std::dec << num_bytes_per_input << "\n";
1451                 out_file << "<num_bytes_per_output> " << std::dec << num_bytes_per_output << "\n";
1452             }
1453             out_file << "<input_address> " << "0x" << std::setfill('0') << std::setw(8) << std::hex
1454                      << MemoryOffset(component[i].ptr_inputs, ptr_dnn_memory_) << "\n";
1455             out_file << "<output_address> " << "0x" << std::setfill('0') << std::setw(8) << std::hex
1456                      << MemoryOffset(component[i].ptr_outputs, ptr_dnn_memory_) << "\n";
1457             switch (component[i].operation) {
1458                 case kDnnAffineOp:
1459                 case kDnnDiagonalOp: {
1460                     uint32_t num_bytes_per_weight = component[i].op.affine.num_bytes_per_weight;
1461                     uint32_t num_bytes_per_bias = component[i].op.affine.num_bytes_per_bias;
1462                     float weight_scale_factor = component[i].op.affine.weight_scale_factor;
1463                     float output_scale_factor = component[i].output_scale_factor;
1464                     uint32_t num_weight_rows = (component[i].operation == kDnnDiagonalOp) ? 1 : num_rows_out;
1465                     uint32_t num_weight_columns = num_rows_in;
1466                     if ((number_type_ == kDnnInt) && (number_type == kDnnFloat)) {
1467                         out_file << "<num_bytes_per_weight> " << std::dec << 4 << "\n";
1468                         out_file << "<num_bytes_per_bias> " << std::dec << 4 << "\n";
1469                     } else {
1470                         out_file << "<num_bytes_per_weight> " << std::dec << num_bytes_per_weight << "\n";
1471                         out_file << "<num_bytes_per_bias> " << std::dec << num_bytes_per_bias << "\n";
1472                     }
1473                     if ((number_type_ == kDnnInt) && (number_type == kDnnFloat)) {
1474                         out_file << std::setprecision(12) << std::scientific << "<weight_scale_factor> " << 1.0 << "\n";
1475                         out_file << std::setprecision(12) << std::scientific << "<output_scale_factor> " << 1.0 << "\n";
1476                     } else {
1477                         out_file << std::setprecision(12) << std::scientific << "<weight_scale_factor> "
1478                                  << weight_scale_factor << "\n";
1479                         out_file << std::setprecision(12) << std::scientific << "<output_scale_factor> "
1480                                  << output_scale_factor << "\n";
1481                     }
1482                     out_file << "<weight_address> " << "0x" << std::setfill('0') << std::setw(8) << std::hex
1483                              << MemoryOffset(component[i].op.affine.ptr_weights, ptr_dnn_memory_) << "\n";
1484                     out_file << "<bias_address> " << "0x" << std::setfill('0') << std::setw(8) << std::hex
1485                              << MemoryOffset(component[i].op.affine.ptr_biases, ptr_dnn_memory_) << "\n";
1486
1487                     std::ofstream out_wfile((out_file_name.str() + "_weights.txt").c_str(), std::ios::out);
1488                     std::ofstream out_bfile((out_file_name.str() + "_biases.txt").c_str(), std::ios::out);
1489
1490                     if (num_bytes_per_weight == 1) {
1491                         int8_t *ptr_weight = reinterpret_cast<int8_t *>(component[i].op.affine.ptr_weights);
1492                         intel_compound_bias_t *ptr_bias = reinterpret_cast<intel_compound_bias_t *>(component[i].op.affine.ptr_biases);
1493 #ifdef DUMP_WB
1494                         for (uint32_t row = 0; row < num_weight_rows; row++) {
1495                             for (uint32_t col = 0; col < num_weight_columns; col++) {
1496                                 if (number_type == kDnnFloat) {
1497                                     float val =
1498                                         static_cast<float>(ptr_weight[row * num_weight_columns + col]) * ptr_bias[row].multiplier
1499                                             / weight_scale_factor;
1500                                     out_wfile << std::setprecision(4) << val << " ";
1501                                 } else {
1502                                     out_wfile <<  int((int8_t) ptr_weight[row * num_weight_columns + col]) << " ";
1503                                 }
1504                                 out_wfile << "\n";
1505                             }
1506                         }
1507 #endif
1508                     } else if (num_bytes_per_weight == 2) {
1509                         int16_t *ptr_weight = reinterpret_cast<int16_t *>(component[i].op.affine.ptr_weights);
1510 #ifdef DUMP_WB
1511                         for (uint32_t row = 0; row < num_weight_rows; row++) {
1512                             for (uint32_t col = 0; col < num_weight_columns; col++) {
1513                                 if (number_type == kDnnFloat) {
1514                                     out_wfile << std::setprecision(12)
1515                                               << ptr_weight[row * num_weight_columns + col] / weight_scale_factor << " ";
1516                                 } else {
1517                                     out_wfile << ptr_weight[row * num_weight_columns + col] << " ";
1518                                 }
1519                                 out_wfile << "\n";
1520                             }
1521                         }
1522 #endif
1523                     } else if (number_type_ == kDnnFloat) {
1524                         float *ptr_weight = reinterpret_cast<float *>(component[i].op.affine.ptr_weights);
1525 #ifdef DUMP_WB
1526                         for (uint32_t row = 0; row < num_weight_rows; row++) {
1527                             for (uint32_t col = 0; col < num_weight_columns; col++) {
1528                                 out_wfile << std::setprecision(5)
1529                                           << ptr_weight[row * num_weight_columns + col] << " ";
1530                                 out_wfile << "\n";
1531                             }
1532                         }
1533 #endif
1534                     } else {
1535                         fprintf(stderr, "Unsupported weight type in WriteDnnText!\n");
1536                         throw -1;
1537                     }
1538                     if (number_type_ == kDnnInt) {
1539                         if (num_bytes_per_weight == 1) {
1540                             intel_compound_bias_t
1541                                 *ptr_biases = reinterpret_cast<intel_compound_bias_t *>(component[i].op.affine.ptr_biases);
1542 #ifdef DUMP_WB
1543                             for (uint32_t row = 0; row < num_rows_out; row++) {
1544                                 out_bfile << std::setw(8) << ptr_biases[row].bias << ", ";
1545                                 out_bfile << std::setw(8) << int(ptr_biases[row].multiplier) << "\n";
1546                             }
1547 #endif
1548                         } else {
1549                             int32_t *ptr_biases = reinterpret_cast<int32_t *>(component[i].op.affine.ptr_biases);
1550 #ifdef DUMP_WB
1551                             for (uint32_t row = 0; row < num_rows_out; row++) {
1552                                 if (number_type == kDnnInt) {
1553                                     out_bfile << std::setw(8) << ptr_biases[row] << "\n";
1554                                 } else {
1555                                     out_bfile << std::setw(8) << ptr_biases[row] / output_scale_factor << "\n";
1556                                 }
1557                             }
1558 #endif
1559                         }
1560
1561                     } else {
1562                         float *ptr_biases = reinterpret_cast<float *>(component[i].op.affine.ptr_biases);
1563 #ifdef DUMP_WB
1564
1565                         for (uint32_t row = 0; row < num_rows_out; row++) {
1566                             out_bfile << std::setprecision(5) << ptr_biases[row] << "\n";
1567                         }
1568 #endif
1569                     }
1570                 }
1571                 break;
1572                 case kDnnConvolutional1dOp: {
1573                     uint32_t num_filters = component[i].op.conv1D.num_filters;
1574                     uint32_t num_filter_rows = component[i].op.conv1D.num_filter_rows;
1575                     uint32_t num_filter_coefficients = component[i].op.conv1D.num_filter_coefficients;
1576                     uint32_t num_feature_maps = component[i].op.conv1D.num_feature_maps;
1577                     uint32_t num_feature_map_rows = component[i].op.conv1D.num_feature_map_rows;
1578                     uint32_t num_feature_map_columns = component[i].op.conv1D.num_feature_map_columns;
1579                     uint32_t num_filter_outputs =
1580                         component[i].op.conv1D.num_feature_map_rows - component[i].op.conv1D.num_filter_rows + 1;
1581                     uint32_t num_bytes_per_weight = component[i].op.conv1D.num_bytes_per_weight;
1582                     uint32_t num_bytes_per_bias = component[i].op.conv1D.num_bytes_per_bias;
1583                     float weight_scale_factor = component[i].op.conv1D.weight_scale_factor;
1584                     float output_scale_factor = component[i].output_scale_factor;
1585                     out_file << "<num_filters> " << std::dec << num_filters << "\n";
1586                     out_file << "<num_filter_coefficients> " << std::dec << num_filter_coefficients << "\n";
1587                     out_file << "<num_filter_rows> " << std::dec << num_filter_rows << "\n";
1588                     out_file << "<num_feature_maps> " << std::dec << num_feature_maps << "\n";
1589                     out_file << "<num_feature_map_rows> " << std::dec << num_feature_map_rows << "\n";
1590                     out_file << "<num_feature_map_columns> " << std::dec << num_feature_map_columns << "\n";
1591                     if ((number_type_ == kDnnInt) && (number_type == kDnnFloat)) {
1592                         out_file << "<num_bytes_per_weight> " << std::dec << 4 << "\n";
1593                         out_file << "<num_bytes_per_bias> " << std::dec << 4 << "\n";
1594                     } else {
1595                         out_file << "<num_bytes_per_weight> " << std::dec << num_bytes_per_weight << "\n";
1596                         out_file << "<num_bytes_per_bias> " << std::dec << num_bytes_per_bias << "\n";
1597                     }
1598                     if ((number_type_ == kDnnInt) && (number_type == kDnnFloat)) {
1599                         out_file << std::setprecision(12) << std::scientific << "<weight_scale_factor> " << 1.0 << "\n";
1600                         out_file << std::setprecision(12) << std::scientific << "<output_scale_factor> " << 1.0 << "\n";
1601                     } else {
1602                         out_file << std::setprecision(12) << std::scientific << "<weight_scale_factor> "
1603                                  << weight_scale_factor << "\n";
1604                         out_file << std::setprecision(12) << std::scientific << "<output_scale_factor> "
1605                                  << output_scale_factor << "\n";
1606                     }
1607                     out_file << "<filter_address> " << "0x" << std::setfill('0') << std::setw(8) << std::hex
1608                              << MemoryOffset(component[i].op.conv1D.ptr_filters, ptr_dnn_memory_) << "\n";
1609                     out_file << "<bias_address> " << "0x" << std::setfill('0') << std::setw(8) << std::hex
1610                              << MemoryOffset(component[i].op.conv1D.ptr_biases, ptr_dnn_memory_) << "\n";
1611
1612
1613                     std::ofstream out_wfile((out_file_name.str() + "_weights.txt").c_str(), std::ios::out);
1614                     std::ofstream out_bfile((out_file_name.str() + "_biases.txt").c_str(), std::ios::out);
1615
1616
1617                     if (num_bytes_per_weight == 1) {
1618                         int8_t *ptr_weight = reinterpret_cast<int8_t *>(component[i].op.conv1D.ptr_filters);
1619                         intel_compound_bias_t *ptr_bias = reinterpret_cast<intel_compound_bias_t *>(component[i].op.conv1D.ptr_biases);
1620 #ifdef DUMP_WB
1621                         for (uint32_t row = 0; row < num_filters; row++) {
1622                             for (uint32_t col = 0; col < num_filter_coefficients; col++) {
1623                                 if (number_type == kDnnFloat) {
1624                                     float val = static_cast<float>(ptr_weight[row * num_filter_coefficients + col])
1625                                         * ptr_bias[row].multiplier / weight_scale_factor;
1626                                     out_wfile << std::setprecision(12) <<val << "\n";
1627                                 } else {
1628                                     out_wfile << "0x" << std::setfill('0') << std::setw(2) << std::hex
1629                                              << int((uint8_t) ptr_weight[row * num_filter_coefficients + col]) << "\n";
1630                                 }
1631                             }
1632                         }
1633 #endif
1634                     } else if (num_bytes_per_weight == 2) {
1635                         int16_t *ptr_weight = reinterpret_cast<int16_t *>(component[i].op.conv1D.ptr_filters);
1636 #ifdef DUMP_WB
1637                         for (uint32_t row = 0; row < num_filters; row++) {
1638                             for (uint32_t col = 0; col < num_filter_coefficients; col++) {
1639                                 if (number_type == kDnnFloat) {
1640                                     out_wfile << std::setprecision(12)
1641                                              << ptr_weight[row * num_filter_coefficients + col] / weight_scale_factor
1642                                              << "\n";
1643                                 } else {
1644                                     out_wfile << "0x" << std::setfill('0') << std::setw(4) << std::hex
1645                                              << ptr_weight[row * num_filter_coefficients + col] << "\n";
1646                                 }
1647                             }
1648                         }
1649 #endif
1650                     } else if (number_type_ == kDnnFloat) {
1651                         float *ptr_weight = reinterpret_cast<float *>(component[i].op.conv1D.ptr_filters);
1652 #ifdef DUMP_WB
1653                         for (uint32_t row = 0; row < num_filters; row++) {
1654                             for (uint32_t col = 0; col < num_filter_coefficients; col++) {
1655                                 out_wfile << std::setprecision(12)
1656                                          << ptr_weight[row * num_filter_coefficients + col] << "\n";
1657                             }
1658                             out_wfile << "\n";
1659                         }
1660 #endif
1661                     } else {
1662                         fprintf(stderr, "Unsupported filter weight type in WriteDnnText!\n");
1663                         throw -1;
1664                     }
1665
1666                     if (number_type_ == kDnnInt) {
1667                         if (number_type == kDnnInt) {
1668                             if (num_bytes_per_weight == 1) {
1669                                 intel_compound_bias_t
1670                                     *ptr_biases = reinterpret_cast<intel_compound_bias_t *>(component[i].op.conv1D.ptr_biases);
1671 #ifdef DUMP_WB
1672                                 for (uint32_t row = 0; row < num_filters; row++) {
1673                                     out_bfile << "0x" << std::setfill('0') << std::setw(8) << std::hex
1674                                              << ptr_biases[row].bias << " ";
1675                                     out_bfile << "0x" << std::setfill('0') << std::setw(8) << std::hex
1676                                              << int(ptr_biases[row].multiplier) << "\n";
1677                                 }
1678 #endif
1679                             } else {
1680                                 int32_t *ptr_biases = reinterpret_cast<int32_t *>(component[i].op.conv1D.ptr_biases);
1681 #ifdef DUMP_WB
1682                                 for (uint32_t row = 0; row < num_filters; row++) {
1683                                     out_bfile << "0x" << std::setfill('0') << std::setw(8) << std::hex << ptr_biases[row]
1684                                              << "\n";
1685                                 }
1686 #endif
1687                             }
1688                         } else {
1689                             int32_t *ptr_biases = reinterpret_cast<int32_t *>(component[i].op.conv1D.ptr_biases);
1690 #ifdef DUMP_WB
1691                             for (uint32_t row = 0; row < num_filters; row++) {
1692                                 out_bfile << std::setprecision(12)
1693                                          << ptr_biases[row] / output_scale_factor << "\n";
1694                             }
1695 #endif
1696                         }
1697                     } else {
1698                         float *ptr_biases = reinterpret_cast<float *>(component[i].op.conv1D.ptr_biases);
1699 #ifdef DUMP_WB
1700                         for (uint32_t row = 0; row < num_filters; row++) {
1701                             out_bfile << std::setprecision(12) << ptr_biases[row] << "\n";
1702                         }
1703 #endif
1704                     }
1705                     out_file << "\n";
1706                 }
1707                     break;
1708                 case kDnnRecurrentOp: {
1709                     float weight_scale_factor = component[i].op.recurrent.weight_scale_factor;
1710                     float output_scale_factor = component[i].output_scale_factor;
1711                     uint32_t num_vector_delay = component[i].op.recurrent.num_vector_delay;
1712                     uint32_t num_bytes_per_weight = component[i].op.recurrent.num_bytes_per_weight;
1713                     uint32_t num_bytes_per_bias = component[i].op.recurrent.num_bytes_per_bias;
1714                     uint32_t num_weight_rows = num_columns_out;
1715                     uint32_t num_weight_columns = num_columns_in + num_columns_out;
1716                     out_file << "<num_vector_delay> " << std::dec << num_vector_delay << "\n";
1717                     if ((number_type_ == kDnnInt) && (number_type == kDnnFloat)) {
1718                         out_file << "<num_bytes_per_weight> " << std::dec << 4 << "\n";
1719                         out_file << "<num_bytes_per_bias> " << std::dec << 4 << "\n";
1720                     } else {
1721                         out_file << "<num_bytes_per_weight> " << std::dec << num_bytes_per_weight << "\n";
1722                         out_file << "<num_bytes_per_bias> " << std::dec << num_bytes_per_bias << "\n";
1723                     }
1724                     if ((number_type_ == kDnnInt) && (number_type == kDnnFloat)) {
1725                         out_file << std::setprecision(12) << std::scientific << "<weight_scale_factor> " << 1.0 << "\n";
1726                         out_file << std::setprecision(12) << std::scientific << "<output_scale_factor> " << 1.0 << "\n";
1727                     } else {
1728                         out_file << std::setprecision(12) << std::scientific << "<weight_scale_factor> "
1729                                  << weight_scale_factor << "\n";
1730                         out_file << std::setprecision(12) << std::scientific << "<output_scale_factor> "
1731                                  << output_scale_factor << "\n";
1732                     }
1733                     out_file << "<weight_address> " << "0x" << std::setfill('0') << std::setw(8) << std::hex
1734                              << MemoryOffset(component[i].op.recurrent.ptr_weights, ptr_dnn_memory_) << "\n";
1735                     out_file << "<bias_address> " << "0x" << std::setfill('0') << std::setw(8) << std::hex
1736                              << MemoryOffset(component[i].op.recurrent.ptr_biases, ptr_dnn_memory_) << "\n";
1737                     out_file << "<feedback_address> " << "0x" << std::setfill('0') << std::setw(8) << std::hex
1738                              << MemoryOffset(component[i].op.recurrent.ptr_feedbacks, ptr_dnn_memory_) << "\n";
1739                     if (num_bytes_per_weight == 1) {
1740                         int8_t *ptr_weight = reinterpret_cast<int8_t *>(component[i].op.recurrent.ptr_weights);
1741                         intel_compound_bias_t
1742                             *ptr_bias = reinterpret_cast<intel_compound_bias_t *>(component[i].op.recurrent.ptr_biases);
1743 #ifdef DUMP_WB
1744                         for (uint32_t row = 0; row < num_weight_rows; row++) {
1745                             out_file << "<weight_row> ";
1746                             for (uint32_t col = 0; col < num_weight_columns; col++) {
1747                                 if (number_type == kDnnFloat) {
1748                                     float val =
1749                                         static_cast<float>(ptr_weight[row * num_weight_columns + col]) * ptr_bias[col].multiplier
1750                                             / weight_scale_factor;
1751                                     out_file << std::setprecision(12) << std::scientific << val << " ";
1752                                 } else {
1753                                     out_file << "0x" << std::setfill('0') << std::setw(2) << std::hex
1754                                              << int((uint8_t) ptr_weight[row * num_weight_columns + col]) << " ";
1755                                 }
1756                             }
1757                             out_file << "\n";
1758                         }
1759 #endif
1760                     } else if (num_bytes_per_weight == 2) {
1761                         int16_t *ptr_weight = reinterpret_cast<int16_t *>(component[i].op.recurrent.ptr_weights);
1762 #ifdef DUMP_WB
1763                         for (uint32_t row = 0; row < num_weight_rows; row++) {
1764                             out_file << "<weight_row> ";
1765                             for (uint32_t col = 0; col < num_weight_columns; col++) {
1766                                 if (number_type == kDnnFloat) {
1767                                     out_file << std::setprecision(12) << std::scientific
1768                                              << ptr_weight[row * num_weight_columns + col] / weight_scale_factor << " ";
1769                                 } else {
1770                                     out_file << "0x" << std::setfill('0') << std::setw(4) << std::hex
1771                                              << ptr_weight[row * num_weight_columns + col] << " ";
1772                                 }
1773                             }
1774                             out_file << "\n";
1775                         }
1776 #endif
1777                     } else if (number_type_ == kDnnFloat) {
1778                         float *ptr_weight = reinterpret_cast<float *>(component[i].op.recurrent.ptr_weights);
1779 #ifdef DUMP_WB
1780                         for (uint32_t row = 0; row < num_weight_rows; row++) {
1781                             out_file << "<weight_row> ";
1782                             for (uint32_t col = 0; col < num_weight_columns; col++) {
1783                                 out_file << std::setprecision(12) << std::scientific
1784                                          << ptr_weight[row * num_weight_columns + col] << " ";
1785                             }
1786                             out_file << "\n";
1787                         }
1788 #endif
1789                     } else {
1790                         fprintf(stderr, "Unsupported weight type in WriteDnnText!\n");
1791                         throw -1;
1792                     }
1793                     if (number_type_ == kDnnInt) {
1794                         if (number_type == kDnnInt) {
1795                             if (num_bytes_per_weight == 1) {
1796                                 intel_compound_bias_t
1797                                     *ptr_biases = reinterpret_cast<intel_compound_bias_t *>(component[i].op.recurrent.ptr_biases);
1798                                 out_file << "<compound_bias>" << " ";
1799 #ifdef DUMP_WB
1800                                 for (uint32_t col = 0; col < num_columns_out; col++) {
1801                                     out_file << "0x" << std::setfill('0') << std::setw(8) << std::hex
1802                                              << ptr_biases[col].bias << " ";
1803                                     out_file << "0x" << std::setfill('0') << std::setw(8) << std::hex
1804                                              << ptr_biases[col].multiplier << " ";
1805                                 }
1806 #endif
1807                             } else {
1808                                 int32_t *ptr_biases = reinterpret_cast<int32_t *>(component[i].op.recurrent.ptr_biases);
1809                                 out_file << "<bias>" << " ";
1810 #ifdef DUMP_WB
1811                                 for (uint32_t col = 0; col < num_columns_out; col++) {
1812                                     out_file << "0x" << std::setfill('0') << std::setw(8) << std::hex << ptr_biases[col]
1813                                              << " ";
1814                                 }
1815 #endif
1816                             }
1817                         } else {
1818                             int32_t *ptr_biases = reinterpret_cast<int32_t *>(component[i].op.recurrent.ptr_biases);
1819                             out_file << "<bias>" << " ";
1820 #ifdef DUMP_WB
1821                             for (uint32_t col = 0; col < num_columns_out; col++) {
1822                                 out_file << std::setprecision(12) << std::scientific
1823                                          << ptr_biases[col] / output_scale_factor << " ";
1824                             }
1825 #endif
1826                         }
1827                     } else {
1828                         float *ptr_biases = reinterpret_cast<float *>(component[i].op.recurrent.ptr_biases);
1829                         out_file << "<bias>" << " ";
1830 #ifdef DUMP_WB
1831                         for (uint32_t col = 0; col < num_columns_out; col++) {
1832                             out_file << std::setprecision(12) << std::scientific << ptr_biases[col] << " ";
1833                         }
1834 #endif
1835                     }
1836                     out_file << "\n";
1837                 }
1838                     break;
1839                 case kDnnMaxPoolOp: {
1840                     uint32_t num_pool_type = (component[i].op.maxpool.do_sum_not_max) ? 2 : 1;
1841                     out_file << "<pool_type> " << std::dec << num_pool_type << "\n";
1842                     out_file << "<pool_size> " << std::dec << component[i].op.maxpool.num_inputs << "\n";
1843                     out_file << "<pool_step> " << std::dec << component[i].op.maxpool.num_inputs_step << "\n";
1844                     out_file << "<pool_num_rows> " << std::dec << component[i].op.maxpool.num_inputs_stride << "\n";
1845                     out_file << std::setprecision(12) << std::scientific << "<output_scale_factor> "
1846                              << component[i].output_scale_factor << "\n";
1847                 }
1848                     break;
1849                 case kDnnPiecewiselinearOp: {
1850                     intel_pwl_segment_t *ptr_segment = component[i].op.pwl.ptr_segments;
1851                     DnnActivationType func_id = component[i].op.pwl.func_id.type;
1852                     uint32_t num_segments = component[i].op.pwl.num_segments;
1853                     float output_scale_factor = component[i].output_scale_factor;
1854                     out_file << "<func_id> " << intel_dnn_activation_name[func_id] << "\n";
1855                     out_file << "<num_bytes_per_slope> " << std::dec << sizeof(int16_t) << "\n";
1856                     out_file << "<num_bytes_per_intercept> " << std::dec << sizeof(int16_t) << "\n";
1857                     out_file << "<num_bytes_per_offset> " << std::dec << sizeof(int32_t) << "\n";
1858                     if (number_type == kDnnFloat) {
1859                         out_file << std::setprecision(12) << std::scientific << "<output_scale_factor> " << 1.0 << "\n";
1860                         out_file << "<num_segments> " << std::dec << 0 << "\n";
1861                         out_file << "<segment_address> " << "0x" << std::setfill('0') << std::setw(8) << std::hex
1862                                  << MemoryOffset(component[i].op.pwl.ptr_segments, ptr_dnn_memory_) << "\n";
1863                     } else {
1864                         out_file << std::setprecision(12) << std::scientific << "<output_scale_factor> "
1865                                  << output_scale_factor << "\n";
1866                         out_file << "<num_segments> " << std::dec << num_segments << "\n";
1867                         out_file << "<segment_address> " << "0x" << std::setfill('0') << std::setw(8) << std::hex
1868                                  << MemoryOffset(component[i].op.pwl.ptr_segments, ptr_dnn_memory_) << "\n";
1869                         if (number_type_ == kDnnInt) {
1870                             out_file << "<slope> ";
1871                             for (int segment = 0; segment < num_segments; segment++) {
1872                                 out_file << "0x" << std::setfill('0') << std::setw(4) << std::hex
1873                                          << ptr_segment[segment].slope << " ";
1874                             }
1875                             out_file << "\n";
1876                             out_file << "<intercept> ";
1877                             for (int segment = 0; segment < component[i].op.pwl.num_segments; segment++) {
1878                                 out_file << "0x" << std::setfill('0') << std::setw(4) << std::hex
1879                                          << ptr_segment[segment].yBase << " ";
1880                             }
1881                             out_file << "\n";
1882                             out_file << "<offset> ";
1883                             for (int segment = 0; segment < component[i].op.pwl.num_segments; segment++) {
1884                                 out_file << "0x" << std::setfill('0') << std::setw(8) << std::hex
1885                                          << ptr_segment[segment].xBase << " ";
1886                             }
1887                             out_file << "\n";
1888                         } else if (num_segments > 0) {
1889                             fprintf(stderr,
1890                                     "Number of segments must be zero in floating point model in WriteDnnText!\n");
1891                             throw -1;
1892                         }
1893                     }
1894                 }
1895                     break;
1896                 case kDnnInterleaveOp:
1897                     out_file << std::setprecision(12) << std::scientific << "<output_scale_factor> "
1898                              << component[i].output_scale_factor << "\n";
1899                     break;
1900                 case kDnnDeinterleaveOp:
1901                     out_file << std::setprecision(12) << std::scientific << "<output_scale_factor> "
1902                              << component[i].output_scale_factor << "\n";
1903                     break;
1904                 case kDnnCopyOp:
1905                     out_file << std::setprecision(12) << std::scientific << "<output_scale_factor> "
1906                              << component[i].output_scale_factor << "\n";
1907                     out_file << "<num_copy_rows> " << std::dec << component[i].op.copy.num_copy_rows << "\n";
1908                     out_file << "<num_copy_columns> " << std::dec << component[i].op.copy.num_copy_columns << "\n";
1909                     break;
1910                 default:
1911                     out_file << "<Error!!!> Unsupported Component :  "
1912                              << intel_dnn_operation_name[component[i].operation] << "\n";
1913                     //  fprintf(stderr, "Component type %s not yet supported in AmIntelDnn::WriteDnnText()!\n",
1914                     //    intel_dnn_operation_name[component[i].operation]);
1915                     //  throw -1;
1916                     break;
1917             }
1918         }
1919         if (ptr_active_outputs() != nullptr) {
1920             out_file << "<activelist_address> " << "0x" << std::setfill('0') << std::setw(8) << std::hex
1921                      << MemoryOffset(ptr_active_outputs(), ptr_dnn_memory_) << "\n";
1922         }
1923         out_file << "<end_of_file>\n";
1924         out_file.close();
1925     } else {
1926         fprintf(stderr, "Failed to open %s for writing!\n", filename);
1927         throw -1;
1928     }
1929 }
1930
1931 void AmIntelDnn::InitGNAStruct(intel_nnet_type_t *ptr_nnet) {
1932     intel_nnet_layer_t *pLayer;
1933
1934     if (ptr_nnet == nullptr)
1935         THROW_GNA_EXCEPTION << "Invalid input parameter";
1936     if (ptr_nnet->pLayers != nullptr)
1937         THROW_GNA_EXCEPTION << "InitGNAStruct can't work on prellocated layers array";
1938     if (component.empty())
1939         THROW_GNA_EXCEPTION << "empty model in AmIntelDnn::FillGNAStruct()";
1940
1941     ptr_nnet->nLayers = 0;
1942     for (auto && c : component) {
1943         if (c.operation == kDnnAffineOp
1944             || (c.operation == kDnnDiagonalOp)
1945             || (c.operation == kDnnConvolutional1dOp)
1946             || (c.operation == kDnnDeinterleaveOp)
1947             || (c.operation == kDnnInterleaveOp)
1948             || (c.operation == kDnnRecurrentOp)
1949             || (c.operation == kDnnCopyOp)
1950             ) {
1951             ptr_nnet->nLayers++;
1952         }
1953     }
1954     ptr_nnet->nGroup = num_group_in();
1955     ptr_nnet->pLayers = reinterpret_cast<intel_nnet_layer_t *>(_mm_malloc(ptr_nnet->nLayers * sizeof(intel_nnet_layer_t), 64));
1956     if (ptr_nnet->pLayers == nullptr)
1957         THROW_GNA_EXCEPTION << "out of memory in AmIntelDnn::FillGNAStruct()";
1958     pLayer = ptr_nnet->pLayers;
1959
1960     for (int i = 0; i < component.size(); i++) {
1961         // std::cout << "Component + " << i <<"=GNA_" << std::distance(ptr_nnet->pLayers, pLayer) << "\n";
1962         switch (component[i].operation) {
1963             case kDnnAffineOp:
1964                 pLayer->nInputRows = component[i].num_rows_in;
1965                 pLayer->nInputColumns = component[i].num_columns_in;
1966                 pLayer->nOutputRows = component[i].num_rows_out;
1967                 pLayer->nOutputColumns = component[i].num_columns_out;
1968                 pLayer->nBytesPerInput = component[i].num_bytes_per_input;
1969                 pLayer->nBytesPerOutput = component[i].num_bytes_per_output;  //  will be overwritten if PWL op is needed
1970                 pLayer->nBytesPerIntermediateOutput = sizeof(int32_t);
1971                 pLayer->pInputs = component[i].ptr_inputs;
1972                 pLayer->pOutputsIntermediate = component[i].ptr_outputs;
1973                 pLayer->pOutputs = component[i].ptr_outputs;  //  will be overwritten if PWL op is needed
1974                 pLayer->nLayerKind = INTEL_AFFINE;
1975                 {
1976                     pLayer->pLayerStruct = _mm_malloc(sizeof(intel_affine_layer_t), 64);
1977                     if (pLayer->pLayerStruct == nullptr) {
1978                         THROW_GNA_EXCEPTION << "could not allocate memory for INTEL_AFFINE layer structure.";
1979                     }
1980                     auto pAffineLayer = reinterpret_cast<intel_affine_layer_t *>(pLayer->pLayerStruct);
1981                     pAffineLayer->pwl.pSegments = nullptr;
1982                     pAffineLayer->pwl.nSegments = 0;
1983
1984                     pAffineLayer->affine.nBytesPerBias = component[i].op.affine.num_bytes_per_bias;
1985                     pAffineLayer->affine.nBytesPerWeight = component[i].op.affine.num_bytes_per_weight;
1986                     pAffineLayer->affine.pBiases = component[i].op.affine.ptr_biases;
1987                     pAffineLayer->affine.pWeights = component[i].op.affine.ptr_weights;
1988                 }
1989                 if (i == component.size() - 1 || component[i + 1].operation != kDnnPiecewiselinearOp) {
1990                     pLayer++;
1991                 }
1992                 break;
1993             case kDnnDiagonalOp:
1994                 pLayer->nInputRows = component[i].num_rows_in;
1995                 pLayer->nInputColumns = component[i].num_columns_in;
1996                 pLayer->nOutputRows = component[i].num_rows_out;
1997                 pLayer->nOutputColumns = component[i].num_columns_out;
1998                 pLayer->nBytesPerInput = component[i].num_bytes_per_input;
1999                 pLayer->nBytesPerOutput = component[i].num_bytes_per_output;  //  will be overwritten if PWL op is needed
2000                 pLayer->nBytesPerIntermediateOutput = sizeof(int32_t);
2001                 pLayer->pInputs = component[i].ptr_inputs;
2002                 pLayer->pOutputsIntermediate = component[i].ptr_outputs;
2003                 pLayer->pOutputs = component[i].ptr_outputs;  //  will be overwritten if PWL op is needed
2004                 pLayer->nLayerKind = INTEL_AFFINE_DIAGONAL;
2005                 {
2006                     pLayer->pLayerStruct = _mm_malloc(sizeof(intel_affine_layer_t), 64);
2007                     if (pLayer->pLayerStruct == nullptr) {
2008                         THROW_GNA_EXCEPTION << "could not allocate memory for INTEL_AFFINE_DIAGONAL layer structure.";
2009                     }
2010                     auto pDiagonalLayer = reinterpret_cast<intel_affine_layer_t *>(pLayer->pLayerStruct);
2011                     pDiagonalLayer->pwl.pSegments = nullptr;
2012                     pDiagonalLayer->pwl.nSegments = 0;
2013
2014                     pDiagonalLayer->affine.nBytesPerBias = component[i].op.affine.num_bytes_per_bias;
2015                     pDiagonalLayer->affine.nBytesPerWeight = component[i].op.affine.num_bytes_per_weight;
2016                     pDiagonalLayer->affine.pBiases = component[i].op.affine.ptr_biases;
2017                     pDiagonalLayer->affine.pWeights = component[i].op.affine.ptr_weights;
2018                 }
2019                 if (i == component.size() - 1 || component[i + 1].operation != kDnnPiecewiselinearOp) {
2020                     pLayer++;
2021                 }
2022                 break;
2023             case kDnnRecurrentOp:
2024                 pLayer->nInputRows = component[i].num_rows_in;
2025                 pLayer->nInputColumns = component[i].num_columns_in;
2026                 pLayer->nOutputRows = component[i].num_rows_out;
2027                 pLayer->nOutputColumns = component[i].num_columns_out;
2028                 pLayer->nBytesPerInput = component[i].num_bytes_per_input;
2029                 pLayer->nBytesPerOutput = component[i].num_bytes_per_output;  //  will be overwritten if PWL op is needed
2030                 pLayer->nBytesPerIntermediateOutput = sizeof(int32_t);
2031                 pLayer->pInputs = component[i].ptr_inputs;
2032                 pLayer->pOutputsIntermediate = component[i].ptr_outputs;
2033                 pLayer->pOutputs = component[i].ptr_outputs;  //  will be overwritten if PWL op is needed
2034                 pLayer->nLayerKind = INTEL_RECURRENT;
2035                 {
2036                     pLayer->pLayerStruct = _mm_malloc(sizeof(intel_recurrent_layer_t), 64);
2037                     if (pLayer->pLayerStruct == nullptr) {
2038                         THROW_GNA_EXCEPTION << "could not allocate memory for INTEL_RECURRENT layer structure.";
2039                     }
2040                     auto pRecurrentLayer = reinterpret_cast<intel_recurrent_layer_t *>(pLayer->pLayerStruct);
2041                     pRecurrentLayer->pFeedbackBuffer = component[i].op.recurrent.ptr_feedbacks;
2042                     pRecurrentLayer->pwl.pSegments = nullptr;
2043                     pRecurrentLayer->pwl.nSegments = 0;
2044
2045                     pRecurrentLayer->affine.nBytesPerBias = component[i].op.recurrent.num_bytes_per_bias;
2046                     pRecurrentLayer->affine.nBytesPerWeight = component[i].op.recurrent.num_bytes_per_weight;
2047                     pRecurrentLayer->affine.pBiases = component[i].op.recurrent.ptr_biases;
2048                     pRecurrentLayer->affine.pWeights = component[i].op.recurrent.ptr_weights;
2049                 }
2050                 if (i == component.size() - 1 || component[i + 1].operation != kDnnPiecewiselinearOp) {
2051                     pLayer++;
2052                 }
2053                 break;
2054             case kDnnConvolutional1dOp:
2055                 pLayer->nInputRows = component[i].num_rows_in;
2056                 pLayer->nInputColumns = component[i].num_columns_in;
2057                 pLayer->nOutputRows = component[i].num_rows_out;
2058                 pLayer->nOutputColumns = component[i].num_columns_out;
2059                 pLayer->nBytesPerInput = component[i].num_bytes_per_input;
2060                 pLayer->nBytesPerOutput = component[i].num_bytes_per_output;  //  will be overwritten
2061                 pLayer->nBytesPerIntermediateOutput = sizeof(int32_t);
2062                 pLayer->pInputs = component[i].ptr_inputs;
2063                 pLayer->pOutputsIntermediate = component[i].ptr_outputs;
2064                 pLayer->pOutputs = component[i].ptr_outputs;  //  will be overwritten
2065                 pLayer->nLayerKind = INTEL_CONVOLUTIONAL;
2066                 {
2067                     pLayer->pLayerStruct = _mm_malloc(sizeof(intel_convolutional_layer_t), 64);
2068                     if (pLayer->pLayerStruct == nullptr) {
2069                         THROW_GNA_EXCEPTION << "could not allocate memory for INTEL_CONVOLUTIONAL layer structure.";
2070                     }
2071                     auto pConvolutionalLayer = reinterpret_cast<intel_convolutional_layer_t *>(pLayer->pLayerStruct);
2072                     pConvolutionalLayer->nBytesBias = component[i].op.conv1D.num_bytes_per_bias;
2073                     pConvolutionalLayer->nBytesFilterCoefficient = component[i].op.conv1D.num_bytes_per_weight;
2074                     pConvolutionalLayer->nFilters = component[i].op.conv1D.num_filters;
2075                     pConvolutionalLayer->nFilterRows = component[i].op.conv1D.num_filter_rows;
2076                     pConvolutionalLayer->nFilterCoefficients = component[i].op.conv1D.num_filter_coefficients;
2077                     pConvolutionalLayer->nFeatureMaps = component[i].op.conv1D.num_feature_maps;
2078                     pConvolutionalLayer->nFeatureMapRows = component[i].op.conv1D.num_feature_map_rows;
2079                     pConvolutionalLayer->nFeatureMapColumns = component[i].op.conv1D.num_feature_map_columns;
2080                     pConvolutionalLayer->poolType = INTEL_NO_POOLING;  //  will be overwritten
2081                     pConvolutionalLayer->nPoolSize = 0;  //  will be overwritten
2082                     pConvolutionalLayer->nPoolStride = 0;  //  will be overwritten
2083                     pConvolutionalLayer->pwl.nSegments = 0;  //  will be overwritten
2084                     pConvolutionalLayer->pwl.pSegments = nullptr;  //  will be overwritten
2085                     pConvolutionalLayer->pBiases = component[i].op.conv1D.ptr_biases;
2086                     pConvolutionalLayer->pFilters = component[i].op.conv1D.ptr_filters;
2087                 }
2088                 if (i == component.size() - 1 || ((component[i + 1].operation != kDnnMaxPoolOp)
2089                         && (component[i + 1].operation != kDnnPiecewiselinearOp))) {
2090                     pLayer++;
2091                 }
2092                 break;
2093             case kDnnMaxPoolOp:
2094                 if (i == 0) {
2095                     THROW_GNA_EXCEPTION << "Pooling component with no preceeding component";
2096                 } else if (pLayer->nLayerKind == INTEL_CONVOLUTIONAL) {
2097                     if (pLayer->pLayerStruct == nullptr) {
2098                         THROW_GNA_EXCEPTION "INTEL_CONVOLUTIONAL layer structure was not initialized.";
2099                     }
2100                     auto pConvolutionalLayer = reinterpret_cast<intel_convolutional_layer_t *>(pLayer->pLayerStruct);
2101                     // it is possible to have activation preceding to maxpool
2102                     if (pConvolutionalLayer->pwl.nSegments != 0) {
2103                         THROW_GNA_EXCEPTION << "Encountered activation component before pooling component at." << i;
2104                     } else {
2105                         pConvolutionalLayer->poolType =
2106                             (component[i].op.maxpool.do_sum_not_max) ? INTEL_SUM_POOLING : INTEL_MAX_POOLING;
2107                         pConvolutionalLayer->nPoolSize = component[i].op.maxpool.num_inputs;
2108                         pConvolutionalLayer->nPoolStride = component[i].op.maxpool.num_inputs_step;
2109
2110
2111                         // number of output columns correction - based on GNA-library expectations
2112                         auto nFltSize = pConvolutionalLayer->nFilterCoefficients;
2113                         auto fltStrideSz = pConvolutionalLayer->nFeatureMaps * pConvolutionalLayer->nFeatureMapColumns;  // always move 1 "row"
2114                         auto maxNCOE = (pLayer->nInputColumns - nFltSize) / fltStrideSz + 1;
2115                         // FLAT input matrix, pooled outputs per filter
2116                         pLayer->nOutputColumns = pConvolutionalLayer->nFilters * ((maxNCOE - 1) / pConvolutionalLayer->nPoolStride + 1);
2117
2118                         // old code
2119                         // pLayer->nOutputColumns /= pConvolutionalLayer->nPoolStride;
2120                     }
2121                 } else {
2122                     THROW_GNA_EXCEPTION << "Pooling component applied to non-convolutional layer";
2123                 }
2124                 break;
2125             case kDnnPiecewiselinearOp:
2126                 pLayer->pOutputs = component[i].ptr_outputs;
2127                 pLayer->nBytesPerOutput = component[i].num_bytes_per_output;
2128                 if (pLayer->pLayerStruct == nullptr) {
2129                     THROW_GNA_EXCEPTION << pLayer->nLayerKind << " layer structure was not initialized.";
2130                 }
2131                 if (i == 0) {
2132                     THROW_GNA_EXCEPTION << "PWL component with no preceding component.";
2133                 } else if ((component[i - 1].operation == kDnnAffineOp)
2134                     || (component[i - 1].operation == kDnnDiagonalOp)) {
2135                     auto pAffineLayer = reinterpret_cast<intel_affine_layer_t *>(pLayer->pLayerStruct);
2136                     pAffineLayer->pwl.nSegments = component[i].op.pwl.num_segments;
2137                     pAffineLayer->pwl.pSegments = component[i].op.pwl.ptr_segments;
2138                 } else if (component[i - 1].operation == kDnnRecurrentOp) {
2139                     auto pRecurrentLayer = reinterpret_cast<intel_recurrent_layer_t *>(pLayer->pLayerStruct);
2140                     pRecurrentLayer->pwl.nSegments = component[i].op.pwl.num_segments;
2141                     pRecurrentLayer->pwl.pSegments = component[i].op.pwl.ptr_segments;
2142                 } else if ((component[i - 1].operation == kDnnConvolutional1dOp)
2143                     || ((component[i - 1].operation == kDnnMaxPoolOp)
2144                         && (component[i - 2].operation == kDnnConvolutional1dOp))) {
2145                     auto pConvolutionalLayer = reinterpret_cast<intel_convolutional_layer_t *>(pLayer->pLayerStruct);
2146                     pConvolutionalLayer->pwl.nSegments = component[i].op.pwl.num_segments;
2147                     pConvolutionalLayer->pwl.pSegments = component[i].op.pwl.ptr_segments;
2148                     if (component[i - 1].operation != kDnnMaxPoolOp) {
2149                         pLayer->nOutputColumns = component[i].num_columns_out;
2150                     }
2151                 }
2152                 pLayer++;
2153
2154                 break;
2155             case kDnnInterleaveOp:
2156                 pLayer->nInputRows = component[i].num_rows_in;
2157                 pLayer->nInputColumns = component[i].num_columns_in;
2158                 pLayer->nOutputRows = component[i].num_rows_out;
2159                 pLayer->nOutputColumns = component[i].num_columns_out;
2160                 pLayer->nBytesPerInput = component[i].num_bytes_per_input;
2161                 pLayer->nBytesPerOutput = component[i].num_bytes_per_output;
2162                 pLayer->nBytesPerIntermediateOutput = sizeof(int32_t);
2163                 pLayer->pInputs = component[i].ptr_inputs;
2164                 pLayer->pOutputsIntermediate = nullptr;
2165                 pLayer->pOutputs = component[i].ptr_outputs;
2166                 pLayer->nLayerKind = INTEL_INTERLEAVE;
2167                 pLayer->pLayerStruct = nullptr;
2168                 pLayer++;
2169                 break;
2170             case kDnnDeinterleaveOp:
2171                 pLayer->nInputRows = component[i].num_rows_in;
2172                 pLayer->nInputColumns = component[i].num_columns_in;
2173                 pLayer->nOutputRows = component[i].num_rows_out;
2174                 pLayer->nOutputColumns = component[i].num_columns_out;
2175                 pLayer->nBytesPerInput = component[i].num_bytes_per_input;
2176                 pLayer->nBytesPerOutput = component[i].num_bytes_per_output;
2177                 pLayer->nBytesPerIntermediateOutput = sizeof(int32_t);
2178                 pLayer->pInputs = component[i].ptr_inputs;
2179                 pLayer->pOutputsIntermediate = nullptr;
2180                 pLayer->pOutputs = component[i].ptr_outputs;
2181                 pLayer->nLayerKind = INTEL_DEINTERLEAVE;
2182                 pLayer->pLayerStruct = nullptr;
2183                 pLayer++;
2184                 break;
2185             case kDnnCopyOp:
2186                 pLayer->nInputRows = component[i].num_columns_in;
2187                 pLayer->nInputColumns = component[i].num_rows_in;
2188                 pLayer->nOutputRows = component[i].num_columns_out;
2189                 pLayer->nOutputColumns = component[i].num_rows_out;
2190                 pLayer->nBytesPerInput = component[i].num_bytes_per_input;
2191                 pLayer->nBytesPerOutput = component[i].num_bytes_per_output;
2192                 pLayer->nBytesPerIntermediateOutput = sizeof(int32_t);
2193                 pLayer->pInputs = component[i].ptr_inputs;
2194                 pLayer->pOutputsIntermediate = nullptr;
2195                 pLayer->pOutputs = component[i].ptr_outputs;
2196                 pLayer->nLayerKind = INTEL_COPY;
2197                 pLayer->pLayerStruct = nullptr;
2198                 {
2199                     pLayer->pLayerStruct = _mm_malloc(sizeof(intel_copy_layer_t), 64);
2200                     if (pLayer->pLayerStruct == nullptr) {
2201                         THROW_GNA_EXCEPTION << pLayer->nLayerKind << " could not allocate memory for INTEL_COPY layer structure.";
2202                     }
2203                     auto *pCopyLayer = reinterpret_cast<intel_copy_layer_t *>(pLayer->pLayerStruct);
2204                     pCopyLayer->nCopyRows = component[i].op.copy.num_copy_columns;
2205                     pCopyLayer->nCopyCols = component[i].op.copy.num_copy_rows;
2206                 }
2207                 pLayer++;
2208                 break;
2209             default: {
2210                 THROW_GNA_EXCEPTION << "GNA does yet not support " << intel_dnn_operation_name[component[i].operation];
2211             }
2212         }
2213     }
2214     // enable debugging of partial array of components
2215     ptr_nnet->nLayers = std::distance(ptr_nnet->pLayers, pLayer);
2216 }
2217
2218 void AmIntelDnn::DestroyGNAStruct(intel_nnet_type_t *ptr_nnet) {
2219     ptr_nnet->nGroup = 0;
2220     if (ptr_nnet->pLayers != nullptr) {
2221         for (int i = 0; i < ptr_nnet->nLayers; i++) {
2222             switch (ptr_nnet->pLayers[i].nLayerKind) {
2223                 case INTEL_AFFINE:break;
2224                 case INTEL_AFFINE_DIAGONAL:break;
2225                 case INTEL_RECURRENT:break;
2226                 case INTEL_CONVOLUTIONAL:break;
2227                 case INTEL_INTERLEAVE:break;
2228                 case INTEL_DEINTERLEAVE:break;
2229                 case INTEL_COPY:break;
2230                 default:break;
2231             }
2232             if (ptr_nnet->pLayers[i].pLayerStruct != nullptr) {
2233                 _mm_free(ptr_nnet->pLayers[i].pLayerStruct);
2234             }
2235         }
2236         if (ptr_nnet->pLayers != nullptr) {
2237             _mm_free(ptr_nnet->pLayers);
2238         }
2239     }
2240     ptr_nnet->nLayers = 0;
2241 }
2242
2243 void AmIntelDnn::GetScaledOutput(float *ptr_output, uint32_t component_index) {
2244     if (component_index > num_components()) {
2245         fprintf(stderr, "Illegal component index %d in GetScaledOutput\n", component_index);
2246         throw -1;
2247     }
2248     if (ptr_output != nullptr) {
2249         float scale_factor = OutputScaleFactor(component_index);
2250         uint32_t num_elements = component[component_index].num_rows_out * component[component_index].num_columns_out;
2251         if (number_type_ == kDnnFloat) {
2252             float *ptr_input = reinterpret_cast<float *>(component[component_index].ptr_outputs);
2253             for (uint32_t i = 0; i < num_elements; i++) {
2254                 ptr_output[i] = ptr_input[i] / scale_factor;
2255             }
2256         } else if (component[component_index].num_bytes_per_output == 2) {
2257             int16_t *ptr_input = reinterpret_cast<int16_t *>(component[component_index].ptr_outputs);
2258             for (uint32_t i = 0; i < num_elements; i++) {
2259                 ptr_output[i] = static_cast<float>(ptr_input[i]) / scale_factor;
2260             }
2261         } else {
2262             int32_t *ptr_input = reinterpret_cast<int32_t *>(component[component_index].ptr_outputs);
2263             for (uint32_t i = 0; i < num_elements; i++) {
2264                 ptr_output[i] = static_cast<float>(ptr_input[i]) / scale_factor;
2265             }
2266         }
2267     } else {
2268         fprintf(stderr, "Output pointer is nullptr in GetScaledOutput\n");
2269         throw -1;
2270     }
2271 }
2272
void AmIntelDnn::WriteInputAndOutputTextGNA(intel_nnet_type_t * nnet) {
#ifdef LIGHT_DUMP
    // Debug helper: for every GNA layer, dump its raw inputs, outputs and PWL
    // activation segments to per-layer text files under the current dump
    // folder (./gna_layers/<N>/), and - when a matching reference output file
    // exists under ./ref_layers/ - print RMSE / average / max absolute
    // difference between this run and the reference.
    if (nnet) {
        for (int i = 0; i < nnet->nLayers; i++) {
            auto component = nnet->pLayers;
            std::stringstream out_file_name;
            // Human-readable layer-kind token used in the dump file names.
            auto getLayerType = [](intel_layer_kind_t kind){
                switch (kind){
                    case INTEL_AFFINE : return "affine";
                    case INTEL_AFFINE_DIAGONAL : return "diag";
                    case INTEL_RECURRENT : return "recurrent";
                    case INTEL_CONVOLUTIONAL : return "convolution";
                    case INTEL_INTERLEAVE : return "interleave";
                    case INTEL_DEINTERLEAVE : return "deinterleave";
                    case INTEL_COPY : return "copy";
                    default: return "unknown";
                }
            };
            // File-name pattern: <2-digit index>_<kind>-<inputRows>-<outputRows>
            out_file_name << std::setfill('0') << std::setw(2) << i << "_"
                          << getLayerType(component[i].nLayerKind)
                          << "-" << nnet->pLayers[i].nInputRows
                          << "-" << nnet->pLayers[i].nOutputRows;

            auto inputfileName = getDumpFolderNameGNA() + out_file_name.str() + "_input.txt";
            auto outFileName = getDumpFolderNameGNA() + out_file_name.str() + "_output.txt";
            auto pwlFileName = getDumpFolderNameGNA() + out_file_name.str() + "_pwl.txt";
            auto refOutputFileName = getRefFolderName() + out_file_name.str() + "_output.txt";

            std::ofstream out_file(outFileName.c_str(), std::ios::out);
            std::ofstream pwl_file(pwlFileName.c_str(), std::ios::out);
            std::ifstream ref_out_file(refOutputFileName.c_str(), std::ios::in);
            std::ofstream in_file(inputfileName.c_str(), std::ios::out);

            // Accumulators for the comparison against the reference dump.
            float  summOfDiff = 0.f;
            float  summOfSqDiff = 0.f;
            float  maxD = 0.0f;
            int    numItems = 0;

            // Dump the piecewise-linear activation segments, one
            // "slope, xBase, yBase" triple per line.
            auto write_pwl = [&pwl_file](intel_pwl_func_t & pwl) {
                for (int k =0; k < pwl.nSegments; k++) {
                    pwl_file << pwl.pSegments[k].slope << ", " << pwl.pSegments[k].xBase << ", " << pwl.pSegments[k].yBase << "\n";
                }
            };
            // Only affine/diagonal and convolutional layers carry a PWL in
            // their pLayerStruct.
            if (nnet->pLayers[i].nLayerKind == INTEL_AFFINE || nnet->pLayers[i].nLayerKind == INTEL_AFFINE_DIAGONAL) {
                auto affine = reinterpret_cast<intel_affine_layer_t*>(nnet->pLayers[i].pLayerStruct);
                write_pwl(affine->pwl);
            }
            if (nnet->pLayers[i].nLayerKind == INTEL_CONVOLUTIONAL) {
                auto conv = reinterpret_cast<intel_convolutional_layer_t*>(nnet->pLayers[i].pLayerStruct);
                write_pwl(conv->pwl);
            }

            // Outputs: raw 16/32-bit integer values, written row-major,
            // one value per line.
            for (int k = 0; k < component[i].nOutputRows; k++) {
                for (int j = 0; j < component[i].nOutputColumns; j++) {
                    float floatValue = 0.f;
                    if (component[i].nBytesPerOutput == 4) {
                        auto value = (reinterpret_cast<int32_t *>(component[i].pOutputs)[k * component[i].nOutputColumns + j]);
                        floatValue = (static_cast<float>(value) / 1.0);
                    } else {
                        auto value = reinterpret_cast<int16_t *>(component[i].pOutputs)[k * component[i].nOutputColumns + j];
                        floatValue = (static_cast<float>(value) / 1.0);
                    }
                    out_file << std::setw(8) << floatValue << "\n";
                    // When a reference dump exists, accumulate absolute-error
                    // statistics against it, value by value.
                    if (ref_out_file) {
                        float ref_value = 0.f;
                        ref_out_file >> ref_value;
                        float diff = (ref_value - floatValue);
                        diff = diff  < 0 ? -diff : diff;
                        summOfDiff += diff;
                        summOfSqDiff += diff * diff;
                        maxD = std::max(maxD, diff);
                        numItems++;
                    }
                }
            }
            // Per-layer summary line on stdout when a reference was compared.
            if (numItems) {
                auto rmse = sqrt(summOfSqDiff / numItems);
                auto avg = summOfDiff / numItems;
                std :: cout << std::left << std::setw(55) << out_file_name.str()
                            << " RMSE="<< std::fixed << std::setprecision(5) << std::right << std::setw(8) << rmse
                            << " avg=" << std::fixed << std::setprecision(5) << std::right << std::setw(8) << avg
                            << " maxD="<< std::fixed << std::setprecision(5) << std::right << std::setw(8) << maxD << std::endl;
            }


            // Inputs: raw 16/32-bit integer values, written row-major,
            // one value per line.
            for (int k = 0; k < component[i].nInputRows; k++) {
                for (int j = 0; j < component[i].nInputColumns; j++) {
                    if (component[i].nBytesPerInput == 4) {
                        in_file << std::setw(8)
                                << (reinterpret_cast<int32_t *>(component[i].pInputs)[k * component[i].nInputColumns + j]);
                    } else {
                        in_file << std::setw(8)
                                << (reinterpret_cast<int16_t *>(component[i].pInputs)[k * component[i].nInputColumns + j]);
                    }
                    in_file << "\n";
                }
            }
        }
    }
#endif
}
2374
2375 void AmIntelDnn::WriteInputAndOutputText() {
2376 #ifdef LIGHT_DUMP
2377     for (int i = 0; i < num_components(); i++) {
2378         std::stringstream out_file_name;
2379         out_file_name << std::setfill('0') << std::setw(2) << i << "_"
2380                       << intel_dnn_operation_name[component[i].operation]
2381                       << "-" << component[i].num_rows_in
2382                       << "-" << component[i].num_rows_out;
2383         if (component[i].operation == kDnnPiecewiselinearOp) {
2384             out_file_name << "-" << intel_dnn_activation_name[component[i].op.pwl.func_id];
2385         }
2386         auto inputfileName = getDumpFolderName() + out_file_name.str() + "_input.txt";
2387         auto outFileName = getDumpFolderName() + out_file_name.str() + "_output.txt";
2388         auto refOutputFileName = getRefFolderName() + out_file_name.str() + "_output.txt";
2389
2390         std::ofstream out_file(outFileName.c_str(), std::ios::out);
2391         std::ifstream ref_out_file(refOutputFileName.c_str(), std::ios::in);
2392         std::ofstream in_file(inputfileName.c_str(), std::ios::out);
2393
2394         float  summOfDiff = 0.f;
2395         float  summOfSqDiff = 0.f;
2396         float  maxD = 0.0f;
2397         int    numItems = 0;
2398
2399         for (int k = 0; k < component[i].num_rows_out; k++) {
2400             for (int j = 0; j < component[i].num_columns_out; j++) {
2401                 float floatValue = 0.f;
2402                 if (component[i].num_bytes_per_output == 4) {
2403                     if (number_type_ == kDnnInt) {
2404                         auto value = reinterpret_cast<int32_t *>(component[i].ptr_outputs)[k * component[i].num_columns_out+ j];
2405                         floatValue = static_cast<float>(value);
2406
2407                     } else {
2408                         floatValue = reinterpret_cast<float*>(component[i].ptr_outputs)[k * component[i].num_columns_out+ j];
2409                     }
2410                 } else {
2411                     auto value = reinterpret_cast<int16_t *>(component[i].ptr_outputs)[k * component[i].num_columns_out+ j];
2412                     floatValue = static_cast<float>(value);
2413                 }
2414                 out_file << std::setw(8) << floatValue / component[i].output_scale_factor << "\n";
2415
2416                 if (ref_out_file) {
2417                     float ref_value = 0.f;
2418                     ref_out_file >> ref_value;
2419                     float diff = (ref_value - floatValue);
2420                     diff = diff < 0.f ? -diff : diff;
2421                     summOfDiff += diff;
2422                     summOfSqDiff += diff * diff;
2423                     maxD = std::max(maxD, diff);
2424                     numItems++;
2425                 }
2426             }
2427         }
2428         if (numItems) {
2429             auto rmse = sqrt(summOfSqDiff / numItems);
2430             auto avg = summOfDiff / numItems;
2431             std :: cout << std::left << std::setw(55) << out_file_name.str()
2432                         << " RMSE="<< std::fixed << std::setprecision(5) << std::right << std::setw(8) << rmse
2433                         << " avg=" << std::fixed << std::setprecision(5) << std::right << std::setw(8) << avg
2434                         << " maxD="<< std::fixed << std::setprecision(5) << std::right << std::setw(8) << maxD << std::endl;
2435         }
2436
2437         float input_scale_factor = component[i].output_scale_factor;
2438         if (component[i].operation == kDnnAffineOp ||
2439             component[i].operation == kDnnDiagonalOp) {
2440             input_scale_factor /= component[i].op.affine.weight_scale_factor;
2441         } else if (component[i].operation == kDnnConvolutional1dOp) {
2442             input_scale_factor /= component[i].op.conv1D.weight_scale_factor;
2443         } else if (component[i].operation == kDnnPiecewiselinearOp) {
2444             input_scale_factor = 1.f;
2445         }
2446
2447         for (int k = 0; k < component[i].num_rows_in; k++) {
2448             for (int j = 0; j < component[i].num_columns_in; j++) {
2449                 float floatValue = 0.f;
2450                 if (component[i].num_bytes_per_input == 4) {
2451                     if (number_type_ == kDnnInt) {
2452                         auto value = reinterpret_cast<int32_t *>(component[i].ptr_inputs)[k * component[i].num_columns_in + j];
2453                         floatValue = static_cast<float>(value);
2454                     } else {
2455                         floatValue = reinterpret_cast<float *>(component[i].ptr_inputs)[k * component[i].num_columns_in + j];
2456                     }
2457                 } else {
2458                     auto value = reinterpret_cast<int16_t *>(component[i].ptr_inputs)[k * component[i].num_columns_in+ j];
2459                     floatValue = static_cast<float>(value);
2460                 }
2461                 in_file << std::setw(8) << floatValue / input_scale_factor << "\n";
2462             }
2463         }
2464 #endif
2465     }
2466 }
2467
2468 bool isCompatibleDnn(AmIntelDnn dnn1, AmIntelDnn dnn2) {
2469     bool isCompatible = true;
2470
2471     // compare basic structures to see if they are compatible
2472     if (dnn1.num_components() != dnn2.num_components()) isCompatible = false;
2473     for (int i = 0; i < dnn1.num_components(); i++) {
2474         if (dnn1.component[i].num_rows_in != dnn2.component[i].num_rows_in) isCompatible = false;
2475         if (dnn1.component[i].num_columns_in != dnn2.component[i].num_columns_in) isCompatible = false;
2476         if (dnn1.component[i].num_rows_out != dnn2.component[i].num_rows_out) isCompatible = false;
2477         if (dnn1.component[i].num_columns_out != dnn2.component[i].num_columns_out) isCompatible = false;
2478         if (dnn1.component[i].operation != dnn2.component[i].operation) isCompatible = false;
2479     }
2480
2481     return (isCompatible);
2482 }
2483
2484 void ClearScoreError(intel_score_error_t *error) {
2485     error->num_scores = 0;
2486     error->num_errors = 0;
2487     error->max_error = 0.0;
2488     error->sum_error = 0.0;
2489     error->sum_squared_error = 0.0;
2490     error->max_rel_error = 0.0;
2491     error->sum_rel_error = 0.0;
2492     error->sum_squared_rel_error = 0.0;
2493 }
2494
2495 void UpdateScoreError(intel_score_error_t *error, intel_score_error_t *total_error) {
2496     total_error->num_errors += error->num_errors;
2497     total_error->num_scores += error->num_scores;
2498     total_error->sum_error += error->sum_error;
2499     total_error->sum_squared_error += error->sum_squared_error;
2500     if (error->max_error > total_error->max_error) {
2501         total_error->max_error = error->max_error;
2502     }
2503     total_error->sum_rel_error += error->sum_rel_error;
2504     total_error->sum_squared_rel_error += error->sum_squared_rel_error;
2505     if (error->max_rel_error > total_error->max_rel_error) {
2506         total_error->max_rel_error = error->max_rel_error;
2507     }
2508 }
2509
void SoftmaxGoogle(float *ptr_output, float *ptr_input, const uint32_t num_outputs, const uint32_t num_inputs) {
    // Log-domain softmax: assumes the inputs are log likelihoods and writes
    //   output[i] = input[i] - log(sum_j exp(input[j]))
    // so the outputs stay log likelihoods that are normalized (their
    // exponentials sum to 1). The maximum input is subtracted inside the
    // exponential for numerical stability.

    // Locate the largest input score.
    float peak = ptr_input[0];
    for (uint32_t j = 1; j < num_inputs; j++) {
        peak = std::max(peak, ptr_input[j]);
    }

    // Accumulate exp(x - peak); accumulation order matches element order.
    float sum = 0.0;
    for (uint32_t j = 0; j < num_inputs; j++) {
        sum += exp(ptr_input[j] - sum * 0.f - peak);
    }

    // Guard against log(0) when every shifted exponential underflowed.
    if (sum < 1.0e-20) {
        fprintf(stderr, "Warning:  attempt to take log(0) in SoftmaxGoogle()!\n");
        sum = 1.0e-20;
    }

    // Subtract the log of the normalizer (shifted back by the peak).
    float log_denominator = peak + log(sum);
    for (uint32_t j = 0; j < num_outputs; j++) {
        ptr_output[j] = ptr_input[j] - log_denominator;
    }
}