inference-engine/thirdparty/clDNN/tests/test_cases/lstm_gpu_test.cpp

   1 /*
   2 // Copyright (c) 2016 Intel Corporation
   3 //
   4 // Licensed under the Apache License, Version 2.0 (the "License");
   5 // you may not use this file except in compliance with the License.
   6 // You may obtain a copy of the License at
   7 //
   8 //      http://www.apache.org/licenses/LICENSE-2.0
   9 //
  10 // Unless required by applicable law or agreed to in writing, software
  11 // distributed under the License is distributed on an "AS IS" BASIS,
  12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13 // See the License for the specific language governing permissions and
  14 // limitations under the License.
  15 */
  16
  17 ///////////////////////////////////////////////////////////////////////////////////////////////////
  18 #include <gtest/gtest.h>
  19 #include "api/CPP/memory.hpp"
  20 #include <api/CPP/input_layout.hpp>
  21 #include "api/CPP/lstm.hpp"
  22 #include <api/CPP/split.hpp>
  23 #include <api/CPP/crop.hpp>
  24 #include <api/CPP/reshape.hpp>
  25 #include <api/CPP/concatenation.hpp>
  26 #include <api/CPP/topology.hpp>
  27 #include <api/CPP/tensor.hpp>
  28 #include <api/CPP/network.hpp>
  29 #include <api/CPP/engine.hpp>
  30 #include "test_utils/test_utils.h"
  31 #include <api/CPP/data.hpp>
  32 #include "instrumentation.h"
  33 #include <test_utils/float16.h>
  34
  35 #include <sstream>
  36 #include <iomanip>
  37
  38 #ifdef WIN32
  39 #pragma warning(disable: 4503)
  40 #endif
  41
  42 using namespace cldnn;
  43 using namespace tests;
  44
// Absolute tolerance handed to ASSERT_NEAR when generic_lstm_custom_gpu_test compares
// network output against the reference implementation.
#define FERROR 1E-4
  46
  47 namespace {
  48     float sigmoid(float x) {
  49         return 1.f / (1.f + (float)std::exp((float)(-x)));
  50     }
  51 }
  52
  53 struct offset_order {
  54     size_t it, ot, ft, zt;
  55     offset_order(size_t scale, const cldnn_lstm_offset_order& t = cldnn_lstm_offset_order_iofz) {
  56         static const std::map<cldnn_lstm_offset_order, std::vector<size_t>> offset_map{
  57             { cldnn_lstm_offset_order_iofz,{ 0, 1, 2, 3 } },
  58             { cldnn_lstm_offset_order_ifoz,{ 0, 2, 1, 3 } }
  59         };
  60         std::vector<size_t> v = offset_map.at(t);
  61         it = v[0] * scale;
  62         ot = v[1] * scale;
  63         ft = v[2] * scale;
  64         zt = v[3] * scale;
  65     }
  66 };
// Gate ordering used by lstm_elt_reference and passed to the lstm primitives created in
// generic_lstm_gpu_test, so the reference and the primitive agree on the slice layout.
cldnn_lstm_offset_order default_offset_type = cldnn_lstm_offset_order_iofz;
  68
  69 template<typename T>
  70 T clip(T val, T threshold) {
  71     if (threshold > 0) {
  72         if (val > threshold) return threshold;
  73         if (val < -threshold) return -threshold;
  74     }
  75     return val;
  76 }
  77
  78
  79 template <typename T>
  80 VVVVF<T> lstm_gemm_reference(VVVVF<T>& input, VVVVF<T>& weights, VVVVF<T>& recurrent, VVVVF<T>& bias, VVVVF<T>& hidden,
  81     size_t seq, bool hasBias = true, bool hasHidden = true, size_t dir = 0, size_t input_dir = 0) {
  82     size_t input_size = input[0][0][0].size();
  83     size_t hidden_size = hidden[0][0][0].size();
  84     size_t batch_size = input.size();
  85
  86     // Temporary output from GEMM operations [f, i, o, z]
  87     VVVVF<T> tempGEMM(batch_size, VVVF<T>(1, VVF<T>(1, VF<T>(4 * hidden_size))));
  88     for (size_t b = 0; b < batch_size; ++b) {
  89         for (size_t y = 0; y < 4 * hidden_size; ++y) {
  90             T res = 0;
  91             for (size_t x = 0; x < input_size; ++x) {
  92                 res += (T)weights[0][dir][y][x] * (T)input[b][seq][input_dir][x];
  93             }
  94             if (hasHidden) {
  95                 for (size_t x = 0; x < hidden_size; ++x) {
  96                     res += (T)recurrent[0][dir][y][x] * (T)hidden[b][0][dir][x];
  97                 }
  98             }
  99             if (hasBias) {
 100                 res += (T)bias[0][0][dir][y];
 101             }
 102             tempGEMM[b][0][0][y] = res;
 103         }
 104     }
 105     return tempGEMM;
 106 }
 107
 108 template <typename T>
 109 VVVVF<T> lstm_elt_reference(VVVVF<T>& tempGEMM, VVVVF<T>& cell,
 110                             bool hasCell = true, float clip_threshold = 0,
 111                             bool input_forget = false, size_t dir = 0)
 112 {
 113     size_t hidden_size = tempGEMM[0][0][0].size() / 4;
 114     size_t batch_size = tempGEMM.size();
 115     VVVVF<T> tempOut(batch_size, VVVF<T>(2, VVF<T>(1, VF<T>(hidden_size))));
 116     offset_order off(hidden_size, default_offset_type);
 117
 118     for (size_t b = 0; b < batch_size; ++b) {
 119         T *it = &tempGEMM[b][0][0][off.it];
 120         T *ot = &tempGEMM[b][0][0][off.ot];
 121         T *ft = &tempGEMM[b][0][0][off.ft];
 122         T *zt = &tempGEMM[b][0][0][off.zt];
 123
 124         for (size_t h = 0; h < hidden_size; ++h) {
 125
 126             // Convert all inputs to float for all the elementwise operations. This is done to immitate
 127             // how lstm kernel is performing the elementwise operations.
 128             float fp32_it = (float)it[h];
 129             float fp32_ot = (float)ot[h];
 130             float fp32_ft = (float)ft[h];
 131             float fp32_zt = (float)zt[h];
 132             float val = sigmoid(clip(fp32_it, clip_threshold)) * std::tanh(clip(fp32_zt, clip_threshold));
 133
 134             if (input_forget) {
 135                 val *= (1 - fp32_ft);
 136             }
 137             if (hasCell) {
 138                 val += (float)cell[b][0][dir][h] * sigmoid(clip(fp32_ft, clip_threshold));
 139             }
 140
 141             // Convert back to output data type before storing it into the output buffer. Currently, the output
 142             // data type may be float or FLOAT16 (half)
 143             tempOut[b][0][0][h] = (T)(std::tanh(val) * sigmoid(fp32_ot));
 144             tempOut[b][1][0][h] = (T)val;
 145         }
 146     }
 147     return tempOut;
 148 }
 149
 150 template<typename T>
 151 void print(const std::string& s, VVVVF<T>& input) {
 152     printf("%s -------------\n", s.c_str());
 153     printf("Size = [%d, %d, %d, %d]\n", (int)input.size(), (int)input[0].size(), (int)input[0][0].size(), (int)input[0][0][0].size());
 154     for (size_t b = 0; b < input.size(); ++b) {
 155         for (size_t f = 0; f < input[0].size(); ++f) {
 156             for (size_t y = 0; y < input[0][0].size(); ++y) {
 157                 for (size_t x = 0; x < input[0][0][0].size(); ++x) {
 158                     printf("%f ", input[b][f][y][x]);
 159                 }
 160                 printf("\n");
 161             }
 162         }
 163     }
 164     printf("---------------------------------------\n");
 165 }
 166
 167 // input     = [    batch,  sequence,       direction,      input_size ]
 168 // weights   = [        1, direction, 4 * hidden_size,      input_size ]
 169 // recurrent = [        1, direction, 4 * hidden_size,     hidden_size ]
 170 // biases    = [        1,         1,       direction, 4 * hidden_size ] optional
 171 // cell      = [    batch, direction,               1,     hidden_size ] optional
 172 // hidden    = [    batch, direction,               1,     hidden_size ] optional
 173 // tempGEMM  = [    batch,         1,               1, 4 * hidden_size ] temporary output
 174 // output    = [    batch,  sequence,       direction,     hidden_size ] output
 175 template <typename T>
 176 void lstm_reference(VVVVF<T>& input, VVVVF<T>& hidden, VVVVF<T>& cell,
 177                     VVVVF<T>& weights, VVVVF<T>& recurrent, VVVVF<T>& bias,
 178                     VVVVF<T>& output, VVVVF<T>& last_hidden,
 179                     VVVVF<T>& last_cell, bool hasBias = true,
 180                     bool hasInitialHidden = true, bool hasInitialCell = true,
 181                     float clip_threshold = 0, bool input_forget = false,
 182                     bool scramble_input = true)
 183 {
 184     size_t sequence_len = input[0].size();
 185     size_t dir_len = weights[0].size();
 186     size_t batch = input.size();
 187     size_t input_directions = input[0][0].size();
 188     for (size_t dir = 0; dir < dir_len; ++dir) {
 189         bool tempHasInitialHidden = hasInitialHidden;
 190         bool tempHasInitialCell = hasInitialCell;
 191         for (size_t seq = 0; seq < sequence_len; ++seq) {
 192             size_t seq_id = seq;
 193             size_t input_direction = dir;
 194             if (scramble_input) {
 195                 if (dir > 0) {
 196                     seq_id = input_directions == 1 ? sequence_len - seq - 1 : seq;
 197                     input_direction = input_directions - 1;
 198                 }
 199             }
 200             VVVVF<T> tempGEMM = lstm_gemm_reference(input, weights, recurrent, bias, hidden, seq_id, hasBias, tempHasInitialHidden, dir, input_direction);
 201             VVVVF<T> tempOutput = lstm_elt_reference(tempGEMM, cell, tempHasInitialCell, clip_threshold, input_forget, dir);
 202             // tempOutput[batch][0] = hidden and tempOutput[batch][1] = cell
 203             for (size_t i = 0; i < batch; i++) {
 204                 output[i][seq][dir] = tempOutput[i][0][0];
 205                 hidden[i][0][dir] = tempOutput[i][0][0];
 206                 cell[i][0][dir] = tempOutput[i][1][0];
 207             }
 208             tempHasInitialHidden = true;
 209             tempHasInitialCell = true;
 210         }
 211     }
 212     last_hidden = hidden;
 213     last_cell = cell;
 214 }
 215
 216
 217
 218 template<typename T>
 219 void generic_lstm_gemm_gpu_test(int sequence_len, int direction, int batch_size, int input_size, int hidden_size,
 220     bool hasBias = true, bool hasHidden = true) {
 221     int min_random = -2, max_random = 2;
 222
 223     VVVVF<T> ref_input = generate_random_4d<T>(batch_size, sequence_len, 1, input_size, min_random, max_random);
 224     VVVVF<T> ref_weights = generate_random_4d<T>(1, direction, 4 * hidden_size, input_size, min_random, max_random);
 225     VVVVF<T> ref_recurrent = generate_random_4d<T>(1, direction, 4 * hidden_size, hidden_size, min_random, max_random);
 226     VVVVF<T> ref_bias = generate_random_4d<T>(1, 1, direction, 4 * hidden_size, min_random, max_random);
 227     VVVVF<T> ref_hidden = generate_random_4d<T>(batch_size, direction, 1, hidden_size, min_random, max_random);
 228     VF<T> ref_input_vec = flatten_4d<T>(cldnn::format::bfyx, ref_input);
 229     VF<T> ref_weights_vec = flatten_4d<T>(cldnn::format::bfyx, ref_weights);
 230     VF<T> ref_recurrent_vec = flatten_4d<T>(cldnn::format::bfyx, ref_recurrent);
 231     VF<T> ref_bias_vec = flatten_4d<T>(cldnn::format::bfyx, ref_bias);
 232     VF<T> ref_hidden_vec = flatten_4d<T>(cldnn::format::bfyx, ref_hidden);
 233
 234     VVVVF<T> ref_output = lstm_gemm_reference(ref_input, ref_weights, ref_recurrent, ref_bias, ref_hidden, 0, hasBias, hasHidden);
 235
 236     constexpr auto dt = std::is_same<T, float>::value ? data_types::f32 : data_types::f16;
 237     const auto& engine = get_test_engine();
 238
 239     // If the input is of fp16 type then, the memory will be allocated as such
 240     if (!engine.get_info().supports_fp16)
 241     {
 242         if (dt == data_types::f16)
 243         {
 244             return;
 245         }
 246     }
 247
 248     memory input = memory::allocate(engine, { dt, format::bfyx,     { batch_size,   sequence_len,  input_size,      1 } });
 249     memory weights = memory::allocate(engine, { dt, format::bfyx,   { 1,            direction,     input_size,      4 * hidden_size } });
 250     memory recurrent = memory::allocate(engine, { dt, format::bfyx, { 1,            direction,     hidden_size,     4 * hidden_size } });
 251     memory biases = memory::allocate(engine, { dt, format::bfyx,    { 1,            1,             4 * hidden_size, direction } });
 252     memory hidden = memory::allocate(engine, { dt, format::bfyx,    { batch_size,   direction,     hidden_size,     1 } });
 253
 254     set_values(input, ref_input_vec);
 255     set_values(weights, ref_weights_vec);
 256     set_values(recurrent, ref_recurrent_vec);
 257     set_values(biases, ref_bias_vec);
 258     set_values(hidden, ref_hidden_vec);
 259
 260     topology topology;
 261     topology.add(input_layout("input", input.get_layout()));
 262     topology.add(data("weights", weights));
 263     topology.add(data("recurrent", recurrent));
 264     if (hasBias) {
 265         topology.add(data("biases", biases));
 266     }
 267     if (hasHidden) {
 268         topology.add(input_layout("hidden", hidden.get_layout()));
 269     }
 270
 271     topology.add(lstm_gemm("lstm_gemm", "input", "weights", "recurrent", hasBias ? "biases" : "", hasHidden ? "hidden" : ""));
 272
 273     network network(engine, topology);
 274     network.set_input_data("input", input);
 275     if (hasHidden) {
 276         network.set_input_data("hidden", hidden);
 277     }
 278
 279     auto outputs = network.execute();
 280     EXPECT_EQ(outputs.size(), size_t(1));
 281
 282     auto output = outputs.begin()->second.get_memory();
 283     auto output_ptr = output.pointer<T>();
 284     int i = 0;
 285     for (int b = 0; b < batch_size; ++b) {
 286         for (int x = 0; x < 4 * hidden_size; ++x)
 287             EXPECT_FLOAT_EQ(ref_output[b][0][0][x], output_ptr[i++]);
 288     }
 289 }
 290
 291 template<typename T>
 292 void generic_lstm_elt_gpu_test(int sequence_len, int direction, int batch_size, int input_size, int hidden_size, bool hasCell = true,
 293     T clip_threshold = (T)0.f, bool input_forget = false) {
 294     // tempGEMM  = [        1, direction,           batch, 4 * hidden_size ] input
 295     // cell      = [        1, direction,           batch,     hidden_size ] optional
 296     // output    = [        2, direction,           batch,     hidden_size ] output concat[hidden, cell]
 297     int min_random = -2, max_random = 2;
 298
 299     VVVVF<T> ref_tempGEMM = generate_random_4d<T>(batch_size, direction, 1, 4 * hidden_size, min_random, max_random);
 300     VVVVF<T> ref_cell = generate_random_4d<T>(batch_size, direction, 1, hidden_size, min_random, max_random);
 301     VF<T> ref_tempGEMM_vec = flatten_4d<T>(cldnn::format::bfyx, ref_tempGEMM);
 302     VF<T> ref_cell_vec = flatten_4d<T>(cldnn::format::bfyx, ref_cell);
 303
 304     VVVVF<T> ref_output = lstm_elt_reference(ref_tempGEMM, ref_cell, hasCell, clip_threshold, input_forget);
 305
 306     // We observe some mismatch in down-converting from fp32 to fp16
 307     // between the reference implementation and opencl kernel. This can be
 308     // a simple rounding error. Thus, for fp16 we are increasing our tolerance
 309     // to error from 1E-4 to 1E-2
 310     constexpr float ferror = std::is_same<T, float>::value ? (float)1E-4 : (float)1E-2;
 311     constexpr auto dt = std::is_same<T, float>::value ? data_types::f32 : data_types::f16;
 312     const auto& engine = get_test_engine();
 313
 314     // If the input is of fp16 type then, the memory will be allocated as such
 315     if (!engine.get_info().supports_fp16)
 316     {
 317         if (dt == data_types::f16)
 318         {
 319             return;
 320         }
 321     }
 322
 323     memory tempGEMM = memory::allocate(engine, { dt, format::bfyx,{ batch_size,    direction, 4 * hidden_size, 1 } });
 324     memory cell = memory::allocate(engine, { dt, format::bfyx,{ batch_size,    direction,     hidden_size, 1 } });
 325     set_values(tempGEMM, ref_tempGEMM_vec);
 326     set_values(cell, ref_cell_vec);
 327
 328     topology topology;
 329     topology.add(input_layout("tempGEMM", tempGEMM.get_layout()));
 330     if (hasCell) {
 331         topology.add(input_layout("cell", cell.get_layout()));
 332     }
 333     topology.add(lstm_elt("lstm_elt", "tempGEMM", hasCell ? "cell" : "", clip_threshold, input_forget));
 334
 335     network network(engine, topology);
 336     network.set_input_data("tempGEMM", tempGEMM);
 337     if (hasCell) {
 338         network.set_input_data("cell", cell);
 339     }
 340
 341     auto outputs = network.execute();
 342     EXPECT_EQ(outputs.size(), size_t(1));
 343
 344     auto output = outputs.begin()->second.get_memory();
 345     auto output_ptr = output.pointer<T>();
 346     for (int b = 0; b < batch_size; ++b) {
 347         for (int j = 0; j < 2; ++j) {
 348             for (int x = 0; x < hidden_size; ++x)
 349             {
 350                 auto idx = b * 2 * hidden_size + j * hidden_size + x;
 351                 ASSERT_NEAR(ref_output[b][j][0][x], output_ptr[idx] , ferror);
 352             }
 353         }
 354     }
 355 }
 356
 357 std::string get_string_id(size_t i) {
 358     std::stringstream ss;
 359     ss << std::setw(5) << std::setfill('0') << i;
 360     return ss.str();
 361 }
 362
 363 // --------------- Manually constructed LSTM ----------------------------------------
 364 // This function manually generates an lstm node sequence by conbining lstm_gemm and lstm_elt nodes
 365 // it requires that the output of the lstm_elt node is croped to obtain the corresponding hidden and cell outputs
 366 void generate_lstm_topology(topology& t, memory& input, memory& hidden, memory& cell,
 367     memory& weights, memory& recurrent, memory& biases, int sequence_len,
 368     bool hasBias = true, bool hasInitialHidden = true, bool hasInitialCell = true) {
 369     auto hidden_size = hidden.get_layout().size;
 370     t.add(input_layout("input", input.get_layout()));
 371     std::vector<std::pair<primitive_id, tensor>> input_ids_offsets;
 372     std::vector<primitive_id> output_ids_offsets;
 373     for (int i = 0; i < sequence_len; ++i)
 374         input_ids_offsets.push_back({ get_string_id(i),{ 0, i, 0, 0 } });
 375     t.add(split("inputSplit", "input", input_ids_offsets));
 376     t.add(data("weights", weights));
 377     t.add(data("recurrent", recurrent));
 378
 379     std::string biasStr = "";
 380     std::string hiddenStr = "";
 381     std::string cellStr = "";
 382     if (hasBias)
 383     {
 384         t.add(data("biases", biases));
 385         biasStr = "biases";
 386     }
 387     if (hasInitialHidden)
 388     {
 389         t.add(input_layout("hidden", hidden.get_layout()));
 390         hiddenStr = "hidden";
 391     }
 392     if (hasInitialCell)
 393     {
 394         t.add(input_layout("cell", cell.get_layout()));
 395         cellStr = "cell";
 396     }
 397     for (int i = 0; i < sequence_len; ++i) {
 398         std::string lstm_gemm_id = "lstm_gemm" + get_string_id(i);
 399         std::string lstm_elt_id = "lstm_elt" + get_string_id(i);
 400         std::string crop_id = "crop" + get_string_id(i);
 401
 402         t.add(lstm_gemm(lstm_gemm_id, "inputSplit:" + get_string_id(i), "weights", "recurrent", biasStr, hiddenStr));
 403         t.add(lstm_elt(lstm_elt_id, lstm_gemm_id, cellStr));
 404
 405         hiddenStr = crop_id + ":hidden";
 406         t.add(crop(hiddenStr, lstm_elt_id, hidden_size, tensor{ 0,0,0,0 }));
 407         if (i < sequence_len - 1) {
 408             cellStr = crop_id + ":cell";
 409             t.add(crop(cellStr, lstm_elt_id, hidden_size, tensor{ 0,1,0,0 }));
 410         }
 411         output_ids_offsets.push_back(hiddenStr);
 412     }
 413     t.add(concatenation("concatenation", output_ids_offsets, concatenation::along_f));
 414 }
 415
 416
 417 template<typename T>
 418 void generic_lstm_custom_gpu_test(int sequence_len, int direction, int batch_size, int input_size, int hidden_size,
 419     bool hasBias = true, bool hasInitialHidden = true, bool hasInitialCell = true) {
 420     std::cout << "Input Size = " << input_size << " Hidden Size = " << hidden_size << " Sequence Len = " << sequence_len << " Batch Size = " << batch_size << std::endl;
 421     int min_random = -2, max_random = 2;
 422     VVVVF<T> ref_input = generate_random_4d<T>(batch_size, sequence_len, 1, input_size, min_random, max_random);
 423     VVVVF<T> ref_weights = generate_random_4d<T>(1, direction, 4 * hidden_size, input_size, min_random, max_random);
 424     VVVVF<T> ref_recurrent = generate_random_4d<T>(1, direction, 4 * hidden_size, hidden_size, min_random, max_random);
 425     VVVVF<T> ref_bias = generate_random_4d<T>(1, 1, direction, 4 * hidden_size, min_random, max_random);
 426     VVVVF<T> ref_hidden = generate_random_4d<T>(batch_size, direction, 1, hidden_size, min_random, max_random);
 427     VVVVF<T> ref_cell = generate_random_4d<T>(batch_size, direction, 1, hidden_size, min_random, max_random);
 428     VVVVF<T> ref_output(batch_size, VVVF<T>(sequence_len, VVF<T>(direction, VF<T>(hidden_size))));
 429     VVVVF<T> last_hidden(batch_size, VVVF<T>(direction, VVF<T>(1, VF<T>(hidden_size))));
 430     VVVVF<T> last_cell(batch_size, VVVF<T>(direction, VVF<T>(1, VF<T>(hidden_size))));
 431
 432     VF<T> ref_input_vec = flatten_4d<T>(cldnn::format::bfyx, ref_input);
 433     VF<T> ref_weights_vec = flatten_4d<T>(cldnn::format::bfyx, ref_weights);
 434     VF<T> ref_recurrent_vec = flatten_4d<T>(cldnn::format::bfyx, ref_recurrent);
 435     VF<T> ref_bias_vec = flatten_4d<T>(cldnn::format::bfyx, ref_bias);
 436     VF<T> ref_hidden_vec = flatten_4d<T>(cldnn::format::bfyx, ref_hidden);
 437     VF<T> ref_cell_vec = flatten_4d<T>(cldnn::format::bfyx, ref_cell);
 438     lstm_reference(ref_input, ref_hidden, ref_cell, ref_weights, ref_recurrent, ref_bias, ref_output, last_hidden, last_cell,
 439         hasBias, hasInitialHidden, hasInitialCell);
 440
 441     const auto& engine = get_test_engine();
 442     memory input = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ batch_size, sequence_len,  input_size,       1 } });
 443     memory weights = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ 1,          direction,     input_size,       4 * hidden_size } });
 444     memory recurrent = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ 1,          direction,     hidden_size,      4 * hidden_size } });
 445     memory biases = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ 1,          1,             4 * hidden_size,  direction } });
 446     memory hidden = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ batch_size, direction,     hidden_size,      1 } });
 447     memory cell = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx,{ batch_size, direction,     hidden_size,      1 } });
 448     set_values(input, ref_input_vec);
 449     set_values(weights, ref_weights_vec);
 450     set_values(recurrent, ref_recurrent_vec);
 451     set_values(biases, ref_bias_vec);
 452     set_values(hidden, ref_hidden_vec);
 453     set_values(cell, ref_cell_vec);
 454
 455     topology topology;
 456     generate_lstm_topology(topology, input, hidden, cell, weights, recurrent, biases, sequence_len,
 457         hasBias, hasInitialHidden, hasInitialCell);
 458
 459     network network(engine, topology);
 460     network.set_input_data("input", input);
 461     if (hasInitialHidden) network.set_input_data("hidden", hidden);
 462     if (hasInitialCell) network.set_input_data("cell", cell);
 463     auto outputs = network.execute();
 464
 465     ASSERT_EQ(outputs.size(), size_t(1));
 466     size_t output_size = outputs.begin()->second.get_memory().size() / sizeof(T);
 467     ASSERT_EQ(output_size, size_t(hidden_size * sequence_len * batch_size * direction));
 468
 469     auto output = outputs.begin()->second.get_memory();
 470     auto output_ptr = output.pointer<T>();
 471     int i = 0;
 472     for (int b = 0; b < batch_size; ++b) {
 473         for (int s = 0; s < sequence_len; ++s) {
 474             for (int x = 0; x < hidden_size; ++x) {
 475                 for (int d = 0; d < direction; ++d) {
 476                     ASSERT_NEAR(ref_output[b][s][d][x], output_ptr[i++], FERROR);
 477                 }
 478             }
 479         }
 480     }
 481 }
 482
 483 // -------------------------------------------------------
 484 template<typename T>
 485 void generic_lstm_gpu_test(int layers, int sequence_len, int direction, int batch_size, int input_size, int hidden_size,
 486                             bool hasBias = true, bool hasInitialHidden = true, bool hasInitialCell = true,
 487                             T clip_threshold = 0, bool input_forget = false) {
 488     std::cout << "Layers = " << layers << " Input Size = " << input_size << " Hidden Size = " << hidden_size
 489             << " Sequence Len = " << sequence_len << " Direction = " << direction << " Batch Size = " << batch_size << std::endl;
 490     int min_random = -2, max_random = 2;
 491
 492     VVVVF<T> ref_input = generate_random_4d<T>(batch_size, sequence_len, 1, input_size, min_random, max_random);
 493
 494     std::vector<VVVVF<T>> ref_weights;
 495     std::vector<VVVVF<T>> ref_recurrent;
 496     std::vector<VVVVF<T>> ref_bias;
 497     std::vector<VVVVF<T>> ref_hidden;
 498     std::vector<VVVVF<T>> ref_cell;
 499     std::vector<VVVVF<T>> ref_output;
 500
 501     for (int i = 0; i < layers; ++i) {
 502         ref_weights.push_back(generate_random_4d<T>(1, direction, 4 * hidden_size, i==0 ? input_size : hidden_size, min_random, max_random));
 503         ref_recurrent.push_back(generate_random_4d<T>(1, direction, 4 * hidden_size, hidden_size, min_random, max_random));
 504         ref_bias.push_back(generate_random_4d<T>(1, 1, direction, 4 * hidden_size, min_random, max_random));
 505         ref_hidden.push_back(generate_random_4d<T>(batch_size, 1, direction, hidden_size, min_random, max_random));
 506         ref_cell.push_back(generate_random_4d<T>(batch_size, 1, direction, hidden_size, min_random, max_random));
 507         ref_output.push_back(VVVVF<T>(batch_size, VVVF<T>(sequence_len, VVF<T>(direction, VF<T>(hidden_size)))));
 508     }
 509
 510     VF<T> ref_input_vec = flatten_4d<T>(cldnn::format::bfyx, ref_input);
 511     std::vector<VF<T>> ref_weights_vec;
 512     std::vector<VF<T>> ref_recurrent_vec;
 513     std::vector<VF<T>> ref_bias_vec;
 514     std::vector<VF<T>> ref_hidden_vec;
 515     std::vector<VF<T>> ref_cell_vec;
 516     for (int i = 0; i < layers; ++i) {
 517         ref_weights_vec.push_back(flatten_4d<T>(cldnn::format::bfyx, ref_weights[i]));
 518         ref_recurrent_vec.push_back(flatten_4d<T>(cldnn::format::bfyx, ref_recurrent[i]));
 519         ref_bias_vec.push_back(flatten_4d<T>(cldnn::format::bfyx, ref_bias[i]));
 520         ref_hidden_vec.push_back(flatten_4d<T>(cldnn::format::bfyx, ref_hidden[i]));
 521         ref_cell_vec.push_back(flatten_4d<T>(cldnn::format::bfyx, ref_cell[i]));
 522     }
 523
 524     VVVVF<T> last_hidden(batch_size, VVVF<T>(1, VVF<T>(direction, VF<T>(hidden_size))));
 525     VVVVF<T> last_cell(batch_size, VVVF<T>(1, VVF<T>(direction, VF<T>(hidden_size))));
 526
 527     lstm_reference(ref_input, ref_hidden[0], ref_cell[0], ref_weights[0], ref_recurrent[0], ref_bias[0], ref_output[0],
 528                    last_hidden, last_cell, hasBias, hasInitialHidden, hasInitialCell,
 529                    clip_threshold, input_forget, true);
 530
 531     for (int i = 1; i < layers; ++i) {
 532         lstm_reference(ref_output[i - 1], ref_hidden[i], ref_cell[i], ref_weights[i], ref_recurrent[i],
 533                         ref_bias[i], ref_output[i],
 534                         last_hidden, last_cell, hasBias, hasInitialHidden, hasInitialCell,
 535                         clip_threshold, input_forget, false);
 536     }
 537
 538     // We observe some mismatch in down-converting from fp32 to fp16
 539     // between the reference implementation and opencl kernel. This can be
 540     // a simple rounding error. Thus, for fp16 we are increasing our tolerance
 541     // to error from 1E-4 to 1E-2
 542     constexpr float ferror = std::is_same<T, float>::value ? (float)1E-4 : (float)1E-2;
 543     constexpr auto dt = std::is_same<T, float>::value ? data_types::f32 : data_types::f16;
 544     const auto& engine = get_test_engine();
 545
 546     // If the input is of fp16 type then, the memory will be allocated as such
 547     if (!engine.get_info().supports_fp16)
 548     {
 549         if (dt == data_types::f16)
 550         {
 551             return;
 552         }
 553     }
 554
 555     memory input = memory::allocate(engine, { dt, format::bfyx, {batch_size, sequence_len, input_size, 1} });
 556     set_values(input, ref_input_vec);
 557
 558     std::vector<memory> weights;
 559     std::vector<memory> recurrent;
 560     std::vector<memory> biases;
 561     std::vector<memory> hidden;
 562     std::vector<memory> cell;
 563     for(int i = 0; i < layers; ++i) {
 564         weights.push_back(memory::allocate(engine, { dt, format::bfyx, { 1, direction, i==0 ? input_size : hidden_size, 4 * hidden_size } }));
 565         set_values(weights[i], ref_weights_vec[i]);
 566         recurrent.push_back(memory::allocate(engine, { dt, format::bfyx, { 1, direction, hidden_size, 4 * hidden_size } }));
 567         set_values(recurrent[i], ref_recurrent_vec[i]);
 568         if (hasBias) {
 569             biases.push_back(memory::allocate(engine, { dt, format::bfyx, { 1, 1, 4 * hidden_size, direction } }));
 570             set_values(biases[i], ref_bias_vec[i]);
 571         }
 572         if (hasInitialHidden) {
 573             hidden.push_back(memory::allocate(engine, { dt, format::bfyx, { batch_size, 1, hidden_size, direction } }));
 574             set_values(hidden[i], ref_hidden_vec[i]);
 575         }
 576         if (hasInitialCell) {
 577             cell.push_back(memory::allocate(engine, { dt, format::bfyx, { batch_size, 1, hidden_size, direction} }));
 578             set_values(cell[i], ref_cell_vec[i]);
 579         }
 580     }
 581
 582     topology topology;
 583     std::vector<std::pair<primitive_id, tensor>> input_ids_offsets;
 584     std::vector<primitive_id> lstm_inputs;
 585     std::vector<primitive_id> output_ids_offsets;
 586
 587     topology.add(input_layout("input", input.get_layout()));
 588     for (int i = 0; i < sequence_len; ++i) {
 589         input_ids_offsets.push_back({get_string_id(i), {0, i, 0, 0}});
 590         lstm_inputs.push_back("inputSplit:"+get_string_id(i));
 591     }
 592     topology.add(split("inputSplit", "input", input_ids_offsets));
 593     cldnn::primitive_id prev_lstm_id;
 594     for(int i = 0; i < layers; ++i) {
 595         std::string sid = get_string_id(i);
 596         std::string lstm_id = "lstm" + sid;
 597         std::string weights_id = "weights" + sid;
 598         std::string recurrent_id = "recurrent" + sid;
 599         std::string biases_id = "biases" + sid;
 600         std::string hidden_id = "hidden" + sid;
 601         std::string cell_id = "cell" + sid;
 602
 603         topology.add(data(weights_id, weights[i]));
 604         topology.add(data(recurrent_id, recurrent[i]));
 605         if (hasBias) topology.add(data(biases_id, biases[i]));
 606         if (hasInitialHidden) topology.add(input_layout(hidden_id, hidden[i].get_layout()));
 607         if (hasInitialCell) topology.add(input_layout(cell_id, cell[i].get_layout()));
 608         if (i == 0) {
 609             topology.add(lstm(lstm_id, lstm_inputs, weights_id, recurrent_id,
 610                             hasBias ? biases_id : "", hasInitialHidden ? hidden_id : "", hasInitialCell ? cell_id : "", "",
 611                             clip_threshold, input_forget, {}, {},
 612                             cldnn_lstm_output::cldnn_lstm_output_sequence, default_offset_type));
 613         }
 614         else {
 615             topology.add(lstm(lstm_id, { prev_lstm_id }, weights_id, recurrent_id,
 616                             hasBias ? biases_id : "", hasInitialHidden ? hidden_id : "", hasInitialCell ? cell_id : "", "",
 617                             clip_threshold, input_forget, {}, {},
 618                             cldnn_lstm_output::cldnn_lstm_output_sequence, default_offset_type));
 619         }
 620         prev_lstm_id = lstm_id;
 621     }
 622
 623     network network(engine, topology);
 624     network.set_input_data("input", input);
 625     for (int i = 0; i < layers; ++i) {
 626         std::string sid = get_string_id(i);
 627         if (hasInitialHidden) network.set_input_data("hidden" + sid, hidden[i]);
 628         if (hasInitialCell) network.set_input_data("cell" + sid, cell[i]);
 629     }
 630     auto outputs = network.execute();
 631     {
 632         ASSERT_EQ(outputs.size(), size_t(1));
 633         size_t output_size = outputs.begin()->second.get_memory().size() / sizeof(T);
 634         ASSERT_EQ(output_size, size_t(hidden_size * sequence_len * batch_size * direction));
 635
 636         auto output = outputs.begin()->second.get_memory();
 637
 638         // Get the output tensor
 639         cldnn::layout output_layout = output.get_layout();
 640         cldnn::tensor output_tensor = output_layout.size;
 641
 642         // Compare the output tensor configuration against the reference value
 643         // Output tensor is configured in bfyx format
 644         ASSERT_EQ(batch_size, output_tensor.batch[0]);
 645         ASSERT_EQ(sequence_len, output_tensor.feature[0]);
 646         ASSERT_EQ(direction, output_tensor.spatial[1]);
 647         ASSERT_EQ(hidden_size, output_tensor.spatial[0]);
 648
 649         auto output_ptr = output.pointer<T>();
 650         int32_t i = 0;
 651         for (int32_t b = 0; b < batch_size; ++b) {
 652             for (int32_t s = 0; s < sequence_len; ++s) {
 653                 for (int32_t d = 0; d < direction; ++d) {
 654                     for (int32_t x = 0; x <  hidden_size; ++x) {
 655                         ASSERT_NEAR(ref_output[layers - 1][b][s][d][x], output_ptr[i++], ferror);
 656                     }
 657                 }
 658             }
 659         }
 660     }
 661 }
 662
 663 // -------------------------------------------------------
 664 template<typename T>
 665 void lstm_gpu_output_test(const cldnn_lstm_output& output_selection, int directions) {
 666     int layers = 1;
 667     int sequence_len = 4;
 668     int batch_size = 3;
 669     int input_size = 3;
 670     int hidden_size = 4;
 671
 672     std::cout << "Layers = " << layers << " Input Size = " << input_size << " Hidden Size = " << hidden_size
 673             << " Sequence Len = " << sequence_len << " Directions = " << directions << " Batch Size = " << batch_size
 674                         << " Output selection: " << output_selection << std::endl;
 675     int min_random = -2, max_random = 2;
 676
 677     VVVVF<T> ref_input = generate_random_4d<T>(batch_size, sequence_len, 1, input_size, min_random, max_random);
 678     VVVVF<T> ref_weights = generate_random_4d<T>(1, directions, 4 * hidden_size, input_size, min_random, max_random);
 679     VVVVF<T> ref_recurrent = generate_random_4d<T>(1, directions, 4 * hidden_size, hidden_size, min_random, max_random);
 680     VVVVF<T> ref_bias = generate_random_4d<T>(1, 1, directions, 4 * hidden_size, min_random, max_random);
 681     VVVVF<T> ref_hidden = generate_random_4d<T>(batch_size, 1, directions, hidden_size, min_random, max_random);
 682     VVVVF<T> ref_cell = generate_random_4d<T>(batch_size, 1, directions, hidden_size, min_random, max_random);
 683     VVVVF<T> ref_output = VVVVF<T>(batch_size, VVVF<T>(sequence_len, VVF<T>(directions, VF<T>(hidden_size))));
 684
 685     VF<T> ref_input_vec = flatten_4d<T>(cldnn::format::bfyx, ref_input);
 686     VF<T> ref_weights_vec = flatten_4d<T>(cldnn::format::bfyx, ref_weights);
 687     VF<T> ref_recurrent_vec = flatten_4d<T>(cldnn::format::bfyx, ref_recurrent);
 688     VF<T> ref_bias_vec = flatten_4d<T>(cldnn::format::bfyx, ref_bias);
 689     VF<T> ref_hidden_vec = flatten_4d<T>(cldnn::format::bfyx, ref_hidden);
 690     VF<T> ref_cell_vec = flatten_4d<T>(cldnn::format::bfyx, ref_cell);
 691
 692     VVVVF<T> last_hidden(batch_size, VVVF<T>(1, VVF<T>(directions, VF<T>(hidden_size))));
 693     VVVVF<T> last_cell(batch_size, VVVF<T>(1, VVF<T>(directions, VF<T>(hidden_size))));
 694
 695     lstm_reference(ref_input, ref_hidden, ref_cell, ref_weights, ref_recurrent, ref_bias, ref_output,
 696                    last_hidden, last_cell, true, true, true,
 697                    (T)0, false, true);
 698
 699     const auto& engine = get_test_engine();
 700
 701     memory input = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, {batch_size, sequence_len, input_size, 1} });
 702     memory weights = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { 1, directions, input_size , 4 * hidden_size } });
 703     memory recurrent = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { 1, directions, hidden_size, 4 * hidden_size } });
 704     memory biases = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { 1, 1, 4 * hidden_size, directions } });
 705     memory hidden = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { batch_size, 1, hidden_size, directions } });
 706     memory cell = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { batch_size, 1, hidden_size, directions } });
 707
 708     set_values(input, ref_input_vec);
 709     set_values(weights, ref_weights_vec);
 710     set_values(recurrent, ref_recurrent_vec);
 711     set_values(biases, ref_bias_vec);
 712     set_values(hidden, ref_hidden_vec);
 713     set_values(cell, ref_cell_vec);
 714
 715     bool emit_last_cell = output_selection == cldnn_lstm_output_hidden_cell ||
 716                           output_selection == cldnn_lstm_output_sequence_cell;
 717     bool emit_last_hidden = output_selection == cldnn_lstm_output_hidden ||
 718                             output_selection == cldnn_lstm_output_hidden_cell;
 719
 720     topology topology;
 721     std::vector<std::pair<primitive_id, tensor>> input_ids_offsets;
 722     std::vector<primitive_id> lstm_inputs;
 723     std::vector<primitive_id> output_ids_offsets;
 724
 725     topology.add(input_layout("input", input.get_layout()));
 726     for (int i = 0; i < sequence_len; ++i)
 727     {
 728         input_ids_offsets.push_back({get_string_id(i), {0, i, 0, 0}});
 729         lstm_inputs.push_back("inputSplit:"+get_string_id(i));
 730     }
 731     topology.add(split("inputSplit", "input", input_ids_offsets));
 732     topology.add(data("weights", weights));
 733     topology.add(data("recurrent", recurrent));
 734     topology.add(data("biases", biases));
 735     topology.add(input_layout("hidden", hidden.get_layout()));
 736     topology.add(input_layout("cell", cell.get_layout()));
 737     topology.add(lstm("lstm", lstm_inputs, "weights", "recurrent",
 738                       "biases", "hidden", "cell", "", 0, false, {}, {},
 739                       output_selection, default_offset_type));
 740     if (emit_last_cell)
 741     {
 742         int32_t concatenation_len = emit_last_hidden ? 2 : sequence_len + 1;
 743         tensor hidden_tensor {batch_size, concatenation_len - 1, hidden_size, directions};
 744         tensor cell_tensor {batch_size, 1, hidden_size, directions};
 745         topology.add(crop(emit_last_hidden ? "crop:last_hidden" : "crop:sequence", "lstm", hidden_tensor, tensor {0, 0, 0, 0}));
 746         topology.add(crop("crop:last_cell", "lstm", cell_tensor, tensor {0, concatenation_len - 1, 0, 0}));
 747     }
 748
 749     network network(engine, topology);
 750     network.set_input_data("input", input);
 751     network.set_input_data("hidden", hidden);
 752     network.set_input_data("cell", cell);
 753
 754     auto outputs = network.execute();
 755         uint32_t ref_num_output_primitives = 1;  // Output will return atleast 1 primitive
 756
 757         if (emit_last_cell) {
 758                 // add another primitve to account for cell state if the output selection includes cell state
 759                 ref_num_output_primitives += 1;
 760         }
 761
 762         // check if the number of returned primitives match the expected number of output primitives
 763         ASSERT_EQ(ref_num_output_primitives, outputs.size());
 764
 765         for (auto itr = outputs.begin(); itr != outputs.end(); itr++)
 766         {
 767         auto output_tensor = itr->second.get_memory().get_layout().size;
 768         primitive_id primitive_name = itr->first;
 769
 770                 cldnn::memory output_memory = itr->second.get_memory();
 771         int32_t output_size = (int32_t)(itr->second.get_memory().size() / sizeof(T));
 772                 cldnn::tensor ref_output_tensor;
 773                 VVVVF<T> ref_primitive_output;
 774
 775                 int32_t ref_batch_size = batch_size;
 776                 int32_t ref_hidden_size = hidden_size;
 777                 int32_t ref_directions = directions;
 778
 779         int32_t ref_seq_len = 1;
 780         // Set the reference output against which the primitive's output will be compared
 781                 if (primitive_name.find("crop:last_cell") != std::string::npos)
 782                 {
 783                         ref_primitive_output = last_cell;
 784                 }
 785                 else if (emit_last_hidden || primitive_name.find("crop:last_hidden") != std::string::npos)
 786                 {
 787                         ref_primitive_output = last_hidden;
 788                 }
 789                 else
 790                 {
 791                         ref_seq_len = sequence_len;
 792                         ref_primitive_output = ref_output;
 793                 }
 794
 795                 ref_output_tensor = { ref_batch_size, ref_seq_len, ref_hidden_size, ref_directions };
 796                 int32_t ref_output_size = ref_batch_size * ref_seq_len * ref_hidden_size * ref_directions;
 797
 798                 // The number of elements in reference should match the number of elements in the primitive's output
 799                 ASSERT_EQ(ref_output_size , output_size);
 800
 801         // Compare the output tensor configuration against the reference value
 802         // Output tensor is configured in bfyx format
 803         ASSERT_EQ(ref_batch_size, output_tensor.batch[0]);
 804         ASSERT_EQ(ref_seq_len, output_tensor.feature[0]);               // Sequence length should match
 805                 ASSERT_EQ(ref_directions, output_tensor.spatial[1]);    // directions should match
 806         ASSERT_EQ(ref_hidden_size, output_tensor.spatial[0]);   // input size should match
 807
 808         auto output_ptr = output_memory.pointer<T>();
 809
 810                 int32_t i = 0;
 811                 for (int32_t b = 0; b < ref_batch_size; ++b) {
 812                         for (int32_t s = 0; s < ref_seq_len; ++s) {
 813                                 for (int32_t d = 0; d < ref_directions; ++d) {
 814                                         for (int32_t x = 0; x < ref_hidden_size; ++x) {
 815                         ASSERT_NEAR(ref_primitive_output[b][s][d][x], output_ptr[i++], FERROR);
 816                     }
 817                 }
 818             }
 819         }
 820     }
 821 }
 822
 823
 824 // -------------------------------------------------------
 825 template<typename T>
 826 void lstm_gpu_format_test(const cldnn::format& format, int directions) {
 827     int layers = 1;
 828     int sequence_len = 6;
 829     int batch_size = 3;
 830     int input_size = 4;
 831     int hidden_size = 5;
 832
 833     cldnn_lstm_output output_selection = cldnn_lstm_output::cldnn_lstm_output_sequence;
 834
 835     std::cout << "Layers = " << layers << " Input Size = " << input_size << " Hidden Size = " << hidden_size
 836             << " Sequence Len = " << sequence_len << " Directions = " << directions << " Batch Size = " << batch_size
 837             << " Output selection: " << output_selection << std::endl;
 838     int min_random = -2, max_random = 2;
 839
 840     VVVVF<T> ref_input = generate_random_4d<T>(batch_size, sequence_len, 1, input_size, min_random, max_random);
 841     VVVVF<T> ref_weights = generate_random_4d<T>(1, directions, 4 * hidden_size, input_size, min_random, max_random);
 842     VVVVF<T> ref_recurrent = generate_random_4d<T>(1, directions, 4 * hidden_size, hidden_size, min_random, max_random);
 843     VVVVF<T> ref_bias = generate_random_4d<T>(1, 1, directions, 4 * hidden_size, min_random, max_random);
 844     VVVVF<T> ref_hidden = generate_random_4d<T>(batch_size, 1, directions, hidden_size, min_random, max_random);
 845     VVVVF<T> ref_cell = generate_random_4d<T>(batch_size, 1, directions, hidden_size, min_random, max_random);
 846     VVVVF<T> ref_output = VVVVF<T>(batch_size, VVVF<T>(sequence_len, VVF<T>(directions, VF<T>(hidden_size))));
 847
 848     VF<T> ref_input_vec = flatten_4d<T>(format, ref_input);
 849     VF<T> ref_weights_vec = flatten_4d<T>(cldnn::format::bfyx, ref_weights);
 850     VF<T> ref_recurrent_vec = flatten_4d<T>(cldnn::format::bfyx, ref_recurrent);
 851     VF<T> ref_bias_vec = flatten_4d<T>(cldnn::format::bfyx, ref_bias);
 852     VF<T> ref_hidden_vec = flatten_4d<T>(format, ref_hidden);
 853     VF<T> ref_cell_vec = flatten_4d<T>(format, ref_cell);
 854
 855     VVVVF<T> last_hidden(batch_size, VVVF<T>(1, VVF<T>(directions, VF<T>(hidden_size))));
 856     VVVVF<T> last_cell(batch_size, VVVF<T>(1, VVF<T>(directions, VF<T>(hidden_size))));
 857
 858     lstm_reference(ref_input, ref_hidden, ref_cell, ref_weights, ref_recurrent, ref_bias, ref_output,
 859                    last_hidden, last_cell, true, true, true,
 860                    (T)0, false, true);
 861
 862     const auto& engine = get_test_engine();
 863
 864     memory input = memory::allocate(engine, { type_to_data_type<T>::value,format, {batch_size, sequence_len, input_size, 1} });
 865     memory weights = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { 1, directions, input_size , 4 * hidden_size } });
 866     memory recurrent = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { 1, directions, hidden_size, 4 * hidden_size } });
 867     memory biases = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { 1, 1, 4 * hidden_size, directions } });
 868     memory hidden = memory::allocate(engine, { type_to_data_type<T>::value, format, { batch_size, 1, hidden_size, directions } });
 869     memory cell = memory::allocate(engine, { type_to_data_type<T>::value, format, { batch_size, 1, hidden_size, directions } });
 870
 871     set_values(input, ref_input_vec);
 872     set_values(weights, ref_weights_vec);
 873     set_values(recurrent, ref_recurrent_vec);
 874     set_values(biases, ref_bias_vec);
 875     set_values(hidden, ref_hidden_vec);
 876     set_values(cell, ref_cell_vec);
 877
 878     bool emit_last_cell = output_selection == cldnn_lstm_output_hidden_cell ||
 879                           output_selection == cldnn_lstm_output_sequence_cell;
 880     bool emit_last_hidden = output_selection == cldnn_lstm_output_hidden ||
 881                             output_selection == cldnn_lstm_output_hidden_cell;
 882
 883     topology topology;
 884     std::vector<std::pair<primitive_id, tensor>> input_ids_offsets;
 885     std::vector<primitive_id> lstm_inputs;
 886     std::vector<primitive_id> output_ids_offsets;
 887
 888     topology.add(input_layout("input", input.get_layout()));
 889     for (int i = 0; i < sequence_len; ++i)
 890     {
 891         input_ids_offsets.push_back({get_string_id(i), {0, i, 0, 0}});
 892         lstm_inputs.push_back("inputSplit:"+get_string_id(i));
 893     }
 894     topology.add(split("inputSplit", "input", input_ids_offsets));
 895     topology.add(data("weights", weights));
 896     topology.add(data("recurrent", recurrent));
 897     topology.add(data("biases", biases));
 898     topology.add(input_layout("hidden", hidden.get_layout()));
 899     topology.add(input_layout("cell", cell.get_layout()));
 900     topology.add(lstm("lstm"+get_string_id(0), lstm_inputs, "weights", "recurrent",
 901                       "biases", "hidden", "cell", "", 0, false, {}, {},
 902                       output_selection, default_offset_type));
 903
 904     if (emit_last_cell)
 905     {
 906         int32_t concatenation_len = emit_last_hidden ? 2 : sequence_len + 1;
 907         tensor hidden_tensor {batch_size, concatenation_len - 1, hidden_size, directions};
 908         tensor cell_tensor {batch_size, 1, hidden_size, directions};
 909         topology.add(crop(emit_last_hidden ? "crop:last_hidden" : "crop:sequence", "lstm", hidden_tensor, tensor {0, 0, 0, 0}));
 910         topology.add(crop("crop:last_cell", "lstm", cell_tensor, tensor {0, concatenation_len - 1, 0, 0}));
 911     }
 912
 913     network network(engine, topology);
 914     std::map<primitive_id, network_output> outputs;
 915
 916     network.set_input_data("input", input);
 917     network.set_input_data("hidden", hidden);
 918     network.set_input_data("cell", cell);
 919     outputs = network.execute();
 920
 921     uint32_t ref_num_output_primitives = 1;  // Output will return atleast 1 primitive
 922
 923     if (emit_last_cell) {
 924         // add another primitve to account for cell state if the output selection includes cell state
 925         ref_num_output_primitives += 1;
 926     }
 927
 928     // check if the number of returned primitives match the expected number of output primitives
 929     ASSERT_EQ(ref_num_output_primitives, outputs.size());
 930
 931     for (auto itr = outputs.begin(); itr != outputs.end(); itr++)
 932     {
 933         auto output_tensor = itr->second.get_memory().get_layout().size;
 934         primitive_id primitive_name = itr->first;
 935
 936         cldnn::memory output_memory = itr->second.get_memory();
 937         int32_t output_size = (int32_t)(itr->second.get_memory().size() / sizeof(T));
 938         cldnn::tensor ref_output_tensor;
 939         VVVVF<T> ref_primitive_output;
 940
 941         int32_t ref_batch_size = batch_size;
 942         int32_t ref_hidden_size = hidden_size;
 943         int32_t ref_directions = directions;
 944
 945         int32_t ref_seq_len = 1;
 946         // Set the reference output against which the primitive's output will be compared
 947         if (primitive_name.find("crop:last_cell") != std::string::npos)
 948         {
 949             ref_primitive_output = last_cell;
 950         }
 951         else if (emit_last_hidden || primitive_name.find("crop:last_hidden") != std::string::npos)
 952         {
 953             ref_primitive_output = last_hidden;
 954         }
 955         else
 956         {
 957             ref_seq_len = sequence_len;
 958             ref_primitive_output = ref_output;
 959         }
 960
 961         ref_output_tensor = { ref_batch_size, ref_seq_len, ref_hidden_size, ref_directions };
 962         int32_t ref_output_size = ref_batch_size * ref_seq_len * ref_hidden_size * ref_directions;
 963
 964         // The number of elements in reference should match the number of elements in the primitive's output
 965         ASSERT_EQ(ref_output_size , output_size);
 966
 967         // Compare the output tensor configuration against the reference value
 968         // Output tensor is configured in bfyx format
 969         ASSERT_EQ(ref_batch_size, output_tensor.batch[0]);
 970         ASSERT_EQ(ref_seq_len, output_tensor.feature[0]);       // Sequence length should match
 971         ASSERT_EQ(ref_directions, output_tensor.spatial[1]);    // directions should match
 972         ASSERT_EQ(ref_hidden_size, output_tensor.spatial[0]);   // input size should match
 973
 974         auto output_ptr = output_memory.pointer<T>();
 975
 976         int32_t i = 0;
 977         if (format == cldnn::format::bfyx) {
 978             for (int32_t b = 0; b < ref_batch_size; ++b) {
 979                 for (int32_t s = 0; s < ref_seq_len; ++s) {
 980                     for (int32_t d = 0; d < ref_directions; ++d) {
 981                         for (int32_t x = 0; x < ref_hidden_size; ++x) {
 982                             ASSERT_NEAR(ref_primitive_output[b][s][d][x], output_ptr[i++], FERROR);
 983                         }
 984                     }
 985                 }
 986             }
 987         }
 988         else if(format == cldnn::format::fyxb)
 989         {
 990             for (int32_t s = 0; s < ref_seq_len; ++s) {
 991                 for (int32_t d = 0; d < ref_directions; ++d) {
 992                     for (int32_t x = 0; x < ref_hidden_size; ++x) {
 993                         for (int32_t b = 0; b < ref_batch_size; ++b) {
 994                             ASSERT_NEAR(ref_primitive_output[b][s][d][x], output_ptr[i++], FERROR);
 995                         }
 996                     }
 997                 }
 998             }
 999         }
1000
1001     }
1002 }
1003
1004 // -------------------------------------------------------
1005 template<typename T>
1006 void lstm_gpu_users_test() {
1007     int sequence_len = 2;
1008     int batch_size = 1;
1009     int input_size = 1;
1010     int hidden_size = 1;
1011     int directions = 1;
1012     int min_random = -2, max_random = 2;
1013
1014     // The following test is designed to test the user dependencies of an LSTM node when replaced by subcomponents
1015     // by the graph compiler.
1016     // The output of an LSTM node is set to last_hidden only. Then we concatenate the last_hidden with the initial_hidden tensor:
1017     // (input, weights, recurrent, bias, initial_hidden, inital_cell) -> LSTM -> last_hidden
1018     // concatenation(last_hidden, initial_hidden)
1019     // If the replacing is is done correctly then the initial_hidden tensor should match the output of the concatenation
1020     // by an offset along the sequence.
1021
1022     VVVVF<T> ref_input = generate_random_4d<T>(batch_size, sequence_len, 1, input_size, min_random, max_random);
1023     VVVVF<T> ref_weights = generate_random_4d<T>(1, directions, 4 * hidden_size, input_size, min_random, max_random);
1024     VVVVF<T> ref_recurrent = generate_random_4d<T>(1, directions, 4 * hidden_size, hidden_size, min_random, max_random);
1025     VVVVF<T> ref_bias = generate_random_4d<T>(1, 1, directions, 4 * hidden_size, min_random, max_random);
1026     VVVVF<T> ref_hidden = generate_random_4d<T>(batch_size, 1, directions, hidden_size, min_random, max_random);
1027     VVVVF<T> ref_cell = generate_random_4d<T>(batch_size, 1, directions, hidden_size, min_random, max_random);
1028     VVVVF<T> ref_output = VVVVF<T>(batch_size, VVVF<T>(sequence_len, VVF<T>(directions, VF<T>(hidden_size))));
1029
1030     VF<T> ref_input_vec = flatten_4d<T>(format::bfyx, ref_input);
1031     VF<T> ref_weights_vec = flatten_4d<T>(format::bfyx, ref_weights);
1032     VF<T> ref_recurrent_vec = flatten_4d<T>(format::bfyx, ref_recurrent);
1033     VF<T> ref_bias_vec = flatten_4d<T>(format::bfyx, ref_bias);
1034     VF<T> ref_hidden_vec = flatten_4d<T>(format::bfyx, ref_hidden);
1035     VF<T> ref_cell_vec = flatten_4d<T>(format::bfyx, ref_cell);
1036
1037     VVVVF<T> last_hidden(batch_size, VVVF<T>(1, VVF<T>(directions, VF<T>(hidden_size))));
1038     VVVVF<T> last_cell(batch_size, VVVF<T>(1, VVF<T>(directions, VF<T>(hidden_size))));
1039
1040     const auto& engine = get_test_engine();
1041
1042     memory input = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, {batch_size, sequence_len, input_size, 1} });
1043     memory weights = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { 1, directions, input_size , 4 * hidden_size } });
1044     memory recurrent = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { 1, directions, hidden_size, 4 * hidden_size } });
1045     memory biases = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { 1, 1, 4 * hidden_size, directions } });
1046     memory hidden = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { batch_size, 1, hidden_size, directions } });
1047     memory cell = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { batch_size, 1, hidden_size, directions } });
1048
1049     set_values(input, ref_input_vec);
1050     set_values(weights, ref_weights_vec);
1051     set_values(recurrent, ref_recurrent_vec);
1052     set_values(biases, ref_bias_vec);
1053     set_values(hidden, ref_hidden_vec);
1054     set_values(cell, ref_cell_vec);
1055
1056     topology topology;
1057     std::vector<std::pair<primitive_id, tensor>> input_ids_offsets;
1058     std::vector<primitive_id> lstm_inputs;
1059
1060     topology.add(input_layout("input", input.get_layout()));
1061     for (int i = 0; i < sequence_len; ++i)
1062     {
1063         input_ids_offsets.push_back({get_string_id(i), {0, i, 0, 0}});
1064         lstm_inputs.push_back("inputSplit:"+get_string_id(i));
1065     }
1066     topology.add(split("inputSplit", "input", input_ids_offsets));
1067     topology.add(data("weights", weights));
1068     topology.add(data("recurrent", recurrent));
1069     topology.add(data("biases", biases));
1070     topology.add(input_layout("hidden", hidden.get_layout()));
1071     topology.add(input_layout("cell", cell.get_layout()));
1072     topology.add(lstm("lstm", lstm_inputs, "weights", "recurrent",
1073                       "biases", "hidden", "cell", "", 0, false, {}, {},
1074                       cldnn_lstm_output::cldnn_lstm_output_hidden, default_offset_type));
1075     std::vector<primitive_id> output_ids_offsets {"lstm", "hidden"};
1076     topology.add(concatenation("concatenation", output_ids_offsets, concatenation::along_f));
1077
1078     network network(engine, topology);
1079     std::map<primitive_id, network_output> outputs;
1080
1081     network.set_input_data("input", input);
1082     network.set_input_data("hidden", hidden);
1083     network.set_input_data("cell", cell);
1084     outputs = network.execute();
1085
1086     // check if the number of returned primitives match the expected number of output primitives
1087     ASSERT_EQ(size_t(1), outputs.size());
1088     cldnn::memory output_memory = outputs.begin()->second.get_memory();
1089     auto output_ptr = output_memory.pointer<T>();
1090
1091     int32_t i = 0;
1092     for (int32_t b = 0; b < batch_size; ++b) {
1093         for (int32_t s = 0; s < 1; ++s) {
1094             for (int32_t d = 0; d < directions; ++d) {
1095                 for (int32_t x = 0; x < hidden_size; ++x) {
1096                     int32_t idx = x + hidden_size * (d + directions * ((s+1) + sequence_len * b));
1097                     ASSERT_NEAR(ref_hidden[b][s][d][x], output_ptr[idx], FERROR);
1098                 }
1099             }
1100         }
1101     }
1102 }
1103
1104 // -------------------------------------------------------
1105 template<typename T>
1106 void lstm_gpu_concatenated_input_test(int layers, int sequence_len, int direction,
1107                                                               int batch_size, int input_size, int hidden_size,
1108                                                               bool has_bias = true, bool has_initial_hidden = true,
1109                                                               bool has_initial_cell = true, float clip_threshold = 0,
1110                                                               bool input_forget = false)
1111 {
1112         std::cout << "Layers = " << layers << " Input Size = " << input_size << " Hidden Size = " << hidden_size
1113                 << " Sequence Len = " << sequence_len << " Direction = " << direction << " Batch Size = " << batch_size << std::endl;
1114         int min_random = -2, max_random = 2;
1115
1116         VVVVF<T> ref_input = generate_random_4d<T>(batch_size, sequence_len, 1, input_size, min_random, max_random);
1117
1118         std::vector<VVVVF<T>> ref_weights;
1119         std::vector<VVVVF<T>> ref_recurrent;
1120         std::vector<VVVVF<T>> ref_bias;
1121         std::vector<VVVVF<T>> ref_hidden;
1122         std::vector<VVVVF<T>> ref_cell;
1123         std::vector<VVVVF<T>> ref_output;
1124
1125         for (int i = 0; i < layers; ++i) {
1126                 ref_weights.push_back(generate_random_4d<T>(1, direction, 4 * hidden_size, i == 0 ? input_size : hidden_size, min_random, max_random));
1127                 ref_recurrent.push_back(generate_random_4d<T>(1, direction, 4 * hidden_size, hidden_size, min_random, max_random));
1128                 ref_bias.push_back(generate_random_4d<T>(1, 1, direction, 4 * hidden_size, min_random, max_random));
1129                 ref_hidden.push_back(generate_random_4d<T>(batch_size, 1, direction, hidden_size, min_random, max_random));
1130                 ref_cell.push_back(generate_random_4d<T>(batch_size, 1, direction, hidden_size, min_random, max_random));
1131                 ref_output.push_back(VVVVF<T>(batch_size, VVVF<T>(sequence_len, VVF<T>(direction, VF<T>(hidden_size)))));
1132         }
1133
1134         VF<T> ref_input_vec = flatten_4d<T>(cldnn::format::bfyx, ref_input);
1135
1136         std::vector<VF<T>> ref_weights_vec;
1137         std::vector<VF<T>> ref_recurrent_vec;
1138         std::vector<VF<T>> ref_bias_vec;
1139         std::vector<VF<T>> ref_hidden_vec;
1140         std::vector<VF<T>> ref_cell_vec;
1141         for (int i = 0; i < layers; ++i) {
1142                 ref_weights_vec.push_back(flatten_4d<T>(cldnn::format::bfyx, ref_weights[i]));
1143                 ref_recurrent_vec.push_back(flatten_4d<T>(cldnn::format::bfyx, ref_recurrent[i]));
1144                 ref_bias_vec.push_back(flatten_4d<T>(cldnn::format::bfyx, ref_bias[i]));
1145                 ref_hidden_vec.push_back(flatten_4d<T>(cldnn::format::bfyx, ref_hidden[i]));
1146                 ref_cell_vec.push_back(flatten_4d<T>(cldnn::format::bfyx, ref_cell[i]));
1147         }
1148
1149         VVVVF<T> last_hidden(batch_size, VVVF<T>(1, VVF<T>(direction, VF<T>(hidden_size))));
1150         VVVVF<T> last_cell(batch_size, VVVF<T>(1, VVF<T>(direction, VF<T>(hidden_size))));
1151
1152         lstm_reference(ref_input, ref_hidden[0], ref_cell[0], ref_weights[0], ref_recurrent[0], ref_bias[0], ref_output[0],
1153                 last_hidden, last_cell, has_bias, has_initial_hidden, has_initial_cell,
1154                 clip_threshold, input_forget, true);
1155
1156         for (int i = 1; i < layers; ++i) {
1157                 lstm_reference(ref_output[i - 1], ref_hidden[i], ref_cell[i], ref_weights[i], ref_recurrent[i],
1158                         ref_bias[i], ref_output[i],
1159                         last_hidden, last_cell, has_bias, has_initial_hidden, has_initial_cell,
1160                         clip_threshold, input_forget, false);
1161         }
1162
1163         const auto& engine = get_test_engine();
1164
1165         memory input = memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, {batch_size, sequence_len, input_size, 1} });
1166         set_values(input, ref_input_vec);
1167
1168         std::vector<memory> weights;
1169         std::vector<memory> recurrent;
1170         std::vector<memory> biases;
1171         std::vector<memory> hidden;
1172         std::vector<memory> cell;
1173         for (int i = 0; i < layers; ++i) {
1174                 weights.push_back(memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { 1, direction, i == 0 ? input_size : hidden_size, 4 * hidden_size } }));
1175                 set_values(weights[i], ref_weights_vec[i]);
1176                 recurrent.push_back(memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { 1, direction, hidden_size, 4 * hidden_size } }));
1177                 set_values(recurrent[i], ref_recurrent_vec[i]);
1178                 if (has_bias) {
1179                         biases.push_back(memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { 1, 1, 4 * hidden_size, direction } }));
1180                         set_values(biases[i], ref_bias_vec[i]);
1181                 }
1182                 if (has_initial_hidden) {
1183                         hidden.push_back(memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { batch_size, 1, hidden_size, direction } }));
1184                         set_values(hidden[i], ref_hidden_vec[i]);
1185                 }
1186                 if (has_initial_cell) {
1187                         cell.push_back(memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, { batch_size, 1, hidden_size, direction} }));
1188                         set_values(cell[i], ref_cell_vec[i]);
1189                 }
1190         }
1191
1192         topology topology;
1193         std::vector<std::pair<primitive_id, tensor>> input_ids_offsets;
1194         std::vector<primitive_id> lstm_inputs;
1195         std::vector<primitive_id> output_ids_offsets;
1196
1197         topology.add(input_layout("input", input.get_layout()));
1198         cldnn::primitive_id prev_node_id;
1199
1200     for (int i = 0; i < layers; ++i) {
1201                 std::string sid = get_string_id(i);
1202                 std::string lstm_id = "lstm" + sid;
1203                 std::string weights_id = "weights" + sid;
1204                 std::string recurrent_id = "recurrent" + sid;
1205                 std::string biases_id = "biases" + sid;
1206                 std::string hidden_id = "hidden" + sid;
1207                 std::string cell_id = "cell" + sid;
1208                 std::string output_crop_id = "crop:sequence:" + sid;
1209
1210                 topology.add(data(weights_id, weights[i]));
1211                 topology.add(data(recurrent_id, recurrent[i]));
1212                 if (has_bias) topology.add(data(biases_id, biases[i]));
1213                 if (has_initial_hidden) topology.add(input_layout(hidden_id, hidden[i].get_layout()));
1214                 if (has_initial_cell) topology.add(input_layout(cell_id, cell[i].get_layout()));
1215                 if (i == 0) {
1216             topology.add(lstm(lstm_id, { "input" }, weights_id, recurrent_id,
1217                                 has_bias ? biases_id : "", has_initial_hidden ? hidden_id : "", has_initial_cell ? cell_id : "", "",
1218                                 clip_threshold, input_forget, {}, {},
1219                                 cldnn_lstm_output::cldnn_lstm_output_sequence_cell, default_offset_type));
1220                 }
1221                 else {
1222                         topology.add(lstm(lstm_id, { prev_node_id }, weights_id, recurrent_id,
1223                                 has_bias ? biases_id : "", has_initial_hidden ? hidden_id : "", has_initial_cell ? cell_id : "", "",
1224                                 clip_threshold, input_forget, {}, {},
1225                                 cldnn_lstm_output::cldnn_lstm_output_sequence_cell, default_offset_type));
1226                 }
1227
1228         // Crop out the whole output sequence element
1229                 topology.add(crop(output_crop_id, lstm_id, {batch_size, sequence_len, hidden_size, direction}, {0, 0, 0, 0}));
1230
1231        // Save the node id to provide it as input to the next lstm layer
1232                 prev_node_id = output_crop_id;
1233         }
1234
1235         network network(engine, topology);
1236         network.set_input_data("input", input);
1237         for (int i = 0; i < layers; ++i) {
1238                 std::string sid = get_string_id(i);
1239                 if (has_initial_hidden) network.set_input_data("hidden" + sid, hidden[i]);
1240                 if (has_initial_cell) network.set_input_data("cell" + sid, cell[i]);
1241         }
1242         auto outputs = network.execute();
1243         {
1244                 ASSERT_EQ(outputs.size(), size_t(1));
1245                 size_t output_size = outputs.begin()->second.get_memory().size() / sizeof(T);
1246                 ASSERT_EQ(output_size, size_t(hidden_size * sequence_len * batch_size * direction));
1247
1248                 auto output = outputs.begin()->second.get_memory();
1249
1250                 // Get the output tensor
1251                 cldnn::layout output_layout = output.get_layout();
1252                 cldnn::tensor output_tensor = output_layout.size;
1253
1254                 // Compare the output tensor configuration against the reference value
1255                 // Output tensor is configured in bfyx format
1256                 ASSERT_EQ(batch_size, output_tensor.batch[0]);
1257                 ASSERT_EQ(sequence_len, output_tensor.feature[0]);
1258                 ASSERT_EQ(direction, output_tensor.spatial[1]);
1259                 ASSERT_EQ(hidden_size, output_tensor.spatial[0]);
1260
1261                 auto output_ptr = output.pointer<T>();
1262                 int32_t i = 0;
1263                 for (int32_t b = 0; b < batch_size; ++b) {
1264                         for (int32_t s = 0; s < sequence_len; ++s) {
1265                                 for (int32_t d = 0; d < direction; ++d) {
1266                                         for (int32_t x = 0; x < hidden_size; ++x) {
1267                                                 ASSERT_NEAR(ref_output[layers - 1][b][s][d][x], output_ptr[i++], FERROR);
1268                                         }
1269                                 }
1270                         }
1271                 }
1272         }
1273 }
1274
1275 // This test checks chained and stacked LSTM topology. The configuration allows to create
1276 // LSTM topology with multiple layers and can also be chained together.
1277 template<typename T>
1278 void lstm_gpu_chain_test(int batch_size, int input_size, int hidden_size,
1279                          int directions, size_t layers, size_t chains, int sequence_len,
1280                          const cldnn_lstm_output& output_selection)
1281 {
1282     int min_random = -2, max_random = 2;
1283     bool has_bias = false;
1284     bool has_initial_hidden = false;
1285     bool has_initial_cell = false;
1286     float clip_threshold = 0;
1287     bool input_forget = false;
1288
1289     std::cout << "Layers = " << layers << " Input Size = " << input_size << " Hidden Size = " << hidden_size
1290         << " Sequence Len = " << sequence_len << " Directions = " << directions << " Batch Size = " << batch_size
1291         << " Output selection: " << output_selection << std::endl;
1292
1293     VVVVF<T> ref_input = generate_random_4d<T>(batch_size, sequence_len, 1, input_size, min_random, max_random);
1294     std::vector<std::vector< VVVVF<T>>> ref_weights;
1295     std::vector<std::vector< VVVVF<T>>> ref_recurrent;
1296     std::vector<std::vector< VVVVF<T>>> ref_bias;
1297     std::vector<std::vector< VVVVF<T>>> ref_hidden;
1298     std::vector<std::vector< VVVVF<T>>> ref_cell;
1299     std::vector<std::vector< VVVVF<T>>> ref_output;
1300
1301     // Create the 4 dimensional weight, bias, hidden, cell state and output vectors
1302     for (size_t chain = 0; chain < chains; chain++) {
1303
1304         std::vector<VVVVF<T>> per_chain_ref_weights;
1305         std::vector<VVVVF<T>> per_chain_ref_recurrent;
1306         std::vector<VVVVF<T>> per_chain_ref_bias;
1307         std::vector<VVVVF<T>> per_chain_ref_hidden;
1308         std::vector<VVVVF<T>> per_chain_ref_cell;
1309         std::vector<VVVVF<T>> per_chain_ref_output;
1310
1311         for (size_t layer = 0; layer < layers; layer++) {
1312             per_chain_ref_weights.push_back(generate_random_4d<T>(1, directions, 4 * hidden_size, (layer == 0) ? input_size : hidden_size, min_random, max_random));
1313             per_chain_ref_recurrent.push_back(generate_random_4d<T>(1, directions, 4 * hidden_size, hidden_size, min_random, max_random));
1314             per_chain_ref_bias.push_back(generate_random_4d<T>(1, 1, directions, 4 * hidden_size, min_random, max_random));
1315             per_chain_ref_hidden.push_back(generate_random_4d<T>(batch_size, 1, directions, hidden_size, min_random, max_random));
1316             per_chain_ref_cell.push_back(generate_random_4d<T>(batch_size, 1, directions, hidden_size, min_random, max_random));
1317             per_chain_ref_output.push_back(VVVVF<T>(batch_size, VVVF<T>(sequence_len, VVF<T>(directions, VF<T>(hidden_size)))));
1318         }
1319
1320         ref_weights.push_back(per_chain_ref_weights);
1321         ref_recurrent.push_back(per_chain_ref_recurrent);
1322         ref_bias.push_back(per_chain_ref_bias);
1323         ref_hidden.push_back(per_chain_ref_hidden);
1324         ref_cell.push_back(per_chain_ref_cell);
1325         ref_output.push_back(per_chain_ref_output);
1326     }
1327
1328     VF<T> ref_input_vec;
1329     std::vector<std::vector< VF<T>>> ref_weights_vec;
1330     std::vector<std::vector< VF<T>>> ref_recurrent_vec;
1331     std::vector<std::vector< VF<T>>> ref_bias_vec;
1332     std::vector<std::vector< VF<T>>> ref_hidden_vec;
1333     std::vector<std::vector< VF<T>>> ref_cell_vec;
1334     std::vector<std::vector< VF<T>>> ref_output_vec;
1335
1336     ref_input_vec = flatten_4d<T>(cldnn::format::bfyx, ref_input);
1337
1338     // flatten all the 4 dimensional vectors across chains and layers
1339     for (size_t chain = 0; chain < chains; chain++) {
1340
1341         std::vector<VF<T>> per_chain_ref_weights;
1342         std::vector<VF<T>> per_chain_ref_recurrent;
1343         std::vector<VF<T>> per_chain_ref_bias;
1344         std::vector<VF<T>> per_chain_ref_hidden;
1345         std::vector<VF<T>> per_chain_ref_cell;
1346         std::vector<VF<T>> per_chain_ref_output;
1347
1348         for (size_t layer = 0; layer < layers; layer++) {
1349             per_chain_ref_weights.push_back(flatten_4d<T>(cldnn::format::bfyx, ref_weights[chain][layer]));
1350             per_chain_ref_recurrent.push_back(flatten_4d<T>(cldnn::format::bfyx, ref_recurrent[chain][layer]));
1351             per_chain_ref_bias.push_back(flatten_4d<T>(cldnn::format::bfyx, ref_bias[chain][layer]));
1352             per_chain_ref_hidden.push_back(flatten_4d<T>(cldnn::format::bfyx, ref_hidden[chain][layer]));
1353             per_chain_ref_cell.push_back(flatten_4d<T>(cldnn::format::bfyx, ref_cell[chain][layer]));
1354             per_chain_ref_output.push_back(flatten_4d<T>(cldnn::format::bfyx, ref_output[chain][layer]));
1355         }
1356
1357         ref_weights_vec.push_back(per_chain_ref_weights);
1358         ref_recurrent_vec.push_back(per_chain_ref_recurrent);
1359         ref_bias_vec.push_back(per_chain_ref_bias);
1360         ref_hidden_vec.push_back(per_chain_ref_hidden);
1361         ref_cell_vec.push_back(per_chain_ref_cell);
1362         ref_output_vec.push_back(per_chain_ref_output);
1363     }
1364
1365     std::vector<std::vector<VVVVF<T>>> last_hidden(chains, std::vector<VVVVF<T> >(layers, VVVVF<T>(batch_size, VVVF<T>(1, VVF<T>(directions, VF<T>(hidden_size))))));
1366     std::vector<std::vector<VVVVF<T>>> last_cell(chains, std::vector<VVVVF<T> >(layers, VVVVF<T>(batch_size, VVVF<T>(1, VVF<T>(directions, VF<T>(hidden_size))))));
1367
1368     for (size_t chain = 0; chain < chains; chain++) {
1369         lstm_reference(ref_input, ref_hidden[chain][0], ref_cell[chain][0], ref_weights[chain][0],
1370                        ref_recurrent[chain][0], ref_bias[chain][0], ref_output[chain][0],
1371                        last_hidden[chain][0], last_cell[chain][0], has_bias,
1372                        chain == 0 ? has_initial_hidden : true,
1373                        chain == 0 ? has_initial_cell : true,
1374                        clip_threshold, input_forget, true);
1375
1376         if (chain < chains - 1)
1377         {
1378             ref_hidden[chain + 1][0] = last_hidden[chain][0];
1379             ref_cell[chain + 1][0] = last_cell[chain][0];
1380         }
1381     }
1382
1383     for (size_t layer = 1; layer < layers; ++layer) {
1384         for (size_t chain = 0; chain < chains; chain++) {
1385             lstm_reference(ref_output[chain][layer - 1], ref_hidden[chain][layer], ref_cell[chain][layer],
1386                            ref_weights[chain][layer], ref_recurrent[chain][layer], ref_bias[chain][layer],
1387                            ref_output[chain][layer], last_hidden[chain][layer], last_cell[chain][layer], has_bias,
1388                            chain == 0 ? has_initial_hidden : true,
1389                            chain == 0 ? has_initial_cell : true,
1390                            clip_threshold, input_forget,
1391                            false);
1392
1393             if (chain < chains - 1)
1394             {
1395                 ref_hidden[chain + 1][layer] = last_hidden[chain][layer];
1396                 ref_cell[chain + 1][layer] = last_cell[chain][layer];
1397             }
1398         }
1399     }
1400
1401     const auto& engine = get_test_engine();
1402     tensor input_tensor = { batch_size, sequence_len, input_size, 1 };
1403     layout layout = { type_to_data_type<T>::value, cldnn::format::bfyx, input_tensor };
1404
1405     memory input = memory::allocate(engine, layout);
1406     set_values(input, ref_input_vec);
1407
1408     // 2-dim vectors to support chain and layers
1409     std::vector<std::vector<memory>> weights;
1410     std::vector<std::vector<memory>> recurrent;
1411     std::vector<std::vector<memory>> biases;
1412     std::vector<std::vector<memory>> hidden;
1413     std::vector<std::vector<memory>> cell;
1414
1415     for (size_t chain = 0; chain < chains; chain++) {
1416         std::vector<memory> per_chain_weights;
1417         std::vector<memory> per_chain_recurrent;
1418         std::vector<memory> per_chain_biases;
1419         std::vector<memory> per_chain_hidden;
1420         std::vector<memory> per_chain_cell;
1421
1422         for (size_t layer = 0; layer < layers; layer++) {
1423             per_chain_weights.push_back(memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, {1, directions, layer == 0 ? input_size : hidden_size, 4 * hidden_size} }));
1424             set_values(per_chain_weights[layer], ref_weights_vec[chain][layer]);
1425
1426             per_chain_recurrent.push_back(memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, {1, directions, hidden_size, 4 * hidden_size} }));
1427             set_values(per_chain_recurrent[layer], ref_recurrent_vec[chain][layer]);
1428
1429             if (has_bias)
1430             {
1431                 per_chain_biases.push_back(memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, {1, 1, 4 * hidden_size, directions} }));
1432                 set_values(per_chain_biases[layer], ref_bias_vec[chain][layer]);
1433             }
1434
1435             if (has_initial_hidden)
1436             {
1437                 per_chain_hidden.push_back(memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, {1, 1, hidden_size, directions} }));
1438                 set_values(per_chain_hidden[layer], ref_hidden_vec[chain][layer]);
1439             }
1440
1441             if (has_initial_cell)
1442             {
1443                 per_chain_cell.push_back(memory::allocate(engine, { type_to_data_type<T>::value, format::bfyx, {1, 1, hidden_size, directions} }));
1444                 set_values(per_chain_cell[layer], ref_cell_vec[chain][layer]);
1445             }
1446         }
1447
1448         weights.push_back(per_chain_weights);
1449         recurrent.push_back(per_chain_recurrent);
1450         biases.push_back(per_chain_biases);
1451         hidden.push_back(per_chain_hidden);
1452         cell.push_back(per_chain_cell);
1453     }
1454
1455     // Start creating the topology
1456     cldnn::topology topology;
1457     std::vector<std::pair<primitive_id, cldnn::tensor>> input_ids_offsets;
1458     std::vector<primitive_id> lstm_inputs;
1459     std::vector<primitive_id> output_ids_offsets;
1460
1461     topology.add(input_layout("input", input.get_layout()));
1462
1463     for (int feature = 0; feature < sequence_len; feature++) {
1464         input_ids_offsets.push_back({ get_string_id(feature), {0, feature, 0, 0} });
1465         lstm_inputs.push_back("inputSplit:" + get_string_id(feature));
1466     }
1467     topology.add(split("inputSplit", "input", input_ids_offsets));
1468
1469     bool emit_last_hidden = output_selection == cldnn_lstm_output_hidden
1470         || output_selection == cldnn_lstm_output_hidden_cell;
1471
1472     std::vector<cldnn::primitive_id> output_sequence_ids;
1473     std::vector<cldnn::primitive_id> last_hidden_ids;
1474     std::vector<cldnn::primitive_id> last_cell_ids;
1475
1476     for (size_t chain = 0; chain < chains; chain++) {
1477
1478         // Add all the primitives to the network
1479         std::vector<cldnn::primitive_id> prev_output_sequence_ids(output_sequence_ids);
1480         std::vector<cldnn::primitive_id> prev_last_hidden_ids(last_hidden_ids);
1481         std::vector<cldnn::primitive_id> prev_last_cell_ids(last_cell_ids);
1482
1483         // Erase all the temporary primitive id containers
1484         output_sequence_ids.clear();
1485         last_cell_ids.clear();
1486         last_hidden_ids.clear();
1487
1488         for (size_t layer = 0; layer < layers; layer++) {
1489             std::string chain_id = get_string_id(chain);
1490             std::string layer_id = get_string_id(layer);
1491             std::string lstm_id = "lstm:" + chain_id + ":" + layer_id;
1492             std::string weights_id = "weights:" + chain_id + ":" + layer_id;
1493             std::string recurrent_id = "recurrent:" + chain_id + ":" + layer_id;
1494             std::string biases_id = "biases:" + chain_id + ":" + layer_id;
1495             std::string hidden_id = "hidden:" + chain_id + ":" + layer_id;
1496             std::string cell_id = "cell:" + chain_id + ":" + layer_id;
1497             std::string crop_seq_id = "crop:sequence:" + chain_id + ":" + layer_id;
1498             std::string crop_last_cell_id = "crop:last_cell:" + chain_id + ":" + layer_id;
1499             std::string crop_last_hidden_id = "crop:last_hidden:" + chain_id + ":" + layer_id;
1500
1501             primitive_id initial_hidden_id;
1502             primitive_id initial_cell_id;
1503             cldnn_lstm_output output_selection_per_layer;
1504
1505             topology.add(data(weights_id, weights[chain][layer]));
1506             topology.add(data(recurrent_id, recurrent[chain][layer]));
1507             if (has_bias) topology.add(data(biases_id, biases[chain][layer]));
1508
1509             if (chain == 0 && layer == 0)
1510             {
1511                 if (has_initial_hidden) topology.add(input_layout(hidden_id, hidden[chain][layer].get_layout()));
1512                 if (has_initial_cell) topology.add(input_layout(cell_id, cell[chain][layer].get_layout()));
1513             }
1514
1515             // Get the initial hidden and initial cell for each layer for each chain link
1516             if (chain == 0)
1517             {
1518                 initial_hidden_id = has_initial_hidden ? hidden_id : "";
1519                 initial_cell_id = has_initial_cell ? cell_id : "";
1520             }
1521             else
1522             {
1523                 initial_hidden_id = prev_last_hidden_ids[layer];
1524                 initial_cell_id = prev_last_cell_ids[layer];
1525             }
1526
1527             // Output selection for all the layers except the last layer has to have the sequence,
1528             // last hidden and last cell
1529             if (layer < layers - 1)
1530             {
1531                 output_selection_per_layer = cldnn_lstm_output::cldnn_lstm_output_sequence_cell;
1532             }
1533             else
1534             {
1535                 // For the last layer, use the output selection provided by the user
1536                 output_selection_per_layer = output_selection;
1537             }
1538
1539             if (layer == 0)
1540             {
1541                 topology.add(lstm(lstm_id, lstm_inputs, weights_id, recurrent_id,
1542                     has_bias ? biases_id : "",
1543                     initial_hidden_id, initial_cell_id,
1544                     "", clip_threshold, input_forget, {}, {},
1545                     output_selection_per_layer, default_offset_type));
1546             }
1547             else
1548             {
1549                 topology.add(lstm(lstm_id, { output_sequence_ids[layer - 1] }, weights_id, recurrent_id,
1550                     has_bias ? biases_id : "",
1551                     initial_hidden_id, initial_cell_id,
1552                     "", clip_threshold, input_forget, {}, {},
1553                     output_selection_per_layer, default_offset_type));
1554             }
1555
1556             tensor sequence_tensor{ batch_size, sequence_len, hidden_size, directions };
1557             tensor cell_tensor{ batch_size, 1, hidden_size, directions };
1558             tensor last_hidden_tensor{ batch_size, 1, hidden_size, directions };
1559
1560             // For all the layers except the last layer, we need to crop output sequence,
1561             // last hidden and last cell.
1562             // The output sequence goes into the next layer of lstm in a chain link
1563             // The last cell state and last hidden go to the lstm node in the same layer
1564             // next in chain
1565             topology.add(crop(crop_seq_id, lstm_id, sequence_tensor, tensor{ 0, 0, 0, 0 }));  // Add crop to get the sequence
1566             topology.add(crop(crop_last_hidden_id, lstm_id, last_hidden_tensor, tensor{ 0, sequence_len - 1, 0, 0 }));  // Add crop to get the last hidden element
1567             topology.add(crop(crop_last_cell_id, lstm_id, cell_tensor, tensor{ 0, sequence_len, 0, 0 }));  // Add crop to get the last cell element
1568
1569             // Keep a copy of the sequence, last hidden and last cell primitve id for each layer
1570             output_sequence_ids.push_back(crop_seq_id);
1571             last_hidden_ids.push_back(crop_last_hidden_id);
1572             last_cell_ids.push_back(crop_last_cell_id);
1573         }
1574     }
1575
1576     // Creating network out of the above designed topology
1577     cldnn::network network(engine, topology);
1578     network.set_input_data("input", input);
1579     for (size_t layer = 0; layer < layers; layer++) {
1580         std::string sid = get_string_id(layer);
1581         if (has_initial_hidden) network.set_input_data("hidden:000:" + sid, hidden[0][layer]); // 0 is the chain link index
1582         if (has_initial_cell) network.set_input_data("cell:000:" + sid, cell[0][layer]); // 0 is the chain link index
1583     }
1584
1585     auto outputs = network.execute();
1586     for (auto itr = outputs.begin(); itr != outputs.end(); itr++)
1587     {
1588         auto output_tensor = itr->second.get_memory().get_layout().size;
1589         primitive_id primitive_name = itr->first;
1590
1591         // Split the primitive id to get the chain id
1592         // Eg: primitive id: crop:last_cell:XXX:YYY
1593         // XXX is the chain id
1594         // YYY is the layer id
1595         std::string chain_str = primitive_name.substr(primitive_name.find(":", primitive_name.find(":") + 1) + 1, 5);
1596         std::string layer_str = primitive_name.substr(primitive_name.find(":", primitive_name.find(":", primitive_name.find(":") + 1) + 1) + 1, 5);
1597         size_t chain_id = stoi(chain_str);
1598         size_t layer_id = stoi(layer_str);
1599
1600         cldnn::memory output_memory = itr->second.get_memory();
1601         int32_t output_size = (int32_t)(itr->second.get_memory().size() / sizeof(T));
1602         cldnn::tensor ref_output_tensor;
1603         VVVVF<T> ref_primitive_output;
1604
1605         int32_t ref_batch_size = batch_size;
1606         int32_t ref_hidden_size = hidden_size;
1607         int32_t ref_directions = directions;
1608
1609         int32_t ref_seq_len = 1;
1610
1611         // Set the reference output against which the primitive's output will be compared
1612         if (primitive_name.find("crop:last_cell") != std::string::npos)
1613         {
1614             ref_primitive_output = last_cell[chain_id][layer_id];
1615         }
1616         else if (emit_last_hidden || primitive_name.find("crop:last_hidden") != std::string::npos)
1617         {
1618             ref_primitive_output = last_hidden[chain_id][layer_id];
1619         }
1620         else
1621         {
1622             ref_seq_len = sequence_len;
1623             ref_primitive_output = ref_output[chain_id][layers - 1];
1624         }
1625
1626         ref_output_tensor = { ref_batch_size, ref_seq_len, ref_hidden_size, ref_directions };
1627         int32_t ref_output_size = ref_batch_size * ref_seq_len * ref_hidden_size * ref_directions;
1628
1629         // The number of elements in reference should match the number of elements in the primitive's output
1630         ASSERT_EQ(ref_output_size, output_size);
1631
1632         // Compare the output tensor configuration against the reference value
1633         // Output tensor is configured in bfyx format
1634         ASSERT_EQ(ref_batch_size, output_tensor.batch[0]);
1635         ASSERT_EQ(ref_seq_len, output_tensor.feature[0]);               // Sequence length should match
1636         ASSERT_EQ(ref_directions, output_tensor.spatial[1]);    // directions should match
1637         ASSERT_EQ(ref_hidden_size, output_tensor.spatial[0]);   // input size should match
1638
1639         auto output_ptr = output_memory.pointer<T>();
1640
1641         int32_t i = 0;
1642         for (int32_t b = 0; b < ref_batch_size; ++b) {
1643             for (int32_t s = 0; s < ref_seq_len; ++s) {
1644                 for (int32_t d = 0; d < ref_directions; ++d) {
1645                     for (int32_t x = 0; x < ref_hidden_size; ++x) {
1646                         ASSERT_NEAR(ref_primitive_output[b][s][d][x], output_ptr[i++], FERROR);
1647                     }
1648                 }
1649             }
1650         }
1651     }
1652 }
1653
1654
// lstm_gemm_gpu suite, small problem size: float runs of generic_lstm_gemm_gpu_test with
// the same five leading integer arguments. The four cases cover every combination of the
// two trailing booleans.
// NOTE(review): judging by the test names the booleans are (has_bias, has_hidden) --
// confirm against the helper's signature, which is defined outside this section.
1655 TEST(lstm_gemm_gpu, generic_lstm_gemm_test_f32) {
1656     generic_lstm_gemm_gpu_test<float>(1, 1, 3, 6, 2, true, true);
1657 }
1658
1659 TEST(lstm_gemm_gpu, generic_lstm_gemm_no_bias_f32) {
1660     generic_lstm_gemm_gpu_test<float>(1, 1, 3, 6, 2, false, true);
1661 }
1662
1663 TEST(lstm_gemm_gpu, generic_lstm_gemm_no_hidden_f32) {
1664     generic_lstm_gemm_gpu_test<float>(1, 1, 3, 6, 2, true, false);
1665 }
1666
1667 TEST(lstm_gemm_gpu, generic_lstm_gemm_no_hidden_bias_f32) {
1668     generic_lstm_gemm_gpu_test<float>(1, 1, 3, 6, 2, false, false);
1669 }
1670
1671 // LSTM GEMM tests meant to exercise the LSTM GEMV kernel implementation.
// Same helper and the same boolean combinations as the generic lstm_gemm_gpu cases, but
// with different integer arguments (the first case uses 5, 1, 1, 1024, 1024).
// NOTE(review): the "1x64" in the test names presumably refers to the kernel these sizes
// are expected to select -- the kernel choice is not visible from here, confirm.
1672 TEST(lstm_gemm_gpu, gemv_bfyx_1x64_lstm_gemm_test_f32) {
1673     generic_lstm_gemm_gpu_test<float>(5, 1, 1, 1024, 1024, true, true);
1674 }
1675
1676 TEST(lstm_gemm_gpu, gemv_bfyx_1x64_lstm_gemm_no_bias_f32) {
1677     generic_lstm_gemm_gpu_test<float>(1, 1, 1, 256, 2, false, true);
1678 }
1679
1680 TEST(lstm_gemm_gpu, gemv_bfyx_1x64_lstm_gemm_no_hidden_f32) {
1681     generic_lstm_gemm_gpu_test<float>(1, 1, 1, 64, 2, true, false);
1682 }
1683
1684 TEST(lstm_gemm_gpu, gemv_bfyx_1x64_lstm_gemm_no_hidden_bias_f32) {
1685     generic_lstm_gemm_gpu_test<float>(1, 1, 1, 64, 2, false, false);
1686 }
1687
1688 // LSTM ELT Tests
// Float runs of generic_lstm_elt_gpu_test. All cases share the five leading integer
// arguments; they differ in the boolean and in the optional trailing arguments.
// NOTE(review): judging by the test names the boolean is has_cell and the trailing
// arguments are (clip_threshold, input_forget) -- confirm against the helper's signature.
1689 TEST(lstm_elt_gpu, generic_lstm_elt_test_clip_f32) {
1690     generic_lstm_elt_gpu_test<float>(1, 1, 4, 6, 3, true, 0.3f);
1691 }
1692
1693 TEST(lstm_elt_gpu, generic_lstm_elt_test_input_forget_f32) {
1694     generic_lstm_elt_gpu_test<float>(1, 1, 4, 6, 3, true, 0.f, 1);
1695 }
1696
1697 TEST(lstm_elt_gpu, generic_lstm_elt_test_clip_input_forget_f32) {
1698     generic_lstm_elt_gpu_test<float>(1, 1, 4, 6, 3, true, 0.5f, 1);
1699 }
1700
// Trailing arguments omitted: the helper's defaults are used.
1701 TEST(lstm_elt_gpu, generic_lstm_elt_test_f32) {
1702     generic_lstm_elt_gpu_test<float>(1, 1, 4, 6, 3, true);
1703 }
1704
1705 TEST(lstm_elt_gpu, generic_lstm_elt_no_cell_f32) {
1706     generic_lstm_elt_gpu_test<float>(1, 1, 4, 6, 3, false);
1707 }
1708
// lstm_custom_gpu suite: float runs of generic_lstm_custom_gpu_test with fixed integer
// arguments (3, 1, 3, 3, 2). The eight cases enumerate all combinations of the three
// trailing booleans.
// NOTE(review): judging by the test names the booleans are
// (has_bias, has_initial_hidden, has_initial_cell) -- confirm against the helper.
1709 TEST(lstm_custom_gpu, generic_lstm_custom_f32) {
1710     generic_lstm_custom_gpu_test<float>(3, 1, 3, 3, 2, true, true, true);
1711 }
1712
// NOTE(review): the test name lacks the '_' before "f32" that the sibling tests have.
1713 TEST(lstm_custom_gpu, generic_lstm_custom_no_biasf32) {
1714     generic_lstm_custom_gpu_test<float>(3, 1, 3, 3, 2, false, true, true);
1715 }
1716
1717 TEST(lstm_custom_gpu, generic_lstm_custom_no_hidden_f32) {
1718     generic_lstm_custom_gpu_test<float>(3, 1, 3, 3, 2, true, false, true);
1719 }
1720
1721 TEST(lstm_custom_gpu, generic_lstm_custom_no_bias_hidden_f32) {
1722     generic_lstm_custom_gpu_test<float>(3, 1, 3, 3, 2, false, false, true);
1723 }
1724
1725 TEST(lstm_custom_gpu, generic_lstm_custom_no_cell_f32) {
1726     generic_lstm_custom_gpu_test<float>(3, 1, 3, 3, 2, true, true, false);
1727 }
1728
1729 TEST(lstm_custom_gpu, generic_lstm_custom_no_bias_cell_f32) {
1730     generic_lstm_custom_gpu_test<float>(3, 1, 3, 3, 2, false, true, false);
1731 }
1732
1733 TEST(lstm_custom_gpu, generic_lstm_custom_no_hidden_cell_f32) {
1734     generic_lstm_custom_gpu_test<float>(3, 1, 3, 3, 2, true, false, false);
1735 }
1736
1737 TEST(lstm_custom_gpu, generic_lstm_custom_no_bias_hidden_cell_f32) {
1738     generic_lstm_custom_gpu_test<float>(3, 1, 3, 3, 2, false, false, false);
1739 }
1740
1741 // generic_lstm_gpu_test parameters:
1742 // layers, sequence, dir, batch, input, hidden, bias, initial_h, initial_cell, threshold, coupled_input_forget
// The first eight cases below are single-layer, single-direction runs with a sequence of 7
// that enumerate all combinations of (bias, initial_h, initial_cell). The last three add
// the optional threshold / coupled_input_forget arguments.
1743 TEST(lstm_gpu, generic_lstm_f32) {
1744     generic_lstm_gpu_test<float>(1, 7, 1, 3, 3, 2, true, true, true);
1745 }
1746
1747 TEST(lstm_gpu, generic_lstm_no_bias_f32) {
1748     generic_lstm_gpu_test<float>(1, 7, 1, 3, 3, 2, false, true, true);
1749 }
1750
1751 TEST(lstm_gpu, generic_lstm_no_hidden_f32) {
1752     generic_lstm_gpu_test<float>(1, 7, 1, 5, 4, 3, true, false, true);
1753 }
1754
1755 TEST(lstm_gpu, generic_lstm_no_bias_hidden_f32) {
1756     generic_lstm_gpu_test<float>(1, 7, 1, 5, 4, 3, false, false, true);
1757 }
1758
1759 TEST(lstm_gpu, generic_lstm_no_cell_f32) {
1760     generic_lstm_gpu_test<float>(1, 7, 1, 5, 4, 3, true, true, false);
1761 }
1762
1763 TEST(lstm_gpu, generic_lstm_no_bias_cell_f32) {
1764     generic_lstm_gpu_test<float>(1, 7, 1, 5, 4, 3, false, true, false);
1765 }
1766
1767 TEST(lstm_gpu, generic_lstm_no_hidden_cell_f32) {
1768     generic_lstm_gpu_test<float>(1, 7, 1, 5, 4, 3, true, false, false);
1769 }
1770
1771 TEST(lstm_gpu, generic_lstm_no_bias_hidden_cell_f32) {
1772     generic_lstm_gpu_test<float>(1, 7, 1, 5, 4, 3, false, false, false);
1773 }
1774
// threshold = 0.3, coupled_input_forget off
1775 TEST(lstm_gpu, generic_lstm_clip_f32) {
1776     generic_lstm_gpu_test<float>(1, 7, 1, 3, 3, 2, true, true, true, 0.3f, 0);
1777 }
1778
// threshold = 0, coupled_input_forget on
1779 TEST(lstm_gpu, generic_lstm_input_forget_f32) {
1780     generic_lstm_gpu_test<float>(1, 7, 1, 3, 3, 2, true, true, true, 0.f, 1);
1781 }
1782
// threshold = 0.3, coupled_input_forget on
1783 TEST(lstm_gpu, generic_lstm_clip_input_forget_f32) {
1784     generic_lstm_gpu_test<float>(1, 7, 1, 3, 3, 2, true, true, true, 0.3f, 1);
1785 }
1786
1787 TEST(lstm_gpu, generic_lstm_offset_order_ifoz_f32) {
1788     default_offset_type = cldnn_lstm_offset_order_ifoz;
1789     generic_lstm_gpu_test<float>(1, 7, 1, 3, 3, 2, true, true, true);
1790     default_offset_type = cldnn_lstm_offset_order_iofz;
1791 }
1792
// Smallest possible configuration: every size argument of generic_lstm_gpu_test is 1.
1793 TEST(lstm_gpu, generic_lstm_canonical_f32) {
1794     generic_lstm_gpu_test<float>(1, 1, 1, 1, 1, 1, true, true, true);
1795 }
1796
1797 // bidirectional support
// Same helper with dir = 2; bias, initial_h and initial_cell are enabled one after another.
1798 TEST(lstm_gpu, generic_lstm_bi_f32) {
1799     generic_lstm_gpu_test<float>(1, 7, 2, 2, 3, 4, false, false, false);
1800 }
1801
1802 TEST(lstm_gpu, generic_lstm_bi_bias_f32) {
1803     generic_lstm_gpu_test<float>(1, 7, 2, 2, 3, 4, true, false, false);
1804 }
1805
1806 TEST(lstm_gpu, generic_lstm_bi_bias_hidden_f32) {
1807     generic_lstm_gpu_test<float>(1, 7, 2, 2, 3, 4, true, true, false);
1808 }
1809
1810 TEST(lstm_gpu, generic_lstm_bi_bias_hidden_cell_f32) {
1811     generic_lstm_gpu_test<float>(1, 7, 2, 2, 3, 4, true, true, true);
1812 }
1813
1814 // multi-layer support
1815 TEST(lstm_gpu, generic_lstm_stacked_no_seq_f32) {
1816     generic_lstm_gpu_test<float>(4, 1, 1, 3, 3, 2, true, true, true);
1817 }
1818
1819 TEST(lstm_gpu, generic_lstm_stacked_seq_f32) {
1820     generic_lstm_gpu_test<float>(4, 7, 1, 3, 3, 2, true, true, true);
1821 }
1822
1823 TEST(lstm_gpu, generic_lstm_stacked_bi_f32) {
1824     generic_lstm_gpu_test<float>(4, 7, 2, 3, 3, 2, true, true, true);
1825 }
1826
1827 TEST(lstm_gpu, generic_lstm_stacked_seq_bi_f32) {
1828     generic_lstm_gpu_test<float>(4, 7, 2, 3, 3, 2, true, true, true);
1829 }
1830
1831 // optional outputs support
1832 TEST(lstm_gpu, output_test_sequence_f32) {
1833     lstm_gpu_output_test<float>(cldnn_lstm_output::cldnn_lstm_output_sequence, 1);
1834 }
1835
1836 TEST(lstm_gpu, output_test_hidden_f32) {
1837     lstm_gpu_output_test<float>(cldnn_lstm_output::cldnn_lstm_output_hidden, 1);
1838 }
1839
1840 TEST(lstm_gpu, output_test_hidden_cell_f32) {
1841     lstm_gpu_output_test<float>(cldnn_lstm_output::cldnn_lstm_output_hidden_cell, 1);
1842 }
1843
1844 TEST(lstm_gpu, output_test_sequence_cell_f32) {
1845     lstm_gpu_output_test<float>(cldnn_lstm_output::cldnn_lstm_output_sequence_cell, 1);
1846 }
1847
1848 TEST(lstm_gpu, output_test_sequence_bi_f32) {
1849     lstm_gpu_output_test<float>(cldnn_lstm_output::cldnn_lstm_output_sequence, 2);
1850 }
1851
1852 TEST(lstm_gpu, output_test_hidden_bi_f32) {
1853     lstm_gpu_output_test<float>(cldnn_lstm_output::cldnn_lstm_output_hidden, 2);
1854 }
1855
1856 TEST(lstm_gpu, output_test_hidden_cell_bi_f32) {
1857     lstm_gpu_output_test<float>(cldnn_lstm_output::cldnn_lstm_output_hidden_cell, 2);
1858 }
1859
1860 TEST(lstm_gpu, output_test_sequence_cell_bi_f32) {
1861     lstm_gpu_output_test<float>(cldnn_lstm_output::cldnn_lstm_output_sequence_cell, 2);
1862 }
1863
1864 // format tests
1865 TEST(lstm_gpu, lstm_gpu_format_bfyx_f32) {
1866     lstm_gpu_format_test<float>(cldnn::format::bfyx, 1);
1867 }
1868
1869 TEST(lstm_gpu, lstm_gpu_format_bfyx_bi_f32) {
1870     lstm_gpu_format_test<float>(cldnn::format::bfyx, 2);
1871 }
1872
1873 TEST(lstm_gpu, lstm_gpu_format_fyxb_f32) {
1874     lstm_gpu_format_test<float>(cldnn::format::fyxb, 1);
1875 }
1876
1877 TEST(lstm_gpu, lstm_gpu_format_fyxb_bi_f32) {
1878     lstm_gpu_format_test<float>(cldnn::format::fyxb, 2);
1879 }
1880
1881 // test for LSTM users' dependencies
1882 TEST(lstm_gpu, lstm_users_f32) {
1883     lstm_gpu_users_test<float>();
1884 }
1885
1886 // Test for LSTM with concatenated input
1887 TEST(lstm_gpu, generic_lstm_concatenated_input) {
1888     lstm_gpu_concatenated_input_test<float>(1, 2, 2, 1, 1, 1, true, true, true);
1889 }
1890
1891 TEST(lstm_gpu, generic_lstm_concatenated_input_multi_layer) {
1892     lstm_gpu_concatenated_input_test<float>(5, 5, 2, 1, 1, 4, true, true, true);
1893 }
1894
1895 // test for LSTM with chain and stack (multilayer)
1896 TEST(lstm_gpu, generic_lstm_chained_unidirectional_f32) {
1897     // batch size = 1
1898     // input size = 2
1899     // hidden size = 4
1900     // directions = 1
1901     // layers = 1
1902     // chains = 1
1903     // sequence length = 1
1904     // output selection = output sequence and cell
1905     lstm_gpu_chain_test<float>(1, 2, 4, 1, 1, 2, 1, cldnn_lstm_output::cldnn_lstm_output_sequence_cell);
1906 }
1907
1908 TEST(lstm_gpu, generic_lstm_chained_bidirectional_f32) {
1909     // batch size = 1
1910     // input size = 2
1911     // hidden size = 4
1912     // directions = 2
1913     // layers = 1
1914     // chains = 1
1915     // sequence length = 1
1916     // output selection = output sequence and cell
1917     lstm_gpu_chain_test<float>(1, 2, 4, 2, 1, 1, 1, cldnn_lstm_output::cldnn_lstm_output_sequence_cell);
1918 }
1919
1920 TEST(lstm_gpu, generic_lstm_chained_no_stack_bidirectional_f32) {
1921     // batch size = 2
1922     // input size = 2
1923     // hidden size = 4
1924     // directions = 2
1925     // layers = 1
1926     // chains = 2
1927     // sequence length = 5
1928     // output selection = output sequence and cell
1929     lstm_gpu_chain_test<float>(2, 2, 4, 2, 1, 2, 5, cldnn_lstm_output::cldnn_lstm_output_sequence_cell);
1930 }
1931
1932 TEST(lstm_gpu, generic_lstm_chained_stacked_bidirectional_f32) {
1933     // batch size = 2
1934     // input size = 2
1935     // hidden size = 4
1936     // directions = 2
1937     // layers = 4
1938     // chains = 2
1939     // sequence length = 5
1940     // output selection = output sequence and cell
1941     lstm_gpu_chain_test<float>(2, 2, 4, 2, 4, 2, 5, cldnn_lstm_output::cldnn_lstm_output_sequence_cell);
1942 }
1943
1944 // FP16 Half precision tests
1945 TEST(lstm_gemm_gpu, generic_lstm_gemm_test_f16) {
1946     generic_lstm_gemm_gpu_test<FLOAT16>(1, 1, 3, 6, 2, true, true);
1947 }
1948
1949 TEST(lstm_gemm_gpu, generic_lstm_gemm_no_bias_f16) {
1950     generic_lstm_gemm_gpu_test<FLOAT16>(1, 1, 3, 6, 2, false, true);
1951 }
1952
1953 TEST(lstm_gemm_gpu, generic_lstm_gemm_no_hidden_f16) {
1954     generic_lstm_gemm_gpu_test<FLOAT16>(1, 1, 3, 6, 2, true, false);
1955 }
1956
1957 TEST(lstm_gemm_gpu, generic_lstm_gemm_no_hidden_bias_f16) {
1958     generic_lstm_gemm_gpu_test<FLOAT16>(1, 1, 3, 6, 2, false, false);
1959 }
1960
1961 TEST(lstm_elt_gpu, generic_lstm_elt_test_clip_f16) {
1962     generic_lstm_elt_gpu_test<FLOAT16>(1, 1, 4, 6, 3, true, 0.3f);
1963 }
1964
1965 TEST(lstm_elt_gpu, generic_lstm_elt_test_input_forget_f16) {
1966     generic_lstm_elt_gpu_test<FLOAT16>(1, 1, 4, 6, 3, true, 0.f, 1);
1967 }
1968
1969 TEST(lstm_elt_gpu, generic_lstm_elt_test_clip_input_forget_f16) {
1970     generic_lstm_elt_gpu_test<FLOAT16>(1, 1, 4, 6, 3, true, 0.5f, 1);
1971 }
1972
1973 TEST(lstm_elt_gpu, generic_lstm_elt_test_f16) {
1974     generic_lstm_elt_gpu_test<FLOAT16>(1, 1, 4, 6, 3, true);
1975 }
1976
1977 TEST(lstm_elt_gpu, generic_lstm_elt_no_cell_f16) {
1978     generic_lstm_elt_gpu_test<FLOAT16>(1, 1, 4, 6, 3, false);
1979 }
1980
1981 TEST(lstm_gpu, generic_lstm_f16) {
1982     generic_lstm_gpu_test<FLOAT16>(1, 7, 1, 3, 3, 2, true, true, true);
1983 }
1984
1985 TEST(lstm_gpu, generic_lstm_no_bias_f16) {
1986     generic_lstm_gpu_test<FLOAT16>(1, 7, 1, 3, 3, 2, false, true, true);
1987 }
1988
1989 TEST(lstm_gpu, generic_lstm_no_hidden_f16) {
1990     generic_lstm_gpu_test<FLOAT16>(1, 7, 1, 5, 4, 3, true, false, true);
1991 }
1992
1993 TEST(lstm_gpu, generic_lstm_no_bias_hidden_f16) {
1994     generic_lstm_gpu_test<FLOAT16>(1, 7, 1, 5, 4, 3, false, false, true);
1995 }
1996
1997 TEST(lstm_gpu, generic_lstm_no_cell_f16) {
1998     generic_lstm_gpu_test<FLOAT16>(1, 7, 1, 5, 4, 3, true, true, false);
1999 }
2000
2001 TEST(lstm_gpu, generic_lstm_no_bias_cell_f16) {
2002     generic_lstm_gpu_test<FLOAT16>(1, 7, 1, 5, 4, 3, false, true, false);
2003 }
2004
2005 TEST(lstm_gpu, generic_lstm_no_hidden_cell_f16) {
2006     generic_lstm_gpu_test<FLOAT16>(1, 7, 1, 5, 4, 3, true, false, false);
2007 }
2008
2009 TEST(lstm_gpu, generic_lstm_no_bias_hidden_cell_f16) {
2010     generic_lstm_gpu_test<FLOAT16>(1, 7, 1, 5, 4, 3, false, false, false);
2011 }
2012
2013 TEST(lstm_gpu, generic_lstm_clip_f16) {
2014     generic_lstm_gpu_test<FLOAT16>(1, 7, 1, 3, 3, 2, true, true, true, 0.3f, 0);
2015 }
2016
2017 TEST(lstm_gpu, generic_lstm_input_forget_f16) {
2018     generic_lstm_gpu_test<FLOAT16>(1, 7, 1, 3, 3, 2, true, true, true, 0.f, 1);
2019 }
2020
2021 TEST(lstm_gpu, generic_lstm_clip_input_forget_f16) {
2022     generic_lstm_gpu_test<FLOAT16>(1, 7, 1, 3, 3, 2, true, true, true, 0.3f, 1);
2023 }
2024
2025 TEST(lstm_gpu, generic_lstm_offset_order_ifoz_f16) {
2026     default_offset_type = cldnn_lstm_offset_order_ifoz;
2027     generic_lstm_gpu_test<FLOAT16>(1, 7, 1, 3, 3, 2, true, true, true);
2028     default_offset_type = cldnn_lstm_offset_order_iofz;
2029 }
2030
2031 TEST(lstm_gpu, generic_lstm_canonical_f16) {
2032     generic_lstm_gpu_test<FLOAT16>(1, 1, 1, 1, 1, 1, true, true, true);
2033 }
2034
2035 // bidirectional support
2036 TEST(lstm_gpu, generic_lstm_bi_bias_f16) {
2037     generic_lstm_gpu_test<FLOAT16>(1, 7, 2, 2, 3, 4, true, false, false);
2038 }
2039
2040 TEST(lstm_gpu, generic_lstm_bi_bias_hidden_f16) {
2041     generic_lstm_gpu_test<FLOAT16>(1, 7, 2, 2, 3, 4, true, true, false);
2042 }
2043
2044 TEST(lstm_gpu, generic_lstm_bi_bias_hidden_cell_f16) {
2045     generic_lstm_gpu_test<FLOAT16>(1, 7, 2, 2, 3, 4, true, true, true);
2046 }
2047
2048 // multi-layer support
2049 TEST(lstm_gpu, generic_lstm_stacked_seq_f16) {
2050     generic_lstm_gpu_test<FLOAT16>(4, 7, 1, 3, 3, 2, true, true, true);
2051 }
2052
2053 TEST(lstm_gpu, generic_lstm_stacked_bi_f16) {
2054     generic_lstm_gpu_test<FLOAT16>(4, 7, 2, 3, 3, 2, true, true, true);
2055 }
2056
// TODO: Add tests for the following:
//   - integration testing using multi-layer and chained LSTMs
//   - LSTMs with a single input
//   - an optional activation list
2061